This vignette introduces the workflow to simulated single-cell RNA-seq (scRNA-seq) data after retrieving gene sets from the web interface of AEGIS.
The pipeline relies on commonly used packages, so no complicated installation is required.
import os
import json
import numpy as np
import pandas as pd
from scipy import stats, sparse
# for plotting
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
Prior to this tutorial, we randomly selected 10 genes respectively each from three descendants of the cell cycle process (GO:0022402):
The interaction is recorded as part of our Supplementary Videos. For convenience, these genes are copy/pasted here.
gene_sets = {
"GO:0000082": [ # G1/S transition of mitotic cell cycle
'CDKN2A',
'CUL2',
'ORC5',
'TFDP3',
'PHF8',
'BCAT1',
'CACUL1',
'ITGB1',
'CDC7',
'USP29',
],
"GO:0000086": [ # G2/M transition of mitotic cell cycle
'HAUS7',
'CDKN2B',
'DCTN1',
'CEP41',
'LCMT1',
'PCM1',
'FBXW11',
'HAUS2',
'CEP57',
'CEP250',
],
"GO:0071850": [ # mitotic cell cycle arrest
'E4F1',
'CDC14B',
'CDC14A',
'DUSP1',
'CDKN1B',
'MAGI2',
'CDC14C',
'GADD45A',
'NKX3-1',
'MCPH1',
],
}
pop_names = list(gene_sets.keys()) + ["control"]
print("Populations: {}".format(pop_names))
Next, we specify the non-differentially expressed genes, and check that the selected gene set names are valid. A convient way to get a list of gene names is to used the local file automatically stored for AEGIS. The version-controlled gene name lists are downloadable from: http://stanford.edu/~jjzhu/fileshare/aegis/. You can download a zipped file (e.g., raw_20180719.tar.gz
), unzip it, and load the file geneid2sym_*.json
from the path:
CACHE_DIR = "/YOUR/LOCAL/PATH"
# load the files from local
SPECIES = "human"
file_name = os.path.join(CACHE_DIR, "geneid2sym_{}.json".format(SPECIES))
with open(file_name) as json_file:
gene_conversions = json.load(json_file)
all_genes = list(gene_conversions.values())
print("Number of initial genes: {}".format(len(all_genes)))
print(all_genes[:10])
As most scRNA-seq pipelines filter out genes with low variability as pre-processing step, we randomly selected 2060 genes (including the signature genes) to be our variable genes. This scale is common for scRNA-seq studies, especially those captured with droplet technologies.
total_gene_list = []
sig_gene_params = {}
for pop in pop_names:
if pop == "control":
continue
total_gene_list += gene_sets[pop] # add signal genes
for gene in gene_sets[pop]:
sig_gene_params[gene] = {}
# randomly add other non-differentially expressed genes
np.random.seed(101010)
retain_percentage = 0.1
num_other_genes = int(len(all_genes) * retain_percentage)
all_genes = np.array(all_genes)
sel_idx = np.random.choice(len(all_genes), num_other_genes, replace=False)
other_genes = all_genes[sel_idx]
for sym in other_genes: # add the rest of genes
if sym not in sig_gene_params:
total_gene_list.append(sym)
print("Number of variable genes: {}".format(len(total_gene_list)))
print("{} ...".format(total_gene_list[:5]))
Using the selected gene set, we can create cell sub-populations that are specific to different phases of the cell cycle. In the example, we used 120, 150, 100, 300 cells for G1/S transition, G2/M transition, cell cycle arrest, and control respectively. Each cell type is supposed to have enriched expression of its 10 signature genes; the control cell type is the baseline and does not have enrichment of any of the signature genes.
np.random.seed(111111)
num_cells = {
"GO:0000082": 120,
"GO:0000086": 150,
"GO:0071850": 100,
"control": 300,
}
tot_num_cells = sum(num_cells.values())
print("Total number of cells:\t{}".format(tot_num_cells))
To model the counts in the matrix, we choose a zero-inflated negative binomial model similar to Risso et al. 2018.
Under this model, signature and non-signature genes entail different negative binomial parameters. The non-signature follows a negative binomial with parameters $n$, $p$, where $n$ is interpreted as the number of failures and $p$ is the probability of success. To increase the variability among the signature genes, we set gene-specific parameters for $p$ based on a normal distribution. (A low value of $p$ would generally lead to a higher mean expression.)
np.random.seed(222222)
n = 30 # general shape parameter
p_null = 0.85
p_sig_m = 0.80
p_sig_s = 0.10
for pop in gene_sets:
for gene in gene_sets[pop]:
p = p_sig_s * np.random.normal() + p_sig_m
sig_gene_params[gene] = {"n": n,
"p": p,
"size":num_cells[pop]}
# plot the two distributions
fig, ax = plt.subplots(1, 1)
x_range = np.arange(0, 20)
for i, p_parm in enumerate([p_null, p_sig_m]):
z = "background" if i == 0 else "signal"
ax.bar(x_range,
stats.nbinom.pmf(x_range, n, p_parm),
alpha=0.5,
label="{} ({}, {})".format(z, n, p_parm))
plt.legend()
plt.show()
Here we set the proportion of zero counts, or the dropout probability for the techinical effect. (For efficiency, we work with sparse matrices here.)
background_sparsity = 0.20 # zero fraction: 1-sparsity
np.random.seed(111111)
rvs = stats.nbinom(n=n, p=p_null).rvs
back_mat = sparse.random(tot_num_cells,
len(total_gene_list),
background_sparsity,
data_rvs=rvs)
null_hist = np.histogram(back_mat.A, bins=x_range) # save for plotting later
print("Matrix dimension: {}".format(back_mat.shape))
For efficiency, we generate the expression of the signature genes via a block diagonal matrix and overwrite the background matrix in appropriate positions.
signal_sparsity = 0.50 # zero fraction: 1-sparsity
np.random.seed(222222)
diag_blocks = []
for pop in pop_names:
n_cells = num_cells[pop]
print("{}: {} cells".format(pop, n_cells))
if pop == "control":
n_null_genes = len(total_gene_list) - len(sig_gene_params)
sub_mtx = np.zeros((n_cells, n_null_genes))
else:
sub_mtx = np.zeros((n_cells, len(gene_sets[pop])))
for i, gene in enumerate(gene_sets[pop]):
gene_expr = stats.nbinom.rvs(**sig_gene_params[gene])
sel = np.random.binomial(1, signal_sparsity, len(gene_expr))
sub_mtx[:,i] = gene_expr * sel
print(sub_mtx.shape)
diag_blocks.append(sub_mtx)
mat = back_mat + sparse.block_diag(diag_blocks)
flat_matrices = [m.flatten() for m in diag_blocks[:3]]
signal_counts = np.concatenate(flat_matrices)
print("Number of signal counts: {}".format(len(signal_counts)))
sig_hist = np.histogram(signal_counts, bins=x_range)
fig, ax = plt.subplots(1, 1)
for i, hist_cnt in enumerate([null_hist, sig_hist]):
z = "background" if i == 0 else "signal"
norm_freq = hist_cnt[0] / sum(hist_cnt[0])
ax.bar(hist_cnt[1][:-1],
norm_freq,
alpha=0.5,
label="{} ({}, {})".format(z, n, p_parm))
plt.legend()
plt.show()
The secondary analysis includes three visual components: 1) heatmap view of the expression, 2) projection of the cells in the PC space and 3) the embedding of the cells with t-SNE. These are among the most commonly used techniques for exploratory analysis, even though numerous variants exist.
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns
Generate a cell label vector and some meta information for the class labels
label_ids = []
for i, pop in enumerate(pop_names):
label_ids += [i] * num_cells[pop]
col_map = {
"GO:0000082": "#a6cee3",
"GO:0000086": "#b2df8a",
"GO:0071850": "#fb9a99",
"control": "#EDC500",
}
dark_col_map = {
"GO:0000082": "#001D71",
"GO:0000086": "#014B06",
"GO:0071850": "#630000",
"control": "#EDC500",
}
text_alias = {
"GO:0000082": "G1/S \ntransition \nof mitotic \ncell cycle",
"GO:0000086": "G2/M \ntransition \nof mitotic \ncell cycle",
"GO:0071850": "mitotic cell \ncycle arrest",
}
We organize the genes and cell types according to cell types, and include 10 random non-signal genes for the controlled population. Since we front-loaded the signal genes, we just show the top 40 genes in the heatmap. We should expect a block-diagonal structure.
num_genes_to_plot = 40
gene_names = total_gene_list[:num_genes_to_plot]
plot_mtx = StandardScaler().fit_transform(np.log(1+mat.todense()[:, :num_genes_to_plot])).T
plot_mtx = pd.DataFrame(plot_mtx)
plot_mtx["genes"] = gene_names
plot_mtx = plot_mtx.set_index("genes")
g = sns.clustermap(plot_mtx,
row_cluster=False,
col_cluster=False,
cmap="inferno",
figsize=(12, 10))
g.ax_heatmap.get_xaxis().set_visible(False)
g.ax_heatmap.set_position([.25, 0.26, 0.7, 0.6])
g.cax.set_position([.15, 0.35, 0.03, .45])
PCA is a critical step in dimension reduction. It is also one of the most interpretable unsupervised learning approaches that we can leverage for insightful representation of the data. Note that, before running PCA, we first center the data as a good practice.
np.random.seed(121212)
mat_scaled = StandardScaler(with_std=False).fit_transform(mat.todense())
pc_fit = PCA(n_components=50).fit(mat_scaled)
top_pcs = pc_fit.transform(mat_scaled)
To visualize with t-SNE, we use the top 10 PCs. In practice, there are multiple ways to customize t-SNE, but we selected this as it displayed the cell types sufficiently well.
np.random.seed(121212)
tsne_proj = TSNE(n_components=2).fit_transform(top_pcs[:,:10])
print(tsne_proj.shape)
Finally, we can visualize the data and the underlying annotations we designed. For the PCs, we provide biplots that indicate the direction of the signal genes with respect to the PCs. This helps highlight the important ground truth variables.
pc_select = [0,1]
sig_genes = []
sig_group = []
sig_idx = []
offset = 0
max_arr = 10 # the number of arrows to include
for pop in pop_names:
if pop == "control":
continue
for i, gene in enumerate(gene_sets[pop]):
if i == max_arr:
break
sig_genes.append(gene)
sig_group.append(pop)
sig_idx.append(offset + i)
offset += len(gene_sets[pop])
arrow_scaling = np.power( pc_fit.singular_values_[pc_select], 0.4)
v_comp = pc_fit.components_[pc_select,:][:,sig_idx]
arrow_heads = arrow_scaling * v_comp.T # broadcast
arrow_df = pd.DataFrame(arrow_heads, columns=["x", "y"])
arrow_df["ssquare"] = arrow_df["x"]**2 + arrow_df["y"]**2
arrow_df["gene"] = sig_genes
arrow_df["index"] = sig_genes
arrow_df["group"] = sig_group
arrow_df = arrow_df.set_index("index")
pca_proj = top_pcs[:, pc_select] / arrow_scaling
print("Projection dimension: {}".format(pca_proj.shape))
arrow_df.head()
The t-SNE and PCA plots are customized to for better rendering.
label_size = 15
text_size = 13
tick_label_size = 12
def plot_scatter_groups(projected,
label_ids,
label_ann,
col_map,
ax=None,
clean=False):
n_labs = len(label_ann)
clist = [col_map[lab] for lab in label_ann]
cmap = LinearSegmentedColormap.from_list("my_list", clist, len(clist))
scat = ax.scatter(projected[:, 0], projected[:, 1],
c=label_ids,
edgecolor="#383838",
alpha=1.0, s=24.0,
cmap=cmap)
dist = (n_labs - 1) / n_labs
tick_pos = dist * np.arange(n_labs) + dist / 2
if clean:
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
else:
cb = fig.colorbar(scat, ticks=tick_pos)
cb.set_ticklabels(label_ann)
def plot_pc_arrows(arrow_df,
col_map,
ax=None,
clean=False):
grp_max = arrow_df[["ssquare", "gene", "group"]].groupby("group")["ssquare"].transform(max)
idx = grp_max == arrow_df["ssquare"]
max_genes = arrow_df[idx].index
print(max_genes)
# setup labels
if not clean:
ax.set_xlabel("principle component {}".format(pc_select[0]+1),
family="arial",
size=label_size)
ax.set_ylabel("principle component {}".format(pc_select[1]+1),
family="arial",
size=label_size)
for index, row in arrow_df.iterrows():
ax.arrow(0, 0, row["x"], row["y"],
color=col_map[row["group"]],
alpha=0.9,
width=0.0001, head_width=0.20)
x_pos = row["x"]*1.1
y_pos = row["y"]*1.1
# manual customization
if (row["gene"] == 'CDC7'):
x_pos = row["x"]*1.0
y_pos = row["y"]*1.4
if (row["gene"] == "CDKN2B"):
x_pos = row["x"]*0.9
y_pos = row["y"]*1.15
if (row["gene"] == "MCPH1"):
x_pos = row["x"]*0.9
y_pos = row["y"]*1.15
if clean:
if row["gene"] in max_genes:
plot_text = True
else:
plot_text = False
else:
plot_text = True
if plot_text:
ax.text(x_pos, y_pos, row["gene"],
color=col_map[row["group"]],
size = text_size,
style="italic",
family="arial")
ax.set_xlim(-3.0, 5.7)
ax.set_ylim(-3.3, 4.5)
ax.set_yticks(np.arange(-3, 4, 2.0))
ax.tick_params(labelsize=tick_label_size)
def plot_tsne_ann(ax, text_alias, col_map, clean=True):
ax.set_xlim(-50, 55)
ax.set_ylim(-40, 50)
ax.tick_params(labelsize=tick_label_size)
ax.set_yticks(np.arange(-40, 50, 20))
if not clean:
ax.set_xlabel("t-SNE dimension 1", family="arial", size=label_size)
ax.set_ylabel("t-SNE dimension 2", family="arial", size=label_size)
for term in text_alias:
x_pos = 0
y_pos = 0
if term == "GO:0000082":
x_pos = 32
y_pos = -38
if term == "GO:0000086":
x_pos = 20
y_pos = 20
if term == "GO:0071850":
x_pos = -47
y_pos = 20
ax.text(x_pos, y_pos, text_alias[term],
size = text_size,
color=col_map[term], family="arial")
full_figsize = (5.2, 4.3)
fig, ax = plt.subplots(1,1, figsize=full_figsize)
plot_scatter_groups(pca_proj, label_ids, pop_names, col_map, ax=ax)
plot_pc_arrows(arrow_df, dark_col_map, ax=ax)
plt.show()
fig, ax = plt.subplots(1,1, figsize=full_figsize)
plot_scatter_groups(tsne_proj, label_ids, pop_names, col_map, ax=ax)
plot_tsne_ann(ax, text_alias, dark_col_map)
plt.show()