In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%load_ext line_profiler
In [ ]:
import scanpy as sc
from scipy.io import mmread
import random
import pandas as pd

import src.scanpy_unicoord as scu
import torch
from src.visualization import *
from line_profiler import LineProfiler
import imageio
In [ ]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
# sc.settings.set_figure_params(dpi=80, facecolor='white')
sc.settings.set_figure_params(vector_friendly=False)

load data

In [ ]:
adata = sc.read_h5ad(r'D:\hECA\Lung.Adult.pp.h5ad')
In [ ]:
adata = adata.raw.to_adata()
sc.pp.normalize_total(adata, target_sum=1e4 ,exclude_highly_expressed= True)
sc.pp.log1p(adata)
adata

model

In [ ]:
scu.model_unicoord_in_adata(adata, n_cont=50, n_diff=0, n_clus = [],
                            obs_fitting=['cell_type','seq_tech'])
In [ ]:
scu.train_unicoord_in_adata(adata, epochs=10, chunk_size=20000, slot = "cur")
In [16]:
fig = draw_loss_curves(adata.uns['unc_stuffs']['trainer'].losses)
# if save_figs:
#     fig.savefig(os.path.join(savePath, 'img', 'fig1_lossCurves.png'))
fig.show()
<ipython-input-16-17c745c75a86>:4: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()

generation

In [21]:
bdata = scu.generate_unicoord_in_adata(adata[random.sample(list(adata.obs_names),10000),:],
                                       chunk_size=5000)
bdata
Out[21]:
AnnData object with n_obs × n_vars = 10000 × 20770
    obs: 'user_id', 'study_id', 'cell_id', 'organ', 'region', 'subregion', 'seq_tech', 'sample_status', 'donor_id', 'donor_gender', 'donor_age', 'original_name', 'cl_name', 'hcad_name', 'tissue_type', 'cell_type', 'marker_gene', 'cid', 'RNA_snn_res.0.4', 'seurat_clusters', 'nCount_RNA', 'nFeature_RNA', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'unc_training'
    var: 'features'
In [25]:
bdata.obs_names[0][:-4]
Out[25]:
'GATGAAAGTGAGGGTT.1.HCATisStab7747198_4'
In [29]:
map(lambda x:x[:-4], ['12345','234125','342315'])
Out[29]:
<map at 0x21dc2f30af0>
In [30]:
bdata.obsm['X_umap'] = adata[list(map(lambda x:x[:-4],bdata.obs_names)),:].obsm['X_umap']
In [31]:
sc.pl.umap(bdata, color=['cell_type','seq_tech'], 
           ncols = 1)
In [32]:
# bdata = bdata.raw.to_adata()
scu.predcit_unicoord_in_adata(bdata,adata)
In [33]:
sc.pl.umap(bdata, color=['cell_type','cell_type_unc_infered', 
                         'seq_tech','seq_tech_unc_infered'], 
           ncols = 2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'seq_tech_unc_infered' as categorical

set seq tech

In [34]:
from src.utils import pairwise_distances
def find_nearest_neighbors(query, ref, genes):
    gquery = query[:,genes].X
    gref = ref[:,genes].X
    dis = pairwise_distances(torch.FloatTensor(gquery), torch.FloatTensor(gref))
    return np.array(np.argmin(dis, axis = 1))
In [36]:
import itertools
cells = list(itertools.chain(*[random.sample(list(adata.obs_names[adata.obs.seq_tech==ct]), 2000) \
                               for ct in adata.obs.seq_tech.value_counts().index]))
In [38]:
def create_figs(set_value, title):
    cdata = scu.generate_unicoord_in_adata(adata[cells,:].copy(),adata, 
                                       set_value=set_value)
    nn = find_nearest_neighbors(cdata,bdata, bdata.var_names)
    gen = pd.Series(['original']*len(bdata))
    gen[np.unique(nn)] = 'generated'
    gen = gen.astype('category')
    gen.index = bdata.obs_names
    bdata.obs['gen'] = gen
    bdata.uns['gen_colors'] = ['#F8E621','#440154']
    f = sc.pl.umap(bdata, color=['gen'], return_fig=True, s=20, title=title)
    return f
In [45]:
images = [create_figs({'seq_tech':t}, 'seq_tech=%s'%(t))\
          for t in bdata.obs.seq_tech.value_counts().index]
In [46]:
import os
In [47]:
gif = []
for f in images:
    f.savefig('./tmp.png')
    gif.append(imageio.imread('./tmp.png'))
    os.remove('./tmp.png')
In [51]:
imageio.mimsave('./hECALung_generation_seq_tech.gif', gif, duration = 0.5)

merge batch

In [57]:
cdata = scu.generate_unicoord_in_adata(adata, 
                                       set_value={'seq_tech':"10X"})
In [58]:
cdata
Out[58]:
AnnData object with n_obs × n_vars = 54615 × 20770
    obs: 'user_id', 'study_id', 'cell_id', 'organ', 'region', 'subregion', 'seq_tech', 'sample_status', 'donor_id', 'donor_gender', 'donor_age', 'original_name', 'cl_name', 'hcad_name', 'tissue_type', 'cell_type', 'marker_gene', 'cid', 'RNA_snn_res.0.4', 'seurat_clusters', 'nCount_RNA', 'nFeature_RNA', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'unc_training'
    var: 'features'
In [59]:
sc.pp.normalize_total(cdata)
sc.pp.log1p(cdata)
sc.pp.highly_variable_genes(cdata)
cdata.raw = cdata
cdata = cdata[:, cdata.var.highly_variable]
sc.pp.scale(cdata)
sc.tl.pca(cdata)
sc.pp.neighbors(cdata)
sc.tl.leiden(cdata)
sc.tl.umap(cdata)
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:15)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:09)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:17)
running Leiden clustering
    finished: found 38 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:13)
computing UMAP
G:\anaconda3\envs\torch_geo\lib\site-packages\sklearn\manifold\_spectral_embedding.py:260: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
  warnings.warn(
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:52)
In [61]:
sc.pl.umap(cdata, color=['cell_type','seq_tech'], s = 1,
           ncols = 1)
In [62]:
ct_mapping = {"Type I alveolar cell" : "Epithelial cells",
"Type I alveolar cell/Type II alveolar cell" : "Epithelial cells",
"Type II alveolar cell" : "Epithelial cells",
"Club cell" : "Epithelial cells",
"Ciliated columnar cell" : "Epithelial cells",
"Perineural epithelial cell" : "Epithelial cells",
"Epithelial cell" : "Epithelial cells",
"Lymphatic endothelial cell" : "Endothelial cells",
"Vascular endothelial cell" : "Endothelial cells",
"Endothelial cell" : "Endothelial cells",
"Fibrocyte" : "Fibroblasts",
"Smooth muscle cell" : "Fibroblasts",
"Dendritic cell" : "Myeloid cells",
"Macrophage" : "Myeloid cells",
"Monocyte" : "Myeloid cells",
"Neutrophilic granulocyte" : "Myeloid cells",
"Myeloid cell" : "Myeloid cells",
"Mast cell" : "MAST cells",
"NK cell" : "T/NK cells",
"T cell" : "T/NK cells",
"CD8 T cell" : "T/NK cells",
"B cell" : "B lymphocytes",
"Plasma B cell" : "B lymphocytes",
"Chondrocyte" : "rare types",
"Megakaryocyte" : "rare types"}
In [66]:
adata.obs['cell_types_refined'] = [ct_mapping[c] if c in ct_mapping else "rare types" for c in adata.obs['cell_type']]
In [78]:
sc.pl.umap(adata, color=['cell_type','cell_types_refined','seq_tech'], 
           ncols = 1)
In [72]:
cdata.obs['cell_types_refined'] = list(adata.obs['cell_types_refined'])
In [79]:
sc.pl.umap(cdata, color=['cell_type','cell_types_refined','seq_tech'], s = 1,
           ncols = 1)