%matplotlib inline
%load_ext autoreload
%autoreload 2
%load_ext line_profiler

import scanpy as sc
from scipy.io import mmread
import random
import pandas as pd

import src.scanpy_unicoord as scu
import torch
from src.visualization import *
from line_profiler import LineProfiler
import imageio

sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
# sc.settings.set_figure_params(dpi=80, facecolor='white')
sc.settings.set_figure_params(vector_friendly=False)

load data¶

adata = sc.read_h5ad(r'D:\hECA\Lung.Adult.pp.h5ad')

adata = adata.raw.to_adata()
sc.pp.normalize_total(adata, target_sum=1e4 ,exclude_highly_expressed= True)
sc.pp.log1p(adata)
adata

model¶

scu.model_unicoord_in_adata(adata, n_cont=50, n_diff=0, n_clus = [],
                            obs_fitting=['cell_type','seq_tech'])

scu.train_unicoord_in_adata(adata, epochs=10, chunk_size=20000, slot = "cur")

fig = draw_loss_curves(adata.uns['unc_stuffs']['trainer'].losses)
# if save_figs:
#     fig.savefig(os.path.join(savePath, 'img', 'fig1_lossCurves.png'))
fig.show()

<ipython-input-16-17c745c75a86>:4: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()

generation¶

bdata = scu.generate_unicoord_in_adata(adata[random.sample(list(adata.obs_names),10000),:],
                                       chunk_size=5000)
bdata

AnnData object with n_obs × n_vars = 10000 × 20770
    obs: 'user_id', 'study_id', 'cell_id', 'organ', 'region', 'subregion', 'seq_tech', 'sample_status', 'donor_id', 'donor_gender', 'donor_age', 'original_name', 'cl_name', 'hcad_name', 'tissue_type', 'cell_type', 'marker_gene', 'cid', 'RNA_snn_res.0.4', 'seurat_clusters', 'nCount_RNA', 'nFeature_RNA', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'unc_training'
    var: 'features'

bdata.obs_names[0][:-4]

'GATGAAAGTGAGGGTT.1.HCATisStab7747198_4'

map(lambda x:x[:-4], ['12345','234125','342315'])

<map at 0x21dc2f30af0>

bdata.obsm['X_umap'] = adata[list(map(lambda x:x[:-4],bdata.obs_names)),:].obsm['X_umap']

sc.pl.umap(bdata, color=['cell_type','seq_tech'], 
           ncols = 1)

# bdata = bdata.raw.to_adata()
scu.predcit_unicoord_in_adata(bdata,adata)

sc.pl.umap(bdata, color=['cell_type','cell_type_unc_infered', 
                         'seq_tech','seq_tech_unc_infered'], 
           ncols = 2)

G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'seq_tech_unc_infered' as categorical

set seq tech¶

from src.utils import pairwise_distances
def find_nearest_neighbors(query, ref, genes):
    gquery = query[:,genes].X
    gref = ref[:,genes].X
    dis = pairwise_distances(torch.FloatTensor(gquery), torch.FloatTensor(gref))
    return np.array(np.argmin(dis, axis = 1))

import itertools
cells = list(itertools.chain(*[random.sample(list(adata.obs_names[adata.obs.seq_tech==ct]), 2000) \
                               for ct in adata.obs.seq_tech.value_counts().index]))

def create_figs(set_value, title):
    cdata = scu.generate_unicoord_in_adata(adata[cells,:].copy(),adata, 
                                       set_value=set_value)
    nn = find_nearest_neighbors(cdata,bdata, bdata.var_names)
    gen = pd.Series(['original']*len(bdata))
    gen[np.unique(nn)] = 'generated'
    gen = gen.astype('category')
    gen.index = bdata.obs_names
    bdata.obs['gen'] = gen
    bdata.uns['gen_colors'] = ['#F8E621','#440154']
    f = sc.pl.umap(bdata, color=['gen'], return_fig=True, s=20, title=title)
    return f

images = [create_figs({'seq_tech':t}, 'seq_tech=%s'%(t))\
          for t in bdata.obs.seq_tech.value_counts().index]

import os

gif = []
for f in images:
    f.savefig('./tmp.png')
    gif.append(imageio.imread('./tmp.png'))
    os.remove('./tmp.png')

imageio.mimsave('./hECALung_generation_seq_tech.gif', gif, duration = 0.5)

merge batch¶

cdata = scu.generate_unicoord_in_adata(adata, 
                                       set_value={'seq_tech':"10X"})

cdata

AnnData object with n_obs × n_vars = 54615 × 20770
    obs: 'user_id', 'study_id', 'cell_id', 'organ', 'region', 'subregion', 'seq_tech', 'sample_status', 'donor_id', 'donor_gender', 'donor_age', 'original_name', 'cl_name', 'hcad_name', 'tissue_type', 'cell_type', 'marker_gene', 'cid', 'RNA_snn_res.0.4', 'seurat_clusters', 'nCount_RNA', 'nFeature_RNA', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'unc_training'
    var: 'features'

sc.pp.normalize_total(cdata)
sc.pp.log1p(cdata)
sc.pp.highly_variable_genes(cdata)
cdata.raw = cdata
cdata = cdata[:, cdata.var.highly_variable]
sc.pp.scale(cdata)
sc.tl.pca(cdata)
sc.pp.neighbors(cdata)
sc.tl.leiden(cdata)
sc.tl.umap(cdata)

normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:15)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:09)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:17)
running Leiden clustering
    finished: found 38 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:13)
computing UMAP

G:\anaconda3\envs\torch_geo\lib\site-packages\sklearn\manifold\_spectral_embedding.py:260: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
  warnings.warn(

    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:52)

sc.pl.umap(cdata, color=['cell_type','seq_tech'], s = 1,
           ncols = 1)

ct_mapping = {"Type I alveolar cell" : "Epithelial cells",
"Type I alveolar cell/Type II alveolar cell" : "Epithelial cells",
"Type II alveolar cell" : "Epithelial cells",
"Club cell" : "Epithelial cells",
"Ciliated columnar cell" : "Epithelial cells",
"Perineural epithelial cell" : "Epithelial cells",
"Epithelial cell" : "Epithelial cells",
"Lymphatic endothelial cell" : "Endothelial cells",
"Vascular endothelial cell" : "Endothelial cells",
"Endothelial cell" : "Endothelial cells",
"Fibrocyte" : "Fibroblasts",
"Smooth muscle cell" : "Fibroblasts",
"Dendritic cell" : "Myeloid cells",
"Macrophage" : "Myeloid cells",
"Monocyte" : "Myeloid cells",
"Neutrophilic granulocyte" : "Myeloid cells",
"Myeloid cell" : "Myeloid cells",
"Mast cell" : "MAST cells",
"NK cell" : "T/NK cells",
"T cell" : "T/NK cells",
"CD8 T cell" : "T/NK cells",
"B cell" : "B lymphocytes",
"Plasma B cell" : "B lymphocytes",
"Chondrocyte" : "rare types",
"Megakaryocyte" : "rare types"}

adata.obs['cell_types_refined'] = [ct_mapping[c] if c in ct_mapping else "rare types" for c in adata.obs['cell_type']]

sc.pl.umap(adata, color=['cell_type','cell_types_refined','seq_tech'], 
           ncols = 1)

cdata.obs['cell_types_refined'] = list(adata.obs['cell_types_refined'])

sc.pl.umap(cdata, color=['cell_type','cell_types_refined','seq_tech'], s = 1,
           ncols = 1)