In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%load_ext line_profiler
In [2]:
import scanpy as sc
from scipy.io import mmread
import random
import pandas as pd

import src.scanpy_unicoord as scu
import torch
from src.visualization import *
from line_profiler import LineProfiler
import imageio
In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
# sc.settings.set_figure_params(dpi=80, facecolor='white')
sc.settings.set_figure_params(vector_friendly=False)
scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.22.3 scipy==1.7.1 pandas==1.3.3 scikit-learn==1.0 statsmodels==0.13.0 python-igraph==0.9.6 pynndescent==0.5.4

load data

In [4]:
mtx_file = './datasets/Heart_EC/exploratory.data.allgene.mtx'
meta_file = './datasets/Heart_EC/exploratory.metadata.csv'
In [5]:
anno = pd.read_csv(meta_file, index_col=0)
In [6]:
mtx = mmread(mtx_file)
In [7]:
mtx
Out[7]:
<12030x11899 sparse matrix of type '<class 'numpy.float64'>'
	with 17513716 stored elements in COOrdinate format>
In [8]:
genes = pd.read_csv('./datasets/Heart_EC/gene_names.txt', index_col=1)
del genes['Unnamed: 0']
In [9]:
adata = sc.AnnData(X = mtx.T.tocsr(), obs = anno, var = genes)
In [10]:
adata
Out[10]:
AnnData object with n_obs × n_vars = 11899 × 12030
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Observation', 'Tissue', 'Endothelial.cell', 'Cluster', 'RNA_snn_res.1', 'seurat_clusters', 'ACV', 'RNA_snn_res.0.3', 'RNA_snn_res.0.1', 'group', 'integrated_snn_res.1', 'integrated_snn_res.0.2', 'integrated_snn_res.0.1', 'cluster', 'diffusion.pseudotime'
In [11]:
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata)
adata.raw = adata
adata = adata[:, adata.var.highly_variable]
sc.pp.scale(adata)
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.leiden(adata)
sc.tl.umap(adata)
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:03)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:12)
running Leiden clustering
    finished: found 10 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:01)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:09)
In [12]:
sc.pl.umap(adata, color=['Cluster','Tissue', 'ACV','diffusion.pseudotime'], 
           ncols = 1)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'orig.ident' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Tissue' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Endothelial.cell' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Cluster' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'ACV' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'group' as categorical
In [13]:
adata = adata.raw.to_adata()
In [14]:
adata.write_h5ad('./datasets/Heart_EC/Heart_EC.h5ad')

model

In [15]:
adata
Out[15]:
AnnData object with n_obs × n_vars = 11899 × 12030
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Observation', 'Tissue', 'Endothelial.cell', 'Cluster', 'RNA_snn_res.1', 'seurat_clusters', 'ACV', 'RNA_snn_res.0.3', 'RNA_snn_res.0.1', 'group', 'integrated_snn_res.1', 'integrated_snn_res.0.2', 'integrated_snn_res.0.1', 'cluster', 'diffusion.pseudotime', 'leiden'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap', 'Cluster_colors', 'Tissue_colors', 'ACV_colors'
    obsm: 'X_pca', 'X_umap'
In [16]:
diff = adata.obs['diffusion.pseudotime']
diff = (diff-min(diff))/(max(diff)-min(diff))

adata.obs['diff_scale'] = diff * 20
In [17]:
scu.model_unicoord_in_adata(adata, n_cont=50, n_diff=0, n_clus = [],
                            obs_fitting=['Tissue','diff_scale'])
In [18]:
scu.train_unicoord_in_adata(adata, epochs=100, chunk_size=20000, slot = "cur")
training chunk 1 / 1 of the data
Epoch 100: 100%|█████████████████████████████████████████| 100/100 [01:52<00:00,  1.12s/it, Epoch_average_loss=2389.29]
In [19]:
fig = draw_loss_curves(adata.uns['unc_stuffs']['trainer'].losses)
# if save_figs:
#     fig.savefig(os.path.join(savePath, 'img', 'fig1_lossCurves.png'))
fig.show()
<ipython-input-19-17c745c75a86>:4: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()
In [20]:
bdata = adata[~adata.obs.unc_training,:].copy()
In [21]:
scu.predcit_unicoord_in_adata(bdata,adata)
In [22]:
sc.pl.umap(bdata, color=['Tissue','Tissue_unc_infered', 
                         'diff_scale','diff_scale_unc_infered'], 
           ncols = 2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Tissue_unc_infered' as categorical

generation

In [23]:
bdata = scu.generate_unicoord_in_adata(adata)
bdata
Out[23]:
AnnData object with n_obs × n_vars = 11899 × 12030
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Observation', 'Tissue', 'Endothelial.cell', 'Cluster', 'RNA_snn_res.1', 'seurat_clusters', 'ACV', 'RNA_snn_res.0.3', 'RNA_snn_res.0.1', 'group', 'integrated_snn_res.1', 'integrated_snn_res.0.2', 'integrated_snn_res.0.1', 'cluster', 'diffusion.pseudotime', 'leiden', 'diff_scale', 'unc_training'
    var: 'features'
In [24]:
bdata.obsm['X_umap'] = adata.obsm['X_umap']
In [25]:
sc.pl.umap(bdata, color=['Cluster','Tissue', 'ACV','diff_scale'], 
           ncols = 1)
In [26]:
# bdata = bdata.raw.to_adata()
scu.predcit_unicoord_in_adata(bdata,adata)
In [27]:
sc.pl.umap(bdata, color=['Tissue','Tissue_unc_infered', 
                         'diff_scale','diff_scale_unc_infered'], 
           ncols = 2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Tissue_unc_infered' as categorical

set diff

In [28]:
from src.utils import pairwise_distances
def find_nearest_neighbors(query, ref, genes):
    gquery = query[:,genes].X
    gref = ref[:,genes].X
    dis = pairwise_distances(torch.FloatTensor(gquery), torch.FloatTensor(gref))
    return np.array(np.argmin(dis, axis = 1))
In [29]:
import itertools
cells = list(itertools.chain(*[random.sample(list(adata.obs_names[adata.obs.Tissue==ct]), 500) \
                               for ct in adata.obs.Tissue.value_counts().index]))
In [41]:
def create_figs(set_value, title):
    cdata = scu.generate_unicoord_in_adata(adata[cells,:].copy(),adata, 
                                       set_value=set_value)
    nn = find_nearest_neighbors(cdata,bdata, bdata.var_names)
    gen = pd.Series(['original']*len(bdata))
    gen[np.unique(nn)] = 'generated'
    gen = gen.astype('category')
    gen.index = bdata.obs_names
    bdata.obs['gen'] = gen
    bdata.uns['gen_colors'] = ['#F8E621','#440154']
    f = sc.pl.umap(bdata, color=['gen'], return_fig=True, s=20, title=import ostitle)
    return f
In [42]:
images = [create_figs({'diff_scale':np.random.random(len(cdata))+i}, 'pst=%d'%(i))\
          for i in range(20)]
In [43]:
images
Out[43]:
[<Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>]
In [47]:
gif = []
for f in images:
    f.savefig('./tmp.png')
    gif.append(imageio.imread('./tmp.png'))
    os.remove('./tmp.png')
In [49]:
imageio.mimsave('./EC_generation_pst.gif', gif)
In [74]:
f = create_figs({'Tissue':'brain','diff_scale':5}, "'Tissue':'brain','diff_scale':5")
In [79]:
f = create_figs({}, "no manipulation")
In [75]:
cdata
Out[75]:
AnnData object with n_obs × n_vars = 2000 × 12030
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Observation', 'Tissue', 'Endothelial.cell', 'Cluster', 'RNA_snn_res.1', 'seurat_clusters', 'ACV', 'RNA_snn_res.0.3', 'RNA_snn_res.0.1', 'group', 'integrated_snn_res.1', 'integrated_snn_res.0.2', 'integrated_snn_res.0.1', 'cluster', 'diffusion.pseudotime', 'leiden', 'diff_scale', 'unc_training'
    var: 'features'

set tissue

In [41]:
def create_figs(set_value, title):
    cdata = scu.generate_unicoord_in_adata(adata[cells,:].copy(),adata, 
                                       set_value=set_value)
    nn = find_nearest_neighbors(cdata,bdata, bdata.var_names)
    gen = pd.Series(['original']*len(bdata))
    gen[np.unique(nn)] = 'generated'
    gen = gen.astype('category')
    gen.index = bdata.obs_names
    bdata.obs['gen'] = gen
    bdata.uns['gen_colors'] = ['#F8E621','#440154']
    f = sc.pl.umap(bdata, color=['gen'], return_fig=True, s=20, title=title)
    return f
In [51]:
images = [create_figs({'Tissue':t}, 'organ=%s'%(t))\
          for t in cdata.obs.Tissue.value_counts().index]
In [52]:
images
Out[52]:
[<Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>,
 <Figure size 320x320 with 1 Axes>]
In [53]:
gif = []
for f in images:
    f.savefig('./tmp.png')
    gif.append(imageio.imread('./tmp.png'))
    os.remove('./tmp.png')
In [56]:
imageio.mimsave('./EC_generation_organs.gif', gif, duration = 0.5)
In [ ]:
import scanpy_unicoord as scu
import scanpy as sc

adata = sc.read_h5ad('./Liver_cancer.h5ad')

scu.model_unicoord_in_adata(adata, n_cont=50, n_diff=0, n_disc, n_clus = [],
                            obs_fitting=['Tissue','pst','Cell_type','batch'])

scu.train_unicoord_in_adata(adata, epochs=10, chunk_size=20000, slot = "cur")

scu.embed_unicoord_in_adata(adata, only_unsup=True)

bdata = sc.read_h5ad('./Lung_cancer.h5ad')
scu.predcit_unicoord_in_adata(bdata, adata)

cdata = scu.generate_unicoord_in_adata(adata[cells,:].copy(),adata, 
                                       set_value={'Cell_type':'T cells', 'pst':5})
In [78]:
adata.uns['unc_stuffs'].keys()
Out[78]:
dict_keys(['genes_used', 'obs_fitting', 'disc_mapping', 'n_dims', 'dim_definition', 'model', 'loss_weights', 'optimizer', 'slot', 'trainer'])