In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%load_ext line_profiler
In [3]:
import scanpy as sc

import random

import src.scanpy_unicoord as scu
import torch
from src.visualization import *
from line_profiler import LineProfiler
In [4]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
# sc.settings.set_figure_params(dpi=80, facecolor='white')
sc.settings.set_figure_params(vector_friendly=False)
scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.19.5 scipy==1.7.1 pandas==1.3.3 scikit-learn==1.0 statsmodels==0.13.0 python-igraph==0.9.6 pynndescent==0.5.4

load liver cancer data

In [5]:
adata = sc.read_h5ad(r'D:\hECA\Liver_cancer.pp.h5ad')
In [6]:
adata = adata.raw.to_adata()
sc.pp.normalize_total(adata, target_sum=1e4 ,exclude_highly_expressed= True)
sc.pp.log1p(adata)
adata
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['S100A9', 'S100A8', 'S100A6', 'RPS27', 'APOA2', 'RGS5', 'CHI3L1', 'REG1A', 'REG3A', 'TMSB10', 'GNLY', 'FABP1', 'IGKC', 'CCL20', 'MTRNR2L12', 'TF', 'APOD', 'IGFBP7', 'JCHAIN', 'ALB', 'SPP1', 'FGG', 'SPINK1', 'HLA-DRA', 'ACTB', 'IGFBP1', 'SERPINE1', 'TMSB4X', 'TIMP1', 'MTRNR2L10', 'FABP4', 'CCL19', 'CCL21', 'TXN', 'ORM1', 'HSPA5', 'HBB', 'HBG2', 'MTRNR2L8', 'SAA2', 'SAA1', 'PGA5', 'FTH1', 'NEAT1', 'MALAT1', 'APOC3', 'APOA1', 'ACTA2', 'GAPDH', 'IFNG', 'LYZ', 'NTS', 'LUM', 'GZMB', 'SERPINA1', 'HSP90AA1', 'IGHA2', 'IGHG4', 'IGHG2', 'IGHGP', 'IGHA1', 'IGHG1', 'IGHG3', 'IGHD', 'IGHM', 'IGHV3-23', 'B2M', 'HBA2', 'HBA1', 'TPSB2', 'TPSAB1', 'MT2A', 'MT1G', 'MT1X', 'HP', 'PLCG2', 'MTRNR2L1', 'CCL3', 'CCL4', 'CCL3L3', 'CCL4L2', 'COL1A1', 'APOH', 'TTR', 'APOE', 'APOC1', 'FTL', 'IGLL5', 'IGLC2', 'IGLC3', 'TFF3', 'TFF1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP6', 'MT-CO3']
    finished (0:00:00)
Out[6]:
AnnData object with n_obs × n_vars = 47497 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'

model and training

In [7]:
scu.model_unicoord_in_adata(adata, n_cont=50, n_diff=0, n_clus = [],
                            obs_fitting=['Type'])
In [8]:
scu.train_unicoord_in_adata(adata, epochs=10, chunk_size=20000, slot = "cur")
training chunk 1 / 3 of the data
Epoch 10: 100%|████████████████████████████████████████████| 10/10 [00:22<00:00,  2.26s/it, Epoch_average_loss=2603.86]
training chunk 2 / 3 of the data
Epoch 10: 100%|████████████████████████████████████████████| 10/10 [00:22<00:00,  2.26s/it, Epoch_average_loss=2527.81]
training chunk 3 / 3 of the data
Epoch 10: 100%|████████████████████████████████████████████| 10/10 [00:08<00:00,  1.13it/s, Epoch_average_loss=2523.50]
In [9]:
fig = draw_loss_curves(adata.uns['unc_stuffs']['trainer'].losses)
# if save_figs:
#     fig.savefig(os.path.join(savePath, 'img', 'fig1_lossCurves.png'))
fig.show()
<ipython-input-9-17c745c75a86>:4: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()
In [27]:
scu.embed_unicoord_in_adata(adata, chunk_size=5000)
In [28]:
sc.pp.neighbors(adata, use_rep='unicoord')
computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
In [29]:
sc.tl.leiden(adata, resolution=0.5)
running Leiden clustering
    finished: found 25 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:19)
In [30]:
sc.tl.umap(adata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:43)
In [1]:
sc.pl.embedding(adata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['leiden','Type','S_ID','Sample'], ncols=2)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-5c7a4a5e5a3d> in <module>
----> 1 sc.pl.embedding(adata, 'X_umap',legend_loc='on data', legend_fontsize=10,
      2                 color=['leiden','Type','S_ID','Sample'], ncols=2)

NameError: name 'sc' is not defined
In [43]:
scu.predcit_unicoord_in_adata(adata)
In [44]:
ct1 = adata.obs.Type
ct2 = adata.obs.Type_unc_infered
ct1 = ct1[adata.obs.Type != 'unclassified']
ct2 = ct2[adata.obs.Type != 'unclassified']

accuracy_score(ct1, ct2)
Out[44]:
0.9350523926795113

predict test set

In [32]:
bdata = adata[~adata.obs.unc_training,:].copy()
bdata
Out[32]:
AnnData object with n_obs × n_vars = 9499 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'neighbors', 'pca', 'umap', 'log1p', 'unc_stuffs'
    obsm: 'X_pca', 'X_umap', 'unicoord'
    obsp: 'distances', 'connectivities'
In [33]:
scu.predcit_unicoord_in_adata(bdata, adata)
In [35]:
sc.pl.embedding(bdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color= ['Type', 'Type_unc_infered'], ncols=2)
In [36]:
from sklearn.metrics import accuracy_score
In [45]:
ct1 = bdata.obs.Type
ct2 = bdata.obs.Type_unc_infered
ct1 = ct1[bdata.obs.Type != 'unclassified']
ct2 = ct2[bdata.obs.Type != 'unclassified']

accuracy_score(ct1, ct2)
Out[45]:
0.9334374624534423
In [39]:
bdata.obs.groupby(['Type','Type_unc_infered']).size().unstack()
Out[39]:
Type_unc_infered B cells CAFs Malignant cells T cells TAMs TECs rare types unclassified
Type
B cells 473 0 0 0 3 0 0 1
CAFs 0 298 0 0 0 1 0 0
Malignant cells 0 3 1778 1 0 3 262 244
T cells 1 0 0 3781 1 0 0 10
TAMs 5 0 1 0 906 0 0 18
TECs 0 0 0 0 0 533 0 0
unclassified 50 74 97 421 89 80 33 332

generate data

In [10]:
import itertools
cells = list(itertools.chain(*[random.sample(list(adata.obs_names[adata.obs.Type==ct]), 1000) \
                               for ct in adata.obs.Type.value_counts().index if ct!='unclassified']))
In [11]:
bdata = adata[cells,:].copy()
bdata
Out[11]:
AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p', 'unc_stuffs'
    obsm: 'X_pca', 'X_umap'
In [13]:
scu.predcit_unicoord_in_adata(bdata, adata)
In [14]:
sc.pl.embedding(bdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color= ['Type', 'Type_unc_infered'], ncols=2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Type_unc_infered' as categorical

gen without set cell type

In [56]:
cdata = scu.generate_unicoord_in_adata(bdata, adata)
In [57]:
sc.pp.normalize_total(cdata)
sc.pp.log1p(cdata)
sc.pp.highly_variable_genes(cdata)
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
In [58]:
cdata.raw = cdata
cdata = cdata[:, cdata.var.highly_variable]
cdata
Out[58]:
View of AnnData object with n_obs × n_vars = 6000 × 2250
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
In [59]:
sc.pp.scale(cdata)
G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
In [60]:
sc.tl.pca(cdata)
sc.pp.neighbors(cdata)
sc.tl.leiden(cdata)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 26 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
In [61]:
sc.tl.umap(cdata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)
In [62]:
sc.pl.umap(cdata, color='Type')
In [63]:
cdata = cdata.raw.to_adata()
In [64]:
scu.predcit_unicoord_in_adata(cdata,adata)
In [67]:
sc.pl.umap(cdata, color=['Type','Type_unc_infered', 'CD3E','COL1A1'], ncols = 2)

gen and setting to T cells

In [68]:
cdata = scu.generate_unicoord_in_adata(bdata, adata,
                                       set_value={'Type':'T cells'})
In [69]:
sc.pp.normalize_total(cdata)
sc.pp.log1p(cdata)
sc.pp.highly_variable_genes(cdata)
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
In [70]:
cdata.raw = cdata
cdata = cdata[:, cdata.var.highly_variable]
cdata
Out[70]:
View of AnnData object with n_obs × n_vars = 6000 × 1633
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
In [71]:
sc.pp.scale(cdata)
G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
In [72]:
sc.tl.pca(cdata)
sc.pp.neighbors(cdata)
sc.tl.leiden(cdata)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 11 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
In [73]:
sc.tl.umap(cdata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)
In [74]:
sc.pl.umap(cdata, color='Type')
In [75]:
cdata = cdata.raw.to_adata()
In [76]:
scu.predcit_unicoord_in_adata(cdata,adata)
In [77]:
sc.pl.umap(cdata, color=['Type','Type_unc_infered', 'CD3E','COL1A1'], ncols = 2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Type_unc_infered' as categorical

gen from only T cells

In [87]:
bdata = adata[adata.obs.Type == 'T cells',:][:6000,:].copy()
bdata
Out[87]:
AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p', 'unc_stuffs'
    obsm: 'X_pca', 'X_umap'
In [88]:
cdata = scu.generate_unicoord_in_adata(bdata, adata)
In [89]:
sc.pp.normalize_total(cdata)
sc.pp.log1p(cdata)
sc.pp.highly_variable_genes(cdata)
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
In [90]:
cdata.raw = cdata
cdata = cdata[:, cdata.var.highly_variable]
cdata
Out[90]:
View of AnnData object with n_obs × n_vars = 6000 × 1554
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
In [91]:
sc.pp.scale(cdata)
G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
In [92]:
sc.tl.pca(cdata)
sc.pp.neighbors(cdata)
sc.tl.leiden(cdata)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 14 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
In [93]:
sc.tl.umap(cdata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)
In [94]:
sc.pl.umap(cdata, color='Type')
In [95]:
cdata = cdata.raw.to_adata()
In [96]:
scu.predcit_unicoord_in_adata(cdata,adata)
In [97]:
sc.pl.umap(cdata, color=['Type','Type_unc_infered', 'CD3E','COL1A1'], ncols = 2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Type_unc_infered' as categorical

gen and setting to CAF cells

In [98]:
bdata = adata[cells,:].copy()
bdata
Out[98]:
AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p', 'unc_stuffs'
    obsm: 'X_pca', 'X_umap'
In [99]:
cdata = scu.generate_unicoord_in_adata(bdata, adata,
                                       set_value={'Type':'CAFs'})
In [100]:
sc.pp.normalize_total(cdata)
sc.pp.log1p(cdata)
sc.pp.highly_variable_genes(cdata)
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:02)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
In [101]:
cdata.raw = cdata
cdata = cdata[:, cdata.var.highly_variable]
cdata
Out[101]:
View of AnnData object with n_obs × n_vars = 6000 × 1664
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
In [102]:
sc.pp.scale(cdata)
G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
In [103]:
sc.tl.pca(cdata)
sc.pp.neighbors(cdata)
sc.tl.leiden(cdata)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 13 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
In [104]:
sc.tl.umap(cdata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:11)
In [105]:
sc.pl.umap(cdata, color='Type')
In [106]:
cdata = cdata.raw.to_adata()
In [107]:
scu.predcit_unicoord_in_adata(cdata,adata)
In [108]:
sc.pl.umap(cdata, color=['Type','Type_unc_infered', 'CD3E', 'COL1A1'], ncols = 2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Type_unc_infered' as categorical

integrate gen and native data

gen & native

In [15]:
cdata = scu.generate_unicoord_in_adata(bdata,adata)
In [16]:
cdata
Out[16]:
AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'
In [17]:
ddata = cdata.concatenate(bdata, index_unique=None)
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
In [18]:
ddata
Out[18]:
AnnData object with n_obs × n_vars = 12000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'mt-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1'
In [19]:
sc.pp.normalize_total(ddata)
sc.pp.log1p(ddata)
sc.pp.highly_variable_genes(ddata)
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
In [20]:
ddata.raw = ddata
ddata = ddata[:, ddata.var.highly_variable]
ddata
Out[20]:
View of AnnData object with n_obs × n_vars = 12000 × 2457
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'mt-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
In [21]:
sc.pp.scale(ddata)
G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
In [22]:
sc.tl.pca(ddata)
sc.pp.neighbors(ddata)
sc.tl.leiden(ddata)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:11)
running Leiden clustering
    finished: found 37 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
In [23]:
sc.tl.umap(ddata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)
In [25]:
sc.pl.umap(ddata, color=['Type','batch', 'CD3E', 'COL1A1'], 
           legend_loc='on data', ncols=2)

gen (T) & native

In [29]:
cdata = scu.generate_unicoord_in_adata(bdata,adata, set_value={'Type':'T cells'})
cdata
Out[29]:
AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'
In [31]:
ddata = cdata.concatenate(bdata, index_unique=None)
ddata
Out[31]:
AnnData object with n_obs × n_vars = 12000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'mt-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1'
In [32]:
sc.pp.normalize_total(ddata)
sc.pp.log1p(ddata)
sc.pp.highly_variable_genes(ddata)
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
In [33]:
ddata.raw = ddata
ddata = ddata[:, ddata.var.highly_variable]
ddata
Out[33]:
View of AnnData object with n_obs × n_vars = 12000 × 2333
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'mt-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
In [34]:
sc.pp.scale(ddata)
G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
In [35]:
sc.tl.pca(ddata)
sc.pp.neighbors(ddata)
sc.tl.leiden(ddata)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 29 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:01)
In [36]:
sc.tl.umap(ddata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:09)
In [37]:
sc.pl.umap(ddata, color=['Type','batch', 'CD3E', 'COL1A1'], 
           legend_loc='on data', ncols=2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Cell' as categorical

gen (CAF) & native

In [39]:
cdata = scu.generate_unicoord_in_adata(bdata,adata, set_value={'Type':'CAFs'})
cdata
Out[39]:
AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'
In [40]:
ddata = cdata.concatenate(bdata, index_unique=None)
ddata
Out[40]:
AnnData object with n_obs × n_vars = 12000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'mt-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1'
In [41]:
sc.pp.normalize_total(ddata)
sc.pp.log1p(ddata)
sc.pp.highly_variable_genes(ddata)
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
In [42]:
ddata.raw = ddata
ddata = ddata[:, ddata.var.highly_variable]
ddata
Out[42]:
View of AnnData object with n_obs × n_vars = 12000 × 2362
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'mt-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
In [43]:
sc.pp.scale(ddata)
G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
In [44]:
sc.tl.pca(ddata)
sc.pp.neighbors(ddata)
sc.tl.leiden(ddata)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 27 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:01)
In [45]:
sc.tl.umap(ddata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:09)
In [46]:
sc.pl.umap(ddata, color=['Type','batch', 'CD3E', 'COL1A1'], 
           legend_loc='on data', ncols=2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Cell' as categorical

gen (T) & gen

In [47]:
cdata = scu.generate_unicoord_in_adata(bdata,adata, set_value={'Type':'T cells'})
cdata
Out[47]:
AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'
In [48]:
cdata1 = scu.generate_unicoord_in_adata(bdata,adata)
cdata1
Out[48]:
AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'
In [49]:
ddata = cdata.concatenate(cdata1)
ddata
Out[49]:
AnnData object with n_obs × n_vars = 12000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features'
In [50]:
sc.pp.normalize_total(ddata)
sc.pp.log1p(ddata)
sc.pp.highly_variable_genes(ddata)
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:02)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
In [51]:
ddata.raw = ddata
ddata = ddata[:, ddata.var.highly_variable]
ddata
Out[51]:
View of AnnData object with n_obs × n_vars = 12000 × 2124
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
In [52]:
sc.pp.scale(ddata)
G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
In [53]:
sc.tl.pca(ddata)
sc.pp.neighbors(ddata)
sc.tl.leiden(ddata)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 24 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:01)
In [54]:
sc.tl.umap(ddata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:09)
In [55]:
sc.pl.umap(ddata, color=['Type','batch', 'CD3E', 'COL1A1'], 
           legend_loc='on data', ncols=2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Cell' as categorical

gen (CAFs) & gen

In [78]:
cdata = scu.generate_unicoord_in_adata(bdata,adata, set_value={'Type':'CAFs'})
cdata
Out[78]:
AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'
In [79]:
cdata1 = scu.generate_unicoord_in_adata(bdata,adata)
cdata1
Out[79]:
AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'
In [80]:
ddata = cdata.concatenate(cdata1)
ddata
Out[80]:
AnnData object with n_obs × n_vars = 12000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features'
In [81]:
sc.pp.normalize_total(ddata)
sc.pp.log1p(ddata)
sc.pp.highly_variable_genes(ddata)
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:02)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
In [82]:
ddata.raw = ddata
ddata = ddata[:, ddata.var.highly_variable]
ddata
Out[82]:
View of AnnData object with n_obs × n_vars = 12000 × 2169
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
In [83]:
sc.pp.scale(ddata)
G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
In [84]:
sc.tl.pca(ddata)
sc.pp.neighbors(ddata)
sc.tl.leiden(ddata)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 29 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:01)
In [85]:
sc.tl.umap(ddata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:09)
In [86]:
sc.pl.umap(ddata, color=['Type','batch', 'CD3E', 'COL1A1'], 
           legend_loc='on data', ncols=2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Cell' as categorical