%matplotlib inline
%load_ext autoreload
%autoreload 2
%load_ext line_profiler

import scanpy as sc

import random

import src.scanpy_unicoord as scu
import torch
from src.visualization import *
from line_profiler import LineProfiler

sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
# sc.settings.set_figure_params(dpi=80, facecolor='white')
sc.settings.set_figure_params(vector_friendly=False)

scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.19.5 scipy==1.7.1 pandas==1.3.3 scikit-learn==1.0 statsmodels==0.13.0 python-igraph==0.9.6 pynndescent==0.5.4

Load liver cancer data

adata = sc.read_h5ad(r'D:\hECA\Liver_cancer.pp.h5ad')

# Rebuild the AnnData from the matrix stored in `.raw`, then apply
# library-size normalisation followed by a log1p transform.
adata = adata.raw.to_adata()
sc.pp.normalize_total(
    adata,
    target_sum=1e4,
    exclude_highly_expressed=True,
)
sc.pp.log1p(adata)
adata

normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['S100A9', 'S100A8', 'S100A6', 'RPS27', 'APOA2', 'RGS5', 'CHI3L1', 'REG1A', 'REG3A', 'TMSB10', 'GNLY', 'FABP1', 'IGKC', 'CCL20', 'MTRNR2L12', 'TF', 'APOD', 'IGFBP7', 'JCHAIN', 'ALB', 'SPP1', 'FGG', 'SPINK1', 'HLA-DRA', 'ACTB', 'IGFBP1', 'SERPINE1', 'TMSB4X', 'TIMP1', 'MTRNR2L10', 'FABP4', 'CCL19', 'CCL21', 'TXN', 'ORM1', 'HSPA5', 'HBB', 'HBG2', 'MTRNR2L8', 'SAA2', 'SAA1', 'PGA5', 'FTH1', 'NEAT1', 'MALAT1', 'APOC3', 'APOA1', 'ACTA2', 'GAPDH', 'IFNG', 'LYZ', 'NTS', 'LUM', 'GZMB', 'SERPINA1', 'HSP90AA1', 'IGHA2', 'IGHG4', 'IGHG2', 'IGHGP', 'IGHA1', 'IGHG1', 'IGHG3', 'IGHD', 'IGHM', 'IGHV3-23', 'B2M', 'HBA2', 'HBA1', 'TPSB2', 'TPSAB1', 'MT2A', 'MT1G', 'MT1X', 'HP', 'PLCG2', 'MTRNR2L1', 'CCL3', 'CCL4', 'CCL3L3', 'CCL4L2', 'COL1A1', 'APOH', 'TTR', 'APOE', 'APOC1', 'FTL', 'IGLL5', 'IGLC2', 'IGLC3', 'TFF3', 'TFF1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP6', 'MT-CO3']
    finished (0:00:00)

AnnData object with n_obs × n_vars = 47497 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'

Model and training

scu.model_unicoord_in_adata(adata, n_cont=50, n_diff=0, n_clus = [],
                            obs_fitting=['Type'])

scu.train_unicoord_in_adata(adata, epochs=10, chunk_size=20000, slot = "cur")

training chunk 1 / 3 of the data

Epoch 10: 100%|████████████████████████████████████████████| 10/10 [00:22<00:00,  2.26s/it, Epoch_average_loss=2603.86]

training chunk 2 / 3 of the data

Epoch 10: 100%|████████████████████████████████████████████| 10/10 [00:22<00:00,  2.26s/it, Epoch_average_loss=2527.81]

training chunk 3 / 3 of the data

Epoch 10: 100%|████████████████████████████████████████████| 10/10 [00:08<00:00,  1.13it/s, Epoch_average_loss=2523.50]

# Plot the loss history recorded by the trainer kept in adata.uns['unc_stuffs'].
fig = draw_loss_curves(adata.uns['unc_stuffs']['trainer'].losses)
# if save_figs:
#     fig.savefig(os.path.join(savePath, 'img', 'fig1_lossCurves.png'))
# NOTE(review): with the inline (non-GUI) matplotlib backend, fig.show() only
# emits a UserWarning and cannot display the figure — consider removing it.
fig.show()

<ipython-input-9-17c745c75a86>:4: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()

scu.embed_unicoord_in_adata(adata, chunk_size=5000)

sc.pp.neighbors(adata, use_rep='unicoord')

computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)

sc.tl.leiden(adata, resolution=0.5)

running Leiden clustering
    finished: found 25 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:19)

sc.tl.umap(adata)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:43)

sc.pl.embedding(adata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['leiden','Type','S_ID','Sample'], ncols=2)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-5c7a4a5e5a3d> in <module>
----> 1 sc.pl.embedding(adata, 'X_umap',legend_loc='on data', legend_fontsize=10,
      2                 color=['leiden','Type','S_ID','Sample'], ncols=2)

NameError: name 'sc' is not defined

scu.predcit_unicoord_in_adata(adata)

# accuracy_score is imported in this cell because the notebook's only other
# import of it sits in a later cell; without it, running the notebook from top
# to bottom fails here with a NameError.
from sklearn.metrics import accuracy_score

# Compare the annotated cell type with the inferred label (column
# 'Type_unc_infered'), ignoring cells annotated as 'unclassified'.
classified = adata.obs.Type != 'unclassified'
ct1 = adata.obs.Type[classified]
ct2 = adata.obs.Type_unc_infered[classified]

accuracy_score(ct1, ct2)

0.9350523926795113

Predict on the test set

bdata = adata[~adata.obs.unc_training,:].copy()
bdata

AnnData object with n_obs × n_vars = 9499 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'neighbors', 'pca', 'umap', 'log1p', 'unc_stuffs'
    obsm: 'X_pca', 'X_umap', 'unicoord'
    obsp: 'distances', 'connectivities'

scu.predcit_unicoord_in_adata(bdata, adata)

sc.pl.embedding(bdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color= ['Type', 'Type_unc_infered'], ncols=2)

from sklearn.metrics import accuracy_score

# Score the inferred labels against the annotation, leaving out every cell
# whose annotated type is 'unclassified'.
keep = bdata.obs.Type != 'unclassified'
ct1 = bdata.obs.Type[keep]
ct2 = bdata.obs.Type_unc_infered[keep]

accuracy_score(ct1, ct2)

0.9334374624534423

# Cross-tabulate annotated type (rows) against inferred type (columns).
pair_counts = bdata.obs.groupby(['Type', 'Type_unc_infered']).size()
pair_counts.unstack()

Generate data

import itertools

# Balanced subsample: up to `n_per_type` randomly chosen cells from every
# annotated cell type, skipping the 'unclassified' label. The draw is not
# seeded, so the selected cells differ from run to run.
n_per_type = 1000
names_by_type = (
    list(adata.obs_names[adata.obs.Type == ct])
    for ct in adata.obs.Type.value_counts().index
    if ct != 'unclassified'
)
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the cells available for that type.
cells = list(itertools.chain.from_iterable(
    random.sample(names, min(n_per_type, len(names)))
    for names in names_by_type
))

bdata = adata[cells,:].copy()
bdata

AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p', 'unc_stuffs'
    obsm: 'X_pca', 'X_umap'

scu.predcit_unicoord_in_adata(bdata, adata)

sc.pl.embedding(bdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color= ['Type', 'Type_unc_infered'], ncols=2)

G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Type_unc_infered' as categorical

Generate without setting a cell type

cdata = scu.generate_unicoord_in_adata(bdata, adata)

# Preprocess the generated cells: library-size normalisation, log1p transform,
# then flag highly variable genes in cdata.var['highly_variable'].
sc.pp.normalize_total(cdata)
sc.pp.log1p(cdata)
sc.pp.highly_variable_genes(cdata)

normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

# Keep the full log-normalised matrix in `.raw` so every gene stays available
# after restricting to the highly variable ones.
cdata.raw = cdata
# Copy explicitly: slicing returns a view, and sc.pp.scale would otherwise have
# to turn that view into a real object itself (emitting a UserWarning).
cdata = cdata[:, cdata.var.highly_variable].copy()
cdata

View of AnnData object with n_obs × n_vars = 6000 × 2250
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'

sc.pp.scale(cdata)

G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)

# PCA, then a neighbour graph computed on the PCA representation, then Leiden
# clustering of that graph.
sc.tl.pca(cdata)
sc.pp.neighbors(cdata)
sc.tl.leiden(cdata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 26 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)

sc.tl.umap(cdata)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)

sc.pl.umap(cdata, color='Type')

cdata = cdata.raw.to_adata()

scu.predcit_unicoord_in_adata(cdata,adata)

sc.pl.umap(cdata, color=['Type','Type_unc_infered', 'CD3E','COL1A1'], ncols = 2)

Generate with the cell type set to T cells

cdata = scu.generate_unicoord_in_adata(bdata, adata,
                                       set_value={'Type':'T cells'})

sc.pp.normalize_total(cdata)
sc.pp.log1p(cdata)
sc.pp.highly_variable_genes(cdata)

normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

# Keep the full log-normalised matrix in `.raw` so every gene stays available
# after restricting to the highly variable ones.
cdata.raw = cdata
# Copy explicitly: slicing returns a view, and sc.pp.scale would otherwise have
# to turn that view into a real object itself (emitting a UserWarning).
cdata = cdata[:, cdata.var.highly_variable].copy()
cdata

View of AnnData object with n_obs × n_vars = 6000 × 1633
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'

sc.pp.scale(cdata)

G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)

sc.tl.pca(cdata)
sc.pp.neighbors(cdata)
sc.tl.leiden(cdata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 11 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)

sc.tl.umap(cdata)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)

sc.pl.umap(cdata, color='Type')

cdata = cdata.raw.to_adata()

scu.predcit_unicoord_in_adata(cdata,adata)

sc.pl.umap(cdata, color=['Type','Type_unc_infered', 'CD3E','COL1A1'], ncols = 2)

G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Type_unc_infered' as categorical

Generate from T cells only

bdata = adata[adata.obs.Type == 'T cells',:][:6000,:].copy()
bdata

AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p', 'unc_stuffs'
    obsm: 'X_pca', 'X_umap'

cdata = scu.generate_unicoord_in_adata(bdata, adata)

sc.pp.normalize_total(cdata)
sc.pp.log1p(cdata)
sc.pp.highly_variable_genes(cdata)

normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

# Keep the full log-normalised matrix in `.raw` so every gene stays available
# after restricting to the highly variable ones.
cdata.raw = cdata
# Copy explicitly: slicing returns a view, and sc.pp.scale would otherwise have
# to turn that view into a real object itself (emitting a UserWarning).
cdata = cdata[:, cdata.var.highly_variable].copy()
cdata

View of AnnData object with n_obs × n_vars = 6000 × 1554
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'

sc.pp.scale(cdata)

G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)

sc.tl.pca(cdata)
sc.pp.neighbors(cdata)
sc.tl.leiden(cdata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 14 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)

sc.tl.umap(cdata)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)

sc.pl.umap(cdata, color='Type')

cdata = cdata.raw.to_adata()

scu.predcit_unicoord_in_adata(cdata,adata)

sc.pl.umap(cdata, color=['Type','Type_unc_infered', 'CD3E','COL1A1'], ncols = 2)

G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Type_unc_infered' as categorical

Generate with the cell type set to CAFs

bdata = adata[cells,:].copy()
bdata

AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p', 'unc_stuffs'
    obsm: 'X_pca', 'X_umap'

cdata = scu.generate_unicoord_in_adata(bdata, adata,
                                       set_value={'Type':'CAFs'})

sc.pp.normalize_total(cdata)
sc.pp.log1p(cdata)
sc.pp.highly_variable_genes(cdata)

normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:02)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

# Keep the full log-normalised matrix in `.raw` so every gene stays available
# after restricting to the highly variable ones.
cdata.raw = cdata
# Copy explicitly: slicing returns a view, and sc.pp.scale would otherwise have
# to turn that view into a real object itself (emitting a UserWarning).
cdata = cdata[:, cdata.var.highly_variable].copy()
cdata

View of AnnData object with n_obs × n_vars = 6000 × 1664
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'

sc.pp.scale(cdata)

G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)

sc.tl.pca(cdata)
sc.pp.neighbors(cdata)
sc.tl.leiden(cdata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 13 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)

sc.tl.umap(cdata)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:11)

sc.pl.umap(cdata, color='Type')

cdata = cdata.raw.to_adata()

scu.predcit_unicoord_in_adata(cdata,adata)

sc.pl.umap(cdata, color=['Type','Type_unc_infered', 'CD3E', 'COL1A1'], ncols = 2)

G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Type_unc_infered' as categorical

Integrate generated and native data

Generated & native

cdata = scu.generate_unicoord_in_adata(bdata,adata)

cdata

AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'

# Rely on the default index_unique ('-') so the batch key is appended to every
# observation name: the generated cells in cdata reuse the names of the cells
# in bdata, and index_unique=None left those names duplicated in ddata.
ddata = cdata.concatenate(bdata)

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.

ddata

AnnData object with n_obs × n_vars = 12000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'mt-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1'

sc.pp.normalize_total(ddata)
sc.pp.log1p(ddata)
sc.pp.highly_variable_genes(ddata)

normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

# Keep the full log-normalised matrix in `.raw` so every gene stays available
# after restricting to the highly variable ones.
ddata.raw = ddata
# Copy explicitly: slicing returns a view, and sc.pp.scale would otherwise have
# to turn that view into a real object itself (emitting a UserWarning).
ddata = ddata[:, ddata.var.highly_variable].copy()
ddata

View of AnnData object with n_obs × n_vars = 12000 × 2457
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'mt-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'

sc.pp.scale(ddata)

G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.

... as `zero_center=True`, sparse input is densified and may lead to large memory consumption

sc.tl.pca(ddata)
sc.pp.neighbors(ddata)
sc.tl.leiden(ddata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:11)
running Leiden clustering
    finished: found 37 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)

sc.tl.umap(ddata)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)

sc.pl.umap(ddata, color=['Type','batch', 'CD3E', 'COL1A1'], 
           legend_loc='on data', ncols=2)

Generated (T cells) & native

cdata = scu.generate_unicoord_in_adata(bdata,adata, set_value={'Type':'T cells'})
cdata

AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'

# Rely on the default index_unique ('-') so the batch key is appended to every
# observation name: the generated cells in cdata reuse the names of the cells
# in bdata, and index_unique=None would leave those names duplicated in ddata.
ddata = cdata.concatenate(bdata)
ddata

AnnData object with n_obs × n_vars = 12000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'mt-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1'

sc.pp.normalize_total(ddata)
sc.pp.log1p(ddata)
sc.pp.highly_variable_genes(ddata)

normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

# Keep the full log-normalised matrix in `.raw` so every gene stays available
# after restricting to the highly variable ones.
ddata.raw = ddata
# Copy explicitly: slicing returns a view, and sc.pp.scale would otherwise have
# to turn that view into a real object itself (emitting a UserWarning).
ddata = ddata[:, ddata.var.highly_variable].copy()
ddata

View of AnnData object with n_obs × n_vars = 12000 × 2333
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'mt-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'

sc.pp.scale(ddata)

G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)

... as `zero_center=True`, sparse input is densified and may lead to large memory consumption

sc.tl.pca(ddata)
sc.pp.neighbors(ddata)
sc.tl.leiden(ddata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 29 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:01)

sc.tl.umap(ddata)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:09)

sc.pl.umap(ddata, color=['Type','batch', 'CD3E', 'COL1A1'], 
           legend_loc='on data', ncols=2)

G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Cell' as categorical

Generated (CAFs) & native

cdata = scu.generate_unicoord_in_adata(bdata,adata, set_value={'Type':'CAFs'})
cdata

AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'

# Rely on the default index_unique ('-') so the batch key is appended to every
# observation name: the generated cells in cdata reuse the names of the cells
# in bdata, and index_unique=None would leave those names duplicated in ddata.
ddata = cdata.concatenate(bdata)
ddata

AnnData object with n_obs × n_vars = 12000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'mt-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1'

sc.pp.normalize_total(ddata)
sc.pp.log1p(ddata)
sc.pp.highly_variable_genes(ddata)

normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

# Keep the full log-normalised matrix in `.raw` so every gene stays available
# after restricting to the highly variable ones.
ddata.raw = ddata
# Copy explicitly: slicing returns a view, and sc.pp.scale would otherwise have
# to turn that view into a real object itself (emitting a UserWarning).
ddata = ddata[:, ddata.var.highly_variable].copy()
ddata

View of AnnData object with n_obs × n_vars = 12000 × 2362
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'mt-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'

sc.pp.scale(ddata)

G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)

... as `zero_center=True`, sparse input is densified and may lead to large memory consumption

sc.tl.pca(ddata)
sc.pp.neighbors(ddata)
sc.tl.leiden(ddata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 27 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:01)

sc.tl.umap(ddata)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:09)

sc.pl.umap(ddata, color=['Type','batch', 'CD3E', 'COL1A1'], 
           legend_loc='on data', ncols=2)

G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Cell' as categorical

Generated (T cells) & generated

cdata = scu.generate_unicoord_in_adata(bdata,adata, set_value={'Type':'T cells'})
cdata

AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'

cdata1 = scu.generate_unicoord_in_adata(bdata,adata)
cdata1

AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'

ddata = cdata.concatenate(cdata1)
ddata

AnnData object with n_obs × n_vars = 12000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features'

sc.pp.normalize_total(ddata)
sc.pp.log1p(ddata)
sc.pp.highly_variable_genes(ddata)

normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:02)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

# Keep the full log-normalised matrix in `.raw` so every gene stays available
# after restricting to the highly variable ones.
ddata.raw = ddata
# Copy explicitly: slicing returns a view, and sc.pp.scale would otherwise have
# to turn that view into a real object itself (emitting a UserWarning).
ddata = ddata[:, ddata.var.highly_variable].copy()
ddata

View of AnnData object with n_obs × n_vars = 12000 × 2124
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'

sc.pp.scale(ddata)

G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)

sc.tl.pca(ddata)
sc.pp.neighbors(ddata)
sc.tl.leiden(ddata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 24 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:01)

sc.tl.umap(ddata)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:09)

sc.pl.umap(ddata, color=['Type','batch', 'CD3E', 'COL1A1'], 
           legend_loc='on data', ncols=2)

G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Cell' as categorical

Generated (CAFs) & generated

cdata = scu.generate_unicoord_in_adata(bdata,adata, set_value={'Type':'CAFs'})
cdata

AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'

cdata1 = scu.generate_unicoord_in_adata(bdata,adata)
cdata1

AnnData object with n_obs × n_vars = 6000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered'
    var: 'features'

ddata = cdata.concatenate(cdata1)
ddata

AnnData object with n_obs × n_vars = 12000 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features'

sc.pp.normalize_total(ddata)
sc.pp.log1p(ddata)
sc.pp.highly_variable_genes(ddata)

normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:02)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

# Keep the full log-normalised matrix in `.raw` so every gene stays available
# after restricting to the highly variable ones.
ddata.raw = ddata
# Copy explicitly: slicing returns a view, and sc.pp.scale would otherwise have
# to turn that view into a real object itself (emitting a UserWarning).
ddata = ddata[:, ddata.var.highly_variable].copy()
ddata

View of AnnData object with n_obs × n_vars = 12000 × 2169
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training', 'Type_unc_infered', 'batch'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'

sc.pp.scale(ddata)

G:\anaconda3\envs\torch_geo\lib\site-packages\scanpy\preprocessing\_simple.py:843: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)

sc.tl.pca(ddata)
sc.pp.neighbors(ddata)
sc.tl.leiden(ddata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Leiden clustering
    finished: found 29 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:01)

sc.tl.umap(ddata)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:09)

sc.pl.umap(ddata, color=['Type','batch', 'CD3E', 'COL1A1'], 
           legend_loc='on data', ncols=2)

G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Cell' as categorical

Type_unc_infered	B cells	CAFs	Malignant cells	T cells	TAMs	TECs	rare types	unclassified
Type
B cells	473	0	0	0	3	0	0	1
CAFs	0	298	0	0	0	1	0	0
Malignant cells	0	3	1778	1	0	3	262	244
T cells	1	0	0	3781	1	0	0	10
TAMs	5	0	1	0	906	0	0	18
TECs	0	0	0	0	0	533	0	0
unclassified	50	74	97	421	89	80	33	332