%matplotlib inline
%load_ext autoreload
%autoreload 2
%load_ext line_profiler

import random

import pandas as pd
import scanpy as sc
import torch
from line_profiler import LineProfiler

from unicoord import scu
from unicoord.visualization import draw_loss_curves

# Global scanpy configuration for this notebook.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of scanpy and its main dependencies for the record.
sc.logging.print_header()
# Alternative figure defaults, currently disabled:
# sc.settings.set_figure_params(dpi=80, facecolor='white')
sc.settings.set_figure_params(vector_friendly=False)

scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.22.3 scipy==1.7.1 pandas==1.3.3 scikit-learn==1.0 statsmodels==0.13.0 python-igraph==0.9.6 pynndescent==0.5.4

Tabula Sapiens data

# Load the Tabula Sapiens reference AnnData object from disk.
adata = sc.read_h5ad(r'F:\h5ad\tabularSapiens\obs_and_var.h5ad')

# Gene list file: one gene name per line, no header row; column 0 holds the names.
genes = pd.read_table('./protein_coding_genes.txt', header=None)[0]
# Keep only listed genes that are also present in `adata`.
# `sorted` pins the order: iterating a set of strings gives a different order in
# every Python session (hash randomisation), so a bare `list(set & set)` made the
# gene order passed to the model irreproducible between kernel restarts.
g = sorted(set(genes) & set(adata.var_names))

adata

AnnData object with n_obs × n_vars = 481120 × 58870
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'

# Set up the UniCoord model on `adata`, restricted to the genes in `g` and
# fitted against the listed obs columns.
scu.model_unicoord_in_adata(
    adata,
    genes_used=g,
    n_cont=50,
    n_diff=0,
    n_clus=[],
    obs_fitting=[
        'cell_ontology_class',
        'anatomical_information',
        'method',
        'organ_tissue',
        'donor',
        'gender',
        'compartment',
    ],
    min_obs=2000,
)

# The model call above is expected to have populated this entry in `adata.uns`.
unc_stuffs = adata.uns['unc_stuffs']

# Train on the three data shards (TBSP_0 .. TBSP_2) one after another, reusing
# the same `unc_stuffs` each time.
for idx in range(3):
    bdata = sc.read_h5ad(rf'F:\h5ad\tabularSapiens\TBSP_{idx}.h5ad')
    scu.train_unicoord_in_adata(
        bdata,
        unc_stuffs=unc_stuffs,
        epochs=10,
        chunk_size=20000,
        slot="cur",
    )
    # Release cached GPU memory before loading the next shard.
    torch.cuda.empty_cache()

training chunk 1 / 10 of the data

training chunk 2 / 10 of the data

training chunk 3 / 10 of the data

training chunk 4 / 10 of the data

training chunk 5 / 10 of the data

training chunk 6 / 10 of the data

training chunk 7 / 10 of the data

training chunk 8 / 10 of the data

training chunk 9 / 10 of the data

training chunk 10 / 10 of the data

# Plot the loss history stored under `unc_stuffs['loss']`.
fig = draw_loss_curves(unc_stuffs['loss'])
# if save_figs:
#     fig.savefig(os.path.join(savePath, 'img', 'fig1_lossCurves.png'))
# No `fig.show()` here: this notebook runs under `%matplotlib inline`, a non-GUI
# backend, where `Figure.show()` cannot open a window and only emits a
# UserWarning. The inline backend renders open pyplot figures when the cell ends.

<ipython-input-10-a1f761dfb391>:4: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()

# Save the trained model to disk for later reuse.
# NOTE(review): `only_model=True` presumably drops the expression matrix and
# keeps just the model state — confirm against the unicoord documentation.
scu.write_scu_h5ad(adata, './pretrained_models/TBSP.h5ad', only_model=True)

Predict liver cancer

# Load the pretrained Tabula Sapiens model saved earlier; it is used as the
# reference (`adata_ref`) for prediction on the query dataset.
cdata = scu.load_scu_h5ad('./pretrained_models/TBSP.h5ad')
cdata

AnnData object with n_obs × n_vars = 2 × 2
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'unc_stuffs'

# Load the liver cancer query dataset.
ddata = sc.read_h5ad(r'D:\hECA\Liver_cancer.pp.h5ad')

# Rebuild the AnnData from `.raw`, then library-size normalise to 1e4 counts per
# cell (ignoring highly expressed genes when computing the size factor) and
# log-transform.
# NOTE(review): this assumes `.raw` holds un-normalised counts — if it was saved
# after normalisation/log1p, the data is transformed twice. Confirm.
ddata = ddata.raw.to_adata()
sc.pp.normalize_total(
    ddata,
    target_sum=1e4,
    exclude_highly_expressed=True,
)
sc.pp.log1p(ddata)
ddata

normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['S100A9', 'S100A8', 'S100A6', 'RPS27', 'APOA2', 'RGS5', 'CHI3L1', 'REG1A', 'REG3A', 'TMSB10', 'GNLY', 'FABP1', 'IGKC', 'CCL20', 'MTRNR2L12', 'TF', 'APOD', 'IGFBP7', 'JCHAIN', 'ALB', 'SPP1', 'FGG', 'SPINK1', 'HLA-DRA', 'ACTB', 'IGFBP1', 'SERPINE1', 'TMSB4X', 'TIMP1', 'MTRNR2L10', 'FABP4', 'CCL19', 'CCL21', 'TXN', 'ORM1', 'HSPA5', 'HBB', 'HBG2', 'MTRNR2L8', 'SAA2', 'SAA1', 'PGA5', 'FTH1', 'NEAT1', 'MALAT1', 'APOC3', 'APOA1', 'ACTA2', 'GAPDH', 'IFNG', 'LYZ', 'NTS', 'LUM', 'GZMB', 'SERPINA1', 'HSP90AA1', 'IGHA2', 'IGHG4', 'IGHG2', 'IGHGP', 'IGHA1', 'IGHG1', 'IGHG3', 'IGHD', 'IGHM', 'IGHV3-23', 'B2M', 'HBA2', 'HBA1', 'TPSB2', 'TPSAB1', 'MT2A', 'MT1G', 'MT1X', 'HP', 'PLCG2', 'MTRNR2L1', 'CCL3', 'CCL4', 'CCL3L3', 'CCL4L2', 'COL1A1', 'APOH', 'TTR', 'APOE', 'APOC1', 'FTL', 'IGLL5', 'IGLC2', 'IGLC3', 'TFF3', 'TFF1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP6', 'MT-CO3']
    finished (0:00:00)

AnnData object with n_obs × n_vars = 47497 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'

# Run the pretrained model (`cdata`) on the query data in chunks of 1000.
# The `*_unc_infered` columns used further down appear in `ddata.obs` after this call.
# NOTE: `predcit` is the spelling this call ran with; do not "correct" it here.
scu.predcit_unicoord_in_adata(ddata,adata_ref=cdata,chunk_size=1000)

4038 needed genes are not exist in the query adata, filled with zeros

Trying to set attribute `.var` of view, copying.
G:\anaconda3\envs\torch_geo\lib\site-packages\pandas\core\indexing.py:1732: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)

# Show the stored UMAP coloured by the dataset's own 'Type' label and by each
# UniCoord-inferred annotation column, two panels per row.
sc.pl.embedding(
    ddata,
    'X_umap',
    color=['Type'] + [
        f'{field}_unc_infered'
        for field in ['cell_ontology_class', 'anatomical_information',
                      'method', 'organ_tissue', 'donor', 'gender', 'compartment']
    ],
    legend_loc='on data',
    legend_fontsize=10,
    ncols=2,
)

G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_ontology_class_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'anatomical_information_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'method_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'organ_tissue_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'donor_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'gender_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'compartment_unc_infered' as categorical

# Count cells per (inferred cell ontology class, original 'Type') pair, pivot
# 'Type' into columns, and save the table as CSV.
(
    ddata.obs
    .groupby(['cell_ontology_class_unc_infered', 'Type'])
    .size()
    .unstack()
    .to_csv('TBSP_HCC.csv')
)

Predict LUAD

# Reload the same pretrained Tabula Sapiens model as the reference for the
# lung dataset.
cdata = scu.load_scu_h5ad('./pretrained_models/TBSP.h5ad')
cdata

AnnData object with n_obs × n_vars = 2 × 2
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'unc_stuffs'

# Load the lung cancer / normal lung query dataset (replaces the liver `ddata`).
ddata = sc.read_h5ad(r'D:\hECA\Lung_cancer_nLung.pp.h5ad')

# Rebuild the AnnData from `.raw`, then library-size normalise to 1e4 counts per
# cell (ignoring highly expressed genes when computing the size factor) and
# log-transform.
# NOTE(review): this assumes `.raw` holds un-normalised counts — if it was saved
# after normalisation/log1p, the data is transformed twice. Confirm.
ddata = ddata.raw.to_adata()
sc.pp.normalize_total(
    ddata,
    target_sum=1e4,
    exclude_highly_expressed=True,
)
sc.pp.log1p(ddata)
ddata

normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['B2M', 'BPIFB1', 'CCL19', 'CCL21', 'CCL3', 'CCL3L3', 'CCL4', 'CCL4L2', 'CFD', 'CST3', 'CXCL10', 'DCN', 'FTH1', 'FTL', 'GZMB', 'HBA1', 'HBA2', 'HBB', 'HLA-DRA', 'IFNG', 'IGHA1', 'IGHA2', 'IGHD', 'IGHG1', 'IGHG2', 'IGHG3', 'IGHG4', 'IGHM', 'IGKC', 'IGLC2', 'IGLC3', 'IGLC7', 'ITLN1', 'JCHAIN', 'LYZ', 'MALAT1', 'MGP', 'MSMB', 'MT2A', 'PPBP', 'S100A9', 'SCGB1A1', 'SCGB3A1', 'SCGB3A2', 'SFRP4', 'SFTPA1', 'SFTPA2', 'SFTPC', 'SLPI', 'SPP1', 'TMSB4X', 'TPSB2']
    finished (0:00:00)

AnnData object with n_obs × n_vars = 40258 × 27578
    obs: 'Index', 'Barcode', 'Sample', 'Sample_Origin', 'Cell_type', 'Cell_type.refined', 'Cell_subtype', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'Cell_subtype_colors', 'Cell_type.refined_colors', 'Sample_Origin_colors', 'Sample_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'

# Run the pretrained model (`cdata`) on the lung query data in chunks of 1000.
# NOTE: `predcit` is the spelling this call ran with; do not "correct" it here.
scu.predcit_unicoord_in_adata(ddata,adata_ref=cdata,chunk_size=1000)

1596 needed genes are not exist in the query adata, filled with zeros

Trying to set attribute `.var` of view, copying.
G:\anaconda3\envs\torch_geo\lib\site-packages\pandas\core\indexing.py:1732: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)

# Show the stored UMAP coloured by the dataset's own cell-type labels and by each
# UniCoord-inferred annotation column, two panels per row.
sc.pl.embedding(
    ddata,
    'X_umap',
    color=['Cell_type.refined', 'Cell_subtype'] + [
        f'{field}_unc_infered'
        for field in ['cell_ontology_class', 'anatomical_information',
                      'method', 'organ_tissue', 'donor', 'gender', 'compartment']
    ],
    legend_loc='on data',
    legend_fontsize=10,
    ncols=2,
)

G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_ontology_class_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'anatomical_information_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'method_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'organ_tissue_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'donor_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'gender_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'compartment_unc_infered' as categorical

# Count cells per (inferred cell ontology class, original 'Cell_subtype') pair,
# pivot 'Cell_subtype' into columns, and save the table as CSV.
(
    ddata.obs
    .groupby(['cell_ontology_class_unc_infered', 'Cell_subtype'])
    .size()
    .unstack()
    .to_csv('TBSP_LUAD.csv')
)

# Genes listed under `genes_used` in the stored model whose names start with
# "RPL" or "RPS" (presumably the ribosomal-protein genes).
RP_genes = [
    gene
    for gene in cdata.uns['unc_stuffs']['genes_used']
    if gene.startswith(("RPL", "RPS"))
]

pd.Series(RP_genes).to_csv('./RP_genes.csv')

ddata.obs.columns

Index(['n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt',
       'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type',
       'cell_ontology_class_unc_infered', 'anatomical_information_unc_infered',
       'method_unc_infered', 'organ_tissue_unc_infered', 'donor_unc_infered',
       'gender_unc_infered', 'compartment_unc_infered'],
      dtype='object')

# Original subtype labels of the cells whose inferred organ/tissue is 'Muscle'.
ddata.obs.loc[
    ddata.obs['organ_tissue_unc_infered'] == 'Muscle',
    'Cell_subtype',
].value_counts()

NA                       2153
NK                       1834
CD4+ Th                   845
Cytotoxic CD8+ T          633
CD8 low T                 231
Naive CD4+ T              162
Treg                       63
MAST                       39
Undetermined               32
Exhausted CD8+ T           31
Naive CD8+ T               21
Follicular B cells         20
Ciliated                    5
GrB-secreting B cells       2
CD8+/CD4+ Mixed Th          2
AT2                         1
Exhausted Tfh               1
Monocytes                   1
EPCs                        1
Stalk-like ECs              1
Pericytes                   0
Myofibroblasts              0
Pleural Mac                 0
Smooth muscle cells         0
Tip-like ECs                0
Tumor ECs                   0
mo-Mac                      0
Plasma cells                0
AT1                         0
Mesothelial cells           0
MALT B cells                0
Lymphatic ECs               0
GC B cells in the DZ        0
FB-like cells               0
Club                        0
COL14A1+ matrix FBs         0
COL13A1+ matrix FBs         0
CD207+CD1a+ LCs             0
CD163+CD14+ DCs             0
CD141+ DCs                  0
CD1c+ DCs                   0
Alveolar Mac                0
Activated DCs               0
pDCs                        0
Name: Cell_subtype, dtype: int64

# Number of query cells assigned to each inferred organ/tissue.
ddata.obs['organ_tissue_unc_infered'].value_counts()

Lung               16123
Muscle              6078
Bladder             4365
Spleen              2738
Bone_Marrow         2083
Skin                1778
Tongue              1676
Large_Intestine     1562
Pancreas             610
Prostate             598
Uterus               510
Trachea              472
Salivary_Gland       462
Small_Intestine      373
Vasculature          277
Heart                184
Fat                  123
Mammary              100
Liver                 82
Blood                 22
Lymph_Node            17
Eye                   15
Kidney                10
Name: organ_tissue_unc_infered, dtype: int64

# Number of reference cells per organ/tissue, for comparison with the
# inferred distribution on the query data.
adata.obs['organ_tissue'].value_counts()