In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%load_ext line_profiler
In [2]:
import scanpy as sc
import random
from unicoord import scu
from unicoord.visualization import draw_loss_curves
import torch
from line_profiler import LineProfiler
In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
# sc.settings.set_figure_params(dpi=80, facecolor='white')
sc.settings.set_figure_params(vector_friendly=False)
scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.19.5 scipy==1.7.1 pandas==1.3.3 scikit-learn==1.0 statsmodels==0.13.0 python-igraph==0.9.6 pynndescent==0.5.4

load liver cancer data

In [4]:
adata = sc.read_h5ad(r'D:\hECA\Liver_cancer.pp.h5ad')
In [5]:
adata = adata.raw.to_adata()
sc.pp.normalize_total(adata, target_sum=1e4 ,exclude_highly_expressed= True)
sc.pp.log1p(adata)
adata
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['S100A9', 'S100A8', 'S100A6', 'RPS27', 'APOA2', 'RGS5', 'CHI3L1', 'REG1A', 'REG3A', 'TMSB10', 'GNLY', 'FABP1', 'IGKC', 'CCL20', 'MTRNR2L12', 'TF', 'APOD', 'IGFBP7', 'JCHAIN', 'ALB', 'SPP1', 'FGG', 'SPINK1', 'HLA-DRA', 'ACTB', 'IGFBP1', 'SERPINE1', 'TMSB4X', 'TIMP1', 'MTRNR2L10', 'FABP4', 'CCL19', 'CCL21', 'TXN', 'ORM1', 'HSPA5', 'HBB', 'HBG2', 'MTRNR2L8', 'SAA2', 'SAA1', 'PGA5', 'FTH1', 'NEAT1', 'MALAT1', 'APOC3', 'APOA1', 'ACTA2', 'GAPDH', 'IFNG', 'LYZ', 'NTS', 'LUM', 'GZMB', 'SERPINA1', 'HSP90AA1', 'IGHA2', 'IGHG4', 'IGHG2', 'IGHGP', 'IGHA1', 'IGHG1', 'IGHG3', 'IGHD', 'IGHM', 'IGHV3-23', 'B2M', 'HBA2', 'HBA1', 'TPSB2', 'TPSAB1', 'MT2A', 'MT1G', 'MT1X', 'HP', 'PLCG2', 'MTRNR2L1', 'CCL3', 'CCL4', 'CCL3L3', 'CCL4L2', 'COL1A1', 'APOH', 'TTR', 'APOE', 'APOC1', 'FTL', 'IGLL5', 'IGLC2', 'IGLC3', 'TFF3', 'TFF1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP6', 'MT-CO3']
    finished (0:00:00)
Out[5]:
AnnData object with n_obs × n_vars = 47497 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'

model and training

In [6]:
scu.model_unicoord_in_adata(adata, n_diff=0, 
                            obs_fitting=["S_ID", "Type"])
In [13]:
scu.train_unicoord_in_adata(adata, epochs=2, chunk_size=50000, slot = "cur")
training chunk 1 / 1 of the data
Epoch 2: 100%|███████████████████████████████████████████████| 2/2 [00:11<00:00,  5.66s/it, Epoch_average_loss=5415.29]
In [16]:
fig = draw_loss_curves(adata.uns['unc_stuffs']['trainer'].losses)
# if save_figs:
#     fig.savefig(os.path.join(savePath, 'img', 'fig1_lossCurves.png'))
fig.show()
<ipython-input-16-17c745c75a86>:4: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()
In [ ]:
for data in 
In [11]:
scu.embed_unicoord_in_adata(adata, chunk_size=5000)
In [11]:
sc.pp.neighbors(adata, use_rep='unicoord')
computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:17)
In [12]:
sc.tl.leiden(adata, resolution=0.5)
running Leiden clustering
    finished: found 16 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:12)
In [13]:
sc.tl.umap(adata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:38)
In [14]:
sc.pl.embedding(adata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['leiden','Type','S_ID','Sample'], ncols=2)

predict test set

In [30]:
bdata = adata[~adata.obs.unc_training,:].copy()
bdata
Out[30]:
AnnData object with n_obs × n_vars = 9499 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type', 'unc_training'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'neighbors', 'pca', 'umap', 'log1p', 'unc_stuffs'
    obsm: 'X_pca', 'X_umap', 'unicoord'
    obsp: 'distances', 'connectivities'
In [31]:
scu.predcit_unicoord_in_adata(bdata, adata)
In [32]:
sc.pl.embedding(bdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color= ['S_ID', 'S_ID_unc_infered', 'Type', 'Type_unc_infered'], ncols=2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'S_ID_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Type_unc_infered' as categorical

predict lung cancer data

In [24]:
cdata = sc.read_h5ad(r'D:\hECA\Lung_cancer_tLung.pp.h5ad')
cdata
Out[24]:
AnnData object with n_obs × n_vars = 41287 × 1943
    obs: 'Index', 'Barcode', 'Sample', 'Sample_Origin', 'Cell_type', 'Cell_type.refined', 'Cell_subtype', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Cell_subtype_colors', 'Cell_type.refined_colors', 'Sample_Origin_colors', 'Sample_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'
In [25]:
sc.pl.embedding(cdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['leiden','Cell_type.refined','Sample','Cell_subtype','Sample_Origin'], ncols=2)
In [26]:
cdata = cdata.raw.to_adata()
sc.pp.normalize_total(cdata, target_sum=1e4 ,exclude_highly_expressed= True)
sc.pp.log1p(cdata)
cdata
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['APOE', 'B2M', 'BPIFB1', 'CCL18', 'CCL19', 'CCL2', 'CCL21', 'CCL3', 'CCL3L3', 'CCL4', 'CCL4L2', 'CD74', 'CLU', 'COL1A1', 'COL1A2', 'COL3A1', 'CST3', 'CXCL13', 'CXCL8', 'EEF1A1', 'FTH1', 'FTL', 'GNLY', 'HBA1', 'HBA2', 'HBB', 'HLA-DRA', 'IGHA1', 'IGHA2', 'IGHD', 'IGHE', 'IGHG1', 'IGHG2', 'IGHG3', 'IGHG4', 'IGHGP', 'IGHM', 'IGKC', 'IGLC2', 'IGLC3', 'IGLC7', 'IGLL5', 'JCHAIN', 'JUN', 'LYZ', 'MALAT1', 'MGP', 'MT2A', 'NEAT1', 'NTS', 'RPS16', 'RPS19', 'S100A6', 'S100A9', 'SCGB1A1', 'SCGB3A1', 'SCGB3A2', 'SFTPA1', 'SFTPA2', 'SFTPC', 'SLPI', 'SPP1', 'TFF3', 'TIMP1', 'TMSB10', 'TMSB4X', 'TPSAB1', 'TPSB2']
    finished (0:00:00)
Out[26]:
AnnData object with n_obs × n_vars = 41287 × 27578
    obs: 'Index', 'Barcode', 'Sample', 'Sample_Origin', 'Cell_type', 'Cell_type.refined', 'Cell_subtype', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'Cell_subtype_colors', 'Cell_type.refined_colors', 'Sample_Origin_colors', 'Sample_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'
In [27]:
scu.predcit_unicoord_in_adata(cdata, adata)
9010 genes are not exist in anndata, filled with zeros
Trying to set attribute `.var` of view, copying.
G:\anaconda3\envs\torch_geo\lib\site-packages\pandas\core\indexing.py:1732: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
In [28]:
sc.pl.embedding(cdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color= ['Sample', 'S_ID_unc_infered', 'Cell_type.refined', 'Type_unc_infered'], ncols=2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'S_ID_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Type_unc_infered' as categorical

predict lung ECA data

In [133]:
cdata = sc.read_h5ad(r'D:\hECA\Lung.Adult.pp.h5ad')
In [134]:
sc.pl.embedding(cdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['leiden','cell_type','study_id','tissue_type'], ncols=2)
In [135]:
cdata = cdata.raw.to_adata()
sc.pp.normalize_total(cdata, target_sum=1e4 ,exclude_highly_expressed= True)
sc.pp.log1p(cdata)
cdata
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['CCL21', 'FTL', 'SCGB1A1', 'SCGB3A1', 'SCGB3A2', 'SFTPA2', 'SFTPC']
    finished (0:00:00)
Out[135]:
AnnData object with n_obs × n_vars = 54615 × 20770
    obs: 'user_id', 'study_id', 'cell_id', 'organ', 'region', 'subregion', 'seq_tech', 'sample_status', 'donor_id', 'donor_gender', 'donor_age', 'original_name', 'cl_name', 'hcad_name', 'tissue_type', 'cell_type', 'marker_gene', 'cid', 'RNA_snn_res.0.4', 'seurat_clusters', 'nCount_RNA', 'nFeature_RNA', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'cell_type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'study_id_colors', 'tissue_type_colors', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'
In [136]:
scu.predcit_unicoord_in_adata(cdata, adata)
5662 genes are not exist in anndata, filled with zeros
Trying to set attribute `.var` of view, copying.
G:\anaconda3\envs\torch_geo\lib\site-packages\pandas\core\indexing.py:1732: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
In [137]:
sc.pl.embedding(cdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color= ['study_id', 'S_ID_unc_infered', 'cell_type', 'Type_unc_infered'], ncols=2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'S_ID_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Type_unc_infered' as categorical