In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [2]:
import scanpy as sc
import random
from unicoord import scu
from unicoord.visualization import draw_loss_curves
import torch
from line_profiler import LineProfiler
In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
# sc.settings.set_figure_params(dpi=80, facecolor='white')
sc.settings.set_figure_params(vector_friendly=False)
scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.22.3 scipy==1.7.1 pandas==1.3.3 scikit-learn==1.0 statsmodels==0.13.0 python-igraph==0.9.6 pynndescent==0.5.4

load hECA data

In [ ]:
adata = sc.read_h5ad(r"F:\h5ad\hECA_eachCT2000_pcGenes.h5ad")
sc.pp.normalize_total(adata, target_sum=1e4 ,exclude_highly_expressed= True)
sc.pp.log1p(adata)
adata

model and training

In [ ]:
scu.model_unicoord_in_adata(adata, n_diff=0, n_clus=[], n_cont=20,
                            obs_fitting = ['seq_tech', 'organ','cell_type'])
In [30]:
scu.train_unicoord_in_adata(adata, epochs=2, slot = 'cur', chunk_size=20000)
training chunk 1 / 5 of the data
Epoch 10: 100%|████████████████████████████████████████████| 10/10 [00:21<00:00,  2.19s/it, Epoch_average_loss=1717.55]
training chunk 2 / 5 of the data
Epoch 10: 100%|████████████████████████████████████████████| 10/10 [00:21<00:00,  2.19s/it, Epoch_average_loss=1685.16]
training chunk 3 / 5 of the data
Epoch 10: 100%|████████████████████████████████████████████| 10/10 [00:21<00:00,  2.19s/it, Epoch_average_loss=1662.82]
training chunk 4 / 5 of the data
Epoch 10: 100%|████████████████████████████████████████████| 10/10 [00:22<00:00,  2.27s/it, Epoch_average_loss=1645.18]
training chunk 5 / 5 of the data
Epoch 10: 100%|████████████████████████████████████████████| 10/10 [00:17<00:00,  1.79s/it, Epoch_average_loss=1625.56]
In [31]:
fig = draw_loss_curves(adata.uns['unc_stuffs']['trainer'].losses)
# if save_figs:
#     fig.savefig(os.path.join(savePath, 'img', 'fig1_lossCurves.png'))
fig.show()
<ipython-input-31-17c745c75a86>:4: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()
In [32]:
scu.embed_unicoord_in_adata(adata, only_sup=False)
In [33]:
adata.obsm['unicoord'].shape
Out[33]:
(96000, 20)
In [34]:
sc.pp.neighbors(adata, use_rep='unicoord')
computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:05)
In [35]:
sc.tl.leiden(adata, resolution=0.5)
running Leiden clustering
    finished: found 45 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:27)
In [36]:
sc.tl.umap(adata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:01:38)
In [38]:
sc.pl.embedding(adata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['seq_tech', 'study_id', 'tissue_type','cell_type','organ'], ncols=2)

predict test set

In [42]:
bdata = adata[~adata.obs.unc_training,:].copy()
bdata
Out[42]:
AnnData object with n_obs × n_vars = 9494 × 2000
    obs: 'user_id', 'study_id', 'cell_id', 'organ', 'region', 'subregion', 'seq_tech', 'sample_status', 'donor_id', 'donor_gender', 'donor_age', 'original_name', 'cl_name', 'hcad_name', 'tissue_type', 'cell_type', 'marker_gene', 'cid', 'RNA_snn_res.0.4', 'seurat_clusters', 'olfactory receptor activity', 'sensory perception of smell', 'nuclear-transcribed mRNA catabolic process', 'ribosome assembly', 'regulation of protein modification by small protein conjugation or removal', 'translation regulator activity', 'enzyme inhibitor activity', 'mitochondrial membrane organization', 'mitochondrial transport', 'nucleoside triphosphate metabolic process', 'negative regulation of protein ubiquitination', 'protein targeting', 'negative regulation of binding', 'negative regulation of translation', 'regulation of protein stability', 'negative regulation of hydrolase activity', 'regulation of protein catabolic process', 'positive regulation of cellular protein localization', 'regulation of ATP metabolic process', 'positive regulation of protein modification by small protein conjugation or removal', 'positive regulation of proteolysis involved in cellular protein catabolic process', 'negative regulation of cytoskeleton organization', 'phosphoprotein phosphatase activity', 'epithelial tube morphogenesis', 'maintenance of location in cell', 'cellular response to steroid hormone stimulus', 'protein polyubiquitination', 'regulation of supramolecular fiber organization', 'regulation of actin filament organization', 'protein localization to plasma membrane', 'unc_training', 'leiden', 'subtypes'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable'
    uns: 'unc_stuffs', 'neighbors', 'leiden', 'umap', 'seq_tech_colors', 'tissue_type_colors', 'cell_type_colors', 'subtypes_colors'
    obsm: 'unicoord', 'X_umap'
    obsp: 'distances', 'connectivities'
In [43]:
scu.predcit_unicoord_in_adata(bdata, adata)
In [44]:
sc.pl.embedding(bdata, 'X_umap',legend_loc='on data', +
                color= ['seq_tech', 'seq_tech_unc_infered', 
                        'cell_type', 'cell_type_unc_infered',
                        'organ','organ_infered'], ncols=2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'seq_tech_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'tissue_type_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type_unc_infered' as categorical

predict LUAD data

In [52]:
cdata = sc.read_h5ad(r'D:\hECA\Lung_cancer.pp.h5ad')
In [53]:
cdata
Out[53]:
AnnData object with n_obs × n_vars = 188954 × 1943
    obs: 'Index', 'Barcode', 'Sample', 'Sample_Origin', 'Cell_type', 'Cell_type.refined', 'Cell_subtype', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Cell_subtype_colors', 'Cell_type.refined_colors', 'Sample_Origin_colors', 'Sample_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'
In [54]:
sc.pl.embedding(cdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['leiden','Sample','Cell_type.refined','Cell_subtype'], ncols=2)
In [55]:
cdata = cdata.raw.to_adata()
sc.pp.normalize_total(cdata, target_sum=1e4 ,exclude_highly_expressed= True)
sc.pp.log1p(cdata)
cdata
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['ACTB', 'AGR2', 'AKR1B1', 'APOD', 'APOE', 'AREG', 'B2M', 'BPIFA1', 'BPIFB1', 'CCL18', 'CCL19', 'CCL2', 'CCL21', 'CCL3', 'CCL3L3', 'CCL4', 'CCL4L2', 'CD74', 'CFD', 'CLU', 'COL1A1', 'COL1A2', 'COL3A1', 'CST3', 'CSTB', 'CXCL10', 'CXCL13', 'CXCL2', 'CXCL8', 'DCN', 'EEF1A1', 'FDCSP', 'FTH1', 'FTL', 'GNLY', 'GZMB', 'HBA1', 'HBA2', 'HBB', 'HLA-DRA', 'HP', 'HPGD', 'HSP90AA1', 'HSPA6', 'HSPB1', 'IER2', 'IFNG', 'IGHA1', 'IGHA2', 'IGHD', 'IGHE', 'IGHG1', 'IGHG2', 'IGHG3', 'IGHG4', 'IGHGP', 'IGHM', 'IGKC', 'IGLC2', 'IGLC3', 'IGLC7', 'IGLL5', 'ITLN1', 'JCHAIN', 'JUN', 'JUNB', 'LYZ', 'MALAT1', 'MGP', 'MSMB', 'MT1G', 'MT1X', 'MT2A', 'NEAT1', 'NFKBIA', 'NTS', 'PIP', 'PPBP', 'PTGDS', 'RPL10', 'RPL13', 'RPL13A', 'RPL37', 'RPLP1', 'RPS10', 'RPS16', 'RPS18', 'RPS19', 'RPS27', 'S100A6', 'S100A8', 'S100A9', 'SCGB1A1', 'SCGB3A1', 'SCGB3A2', 'SFRP4', 'SFTPA1', 'SFTPA2', 'SFTPC', 'SLPI', 'SPP1', 'TAGLN', 'TFF3', 'TIMP1', 'TMSB10', 'TMSB4X', 'TPSAB1', 'TPSB2', 'TXN']
    finished (0:00:04)
Out[55]:
AnnData object with n_obs × n_vars = 188954 × 27578
    obs: 'Index', 'Barcode', 'Sample', 'Sample_Origin', 'Cell_type', 'Cell_type.refined', 'Cell_subtype', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'Cell_subtype_colors', 'Cell_type.refined_colors', 'Sample_Origin_colors', 'Sample_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'
In [56]:
scu.predcit_unicoord_in_adata(cdata, adata)
1589 needed genes are not exist in the query adata, filled with zeros
Trying to set attribute `.var` of view, copying.
G:\anaconda3\envs\torch_geo\lib\site-packages\pandas\core\indexing.py:1732: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
In [64]:
sc.pl.embedding(cdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color= ['Sample', 'seq_tech_unc_infered', 
                        'Cell_type.refined', 'cell_type_unc_infered',
                        'Cell_subtype', 'organ_unc_infered'], ncols=2)
In [46]:
confusion_mtx = pd.read_csv('./table.csv', index_col=0)
In [59]:
confusion_mtx.columns
Out[59]:
Index(['AT1', 'AT2', 'Club', 'Ciliated', 'Malignant cells', 'tS1', 'tS2',
       'Lymphatic ECs', 'Stalk-like ECs', 'Tip-like ECs', 'Tumor ECs',
       'COL13A1+ matrix FBs', 'COL14A1+ matrix FBs', 'Myofibroblasts',
       'Pericytes', 'Smooth muscle cells', 'Activated DCs', 'CD1c+ DCs',
       'CD141+ DCs', 'CD163+CD14+ DCs', 'Alveolar Mac', 'Microglia/Mac',
       'Pleural Mac', 'mo-Mac', 'Monocytes', 'MAST', 'NK', 'Naive CD4+ T',
       'CD4+ Th', 'Treg', 'Exhausted Tfh', 'Naive CD8+ T', 'CD8 low T',
       'CD8+/CD4+ Mixed Th', 'Cytotoxic CD8+ T', 'Exhausted CD8+ T',
       'Follicular B cells', 'GC B cells in the DZ', 'GC B cells in the LZ',
       'GrB-secreting B cells', 'MALT B cells', 'Plasma cells', 'Undetermined',
       'rare types'],
      dtype='object')
In [63]:
cdata.obs.groupby(['cell_type_unc_infered','Cell_type.refined']).size().unstack()
Out[63]:
Cell_type.refined B lymphocytes Endothelial cells Epithelial cells Fibroblasts MAST cells Myeloid cells NA T/NK cells
cell_type_unc_infered
Acinar cell 0 0 15 0 0 0 0 1
Astrocyte 0 0 1 2 0 0 12 0
B cell 12190 0 611 3 1 651 509 415
Basal cell 0 0 217 1 0 0 0 0
Bipolar cell 0 0 2 0 0 0 0 0
Cardiomyocyte cell 0 0 1 0 0 0 0 0
Chief cell 0 0 3 0 0 0 0 0
Dendritic cell 117 2 450 32 2 1740 91 87
Endocardial cell 0 0 1 0 0 0 0 0
Endothelial cell 2 6 46 48 0 6 4 0
Enterocyte progenitor 0 0 312 0 0 0 3 0
Epithelial cell 3 0 2231 7 0 8 13 1
Erythrocyte 27 0 30 0 0 3 69 1
Fibroblast 0 0 14 271 0 7 1 0
Fibrocyte 0 0 3 104 0 0 0 0
Follicular epithelial cell 0 0 5 0 0 0 0 0
Goblet cell 0 0 548 13 0 0 0 0
Hemopoietic stem cell 555 0 136 1 0 64 769 549
Inhibitory neuron 0 0 3 0 0 0 52 0
Intercalated cell 0 0 5 0 0 0 0 0
Loop of henle 0 0 1 0 0 0 0 0
Macrophage 7 12 52 51 15 15373 5 0
Mast cell 829 0 620 274 3332 288 88 84
Mesenchymal cell 80 0 39 36 0 2 16 5
Microglia 86 0 24 0 0 4215 37 8
Monocyte 75 4 14 1 0 15378 12 1
Muller cell 0 0 36 0 0 0 30 0
NK cell 112 11 58 10 3 42 8848 18630
Neuron 0 0 4 0 0 2 4 3
Neutrophilic granulocyte 0 0 12 2 0 3 5 0
Oligodendrocyte 0 0 0 0 0 0 488 0
Oligodendrocyte precursor cell 0 0 0 0 0 0 2 0
Pericyte 0 1 31 50 0 0 2 0
Perineural epithelial cell 0 0 22 0 0 0 3 0
Plasma B cell 401 0 8 0 0 0 1 0
Proliferating T cell 5 0 221 2 0 1 1 0
Proximal convoluted tubule 0 0 4 1 0 0 0 0
Smooth muscle cell 0 0 386 169 0 4 13 0
Stromal cell 0 0 3 625 0 16 3 0
T cell 12726 7 82 11 2 742 16744 44322
Type II alveolar cell 70 0 15559 119 0 12 21 0
Vascular endothelial cell 80 1933 124 1668 1 62 217 33
In [61]:
confusion_matrix = cdata.obs.groupby(['cell_type_unc_infered','Cell_subtype']).size().unstack().loc[:,list(confusion_mtx.columns)[:-1]]
confusion_matrix.to_csv('./hECA_pred_LUAD.csv')
In [102]:
ct_mapping = {"Epithelial cell" : "Epithelial cells",
"Type II alveolar cell" : "Epithelial cells",
"Goblet cell" : "Epithelial cells",
"Basal cell" : "Epithelial cells",
"Enterocyte progenitor" : "Epithelial cells",
"Vascular endothelial cell" : "Endothelial cells",
"Stromal cell" :"Fibroblasts",
"Fibroblast" : "Fibroblasts",
"Fibrocyte" : "Fibroblasts",
"Smooth muscle cell" : "Fibroblasts",
"Dendritic cell" : "Myeloid cells",
"Macrophage" : "Myeloid cells",
"Monocyte" : "Myeloid cells",
"Microglia" : "Myeloid cells",
"Mast cell" : "MAST cells",
"NK cell" : "T/NK cells",
"T cell" : "T/NK cells",
"B cell" : "B lymphocytes",
"Plasma B cell" : "B lymphocytes",
"Acinar cell" : 'other',
"Astrocyte" : 'other',
"Bipolar cell" : 'other',
"Cardiomyocyte cell" : 'other',
"Chief cell" : 'other',
"Endocardial cell" : 'other',
"Endothelial cell" : 'other',
"Erythrocyte" : 'other',
"Follicular epithelial cell" : 'other',
"Hemopoietic stem cell" : 'other',
"Inhibitory neuron" : 'other',
"Intercalated cell" : 'other',
"Loop of henle" : 'other',
"Mesenchymal cell" : 'other',
"Muller cell" : 'other',
"Neutrophilic granulocyte" : 'other',
"Neuron" : 'other',
"Oligodendrocyte" : 'other',
"Oligodendrocyte precursor cell" : 'other',
"Proliferating T cell" : 'other',
"Proximal convoluted tubule" : 'other',
"Pericyte" : 'other',
"Perineural epithelial cell" : 'other'}
In [114]:
ct1 = cdata.obs['Cell_type.refined']
ct2 = pd.Series([ct_mapping[c] if c in ct_mapping else "rare types" \
                 for c in cdata.obs['cell_type_unc_infered']])
ct1 = ct1[list(~cdata.obs['Cell_type.refined'].isin(['Undetermined','NA']))]
ct2 = ct2[list(~cdata.obs['Cell_type.refined'].isin(['Undetermined','NA']))]
In [115]:
from sklearn.metrics import accuracy_score
In [116]:
accuracy_score(ct1, ct2)
Out[116]:
0.8549266273439782
In [117]:
ct2.index = ct1.index
a = pd.concat([ct1,ct2], axis=1)
a.columns = ['LUAD_label', 'pred_label']
In [118]:
a.groupby(['LUAD_label', 'pred_label']).size().unstack()
Out[118]:
pred_label B lymphocytes Endothelial cells Epithelial cells Fibroblasts MAST cells Myeloid cells T/NK cells other
LUAD_label
B lymphocytes 12591 80 73 0 829 285 12838 669
Endothelial cells 0 1933 0 0 0 18 18 7
Epithelial cells 619 124 18867 406 620 540 140 618
Fibroblasts 3 1668 140 1169 274 84 21 142
MAST cells 1 1 0 0 3332 17 5 0
Myeloid cells 651 62 20 27 288 36706 784 81
NA 0 0 0 0 0 0 0 0
T/NK cells 415 33 1 0 84 96 62952 559
In [119]:
confusion_mtx = pd.read_csv('./hECA_pred_LUAD.csv', index_col=0)
In [121]:
sns.set(rc = {'figure.figsize':(15,10)})

hcl = sns.heatmap(np.log(np.add(confusion_mtx, 1)),
                  cmap="YlGnBu",
                  xticklabels = True).figure.savefig('../UniCoord/hECA_pred_LUAD.pdf', bbox_inches="tight", dpi = 200)
In [122]:
hcl = sns.heatmap(confusion_mtx/np.sum(confusion_mtx, axis=0),
                  cmap="YlGnBu",
                  xticklabels = True).figure.savefig('../UniCoord/hECA_pred_LUAD_ratio.pdf', bbox_inches="tight", dpi = 200)
In [123]:
hcl = sns.heatmap((confusion_mtx.T/np.sum(confusion_mtx, axis=1)).T,
                  cmap="YlGnBu",
                  xticklabels = True)

predict lung ECA data

In [124]:
cdata = sc.read_h5ad(r'D:\hECA\Lung.Adult.pp.h5ad')
In [129]:
sc.pl.embedding(cdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['leiden','cell_type','study_id','tissue_type'], ncols=2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'seq_tech_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'organ_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type_unc_infered' as categorical
In [126]:
cdata = cdata.raw.to_adata()
sc.pp.normalize_total(cdata, target_sum=1e4 ,exclude_highly_expressed= True)
sc.pp.log1p(cdata)
cdata
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['CCL21', 'FTL', 'SCGB1A1', 'SCGB3A1', 'SCGB3A2', 'SFTPA2', 'SFTPC']
    finished (0:00:00)
Out[126]:
AnnData object with n_obs × n_vars = 54615 × 20770
    obs: 'user_id', 'study_id', 'cell_id', 'organ', 'region', 'subregion', 'seq_tech', 'sample_status', 'donor_id', 'donor_gender', 'donor_age', 'original_name', 'cl_name', 'hcad_name', 'tissue_type', 'cell_type', 'marker_gene', 'cid', 'RNA_snn_res.0.4', 'seurat_clusters', 'nCount_RNA', 'nFeature_RNA', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'cell_type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'study_id_colors', 'tissue_type_colors', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'
In [127]:
scu.predcit_unicoord_in_adata(cdata, adata)
2024 needed genes are not exist in the query adata, filled with zeros
Trying to set attribute `.var` of view, copying.
G:\anaconda3\envs\torch_geo\lib\site-packages\pandas\core\indexing.py:1732: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
In [131]:
sc.pl.embedding(cdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color= ['seq_tech', 'seq_tech_unc_infered', 'cell_type', 'cell_type_unc_infered'], ncols=2)
In [135]:
ct1_mapping = {"Epithelial cell" : "Epithelial cells",
"Type II alveolar cell" : "Epithelial cells",
"Goblet cell" : "Epithelial cells",
"Basal cell" : "Epithelial cells",
"Enterocyte progenitor" : "Epithelial cells",
"Vascular endothelial cell" : "Endothelial cells",
"Stromal cell" :"Fibroblasts",
"Fibroblast" : "Fibroblasts",
"Fibrocyte" : "Fibroblasts",
"Smooth muscle cell" : "Fibroblasts",
"Dendritic cell" : "Myeloid cells",
"Macrophage" : "Myeloid cells",
"Monocyte" : "Myeloid cells",
"Microglia" : "Myeloid cells",
"Mast cell" : "MAST cells",
"NK cell" : "T/NK cells",
"T cell" : "T/NK cells",
"B cell" : "B lymphocytes",
"Plasma B cell" : "B lymphocytes",
"Acinar cell" : 'other',
"Astrocyte" : 'other',
"Bipolar cell" : 'other',
"Cardiomyocyte cell" : 'other',
"Chief cell" : 'other',
"Endocardial cell" : 'other',
"Endothelial cell" : 'other',
"Erythrocyte" : 'other',
"Follicular epithelial cell" : 'other',
"Hemopoietic stem cell" : 'other',
"Inhibitory neuron" : 'other',
"Intercalated cell" : 'other',
"Loop of henle" : 'other',
"Mesenchymal cell" : 'other',
"Muller cell" : 'other',
"Neutrophilic granulocyte" : 'other',
"Neuron" : 'other',
"Oligodendrocyte" : 'other',
"Oligodendrocyte precursor cell" : 'other',
"Proliferating T cell" : 'other',
"Proximal convoluted tubule" : 'other',
"Pericyte" : 'other',
"Perineural epithelial cell" : 'other'}
In [136]:
ct2_mapping = {"Type I alveolar cell" : "Epithelial cells",
"Type I alveolar cell/Type II alveolar cell" : "Epithelial cells",
"Type II alveolar cell" : "Epithelial cells",
"Club cell" : "Epithelial cells",
"Ciliated columnar cell" : "Epithelial cells",
"Perineural epithelial cell" : "Epithelial cells",
"Epithelial cell" : "Epithelial cells",
"Lymphatic endothelial cell" : "Endothelial cells",
"Vascular endothelial cell" : "Endothelial cells",
"Endothelial cell" : "Endothelial cells",
"Fibrocyte" : "Fibroblasts",
"Smooth muscle cell" : "Fibroblasts",
"Dendritic cell" : "Myeloid cells",
"Macrophage" : "Myeloid cells",
"Monocyte" : "Myeloid cells",
"Neutrophilic granulocyte" : "Myeloid cells",
"Myeloid cell" : "Myeloid cells",
"Mast cell" : "MAST cells",
"NK cell" : "T/NK cells",
"T cell" : "T/NK cells",
"CD8 T cell" : "T/NK cells",
"B cell" : "B lymphocytes",
"Plasma B cell" : "B lymphocytes",
"Chondrocyte" : "rare types",
"Megakaryocyte" : "rare types"}
In [137]:
ct1 = [ct1_mapping[c] if c in ct1_mapping else "rare types" 
                 for c in cdata.obs['cell_type_unc_infered']]
ct2 = [ct2_mapping[c] if c in ct2_mapping else "rare types" 
                 for c in cdata.obs['cell_type']]
In [143]:
from sklearn.metrics import f1_score
In [145]:
f1_score(ct1, ct2, average='micro')
Out[145]:
0.8633342488327382
In [147]:
f1_score(ct1, ct2, average='macro')
Out[147]:
0.6409801311060411
In [146]:
accuracy_score(ct1, ct2)
Out[146]:
0.8633342488327382
In [140]:
a = pd.DataFrame([ct1,ct2]).T
a.columns = ['pred','true']
In [141]:
a.groupby(['pred','true']).size().unstack()
Out[141]:
true B lymphocytes Endothelial cells Epithelial cells Fibroblasts MAST cells Myeloid cells T/NK cells rare types
pred
B lymphocytes 925.0 NaN 7.0 7.0 NaN 28.0 348.0 NaN
Endothelial cells 50.0 3727.0 115.0 1801.0 8.0 42.0 144.0 11.0
Epithelial cells 63.0 36.0 7678.0 830.0 23.0 63.0 206.0 3.0
Fibroblasts 2.0 1.0 34.0 830.0 NaN 1.0 2.0 1.0
MAST cells 79.0 9.0 28.0 93.0 3169.0 70.0 224.0 NaN
Myeloid cells 76.0 15.0 78.0 201.0 38.0 12007.0 345.0 12.0
T/NK cells 570.0 4.0 15.0 13.0 20.0 42.0 18803.0 NaN
other 22.0 644.0 297.0 415.0 2.0 230.0 89.0 7.0
rare types NaN NaN NaN NaN NaN NaN NaN 12.0