In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [2]:
import scanpy as sc
import random
from unicoord import scu
from unicoord.visualization import draw_loss_curves
import torch
from line_profiler import LineProfiler
In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
# sc.settings.set_figure_params(dpi=80, facecolor='white')
sc.settings.set_figure_params(vector_friendly=False)
scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.19.5 scipy==1.7.1 pandas==1.3.3 scikit-learn==1.0 statsmodels==0.13.0 python-igraph==0.9.6 pynndescent==0.5.4

load LUAD data

In [4]:
adata = sc.read_h5ad(r"D:\hECA\Lung_cancer.pp.h5ad")
adata
Out[4]:
AnnData object with n_obs × n_vars = 188954 × 1943
    obs: 'Index', 'Barcode', 'Sample', 'Sample_Origin', 'Cell_type', 'Cell_type.refined', 'Cell_subtype', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Cell_subtype_colors', 'Cell_type.refined_colors', 'Sample_Origin_colors', 'Sample_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'
In [5]:
adata = adata.raw.to_adata()
sc.pp.normalize_total(adata, target_sum=1e4 ,exclude_highly_expressed= True)
sc.pp.log1p(adata)
adata
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['ACTB', 'AGR2', 'AKR1B1', 'APOD', 'APOE', 'AREG', 'B2M', 'BPIFA1', 'BPIFB1', 'CCL18', 'CCL19', 'CCL2', 'CCL21', 'CCL3', 'CCL3L3', 'CCL4', 'CCL4L2', 'CD74', 'CFD', 'CLU', 'COL1A1', 'COL1A2', 'COL3A1', 'CST3', 'CSTB', 'CXCL10', 'CXCL13', 'CXCL2', 'CXCL8', 'DCN', 'EEF1A1', 'FDCSP', 'FTH1', 'FTL', 'GNLY', 'GZMB', 'HBA1', 'HBA2', 'HBB', 'HLA-DRA', 'HP', 'HPGD', 'HSP90AA1', 'HSPA6', 'HSPB1', 'IER2', 'IFNG', 'IGHA1', 'IGHA2', 'IGHD', 'IGHE', 'IGHG1', 'IGHG2', 'IGHG3', 'IGHG4', 'IGHGP', 'IGHM', 'IGKC', 'IGLC2', 'IGLC3', 'IGLC7', 'IGLL5', 'ITLN1', 'JCHAIN', 'JUN', 'JUNB', 'LYZ', 'MALAT1', 'MGP', 'MSMB', 'MT1G', 'MT1X', 'MT2A', 'NEAT1', 'NFKBIA', 'NTS', 'PIP', 'PPBP', 'PTGDS', 'RPL10', 'RPL13', 'RPL13A', 'RPL37', 'RPLP1', 'RPS10', 'RPS16', 'RPS18', 'RPS19', 'RPS27', 'S100A6', 'S100A8', 'S100A9', 'SCGB1A1', 'SCGB3A1', 'SCGB3A2', 'SFRP4', 'SFTPA1', 'SFTPA2', 'SFTPC', 'SLPI', 'SPP1', 'TAGLN', 'TFF3', 'TIMP1', 'TMSB10', 'TMSB4X', 'TPSAB1', 'TPSB2', 'TXN']
    finished (0:00:04)
Out[5]:
AnnData object with n_obs × n_vars = 188954 × 27578
    obs: 'Index', 'Barcode', 'Sample', 'Sample_Origin', 'Cell_type', 'Cell_type.refined', 'Cell_subtype', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'Cell_subtype_colors', 'Cell_type.refined_colors', 'Sample_Origin_colors', 'Sample_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'
In [6]:
adata = adata[adata.obs['Cell_type.refined']!='NA',:].copy()
adata
Out[6]:
AnnData object with n_obs × n_vars = 160891 × 27578
    obs: 'Index', 'Barcode', 'Sample', 'Sample_Origin', 'Cell_type', 'Cell_type.refined', 'Cell_subtype', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'Cell_subtype_colors', 'Cell_type.refined_colors', 'Sample_Origin_colors', 'Sample_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'

model and training

In [16]:
scu.model_unicoord_in_adata(adata, n_diff=0, n_clus=[], n_cont=5,
                            obs_fitting=['Cell_type.refined', 'Cell_subtype','Sample'])
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-16-e2257340feda> in <module>
----> 1 scu.model_unicoord_in_adata(adata, n_diff=0, n_clus=[], n_cont=5,
      2                             obs_fitting=['Cell_type.refined', 'Cell_subtype','Sample'])

E:\Github\UniCoord\src\scanpy_unicoord.py in model_unicoord_in_adata(adata, obs_fitting, min_obs, genes_used, n_disc, n_clus, n_cont, n_diff)
     64 			   	   'disc': unc_stuffs['n_dims']['disc']['n_disc_sup'] + n_clus + n_disc}
     65 
---> 66 	model = VAE(latent_spec=latent_spec, 
     67                                 data_size = (len(genes_used),1),
     68 				use_cuda = gpu)

E:\Github\UniCoord\src\models.py in __init__(self, data_size, latent_spec, temperature, use_cuda, use_cnn)
     28             If True moves model to GPU
     29         """
---> 30         super(VAE, self).__init__()
     31         self.use_cuda = use_cuda
     32 

TypeError: super(type, obj): obj must be an instance or subtype of type
In [13]:
scu.train_unicoord_in_adata(adata, epochs=10, slot = 'cur', chunk_size=20000)
training chunk 1 / 9 of the data
Epoch 7:  70%|████████████████████████████████▏             | 7/10 [00:19<00:08,  2.83s/it, Epoch_average_loss=3079.13]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-13-4d04417aa1ca> in <module>
----> 1 scu.train_unicoord_in_adata(adata, epochs=10, slot = 'cur', chunk_size=20000)

E:\Github\UniCoord\src\scanpy_unicoord.py in train_unicoord_in_adata(adata, train_with_all, epochs, chunk_size, loss_weights, optimizer, slot)
    183                 print("training chunk %d / %d of the data"%(idx+1, len(cells_chunk)))
    184                 train_loader = DataLoader(data, batch_size=128,shuffle=True)
--> 185                 trainer.train(train_loader, epochs = epochs)
    186 
    187 

E:\Github\UniCoord\src\training.py in train(self, data_loader, epochs, save_training_gif)
    229             ran = trange(epochs)
    230         for epoch in ran:
--> 231             mean_epoch_loss = self._train_epoch(data_loader)
    232             loss_showed = self.batch_size * self.model.num_pixels * mean_epoch_loss
    233             if self.verbose:

E:\Github\UniCoord\src\training.py in _train_epoch(self, data_loader)
    250                                # self.print_loss_every
    251         for batch_idx, (data, label) in enumerate(data_loader):
--> 252             iter_loss = self._train_iteration(data, label)
    253             epoch_loss += iter_loss
    254             print_every_loss += iter_loss

E:\Github\UniCoord\src\training.py in _train_iteration(self, data, label)
    280         #     data = data.cuda()
    281 
--> 282         self.optimizer.zero_grad()
    283         recon_batch, latent_dist = self.model(data.view(data.size()[0],-1))
    284         loss = self._loss_function(data, recon_batch, latent_dist,label)

~\AppData\Roaming\Python\Python38\site-packages\torch\optim\optimizer.py in zero_grad(self, set_to_none)
    215                             else:
    216                                 p.grad.requires_grad_(False)
--> 217                             p.grad.zero_()
    218 
    219     def step(self, closure):

~\AppData\Roaming\Python\Python38\site-packages\torch\autograd\profiler.py in __exit__(self, exc_type, exc_value, traceback)
    619     def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any):
    620         if self.run_callbacks_on_exit:
--> 621             torch.ops.profiler._record_function_exit(self.handle)
    622 
    623     def _call_end_callbacks_on_future(self, fut: Future[Any]) -> Future[Any]:

KeyboardInterrupt: 
In [42]:
fig = draw_loss_curves(adata.uns['unc_stuffs']['trainer'].losses)
# if save_figs:
#     fig.savefig(os.path.join(savePath, 'img', 'fig1_lossCurves.png'))
fig.show()
<ipython-input-42-17c745c75a86>:4: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()
In [85]:
scu.embed_unicoord_in_adata(adata, only_sup=True)
In [86]:
adata.obsm['unicoord'].shape
Out[86]:
(47472, 30)
In [87]:
sc.pp.neighbors(adata, use_rep='unicoord', )
computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
In [88]:
sc.tl.leiden(adata, resolution=0.5)
running Leiden clustering
    finished: found 14 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:21)
In [89]:
sc.tl.umap(adata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:37)
In [12]:
sc.pl.embedding(adata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['Cell_type.refined', 'Cell_subtype','Sample'], ncols=2)

predict test set

In [13]:
bdata = adata[~adata.obs.unc_training,:].copy()
bdata
Out[13]:
AnnData object with n_obs × n_vars = 37791 × 27578
    obs: 'Index', 'Barcode', 'Sample', 'Sample_Origin', 'Cell_type', 'Cell_type.refined', 'Cell_subtype', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'unc_training'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'Cell_subtype_colors', 'Cell_type.refined_colors', 'Sample_Origin_colors', 'Sample_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p', 'unc_stuffs'
    obsm: 'X_pca', 'X_umap'
In [14]:
scu.predcit_unicoord_in_adata(bdata, adata)
In [16]:
sc.pl.embedding(bdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color= ['Cell_type.refined','Cell_type.refined_unc_infered', 
                        'Cell_subtype','Cell_subtype_unc_infered',
                        'Sample','Sample_unc_infered'], ncols=2)

predict liver cancer data

In [43]:
cdata = sc.read_h5ad(r'D:\hECA\Liver_cancer.pp.h5ad')
In [44]:
sc.pl.embedding(cdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['leiden','Type','S_ID','Sample'], ncols=2)
In [45]:
cdata = cdata.raw.to_adata()
sc.pp.normalize_total(cdata, target_sum=1e4 ,exclude_highly_expressed= True)
sc.pp.log1p(cdata)
cdata
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['S100A9', 'S100A8', 'S100A6', 'RPS27', 'APOA2', 'RGS5', 'CHI3L1', 'REG1A', 'REG3A', 'TMSB10', 'GNLY', 'FABP1', 'IGKC', 'CCL20', 'MTRNR2L12', 'TF', 'APOD', 'IGFBP7', 'JCHAIN', 'ALB', 'SPP1', 'FGG', 'SPINK1', 'HLA-DRA', 'ACTB', 'IGFBP1', 'SERPINE1', 'TMSB4X', 'TIMP1', 'MTRNR2L10', 'FABP4', 'CCL19', 'CCL21', 'TXN', 'ORM1', 'HSPA5', 'HBB', 'HBG2', 'MTRNR2L8', 'SAA2', 'SAA1', 'PGA5', 'FTH1', 'NEAT1', 'MALAT1', 'APOC3', 'APOA1', 'ACTA2', 'GAPDH', 'IFNG', 'LYZ', 'NTS', 'LUM', 'GZMB', 'SERPINA1', 'HSP90AA1', 'IGHA2', 'IGHG4', 'IGHG2', 'IGHGP', 'IGHA1', 'IGHG1', 'IGHG3', 'IGHD', 'IGHM', 'IGHV3-23', 'B2M', 'HBA2', 'HBA1', 'TPSB2', 'TPSAB1', 'MT2A', 'MT1G', 'MT1X', 'HP', 'PLCG2', 'MTRNR2L1', 'CCL3', 'CCL4', 'CCL3L3', 'CCL4L2', 'COL1A1', 'APOH', 'TTR', 'APOE', 'APOC1', 'FTL', 'IGLL5', 'IGLC2', 'IGLC3', 'TFF3', 'TFF1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP6', 'MT-CO3']
    finished (0:00:00)
Out[45]:
AnnData object with n_obs × n_vars = 47497 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'
In [48]:
scu.predcit_unicoord_in_adata(cdata, adata, chunk_size=20000)
99 genes are not exist in anndata, filled with zeros
Trying to set attribute `.var` of view, copying.
G:\anaconda3\envs\torch_geo\lib\site-packages\pandas\core\indexing.py:1732: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
In [49]:
sc.pl.embedding(cdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color= ['Type','Cell_type.refined_unc_infered', 
                        'S_ID','Cell_subtype_unc_infered',
                        'Sample','Sample_unc_infered'], ncols=2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Cell_type.refined_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Cell_subtype_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Sample_unc_infered' as categorical
In [50]:
cdata.obs.groupby(['Type', 'Cell_type.refined_unc_infered']).size().unstack()
Out[50]:
Cell_type.refined_unc_infered B lymphocytes Endothelial cells Epithelial cells Fibroblasts MAST cells Myeloid cells T/NK cells rare types
Type
B cells 2223 2 0 0 0 25 7 1
CAFs 0 9 1 1383 0 0 2 0
Malignant cells 14 47 11040 76 1 290 13 0
T cells 11 2 6 0 4 6 19097 4
TAMs 9 0 0 1 18 4626 3 0
TECs 2 2470 8 8 0 4 4 1
unclassified 328 444 1868 332 29 587 2465 26

predict lung ECA data

In [51]:
cdata = sc.read_h5ad(r'D:\hECA\Lung.Adult.pp.h5ad')
In [52]:
sc.pl.embedding(cdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['leiden','cell_type','study_id','tissue_type'], ncols=2)
In [53]:
cdata = cdata.raw.to_adata()
sc.pp.normalize_total(cdata, target_sum=1e4 ,exclude_highly_expressed= True)
sc.pp.log1p(cdata)
cdata
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['CCL21', 'FTL', 'SCGB1A1', 'SCGB3A1', 'SCGB3A2', 'SFTPA2', 'SFTPC']
    finished (0:00:00)
Out[53]:
AnnData object with n_obs × n_vars = 54615 × 20770
    obs: 'user_id', 'study_id', 'cell_id', 'organ', 'region', 'subregion', 'seq_tech', 'sample_status', 'donor_id', 'donor_gender', 'donor_age', 'original_name', 'cl_name', 'hcad_name', 'tissue_type', 'cell_type', 'marker_gene', 'cid', 'RNA_snn_res.0.4', 'seurat_clusters', 'nCount_RNA', 'nFeature_RNA', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'cell_type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'study_id_colors', 'tissue_type_colors', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'
In [54]:
scu.predcit_unicoord_in_adata(cdata, adata)
3007 genes are not exist in anndata, filled with zeros
Trying to set attribute `.var` of view, copying.
G:\anaconda3\envs\torch_geo\lib\site-packages\pandas\core\indexing.py:1732: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
In [55]:
sc.pl.embedding(cdata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color= ['cell_type','Cell_type.refined_unc_infered', 
                        'study_id','Cell_subtype_unc_infered',
                        'tissue_type','Sample_unc_infered'], ncols=2)
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Cell_type.refined_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Cell_subtype_unc_infered' as categorical
G:\anaconda3\envs\torch_geo\lib\site-packages\anndata\_core\anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Sample_unc_infered' as categorical
In [58]:
cdata.obs.groupby(['cell_type', 'Cell_type.refined_unc_infered']).size().unstack()
Out[58]:
Cell_type.refined_unc_infered B lymphocytes Endothelial cells Epithelial cells Fibroblasts MAST cells Myeloid cells T/NK cells rare types
cell_type
B cell 1498 42 34 20 0 15 3 0
CD8 T cell 0 0 2 0 6 6 358 0
Chondrocyte 0 9 2 19 0 0 0 0
Ciliated columnar cell 0 1 198 0 0 0 1 0
Club cell 0 24 537 8 1 5 0 0
Dendritic cell 5 21 25 13 3 1364 6 0
Endothelial cell 1 1639 6 9 2 9 11 0
Epithelial cell 0 0 31 0 0 3 1 0
Fibrocyte 1 531 139 2672 6 39 15 1
Lymphatic endothelial cell 0 215 5 5 0 0 0 1
Macrophage 0 6 3 1 1 5893 2 0
Mast cell 3 45 49 16 2960 66 121 0
Megakaryocyte 0 12 0 4 0 0 0 0
Monocyte 0 0 0 0 0 3795 14 0
Myeloid cell 0 0 0 0 0 412 0 0
NK cell 5 63 138 18 2 64 7267 1
Neutrophilic granulocyte 1 16 4 9 3 884 2 0
Perineural epithelial cell 0 52 221 83 2 4 3 0
Plasma B cell 163 1 3 0 0 4 4 0
Smooth muscle cell 0 136 25 606 4 8 6 1
T cell 46 85 47 34 39 82 11838 60
Type I alveolar cell 0 30 1432 5 0 2 2 0
Type I alveolar cell/Type II alveolar cell 0 1 433 0 2 1 14 0
Type II alveolar cell 0 61 5031 43 1 18 1 0
Vascular endothelial cell 2 2466 12 31 3 8 7 4
In [64]:
cdata.obs['Cell_type.refined_unc_infered'].value_counts()
Out[64]:
T/NK cells           19676
Myeloid cells        12682
Epithelial cells      8377
Endothelial cells     5456
Fibroblasts           3596
MAST cells            3035
B lymphocytes         1725
rare types              68
Name: Cell_type.refined_unc_infered, dtype: int64
In [65]:
ct_mapping = {"Type I alveolar cell" : "Epithelial cells",
"Type I alveolar cell/Type II alveolar cell" : "Epithelial cells",
"Type II alveolar cell" : "Epithelial cells",
"Club cell" : "Epithelial cells",
"Ciliated columnar cell" : "Epithelial cells",
"Perineural epithelial cell" : "Epithelial cells",
"Epithelial cell" : "Epithelial cells",
"Lymphatic endothelial cell" : "Endothelial cells",
"Vascular endothelial cell" : "Endothelial cells",
"Endothelial cell" : "Endothelial cells",
"Fibrocyte" : "Fibroblasts",
"Smooth muscle cell" : "Fibroblasts",
"Dendritic cell" : "Myeloid cells",
"Macrophage" : "Myeloid cells",
"Monocyte" : "Myeloid cells",
"Neutrophilic granulocyte" : "Myeloid cells",
"Myeloid cell" : "Myeloid cells",
"Mast cell" : "MAST cells",
"NK cell" : "T/NK cells",
"T cell" : "T/NK cells",
"CD8 T cell" : "T/NK cells",
"B cell" : "B lymphocytes",
"Plasma B cell" : "B lymphocytes",
"Chondrocyte" : "rare types",
"Megakaryocyte" : "rare types"}
In [67]:
ct1 = cdata.obs['Cell_type.refined_unc_infered']
ct2 = [ct_mapping[c] if c in ct_mapping else "rare types" for c in cdata.obs['cell_type']]
In [68]:
from sklearn.metrics import accuracy_score
In [69]:
accuracy_score(ct2, ct1)
Out[69]:
0.9505264121578321
In [62]:
import seaborn as sns
In [61]:
confusion_mtx = pd.read_csv('./table.csv', index_col=0)
In [63]:
hcl = sns.heatmap(confusion_mtx)
In [56]:
cdata.obs.groupby(['cell_type', 'Cell_subtype_unc_infered']).size().unstack().to_csv('./table.csv')
In [57]:
cdata.obs.cell_type.value_counts()
Out[57]:
T cell                                        12231
NK cell                                        7558
Macrophage                                     5906
Type II alveolar cell                          5155
Monocyte                                       3809
Fibrocyte                                      3404
Mast cell                                      3260
Vascular endothelial cell                      2533
Endothelial cell                               1677
B cell                                         1612
Type I alveolar cell                           1471
Dendritic cell                                 1437
Neutrophilic granulocyte                        919
Smooth muscle cell                              786
Club cell                                       575
Type I alveolar cell/Type II alveolar cell      451
Myeloid cell                                    412
CD8 T cell                                      372
Perineural epithelial cell                      365
Lymphatic endothelial cell                      226
Ciliated columnar cell                          200
Plasma B cell                                   175
Epithelial cell                                  35
Chondrocyte                                      30
Megakaryocyte                                    16
Name: cell_type, dtype: int64