In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%load_ext line_profiler
In [2]:
import scanpy as sc
import random
from unicoord import scu
from unicoord.visualization import draw_loss_curves
import torch
from line_profiler import LineProfiler
In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
# sc.settings.set_figure_params(dpi=80, facecolor='white')
sc.settings.set_figure_params(vector_friendly=False)
scanpy==1.9.1 anndata==0.7.5 umap==0.3.10 numpy==1.17.3 scipy==1.7.3 pandas==1.3.5 scikit-learn==0.22 statsmodels==0.10.2 python-igraph==0.7.1 louvain==0.6.1 leidenalg==0.7.0

load liver cancer data

In [23]:
adata = sc.read_h5ad('h5ad/Liver_cancer.pp.h5ad')
In [24]:
adata
Out[24]:
AnnData object with n_obs × n_vars = 47497 × 1925
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'
In [25]:
sc.pl.embedding(adata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['leiden','Type','S_ID','Sample'], ncols=2)
In [5]:
adata = adata.raw.to_adata()
sc.pp.normalize_total(adata, target_sum=1e4 ,exclude_highly_expressed= True)
sc.pp.log1p(adata)
adata
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['S100A9', 'S100A8', 'S100A6', 'RPS27', 'APOA2', 'RGS5', 'CHI3L1', 'REG1A', 'REG3A', 'TMSB10', 'GNLY', 'FABP1', 'IGKC', 'CCL20', 'MTRNR2L12', 'TF', 'APOD', 'IGFBP7', 'JCHAIN', 'ALB', 'SPP1', 'FGG', 'SPINK1', 'HLA-DRA', 'ACTB', 'IGFBP1', 'SERPINE1', 'TMSB4X', 'TIMP1', 'MTRNR2L10', 'FABP4', 'CCL19', 'CCL21', 'TXN', 'ORM1', 'HSPA5', 'HBB', 'HBG2', 'MTRNR2L8', 'SAA2', 'SAA1', 'PGA5', 'FTH1', 'NEAT1', 'MALAT1', 'APOC3', 'APOA1', 'ACTA2', 'GAPDH', 'IFNG', 'LYZ', 'NTS', 'LUM', 'GZMB', 'SERPINA1', 'HSP90AA1', 'IGHA2', 'IGHG4', 'IGHG2', 'IGHGP', 'IGHA1', 'IGHG1', 'IGHG3', 'IGHD', 'IGHM', 'IGHV3-23', 'B2M', 'HBA2', 'HBA1', 'TPSB2', 'TPSAB1', 'MT2A', 'MT1G', 'MT1X', 'HP', 'PLCG2', 'MTRNR2L1', 'CCL3', 'CCL4', 'CCL3L3', 'CCL4L2', 'COL1A1', 'APOH', 'TTR', 'APOE', 'APOC1', 'FTL', 'IGLL5', 'IGLC2', 'IGLC3', 'TFF3', 'TFF1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP6', 'MT-CO3']
    finished (0:00:00)
Out[5]:
AnnData object with n_obs × n_vars = 47497 × 18667
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'S_ID', 'Sample', 'Cell', 'Type'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'S_ID_colors', 'Sample_colors', 'Type_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'umap', 'log1p'
    obsm: 'X_pca', 'X_umap'

supervise batch

In [6]:
scu.model_unicoord_in_adata(adata, n_cont=30, n_diff=0, n_clus = [],
                            obs_fitting=['S_ID'])
In [7]:
scu.train_unicoord_in_adata(adata, epochs=100, chunk_size=20000, slot = "cur")
/home/gaohx/anaconda3/envs/scvi/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.remove_unused_categories is deprecated and will be removed in a future version.
  res = method(*args, **kwargs)
training chunk 1 / 3 of the data
training chunk 2 / 3 of the data
training chunk 3 / 3 of the data

In [9]:
fig = draw_loss_curves(adata.uns['unc_stuffs']['loss'])
# if save_figs:
#     fig.savefig(os.path.join(savePath, 'img', 'fig1_lossCurves.png'))
fig.show()
In [10]:
scu.embed_unicoord_in_adata(adata, chunk_size=5000)
In [11]:
sc.pp.neighbors(adata, use_rep='unicoord')
computing neighbors
/home/gaohx/anaconda3/envs/scvi/lib/python3.7/site-packages/numba/typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../../home/gaohx/anaconda3/envs/scvi/lib/python3.7/site-packages/umap/rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  state.func_ir.loc))
/home/gaohx/anaconda3/envs/scvi/lib/python3.7/site-packages/umap/nndescent.py:92: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../../home/gaohx/anaconda3/envs/scvi/lib/python3.7/site-packages/umap/utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^

  current_graph, n_vertices, n_neighbors, max_candidates, rng_state
/home/gaohx/anaconda3/envs/scvi/lib/python3.7/site-packages/numba/typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../../home/gaohx/anaconda3/envs/scvi/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  state.func_ir.loc))
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:09)
In [12]:
sc.tl.leiden(adata, resolution=0.5)
running Leiden clustering
    finished: found 20 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:18)
In [13]:
sc.tl.umap(adata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:44)
In [14]:
sc.pl.embedding(adata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['leiden','Type','S_ID','Sample'], ncols=2)

not supervise cell type

In [15]:
scu.model_unicoord_in_adata(adata, n_cont=30, n_diff=0, n_clus = [],
                            obs_fitting=[])
In [16]:
scu.train_unicoord_in_adata(adata, epochs=100, chunk_size=20000, slot = "cur")
/home/gaohx/anaconda3/envs/scvi/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.remove_unused_categories is deprecated and will be removed in a future version.
  res = method(*args, **kwargs)
training chunk 1 / 3 of the data
training chunk 2 / 3 of the data
training chunk 3 / 3 of the data

In [17]:
fig = draw_loss_curves(adata.uns['unc_stuffs']['loss'])
# if save_figs:
#     fig.savefig(os.path.join(savePath, 'img', 'fig1_lossCurves.png'))
fig.show()
In [18]:
scu.embed_unicoord_in_adata(adata, chunk_size=5000)
In [19]:
sc.pp.neighbors(adata, use_rep='unicoord')
computing neighbors
/home/gaohx/anaconda3/envs/scvi/lib/python3.7/site-packages/numba/typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../../home/gaohx/anaconda3/envs/scvi/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  state.func_ir.loc))
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:06)
In [20]:
sc.tl.leiden(adata, resolution=0.5)
running Leiden clustering
    finished: found 49 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:09)
In [21]:
sc.tl.umap(adata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:51)
In [22]:
sc.pl.embedding(adata, 'X_umap',legend_loc='on data', legend_fontsize=10,
                color=['leiden','Type','S_ID','Sample'], ncols=2)