Source code for python_bioinformagicks.tools._subset_by_geosketching
from typing import Optional

import anndata as ad
import numpy as np
from geosketch import gs
[docs]
def subset_by_geosketching(
    adata: ad.AnnData,
    n_cells_to_keep: Optional[int] = None,
    frac_cells_to_keep: float = 0.33,
    use_rep: str = "X_pca"
) -> np.ndarray:
    """
    Subsamples a single-cell dataset using geometric sketching
    to more fairly represent rare and common cell types.

    Defaults to keeping a fraction of cells. To use an
    exact target cell number, specify `n_cells_to_keep`, which
    will take priority over `frac_cells_to_keep`.

    If `n_cells_to_keep` is given but out of range, a message is
    printed and the target is computed from `frac_cells_to_keep`
    instead. If `frac_cells_to_keep` is itself out of range, a
    message is printed and 0.33 is used.

    References:

    * https://doi.org/10.1016/j.cels.2019.05.003
    * https://github.com/brianhie/geosketch

    Parameters
    ----------
    adata: ad.AnnData
        The original anndata object, with `use_rep` calculated
        and stored in the :code:`adata.obsm[use_rep]` slot.

    n_cells_to_keep: int (default: None)
        The number of cells to keep in the subsampled
        adata object. Must be greater than 0 and strictly less
        than :code:`len(adata.obs.index)`.

    frac_cells_to_keep: float (0,1)
        Fraction of cells to keep; overridden by
        `n_cells_to_keep` when that is not None and in range.
        The resulting cell count is truncated toward zero.

    use_rep: str (default: "X_pca")
        The latent space representation to use.
        Must be a key in :code:`adata.obsm`.

    Returns
    -------
    mask: np.ndarray of bool
        A boolean mask of length `len(adata.obs.index)` where
        `True` indicates which cells to keep after geosketching.

    Raises
    ------
    KeyError
        If `use_rep` is not a key in :code:`adata.obsm`.
    """
    fallback_frac = 0.33
    n_total = len(adata.obs.index)

    # n_cells == 0 doubles as the "no valid explicit target" marker below.
    n_cells = 0
    if n_cells_to_keep is not None:
        if 0 < n_cells_to_keep < n_total:
            n_cells = n_cells_to_keep
        else:
            print("[ERROR] n_cells_to_keep specified, but not in range (0, len(adata.obs.index))")
            # The fraction-based path below is what actually runs next, so
            # report that rather than claiming a fixed 33%.
            print("[WARN] falling back to frac_cells_to_keep")

    if n_cells == 0:
        if (frac_cells_to_keep >= 1) or (frac_cells_to_keep <= 0):
            print("[ERROR] frac_cells_to_keep is not in range (0,1)")
            print("[WARN] defaulting to 33% of cells")
            frac_cells_to_keep = fallback_frac
        # int() truncates, so very small datasets can still yield 0 here.
        n_cells = int(frac_cells_to_keep * n_total)

    # Look the representation up before any early exit so that a missing
    # key is always reported.
    X_orig = adata.obsm[use_rep]

    mask = np.zeros(n_total, dtype=np.bool_)
    if n_cells == 0:
        # Nothing to select; skip the sketching call rather than asking it
        # for zero samples.
        return mask

    # gs returns the positional indices of the sampled cells.
    mask_idx = gs(X_orig, n_cells, replace=False)
    mask[mask_idx] = True
    return mask