# Source code for metacells.tools.properly_sampled

"""
Properly Sampled
----------------
"""

from typing import Optional
from typing import Union

import numpy as np
from anndata import AnnData  # type: ignore

import metacells.parameters as pr
import metacells.utilities as ut

# The public functions defined by this module.
__all__ = [
    "compute_excluded_gene_umis",
    "find_properly_sampled_cells",
    "find_properly_sampled_genes",
]


def compute_excluded_gene_umis(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
) -> None:
    """
    Sum, for each cell, the ``what`` data of the genes selected by the ``excluded_gene`` mask.

    The ``what`` data is fetched from ``adata``, restricted to the columns flagged by the per-variable
    ``excluded_gene`` annotation, and summed per row. The result is stored in ``adata`` as the per-observation
    ``excluded_umis`` annotation; nothing is returned.
    """
    # The data is requested in column-major layout before slicing out the excluded gene columns.
    all_umis = ut.get_vo_proper(adata, what, layout="column_major")
    is_excluded_gene = ut.get_v_numpy(adata, "excluded_gene")

    # The slice is converted to row-major layout before computing the per-row (per-cell) sums.
    excluded_umis = ut.to_layout(all_umis[:, is_excluded_gene], layout="row_major")
    ut.set_o_data(adata, "excluded_umis", ut.sum_per(excluded_umis, per="row"))


@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def find_properly_sampled_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_cell_total: Optional[int],
    max_cell_total: Optional[int],
    max_excluded_genes_fraction: Optional[float],
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect cells with a "proper" amount of ``what`` (default: {what}) data.

    Due to both technical effects and natural variance between cells, the total number of UMIs varies from cell to
    cell. We often would like to work on cells that contain a sufficient number of UMIs for meaningful analysis; we
    sometimes also wish to exclude cells which have "too many" UMIs.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Observation (Cell) Annotations
        ``properly_sampled_cell``
            A boolean mask indicating whether each cell has a "proper" amount of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise this
    is returned as a pandas series (indexed by the observation names).

    **Computation Parameters**

    1. Exclude all cells whose total data is less than the ``min_cell_total`` (no default), unless it is ``None``.

    2. Exclude all cells whose total data is more than the ``max_cell_total`` (no default), unless it is ``None``.

    3. If ``max_excluded_genes_fraction`` (no default) is not ``None``, then exclude all cells whose sum of the
       excluded data (as defined by the ``excluded_gene`` mask) divided by the total data is more than the specified
       threshold. If the ``excluded_umis`` per-cell data does not exist yet, it is computed first. Cells whose total
       data is zero have an undefined fraction and are excluded by this step.
    """
    total_umis_per_cell = ut.get_o_numpy(adata, what, sum=True)
    cells_mask = np.full(adata.n_obs, True, dtype="bool")

    if min_cell_total is not None:
        cells_mask = cells_mask & (total_umis_per_cell >= min_cell_total)

    if max_cell_total is not None:
        cells_mask = cells_mask & (total_umis_per_cell <= max_cell_total)

    if max_excluded_genes_fraction is not None:
        if not ut.has_data(adata, "excluded_umis"):
            compute_excluded_gene_umis(adata, what)
        excluded_umis_per_cell = ut.get_o_numpy(adata, "excluded_umis")

        # A cell with a zero total yields a NaN (or infinite) fraction. Silence the floating point
        # warnings/errors this would otherwise trigger; such a fraction compares as not being within
        # the threshold, so these cells are excluded from the mask.
        with np.errstate(divide="ignore", invalid="ignore"):
            excluded_umis_fraction_per_cell = excluded_umis_per_cell / total_umis_per_cell
            is_within_fraction = excluded_umis_fraction_per_cell <= max_excluded_genes_fraction
        cells_mask = cells_mask & is_within_fraction

    if inplace:
        ut.set_o_data(adata, "properly_sampled_cell", cells_mask)
        return None

    ut.log_return("properly_sampled_cell", cells_mask)
    return ut.to_pandas_series(cells_mask, index=adata.obs_names)


@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def find_properly_sampled_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_total: int = pr.properly_sampled_min_gene_total,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect genes with a "proper" amount of ``what`` (default: {what}) data.

    Due to both technical effects and natural variance between genes, the expression of genes varies greatly between
    cells. This is exactly the information we are trying to analyze. We often would like to work on genes that have a
    sufficient level of expression for meaningful analysis. Specifically, it doesn't make sense to analyze genes that
    have zero expression in all the cells.

    .. todo::

        Provide additional optional criteria for "properly sampled genes"?

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Variable (Gene) Annotations
        ``properly_sampled_gene``
            A boolean mask indicating whether each gene has a "proper" number of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data and the function returns ``None``. Otherwise this
    is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Exclude all genes whose total data is less than the ``min_gene_total`` (default: {min_gene_total}).
    """
    total_of_genes = ut.get_v_numpy(adata, what, sum=True)
    genes_mask = total_of_genes >= min_gene_total

    if inplace:
        ut.set_v_data(adata, "properly_sampled_gene", genes_mask)
        return None

    ut.log_return("properly_sampled_gene", genes_mask)
    # The mask holds one entry per gene (variable), so it is indexed by the variable names rather
    # than by the observation names.
    return ut.to_pandas_series(genes_mask, index=adata.var_names)