Source code for metacells.tools.distinct

""",
Distincts
---------
"""

import sys
from typing import Optional
from typing import Tuple
from typing import Union

import numpy as np
from anndata import AnnData  # type: ignore

import metacells.parameters as pr
import metacells.utilities as ut

if "sphinx" not in sys.argv[0]:
    import metacells.extensions as xt  # type: ignore

__all__ = [
    "compute_distinct_folds",
    "find_distinct_genes",
    "compute_subset_distinct_genes",
]


[docs] @ut.logged() @ut.timed_call() @ut.expand_doc() def compute_distinct_folds( adata: AnnData, what: Union[str, ut.Matrix] = "__x__", *, normalization: float = 0, inplace: bool = True, ) -> Optional[ut.PandasFrame]: """ Compute for each observation (cell) and each variable (gene) how much is the ``what`` (default: {what}) value different from the overall population. **Input** Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a matrix. **Returns** Per-Observation-Per-Variable (Cell-Gene) Annotations: ``distinct_ratio`` For each gene in each cell, the log based 2 of the ratio between the fraction of the gene in the cell and the fraction of the gene in the overall population (sum of cells). If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise this is returned as a pandas frame (indexed by the observation and distinct gene rank). **Computation Parameters** 1. Compute, for each gene, the fraction of the gene's values out of the total sum of the values (that is, the mean fraction of the gene's expression in the population). 2. Compute, for each cell, for each gene, the fraction of the gene's value out of the sum of the values in the cell (that is, the fraction of the gene's expression in the cell). 3. Divide the two to the distinct ratio (that is, how much the gene's expression in the cell is different from the overall population), first adding the ``normalization`` (default: {normalization}) to both. 4. Compute the log (base 2) of the result and use it as the fold factor. """ columns_data = ut.get_vo_proper(adata, what, layout="column_major") fractions_of_genes_in_data = ut.fraction_per(columns_data, per="column") fractions_of_genes_in_data += normalization total_umis_of_cells = ut.get_o_numpy(adata, what, sum=True) total_umis_of_cells[total_umis_of_cells == 0] = 1 rows_data = ut.get_vo_proper(adata, what, layout="row_major") fraction_of_genes_in_cells = ut.to_numpy_matrix(rows_data) / total_umis_of_cells[:, np.newaxis] fraction_of_genes_in_cells += normalization zeros_mask = fractions_of_genes_in_data <= 0 fractions_of_genes_in_data[zeros_mask] = -1 fraction_of_genes_in_cells[:, zeros_mask] = -1 ratio_of_genes_in_cells = fraction_of_genes_in_cells ratio_of_genes_in_cells /= fractions_of_genes_in_data assert np.min(np.min(ratio_of_genes_in_cells)) > 0 fold_of_genes_in_cells = np.log2(ratio_of_genes_in_cells, out=ratio_of_genes_in_cells) if inplace: ut.set_vo_data(adata, "distinct_fold", fold_of_genes_in_cells) return None return ut.to_pandas_frame(fold_of_genes_in_cells, index=adata.obs_names, columns=adata.var_names)
[docs] @ut.logged() @ut.timed_call() @ut.expand_doc() def find_distinct_genes( adata: AnnData, what: Union[str, ut.Matrix] = "distinct_fold", *, distinct_genes_count: int = pr.distinct_genes_count, inplace: bool = True, ) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]: """ Find for each observation (cell) the genes in which its ``what`` (default: {what}) value is most distinct from the general population. This is typically applied to the metacells data rather than to the cells data. **Input** Annotated ``adata``, where the observations are (mata)cells and the variables are genes, including a per-observation-per-variable annotated folds data, {what}), e.g. as computed by :py:func:`compute_distinct_folds`. **Returns** Observation-Any (Cell) Annotations ``cell_distinct_gene_indices`` For each cell, the indices of its top ``distinct_genes_count`` genes. ``cell_distinct_gene_folds`` For each cell, the fold factor of its top ``distinct_genes_count``. If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise this is returned as two pandas frames (indexed by the observation and distinct gene rank). **Computation Parameters** 1. Fetch the previously computed per-observation-per-variable ``what`` data. 2. Keep the ``distinct_genes_count`` (default: {distinct_genes_count}) top absolute fold factors. """ assert 0 < distinct_genes_count < adata.n_vars distinct_gene_indices = np.empty((adata.n_obs, distinct_genes_count), dtype="int32") distinct_gene_folds = np.empty((adata.n_obs, distinct_genes_count), dtype="float32") fold_in_cells = ut.mustbe_numpy_matrix(ut.get_vo_proper(adata, what, layout="row_major")) extension_name = f"top_distinct_{fold_in_cells.dtype}_t" extension = getattr(xt, extension_name) extension(distinct_gene_indices, distinct_gene_folds, fold_in_cells) if inplace: ut.set_oa_data(adata, "cell_distinct_gene_indices", distinct_gene_indices) ut.set_oa_data(adata, "cell_distinct_gene_folds", distinct_gene_folds) return None return ( ut.to_pandas_frame(distinct_gene_indices, index=adata.obs_names), ut.to_pandas_frame(distinct_gene_folds, index=adata.obs_names), )
[docs] @ut.logged() @ut.timed_call() @ut.expand_doc() def compute_subset_distinct_genes( adata: AnnData, what: Union[str, ut.Matrix] = "__x__", *, prefix: Optional[str] = None, scale: Optional[Union[bool, str, ut.NumpyVector]], subset: Union[str, ut.NumpyVector], normalization: float, ) -> Optional[Tuple[ut.PandasSeries, ut.PandasSeries]]: """ Given a subset of the observations (cells), compute for each gene how distinct its ``what`` (default: {what}) value is in the subset compared to the overall population. This is the area-under-curve of the receiver operating characteristic (AUROC) for the gene, that is, the probability that a randomly selected observation (cell) in the subset will have a higher value than a randomly selected observation (cell) outside the subset. **Input** Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a matrix. **Returns** Variable (Gene) Annotations ``<prefix>_fold`` Store the ratio of the expression of the gene in the subset as opposed to the rest of the population. ``<prefix>_auroc`` Store the distinctiveness of the gene in the subset as opposed to the rest of the population. If ``prefix`` (default: {prefix}), is specified, this is written to the data. Otherwise this is returned as two pandas series (indexed by the gene names). **Computation Parameters** 1. Use the ``subset`` to assign a boolean label to each observation (cell). The ``subset`` can be a vector of integer observation names, or a boolean mask, or the string name of a per-observation annotation containing the boolean mask. 2. If ``scale`` is ``False``, use the data as-is. If it is ``True``, divide the data by the sum of each observation (cell). If it is a string, it should be the name of a per-observation annotation to use. Otherwise, it should be a vector of the scale factor for each observation (cell). 3. Compute the fold ratios using the ``normalization`` (no default!) and the AUROC for each gene, for the scaled data based on this mask. """ if isinstance(subset, str): subset = ut.get_o_numpy(adata, subset) if subset.dtype != "bool": mask: ut.NumpyVector = np.full(adata.n_obs, False) mask[subset] = True subset = mask scale_of_cells: Optional[ut.NumpyVector] = None if not isinstance(scale, bool): scale_of_cells = ut.maybe_o_numpy(adata, scale, formatter=ut.sizes_description) elif scale: scale_of_cells = ut.get_o_numpy(adata, what, sum=True) else: scale_of_cells = None matrix = ut.get_vo_proper(adata, what, layout="column_major").transpose() fold_of_genes, auroc_of_genes = ut.matrix_rows_folds_and_aurocs( matrix, columns_subset=subset, columns_scale=scale_of_cells, normalization=normalization ) if prefix is not None: ut.set_v_data(adata, f"{prefix}_auroc", auroc_of_genes) ut.set_v_data(adata, f"{prefix}_fold", fold_of_genes) return None return ( ut.to_pandas_series(fold_of_genes, index=adata.var_names), ut.to_pandas_series(auroc_of_genes, index=adata.var_names), )