Source code for metacells.tools.properly_sampled

"""
Properly Sampled
----------------
"""

from typing import Optional
from typing import Union

import numpy as np
from anndata import AnnData  # type: ignore

import metacells.parameters as pr
import metacells.utilities as ut

# Names exported as the public API of this module.
__all__ = [
    "compute_excluded_gene_umis",
    "find_properly_sampled_cells",
    "find_properly_sampled_genes",
]



[docs]
def compute_excluded_gene_umis(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
) -> None:
    """
    Sum, for each cell, the ``what`` data of the genes selected by the ``excluded_gene`` mask.

    The per-cell sums are stored in ``adata`` under the name ``excluded_umis``; nothing is returned.
    """
    data_in_columns = ut.get_vo_proper(adata, what, layout="column_major")
    is_excluded_gene = ut.get_v_numpy(adata, "excluded_gene")

    # Keep only the excluded genes' columns, and switch to a row-major layout before summing per row.
    excluded_data_in_rows = ut.to_layout(data_in_columns[:, is_excluded_gene], layout="row_major")

    ut.set_o_data(adata, "excluded_umis", ut.sum_per(excluded_data_in_rows, per="row"))




[docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def find_properly_sampled_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_cell_total: Optional[int],
    max_cell_total: Optional[int],
    max_excluded_genes_fraction: Optional[float],
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find the cells that have a "proper" amount of ``what`` (default: {what}) data.

    The total number of UMIs differs between cells, both for technical reasons and because of the natural
    variance between cells. Meaningful analysis typically requires cells with a sufficient number of UMIs, and
    sometimes also requires excluding cells that have "too many" UMIs.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``properly_sampled_cell``
            A boolean mask indicating whether each cell has a "proper" amount of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the observation names).

    **Computation Parameters**

    1. If ``min_cell_total`` (no default) is not ``None``, exclude all cells whose total data is less than it.

    2. If ``max_cell_total`` (no default) is not ``None``, exclude all cells whose total data is more than it.

    3. If ``max_excluded_genes_fraction`` (no default) is not ``None``, exclude all cells whose sum of the excluded
       data (as defined by the ``excluded_gene`` mask) divided by the total data is more than this threshold. The
       per-cell ``excluded_umis`` are computed first if they are not already available.
    """
    total_per_cell = ut.get_o_numpy(adata, what, sum=True)

    # Start by keeping every cell; each enabled criterion below can only remove cells.
    keep_cells = np.full(adata.n_obs, True, dtype="bool")

    if min_cell_total is not None:
        has_enough = total_per_cell >= min_cell_total
        keep_cells = keep_cells & has_enough

    if max_cell_total is not None:
        not_too_many = total_per_cell <= max_cell_total
        keep_cells = keep_cells & not_too_many

    if max_excluded_genes_fraction is not None:
        # Reuse existing ``excluded_umis`` if present, otherwise compute them from ``what``.
        if not ut.has_data(adata, "excluded_umis"):
            compute_excluded_gene_umis(adata, what)
        excluded_per_cell = ut.get_o_numpy(adata, "excluded_umis")

        # NOTE(review): presumably a cell whose total is zero gives NaN/inf here, which fails the
        # comparison below and so excludes the cell - confirm this is the intended behavior.
        excluded_fraction_per_cell = excluded_per_cell / total_per_cell
        low_excluded_fraction = excluded_fraction_per_cell <= max_excluded_genes_fraction
        keep_cells = keep_cells & low_excluded_fraction

    if not inplace:
        ut.log_return("properly_sampled_cell", keep_cells)
        return ut.to_pandas_series(keep_cells, index=adata.obs_names)

    ut.set_o_data(adata, "properly_sampled_cell", keep_cells)
    return None




[docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def find_properly_sampled_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_total: int = pr.properly_sampled_min_gene_total,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect genes with a "proper" amount of ``what`` (default: {what}) data.

    Due to both technical effects and natural variance between genes, the expression of genes varies
    greatly between cells. This is exactly the information we are trying to analyze. We often would
    like to work on genes that have a sufficient level of expression for meaningful analysis.
    Specifically, it doesn't make sense to analyze genes that have zero expression in all the cells.

    .. todo::

        Provide additional optional criteria for "properly sampled genes"?

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``properly_sampled_gene``
            A boolean mask indicating whether each gene has a "proper" number of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Exclude all genes whose total data is less than the ``min_gene_total`` (default:
       {min_gene_total}).
    """
    total_of_genes = ut.get_v_numpy(adata, what, sum=True)

    genes_mask = total_of_genes >= min_gene_total

    if inplace:
        ut.set_v_data(adata, "properly_sampled_gene", genes_mask)
        return None

    ut.log_return("properly_sampled_gene", genes_mask)
    # The mask holds one entry per gene (variable), so it must be indexed by the variable names rather
    # than the observation names.
    return ut.to_pandas_series(genes_mask, index=adata.var_names)