# Source code for metacells.tools.downsample

"""
Downsample
----------
"""

from typing import Optional
from typing import Tuple
from typing import Union

import numpy as np
from anndata import AnnData  # type: ignore

import metacells.parameters as pr
import metacells.utilities as ut

__all__ = [
    "downsample_cells",
]



# [docs]  (link marker left over from the rendered HTML source listing)
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def downsample_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    downsample_min_cell_quantile: float = pr.downsample_min_cell_quantile,
    downsample_min_samples: float = pr.downsample_min_samples,
    downsample_max_cell_quantile: float = pr.downsample_max_cell_quantile,
    inplace: bool = True,
    random_seed: int,
) -> Optional[Tuple[int, ut.PandasFrame]]:
    """
    Downsample the values of ``what`` (default: {what}) data.

    Downsampling gives multiple cells the same number of samples (that is, the same number of total
    UMIs), and is an alternative to normalization (e.g., working with UMI fractions instead of raw
    UMI counts).

    This matters most when computing correlations between cells. When the total UMI count varies
    a lot between cells, normalization yields higher correlation values between cells with a
    higher total UMI count, which inflates the estimate of their similarity to other cells.
    Downsampling avoids this effect.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Unstructured Annotations
        ``downsample_samples``
            The target total number of samples in each downsampled cell.

    Variable-Observation (Gene-Cell) Annotations
        ``downsampled``
            The downsampled data where the total number of samples in each cell is at most
            ``downsample_samples``.

    If ``inplace`` (default: {inplace}), the above is written into ``adata`` and the function
    returns ``None``. Otherwise nothing is written, and the result is returned as a tuple of the
    number of samples and a pandas data frame (indexed by the cell and gene names).

    **Computation Parameters**

    1. Compute the total samples in each cell.

    2. Pick the number of samples to downsample to:

       * Start from ``downsample_min_samples`` (default: {downsample_min_samples}), the smallest
         total we consider reasonable for a cell.

       * Raise it, if possible, to the ``downsample_min_cell_quantile`` (default:
         {downsample_min_cell_quantile}) quantile of the per-cell totals, so that the target is as
         high as it can be while at most this fraction of the cells have fewer samples.

       * Lower it, if needed, to the ``downsample_max_cell_quantile`` (default:
         {downsample_max_cell_quantile}) quantile of the per-cell totals, so that at most this
         fraction of the cells have fewer samples than the target.

       The result is rounded to an integer.

    3. Downsample each cell so that it has at most the selected number of samples. Specify a non-zero
       ``random_seed`` to make this reproducible.
    """
    cell_totals = ut.get_o_numpy(adata, what, sum=True)

    # The two quantiles of the per-cell totals that bound the target from below and from above.
    low_quantile_total = np.quantile(cell_totals, downsample_min_cell_quantile)
    high_quantile_total = np.quantile(cell_totals, downsample_max_cell_quantile)

    # First lift the target up to the low quantile (never below the requested minimum), then cap it
    # by the high quantile; the cap is applied last, so it wins if the two bounds conflict.
    lifted_target = max(downsample_min_samples, low_quantile_total)
    capped_target = min(lifted_target, high_quantile_total)
    samples = int(round(capped_target))

    ut.log_calc("samples", samples)

    # Each cell is a row, so fetch the data in row-major layout and downsample per row.
    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    assert ut.shaped_dtype(cells_data) == "float32"
    downsampled = ut.downsample_matrix(cells_data, per="row", samples=samples, random_seed=random_seed)

    if not inplace:
        return samples, ut.to_pandas_frame(downsampled, index=adata.obs_names, columns=adata.var_names)

    ut.set_vo_data(adata, "downsampled", downsampled)
    ut.set_m_data(adata, "downsample_samples", samples)
    return None