# Source code for metacells.tools.similarity

"""
Cross-Similarity
----------------
"""

from typing import Optional
from typing import Union

import numpy as np
from anndata import AnnData  # type: ignore

import metacells.parameters as pr
import metacells.utilities as ut

__all__ = [
    "compute_obs_obs_similarity",
    "compute_var_var_similarity",
]



# [docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def compute_obs_obs_similarity(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    method: str = pr.similarity_method,
    logistics_location: float = pr.logistics_location,
    logistics_slope: float = pr.logistics_slope,
    top: Optional[int] = None,
    bottom: Optional[int] = None,
    inplace: bool = True,
    reproducible: bool,
) -> Optional[ut.PandasFrame]:
    """
    Compute the similarity between every pair of observations (cells) of ``what`` (default: {what}).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observations-Pair (cells) Annotations
        ``obs_similarity``
            A square matrix where each entry is the similarity between a pair of cells.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas data frame (indexed by the observation names).

    **Computation Parameters**

    The ``method`` (default: {method}) must be one of:

    * ``pearson`` for computing Pearson correlation.
    * ``abs_pearson`` for computing the absolute Pearson correlation.
    * ``repeated_pearson`` for computing correlations-of-correlations.
    * ``repeated_abs_pearson`` for computing absolute correlations-of-correlations.
    * ``logistics`` for computing the logistics function.
    * ``logistics_pearson`` for computing correlations-of-logistics.
    * ``logistics_abs_pearson`` for computing absolute correlations-of-logistics.

    The ``reproducible`` flag has no default and must always be given. If it is ``True``, a slower
    (still parallel) but reproducible algorithm will be used to compute Pearson correlations.

    1. If ``method`` is one of the ``logistics`` variants, compute the mean value of the logistics
       function between the variables of each pair of observations (cells), using the
       ``logistics_slope`` (default: {logistics_slope}) and the ``logistics_location`` (default:
       {logistics_location}). Otherwise, compute the cross-correlation between all the
       observations; for ``abs_pearson`` and ``repeated_abs_pearson``, take the absolute value of
       the result.

    2. If the ``method`` is one of the ``repeated_`` variants, ``logistics_pearson`` or
       ``logistics_abs_pearson``, then compute the cross-correlation of the results of the previous
       step; for ``repeated_abs_pearson`` and ``logistics_abs_pearson``, take the absolute value of
       the result. That is, two observations (cells) will be similar if they are similar to the
       rest of the observations (cells) in the same way. This compensates for the extreme sparsity
       of the data.

    3. If ``top`` and/or ``bottom`` are specified, keep just these number of most-similar and/or
       least-similar values in each row (turning the result into a compressed matrix format).
    """
    # Observations are the rows of the data, so everything is computed per row.
    return _compute_elements_similarity(
        adata,
        elements="obs",
        per="row",
        what=what,
        method=method,
        logistics_location=logistics_location,
        logistics_slope=logistics_slope,
        top=top,
        bottom=bottom,
        inplace=inplace,
        reproducible=reproducible,
    )




# [docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def compute_var_var_similarity(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    method: str = pr.similarity_method,
    logistics_location: float = pr.logistics_location,
    logistics_slope: float = pr.logistics_slope,
    top: Optional[int] = None,
    bottom: Optional[int] = None,
    inplace: bool = True,
    reproducible: bool,
) -> Optional[ut.PandasFrame]:
    """
    Compute the similarity between every pair of variables (genes) of ``what`` (default: {what}).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable-Pair (genes) Annotations
        ``var_similarity``
            A square matrix where each entry is the similarity between a pair of genes.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas data frame (indexed by the variable names).

    **Computation Parameters**

    The ``method`` (default: {method}) must be one of:

    * ``pearson`` for computing Pearson correlation.
    * ``abs_pearson`` for computing the absolute Pearson correlation.
    * ``repeated_pearson`` for computing correlations-of-correlations.
    * ``repeated_abs_pearson`` for computing absolute correlations-of-correlations.
    * ``logistics`` for computing the logistics function.
    * ``logistics_pearson`` for computing correlations-of-logistics.
    * ``logistics_abs_pearson`` for computing absolute correlations-of-logistics.

    The ``reproducible`` flag has no default and must always be given. If it is ``True``, a slower
    (still parallel) but reproducible algorithm will be used to compute Pearson correlations.

    1. If ``method`` is one of the ``logistics`` variants, compute the mean value of the logistics
       function between each pair of variables (genes), using the ``logistics_slope`` (default:
       {logistics_slope}) and the ``logistics_location`` (default: {logistics_location}).
       Otherwise, compute the cross-correlation between all the variables; for ``abs_pearson`` and
       ``repeated_abs_pearson``, take the absolute value of the result.

    2. If the ``method`` is one of the ``repeated_`` variants, ``logistics_pearson`` or
       ``logistics_abs_pearson``, then compute the cross-correlation of the results of the previous
       step; for ``repeated_abs_pearson`` and ``logistics_abs_pearson``, take the absolute value of
       the result. That is, two variables (genes) will be similar if they are similar to the rest
       of the variables (genes) in the same way. This compensates for the extreme sparsity of the
       data.

    3. If ``top`` and/or ``bottom`` are specified, keep just these number of most-similar and/or
       least-similar values in each row (turning the result into a compressed matrix format).
    """
    # Variables are the columns of the data, so everything is computed per column.
    return _compute_elements_similarity(
        adata,
        elements="var",
        per="column",
        what=what,
        method=method,
        logistics_location=logistics_location,
        logistics_slope=logistics_slope,
        top=top,
        bottom=bottom,
        inplace=inplace,
        reproducible=reproducible,
    )



def _compute_elements_similarity(  # pylint: disable=too-many-branches
    adata: AnnData,
    elements: str,
    per: str,
    what: Union[str, ut.Matrix],
    *,
    method: str,
    reproducible: bool,
    logistics_location: float,
    logistics_slope: float,
    top: Optional[int],
    bottom: Optional[int],
    inplace: bool,
) -> Optional[ut.PandasFrame]:
    """
    Shared implementation of the observations and the variables similarity computations.

    The ``elements`` must be either ``obs`` or ``var``; it selects both the name of the written
    annotation (``obs_similarity`` or ``var_similarity``) and the names used to index the returned
    frame. The ``per`` value is used to pick the layout of the fetched data (``<per>_major``) and
    is forwarded to the first-pass ``ut.logistics`` or ``ut.corrcoef`` call.

    The first pass is either one minus the result of ``ut.logistics`` (for the ``logistics``
    methods) or ``ut.corrcoef`` (for all the other methods). The ``repeated_`` and the
    ``logistics_..._pearson`` methods then correlate the result of the first pass with itself. The
    ``abs`` methods take the absolute value after each correlation they perform.

    If ``top`` and/or ``bottom`` are given, only these entries of each row are kept (when both are
    given, their sum must not exceed the number of rows of the similarity matrix).

    If ``inplace``, the result is stored in ``adata`` and ``None`` is returned; otherwise the result
    is returned as a pandas data frame whose index and columns are the element names.
    """
    logistics_methods = ("logistics", "logistics_pearson", "logistics_abs_pearson")
    correlation_methods = ("pearson", "abs_pearson", "repeated_pearson", "repeated_abs_pearson")
    first_pass_abs_methods = ("abs_pearson", "repeated_abs_pearson")
    second_pass_methods = (
        "repeated_pearson",
        "repeated_abs_pearson",
        "logistics_pearson",
        "logistics_abs_pearson",
    )
    second_pass_abs_methods = ("repeated_abs_pearson", "logistics_abs_pearson")

    assert elements in ("obs", "var")
    assert method in logistics_methods + correlation_methods, f"invalid similarity method: {method}"

    # The computation works on a dense matrix laid out along the compared elements.
    dense = ut.to_numpy_matrix(ut.get_vo_proper(adata, what, layout=f"{per}_major"))

    # First pass: raw similarity between each pair of elements.
    similarity: ut.ProperMatrix
    if method in logistics_methods:
        similarity = ut.logistics(dense, location=logistics_location, slope=logistics_slope, per=per)
        # Replace each value by one minus itself, without allocating another matrix.
        similarity *= -1
        similarity += 1
    else:
        similarity = ut.corrcoef(dense, per=per, reproducible=reproducible)
        if method in first_pass_abs_methods:
            np.absolute(similarity, out=similarity)

    # Second pass: correlate the first-pass similarity matrix with itself.
    if method in second_pass_methods:
        similarity = ut.corrcoef(similarity, per=None, reproducible=reproducible)
        if method in second_pass_abs_methods:
            np.absolute(similarity, out=similarity)

    # Optional pruning: the top entries are collected before the matrix is negated for the bottom ones.
    if top is not None:
        top_similarity = ut.top_per(similarity, top, per="row")

    if bottom is not None:
        # Negating in place turns the lowest values into the highest ones; the collected
        # entries are negated back to restore their original sign.
        similarity *= -1
        bottom_similarity = ut.top_per(similarity, bottom, per="row")
        bottom_similarity *= -1  # type: ignore

    if top is not None and bottom is not None:
        assert top + bottom <= similarity.shape[0]
        similarity = top_similarity + bottom_similarity  # type: ignore
    elif top is not None:
        similarity = top_similarity
    elif bottom is not None:
        similarity = bottom_similarity

    if inplace:
        store = ut.set_oo_data if elements == "obs" else ut.set_vv_data
        store(adata, f"{elements}_similarity", similarity)
        return None

    names = adata.obs_names if elements == "obs" else adata.var_names
    return ut.to_pandas_frame(similarity, index=names, columns=names)