# Source code for metacells.tools.similarity

"""
Cross-Similarity
----------------
"""

from typing import Optional
from typing import Union

import numpy as np
from anndata import AnnData  # type: ignore

import metacells.parameters as pr
import metacells.utilities as ut

# Public API of this module: the only names exported by a star-import.
__all__ = [
    "compute_obs_obs_similarity",
    "compute_var_var_similarity",
]


@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def compute_obs_obs_similarity(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    method: str = pr.similarity_method,
    logistics_location: float = pr.logistics_location,
    logistics_slope: float = pr.logistics_slope,
    top: Optional[int] = None,
    bottom: Optional[int] = None,
    inplace: bool = True,
    reproducible: bool,
) -> Optional[ut.PandasFrame]:
    """
    Compute a measure of the similarity between the observations (cells) of ``what`` (default: {what}).

    If ``reproducible`` is ``True``, a slower (still parallel) but reproducible algorithm will be used to compute
    Pearson correlations. This parameter has no default and must always be given (by keyword).

    The ``method`` (default: {method}) can be one of:

    * ``pearson`` for computing Pearson correlation.

    * ``abs_pearson`` for computing the absolute Pearson correlation.

    * ``repeated_pearson`` for computing correlations-of-correlations.

    * ``repeated_abs_pearson`` for computing absolute correlations-of-correlations.

    * ``logistics`` for computing the logistics function.

    * ``logistics_pearson`` for computing correlations-of-logistics.

    * ``logistics_abs_pearson`` for computing absolute correlations-of-logistics.

    If using the logistics function, use the ``logistics_slope`` (default: {logistics_slope}) and
    ``logistics_location`` (default: {logistics_location}).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Observations-Pair (cells) Annotations
        ``obs_similarity``
            A square matrix where each entry is the similarity between a pair of cells.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as a pandas data frame (indexed by the observation names).

    **Computation Parameters**

    1. If ``method`` (default: {method}) is ``logistics``, ``logistics_pearson`` or ``logistics_abs_pearson``,
       compute the mean value of the logistics function between the variables of each pair of observations (cells),
       and use one minus this value as the similarity. Otherwise, it should be ``pearson``, ``abs_pearson``,
       ``repeated_pearson`` or ``repeated_abs_pearson``, so compute the cross-correlation between all the
       observations (taking its absolute value for ``abs_pearson`` and ``repeated_abs_pearson``).

    2. If the ``method`` is ``logistics_pearson``, ``logistics_abs_pearson``, ``repeated_pearson`` or
       ``repeated_abs_pearson``, then compute the cross-correlation of the results of the previous step (taking its
       absolute value for ``logistics_abs_pearson`` and ``repeated_abs_pearson``). That is, two observations (cells)
       will be similar if they are similar to the rest of the observations (cells) in the same way. This compensates
       for the extreme sparsity of the data.

    3. If ``top`` and/or ``bottom`` are specified, keep just this number of most-similar and/or least-similar values
       in each row (turning the result into a compressed matrix format). If both are specified, their sum must not
       exceed the number of rows of the similarity matrix.
    """
    # All the work is shared with ``compute_var_var_similarity``; only the axis ("obs", per "row") differs.
    return _compute_elements_similarity(
        adata,
        "obs",
        "row",
        what,
        method=method,
        reproducible=reproducible,
        logistics_location=logistics_location,
        logistics_slope=logistics_slope,
        top=top,
        bottom=bottom,
        inplace=inplace,
    )


@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def compute_var_var_similarity(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    method: str = pr.similarity_method,
    logistics_location: float = pr.logistics_location,
    logistics_slope: float = pr.logistics_slope,
    top: Optional[int] = None,
    bottom: Optional[int] = None,
    inplace: bool = True,
    reproducible: bool,
) -> Optional[ut.PandasFrame]:
    """
    Compute a measure of the similarity between the variables (genes) of ``what`` (default: {what}).

    If ``reproducible`` is ``True``, a slower (still parallel) but reproducible algorithm will be used to compute
    Pearson correlations. This parameter has no default and must always be given (by keyword).

    The ``method`` (default: {method}) can be one of:

    * ``pearson`` for computing Pearson correlation.

    * ``abs_pearson`` for computing the absolute Pearson correlation.

    * ``repeated_pearson`` for computing correlations-of-correlations.

    * ``repeated_abs_pearson`` for computing absolute correlations-of-correlations.

    * ``logistics`` for computing the logistics function.

    * ``logistics_pearson`` for computing correlations-of-logistics.

    * ``logistics_abs_pearson`` for computing absolute correlations-of-logistics.

    If using the logistics function, use the ``logistics_slope`` (default: {logistics_slope}) and
    ``logistics_location`` (default: {logistics_location}).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Variable-Pair (genes) Annotations
        ``var_similarity``
            A square matrix where each entry is the similarity between a pair of genes.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as a pandas data frame (indexed by the variable names).

    **Computation Parameters**

    1. If ``method`` (default: {method}) is ``logistics``, ``logistics_pearson`` or ``logistics_abs_pearson``,
       compute the mean value of the logistics function between the observations of each pair of variables (genes),
       and use one minus this value as the similarity. Otherwise, it should be ``pearson``, ``abs_pearson``,
       ``repeated_pearson`` or ``repeated_abs_pearson``, so compute the cross-correlation between all the variables
       (taking its absolute value for ``abs_pearson`` and ``repeated_abs_pearson``).

    2. If the ``method`` is ``logistics_pearson``, ``logistics_abs_pearson``, ``repeated_pearson`` or
       ``repeated_abs_pearson``, then compute the cross-correlation of the results of the previous step (taking its
       absolute value for ``logistics_abs_pearson`` and ``repeated_abs_pearson``). That is, two variables (genes)
       will be similar if they are similar to the rest of the variables (genes) in the same way. This compensates
       for the extreme sparsity of the data.

    3. If ``top`` and/or ``bottom`` are specified, keep just this number of most-similar and/or least-similar values
       in each row (turning the result into a compressed matrix format). If both are specified, their sum must not
       exceed the number of rows of the similarity matrix.
    """
    # All the work is shared with ``compute_obs_obs_similarity``; only the axis ("var", per "column") differs.
    return _compute_elements_similarity(
        adata,
        "var",
        "column",
        what,
        method=method,
        reproducible=reproducible,
        logistics_location=logistics_location,
        logistics_slope=logistics_slope,
        top=top,
        bottom=bottom,
        inplace=inplace,
    )


def _compute_elements_similarity(  # pylint: disable=too-many-branches
    adata: AnnData,
    elements: str,
    per: str,
    what: Union[str, ut.Matrix],
    *,
    method: str,
    reproducible: bool,
    logistics_location: float,
    logistics_slope: float,
    top: Optional[int],
    bottom: Optional[int],
    inplace: bool,
) -> Optional[ut.PandasFrame]:
    """
    Shared implementation of the observations and variables similarity functions.

    ``elements`` is either ``obs`` or ``var`` and selects both the name of the stored annotation
    (``obs_similarity`` or ``var_similarity``) and the names used to index the returned frame. ``per`` is passed to
    the utility functions and is also used to request the ``what`` data in the matching ``<per>_major`` layout.

    The first-order similarity is either one minus the logistics function or the (optionally absolute) correlation.
    For the ``repeated_...`` and ``logistics_..._pearson`` methods, this is followed by an (optionally absolute)
    correlation of the first-order result. Finally, if ``top`` and/or ``bottom`` are given, only these per-row
    entries are kept.

    If ``inplace``, store the result in ``adata`` and return ``None``; otherwise return it as a pandas data frame.
    """
    assert elements in ("obs", "var")

    logistics_methods = ("logistics", "logistics_pearson", "logistics_abs_pearson")
    pearson_methods = ("pearson", "abs_pearson", "repeated_pearson", "repeated_abs_pearson")
    assert method in logistics_methods + pearson_methods, f"invalid similarity method: {method}"

    second_order_methods = (
        "repeated_pearson",
        "repeated_abs_pearson",
        "logistics_pearson",
        "logistics_abs_pearson",
    )

    proper = ut.get_vo_proper(adata, what, layout=f"{per}_major")
    dense = ut.to_numpy_matrix(proper)

    # First-order similarity between the elements.
    similarity: ut.ProperMatrix
    if method not in logistics_methods:
        similarity = ut.corrcoef(dense, per=per, reproducible=reproducible)
        if method in ("abs_pearson", "repeated_abs_pearson"):
            np.absolute(similarity, out=similarity)
    else:
        similarity = ut.logistics(dense, location=logistics_location, slope=logistics_slope, per=per)
        # Turn the logistics result into a similarity by computing (1 - value) in place.
        similarity *= -1
        similarity += 1

    # Second-order similarity: correlate the first-order similarity rows with each other.
    if method in second_order_methods:
        similarity = ut.corrcoef(similarity, per=None, reproducible=reproducible)
        if method in ("repeated_abs_pearson", "logistics_abs_pearson"):
            np.absolute(similarity, out=similarity)

    keep_top = top is not None
    keep_bottom = bottom is not None

    if keep_top:
        most_similar = ut.top_per(similarity, top, per="row")

    if keep_bottom:
        # Negate in place so that the least-similar entries become the largest ones, pick them, and restore
        # the sign of the picked entries.
        similarity *= -1
        least_similar = ut.top_per(similarity, bottom, per="row")
        least_similar *= -1  # type: ignore

    if keep_top and keep_bottom:
        # The two selections are summed, so they must fit within a single row.
        assert top + bottom <= similarity.shape[0]  # type: ignore
        similarity = most_similar + least_similar  # type: ignore
    elif keep_top:
        similarity = most_similar
    elif keep_bottom:
        similarity = least_similar

    if not inplace:
        names = adata.obs_names if elements == "obs" else adata.var_names
        return ut.to_pandas_frame(similarity, index=names, columns=names)

    store = ut.set_oo_data if elements == "obs" else ut.set_vv_data
    store(adata, elements + "_similarity", similarity)
    return None