Source code for metacells.pipeline.umap

"""
UMAP
----

It is useful to project the metacells data to a 2D scatter plot (where each point is a metacell).
The steps provided here are expected to yield a reasonable such projection, though as always you
might need to tweak the parameters or even the overall flow for specific data sets.
"""

from re import Pattern
from typing import Collection
from typing import Optional
from typing import Tuple
from typing import Union

import igraph as ig  # type: ignore
import numpy as np
from anndata import AnnData  # type: ignore
from scipy import sparse  # type: ignore

import metacells.parameters as pr
import metacells.tools as tl
import metacells.utilities as ut

__all__ = [
    "compute_knn_by_markers",
    "compute_umap_by_markers",
]



[docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def compute_knn_by_markers(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    marker_gene_names: Optional[Collection[str]] = None,
    marker_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    max_marker_genes: Optional[int] = pr.umap_max_marker_genes,
    ignore_lateral_genes: bool = pr.umap_ignore_lateral_genes,
    similarity_value_regularization: float = pr.umap_similarity_value_regularization,
    similarity_log_data: bool = pr.umap_similarity_log_data,
    similarity_method: str = pr.umap_similarity_method,
    logistics_location: float = pr.logistics_location,
    logistics_slope: float = pr.logistics_slope,
    k: int,
    balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    min_outgoing_degree: int = pr.markers_knn_min_outgoing_degree,
    reproducible: bool,
) -> ut.PandasFrame:
    """
    Compute KNN graph between metacells based on marker genes.

    If ``reproducible`` is ``True``, a slower (still parallel) but
    reproducible algorithm will be used to compute Pearson correlations.

    **Input**

    Annotated ``adata`` where each observation is a metacells and the variables are genes, are genes, where ``what`` is
    a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix. Should contain a ``marker_gene`` mask unless explicitly specifying the marker genes.

    **Returns**

    Sets the following in ``adata``:

    Observations-Pair (Metacells) Annotations
        ``obs_outgoing_weights``
            A sparse square matrix where each non-zero entry is the weight of an edge between a pair
            of cells or genes, where the sum of the weights of the outgoing edges for each element
            is 1 (there is always at least one such edge).

    Also return a pandas data frame of the similarities between the observations (metacells).

    **Computation Parameters**

    1. If ``marker_gene_names`` and/or ``marker_gene_patterns`` were specified, use the matching genes.
       Otherwise, use the ``marker_gene`` mask.

    2. If ``ignore_lateral_genes`` (default: {ignore_lateral_genes}), then remove any genes marked as lateral
       from the mask.

    3. If ``max_marker_genes`` (default: {max_marker_genes}) is not ``None``, then pick this number
       of marker genes with the highest variance.

    4. Compute the fractions of each ``marker_gene`` in each cell, and add the
       ``similarity_value_regularization`` (default: {similarity_value_regularization}) to it.

    5. If ``similarity_log_data`` (default: {similarity_log_data}), invoke the
       :py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of
       the data.

    6. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` using
       ``similarity_method`` (default: {similarity_method}), ``logistics_location`` (default:
       {logistics_slope}) and ``logistics_slope`` (default: {logistics_slope}) and convert this
       to distances.

    7. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` using the distances,
       ``k`` (no default!), ``balanced_ranks_factor`` (default: {balanced_ranks_factor}),
       ``incoming_degree_factor`` (default: {incoming_degree_factor}), ``outgoing_degree_factor``
       (default: {outgoing_degree_factor}) and ``min_outgoing_degree`` (default: {min_outgoing_degree})
       to compute a "skeleton" graph to overlay on top of the UMAP graph.
    """
    assert max_marker_genes is None or max_marker_genes > 0
    if marker_gene_names is None and marker_gene_patterns is None:
        marker_genes_mask = ut.get_v_numpy(adata, "marker_gene")
    else:
        marker_genes_series = tl.find_named_genes(adata, names=marker_gene_names, patterns=marker_gene_patterns)
        assert marker_genes_series is not None
        marker_genes_mask = ut.to_numpy_vector(marker_genes_series)

    if ignore_lateral_genes:
        lateral_genes_mask = ut.get_v_numpy(adata, "lateral_gene")
        marker_genes_mask = marker_genes_mask & ~lateral_genes_mask
        ut.log_calc("marker_genes_mask", marker_genes_mask)

    all_fractions = ut.get_vo_proper(adata, what, layout="row_major")

    index_per_marker_gene = np.where(marker_genes_mask)[0]
    fraction_per_metacell_per_marker_gene = ut.to_numpy_matrix(all_fractions[:, index_per_marker_gene])

    if max_marker_genes is not None and max_marker_genes < len(index_per_marker_gene):
        fraction_per_metacell_per_marker_gene = ut.to_layout(
            fraction_per_metacell_per_marker_gene, layout="column_major"
        )
        variance_per_marker_gene = ut.variance_per(fraction_per_metacell_per_marker_gene, per="column")
        variance_per_marker_gene *= -1
        chosen_positions = np.argpartition(variance_per_marker_gene, max_marker_genes)[:max_marker_genes]
        ut.log_calc("chosen_positions", chosen_positions)
        index_per_marker_gene = index_per_marker_gene[chosen_positions]
        fraction_per_metacell_per_marker_gene = fraction_per_metacell_per_marker_gene[:, chosen_positions]

    fraction_per_metacell_per_marker_gene = ut.to_layout(fraction_per_metacell_per_marker_gene, layout="row_major")
    fraction_per_metacell_per_marker_gene += similarity_value_regularization

    if similarity_log_data:
        fraction_per_metacell_per_marker_gene = ut.log_data(fraction_per_metacell_per_marker_gene, base=2)

    tdata = ut.slice(adata, vars=index_per_marker_gene)
    similarities = tl.compute_obs_obs_similarity(
        tdata,
        fraction_per_metacell_per_marker_gene,
        method=similarity_method,
        reproducible=reproducible,
        logistics_location=logistics_location,
        logistics_slope=logistics_slope,
        inplace=False,
    )
    assert similarities is not None

    tl.compute_obs_obs_knn_graph(
        adata,
        similarities,
        k=k,
        balanced_ranks_factor=balanced_ranks_factor,
        incoming_degree_factor=incoming_degree_factor,
        outgoing_degree_factor=outgoing_degree_factor,
        min_outgoing_degree=min_outgoing_degree,
    )

    return similarities




[docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def compute_umap_by_markers(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    marker_gene_names: Optional[Collection[str]] = None,
    marker_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    max_marker_genes: Optional[int] = pr.umap_max_marker_genes,
    ignore_lateral_genes: bool = pr.umap_ignore_lateral_genes,
    similarity_value_regularization: float = pr.umap_similarity_value_regularization,
    similarity_log_data: bool = pr.umap_similarity_log_data,
    similarity_method: str = pr.umap_similarity_method,
    logistics_location: float = pr.logistics_location,
    logistics_slope: float = pr.logistics_slope,
    skeleton_k: int = pr.skeleton_k,
    balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    min_outgoing_degree: int = pr.markers_knn_min_outgoing_degree,
    umap_k: int = pr.umap_k,
    dimensions: int = 2,
    min_dist: float = pr.umap_min_dist,
    spread: float = pr.umap_spread,
    random_seed: int,
) -> None:
    """
    Compute a UMAP projection of the (meta)cells.

    **Input**

    Annotated ``adata`` where each observation is a metacells and the variables are genes, are genes, where ``what`` is
    a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix. Should contain a ``marker_gene`` mask unless explicitly specifying the marker genes.

    **Returns**

    Sets the following annotations in ``adata``:

    Observation-Observation (Metacell-Metacell) Annotations
        ``umap_distances``
            A sparse symmetric matrix of the graph of distances between the metacells.

    Observation (Metacell) Annotations
        ``x``, ``y`` (if ``dimensions`` is 2)
            The X and Y coordinates of each metacell in the UMAP projection.

        ``u``, ``v``, ``w`` (if ``dimensions`` is 3)
            The U, V, W coordinates of each metacell in the UMAP projection.

    **Computation Parameters**

    1. Invoke :py:func:`metacells.pipeline.umap.compute_knn_by_markers` using
       ``marker_gene_names`` (default: {marker_gene_names}), ``marker_gene_patterns`` (default: {marker_gene_patterns}),
       ``max_marker_genes`` (default: {max_marker_genes}), ``ignore_lateral_genes`` (default: {ignore_lateral_genes}),
       ``similarity_value_regularization`` (default: {similarity_value_regularization}), ``similarity_log_data``
       (default: {similarity_log_data}), ``similarity_method`` (default: {similarity_method}), ``logistics_location``
       (default: {logistics_location}), ``logistics_slope`` (default: {logistics_slope}), ``skeleton_k`` (default:
       {skeleton_k}), ``balanced_ranks_factor`` (default: {balanced_ranks_factor}), ``incoming_degree_factor`` (default:
       {incoming_degree_factor}) and ``outgoing_degree_factor`` (default: {outgoing_degree_factor}) and
       ``min_outgoing_degree`` (default: {min_outgoing_degree}) to compute a "skeleton" graph to overlay on top of the
       UMAP graph. Specify a non-zero ``random_seed`` to make this reproducible.

    2. Invoke :py:func:`metacells.tools.layout.umap_by_distances` using the distances, ``umap_k``
       (default: {umap_k}), ``min_dist`` (default: {min_dist}), ``spread`` (default: {spread}),
       dimensions (default: {dimensions}) and ``random_seed``.

    .. note::

        Keep in mind that the KNN graph used by UMAP (controlled by ``umap_k``) is *not* identical to the KNN graph we
        compute (controlled by ``skeleton_k``). By default, we choose ``skeleton_k < umap_k``, as the purpose of the
        skeleton KNN is to highlight the "strong" structure of the data; in practice this strong skeleton is highly
        compatible with the structure used by UMAP, so it serves it purpose reasonably well. It would have been nice to
        make these compatible, but UMAP is not friendly towards dictating a KNN graph from the outside.
    """
    similarities = compute_knn_by_markers(
        adata,
        what,
        marker_gene_names=marker_gene_names,
        marker_gene_patterns=marker_gene_patterns,
        max_marker_genes=max_marker_genes,
        ignore_lateral_genes=ignore_lateral_genes,
        similarity_value_regularization=similarity_value_regularization,
        similarity_log_data=similarity_log_data,
        similarity_method=similarity_method,
        logistics_location=logistics_location,
        logistics_slope=logistics_slope,
        k=skeleton_k,
        balanced_ranks_factor=balanced_ranks_factor,
        incoming_degree_factor=incoming_degree_factor,
        outgoing_degree_factor=outgoing_degree_factor,
        min_outgoing_degree=min_outgoing_degree,
        reproducible=(random_seed != 0),
    )

    distances = ut.to_numpy_matrix(similarities)
    distances *= -1
    distances += 1
    np.fill_diagonal(distances, 0.0)
    distances = sparse.csr_matrix(distances)

    ut.set_oo_data(adata, "umap_distances", distances)

    tl.umap_by_distances(
        adata, distances, k=umap_k, dimensions=dimensions, min_dist=min_dist, spread=spread, random_seed=random_seed
    )



def _build_igraph(edge_weights: ut.Matrix) -> Tuple[ig.Graph, ut.NumpyVector]:
    edge_weights = ut.to_proper_matrix(edge_weights)
    assert edge_weights.shape[0] == edge_weights.shape[1]
    size = edge_weights.shape[0]

    sources, targets = edge_weights.nonzero()
    weights_array = ut.to_numpy_vector(edge_weights[sources, targets]).astype("float64")

    graph = ig.Graph(directed=True)
    graph.add_vertices(size)
    graph.add_edges(list(zip(sources, targets)))
    graph.es["weight"] = weights_array

    return graph, weights_array