"""
UMAP
----
It is useful to project the metacells data to a 2D scatter plot (where each point is a metacell).
The steps provided here are expected to yield a reasonable such projection, though as always you
might need to tweak the parameters or even the overall flow for specific data sets.
"""
from re import Pattern
from typing import Collection
from typing import Optional
from typing import Tuple
from typing import Union
import igraph as ig # type: ignore
import numpy as np
from anndata import AnnData # type: ignore
from scipy import sparse # type: ignore
import metacells.parameters as pr
import metacells.tools as tl
import metacells.utilities as ut
__all__ = [
"compute_knn_by_markers",
"compute_umap_by_markers",
]
[docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def compute_knn_by_markers(
adata: AnnData,
what: Union[str, ut.Matrix] = "__x__",
*,
marker_gene_names: Optional[Collection[str]] = None,
marker_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
max_marker_genes: Optional[int] = pr.umap_max_marker_genes,
ignore_lateral_genes: bool = pr.umap_ignore_lateral_genes,
similarity_value_regularization: float = pr.umap_similarity_value_regularization,
similarity_log_data: bool = pr.umap_similarity_log_data,
similarity_method: str = pr.umap_similarity_method,
logistics_location: float = pr.logistics_location,
logistics_slope: float = pr.logistics_slope,
k: int,
balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
incoming_degree_factor: float = pr.knn_incoming_degree_factor,
outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
min_outgoing_degree: int = pr.markers_knn_min_outgoing_degree,
reproducible: bool,
) -> ut.PandasFrame:
"""
Compute KNN graph between metacells based on marker genes.
If ``reproducible`` is ``True``, a slower (still parallel) but
reproducible algorithm will be used to compute Pearson correlations.
**Input**
Annotated ``adata`` where each observation is a metacells and the variables are genes, are genes, where ``what`` is
a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
matrix. Should contain a ``marker_gene`` mask unless explicitly specifying the marker genes.
**Returns**
Sets the following in ``adata``:
Observations-Pair (Metacells) Annotations
``obs_outgoing_weights``
A sparse square matrix where each non-zero entry is the weight of an edge between a pair
of cells or genes, where the sum of the weights of the outgoing edges for each element
is 1 (there is always at least one such edge).
Also return a pandas data frame of the similarities between the observations (metacells).
**Computation Parameters**
1. If ``marker_gene_names`` and/or ``marker_gene_patterns`` were specified, use the matching genes.
Otherwise, use the ``marker_gene`` mask.
2. If ``ignore_lateral_genes`` (default: {ignore_lateral_genes}), then remove any genes marked as lateral
from the mask.
3. If ``max_marker_genes`` (default: {max_marker_genes}) is not ``None``, then pick this number
of marker genes with the highest variance.
4. Compute the fractions of each ``marker_gene`` in each cell, and add the
``similarity_value_regularization`` (default: {similarity_value_regularization}) to it.
5. If ``similarity_log_data`` (default: {similarity_log_data}), invoke the
:py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of
the data.
6. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` using
``similarity_method`` (default: {similarity_method}), ``logistics_location`` (default:
{logistics_slope}) and ``logistics_slope`` (default: {logistics_slope}) and convert this
to distances.
7. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` using the distances,
``k`` (no default!), ``balanced_ranks_factor`` (default: {balanced_ranks_factor}),
``incoming_degree_factor`` (default: {incoming_degree_factor}), ``outgoing_degree_factor``
(default: {outgoing_degree_factor}) and ``min_outgoing_degree`` (default: {min_outgoing_degree})
to compute a "skeleton" graph to overlay on top of the UMAP graph.
"""
assert max_marker_genes is None or max_marker_genes > 0
if marker_gene_names is None and marker_gene_patterns is None:
marker_genes_mask = ut.get_v_numpy(adata, "marker_gene")
else:
marker_genes_series = tl.find_named_genes(adata, names=marker_gene_names, patterns=marker_gene_patterns)
assert marker_genes_series is not None
marker_genes_mask = ut.to_numpy_vector(marker_genes_series)
if ignore_lateral_genes:
lateral_genes_mask = ut.get_v_numpy(adata, "lateral_gene")
marker_genes_mask = marker_genes_mask & ~lateral_genes_mask
ut.log_calc("marker_genes_mask", marker_genes_mask)
all_fractions = ut.get_vo_proper(adata, what, layout="row_major")
index_per_marker_gene = np.where(marker_genes_mask)[0]
fraction_per_metacell_per_marker_gene = ut.to_numpy_matrix(all_fractions[:, index_per_marker_gene])
if max_marker_genes is not None and max_marker_genes < len(index_per_marker_gene):
fraction_per_metacell_per_marker_gene = ut.to_layout(
fraction_per_metacell_per_marker_gene, layout="column_major"
)
variance_per_marker_gene = ut.variance_per(fraction_per_metacell_per_marker_gene, per="column")
variance_per_marker_gene *= -1
chosen_positions = np.argpartition(variance_per_marker_gene, max_marker_genes)[:max_marker_genes]
ut.log_calc("chosen_positions", chosen_positions)
index_per_marker_gene = index_per_marker_gene[chosen_positions]
fraction_per_metacell_per_marker_gene = fraction_per_metacell_per_marker_gene[:, chosen_positions]
fraction_per_metacell_per_marker_gene = ut.to_layout(fraction_per_metacell_per_marker_gene, layout="row_major")
fraction_per_metacell_per_marker_gene += similarity_value_regularization
if similarity_log_data:
fraction_per_metacell_per_marker_gene = ut.log_data(fraction_per_metacell_per_marker_gene, base=2)
tdata = ut.slice(adata, vars=index_per_marker_gene)
similarities = tl.compute_obs_obs_similarity(
tdata,
fraction_per_metacell_per_marker_gene,
method=similarity_method,
reproducible=reproducible,
logistics_location=logistics_location,
logistics_slope=logistics_slope,
inplace=False,
)
assert similarities is not None
tl.compute_obs_obs_knn_graph(
adata,
similarities,
k=k,
balanced_ranks_factor=balanced_ranks_factor,
incoming_degree_factor=incoming_degree_factor,
outgoing_degree_factor=outgoing_degree_factor,
min_outgoing_degree=min_outgoing_degree,
)
return similarities
[docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def compute_umap_by_markers(
adata: AnnData,
what: Union[str, ut.Matrix] = "__x__",
*,
marker_gene_names: Optional[Collection[str]] = None,
marker_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
max_marker_genes: Optional[int] = pr.umap_max_marker_genes,
ignore_lateral_genes: bool = pr.umap_ignore_lateral_genes,
similarity_value_regularization: float = pr.umap_similarity_value_regularization,
similarity_log_data: bool = pr.umap_similarity_log_data,
similarity_method: str = pr.umap_similarity_method,
logistics_location: float = pr.logistics_location,
logistics_slope: float = pr.logistics_slope,
skeleton_k: int = pr.skeleton_k,
balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
incoming_degree_factor: float = pr.knn_incoming_degree_factor,
outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
min_outgoing_degree: int = pr.markers_knn_min_outgoing_degree,
umap_k: int = pr.umap_k,
dimensions: int = 2,
min_dist: float = pr.umap_min_dist,
spread: float = pr.umap_spread,
random_seed: int,
) -> None:
"""
Compute a UMAP projection of the (meta)cells.
**Input**
Annotated ``adata`` where each observation is a metacells and the variables are genes, are genes, where ``what`` is
a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
matrix. Should contain a ``marker_gene`` mask unless explicitly specifying the marker genes.
**Returns**
Sets the following annotations in ``adata``:
Observation-Observation (Metacell-Metacell) Annotations
``umap_distances``
A sparse symmetric matrix of the graph of distances between the metacells.
Observation (Metacell) Annotations
``x``, ``y`` (if ``dimensions`` is 2)
The X and Y coordinates of each metacell in the UMAP projection.
``u``, ``v``, ``w`` (if ``dimensions`` is 3)
The U, V, W coordinates of each metacell in the UMAP projection.
**Computation Parameters**
1. Invoke :py:func:`metacells.pipeline.umap.compute_knn_by_markers` using
``marker_gene_names`` (default: {marker_gene_names}), ``marker_gene_patterns`` (default: {marker_gene_patterns}),
``max_marker_genes`` (default: {max_marker_genes}), ``ignore_lateral_genes`` (default: {ignore_lateral_genes}),
``similarity_value_regularization`` (default: {similarity_value_regularization}), ``similarity_log_data``
(default: {similarity_log_data}), ``similarity_method`` (default: {similarity_method}), ``logistics_location``
(default: {logistics_location}), ``logistics_slope`` (default: {logistics_slope}), ``skeleton_k`` (default:
{skeleton_k}), ``balanced_ranks_factor`` (default: {balanced_ranks_factor}), ``incoming_degree_factor`` (default:
{incoming_degree_factor}) and ``outgoing_degree_factor`` (default: {outgoing_degree_factor}) and
``min_outgoing_degree`` (default: {min_outgoing_degree}) to compute a "skeleton" graph to overlay on top of the
UMAP graph. Specify a non-zero ``random_seed`` to make this reproducible.
2. Invoke :py:func:`metacells.tools.layout.umap_by_distances` using the distances, ``umap_k``
(default: {umap_k}), ``min_dist`` (default: {min_dist}), ``spread`` (default: {spread}),
dimensions (default: {dimensions}) and ``random_seed``.
.. note::
Keep in mind that the KNN graph used by UMAP (controlled by ``umap_k``) is *not* identical to the KNN graph we
compute (controlled by ``skeleton_k``). By default, we choose ``skeleton_k < umap_k``, as the purpose of the
skeleton KNN is to highlight the "strong" structure of the data; in practice this strong skeleton is highly
compatible with the structure used by UMAP, so it serves it purpose reasonably well. It would have been nice to
make these compatible, but UMAP is not friendly towards dictating a KNN graph from the outside.
"""
similarities = compute_knn_by_markers(
adata,
what,
marker_gene_names=marker_gene_names,
marker_gene_patterns=marker_gene_patterns,
max_marker_genes=max_marker_genes,
ignore_lateral_genes=ignore_lateral_genes,
similarity_value_regularization=similarity_value_regularization,
similarity_log_data=similarity_log_data,
similarity_method=similarity_method,
logistics_location=logistics_location,
logistics_slope=logistics_slope,
k=skeleton_k,
balanced_ranks_factor=balanced_ranks_factor,
incoming_degree_factor=incoming_degree_factor,
outgoing_degree_factor=outgoing_degree_factor,
min_outgoing_degree=min_outgoing_degree,
reproducible=(random_seed != 0),
)
distances = ut.to_numpy_matrix(similarities)
distances *= -1
distances += 1
np.fill_diagonal(distances, 0.0)
distances = sparse.csr_matrix(distances)
ut.set_oo_data(adata, "umap_distances", distances)
tl.umap_by_distances(
adata, distances, k=umap_k, dimensions=dimensions, min_dist=min_dist, spread=spread, random_seed=random_seed
)
def _build_igraph(edge_weights: ut.Matrix) -> Tuple[ig.Graph, ut.NumpyVector]:
edge_weights = ut.to_proper_matrix(edge_weights)
assert edge_weights.shape[0] == edge_weights.shape[1]
size = edge_weights.shape[0]
sources, targets = edge_weights.nonzero()
weights_array = ut.to_numpy_vector(edge_weights[sources, targets]).astype("float64")
graph = ig.Graph(directed=True)
graph.add_vertices(size)
graph.add_edges(list(zip(sources, targets)))
graph.es["weight"] = weights_array
return graph, weights_array