Source code for metacells.pipeline.direct
"""
Direct
------
"""
from typing import Optional
from typing import Union
import numpy as np
from anndata import AnnData # type: ignore
import metacells.parameters as pr
import metacells.tools as tl
import metacells.utilities as ut
from .select import extract_selected_data
__all__ = [
"compute_direct_metacells",
]
[docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def compute_direct_metacells(
adata: AnnData,
what: Union[str, ut.Matrix] = "__x__",
*,
select_downsample_min_samples: int = pr.select_downsample_min_samples,
select_downsample_min_cell_quantile: float = pr.select_downsample_min_cell_quantile,
select_downsample_max_cell_quantile: float = pr.select_downsample_max_cell_quantile,
select_min_gene_total: Optional[int] = pr.select_min_gene_total,
select_min_gene_top3: Optional[int] = pr.select_min_gene_top3,
select_min_gene_relative_variance: Optional[float] = pr.select_min_gene_relative_variance,
select_min_genes: int = pr.select_min_genes,
cells_similarity_value_regularization: float = pr.cells_similarity_value_regularization,
cells_similarity_log_data: bool = pr.cells_similarity_log_data,
cells_similarity_method: str = pr.cells_similarity_method,
target_metacell_size: int = pr.target_metacell_size,
min_metacell_size: int = pr.min_metacell_size,
target_metacell_umis: int = pr.target_metacell_umis,
cell_umis: Optional[ut.NumpyVector] = pr.cell_umis,
knn_k: Optional[int] = pr.knn_k,
knn_k_size_factor: float = pr.candidates_knn_k_size_factor,
knn_k_umis_quantile: float = pr.knn_k_umis_quantile,
min_knn_k: Optional[int] = pr.min_knn_k,
knn_balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
knn_incoming_degree_factor: float = pr.knn_incoming_degree_factor,
knn_outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
knn_min_outgoing_degree: int = pr.knn_min_outgoing_degree,
min_seed_size_quantile: float = pr.min_seed_size_quantile,
max_seed_size_quantile: float = pr.max_seed_size_quantile,
candidates_cooldown_pass: float = pr.cooldown_pass,
candidates_cooldown_node: float = pr.cooldown_node,
candidates_cooldown_phase: float = pr.cooldown_phase,
candidates_min_split_size_factor: float = pr.candidates_min_split_size_factor,
candidates_max_merge_size_factor: float = pr.candidates_max_merge_size_factor,
candidates_max_split_min_cut_strength: float = pr.max_split_min_cut_strength,
candidates_min_cut_seed_cells: int = pr.min_cut_seed_cells,
must_complete_cover: bool = False,
deviants_policy: str = pr.deviants_policy,
deviants_min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
deviants_gap_skip_cells: int = pr.deviants_gap_skip_cells,
deviants_min_noisy_gene_fold_factor: float = pr.deviants_min_noisy_gene_fold_factor,
deviants_max_gene_fraction: float = pr.deviants_max_gene_fraction,
deviants_max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
deviants_max_gap_cells_count: int = pr.deviants_max_gap_cells_count,
deviants_max_gap_cells_fraction: float = pr.deviants_max_gap_cells_fraction,
dissolve_min_robust_size_factor: float = pr.dissolve_min_robust_size_factor,
dissolve_min_convincing_gene_fold_factor: Optional[float] = pr.dissolve_min_convincing_gene_fold_factor,
random_seed: int,
) -> None:
"""
Directly compute metacells using ``what`` (default: {what}) data.
This directly computes the metacells on the whole data. Like any method that directly looks at
the whole data at once, the amount of CPU and memory needed becomes unreasonable when the data
size grows. Above O(10,000) you are much better off using the divide-and-conquer method.
.. note::
The current implementation is naive in that it computes the full dense N^2 correlation
matrix, and only then extracts the sparse graph out of it. We actually need two copies where
each requires 4 bytes per entry, so for O(100,000) cells, we have storage of
O(100,000,000,000). In addition, the implementation is serial for the graph clustering
phases.
It is possible to mitigate this by fusing the correlations phase and the graph generation
phase, parallelizing the result, and also (somehow) parallelizing the graph clustering
phase. This might increase the "reasonable" size for the direct approach to O(100,000).
We have decided not to invest in this direction since it won't allow us to push the size to
O(1,000,000) and above. Instead we provide the divide-and-conquer method, which easily
scales to O(1,000,000) on a single multi-core server, and to "unlimited" size if we further
enhance the implementation to use a distributed compute cluster of such servers.
.. todo::
Should :py:func:`compute_direct_metacells` avoid computing the graph and partition it for a
very small number of cells?
**Input**
The presumably "clean" annotated ``adata``, where the observations are cells and the variables
are genes, where ``what`` is a per-variable-per-observation matrix or the name of a
per-variable-per-observation annotation containing such a matrix.
**Returns**
Sets the following annotations in ``adata``:
Unstructured Annotations
``downsample_samples``
The target total number of samples in each downsampled cell.
Observation-Variable (Cell-Gene) Annotations:
``downsampled``
The downsampled data where the total number of samples in each cell is at most ``downsample_samples``.
Variable (Gene) Annotations
``high_total_gene``
A boolean mask of genes with "high" expression level (unless a ``select_gene`` mask exists).
``high_relative_variance_gene``
A boolean mask of genes with "high" normalized variance, relative to other genes with a similar expression
level (unless a ``select_gene`` mask exists).
``selected_gene``
A boolean mask of the actually selected genes.
Observation (Cell) Annotations
``metacell``
The integer index of the metacell each cell belongs to. The metacells are in no
particular order. Cells with no metacell assignment ("outliers") are given a metacell
index of ``-1``.
**Computation Parameters**
0. If ``cell_umis`` is not specified, use the sum of the ``what`` data for each cell.
1. Invoke :py:func:`metacells.pipeline.select.extract_selected_data` to extract "select" data
from the clean data, using the
``select_downsample_min_samples`` (default: {select_downsample_min_samples}),
``select_downsample_min_cell_quantile`` (default: {select_downsample_min_cell_quantile}),
``select_downsample_max_cell_quantile`` (default: {select_downsample_max_cell_quantile}),
``select_min_gene_total`` (default: {select_min_gene_total}),
``select_min_gene_top3`` (default: {select_min_gene_top3}),
``select_min_gene_relative_variance`` (default: {select_min_gene_relative_variance}) and
``random_seed``.
2. Compute the fractions of each variable in each cell, and add the
``cells_similarity_value_regularization`` (default: {cells_similarity_value_regularization}) to
it.
3. If ``cells_similarity_log_data`` (default: {cells_similarity_log_data}), invoke the
:py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of
the data.
4. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` to compute the
similarity between each pair of cells, using the
``cells_similarity_method`` (default: {cells_similarity_method}).
5. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` to compute a K-Nearest-Neighbors graph,
using the ``knn_balanced_ranks_factor`` (default: {knn_balanced_ranks_factor}), ``knn_incoming_degree_factor``
(default: {knn_incoming_degree_factor}) ``knn_outgoing_degree_factor`` (default: {knn_outgoing_degree_factor})
and ``knn_min_outgoing_degree`` (default: {knn_min_outgoing_degree})(. If ``knn_k`` (default: {knn_k}) is not
specified, then it is chosen to be the ``knn_k_size_factor`` (default: {knn_k_size_factor} times the median
number of cells required to reach the target metacell size, or the ``knn_k_umis_quantile`` (default:
{knn_k_umis_quantile}) the number of cells required, or ``min_knn_k`` (default: {min_knn_k}), whichever is
largest.
6. Invoke :py:func:`metacells.tools.candidates.compute_candidate_metacells` to compute
the candidate metacells, using the
``min_seed_size_quantile`` (default: {min_seed_size_quantile}),
``max_seed_size_quantile`` (default: {max_seed_size_quantile}),
``candidates_cooldown_pass`` (default: {candidates_cooldown_pass}),
``candidates_cooldown_node`` (default: {candidates_cooldown_node}),
``candidates_cooldown_phase`` (default: {candidates_cooldown_phase}),
``candidates_min_split_size_factor`` (default: {candidates_min_split_size_factor}),
``candidates_max_merge_size_factor`` (default: {candidates_max_merge_size_factor}),
and
``random_seed``. This tries to build metacells of the
``target_metacell_size`` (default: {target_metacell_size}),
``min_metacell_size`` (default: {min_metacell_size}), and
``target_metacell_umis`` (default: {target_metacell_umis})
using the ``cell_umis`` (default: {cell_umis}).
7. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
:py:func:`metacells.tools.deviants.find_deviant_cells` to remove deviants from the candidate
metacells, using the
``deviants_policy`` (default: {deviants_policy}),
``deviants_min_gene_fold_factor`` (default: {deviants_min_gene_fold_factor}),
``deviants_gap_skip_cells`` (default: {deviants_gap_skip_cells}),
``deviants_min_noisy_gene_fold_factor`` (default: {deviants_min_noisy_gene_fold_factor}),
``deviants_max_gene_fraction`` (default: {deviants_max_gene_fraction})
and
``deviants_max_cell_fraction`` (default: {deviants_max_cell_fraction}).
9. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
:py:func:`metacells.tools.dissolve.dissolve_metacells` to dissolve small unconvincing
metacells, using the same
``target_metacell_size`` (default: {target_metacell_size}),
``min_metacell_size`` (default: {min_metacell_size}),
``target_metacell_umis`` (default: {target_metacell_umis}),
``cell_umis`` (default: {cell_umis})
and the
``dissolve_min_robust_size_factor`` (default: {dissolve_min_robust_size_factor}).
"""
assert (
target_metacell_size < 1000
), f"target_metacell_size: {target_metacell_size} seems to be in UMIs, should be in cells"
if cell_umis is None:
cell_umis = ut.get_o_numpy(adata, what, sum=True, formatter=ut.sizes_description).astype("float32")
else:
assert cell_umis.dtype == "float32"
assert isinstance(cell_umis, ut.NumpyVector)
if (
adata.n_obs <= 2 * min(min_metacell_size, target_metacell_size * candidates_max_merge_size_factor)
or np.sum(cell_umis) <= 2 * target_metacell_umis * candidates_max_merge_size_factor
):
candidate_of_cells = np.zeros(adata.n_obs, dtype="int32")
ut.set_v_data(adata, "selected_gene", np.zeros(adata.n_vars, dtype="bool"))
else:
sdata = extract_selected_data(
adata,
what,
top_level=False,
downsample_min_samples=select_downsample_min_samples,
downsample_min_cell_quantile=select_downsample_min_cell_quantile,
downsample_max_cell_quantile=select_downsample_max_cell_quantile,
min_gene_relative_variance=select_min_gene_relative_variance,
min_gene_total=select_min_gene_total,
min_gene_top3=select_min_gene_top3,
min_genes=select_min_genes,
random_seed=random_seed,
)
data = ut.get_vo_proper(sdata, "downsampled", layout="row_major")
data = ut.to_numpy_matrix(data, copy=True)
if cells_similarity_value_regularization > 0:
data += cells_similarity_value_regularization
if cells_similarity_log_data:
data = ut.log_data(data, base=2)
if knn_k is None:
knn_k_by_size = int(round(adata.n_obs / target_metacell_size))
ut.log_calc("knn_k by size", knn_k_by_size)
median_cell_umis = float(np.median(cell_umis))
ut.log_calc("median_cell_umis", median_cell_umis)
knn_k_of_median = int(round(target_metacell_umis / median_cell_umis))
ut.log_calc("knn_k of median_cell_umis", knn_k_of_median)
knn_k_by_median = int(round(knn_k_size_factor * target_metacell_umis / median_cell_umis))
ut.log_calc("knn_k by median_cell_umis", knn_k_by_median)
quantile_cell_umis = np.quantile(cell_umis, knn_k_umis_quantile)
ut.log_calc("quantile_cell_umis", quantile_cell_umis)
knn_k_by_quantile = int(round(target_metacell_umis / quantile_cell_umis))
ut.log_calc("knn_k by quantile_cell_umis", knn_k_by_quantile)
knn_k = max(knn_k_by_size, knn_k_by_median, knn_k_by_quantile, min_knn_k or 0)
if knn_k == 0:
ut.log_calc("knn_k: 0 (too small, trying a single metacell)")
ut.set_o_data(sdata, "candidate", np.full(sdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0")
elif knn_k_of_median >= sdata.n_obs:
ut.log_calc(f"knn_k of median: {knn_k_of_median} (too large, trying a single metacell)")
ut.set_o_data(sdata, "candidate", np.full(sdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0")
else:
ut.log_calc("knn_k", knn_k)
tl.compute_obs_obs_similarity(sdata, data, method=cells_similarity_method, reproducible=random_seed != 0)
tl.compute_obs_obs_knn_graph(
sdata,
k=knn_k,
balanced_ranks_factor=knn_balanced_ranks_factor,
incoming_degree_factor=knn_incoming_degree_factor,
outgoing_degree_factor=knn_outgoing_degree_factor,
min_outgoing_degree=knn_min_outgoing_degree,
)
tl.compute_candidate_metacells(
sdata,
target_metacell_size=target_metacell_size,
min_metacell_size=min_metacell_size,
target_metacell_umis=target_metacell_umis,
cell_umis=cell_umis,
min_seed_size_quantile=min_seed_size_quantile,
max_seed_size_quantile=max_seed_size_quantile,
cooldown_pass=candidates_cooldown_pass,
cooldown_node=candidates_cooldown_node,
cooldown_phase=candidates_cooldown_phase,
min_split_size_factor=candidates_min_split_size_factor,
max_merge_size_factor=candidates_max_merge_size_factor,
max_split_min_cut_strength=candidates_max_split_min_cut_strength,
min_cut_seed_cells=candidates_min_cut_seed_cells,
must_complete_cover=must_complete_cover,
random_seed=random_seed,
)
candidate_of_cells = ut.get_o_numpy(sdata, "candidate", formatter=ut.groups_description)
if must_complete_cover:
assert np.min(candidate_of_cells) == 0
ut.set_o_data(adata, "metacell", candidate_of_cells, formatter=ut.groups_description)
else:
deviants = tl.find_deviant_cells(
adata,
candidates=candidate_of_cells,
policy=deviants_policy,
min_gene_fold_factor=deviants_min_gene_fold_factor,
gap_skip_cells=deviants_gap_skip_cells,
min_noisy_gene_fold_factor=deviants_min_noisy_gene_fold_factor,
max_gene_fraction=deviants_max_gene_fraction,
max_cell_fraction=deviants_max_cell_fraction,
max_gap_cells_count=deviants_max_gap_cells_count,
max_gap_cells_fraction=deviants_max_gap_cells_fraction,
)
tl.dissolve_metacells(
adata,
candidates=candidate_of_cells,
deviants=deviants,
target_metacell_size=target_metacell_size,
min_metacell_size=min_metacell_size,
target_metacell_umis=target_metacell_umis,
cell_umis=cell_umis,
min_robust_size_factor=dissolve_min_robust_size_factor,
min_convincing_gene_fold_factor=dissolve_min_convincing_gene_fold_factor,
)