Source code for metacells.pipeline.direct

"""
Direct
------
"""

from typing import Optional
from typing import Union

import numpy as np
from anndata import AnnData  # type: ignore

import metacells.parameters as pr
import metacells.tools as tl
import metacells.utilities as ut

from .select import extract_selected_data

__all__ = [
    "compute_direct_metacells",
]


@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def compute_direct_metacells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    select_downsample_min_samples: int = pr.select_downsample_min_samples,
    select_downsample_min_cell_quantile: float = pr.select_downsample_min_cell_quantile,
    select_downsample_max_cell_quantile: float = pr.select_downsample_max_cell_quantile,
    select_min_gene_total: Optional[int] = pr.select_min_gene_total,
    select_min_gene_top3: Optional[int] = pr.select_min_gene_top3,
    select_min_gene_relative_variance: Optional[float] = pr.select_min_gene_relative_variance,
    select_min_genes: int = pr.select_min_genes,
    cells_similarity_value_regularization: float = pr.cells_similarity_value_regularization,
    cells_similarity_log_data: bool = pr.cells_similarity_log_data,
    cells_similarity_method: str = pr.cells_similarity_method,
    target_metacell_size: int = pr.target_metacell_size,
    min_metacell_size: int = pr.min_metacell_size,
    target_metacell_umis: int = pr.target_metacell_umis,
    cell_umis: Optional[ut.NumpyVector] = pr.cell_umis,
    knn_k: Optional[int] = pr.knn_k,
    knn_k_size_factor: float = pr.candidates_knn_k_size_factor,
    knn_k_umis_quantile: float = pr.knn_k_umis_quantile,
    min_knn_k: Optional[int] = pr.min_knn_k,
    knn_balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    knn_incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    knn_outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    knn_min_outgoing_degree: int = pr.knn_min_outgoing_degree,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    candidates_cooldown_pass: float = pr.cooldown_pass,
    candidates_cooldown_node: float = pr.cooldown_node,
    candidates_cooldown_phase: float = pr.cooldown_phase,
    candidates_min_split_size_factor: float = pr.candidates_min_split_size_factor,
    candidates_max_merge_size_factor: float = pr.candidates_max_merge_size_factor,
    candidates_max_split_min_cut_strength: float = pr.max_split_min_cut_strength,
    candidates_min_cut_seed_cells: int = pr.min_cut_seed_cells,
    must_complete_cover: bool = False,
    deviants_policy: str = pr.deviants_policy,
    deviants_min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
    deviants_gap_skip_cells: int = pr.deviants_gap_skip_cells,
    deviants_min_noisy_gene_fold_factor: float = pr.deviants_min_noisy_gene_fold_factor,
    deviants_max_gene_fraction: float = pr.deviants_max_gene_fraction,
    deviants_max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
    deviants_max_gap_cells_count: int = pr.deviants_max_gap_cells_count,
    deviants_max_gap_cells_fraction: float = pr.deviants_max_gap_cells_fraction,
    dissolve_min_robust_size_factor: float = pr.dissolve_min_robust_size_factor,
    dissolve_min_convincing_gene_fold_factor: Optional[float] = pr.dissolve_min_convincing_gene_fold_factor,
    random_seed: int,
) -> None:
    """
    Directly compute metacells using ``what`` (default: {what}) data.

    This directly computes the metacells on the whole data. Like any method that directly looks at the whole
    data at once, the amount of CPU and memory needed becomes unreasonable when the data size grows. Above
    O(10,000) cells you are much better off using the divide-and-conquer method.

    .. note::

        The current implementation is naive in that it computes the full dense N^2 correlation matrix, and only
        then extracts the sparse graph out of it. We actually need two copies where each requires 4 bytes per
        entry, so for O(100,000) cells, we have storage of O(100,000,000,000) bytes.
        In addition, the implementation is serial for the graph clustering phases. It is possible to mitigate
        this by fusing the correlations phase and the graph generation phase, parallelizing the result, and also
        (somehow) parallelizing the graph clustering phase. This might increase the "reasonable" size for the
        direct approach to O(100,000). We have decided not to invest in this direction since it won't allow us
        to push the size to O(1,000,000) and above. Instead we provide the divide-and-conquer method, which
        easily scales to O(1,000,000) on a single multi-core server, and to "unlimited" size if we further
        enhance the implementation to use a distributed compute cluster of such servers.

    .. todo::

        Should :py:func:`compute_direct_metacells` avoid computing the graph and partition it for a very small
        number of cells?

    **Input**

    The presumably "clean" annotated ``adata``, where the observations are cells and the variables are genes,
    where ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Sets the following annotations in ``adata``:

    Unstructured Annotations
        ``downsample_samples``
            The target total number of samples in each downsampled cell.

    Observation-Variable (Cell-Gene) Annotations
        ``downsampled``
            The downsampled data where the total number of samples in each cell is at most
            ``downsample_samples``.

    Variable (Gene) Annotations
        ``high_total_gene``
            A boolean mask of genes with "high" expression level (unless a ``select_gene`` mask exists).

        ``high_relative_variance_gene``
            A boolean mask of genes with "high" normalized variance, relative to other genes with a similar
            expression level (unless a ``select_gene`` mask exists).

        ``selected_gene``
            A boolean mask of the actually selected genes.

    Observation (Cell) Annotations
        ``metacell``
            The integer index of the metacell each cell belongs to. The metacells are in no particular order.
            Cells with no metacell assignment ("outliers") are given a metacell index of ``-1``.

    **Computation Parameters**

    0. If ``cell_umis`` is not specified, use the sum of the ``what`` data for each cell.

    1. Invoke :py:func:`metacells.pipeline.select.extract_selected_data` to extract "select" data from the clean
       data, using the ``select_downsample_min_samples`` (default: {select_downsample_min_samples}),
       ``select_downsample_min_cell_quantile`` (default: {select_downsample_min_cell_quantile}),
       ``select_downsample_max_cell_quantile`` (default: {select_downsample_max_cell_quantile}),
       ``select_min_gene_total`` (default: {select_min_gene_total}), ``select_min_gene_top3`` (default:
       {select_min_gene_top3}), ``select_min_gene_relative_variance`` (default:
       {select_min_gene_relative_variance}) and ``random_seed``.

    2. Compute the fractions of each variable in each cell, and add the
       ``cells_similarity_value_regularization`` (default: {cells_similarity_value_regularization}) to it.

    3. If ``cells_similarity_log_data`` (default: {cells_similarity_log_data}), invoke the
       :py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of the data.

    4. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` to compute the similarity between
       each pair of cells, using the ``cells_similarity_method`` (default: {cells_similarity_method}).

    5. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` to compute a K-Nearest-Neighbors
       graph, using the ``knn_balanced_ranks_factor`` (default: {knn_balanced_ranks_factor}),
       ``knn_incoming_degree_factor`` (default: {knn_incoming_degree_factor}), ``knn_outgoing_degree_factor``
       (default: {knn_outgoing_degree_factor}) and ``knn_min_outgoing_degree`` (default:
       {knn_min_outgoing_degree}). If ``knn_k`` (default: {knn_k}) is not specified, then it is chosen to be the
       ``knn_k_size_factor`` (default: {knn_k_size_factor}) times the median number of cells required to reach
       the target metacell size, or the number of cells required at the ``knn_k_umis_quantile`` (default:
       {knn_k_umis_quantile}) of the cell UMIs, or ``min_knn_k`` (default: {min_knn_k}), whichever is largest.

    6. Invoke :py:func:`metacells.tools.candidates.compute_candidate_metacells` to compute the candidate
       metacells, using the ``min_seed_size_quantile`` (default: {min_seed_size_quantile}),
       ``max_seed_size_quantile`` (default: {max_seed_size_quantile}), ``candidates_cooldown_pass`` (default:
       {candidates_cooldown_pass}), ``candidates_cooldown_node`` (default: {candidates_cooldown_node}),
       ``candidates_cooldown_phase`` (default: {candidates_cooldown_phase}),
       ``candidates_min_split_size_factor`` (default: {candidates_min_split_size_factor}),
       ``candidates_max_merge_size_factor`` (default: {candidates_max_merge_size_factor}), and ``random_seed``.
       This tries to build metacells of the ``target_metacell_size`` (default: {target_metacell_size}),
       ``min_metacell_size`` (default: {min_metacell_size}), and ``target_metacell_umis`` (default:
       {target_metacell_umis}) using the ``cell_umis`` (default: {cell_umis}).

    7. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
       :py:func:`metacells.tools.deviants.find_deviant_cells` to remove deviants from the candidate metacells,
       using the ``deviants_policy`` (default: {deviants_policy}), ``deviants_min_gene_fold_factor`` (default:
       {deviants_min_gene_fold_factor}), ``deviants_gap_skip_cells`` (default: {deviants_gap_skip_cells}),
       ``deviants_min_noisy_gene_fold_factor`` (default: {deviants_min_noisy_gene_fold_factor}),
       ``deviants_max_gene_fraction`` (default: {deviants_max_gene_fraction}) and
       ``deviants_max_cell_fraction`` (default: {deviants_max_cell_fraction}).

    8. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
       :py:func:`metacells.tools.dissolve.dissolve_metacells` to dissolve small unconvincing metacells, using
       the same ``target_metacell_size`` (default: {target_metacell_size}), ``min_metacell_size`` (default:
       {min_metacell_size}), ``target_metacell_umis`` (default: {target_metacell_umis}), ``cell_umis`` (default:
       {cell_umis}) and the ``dissolve_min_robust_size_factor`` (default: {dissolve_min_robust_size_factor}).
""" assert ( target_metacell_size < 1000 ), f"target_metacell_size: {target_metacell_size} seems to be in UMIs, should be in cells" if cell_umis is None: cell_umis = ut.get_o_numpy(adata, what, sum=True, formatter=ut.sizes_description).astype("float32") else: assert cell_umis.dtype == "float32" assert isinstance(cell_umis, ut.NumpyVector) if ( adata.n_obs <= 2 * min(min_metacell_size, target_metacell_size * candidates_max_merge_size_factor) or np.sum(cell_umis) <= 2 * target_metacell_umis * candidates_max_merge_size_factor ): candidate_of_cells = np.zeros(adata.n_obs, dtype="int32") ut.set_v_data(adata, "selected_gene", np.zeros(adata.n_vars, dtype="bool")) else: sdata = extract_selected_data( adata, what, top_level=False, downsample_min_samples=select_downsample_min_samples, downsample_min_cell_quantile=select_downsample_min_cell_quantile, downsample_max_cell_quantile=select_downsample_max_cell_quantile, min_gene_relative_variance=select_min_gene_relative_variance, min_gene_total=select_min_gene_total, min_gene_top3=select_min_gene_top3, min_genes=select_min_genes, random_seed=random_seed, ) data = ut.get_vo_proper(sdata, "downsampled", layout="row_major") data = ut.to_numpy_matrix(data, copy=True) if cells_similarity_value_regularization > 0: data += cells_similarity_value_regularization if cells_similarity_log_data: data = ut.log_data(data, base=2) if knn_k is None: knn_k_by_size = int(round(adata.n_obs / target_metacell_size)) ut.log_calc("knn_k by size", knn_k_by_size) median_cell_umis = float(np.median(cell_umis)) ut.log_calc("median_cell_umis", median_cell_umis) knn_k_of_median = int(round(target_metacell_umis / median_cell_umis)) ut.log_calc("knn_k of median_cell_umis", knn_k_of_median) knn_k_by_median = int(round(knn_k_size_factor * target_metacell_umis / median_cell_umis)) ut.log_calc("knn_k by median_cell_umis", knn_k_by_median) quantile_cell_umis = np.quantile(cell_umis, knn_k_umis_quantile) ut.log_calc("quantile_cell_umis", quantile_cell_umis) knn_k_by_quantile = int(round(target_metacell_umis / quantile_cell_umis)) ut.log_calc("knn_k by quantile_cell_umis", knn_k_by_quantile) knn_k = max(knn_k_by_size, knn_k_by_median, knn_k_by_quantile, min_knn_k or 0) if knn_k == 0: ut.log_calc("knn_k: 0 (too small, trying a single metacell)") ut.set_o_data(sdata, "candidate", np.full(sdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0") elif knn_k_of_median >= sdata.n_obs: ut.log_calc(f"knn_k of median: {knn_k_of_median} (too large, trying a single metacell)") ut.set_o_data(sdata, "candidate", np.full(sdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0") else: ut.log_calc("knn_k", knn_k) tl.compute_obs_obs_similarity(sdata, data, method=cells_similarity_method, reproducible=random_seed != 0) tl.compute_obs_obs_knn_graph( sdata, k=knn_k, balanced_ranks_factor=knn_balanced_ranks_factor, incoming_degree_factor=knn_incoming_degree_factor, outgoing_degree_factor=knn_outgoing_degree_factor, min_outgoing_degree=knn_min_outgoing_degree, ) tl.compute_candidate_metacells( sdata, target_metacell_size=target_metacell_size, min_metacell_size=min_metacell_size, target_metacell_umis=target_metacell_umis, cell_umis=cell_umis, min_seed_size_quantile=min_seed_size_quantile, max_seed_size_quantile=max_seed_size_quantile, cooldown_pass=candidates_cooldown_pass, cooldown_node=candidates_cooldown_node, cooldown_phase=candidates_cooldown_phase, min_split_size_factor=candidates_min_split_size_factor, max_merge_size_factor=candidates_max_merge_size_factor, 
max_split_min_cut_strength=candidates_max_split_min_cut_strength, min_cut_seed_cells=candidates_min_cut_seed_cells, must_complete_cover=must_complete_cover, random_seed=random_seed, ) candidate_of_cells = ut.get_o_numpy(sdata, "candidate", formatter=ut.groups_description) if must_complete_cover: assert np.min(candidate_of_cells) == 0 ut.set_o_data(adata, "metacell", candidate_of_cells, formatter=ut.groups_description) else: deviants = tl.find_deviant_cells( adata, candidates=candidate_of_cells, policy=deviants_policy, min_gene_fold_factor=deviants_min_gene_fold_factor, gap_skip_cells=deviants_gap_skip_cells, min_noisy_gene_fold_factor=deviants_min_noisy_gene_fold_factor, max_gene_fraction=deviants_max_gene_fraction, max_cell_fraction=deviants_max_cell_fraction, max_gap_cells_count=deviants_max_gap_cells_count, max_gap_cells_fraction=deviants_max_gap_cells_fraction, ) tl.dissolve_metacells( adata, candidates=candidate_of_cells, deviants=deviants, target_metacell_size=target_metacell_size, min_metacell_size=min_metacell_size, target_metacell_umis=target_metacell_umis, cell_umis=cell_umis, min_robust_size_factor=dissolve_min_robust_size_factor, min_convincing_gene_fold_factor=dissolve_min_convincing_gene_fold_factor, )
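
For orientation, a minimal usage sketch follows (it is not part of the module source). It assumes the
``metacells`` package is installed and that a small, already "clean" cells-by-genes UMI matrix is available;
the toy data, the variable names and the chosen ``random_seed`` value are illustrative only, and on real data
the cells and genes would normally be cleaned first, with the divide-and-conquer entry points used for larger
inputs.

    import numpy as np
    import scipy.sparse as sp
    from anndata import AnnData

    from metacells.pipeline.direct import compute_direct_metacells

    # Illustrative toy input: a small, sparse cells-by-genes matrix of UMI-like counts.
    rng = np.random.default_rng(0)
    umis = rng.poisson(0.5, size=(1_000, 500)).astype("float32")
    cells = AnnData(X=sp.csr_matrix(umis))

    # Compute metacells directly on the whole (small) data set.
    # ``random_seed`` is keyword-only and has no default; a non-zero value makes the run reproducible.
    compute_direct_metacells(cells, random_seed=123456)

    # Each cell now carries an integer "metacell" index; outliers are marked with -1.
    metacell_of_cells = cells.obs["metacell"].values
    print("metacells:", int(np.max(metacell_of_cells)) + 1)
    print("outliers:", int(np.sum(metacell_of_cells == -1)))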