Source code for metacells.tools.rare

"""
Rare
----
"""

from typing import List
from typing import Optional
from typing import Set
from typing import Tuple
from typing import Union

import numpy as np
import scipy.cluster.hierarchy as sch  # type: ignore
import scipy.spatial.distance as scd  # type: ignore
from anndata import AnnData  # type: ignore

import metacells.parameters as pr
import metacells.utilities as ut

from .similarity import compute_var_var_similarity

__all__ = [
    "find_rare_gene_modules",
]



[docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def find_rare_gene_modules(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_genes: int = pr.rare_max_genes,
    max_gene_cell_fraction: float = pr.rare_max_gene_cell_fraction,
    min_gene_maximum: int = pr.rare_min_gene_maximum,
    genes_similarity_method: str = pr.rare_genes_similarity_method,
    genes_cluster_method: str = pr.rare_genes_cluster_method,
    min_genes_of_modules: int = pr.rare_min_genes_of_modules,
    min_cells_of_modules: int = pr.rare_min_cells_of_modules,
    target_metacell_size: float = pr.target_metacell_size,
    target_pile_size: int = pr.min_target_pile_size,
    max_cells_factor_of_random_pile: float = pr.rare_max_cells_factor_of_random_pile,
    min_module_correlation: float = pr.rare_min_module_correlation,
    min_related_gene_fold_factor: float = pr.rare_min_related_gene_fold_factor,
    max_related_gene_increase_factor: float = pr.rare_max_related_gene_increase_factor,
    min_cell_module_total: int = pr.rare_min_cell_module_total,
    inplace: bool = True,
    reproducible: bool,
) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]:
    """
    Detect rare genes modules based on ``what`` (default: {what}) data.

    Rare gene modules include genes which are weakly and rarely expressed, yet are highly correlated
    with each other, allowing for robust detection. Global analysis algorithms (such as metacells)
    tend to ignore or at least discount such genes.

    It is therefore useful to explicitly identify, in a pre-processing step, the few cells which
    express such rare gene modules. Once identified, these cells can be exempt from the global
    algorithm, or the global algorithm can be tweaked in some way to pay extra attention to them.

    If ``reproducible`` is ``True``, a slower (still parallel) but reproducible algorithm will be used to compute
    pearson correlations.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    Obeys (ignores the genes of) the ``lateral_gene`` per-gene (variable) annotation, if any.

    **Returns**

    Observation (Cell) Annotations
        ``cells_rare_gene_module``
            The index of the rare gene module each cell expresses the most, or ``-1`` in the common
            case it does not express any rare genes module.

        ``rare_cell``
            A boolean mask for the (few) cells that express a rare gene module.

    Variable (Gene) Annotations
        ``rare_gene``
            A boolean mask for the genes in any of the rare gene modules.

        ``rare_gene_module``
            The index of the rare gene module a gene belongs to (-1 for non-rare genes).

    If ``inplace``, these are written to to the data, and the function returns ``None``. Otherwise
    they are returned as tuple containing two data frames.

    **Computation Parameters**

    1. Pick as candidates all genes that are expressed in at most ``max_gene_cell_fraction``
       (default: {max_gene_cell_fraction}) of the cells, and whose maximal value in a cell is at least
       ``min_gene_maximum`` (default: {min_gene_maximum}). If a ``lateral_gene`` masks exist, exclude them from the
       candidates. Out of the candidates, pick at most ``max_genes`` (default {max_genes}) which are expressed in the
       least cells.

    2. Compute the similarity between the genes using
       :py:func:`metacells.tools.similarity.compute_var_var_similarity` using the
       ``genes_similarity_method`` (default: {genes_similarity_method}).

    3. Create a hierarchical clustering of the candidate genes using the ``genes_cluster_method``
       (default: {genes_cluster_method}).

    4. Identify gene modules in the hierarchical clustering which contain at least
       ``min_genes_of_modules`` genes (default: {min_genes_of_modules}), with an average gene-gene
       cross-correlation of at least ``min_module_correlation`` (default:
       {min_module_correlation}).

    5. Consider cells expressing of any of the genes in the gene module. If the expected number of
       such cells in each random pile of size ``target_pile_size`` (default: {target_pile_size}), whose total number of
       UMIs of the rare gene module is at least ``min_cell_module_total`` (default: {min_cell_module_total}), is more
       than the ``max_cells_factor_of_random_pile`` (default: {max_cells_factor_of_random_pile}) as a fraction of the
       target metacells size, then discard the rare gene module as not that rare after all.

    6. Add to the gene module all genes whose fraction in cells expressing any of the genes in the
       rare gene module is at least 2^``min_related_gene_fold_factor`` (default:
       {min_related_gene_fold_factor}) times their fraction in the rest of the population, as long
       as their maximal value in one of the expressing cells is at least ``min_gene_maximum``,
       as long as this doesn't add more than ``max_related_gene_increase_factor`` times the original
       number of cells to the rare gene module, and as long as they are not listed in the ``lateral_gene`` masks. If a
       gene is above the threshold for multiple gene modules, associate it with the gene module for which its fold
       factor is higher.

    7. Associate cells with the rare gene module if they contain at least ``min_cell_module_total``
       (default: {min_cell_module_total}) UMIs of the expanded rare gene module. If a cell meets the
       above threshold for several rare gene modules, it is associated with the one for which it
       contains more UMIs.
    """
    assert min_cells_of_modules > 0
    assert min_genes_of_modules > 0

    max_cells_of_random_pile = target_metacell_size * max_cells_factor_of_random_pile
    ut.log_calc("max_cells_of_random_pile", max_cells_of_random_pile)

    allowed_genes_mask = np.full(adata.n_vars, True, dtype="bool")

    if ut.has_data(adata, "lateral_gene"):
        allowed_genes_mask &= ~ut.get_v_numpy(adata, "lateral_gene")

    ut.log_calc("allowed_genes_mask", allowed_genes_mask)

    rare_module_of_cells = np.full(adata.n_obs, -1, dtype="int32")
    list_of_rare_gene_indices_of_modules: List[List[int]] = []

    umis_per_gene_per_cell_by_rows = ut.get_vo_proper(adata, what, layout="row_major")
    ut.log_calc("umis_per_gene_per_cell_by_rows", umis_per_gene_per_cell_by_rows)
    umis_per_gene_per_cell_by_columns = ut.get_vo_proper(adata, what, layout="column_major")
    ut.log_calc("umis_per_gene_per_cell_by_columns", umis_per_gene_per_cell_by_columns)

    candidates = _pick_candidates(
        adata_of_all_genes_of_all_cells=adata,
        what=what,
        max_genes=max_genes,
        max_gene_cell_fraction=max_gene_cell_fraction,
        min_gene_maximum=min_gene_maximum,
        min_genes_of_modules=min_genes_of_modules,
        allowed_genes_mask=allowed_genes_mask,
    )
    if candidates is None:
        return _results(
            adata=adata,
            rare_module_of_cells=rare_module_of_cells,
            list_of_rare_gene_indices_of_modules=list_of_rare_gene_indices_of_modules,
            inplace=inplace,
        )
    candidate_data, candidate_genes_indices = candidates

    similarities_between_candidate_genes = _genes_similarity(
        candidate_data=candidate_data, what=what, method=genes_similarity_method, reproducible=reproducible
    )

    linkage = _cluster_genes(
        similarities_between_candidate_genes=similarities_between_candidate_genes,
        genes_cluster_method=genes_cluster_method,
    )

    rare_gene_indices_of_modules = _identify_genes(
        candidate_genes_indices=candidate_genes_indices,
        similarities_between_candidate_genes=similarities_between_candidate_genes,
        linkage=linkage,
        min_module_correlation=min_module_correlation,
    )

    max_cells_of_modules = int(max_cells_of_random_pile * adata.n_obs / target_pile_size)
    ut.log_calc("max_cells_of_modules", max_cells_of_modules)

    related_gene_indices_of_modules = _related_genes(
        adata_of_all_genes_of_all_cells=adata,
        umis_per_gene_per_cell_by_columns=umis_per_gene_per_cell_by_columns,
        umis_per_gene_per_cell_by_rows=umis_per_gene_per_cell_by_rows,
        rare_gene_indices_of_modules=rare_gene_indices_of_modules,
        allowed_genes_mask=allowed_genes_mask,
        min_genes_of_modules=min_genes_of_modules,
        min_cells_of_modules=min_cells_of_modules,
        max_cells_of_modules=max_cells_of_modules,
        min_cell_module_total=min_cell_module_total,
        min_gene_maximum=min_gene_maximum,
        min_related_gene_fold_factor=min_related_gene_fold_factor,
        max_related_gene_increase_factor=max_related_gene_increase_factor,
    )

    _identify_cells(
        adata_of_all_genes_of_all_cells=adata,
        umis_per_gene_per_cell_by_columns=umis_per_gene_per_cell_by_columns,
        related_gene_indices_of_modules=related_gene_indices_of_modules,
        min_cells_of_modules=min_cells_of_modules,
        max_cells_of_modules=max_cells_of_modules,
        min_cell_module_total=min_cell_module_total,
        rare_module_of_cells=rare_module_of_cells,
    )

    list_of_rare_gene_indices_of_modules = _compress_modules(
        adata_of_all_genes_of_all_cells=adata,
        min_cells_of_modules=min_cells_of_modules,
        max_cells_of_modules=max_cells_of_modules,
        related_gene_indices_of_modules=related_gene_indices_of_modules,
        rare_module_of_cells=rare_module_of_cells,
    )

    return _results(
        adata=adata,
        rare_module_of_cells=rare_module_of_cells,
        list_of_rare_gene_indices_of_modules=list_of_rare_gene_indices_of_modules,
        inplace=inplace,
    )



@ut.timed_call()
def _pick_candidates(
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    max_genes: int,
    max_gene_cell_fraction: float,
    min_gene_maximum: int,
    min_genes_of_modules: int,
    allowed_genes_mask: ut.NumpyVector,
) -> Optional[Tuple[AnnData, ut.NumpyVector]]:
    data = ut.get_vo_proper(adata_of_all_genes_of_all_cells, what, layout="column_major")
    nnz_cells_of_genes = ut.nnz_per(data, per="column")

    nnz_cell_fraction_of_genes = nnz_cells_of_genes / adata_of_all_genes_of_all_cells.n_obs
    nnz_cell_fraction_mask_of_genes = nnz_cell_fraction_of_genes <= max_gene_cell_fraction

    max_umis_of_genes = ut.max_per(data, per="column")
    max_umis_mask_of_genes = max_umis_of_genes >= min_gene_maximum

    candidates_mask_of_genes = max_umis_mask_of_genes & nnz_cell_fraction_mask_of_genes & allowed_genes_mask
    ut.log_calc("candidate_genes", candidates_mask_of_genes)

    candidate_genes_indices = np.where(candidates_mask_of_genes)[0]
    candidate_genes_count = candidate_genes_indices.size

    if candidate_genes_count > max_genes:
        nnz_cell_fraction_of_candidates = nnz_cell_fraction_of_genes[candidate_genes_indices]
        partitioned_genes_indices = np.argpartition(nnz_cell_fraction_of_candidates, max_genes)
        candidate_genes_indices = candidate_genes_indices[partitioned_genes_indices[0:max_genes]]
        candidate_genes_count = candidate_genes_indices.size

    if candidate_genes_count < min_genes_of_modules:
        return None

    candidate_data = ut.slice(
        adata_of_all_genes_of_all_cells, name=".candidate_genes", vars=candidate_genes_indices, top_level=False
    )
    return candidate_data, candidate_genes_indices


@ut.timed_call()
def _genes_similarity(
    *,
    candidate_data: AnnData,
    what: Union[str, ut.Matrix],
    method: str,
    reproducible: bool,
) -> ut.NumpyMatrix:
    similarity = compute_var_var_similarity(
        candidate_data, what, method=method, reproducible=reproducible, inplace=False
    )
    assert similarity is not None
    return ut.to_numpy_matrix(similarity, only_extract=True)


# TODO: Replicated in metacell.pipeline.related_genes
@ut.timed_call()
def _cluster_genes(
    similarities_between_candidate_genes: ut.NumpyMatrix,
    genes_cluster_method: str,
) -> List[Tuple[int, int]]:
    with ut.timed_step("scipy.pdist"):
        ut.timed_parameters(size=similarities_between_candidate_genes.shape[0])
        distances = scd.pdist(similarities_between_candidate_genes)

    with ut.timed_step("scipy.linkage"):
        ut.timed_parameters(size=distances.shape[0], method=genes_cluster_method)
        linkage = sch.linkage(distances, method=genes_cluster_method)

    return linkage


@ut.timed_call()
def _identify_genes(
    *,
    candidate_genes_indices: ut.NumpyVector,
    similarities_between_candidate_genes: ut.NumpyMatrix,
    min_module_correlation: float,
    linkage: List[Tuple[int, int]],
) -> List[List[int]]:
    candidate_genes_count = candidate_genes_indices.size
    np.fill_diagonal(similarities_between_candidate_genes, None)
    combined_candidate_indices = {index: [index] for index in range(candidate_genes_count)}

    for link_index, link_data in enumerate(linkage):
        link_index += candidate_genes_count

        left_index = int(link_data[0])
        right_index = int(link_data[1])

        left_combined_candidates = combined_candidate_indices.get(left_index)
        right_combined_candidates = combined_candidate_indices.get(right_index)
        if not left_combined_candidates or not right_combined_candidates:
            continue

        link_combined_candidates = sorted(left_combined_candidates + right_combined_candidates)
        assert link_combined_candidates
        link_similarities = similarities_between_candidate_genes[link_combined_candidates, :][  #
            :, link_combined_candidates
        ]
        average_link_similarity = np.nanmean(link_similarities)
        if average_link_similarity < min_module_correlation:
            continue

        combined_candidate_indices[link_index] = link_combined_candidates
        del combined_candidate_indices[left_index]
        del combined_candidate_indices[right_index]

    return [
        list(candidate_genes_indices[candidate_indices]) for candidate_indices in combined_candidate_indices.values()
    ]


@ut.timed_call()
def _related_genes(  # pylint: disable=too-many-statements,too-many-branches
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    umis_per_gene_per_cell_by_rows: ut.ProperMatrix,
    umis_per_gene_per_cell_by_columns: ut.ProperMatrix,
    rare_gene_indices_of_modules: List[List[int]],
    allowed_genes_mask: ut.NumpyVector,
    min_genes_of_modules: int,
    min_gene_maximum: int,
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    min_cell_module_total: int,
    min_related_gene_fold_factor: float,
    max_related_gene_increase_factor: float,
) -> List[List[int]]:
    total_umis_per_gene = ut.sum_per(umis_per_gene_per_cell_by_columns, per="column")
    ut.log_calc("total_umis_per_gene", total_umis_per_gene)

    ut.log_calc("genes for modules:")
    modules_count = 0
    related_gene_indices_of_modules: List[List[int]] = []

    rare_gene_indices_of_any: Set[int] = set()
    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) >= min_genes_of_modules:
            rare_gene_indices_of_any.update(list(rare_gene_indices_of_module))

    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) < min_genes_of_modules:
            continue

        module_index = modules_count
        modules_count += 1

        with ut.log_step("- module", module_index):
            ut.log_calc(
                "rare_gene_names", sorted(adata_of_all_genes_of_all_cells.var_names[rare_gene_indices_of_module])
            )

            umis_per_module_gene_per_cell_by_columns = umis_per_gene_per_cell_by_columns[:, rare_gene_indices_of_module]
            ut.log_calc("umis_per_module_gene_per_cell_by_columns", umis_per_module_gene_per_cell_by_columns)
            assert ut.is_layout(umis_per_module_gene_per_cell_by_columns, "column_major")

            umis_per_module_gene_per_cell_by_rows = ut.to_layout(
                umis_per_module_gene_per_cell_by_columns, layout="row_major"
            )
            ut.log_calc("umis_per_module_gene_per_cell_by_rows", umis_per_module_gene_per_cell_by_rows)

            total_module_umis_per_cell = ut.sum_per(umis_per_module_gene_per_cell_by_rows, per="row")
            ut.log_calc("total_module_umis_per_cell", total_module_umis_per_cell)

            mask_of_expressed_cells = total_module_umis_per_cell > 0
            expressed_cells_count = np.sum(mask_of_expressed_cells)

            if expressed_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("expressed_cells", ut.mask_description(mask_of_expressed_cells) + " (too many)")
                continue

            if expressed_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("expressed_cells", ut.mask_description(mask_of_expressed_cells) + " (too few)")
                continue

            ut.log_calc("expressed_cells", mask_of_expressed_cells)

            umis_per_gene_per_expressed_cell_by_rows = umis_per_gene_per_cell_by_rows[mask_of_expressed_cells, :]
            ut.log_calc("umis_per_gene_per_expressed_cell_by_rows", umis_per_gene_per_expressed_cell_by_rows)
            assert ut.is_layout(umis_per_gene_per_expressed_cell_by_rows, "row_major")

            umis_per_gene_per_expressed_cell_by_columns = ut.to_layout(
                umis_per_gene_per_expressed_cell_by_rows, layout="column_major"
            )
            ut.log_calc("umis_per_module_gene_per_cell_by_columns", umis_per_module_gene_per_cell_by_columns)

            max_expressed_umis_per_gene = ut.max_per(umis_per_gene_per_expressed_cell_by_columns, per="column")
            ut.log_calc("max_expressed_umis_per_gene", max_expressed_umis_per_gene)

            total_expressed_umis_per_gene = ut.sum_per(umis_per_gene_per_expressed_cell_by_columns, per="column")
            expressed_fraction_per_gene = total_expressed_umis_per_gene / sum(total_expressed_umis_per_gene)
            ut.log_calc("expressed_fraction_per_gene", expressed_fraction_per_gene)

            total_background_umis_per_gene = total_umis_per_gene - total_expressed_umis_per_gene
            background_fraction_per_gene = total_background_umis_per_gene / sum(total_background_umis_per_gene)
            ut.log_calc("background_fraction_per_gene", background_fraction_per_gene)

            mask_of_related_genes = (
                allowed_genes_mask
                & (max_expressed_umis_per_gene >= min_gene_maximum)
                & (expressed_fraction_per_gene >= background_fraction_per_gene * (2**min_related_gene_fold_factor))
            )
            ut.log_calc("mask_of_related_genes", mask_of_related_genes)

            related_gene_indices = np.where(mask_of_related_genes)[0]
            assert np.all(mask_of_related_genes[rare_gene_indices_of_module])

            mask_of_strong_base_cells = total_module_umis_per_cell >= min_cell_module_total
            count_of_strong_base_cells = np.sum(mask_of_strong_base_cells)

            if ut.logging_calc():
                ut.log_calc(
                    "candidate_gene_names", sorted(adata_of_all_genes_of_all_cells.var_names[related_gene_indices])
                )
                ut.log_calc("mask_of_strong_base_cells", mask_of_strong_base_cells)

            related_gene_indices_of_module = list(rare_gene_indices_of_module)
            for gene_index in related_gene_indices:
                if gene_index in rare_gene_indices_of_module:
                    continue

                if gene_index in rare_gene_indices_of_any:
                    ut.log_calc(
                        f"- candidate gene {adata_of_all_genes_of_all_cells.var_names[gene_index]} "
                        f"belongs to another module"
                    )
                    continue

                umis_of_related_gene_per_cell = ut.to_numpy_vector(
                    umis_per_gene_per_cell_by_columns[:, gene_index], copy=True
                )
                umis_of_related_gene_per_cell += total_module_umis_per_cell
                mask_of_strong_related_cells = umis_of_related_gene_per_cell >= min_cell_module_total
                count_of_strong_related_cells = np.sum(mask_of_strong_related_cells)
                ut.log_calc(
                    f"- candidate gene {adata_of_all_genes_of_all_cells.var_names[gene_index]} "
                    f"strong cells: {count_of_strong_related_cells} "
                    f"factor: {count_of_strong_related_cells / count_of_strong_base_cells}"
                )
                if count_of_strong_related_cells > max_related_gene_increase_factor * count_of_strong_base_cells:
                    continue

                related_gene_indices_of_module.append(gene_index)

            related_gene_indices_of_modules.append(related_gene_indices_of_module)

    if ut.logging_calc():
        ut.log_calc("related genes for modules:")
        for module_index, related_gene_indices_of_module in enumerate(related_gene_indices_of_modules):
            ut.log_calc(
                f"- module {module_index} related_gene_names",
                sorted(adata_of_all_genes_of_all_cells.var_names[related_gene_indices_of_module]),
            )

    return related_gene_indices_of_modules


@ut.timed_call()
def _identify_cells(
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    umis_per_gene_per_cell_by_columns: ut.ProperMatrix,
    related_gene_indices_of_modules: List[List[int]],
    min_cell_module_total: int,
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    rare_module_of_cells: ut.NumpyVector,
) -> None:
    max_strength_of_cells = np.zeros(adata_of_all_genes_of_all_cells.n_obs)

    ut.log_calc("cells for modules:")
    modules_count = len(related_gene_indices_of_modules)
    for module_index, related_gene_indices_of_module in enumerate(related_gene_indices_of_modules):
        if len(related_gene_indices_of_module) == 0:
            continue

        with ut.log_step(
            "- module",
            module_index,
            formatter=lambda module_index: ut.progress_description(modules_count, module_index, "module"),
        ):
            umis_per_related_gene_per_cell_by_columns = umis_per_gene_per_cell_by_columns[
                :, related_gene_indices_of_module
            ]
            ut.log_calc("umis_per_related_gene_per_cell_by_columns", umis_per_related_gene_per_cell_by_columns)

            umis_per_related_gene_per_cell_by_rows = ut.to_layout(
                umis_per_related_gene_per_cell_by_columns, layout="row_major"
            )
            ut.log_calc("umis_per_related_gene_per_cell_by_rows", umis_per_related_gene_per_cell_by_rows)

            total_related_umis_per_cell = ut.sum_per(umis_per_related_gene_per_cell_by_rows, per="row")

            mask_of_strong_cells_of_module = total_related_umis_per_cell >= min_cell_module_total

            median_strength_of_module = np.median(total_related_umis_per_cell[mask_of_strong_cells_of_module])  #
            strong_cells_count = np.sum(mask_of_strong_cells_of_module)

            if strong_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("strong_cells", ut.mask_description(mask_of_strong_cells_of_module) + " (too many)")  #
                related_gene_indices_of_module.clear()
                continue

            if strong_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("strong_cells", ut.mask_description(mask_of_strong_cells_of_module) + " (too few)")  #
                related_gene_indices_of_module.clear()
                continue

            ut.log_calc("strong_cells", mask_of_strong_cells_of_module)

            strength_of_all_cells = total_related_umis_per_cell / median_strength_of_module
            mask_of_strong_cells_of_module &= strength_of_all_cells >= max_strength_of_cells
            max_strength_of_cells[mask_of_strong_cells_of_module] = strength_of_all_cells[
                mask_of_strong_cells_of_module
            ]

            rare_module_of_cells[mask_of_strong_cells_of_module] = module_index


@ut.timed_call()
def _compress_modules(
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    related_gene_indices_of_modules: List[List[int]],
    rare_module_of_cells: ut.NumpyVector,
) -> List[List[int]]:
    list_of_rare_gene_indices_of_modules: List[List[int]] = []
    list_of_names_of_genes_of_modules: List[List[str]] = []

    cell_counts_of_modules: List[int] = []

    ut.log_calc("compress modules:")
    modules_count = len(related_gene_indices_of_modules)
    for module_index, gene_indices_of_module in enumerate(related_gene_indices_of_modules):
        if len(gene_indices_of_module) == 0:
            continue

        with ut.log_step(
            "- module",
            module_index,
            formatter=lambda module_index: ut.progress_description(modules_count, module_index, "module"),
        ):
            module_cells_mask = rare_module_of_cells == module_index
            module_cells_count = np.sum(module_cells_mask)

            if module_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("cells", str(module_cells_count) + " (too few)")
                rare_module_of_cells[module_cells_mask] = -1
                continue

            if module_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("cells", str(module_cells_count) + " (too many)")
                rare_module_of_cells[module_cells_mask] = -1
                continue

            ut.log_calc("cells", module_cells_count)

            next_module_index = len(list_of_rare_gene_indices_of_modules)
            if module_index != next_module_index:
                ut.log_calc("is reindexed to", next_module_index)
                rare_module_of_cells[module_cells_mask] = next_module_index
                module_index = next_module_index

            next_module_index += 1
            list_of_rare_gene_indices_of_modules.append(gene_indices_of_module)

            if ut.logging_calc():
                cell_counts_of_modules.append(np.sum(module_cells_mask))
            list_of_names_of_genes_of_modules.append(  #
                sorted(adata_of_all_genes_of_all_cells.var_names[gene_indices_of_module])
            )

    if ut.logging_calc():
        ut.log_calc("final modules:")
        for module_index, (module_cells_count, module_gene_names) in enumerate(
            zip(cell_counts_of_modules, list_of_names_of_genes_of_modules)
        ):
            ut.log_calc(f"- module: {module_index} cells: {module_cells_count} genes: {module_gene_names}")  #

    return list_of_rare_gene_indices_of_modules


def _results(
    *,
    adata: AnnData,
    rare_module_of_cells: ut.NumpyVector,
    list_of_rare_gene_indices_of_modules: List[List[int]],
    inplace: bool,
) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]:
    assert np.max(rare_module_of_cells) == len(list_of_rare_gene_indices_of_modules) - 1

    if not inplace:
        var_metrics = ut.to_pandas_frame(index=adata.var_names)

    rare_gene_mask = np.zeros(adata.n_vars, dtype="bool")
    rare_gene_modules = np.full(adata.n_vars, -1, dtype="int32")
    for module_index, rare_gene_indices_of_module in enumerate(list_of_rare_gene_indices_of_modules):
        rare_gene_mask[rare_gene_indices_of_module] = True
        rare_gene_modules[rare_gene_indices_of_module] = module_index

    if inplace:
        ut.set_v_data(adata, "rare_gene", rare_gene_mask)
        ut.set_v_data(adata, "rare_gene_module", rare_gene_modules, formatter=ut.groups_description)
    else:
        var_metrics["rare_gene"] = rare_gene_mask
        ut.log_return("rare_gene", rare_gene_mask)
        var_metrics["rare_gene_module"] = rare_gene_modules
        ut.log_return("rare_gene_module", rare_gene_modules, formatter=ut.groups_description)

    if inplace:
        ut.set_o_data(adata, "cells_rare_gene_module", rare_module_of_cells, formatter=ut.groups_description)
        ut.set_o_data(adata, "rare_cell", rare_module_of_cells >= 0)
        return None

    obs_metrics = ut.to_pandas_frame(index=adata.obs_names)
    ut.log_return("cells_rare_gene_module", rare_module_of_cells, formatter=ut.groups_description)
    ut.log_return("rare_cell", rare_module_of_cells >= 0)

    return obs_metrics, var_metrics