Source code for metacells.tools.bursty_lonely

"""
Noisy Lonely
------------
"""

from typing import Optional
from typing import Union

import numpy as np
from anndata import AnnData  # type: ignore

import metacells.parameters as pr
import metacells.utilities as ut
from metacells.tools.downsample import downsample_cells
from metacells.tools.filter import filter_data
from metacells.tools.high import find_high_normalized_variance_genes
from metacells.tools.high import find_high_total_genes
from metacells.tools.similarity import compute_var_var_similarity

__all__ = [
    "find_bursty_lonely_genes",
]



[docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def find_bursty_lonely_genes(  # pylint: disable=too-many-statements
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_sampled_cells: int = pr.bursty_lonely_max_sampled_cells,
    downsample_min_samples: int = pr.bursty_lonely_downsample_min_samples,
    downsample_min_cell_quantile: float = pr.bursty_lonely_downsample_max_cell_quantile,
    downsample_max_cell_quantile: float = pr.bursty_lonely_downsample_min_cell_quantile,
    min_gene_total: int = pr.bursty_lonely_min_gene_total,
    min_gene_normalized_variance: float = pr.bursty_lonely_min_gene_normalized_variance,
    max_gene_similarity: float = pr.bursty_lonely_max_gene_similarity,
    inplace: bool = True,
    random_seed: int,
) -> Optional[ut.PandasSeries]:
    """
    Detect "bursty lonely" genes based on ``what`` (default: {what}) data.

    Return the indices of genes which are "bursty" (have high variance compared to their mean) and
    also "lonely" (have low correlation with all other genes). Such genes should be excluded since
    they will never meaningfully help us compute groups, and will actively cause profiles to be
    considered "deviants".

    Noisy genes have high expression and variance. Lonely genes have no (or low) correlations with
    any other gene. Noisy lonely genes tend to throw off clustering algorithms. In general, such
    algorithms try to group together cells with the same overall biological state. Since the genes
    are lonely, they don't contribute towards this goal. Since they are bursty, they actively hamper
    this, because they make cells which are otherwise similar appear different (just for this lonely
    gene).

    It is therefore useful to explicitly identify, in a pre-processing step, the (few) such genes,
    and exclude them from the rest of the analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``bursty_lonely_genes``
            A boolean mask indicating whether each gene was found to be a "bursty lonely" gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number
       of random cells. Specify a non-zero random seed to make this reproducible.

    2. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the cells to the
       same total number of UMIs, using the ``downsample_min_samples`` (default:
       {downsample_min_samples}), ``downsample_min_cell_quantile`` (default:
       {downsample_min_cell_quantile}), ``downsample_max_cell_quantile`` (default:
       {downsample_max_cell_quantile}) and the ``random_seed``.

    3. Find "bursty" genes which have a total number of UMIs of at least ``min_gene_total`` (default:
       {min_gene_total}) and a normalized variance of at least ``min_gene_normalized_variance``
       (default: ``min_gene_normalized_variance``).

    4. Cross-correlate the bursty genes.

    5. Find the bursty "lonely" genes whose maximal correlation is at most
       ``max_gene_similarity`` (default: {max_gene_similarity}) with all other genes.
    """
    if max_sampled_cells < adata.n_obs:
        np.random.seed(random_seed)
        cell_indices = np.random.choice(np.arange(adata.n_obs), size=max_sampled_cells, replace=False)
        sdata = ut.slice(adata, obs=cell_indices, name=".sampled", top_level=False)
    else:
        sdata = ut.copy_adata(adata, top_level=False)

    track_var: Optional[str] = "sampled_gene_index"

    downsample_cells(
        sdata,
        what,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        random_seed=random_seed,
    )

    find_high_total_genes(sdata, "downsampled", min_gene_total=min_gene_total)

    results = filter_data(
        sdata, name="high_total", top_level=False, track_var=track_var, var_masks=["&high_total_gene"]
    )
    track_var = None
    assert results is not None
    ht_data = results[0]

    bursty_lonely_genes_mask = np.full(adata.n_vars, False)

    if ht_data is not None:
        ht_genes_count = ht_data.shape[1]

        ht_gene_ht_gene_similarity_frame = compute_var_var_similarity(
            ht_data, "downsampled", inplace=False, reproducible=(random_seed != 0)
        )
        assert ht_gene_ht_gene_similarity_frame is not None

        ht_gene_ht_gene_similarity_matrix = ut.to_numpy_matrix(ht_gene_ht_gene_similarity_frame, only_extract=True)
        ht_gene_ht_gene_similarity_matrix = ut.to_layout(
            ht_gene_ht_gene_similarity_matrix, layout="row_major", symmetric=True
        )
        np.fill_diagonal(ht_gene_ht_gene_similarity_matrix, -1)

        htv_mask_series = find_high_normalized_variance_genes(
            ht_data, "downsampled", min_gene_normalized_variance=min_gene_normalized_variance, inplace=False
        )
        assert htv_mask_series is not None
        htv_mask = ut.to_numpy_vector(htv_mask_series)

        htv_genes_count = np.sum(htv_mask)
        assert htv_genes_count <= ht_genes_count

        if htv_genes_count > 0:
            htv_gene_ht_gene_similarity_matrix = ht_gene_ht_gene_similarity_matrix[htv_mask, :]
            assert ut.is_layout(htv_gene_ht_gene_similarity_matrix, "row_major")
            assert htv_gene_ht_gene_similarity_matrix.shape == (htv_genes_count, ht_genes_count)

            max_similarity_of_htv_genes = ut.max_per(htv_gene_ht_gene_similarity_matrix, per="row")
            htvl_mask = max_similarity_of_htv_genes <= max_gene_similarity
            htvl_genes_count = np.sum(htvl_mask)
            ut.log_calc("bursty_lonely_genes_count", htvl_genes_count)

            if htvl_genes_count > 0:
                base_index_of_ht_genes = ut.get_v_numpy(ht_data, "sampled_gene_index")
                base_index_of_htv_genes = base_index_of_ht_genes[htv_mask]
                base_index_of_htvl_genes = base_index_of_htv_genes[htvl_mask]

                bursty_lonely_genes_mask[base_index_of_htvl_genes] = True

                htvl_gene_ht_gene_similarity_matrix = htv_gene_ht_gene_similarity_matrix[htvl_mask, :]
                htvl_gene_ht_gene_similarity_matrix = ut.to_layout(
                    htvl_gene_ht_gene_similarity_matrix, layout="row_major"
                )
                assert htvl_gene_ht_gene_similarity_matrix.shape == (htvl_genes_count, ht_genes_count)

                if ut.logging_calc():
                    i_gene_totals = ut.get_v_numpy(sdata, "downsampled", sum=True)
                    ht_mask = ut.get_v_numpy(sdata, "high_total_gene")
                    i_total = np.sum(i_gene_totals)
                    htvl_gene_totals = i_gene_totals[ht_mask][htv_mask][htvl_mask]
                    top_similarity_of_htvl_genes = ut.top_per(htvl_gene_ht_gene_similarity_matrix, 10, per="row")
                    for htvl_index, gene_index in enumerate(base_index_of_htvl_genes):
                        gene_name = adata.var_names[gene_index]
                        gene_total = htvl_gene_totals[htvl_index]
                        gene_percent = 100 * gene_total / i_total
                        similar_ht_values = ut.to_numpy_vector(top_similarity_of_htvl_genes[htvl_index, :])  #
                        assert len(similar_ht_values) == ht_genes_count
                        top_similar_ht_mask = similar_ht_values > 0
                        top_similar_ht_values = similar_ht_values[top_similar_ht_mask]
                        top_similar_ht_indices = base_index_of_ht_genes[top_similar_ht_mask]
                        top_similar_ht_names = adata.var_names[top_similar_ht_indices]
                        ut.log_calc(
                            f"- {gene_name}",
                            f"total downsampled UMIs: {gene_total} "
                            + f"({gene_percent:.4g}%), correlated with: "
                            + ", ".join(
                                [
                                    f"{similar_gene_name}: {similar_gene_value:.4g}"
                                    for similar_gene_value, similar_gene_name in reversed(
                                        sorted(zip(top_similar_ht_values, top_similar_ht_names))
                                    )
                                ]
                            ),
                        )

    if ut.logging_calc():
        ut.log_calc("bursty_lonely_gene_names", sorted(list(adata.var_names[bursty_lonely_genes_mask])))

    if inplace:
        ut.set_v_data(adata, "bursty_lonely_gene", bursty_lonely_genes_mask)
        return None

    ut.log_return("bursty_lonely_genes", bursty_lonely_genes_mask)
    return ut.to_pandas_series(bursty_lonely_genes_mask, index=adata.var_names)