"""
High
----
"""
from typing import Optional
from typing import Union
import numpy as np
from anndata import AnnData # type: ignore
import metacells.parameters as pr
import metacells.utilities as ut
# Public API of this module: the names exported by ``from <module> import *``.
__all__ = [
"find_high_total_genes",
"find_high_topN_genes",
"find_high_fraction_genes",
"find_high_normalized_variance_genes",
"find_high_relative_variance_genes",
"find_metacells_marker_genes",
]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def find_high_total_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_total: int,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes whose total ``what`` (default: {what}) data, summed over all cells, is high.

    This should typically only be applied to downsampled data, so that variance in sampling depth
    does not affect the result.

    Genes with too-low expression are typically excluded from computations. In particular, genes
    may have all-zero expression, in which case including them just slows the computations (and
    triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_total_gene``
            A boolean mask indicating whether each gene was found to have a high total.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Call ``ut.get_v_numpy`` with ``sum=True`` to get the total of ``what`` for each gene.

    2. Select the genes whose total is at least ``min_gene_total``.
    """
    per_gene_totals = ut.get_v_numpy(adata, what, sum=True)
    is_high_total = per_gene_totals >= min_gene_total

    if not inplace:
        # Caller wants the mask back instead of having it stored in ``adata``.
        ut.log_return("high_total_genes", is_high_total)
        return ut.to_pandas_series(is_high_total, index=adata.var_names)

    ut.set_v_data(adata, "high_total_gene", is_high_total)
    return None
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def find_high_topN_genes(  # pylint: disable=invalid-name
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    topN: int,
    min_gene_topN: int,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes whose top-Nth value of ``what`` (default: {what}) data is high.

    This should typically only be applied to downsampled data, so that variance in sampling depth
    does not affect the result.

    Genes with too-low expression are typically excluded from computations. In particular, genes
    may have all-zero expression, in which case including them just slows the computations (and
    triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_top<topN>_gene``
            A boolean mask indicating whether each gene was found to have a high top-Nth value.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Call ``ut.rank_per`` on the columns (genes) of the data, asking for the element of rank
       ``max(n_obs - topN - 1, 1)``, to get the top-Nth value of each gene.

    2. Select the genes whose top-Nth value is at least ``min_gene_topN``.
    """
    gene_columns = ut.get_vo_proper(adata, what, layout="column_major")

    # NOTE(review): the rank is clamped to be at least 1, so the element of rank 0 is never
    # requested. Whether ``n_obs - topN - 1`` is exactly the N-th largest element depends on how
    # ``rank_per`` counts ranks - confirm against its documentation. Also confirm the behavior
    # when ``n_obs`` is 1, where the requested rank is still 1.
    wanted_rank = max(1, adata.n_obs - topN - 1)
    nth_value_per_gene = ut.rank_per(gene_columns, per="column", rank=wanted_rank)
    is_high_top = nth_value_per_gene >= min_gene_topN

    if not inplace:
        # Caller wants the mask back instead of having it stored in ``adata``.
        ut.log_return(f"high_top{topN}_genes", is_high_top)
        return ut.to_pandas_series(is_high_top, index=adata.var_names)

    ut.set_v_data(adata, f"high_top{topN}_gene", is_high_top)
    return None
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def find_high_fraction_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_fraction: float = pr.significant_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which make up a high fraction of the total ``what`` (default: {what}) data of the
    cells.

    Genes with too-low expression are typically excluded from computations. In particular, genes
    may have all-zero expression, in which case including them just slows the computations (and
    triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_fraction_gene``
            A boolean mask indicating whether each gene was found to have a high fraction.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.fraction_per` to get the fraction of each
       gene.

    2. Select the genes whose fraction is at least ``min_gene_fraction`` (default:
       {min_gene_fraction}).
    """
    gene_columns = ut.get_vo_proper(adata, what, layout="column_major")
    per_gene_fraction = ut.fraction_per(gene_columns, per="column")
    is_high_fraction = per_gene_fraction >= min_gene_fraction

    if not inplace:
        # Caller wants the mask back instead of having it stored in ``adata``.
        ut.log_return("high_fraction_genes", is_high_fraction)
        return ut.to_pandas_series(is_high_fraction, index=adata.var_names)

    ut.set_v_data(adata, "high_fraction_gene", is_high_fraction)
    return None
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def find_high_normalized_variance_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_normalized_variance: float = pr.significant_gene_normalized_variance,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes whose ``what`` (default: {what}) data has a high normalized variance.

    The normalized variance measures the variance / mean of each gene. See
    :py:func:`metacells.utilities.computation.normalized_variance_per` for details.

    Genes with a high normalized variance are "bursty", that is, have significantly different
    expression level in different cells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_normalized_variance_gene``
            A boolean mask indicating whether each gene was found to have a high normalized
            variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.normalized_variance_per` to get the
       normalized variance of each gene.

    2. Select the genes whose normalized variance is at least ``min_gene_normalized_variance``
       (default: {min_gene_normalized_variance}).
    """
    gene_columns = ut.get_vo_proper(adata, what, layout="column_major")
    per_gene_normalized_variance = ut.normalized_variance_per(gene_columns, per="column")
    is_high_variance = per_gene_normalized_variance >= min_gene_normalized_variance

    if not inplace:
        # Caller wants the mask back instead of having it stored in ``adata``.
        ut.log_return("high_normalized_variance_genes", is_high_variance)
        return ut.to_pandas_series(is_high_variance, index=adata.var_names)

    ut.set_v_data(adata, "high_normalized_variance_gene", is_high_variance)
    return None
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def find_high_relative_variance_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_relative_variance: float = pr.significant_gene_relative_variance,
    window_size: int = pr.relative_variance_window_size,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes whose ``what`` (default: {what}) data has a high relative variance.

    The relative variance measures the variance / mean of each gene relative to the other genes
    with a similar level of expression. See
    :py:func:`metacells.utilities.computation.relative_variance_per` for details.

    Genes with a high relative variance are good candidates for being selected as "marker genes",
    that is, be used to compute the similarity between cells. Using the relative variance
    compensates for the bias for selecting higher-expression genes, whose normalized variance can
    be larger due to random noise alone.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_relative_variance_gene``
            A boolean mask indicating whether each gene was found to have a high relative
            variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.relative_variance_per`, passing it the
       ``window_size`` (default: {window_size}), to get the relative variance of each gene.

    2. Select the genes whose relative variance is at least ``min_gene_relative_variance``
       (default: {min_gene_relative_variance}).
    """
    gene_columns = ut.get_vo_proper(adata, what, layout="column_major")
    per_gene_relative_variance = ut.relative_variance_per(
        gene_columns,
        per="column",
        window_size=window_size,
    )
    is_high_variance = per_gene_relative_variance >= min_gene_relative_variance

    if not inplace:
        # Caller wants the mask back instead of having it stored in ``adata``.
        ut.log_return("high_relative_variance_genes", is_high_variance)
        return ut.to_pandas_series(is_high_variance, index=adata.var_names)

    ut.set_v_data(adata, "high_relative_variance_gene", is_high_variance)
    return None