"""
Exclude
-------
Raw single-cell RNA sequencing data is notoriously noisy and "dirty". The pipeline steps here perform an initial analysis
of the data and exclude some of it, so that it does not harm the metacells computation. The steps provided here are expected
to be generically useful, but, as always, specific data sets may require custom cleaning steps on a case-by-case basis.
"""
from re import Pattern
from typing import Collection
from typing import List
from typing import Optional
from typing import Union
from anndata import AnnData # type: ignore
import metacells.parameters as pr
import metacells.tools as tl
import metacells.utilities as ut
# Public API of this module: the names exported by a star-import of it.
# NOTE(review): ``extract_clean_data`` is not defined in the portion of the file visible here - confirm that it is
# defined further down, otherwise a star-import will fail.
__all__ = [
"exclude_genes",
"exclude_cells",
"extract_clean_data",
]
[docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def exclude_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    bursty_max_sampled_cells: Optional[int] = pr.bursty_lonely_max_sampled_cells,
    bursty_downsample_min_samples: int = pr.bursty_lonely_downsample_min_samples,
    # Each quantile bound takes the default of the same-named parameter (min from min, max from max).
    bursty_downsample_min_cell_quantile: float = pr.bursty_lonely_downsample_min_cell_quantile,
    bursty_downsample_max_cell_quantile: float = pr.bursty_lonely_downsample_max_cell_quantile,
    bursty_min_gene_total: int = pr.bursty_lonely_min_gene_total,
    bursty_min_gene_normalized_variance: float = pr.bursty_lonely_min_gene_normalized_variance,
    bursty_max_gene_similarity: float = pr.bursty_lonely_max_gene_similarity,
    properly_sampled_min_gene_total: Optional[int] = pr.properly_sampled_min_gene_total,
    excluded_gene_names: Optional[Collection[str]] = None,
    excluded_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    random_seed: int,
) -> None:
    """
    Exclude a subset of the genes from the metacells computation.

    You can also just manually set the ``excluded_gene`` mask, or further manipulate it after calling this function.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes.

    **Returns**

    Sets the following in the data:

    Variable (gene) annotations:
        ``bursty_lonely_gene``
            A mask of the "bursty lonely" genes, as computed by step 1 below (only if ``bursty_max_sampled_cells`` is
            not ``None``).

        ``properly_sampled_gene``
            A mask of the "properly sampled" genes (only if ``properly_sampled_min_gene_total`` is not ``None``).

        ``excluded_gene``
            A mask of the genes which were excluded (by name, due to being bursty lonely, or due to not being properly
            sampled).

    If ``bursty_max_sampled_cells``, ``properly_sampled_min_gene_total``, ``excluded_gene_names`` and
    ``excluded_gene_patterns`` are all ``None``, nothing is computed and no annotation is set.

    **Computation Parameters**

    1. Unless ``bursty_max_sampled_cells`` is ``None``, invoke
       :py:func:`metacells.tools.bursty_lonely.find_bursty_lonely_genes` using
       ``bursty_max_sampled_cells`` (default: {bursty_max_sampled_cells}),
       ``bursty_downsample_min_samples`` (default: {bursty_downsample_min_samples}),
       ``bursty_downsample_min_cell_quantile`` (default: {bursty_downsample_min_cell_quantile}),
       ``bursty_downsample_max_cell_quantile`` (default: {bursty_downsample_max_cell_quantile}),
       ``bursty_min_gene_total`` (default: {bursty_min_gene_total}),
       ``bursty_min_gene_normalized_variance`` (default: {bursty_min_gene_normalized_variance}),
       ``bursty_max_gene_similarity`` (default: {bursty_max_gene_similarity}),
       and ``random_seed``.
       Any "bursty_lonely_genes" will be excluded.

    2. Unless ``properly_sampled_min_gene_total`` is ``None``, invoke
       :py:func:`metacells.tools.properly_sampled.find_properly_sampled_genes` using
       ``properly_sampled_min_gene_total`` (default: {properly_sampled_min_gene_total}). Genes which are not properly
       sampled will be excluded.

    3. If either is specified, invoke :py:func:`metacells.tools.named.find_named_genes` to also exclude genes based on
       their name, using the ``excluded_gene_names`` (default: {excluded_gene_names}) and ``excluded_gene_patterns``
       (default: {excluded_gene_patterns}).
    """
    if (
        bursty_max_sampled_cells is None
        and properly_sampled_min_gene_total is None
        and excluded_gene_names is None
        and excluded_gene_patterns is None
    ):
        # No exclusion criterion was requested, so nothing (not even ``excluded_gene``) is written.
        return

    # One mask of genes-to-exclude per enabled criterion, in the order the criteria are evaluated.
    exclusion_masks: List[ut.NumpyVector] = []

    if bursty_max_sampled_cells is not None:
        tl.find_bursty_lonely_genes(
            adata,
            what,
            max_sampled_cells=bursty_max_sampled_cells,
            downsample_min_samples=bursty_downsample_min_samples,
            downsample_min_cell_quantile=bursty_downsample_min_cell_quantile,
            downsample_max_cell_quantile=bursty_downsample_max_cell_quantile,
            min_gene_total=bursty_min_gene_total,
            min_gene_normalized_variance=bursty_min_gene_normalized_variance,
            max_gene_similarity=bursty_max_gene_similarity,
            random_seed=random_seed,
        )
        exclusion_masks.append(ut.get_v_numpy(adata, "bursty_lonely_gene"))

    if properly_sampled_min_gene_total is not None:
        tl.find_properly_sampled_genes(adata, what, min_gene_total=properly_sampled_min_gene_total)
        # This annotation marks the genes to keep, so invert it to obtain the genes to exclude.
        exclusion_masks.append(~ut.get_v_numpy(adata, "properly_sampled_gene"))

    if excluded_gene_names is not None or excluded_gene_patterns is not None:
        # With ``to=None`` the mask is taken from the returned series.
        named_genes_series = tl.find_named_genes(
            adata, names=excluded_gene_names, patterns=excluded_gene_patterns, to=None
        )
        assert named_genes_series is not None
        exclusion_masks.append(named_genes_series.values)

    # The early return above guarantees at least one criterion was enabled.
    assert exclusion_masks
    excluded_genes_mask = exclusion_masks[0]
    for exclusion_mask in exclusion_masks[1:]:
        excluded_genes_mask = excluded_genes_mask | exclusion_mask

    ut.set_v_data(adata, "excluded_gene", excluded_genes_mask)
[docs]
@ut.logged()
@ut.timed_call()
def exclude_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    properly_sampled_min_cell_total: Optional[int],
    properly_sampled_max_cell_total: Optional[int],
    properly_sampled_max_excluded_genes_fraction: Optional[float],
    additional_cells_masks: Optional[List[str]] = None,
) -> None:
    """
    Exclude a subset of the cells from the metacells computation.

    The ``excluded_cell`` mask may also be set manually, or be further manipulated after calling this function.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes. Optionally, it may contain an
    ``excluded_gene`` mask of genes to be excluded from the metacells computation. That is, invoke this after calling
    :py:func:`exclude_genes` (if you wish to exclude any genes).

    **Returns**

    Sets the following in the full data:

    Observation (cell) annotations:
        ``properly_sampled_cell``
            A mask of the "properly sampled" cells.

        ``excluded_cell``
            A mask of the cells which were excluded (the cells which are not in ``properly_sampled_cell``, combined
            with any ``additional_cells_masks``).

    **Computation Parameters**

    1. Invoke :py:func:`metacells.tools.properly_sampled.find_properly_sampled_cells` using
       ``properly_sampled_min_cell_total`` (no default), ``properly_sampled_max_cell_total`` (no default) and
       ``properly_sampled_max_excluded_genes_fraction`` (no default).

    2. Exclude any cells which are not properly sampled (``|~properly_sampled_cell``), followed by the optional
       ``additional_cells_masks`` (using :py:func:`metacells.tools.mask.combine_masks`).
    """
    tl.find_properly_sampled_cells(
        adata,
        what,
        min_cell_total=properly_sampled_min_cell_total,
        max_cell_total=properly_sampled_max_cell_total,
        max_excluded_genes_fraction=properly_sampled_max_excluded_genes_fraction,
    )

    # The inverted properly-sampled mask always comes first; any caller-supplied masks follow it.
    mask_names = ["|~properly_sampled_cell"]
    if additional_cells_masks:
        mask_names = mask_names + additional_cells_masks

    tl.combine_masks(adata, mask_names, to="excluded_cell")