# Source code for metacells.tools.group

"""
Group
-----
"""

from hashlib import shake_128
from typing import Any
from typing import Callable
from typing import List
from typing import Optional
from typing import Union

import numpy as np
from anndata import AnnData  # type: ignore

import metacells.utilities as ut

__all__ = [
    "group_obs_data",
    "group_obs_annotation",
]



# [docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def group_obs_data(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    groups: Union[str, ut.Vector],
    name: Optional[str] = None,
    prefix: Optional[str] = None,
) -> Optional[AnnData]:
    """
    Compute new data which has the ``what`` (default: {what}) sum of the observations (cells) for
    each group.

    For example, having computed a metacell index for each cell, compute the per-metacell data
    for further analysis.

    If ``groups`` is a string, it is expected to be the name of a per-observation vector annotation.
    Otherwise it should be a vector. The group indices should be integers, where negative values
    indicate "no group" and non-negative values indicate the index of the group to which each
    observation (cell) belongs.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    An annotated data where each observation is the sum of the group of original observations
    (cells). Observations with a negative group index are discarded. If all observations are
    discarded, return ``None``.

    The new data will contain only:

    * A single observation for each group. The name of each observation will be the optional ``prefix``
      (default: {prefix}), followed by the group's index, followed by ``.`` and a 2-digit checksum of
      the grouped members.

    * An ``X`` member holding the summed-per-group data.

    * A new ``grouped`` per-observation data which counts, for each group, the number
      of grouped observations summed into it.

    If ``name`` is not specified, the data will be unnamed. Otherwise, if it starts with a ``.``, it
    will be appended to the current name (if any). Otherwise, ``name`` is the new name.
    """
    group_of_cells = ut.get_o_numpy(adata, groups, formatter=ut.groups_description)

    # Sum the rows (cells) of each group; a row-major layout is requested for the per-row summing.
    data = ut.get_vo_proper(adata, what, layout="row_major")
    results = ut.sum_groups(data, group_of_cells, per="row")
    if results is None:
        # Nothing was grouped (all observations were discarded), so there is no data to return.
        return None
    summed_data, cell_counts = results

    # The grouped data keeps the original variables (genes); its observations are the groups.
    gdata = AnnData(summed_data)
    gdata.var_names = adata.var_names
    gdata.obs_names = _obs_names(prefix or "", ut.to_numpy_vector(adata.obs_names), group_of_cells)

    # Start from the name of the original data, then apply the requested ``name`` on top of it.
    ut.set_name(gdata, ut.get_name(adata))
    ut.set_name(gdata, name)

    ut.set_o_data(gdata, "grouped", cell_counts, formatter=ut.sizes_description)

    return gdata



# TODO: Replicated in metacells.pipeline.collect
def _obs_names(prefix: str, name_of_members: ut.NumpyVector, group_of_members: ut.NumpyVector) -> List[str]:
    """
    Compute the name of each group of observations.

    The name of each group is the ``prefix``, followed by the group's index, followed by ``.`` and
    a 2-digit checksum of the names of the members of the group (taken in their order in
    ``name_of_members``).

    Members with a negative group index belong to no group and do not affect any name. Every group
    index between zero and the maximal group index is expected to have at least one member.
    """
    groups_count = np.max(group_of_members) + 1
    name_of_groups: List[str] = []
    prefix = prefix or ""
    for group_index in range(groups_count):
        groups_mask = group_of_members == group_index
        assert np.any(groups_mask)
        hasher = shake_128()
        for member_name in name_of_members[groups_mask]:
            hasher.update(member_name.encode("utf8"))
        # Reduce the 16-byte digest to two decimal digits (00-99), matching the ``02d`` format below.
        checksum = int(hasher.hexdigest(16), 16) % 100
        name_of_groups.append(f"{prefix}{group_index}.{checksum:02d}")
    return name_of_groups



# [docs]
@ut.logged()
@ut.timed_call()
@ut.expand_doc()
def group_obs_annotation(
    adata: AnnData,
    gdata: AnnData,
    *,
    groups: Union[str, ut.Vector],
    name: str,
    formatter: Optional[Callable[[Any], Any]] = None,
    method: str = "majority",
    min_value_fraction: float = 0.5,
    conflict: Optional[Any] = None,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Summarize a per-observation (cell) annotation of ``adata`` into a per-group-of-observations
    (metacells) annotation of ``gdata``.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, and the
    ``gdata`` containing the per-metacells summed data.

    **Returns**

    Observations (Cell) Annotations
        ``<name>``
            The value chosen for each group, based on the ``name`` annotation of its cells.

    If ``inplace`` (default: {inplace}), the result is stored in ``gdata`` and ``None`` is returned.
    Otherwise, the result is returned as a pandas series, indexed by the observation names of
    ``gdata``.

    **Computation Parameters**

    1. Visit each observation (group, metacell) of ``gdata``.

    2. Collect the cells which the ``groups`` annotation assigns to this group.

    3. Look at the ``name`` annotation values of these cells.

    4. Reduce these values into a single value for the group according to the ``method``, which
       is one of:

       ``unique``
            The cells of each group are expected to all share the same value, which becomes the
            value of the group.

       ``majority``
            The most common value among the cells of the group becomes the value of the group,
            unless it covers less than ``min_value_fraction`` (default: {min_value_fraction}) of
            these cells, in which case the ``conflict`` (default: {conflict}) value is used
            instead.
    """
    group_of_cells = ut.get_o_numpy(adata, groups, formatter=ut.groups_description)
    values_of_cells = ut.get_o_numpy(adata, name, formatter=formatter)

    # One result slot per group, using the same data type as the per-cell values.
    result = np.empty(gdata.n_obs, dtype=values_of_cells.dtype)

    assert method in ("unique", "majority")

    if method == "unique":
        with ut.timed_step(".unique"):
            # Cells with a negative group index are skipped; the rest write their value to their group.
            is_grouped = group_of_cells >= 0
            target_groups = group_of_cells[is_grouped]
            result[target_groups] = values_of_cells[is_grouped]

    else:
        assert method == "majority"
        with ut.timed_step(".majority"):
            for group_index in range(gdata.n_obs):
                is_member = group_of_cells == group_index
                members_count = np.sum(is_member)
                assert members_count > 0
                # Count how many cells of the group hold each distinct value.
                candidates, votes = np.unique(values_of_cells[is_member], return_counts=True)
                winner = np.argmax(votes)
                winner_fraction = votes[winner] / members_count
                # A winner which is too rare is replaced by the ``conflict`` value.
                result[group_index] = conflict if winner_fraction < min_value_fraction else candidates[winner]

    if not inplace:
        return ut.to_pandas_series(result, index=gdata.obs_names)

    ut.set_o_data(gdata, name, result)
    return None