# Source code for amici.tools._data_utils

import h5py
import numpy as np
import numpy.typing as npt
import pandas as pd
import scipy.sparse as sp_sparse

try:
    # anndata >= 0.10
    from anndata.experimental import CSCDataset, CSRDataset

    SparseDataset = (CSRDataset, CSCDataset)
except ImportError:
    from anndata._core.sparse_dataset import SparseDataset



# [docs]
def is_count_data(
    data: pd.DataFrame | npt.NDArray | sp_sparse.spmatrix | h5py.Dataset,
    n_to_check: int = 20,
) -> bool:
    """
    Source: SCVI data utils (https://github.com/scverse/scvi-tools/blob/main/src/scvi/data/_utils.py#L254-L279)

    Approximately check whether the data is count data, i.e. whether its
    values are non-negative integers.

    The check is a heuristic: ``n_to_check`` indices along the first axis are
    drawn at random (with replacement, from NumPy's global random state) and
    only those entries are inspected. For sparse inputs only the explicitly
    stored values are sampled. For backed inputs (``h5py.Dataset`` or an
    anndata sparse dataset) only the first 100 entries along the first axis
    are read before sampling.

    Args:
        data (pd.DataFrame | npt.NDArray | sp_sparse.spmatrix | h5py.Dataset):
            The data to check if it is count data. It can be a pandas DataFrame,
            numpy array, scipy sparse matrix or sparse array, h5py Dataset, or
            an anndata backed sparse dataset.
        n_to_check (int, optional):
            The number of samples to check from the data. Defaults to 20.

    Returns
    -------
        bool:
            True if none of the sampled values is negative or non-integer
            (also True when there is nothing to sample), False otherwise.

    Raises
    ------
        TypeError:
            If the data type is not understood.
    """
    # Backed (on-disk) containers: pull only a leading slice into memory
    # instead of materializing the whole dataset.
    if isinstance(data, h5py.Dataset) or isinstance(data, SparseDataset):
        data = data[:100]

    if isinstance(data, np.ndarray):
        values = data
    elif sp_sparse.issparse(data):
        # `issparse` covers both the sparse-matrix and the sparse-array
        # classes; only the explicitly stored values are inspected.
        values = data.data
    elif isinstance(data, pd.DataFrame):
        values = data.to_numpy()
    else:
        raise TypeError("data type not understood")

    # Nothing to sample from: treated as count data.
    if values.shape[0] == 0:
        return True

    inds = np.random.choice(values.shape[0], size=(n_to_check,))
    check = values[inds]
    negative = np.any(check < 0)
    # A non-zero remainder modulo 1 means the value has a fractional part.
    non_integer = np.any(check % 1 != 0)
    return not (negative or non_integer)