Source code for amici.tools._data_utils
import h5py
import numpy as np
import numpy.typing as npt
import pandas as pd
import scipy.sparse as sp_sparse
try:
# anndata >= 0.10
from anndata.experimental import CSCDataset, CSRDataset
SparseDataset = (CSRDataset, CSCDataset)
except ImportError:
from anndata._core.sparse_dataset import SparseDataset
[docs]
def is_count_data(
    data: pd.DataFrame | npt.NDArray | sp_sparse.spmatrix | h5py.Dataset,
    n_to_check: int = 20,
) -> bool:
    """
    Approximately check whether the data looks like count data.

    Source: SCVI data utils (https://github.com/scverse/scvi-tools/blob/main/src/scvi/data/_utils.py#L254-L279)

    This is a randomized spot check, not an exhaustive scan: ``n_to_check``
    positions along the first axis are drawn at random (with replacement) and
    the selected values are tested for negative or non-integer entries. A
    ``True`` result therefore means "no violation found in the sample".

    Parameters
    ----------
    data
        The data to check. It can be a pandas DataFrame, numpy array, scipy
        sparse matrix or sparse array, h5py Dataset, or a backed anndata
        sparse dataset. For h5py / backed sparse datasets only the first 100
        rows are read and considered. For sparse inputs only the explicitly
        stored values (``.data``) are sampled.
    n_to_check
        The number of samples to draw from the data. Defaults to 20.

    Returns
    -------
    bool
        True if none of the sampled values is negative or non-integer (or if
        there is nothing to sample), False otherwise.

    Raises
    ------
    TypeError
        If the data type is not understood.
    """
    # Backed (on-disk) data: pull only the leading rows into memory.
    if isinstance(data, h5py.Dataset) or isinstance(data, SparseDataset):
        data = data[:100]

    if isinstance(data, np.ndarray):
        values = data
    elif sp_sparse.issparse(data):
        # issparse() accepts both the legacy ``spmatrix`` classes and the
        # newer sparse-array classes, which do not derive from ``spmatrix``.
        values = data.data
    elif isinstance(data, pd.DataFrame):
        values = data.to_numpy()
    else:
        raise TypeError("data type not understood")

    # Nothing to sample (no rows / no stored values): treat as count data.
    if values.shape[0] == 0:
        return True

    # Sample indices along the first axis; for 2-D input this selects rows.
    inds = np.random.choice(values.shape[0], size=(n_to_check,))
    check = values[inds]
    negative = np.any(check < 0)
    # A non-zero remainder modulo 1 marks a fractional (or NaN) value.
    non_integer = np.any(check % 1 != 0)
    return not (negative or non_integer)