Source code for limix.qc._mean_std

import warnings

from limix._bits import dask, numpy, pandas, xarray


[docs]def mean_standardize(X, axis=-1, inplace=False): """ Zero-mean and one-deviation normalisation. Normalise in such a way that the mean and variance are equal to zero and one. This transformation is taken over the flattened array by default, otherwise over the specified axis. Missing values represented by ``NaN`` are ignored. Parameters ---------- X : array_like Array of values. axis : int, optional Axis value. Defaults to ``1``. inplace : bool, optional Defaults to ``False``. Returns ------- X : ndarray Normalized array. Example ------- .. doctest:: >>> import limix >>> from numpy import arange >>> >>> X = arange(15).reshape((5, 3)).astype(float) >>> print(X) # doctest: +FLOAT_CMP [[ 0. 1. 2.] [ 3. 4. 5.] [ 6. 7. 8.] [ 9. 10. 11.] [12. 13. 14.]] >>> X = arange(6).reshape((2, 3)).astype(float) >>> X = limix.qc.mean_standardize(X, axis=0) >>> print(X) # doctest: +FLOAT_CMP [[-1.22474487 0. 1.22474487] [-1.22474487 0. 1.22474487]] """ from numpy import issubdtype, integer, asarray if hasattr(X, "dtype") and issubdtype(X.dtype, integer): raise ValueError("Integer type is not supported.") if isinstance(X, (tuple, list)): if inplace: raise ValueError("Can't use `inplace=True` for {}.".format(type(X))) X = asarray(X, float) if numpy.is_array(X): X = _mean_standardize_numpy(X, axis, inplace) elif pandas.is_series(X): X = _mean_standardize_pandas_series(X, axis, inplace) elif pandas.is_dataframe(X): X = _mean_standardize_pandas_dataframe(X, axis, inplace) elif dask.is_array(X): X = _mean_standardize_dask_array(X, axis, inplace) elif dask.is_series(X): raise NotImplementedError() elif dask.is_dataframe(X): X = _mean_standardize_dask_dataframe(X, axis, inplace) elif xarray.is_dataarray(X): X = _mean_standardize_xarray_dataarray(X, axis, inplace) else: raise NotImplementedError() return X
def _mean_standardize_numpy(X, axis, inplace): from numpy import nanmean, nanstd from numpy_sugar import epsilon from numpy import inf, clip orig_shape = X.shape if X.ndim == 1: X = X.reshape(orig_shape + (1,)) if not inplace: X = X.copy() X = X.swapaxes(1, axis) with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) X -= nanmean(X, axis=0) X /= clip(nanstd(X, axis=0), epsilon.tiny, inf) X = X.swapaxes(1, axis) return X.reshape(orig_shape) def _mean_standardize_pandas_series(X, axis, inplace): if not inplace: X = X.copy() a = X.to_numpy() _mean_standardize_numpy(a, axis, True) X[:] = a return X def _mean_standardize_pandas_dataframe(x, axis, inplace): if not inplace: x = x.copy() a = x.to_numpy() _mean_standardize_numpy(a, axis, True) x[:] = a return x def _mean_standardize_dask_array(x, axis, inplace): import dask.array as da from numpy_sugar import epsilon from numpy import nanmean, clip, nanstd, inf if inplace: raise NotImplementedError() x = x.swapaxes(1, axis) x = dask.array_shape_reveal(x) shape = da.compute(*x.shape) def func(a): a -= nanmean(a, axis=0) a /= clip(nanstd(a, axis=0), epsilon.tiny, inf) return a with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) x = da.apply_along_axis(_dask_apply, 0, x, func, shape[0]) return x.swapaxes(1, axis) def _mean_standardize_dask_dataframe(x, axis, inplace): if inplace: raise NotImplementedError() d = x.to_dask_array(lengths=True) orig_chunks = d.chunks d = _mean_standardize_dask_array(d, axis, False).rechunk(orig_chunks) return d.to_dask_dataframe(columns=x.columns, index=x.index) def _mean_standardize_xarray_dataarray(X, axis, inplace): if not inplace: X = X.copy(deep=True) data = X.data if dask.is_array(data): data = _mean_standardize_dask_array(data, axis, inplace) else: data = _mean_standardize_numpy(data, axis, inplace) X.data = data return X def _dask_apply(x, func1d, length): from numpy import resize x = func1d(x) return resize(x, length)