Source code for limix.stats._allele

from .._bits.dask import array_shape_reveal


[docs]def allele_frequency(X): """ Compute allele frequency from its expectation. ``X`` is a matrix whose last dimension correspond to the different set of chromosomes. The first dimension represent different variants and the second dimension represent the different samples. Parameters ---------- X : array_like Allele expectations encoded as a variants-by-samples-by-alleles matrix. Returns ------- frequenct : ndarray Allele frequencies encoded as a variants-by-alleles matrix. """ ploidy = X.shape[-1] if X.ndim < 3: n = 1 else: n = X.shape[1] return X.sum(-2) / (ploidy * n)
[docs]def compute_dosage(X, alt=None): """ Compute dosage from allele expectation. Parameters ---------- X : array_like Allele expectations encoded as a variants-by-samples-by-alleles matrix. ref : array_like Allele reference of each locus. The allele having the minor allele frequency for the provided ``X`` is used as the reference if `None`. Defaults to ``None``. Returns ------- dosage : ndarray Dosage encoded as a variants-by-samples matrix. Example ------- .. doctest:: >>> from bgen_reader import read_bgen, allele_expectation, example_files >>> from bgen_reader import compute_dosage >>> >>> with example_files("example.32bits.bgen") as filepath: ... bgen = read_bgen(filepath, verbose=False) ... variant_idx = 2 ... e = allele_expectation(bgen, variant_idx) ... dosage = compute_dosage(e) ... print(dosage[:5]) [0.01550294 0.99383543 1.97933958 0.99560547 1.97879027] """ from numpy import asarray if alt is None: return X[..., -1] try: return X[alt, :, alt] except NotImplementedError: alt = asarray(alt, int) return asarray(X, float)[alt, :, alt]
[docs]def allele_expectation(p, nalleles, ploidy): """ Allele expectation. Compute the expectation of each allele from the given probabilities. It accepts three shapes of matrices: - unidimensional array of probabilities; - bidimensional samples-by-alleles probabilities array; - and three dimensional variants-by-samples-by-alleles array. Parameters ---------- p : array_like Allele probabilities. nalleles : int Number of alleles. ploidy : int Number of complete sets of chromosomes. Returns ------- expectation : ndarray Last dimension will contain the expectation of each allele. Examples -------- .. doctest:: >>> from texttable import Texttable >>> from bgen_reader import read_bgen, allele_expectation, example_files >>> >>> sampleid = "sample_005" >>> rsid = "RSID_6" >>> >>> with example_files("example.32bits.bgen") as filepath: ... bgen = read_bgen(filepath, verbose=False) ... ... locus = bgen["variants"].query("rsid == '{}'".format(rsid)).index ... locus = locus.compute().values[0] ... sample = bgen["samples"].to_frame().query("id == '{}'".format(sampleid)) ... sample = sample.index ... sample = sample[0] ... ... nalleles = bgen["variants"].loc[locus]["nalleles"] ... ploidy = 2 ... ... p = bgen["genotype"][locus].compute()["probs"][sample] ... # For unphased genotypes only. ... e = allele_expectation(bgen, locus)[sample] ... ... alleles = bgen["variants"].loc[locus]["allele_ids"].compute() ... alleles = alleles.values[0].split(",") ... ... tab = Texttable() ... ... print(tab.add_rows( ... [ ... ["", "AA", "AG", "GG", "E[.]"], ... ["p"] + list(p) + [1.0], ... ["#" + alleles[0], 2, 1, 0, e[0]], ... ["#" + alleles[1], 0, 1, 2, e[1]], ... ] ... ).draw()) +----+-------+-------+-------+-------+ | | AA | AG | GG | E[.] | +====+=======+=======+=======+=======+ | p | 0.012 | 0.987 | 0.001 | 1 | +----+-------+-------+-------+-------+ | #A | 2 | 1 | 0 | 1.011 | +----+-------+-------+-------+-------+ | #G | 0 | 1 | 2 | 0.989 | +----+-------+-------+-------+-------+ >>> print("variant: {}".format(rsid)) variant: RSID_6 >>> print("sample : {}".format(sampleid)) sample : sample_005 Note ---- This function supports unphased genotypes only. """ from numpy import asarray, newaxis g = _get_genotypes(ploidy, nalleles) c = asarray(_genotypes_to_allele_counts(g), float) c = c.T.reshape((1,) * (p.ndim - 1) + (c.shape[1], c.shape[0])) p = array_shape_reveal(p) return (c * p[..., newaxis, :]).sum(-1)
def _get_genotypes(ploidy, nalleles): g = _make_genotypes(ploidy, 1, nalleles) g = sorted([list(reversed(i)) for i in g]) g = [list(reversed(i)) for i in g] return g def _make_genotypes(ploidy, start, end): tups = [] if ploidy == 0: return tups if ploidy == 1: return [[i] for i in range(start, end + 1)] for i in range(start, end + 1): t = _make_genotypes(ploidy - 1, i, end) for ti in t: tups += [[i] + ti] return tups def _genotypes_to_allele_counts(genotypes): nalleles = genotypes[-1][0] counts = [] for g in genotypes: count = [0] * nalleles for gi in g: count[gi - 1] += 1 counts.append(count) return counts