from .._bits.dask import array_shape_reveal
[docs]def allele_frequency(X):
"""
Compute allele frequency from its expectation.
``X`` is a matrix whose last dimension correspond to the different set of
chromosomes. The first dimension represent different variants and the second
dimension represent the different samples.
Parameters
----------
X : array_like
Allele expectations encoded as a variants-by-samples-by-alleles matrix.
Returns
-------
frequenct : ndarray
Allele frequencies encoded as a variants-by-alleles matrix.
"""
ploidy = X.shape[-1]
if X.ndim < 3:
n = 1
else:
n = X.shape[1]
return X.sum(-2) / (ploidy * n)
[docs]def compute_dosage(X, alt=None):
"""
Compute dosage from allele expectation.
Parameters
----------
X : array_like
Allele expectations encoded as a variants-by-samples-by-alleles matrix.
ref : array_like
Allele reference of each locus. The allele having the minor allele frequency for
the provided ``X`` is used as the reference if `None`. Defaults to ``None``.
Returns
-------
dosage : ndarray
Dosage encoded as a variants-by-samples matrix.
Example
-------
.. doctest::
>>> from bgen_reader import read_bgen, allele_expectation, example_files
>>> from bgen_reader import compute_dosage
>>>
>>> with example_files("example.32bits.bgen") as filepath:
... bgen = read_bgen(filepath, verbose=False)
... variant_idx = 2
... e = allele_expectation(bgen, variant_idx)
... dosage = compute_dosage(e)
... print(dosage[:5])
[0.01550294 0.99383543 1.97933958 0.99560547 1.97879027]
"""
from numpy import asarray
if alt is None:
return X[..., -1]
try:
return X[alt, :, alt]
except NotImplementedError:
alt = asarray(alt, int)
return asarray(X, float)[alt, :, alt]
[docs]def allele_expectation(p, nalleles, ploidy):
"""
Allele expectation.
Compute the expectation of each allele from the given probabilities.
It accepts three shapes of matrices:
- unidimensional array of probabilities;
- bidimensional samples-by-alleles probabilities array;
- and three dimensional variants-by-samples-by-alleles array.
Parameters
----------
p : array_like
Allele probabilities.
nalleles : int
Number of alleles.
ploidy : int
Number of complete sets of chromosomes.
Returns
-------
expectation : ndarray
Last dimension will contain the expectation of each allele.
Examples
--------
.. doctest::
>>> from texttable import Texttable
>>> from bgen_reader import read_bgen, allele_expectation, example_files
>>>
>>> sampleid = "sample_005"
>>> rsid = "RSID_6"
>>>
>>> with example_files("example.32bits.bgen") as filepath:
... bgen = read_bgen(filepath, verbose=False)
...
... locus = bgen["variants"].query("rsid == '{}'".format(rsid)).index
... locus = locus.compute().values[0]
... sample = bgen["samples"].to_frame().query("id == '{}'".format(sampleid))
... sample = sample.index
... sample = sample[0]
...
... nalleles = bgen["variants"].loc[locus]["nalleles"]
... ploidy = 2
...
... p = bgen["genotype"][locus].compute()["probs"][sample]
... # For unphased genotypes only.
... e = allele_expectation(bgen, locus)[sample]
...
... alleles = bgen["variants"].loc[locus]["allele_ids"].compute()
... alleles = alleles.values[0].split(",")
...
... tab = Texttable()
...
... print(tab.add_rows(
... [
... ["", "AA", "AG", "GG", "E[.]"],
... ["p"] + list(p) + [1.0],
... ["#" + alleles[0], 2, 1, 0, e[0]],
... ["#" + alleles[1], 0, 1, 2, e[1]],
... ]
... ).draw())
+----+-------+-------+-------+-------+
| | AA | AG | GG | E[.] |
+====+=======+=======+=======+=======+
| p | 0.012 | 0.987 | 0.001 | 1 |
+----+-------+-------+-------+-------+
| #A | 2 | 1 | 0 | 1.011 |
+----+-------+-------+-------+-------+
| #G | 0 | 1 | 2 | 0.989 |
+----+-------+-------+-------+-------+
>>> print("variant: {}".format(rsid))
variant: RSID_6
>>> print("sample : {}".format(sampleid))
sample : sample_005
Note
----
This function supports unphased genotypes only.
"""
from numpy import asarray, newaxis
g = _get_genotypes(ploidy, nalleles)
c = asarray(_genotypes_to_allele_counts(g), float)
c = c.T.reshape((1,) * (p.ndim - 1) + (c.shape[1], c.shape[0]))
p = array_shape_reveal(p)
return (c * p[..., newaxis, :]).sum(-1)
def _get_genotypes(ploidy, nalleles):
g = _make_genotypes(ploidy, 1, nalleles)
g = sorted([list(reversed(i)) for i in g])
g = [list(reversed(i)) for i in g]
return g
def _make_genotypes(ploidy, start, end):
tups = []
if ploidy == 0:
return tups
if ploidy == 1:
return [[i] for i in range(start, end + 1)]
for i in range(start, end + 1):
t = _make_genotypes(ploidy - 1, i, end)
for ti in t:
tups += [[i] + ti]
return tups
def _genotypes_to_allele_counts(genotypes):
nalleles = genotypes[-1][0]
counts = []
for g in genotypes:
count = [0] * nalleles
for gi in g:
count[gi - 1] += 1
counts.append(count)
return counts