Source code for limix.io.gen

def read(prefix, verbose=True):
    r"""
    Read a pair of GEN files into pandas data frames.

    Two space-separated text files are read: ``prefix + ".sample"`` and
    ``prefix + ".gen"``.

    Parameters
    ----------
    prefix : str
        Path prefix shared by the ``.sample`` and ``.gen`` files.
    verbose : bool, optional
        Currently unused; accepted for backward compatibility.

    Returns
    -------
    dict
        Dictionary with two entries:

        ``"sample"``
            pandas DataFrame with one row per sample, indexed by sample
            identifier (index name ``"sample"``). The identifier is taken
            from the ``sample_id`` column when present, otherwise from the
            first column of the ``.sample`` file.
        ``"genotype"``
            pandas DataFrame with one row per line of the ``.gen`` file,
            indexed by ``snp_id`` (index name ``"candidate"``). Columns are
            two-level: ``snp_id``, ``rs_id``, ``pos``, ``alleleA`` and
            ``alleleB`` (empty second level), followed by ``AA``, ``AB`` and
            ``BB`` columns for every sample.

    Examples
    --------
    .. doctest::

        >>> from limix.io.gen import read
        >>> from limix import file_example
        >>>
        >>> with file_example(["example.gen", "example.sample"]) as filepaths:
        ...     data = read(filepaths[0][:-4])
        >>>
        >>> sorted(data)
        ['genotype', 'sample']
    """
    from pandas import MultiIndex, read_csv

    # The first line holds the column names; the second line is skipped
    # (presumably the column-type row of Oxford-style sample files).
    df_sample = read_csv(prefix + ".sample", header=0, sep=" ", skiprows=[1])

    # Genotype columns are labelled after the first column of the sample file.
    first_column = df_sample.iloc[:, 0]

    meta_fields = ["snp_id", "rs_id", "pos", "alleleA", "alleleB"]
    tuples = [(field, "") for field in meta_fields]
    tuples += [
        (sample, genotype)
        for sample in first_column
        for genotype in ("AA", "AB", "BB")
    ]
    index = MultiIndex.from_tuples(tuples, names=["first", "second"])

    # Prefer an explicit ``sample_id`` column for the row index, but fall back
    # to the first column so that files using another header name for the
    # identifier (e.g. ``ID_1``) do not fail with a KeyError.
    if "sample_id" in df_sample.columns:
        sample_ids = df_sample["sample_id"]
    else:
        sample_ids = first_column

    # Copy the identifiers into a dedicated column so that the original
    # identifier column is still available after ``set_index`` drops the copy.
    df_sample["sample"] = sample_ids
    df_sample = df_sample.set_index("sample")
    df_sample.index.name = "sample"

    df_gen = read_csv(prefix + ".gen", names=index, sep=" ")
    # Same trick as above: keep ``snp_id`` as a column and index by a copy.
    df_gen["candidate"] = df_gen["snp_id"]
    df_gen = df_gen.set_index("candidate")
    df_gen.index.name = "candidate"

    return dict(sample=df_sample, genotype=df_gen)