Source code for limix.io.csv

def read(filename, sep=None, header=True, verbose=True):
    """
    Load a CSV file into a dataframe.

    Parameters
    ----------
    filename : str
        Path to a CSV file.
    sep : str
        Column separator. ``None`` triggers auto-detection via
        ``_infer_separator``. Defaults to ``None``.
    header : bool
        ``True`` when the first row of the file is a header; ``False``
        otherwise. Defaults to ``True``.
    verbose : bool
        ``True`` for progress information; ``False`` otherwise.

    Returns
    -------
    data : dataframe
        Result of ``dask.dataframe.read_csv`` when ``_is_large_file`` reports
        the file as large, and of ``pandas.read_csv`` otherwise. If the first
        column is named ``"Unnamed: 0"`` it becomes the (unnamed) index.

    Examples
    --------
    .. doctest::

        >>> from limix.io.csv import read
        >>> from limix import file_example
        >>>
        >>> with file_example("data.csv") as filepath:
        ...     df = read(filepath, verbose=False)
        ...     print(df)  # doctest: +FLOAT_CMP
           pheno   attr1 attr2 attr3
        0    sex  string    10     a
        1   size   float    -3     b
        2  force     int     f     c
    """
    from dask.dataframe import read_csv as dask_read_csv
    from pandas import read_csv as pandas_read_csv

    from .._display import session_line

    if sep is None:
        sep = _infer_separator(filename)

    # Both readers take the header as a row number (or None for "no header").
    header_row = 0 if header else None

    message = "Reading {}... ".format(filename)
    with session_line(message, disable=not verbose):
        loader = dask_read_csv if _is_large_file(filename) else pandas_read_csv
        frame = loader(filename, sep=sep, header=header_row)

    # An "Unnamed: 0" leading column is what an index written without a label
    # looks like once read back; promote it to the index again.
    if len(frame.columns) > 0 and frame.columns[0] == "Unnamed: 0":
        frame = frame.set_index("Unnamed: 0")
        frame.index.name = None

    return frame
def _see(filepath, header, verbose=True):
    """
    Show a human-friendly representation of a CSV file.

    The separator is auto-detected, the file is read with ``pandas.read_csv``
    and the first rows (``DataFrame.head()``) are printed to standard output.

    Parameters
    ----------
    filepath : str
        CSV file path.
    header : bool
        ``True`` for parsing the header; ``False`` otherwise.
    verbose : bool
        ``True`` for verbose; ``False`` otherwise.

    Returns
    -------
    None
        The representation is printed, not returned.
    """
    from pandas import read_csv

    from .._display import session_line

    # pandas expects the header as a row number, or None for "no header".
    if header:
        header = 0
    else:
        header = None

    with session_line(desc="Reading %s... " % filepath, disable=not verbose):
        sep = _infer_separator(filepath)
        msg = read_csv(filepath, sep=sep, header=header).head()

    print(msg)


def _count(candidates, line):
    """
    Count how many times each candidate character occurs in ``line``.

    Parameters
    ----------
    candidates : collection of str
        Characters to count. Must support iteration and membership tests.
    line : str
        Text to scan.

    Returns
    -------
    dict
        Maps every candidate to its number of occurrences (possibly zero),
        in the iteration order of ``candidates``.
    """
    counter = dict.fromkeys(candidates, 0)
    for char in line:
        if char in counter:
            counter[char] += 1
    return counter


def _update(counter, c):
    """
    Drop from ``counter`` every key whose count in ``c`` differs.

    ``counter`` is modified in place. Every key of ``c`` is expected to be
    present in ``counter``.
    """
    for key, count in c.items():
        if counter[key] != count:
            del counter[key]


def _infer_separator(fn):
    """
    Guess the column separator of a delimited text file.

    Up to nine leading lines are sampled. A character is kept as a candidate
    only if it occurs the same number of times in every sampled non-blank
    line (after collapsing runs of a repeated character).

    Parameters
    ----------
    fn : str
        Path to the file.

    Returns
    -------
    str or None
        The inferred separator, or ``None`` if no candidate is found (e.g. an
        empty file).
    """
    nmax = 9
    counter = None

    with open(fn, "r") as f:
        for _ in range(nmax):
            raw = f.readline()
            if not raw:
                # End of file.
                break

            # The line terminator is never a column separator; leaving it in
            # would make it a candidate that ties with (or beats) the real
            # one.
            line = _remove_repeat(raw.rstrip("\r\n"))
            if not line:
                # A blank line carries no information; do not let it wipe out
                # every candidate.
                continue

            if counter is None:
                # dict.fromkeys keeps first-appearance order, so tie-breaking
                # below does not depend on string hash randomization.
                counter = _count(dict.fromkeys(line), line)
            else:
                _update(counter, _count(set(counter), line))

    if not counter:
        return None

    if len(counter) == 1:
        return next(iter(counter))

    # Prefer the common separators, in a fixed order (a tuple, not a set, so
    # the result is the same on every run).
    for c in (",", "\t", " "):
        if c in counter:
            return c

    # Otherwise pick the most frequent candidate; among ties the stable sort
    # keeps first-appearance order and the last one is taken.
    ranked = sorted(counter.items(), key=lambda kv: kv[1])
    return ranked[-1][0]


def _remove_repeat(s):
    """Collapse every run of one repeated character in ``s`` to a single one."""
    from re import sub

    return sub(r"(.)\1+", r"\1", s)


def _is_large_file(filepath):
    """Return ``True`` if the file size is at least 100 MiB."""
    import os

    large = 1024 * 1024 * 100
    return os.path.getsize(filepath) >= large