Source code for hail.utils

import io

from hail.java import Env, handle_py4j, jiterable_to_list
from hail.typecheck import *


class Summary(object):
    """Class holding summary statistics about a dataset.

    :ivar int samples: Number of samples.

    :ivar int variants: Number of variants.

    :ivar float call_rate: Fraction of all genotypes called.

    :ivar contigs: Unique contigs found in dataset.
    :vartype contigs: list of str

    :ivar int multiallelics: Number of multiallelic variants.

    :ivar int snps: Number of SNP alternate alleles.

    :ivar int mnps: Number of MNP alternate alleles.

    :ivar int insertions: Number of insertion alternate alleles.

    :ivar int deletions: Number of deletion alternate alleles.

    :ivar int complex: Number of complex alternate alleles.

    :ivar int star: Number of star (upstream deletion) alternate alleles.

    :ivar int max_alleles: Highest number of alleles at any variant.
    """

    @classmethod
    def _from_java(cls, jrep):
        summary = Summary.__new__(cls)
        summary.samples = jrep.samples()
        summary.variants = jrep.variants()
        summary.call_rate = jrep.callRate().get() if jrep.callRate().isDefined() else float('nan')
        summary.contigs = [str(x) for x in jiterable_to_list(jrep.contigs())]
        summary.multiallelics = jrep.multiallelics()
        summary.snps = jrep.snps()
        summary.mnps = jrep.mnps()
        summary.insertions = jrep.insertions()
        summary.deletions = jrep.deletions()
        summary.complex = jrep.complex()
        summary.star = jrep.star()
        summary.max_alleles = jrep.maxAlleles()
        return summary

    def __repr__(self):
        return 'Summary(samples=%d, variants=%d, call_rate=%f, contigs=%s, multiallelics=%d, snps=%d, ' \
               'mnps=%d, insertions=%d, deletions=%d, complex=%d, star=%d, max_alleles=%d)' % (
                   self.samples, self.variants, self.call_rate, self.contigs, self.multiallelics,
                   self.snps, self.mnps, self.insertions, self.deletions, self.complex,
                   self.star, self.max_alleles)

    def __str__(self):
        return repr(self)
    def report(self):
        """Print the summary information."""
        print('')  # clear out pesky progress bar if necessary
        print('%16s: %d' % ('Samples', self.samples))
        print('%16s: %d' % ('Variants', self.variants))
        print('%16s: %f' % ('Call Rate', self.call_rate))
        print('%16s: %s' % ('Contigs', self.contigs))
        print('%16s: %d' % ('Multiallelics', self.multiallelics))
        print('%16s: %d' % ('SNPs', self.snps))
        print('%16s: %d' % ('MNPs', self.mnps))
        print('%16s: %d' % ('Insertions', self.insertions))
        print('%16s: %d' % ('Deletions', self.deletions))
        print('%16s: %d' % ('Complex Alleles', self.complex))
        print('%16s: %d' % ('Star Alleles', self.star))
        print('%16s: %d' % ('Max Alleles', self.max_alleles))
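
# A minimal usage sketch (an assumption, not part of this module): a Summary is
# normally obtained from a dataset's summarize() method rather than constructed
# directly; the HailContext variable and dataset path below are placeholders.
#
#     hc = HailContext()
#     s = hc.read('data/example.vds').summarize()
#     s.report()          # pretty-print all fields
#     print(s.call_rate)  # or access individual statistics
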
class FunctionDocumentation(object):

    @handle_py4j
    def types_rst(self, file_name):
        Env.hail().utils.FunctionDocumentation.makeTypesDocs(file_name)

    @handle_py4j
    def functions_rst(self, file_name):
        Env.hail().utils.FunctionDocumentation.makeFunctionsDocs(file_name)
@handle_py4j
@typecheck(path=strlike,
           buffer_size=integral)
def hadoop_read(path, buffer_size=8192):
    """Open a readable file through the Hadoop filesystem API.
    Supports distributed file systems like hdfs, gs, and s3.

    **Examples**

    .. doctest::
        :options: +SKIP

        >>> with hadoop_read('gs://my-bucket/notes.txt') as f:
        ...     for line in f:
        ...         print(line.strip())

    **Notes**

    The provided source file path must be a URI (uniform resource identifier).

    .. caution::

        These file handles are slower than standard Python file handles. If you
        are reading a file larger than ~50M, it will be faster to use
        :py:meth:`~hail.hadoop_copy` to copy the file locally, then read it with
        standard Python I/O tools.

    :param str path: Source file URI.
    :param int buffer_size: Size of internal buffer.

    :return: Iterable file reader.
    :rtype: `io.BufferedReader <https://docs.python.org/2/library/io.html#io.BufferedReader>`_
    """
    if not isinstance(path, str) and not isinstance(path, unicode):
        raise TypeError("expected parameter 'path' to be type str, but found %s" % type(path))
    if not isinstance(buffer_size, int):
        raise TypeError("expected parameter 'buffer_size' to be type int, but found %s" % type(buffer_size))
    return io.BufferedReader(HadoopReader(path), buffer_size=buffer_size)
@handle_py4j
@typecheck(path=strlike,
           buffer_size=integral)
def hadoop_write(path, buffer_size=8192):
    """Open a writable file through the Hadoop filesystem API.
    Supports distributed file systems like hdfs, gs, and s3.

    **Examples**

    .. doctest::
        :options: +SKIP

        >>> with hadoop_write('gs://my-bucket/notes.txt') as f:
        ...     f.write('result1: %s\\n' % result1)
        ...     f.write('result2: %s\\n' % result2)

    **Notes**

    The provided destination file path must be a URI (uniform resource identifier).

    .. caution::

        These file handles are slower than standard Python file handles. If you
        are writing a large file (larger than ~50M), it will be faster to write
        to a local file using standard Python I/O and use
        :py:meth:`~hail.hadoop_copy` to move your file to a distributed file system.

    :param str path: Destination file URI.
    :param int buffer_size: Size of internal buffer.

    :return: File writer object.
    :rtype: `io.BufferedWriter <https://docs.python.org/2/library/io.html#io.BufferedWriter>`_
    """
    if not isinstance(path, str) and not isinstance(path, unicode):
        raise TypeError("expected parameter 'path' to be type str, but found %s" % type(path))
    if not isinstance(buffer_size, int):
        raise TypeError("expected parameter 'buffer_size' to be type int, but found %s" % type(buffer_size))
    return io.BufferedWriter(HadoopWriter(path), buffer_size=buffer_size)
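
# An illustrative round-trip sketch combining hadoop_write and hadoop_read
# (the bucket path is a placeholder; the calls mirror the doctest examples above):
#
#     with hadoop_write('gs://my-bucket/notes.txt') as f:
#         f.write('hello\n')
#     with hadoop_read('gs://my-bucket/notes.txt') as f:
#         print(f.read())
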
@handle_py4j
@typecheck(src=strlike,
           dest=strlike)
def hadoop_copy(src, dest):
    """Copy a file through the Hadoop filesystem API.
    Supports distributed file systems like hdfs, gs, and s3.

    **Examples**

    >>> hadoop_copy('gs://hail-common/LCR.interval_list',
    ...             'file:///mnt/data/LCR.interval_list') # doctest: +SKIP

    **Notes**

    The provided source and destination file paths must be URIs
    (uniform resource identifiers).

    :param str src: Source file URI.
    :param str dest: Destination file URI.
    """
    Env.jutils().copyFile(src, dest, Env.hc()._jhc)
class HadoopReader(io.RawIOBase):
    """Raw stream that reads bytes from a Hadoop-filesystem file via the JVM."""

    def __init__(self, path):
        self._jfile = Env.jutils().readFile(path, Env.hc()._jhc)
        super(HadoopReader, self).__init__()

    def close(self):
        self._jfile.close()

    def readable(self):
        return True

    def readinto(self, b):
        # The JVM returns the data as a string with one character per byte;
        # encoding as iso-8859-1 maps each character back to its byte value.
        b_from_java = self._jfile.read(len(b)).encode('iso-8859-1')
        n_read = len(b_from_java)
        b[:n_read] = b_from_java
        return n_read


class HadoopWriter(io.RawIOBase):
    """Raw stream that writes bytes to a Hadoop-filesystem file via the JVM."""

    def __init__(self, path):
        self._jfile = Env.jutils().writeFile(path, Env.hc()._jhc)
        super(HadoopWriter, self).__init__()

    def writable(self):
        return True

    def close(self):
        self._jfile.close()

    def flush(self):
        self._jfile.flush()

    def write(self, b):
        self._jfile.write(bytearray(b))
        return len(b)


def wrap_to_list(s):
    if isinstance(s, list):
        return s
    else:
        return [s]
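
# For illustration, wrap_to_list is a small internal helper that accepts either
# a single value or a list of values and always returns a list:
#
#     wrap_to_list('chr1')            # -> ['chr1']
#     wrap_to_list(['chr1', 'chr2'])  # -> ['chr1', 'chr2'] (unchanged)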