Source code for hail.dataset

from __future__ import print_function  # Python 2 and 3 print compatibility

import warnings

from decorator import decorator

from hail.expr import Type, TGenotype, TString, TVariant, TArray
from hail.typecheck import *
from hail.java import *
from hail.keytable import KeyTable
from hail.representation import Interval, Pedigree, Variant
from hail.utils import Summary, wrap_to_list, hadoop_read
from hail.kinshipMatrix import KinshipMatrix
from hail.ldMatrix import LDMatrix

warnings.filterwarnings(module=__name__, action='once')


@decorator
def requireTGenotype(func, vds, *args, **kwargs):
    if vds._is_generic_genotype:
        if isinstance(vds.genotype_schema, TGenotype):
            coerced_vds = VariantDataset(vds.hc, vds._jvdf.toVDS())
            return func(coerced_vds, *args, **kwargs)
        else:
            raise TypeError("genotype signature must be Genotype, but found '%s'" % type(vds.genotype_schema))

    return func(vds, *args, **kwargs)


@decorator
def convertVDS(func, vds, *args, **kwargs):
    if vds._is_generic_genotype:
        if isinstance(vds.genotype_schema, TGenotype):
            vds = VariantDataset(vds.hc, vds._jvdf.toVDS())

    return func(vds, *args, **kwargs)

vds_type = lazy()
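# Illustrative sketch (not part of the original module): both decorators wrap
# VariantDataset methods so that datasets imported with ``generic=True`` are
# coerced back to a TGenotype-backed VDS before the wrapped method runs.
# The method names below mirror how the decorators are applied later in this
# file; everything else here is hypothetical.
#
#   class VariantDataset(object):
#       @requireTGenotype   # raises unless the genotype schema is TGenotype
#       def concordance(self, right):
#           ...
#
#       @convertVDS         # converts silently when possible, otherwise passes through
#       def write(self, output, overwrite=False):
#           ...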

class VariantDataset(object):
    """Hail's primary representation of genomic data, a matrix keyed by sample and variant.

    Variant datasets may be generated from other formats using the
    :py:class:`.HailContext` import methods, constructed from a variant-keyed
    :py:class:`KeyTable` using :py:meth:`.VariantDataset.from_table`, and
    simulated using :py:meth:`~hail.HailContext.balding_nichols_model`.

    Once a variant dataset has been written to disk with
    :py:meth:`~hail.VariantDataset.write`, use :py:meth:`~hail.HailContext.read`
    to load the variant dataset into the environment.

    >>> vds = hc.read("data/example.vds")

    :ivar hc: Hail Context.
    :vartype hc: :class:`.HailContext`
    """

    def __init__(self, hc, jvds):
        self.hc = hc
        self._jvds = jvds

        self._globals = None
        self._sample_annotations = None
        self._colkey_schema = None
        self._sa_schema = None
        self._rowkey_schema = None
        self._va_schema = None
        self._global_schema = None
        self._genotype_schema = None
        self._sample_ids = None
        self._num_samples = None
        self._jvdf_cache = None
    @staticmethod
    @handle_py4j
    @typecheck(table=KeyTable)
    def from_table(table):
        """Construct a sites-only variant dataset from a key table.

        **Examples**

        Import a text table and construct a sites-only VDS:

        >>> table = hc.import_table('data/variant-lof.tsv', types={'v': TVariant()}).key_by('v')
        >>> sites_vds = VariantDataset.from_table(table)

        **Notes**

        The key table must be keyed by one column of type :py:class:`.TVariant`.

        All columns in the key table become variant annotations in the result.
        For example, a key table with key column ``v`` (*Variant*) and column
        ``gene`` (*String*) will produce a sites-only variant dataset with a
        ``va.gene`` variant annotation.

        :param table: Variant-keyed table.
        :type table: :py:class:`.KeyTable`

        :return: Sites-only variant dataset.
        :rtype: :py:class:`.VariantDataset`
        """

        jvds = scala_object(Env.hail().variant, 'VariantDataset').fromKeyTable(table._jkt)
        return VariantDataset(table.hc, jvds)
    @property
    def _jvdf(self):
        if self._jvdf_cache is None:
            if self._is_generic_genotype:
                self._jvdf_cache = Env.hail().variant.GenericDatasetFunctions(self._jvds)
            else:
                self._jvdf_cache = Env.hail().variant.VariantDatasetFunctions(self._jvds)
        return self._jvdf_cache

    @property
    def _is_generic_genotype(self):
        return self._jvds.isGenericGenotype()

    @property
    @handle_py4j
    def sample_ids(self):
        """Return sample IDs.

        :return: List of sample IDs.
        :rtype: list of str
        """

        if self._sample_ids is None:
            self._sample_ids = jiterable_to_list(self._jvds.sampleIds())
        return self._sample_ids

    @property
    @handle_py4j
    def sample_annotations(self):
        """Return a dict of sample annotations.

        The keys of this dictionary are the sample IDs (strings).
        The values are sample annotations.

        :return: dict
        """

        if self._sample_annotations is None:
            zipped_annotations = Env.jutils().iterableToArrayList(
                self._jvds.sampleIdsAndAnnotations()
            )
            r = {}
            for element in zipped_annotations:
                r[element._1()] = self.sample_schema._convert_to_py(element._2())
            self._sample_annotations = r
        return self._sample_annotations
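    # Hypothetical usage sketch: ``sample_annotations`` is an ordinary dict keyed
    # by sample ID, so one sample's annotations can be looked up directly (the
    # sample ID below is made up for illustration).
    #
    #   annotations = vds.sample_annotations
    #   print(annotations['NA12878'])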
    @handle_py4j
    def num_partitions(self):
        """Number of partitions.

        **Notes**

        The data in a variant dataset is divided into chunks called partitions,
        which may be stored together or across a network, so that each partition
        may be read and processed in parallel by available cores. Partitions are
        a core concept of distributed computation in Spark, see `here
        <http://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds>`__
        for details.

        :rtype: int
        """

        return self._jvds.nPartitions()
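    # Hypothetical usage sketch: inspecting how a dataset is chunked before
    # running an expensive pipeline; each partition is processed by one core.
    #
    #   print('partitions: %d' % vds.num_partitions())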
    @property
    @handle_py4j
    def num_samples(self):
        """Number of samples.

        :rtype: int
        """

        if self._num_samples is None:
            self._num_samples = self._jvds.nSamples()
        return self._num_samples
    @handle_py4j
    def count_variants(self):
        """Count number of variants in variant dataset.

        :rtype: long
        """

        return self._jvds.countVariants()
    @handle_py4j
    def was_split(self):
        """True if multiallelic variants have been split into multiple biallelic variants.

        Result is True if :py:meth:`~hail.VariantDataset.split_multi` or
        :py:meth:`~hail.VariantDataset.filter_multi` has been called on this
        variant dataset, or if the variant dataset was imported with
        :py:meth:`~hail.HailContext.import_plink`, :py:meth:`~hail.HailContext.import_gen`,
        or :py:meth:`~hail.HailContext.import_bgen`, or if the variant dataset
        was simulated with :py:meth:`~hail.HailContext.balding_nichols_model`.

        :rtype: bool
        """

        return self._jvds.wasSplit()
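    # Hypothetical usage sketch: many downstream methods expect biallelic data,
    # so ``was_split`` can guard a call to ``split_multi`` (both are methods of
    # this class; the guard itself is illustrative).
    #
    #   if not vds.was_split():
    #       vds = vds.split_multi()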
    @handle_py4j
    def file_version(self):
        """File version of variant dataset.

        :rtype: int
        """

        return self._jvds.fileVersion()
    @handle_py4j
    @typecheck_method(key_exprs=oneof(strlike, listof(strlike)),
                      agg_exprs=oneof(strlike, listof(strlike)))
    def aggregate_by_key(self, key_exprs, agg_exprs):
        """Aggregate by user-defined key and aggregation expressions to produce a KeyTable.
        Equivalent to a group-by operation in SQL.

        **Examples**

        Compute the number of LOF heterozygote calls per gene per sample:

        >>> kt_result = (vds
        ...     .aggregate_by_key(['Sample = s', 'Gene = va.gene'],
        ...                       'nHet = g.filter(g => g.isHet() && va.consequence == "LOF").count()')
        ...     .export("test.tsv"))

        This will produce a :class:`KeyTable` with 3 columns (`Sample`, `Gene`, `nHet`).

        :param key_exprs: Named expression(s) for which fields are keys.
        :type key_exprs: str or list of str

        :param agg_exprs: Named aggregation expression(s).
        :type agg_exprs: str or list of str

        :rtype: :class:`.KeyTable`
        """

        if isinstance(key_exprs, list):
            key_exprs = ",".join(key_exprs)
        if isinstance(agg_exprs, list):
            agg_exprs = ",".join(agg_exprs)

        return KeyTable(self.hc, self._jvds.aggregateByKey(key_exprs, agg_exprs))
    @handle_py4j
    @requireTGenotype
    @typecheck_method(expr=oneof(strlike, listof(strlike)),
                      propagate_gq=bool)
    def annotate_alleles_expr(self, expr, propagate_gq=False):
        """Annotate alleles with expression.

        .. include:: requireTGenotype.rst

        **Examples**

        To create a variant annotation ``va.nNonRefSamples: Array[Int]`` where the ith
        entry of the array is the number of samples carrying the ith alternate allele:

        >>> vds_result = vds.annotate_alleles_expr('va.nNonRefSamples = gs.filter(g => g.isCalledNonRef()).count()')

        **Notes**

        This method is similar to :py:meth:`.annotate_variants_expr`.
        :py:meth:`.annotate_alleles_expr` dynamically splits multi-allelic sites,
        evaluates each expression on each split allele separately, and for each
        expression annotates with an array with one element per alternate allele.
        In the splitting, genotypes are downcoded and each alternate allele is
        represented using its minimal representation (see :py:meth:`split_multi`
        for more details).

        :param expr: Annotation expression.
        :type expr: str or list of str

        :param bool propagate_gq: Propagate GQ instead of computing from (split) PL.

        :return: Annotated variant dataset.
        :rtype: :py:class:`.VariantDataset`
        """

        if isinstance(expr, list):
            expr = ",".join(expr)

        jvds = self._jvdf.annotateAllelesExpr(expr, propagate_gq)
        return VariantDataset(self.hc, jvds)
    @handle_py4j
    @typecheck_method(expr=oneof(strlike, listof(strlike)))
    def annotate_genotypes_expr(self, expr):
        """Annotate genotypes with expression.

        **Examples**

        Convert the genotype schema to a :py:class:`~hail.expr.TStruct` with two fields ``GT`` and ``CASE_HET``:

        >>> vds_result = vds.annotate_genotypes_expr('g = {GT: g.gt, CASE_HET: sa.pheno.isCase && g.isHet()}')

        Assume a VCF is imported with ``generic=True``, the resulting genotype schema
        is a ``Struct``, and the field ``GTA`` is a ``Call`` type. Use the ``.toGenotype()``
        method in the expression language to convert a ``Call`` to a ``Genotype``.
        ``vds_gta`` will have a genotype schema equal to :py:class:`~hail.expr.TGenotype`.

        >>> vds_gta = (hc.import_vcf('data/example3.vcf.bgz', generic=True, call_fields=['GTA'])
        ...     .annotate_genotypes_expr('g = g.GTA.toGenotype()'))

        **Notes**

        :py:meth:`~hail.VariantDataset.annotate_genotypes_expr` evaluates the expression
        given by ``expr`` and assigns the result of the right-hand side to the
        annotation path specified by the left-hand side (must begin with ``g``).
        This is analogous to :py:meth:`~hail.VariantDataset.annotate_variants_expr` and
        :py:meth:`~hail.VariantDataset.annotate_samples_expr` where the annotation paths
        are ``va`` and ``sa`` respectively.

        ``expr`` is in genotype context so the following symbols are in scope:

        - ``g``: genotype annotation
        - ``v`` (*Variant*): :ref:`variant`
        - ``va``: variant annotations
        - ``s`` (*Sample*): sample
        - ``sa``: sample annotations
        - ``global``: global annotations

        For more information, see the documentation on writing `expressions
        <overview.html#expressions>`__ and using the `Hail Expression Language
        <exprlang.html>`__.

        .. warning::

            - If the resulting genotype schema is not :py:class:`~hail.expr.TGenotype`,
              subsequent function calls on the annotated variant dataset, such as
              :py:meth:`~hail.VariantDataset.pca` and :py:meth:`~hail.VariantDataset.linreg`,
              may not work.

            - Hail performance may be significantly slower if the annotated variant dataset
              does not have a genotype schema equal to :py:class:`~hail.expr.TGenotype`.

            - Genotypes are immutable. For example, if ``g`` is initially of type ``Genotype``,
              the expression ``g.gt = g.gt + 1`` will return a ``Struct`` with one field ``gt``
              of type ``Int`` and **NOT** a ``Genotype`` with the ``gt`` incremented by 1.

        :param expr: Annotation expression.
        :type expr: str or list of str

        :return: Annotated variant dataset.
        :rtype: :py:class:`.VariantDataset`
        """

        if isinstance(expr, list):
            expr = ",".join(expr)

        jvds = self._jvdf.annotateGenotypesExpr(expr)
        vds = VariantDataset(self.hc, jvds)
        if isinstance(vds.genotype_schema, TGenotype):
            return VariantDataset(self.hc, vds._jvdf.toVDS())
        else:
            return vds
    @handle_py4j
    @typecheck_method(expr=oneof(strlike, listof(strlike)))
    def annotate_global_expr(self, expr):
        """Annotate global with expression.

        **Example**

        Annotate global with an array of populations:

        >>> vds = vds.annotate_global_expr('global.pops = ["FIN", "AFR", "EAS", "NFE"]')

        Create, then overwrite, then drop a global annotation:

        >>> vds = vds.annotate_global_expr('global.pops = ["FIN", "AFR", "EAS"]')
        >>> vds = vds.annotate_global_expr('global.pops = ["FIN", "AFR", "EAS", "NFE"]')
        >>> vds = vds.annotate_global_expr('global = drop(global, pops)')

        The expression namespace contains only one variable:

        - ``global``: global annotations

        :param expr: Annotation expression
        :type expr: str or list of str

        :return: Annotated variant dataset.
        :rtype: :py:class:`.VariantDataset`
        """

        if isinstance(expr, list):
            expr = ','.join(expr)

        jvds = self._jvds.annotateGlobalExpr(expr)
        return VariantDataset(self.hc, jvds)
    @handle_py4j
    @typecheck_method(path=strlike,
                      annotation=anytype,
                      annotation_type=Type)
    def annotate_global(self, path, annotation, annotation_type):
        """Add global annotations from Python objects.

        **Examples**

        Add populations as a global annotation:

        >>> vds_result = vds.annotate_global('global.populations',
        ...                                  ['EAS', 'AFR', 'EUR', 'SAS', 'AMR'],
        ...                                  TArray(TString()))

        **Notes**

        This method registers new global annotations in a VDS. These annotations
        can then be accessed through expressions in downstream operations. The
        Hail data type must be provided and must match the given ``annotation``
        parameter.

        :param str path: annotation path starting in 'global'

        :param annotation: annotation to add to global

        :param annotation_type: Hail type of annotation
        :type annotation_type: :py:class:`.Type`

        :return: Annotated variant dataset.
        :rtype: :py:class:`.VariantDataset`
        """

        annotation_type._typecheck(annotation)

        annotated = self._jvds.annotateGlobal(annotation_type._convert_to_j(annotation), annotation_type._jtype, path)
        assert annotated.globalSignature().typeCheck(annotated.globalAnnotation()), 'error in java type checking'
        return VariantDataset(self.hc, annotated)
    @handle_py4j
    @typecheck_method(expr=oneof(strlike, listof(strlike)))
    def annotate_samples_expr(self, expr):
        """Annotate samples with expression.

        **Examples**

        Compute per-sample GQ statistics for hets:

        >>> vds_result = (vds.annotate_samples_expr('sa.gqHetStats = gs.filter(g => g.isHet()).map(g => g.gq).stats()')
        ...     .export_samples('output/samples.txt', 'sample = s, het_gq_mean = sa.gqHetStats.mean'))

        Compute the list of genes with a singleton LOF per sample:

        >>> variant_annotations_table = hc.import_table('data/consequence.tsv', impute=True).key_by('Variant')
        >>> vds_result = (vds.annotate_variants_table(variant_annotations_table, root='va.consequence')
        ...     .annotate_variants_expr('va.isSingleton = gs.map(g => g.nNonRefAlleles()).sum() == 1')
        ...     .annotate_samples_expr('sa.LOF_genes = gs.filter(g => va.isSingleton && g.isHet() && va.consequence == "LOF").map(g => va.gene).collect()'))

        To create an annotation for only a subset of samples based on an existing annotation:

        >>> vds_result = vds.annotate_samples_expr('sa.newpheno = if (sa.pheno.cohortName == "cohort1") sa.pheno.bloodPressure else NA: Double')

        .. note::

            For optimal performance, be sure to explicitly give the alternative
            (``NA``) the same type as the consequent (``sa.pheno.bloodPressure``).

        **Notes**

        ``expr`` is in sample context so the following symbols are in scope:

        - ``s`` (*Sample*): sample
        - ``sa``: sample annotations
        - ``global``: global annotations
        - ``gs`` (*Aggregable[Genotype]*): aggregable of :ref:`genotype` for sample ``s``

        :param expr: Annotation expression.
        :type expr: str or list of str

        :return: Annotated variant dataset.
        :rtype: :class:`.VariantDataset`
        """

        if isinstance(expr, list):
            expr = ','.join(expr)

        jvds = self._jvds.annotateSamplesExpr(expr)
        return VariantDataset(self.hc, jvds)
[docs] @handle_py4j @typecheck_method(table=KeyTable, root=nullable(strlike), expr=nullable(strlike), vds_key=nullable(strlike), product=bool) def annotate_samples_table(self, table, root=None, expr=None, vds_key=None, product=False): """Annotate samples with a key table. **Examples** To annotates samples using `samples1.tsv` with type imputation:: >>> table = hc.import_table('data/samples1.tsv', impute=True).key_by('Sample') >>> vds_result = vds.annotate_samples_table(table, root='sa.pheno') Given this file .. code-block:: text $ cat data/samples1.tsv Sample Height Status Age PT-1234 154.1 ADHD 24 PT-1236 160.9 Control 19 PT-1238 NA ADHD 89 PT-1239 170.3 Control 55 the three new sample annotations are ``sa.pheno.Height: Double``, ``sa.pheno.Status: String``, and ``sa.pheno.Age: Int``. To annotate without type imputation, resulting in all String types: >>> annotations = hc.import_table('data/samples1.tsv').key_by('Sample') >>> vds_result = vds.annotate_samples_table(annotations, root='sa.phenotypes') **Detailed examples** Let's import annotations from a CSV file with missing data and special characters .. code-block:: text $ cat data/samples2.tsv Batch,PT-ID 1kg,PT-0001 1kg,PT-0002 study1,PT-0003 study3,PT-0003 .,PT-0004 1kg,PT-0005 .,PT-0006 1kg,PT-0007 In this case, we should: - Pass the non-default delimiter ``,`` - Pass the non-default missing value ``.`` >>> annotations = hc.import_table('data/samples2.tsv', delimiter=',', missing='.').key_by('PT-ID') >>> vds_result = vds.annotate_samples_table(annotations, root='sa.batch') Let's import annotations from a file with no header and sample IDs that need to be transformed. Suppose the vds sample IDs are of the form ``NA#####``. This file has no header line, and the sample ID is hidden in a field with other information. .. code-block:: text $ cat data/samples3.tsv 1kg_NA12345 female 1kg_NA12346 male 1kg_NA12348 female pgc_NA23415 male pgc_NA23418 male To import it: >>> annotations = (hc.import_table('data/samples3.tsv', no_header=True) ... .annotate('sample = f0.split("_")[1]') ... .key_by('sample')) >>> vds_result = vds.annotate_samples_table(annotations, ... expr='sa.sex = table.f1, sa.batch = table.f0.split("_")[0]') **Notes** This method takes as an argument a :class:`.KeyTable` object. Hail has a default join strategy for tables keyed by String, which is to join by sample ID. If the table is keyed by something else, like population or cohort, then the ``vds_key`` argument must be passed to describe the key in the dataset to use for the join. This argument expects a list of Hail expressions whose types match, in order, the table's key types. Each expression in the list ``vds_key`` has the following symbols in scope: - ``s`` (*String*): sample ID - ``sa``: sample annotations **The** ``root`` **and** ``expr`` **arguments** .. note:: One of ``root`` or ``expr`` is required, but not both. The ``expr`` parameter expects an annotation expression involving ``sa`` (the existing sample annotations in the dataset) and ``table`` (a struct containing the columns in the table), like ``sa.col1 = table.col1, sa.col2 = table.col2`` or ``sa = merge(sa, table)``. The ``root`` parameter expects an annotation path beginning in ``sa``, like ``sa.annotations``. Passing ``root='sa.annotations'`` is exactly the same as passing ``expr='sa.annotations = table'``. ``expr`` has the following symbols in scope: - ``sa``: sample annotations - ``table``: See note. .. 
note:: The value of ``table`` inside root/expr depends on the number of values in the key table, as well as the ``product`` argument. There are three behaviors based on the number of values and one branch for ``product`` being true and false, for a total of six modes: +-------------------------+-------------+--------------------+-----------------------------------------------+ | Number of value columns | ``product`` | Type of ``table`` | Value of ``table`` | +=========================+=============+====================+===============================================+ | More than 2 | False | ``Struct`` | Struct with an element for each column. | +-------------------------+-------------+--------------------+-----------------------------------------------+ | 1 | False | ``T`` | The value column. | +-------------------------+-------------+--------------------+-----------------------------------------------+ | 0 | False | ``Boolean`` | Existence of any matching key. | +-------------------------+-------------+--------------------+-----------------------------------------------+ | More than 2 | True | ``Array[Struct]`` | An array with a struct for each matching key. | +-------------------------+-------------+--------------------+-----------------------------------------------+ | 1 | True | ``Array[T]`` | An array with a value for each matching key. | +-------------------------+-------------+--------------------+-----------------------------------------------+ | 0 | True | ``Int`` | The number of matching keys. | +-------------------------+-------------+--------------------+-----------------------------------------------+ **Common uses for the** ``expr`` **argument** Put annotations on the top level under ``sa`` .. code-block:: text expr='sa = merge(sa, table)' Annotate only specific annotations from the table .. code-block:: text expr='sa.annotations = select(table, toKeep1, toKeep2, toKeep3)' The above is equivalent to .. code-block:: text expr='''sa.annotations.toKeep1 = table.toKeep1, sa.annotations.toKeep2 = table.toKeep2, sa.annotations.toKeep3 = table.toKeep3''' Finally, for more information about importing key tables from text, see the documentation for :py:meth:`.HailContext.import_table`. :param table: Key table. :type table: :py:class:`.KeyTable` :param root: Sample annotation path to store text table. (This or ``expr`` required). :type root: str or None :param expr: Annotation expression. (This or ``root`` required). :type expr: str or None :param vds_key: Join key for the dataset, if not sample ID. :type vds_key: str, list of str, or None. :param bool product: Join with all matching keys (see note). :return: Annotated variant dataset. :rtype: :py:class:`.VariantDataset` """ if vds_key: vds_key = wrap_to_list(vds_key) return VariantDataset(self.hc, self._jvds.annotateSamplesTable(table._jkt, vds_key, root, expr, product))
    @handle_py4j
    @typecheck_method(expr=oneof(strlike, listof(strlike)))
    def annotate_variants_expr(self, expr):
        """Annotate variants with expression.

        **Examples**

        Compute GQ statistics about heterozygotes per variant:

        >>> vds_result = vds.annotate_variants_expr('va.gqHetStats = gs.filter(g => g.isHet()).map(g => g.gq).stats()')

        Collect a list of sample IDs with non-ref calls in LOF variants:

        >>> vds_result = vds.annotate_variants_expr('va.nonRefSamples = gs.filter(g => g.isCalledNonRef()).map(g => s).collect()')

        Substitute a custom string for the rsID field:

        >>> vds_result = vds.annotate_variants_expr('va.rsid = str(v)')

        **Notes**

        ``expr`` is in variant context so the following symbols are in scope:

        - ``v`` (*Variant*): :ref:`variant`
        - ``va``: variant annotations
        - ``global``: global annotations
        - ``gs`` (*Aggregable[Genotype]*): aggregable of :ref:`genotype` for variant ``v``

        For more information, see the documentation on writing `expressions
        <overview.html#expressions>`__ and using the `Hail Expression Language
        <exprlang.html>`__.

        :param expr: Annotation expression or list of annotation expressions.
        :type expr: str or list of str

        :return: Annotated variant dataset.
        :rtype: :class:`.VariantDataset`
        """

        if isinstance(expr, list):
            expr = ','.join(expr)

        jvds = self._jvds.annotateVariantsExpr(expr)
        return VariantDataset(self.hc, jvds)
[docs] @handle_py4j @typecheck_method(table=KeyTable, root=nullable(strlike), expr=nullable(strlike), vds_key=nullable(oneof(strlike, listof(strlike))), product=bool) def annotate_variants_table(self, table, root=None, expr=None, vds_key=None, product=False): """Annotate variants with a key table. **Examples** Add annotations from a variant-keyed tab separated file: >>> table = hc.import_table('data/variant-lof.tsv', impute=True).key_by('v') >>> vds_result = vds.annotate_variants_table(table, root='va.lof') Add annotations from a locus-keyed TSV: >>> kt = hc.import_table('data/locus-table.tsv', impute=True).key_by('Locus') >>> vds_result = vds.annotate_variants_table(table, root='va.scores') Add annotations from a gene-and-type-keyed TSV: >>> table = hc.import_table('data/locus-metadata.tsv', impute=True).key_by(['gene', 'type']) >>> vds_result = (vds.annotate_variants_table(table, ... root='va.foo', ... vds_key=['va.gene', 'if (va.score > 10) "Type1" else "Type2"'])) Annotate variants with the target in a GATK interval list file: >>> intervals = KeyTable.import_interval_list('data/exons2.interval_list') >>> vds_result = vds.annotate_variants_table(intervals, root='va.exon') Annotate variants with all targets from matching intervals in a GATK interval list file: >>> intervals = KeyTable.import_interval_list('data/exons2.interval_list') >>> vds_result = vds.annotate_variants_table(intervals, root='va.exons', product=True) Annotate variants using a UCSC BED file, marking each variant true/false for an overlap with any interval: >>> intervals = KeyTable.import_bed('data/file2.bed') >>> vds_result = vds.annotate_variants_table(intervals, root='va.bed') **Notes** This method takes as an argument a :class:`.KeyTable` object. Hail has default join strategies for tables keyed by Variant, Locus, or Interval. **Join strategies:** If the key is a ``Variant``, then a variant in the dataset will match a variant in the table that is equivalent. Be careful, however: ``1:1:A:T`` does not match ``1:1:A:T,C``, and vice versa. If the key is a ``Locus``, then a variant in the dataset will match any locus in the table which is equivalent to ``v.locus`` (same chromosome and position). If the key is an ``Interval``, then a variant in the dataset will match any interval in the table that contains the variant's locus (chromosome and position). If the key is not one of the above three types (a String representing gene ID, for instance), or if another join strategy should be used for a key of one of these three types (join with a locus object in variant annotations, for instance) for these types, then the ``vds_key`` argument should be passed. This argument expects a list of expressions whose types match, in order, the table's key types. Note that using ``vds_key`` is slower than annotation with a standard key type. Each expression in the list ``vds_key`` has the following symbols in scope: - ``v`` (*Variant*): :ref:`variant` - ``va``: variant annotations **The** ``root`` **and** ``expr`` **arguments** .. note:: One of ``root`` or ``expr`` is required, but not both. The ``expr`` parameter expects an annotation assignment involving ``va`` (the existing variant annotations in the dataset) and ``table`` (the values(s) in the table), like ``va.col1 = table.col1, va.col2 = table.col2`` or ``va = merge(va, table)``. The ``root`` parameter expects an annotation path beginning in ``va``, like ``va.annotations``. Passing ``root='va.annotations'`` is the same as passing ``expr='va.annotations = table'``. 
``expr`` has the following symbols in scope: - ``va``: variant annotations - ``table``: See note. .. note:: The value of ``table`` inside root/expr depends on the number of values in the key table, as well as the ``product`` argument. There are three behaviors based on the number of values and one branch for ``product`` being true and false, for a total of six modes: +-------------------------+-------------+--------------------+-----------------------------------------------+ | Number of value columns | ``product`` | Type of ``table`` | Value of ``table`` | +=========================+=============+====================+===============================================+ | More than 2 | False | ``Struct`` | Struct with an element for each column. | +-------------------------+-------------+--------------------+-----------------------------------------------+ | 1 | False | ``T`` | The value column. | +-------------------------+-------------+--------------------+-----------------------------------------------+ | 0 | False | ``Boolean`` | Existence of any matching key. | +-------------------------+-------------+--------------------+-----------------------------------------------+ | More than 2 | True | ``Array[Struct]`` | An array with a struct for each matching key. | +-------------------------+-------------+--------------------+-----------------------------------------------+ | 1 | True | ``Array[T]`` | An array with a value for each matching key. | +-------------------------+-------------+--------------------+-----------------------------------------------+ | 0 | True | ``Int`` | The number of matching keys. | +-------------------------+-------------+--------------------+-----------------------------------------------+ **Common uses for the** ``expr`` **argument** Put annotations on the top level under ``va``: .. code-block:: text expr='va = merge(va, table)' Annotate only specific annotations from the table: .. code-block:: text expr='va.annotations = select(table, toKeep1, toKeep2, toKeep3)' The above is roughly equivalent to: .. code-block:: text expr='''va.annotations.toKeep1 = table.toKeep1, va.annotations.toKeep2 = table.toKeep2, va.annotations.toKeep3 = table.toKeep3''' Finally, for more information about importing key tables from text, see the documentation for :py:meth:`.HailContext.import_table`. :param table: Key table. :type table: :py:class:`.KeyTable` :param root: Variant annotation path to store text table. (This or ``expr`` required). :type root: str or None :param expr: Annotation expression. (This or ``root`` required). :type expr: str or None :param vds_key: Join key for the dataset. Much slower than default joins. :type vds_key: str, list of str, or None. :param bool product: Join with all matching keys (see note). :return: Annotated variant dataset. :rtype: :py:class:`.VariantDataset` """ if vds_key: vds_key = wrap_to_list(vds_key) jvds = self._jvds.annotateVariantsTable(table._jkt, vds_key, root, expr, product) return VariantDataset(self.hc, jvds)
[docs] @handle_py4j @typecheck_method(other=vds_type, expr=nullable(strlike), root=nullable(strlike)) def annotate_variants_vds(self, other, expr=None, root=None): '''Annotate variants with variant annotations from .vds file. **Examples** Import a second variant dataset with annotations to merge into ``vds``: >>> vds1 = vds.annotate_variants_expr('va = drop(va, anno1)') >>> vds2 = (hc.read("data/example2.vds") ... .annotate_variants_expr('va = select(va, anno1, toKeep1, toKeep2, toKeep3)')) Copy the ``anno1`` annotation from ``other`` to ``va.annot``: >>> vds_result = vds1.annotate_variants_vds(vds2, expr='va.annot = vds.anno1') Merge the variant annotations from the two vds together and places them at ``va``: >>> vds_result = vds1.annotate_variants_vds(vds2, expr='va = merge(va, vds)') Select a subset of the annotations from ``other``: >>> vds_result = vds1.annotate_variants_vds(vds2, expr='va.annotations = select(vds, toKeep1, toKeep2, toKeep3)') The previous expression is equivalent to: >>> vds_result = vds1.annotate_variants_vds(vds2, expr='va.annotations.toKeep1 = vds.toKeep1, ' + ... 'va.annotations.toKeep2 = vds.toKeep2, ' + ... 'va.annotations.toKeep3 = vds.toKeep3') **Notes** Using this method requires one of the two optional arguments: ``expr`` and ``root``. They specify how to insert the annotations from ``other`` into the this vds's variant annotations. The ``root`` argument copies all the variant annotations from ``other`` to the specified annotation path. The ``expr`` argument expects an annotation assignment whose scope includes, ``va``, the variant annotations in the current VDS, and ``vds``, the variant annotations in ``other``. VDSes with multi-allelic variants may produce surprising results because all alternate alleles are considered part of the variant key. For example: - The variant ``22:140012:A:T,TTT`` will not be annotated by ``22:140012:A:T`` or ``22:140012:A:TTT`` - The variant ``22:140012:A:T`` will not be annotated by ``22:140012:A:T,TTT`` It is possible that an unsplit variant dataset contains no multiallelic variants, so ignore any warnings Hail prints if you know that to be the case. Otherwise, run :py:meth:`.split_multi` before :py:meth:`.annotate_variants_vds`. :param VariantDataset other: Variant dataset to annotate with. :param str root: Sample annotation path to add variant annotations. :param str expr: Annotation expression. :return: Annotated variant dataset. :rtype: :py:class:`.VariantDataset` ''' jvds = self._jvds.annotateVariantsVDS(other._jvds, joption(root), joption(expr)) return VariantDataset(self.hc, jvds)
[docs] def annotate_variants_db(self, annotations, gene_key=None): """ Annotate variants using the Hail annotation database. .. warning:: Experimental. Supported only while running Hail on the Google Cloud Platform. Documentation describing the annotations that are accessible through this method can be found :ref:`here <sec-annotationdb>`. **Examples** Annotate variants with CADD raw and PHRED scores: >>> vds = vds.annotate_variants_db(['va.cadd.RawScore', 'va.cadd.PHRED']) # doctest: +SKIP Annotate variants with gene-level PLI score, using the VEP-generated gene symbol to map variants to genes: >>> pli_vds = vds.annotate_variants_db('va.gene.constraint.pli') # doctest: +SKIP Again annotate variants with gene-level PLI score, this time using the existing ``va.gene_symbol`` annotation to map variants to genes: >>> vds = vds.annotate_variants_db('va.gene.constraint.pli', gene_key='va.gene_symbol') # doctest: +SKIP **Notes** Annotations in the database are bi-allelic, so splitting multi-allelic variants in the VDS before using this method is recommended to capture all appropriate annotations from the database. To do this, run :py:meth:`split_multi` prior to annotating variants with this method: >>> vds = vds.split_multi().annotate_variants_db(['va.cadd.RawScore', 'va.cadd.PHRED']) # doctest: +SKIP To add VEP annotations, or to add gene-level annotations without a predefined gene symbol for each variant, the :py:meth:`~.VariantDataset.annotate_variants_db` method runs Hail's :py:meth:`~.VariantDataset.vep` method on the VDS. This means that your cluster must be properly initialized to run VEP. .. warning:: If you want to add VEP annotations to your VDS, make sure to add the initialization action :code:`gs://hail-common/vep/vep/vep85-init.sh` when starting your cluster. :param annotations: List of annotations to import from the database. :type annotations: str or list of str :param gene_key: Existing variant annotation used to map variants to gene symbols if importing gene-level annotations. If not provided, the method will add VEP annotations and parse them as described in the database documentation to obtain one gene symbol per variant. :type gene_key: str :return: Annotated variant dataset. :rtype: :py:class:`.VariantDataset` """ # import modules needed by this function import sqlite3 # collect user-supplied annotations, converting str -> list if necessary and dropping duplicates annotations = list(set(wrap_to_list(annotations))) # open connection to in-memory SQLite database conn = sqlite3.connect(':memory:') # load database with annotation metadata, print error if not on Google Cloud Platform try: f = hadoop_read('gs://annotationdb/ADMIN/annotationdb.sql') except FatalError: raise EnvironmentError('Cannot read from Google Storage. Must be running on Google Cloud Platform to use annotation database.') else: curs = conn.executescript(f.read()) f.close() # parameter substitution string to put in SQL query like = ' OR '.join('a.annotation LIKE ?' 
for i in xrange(2*len(annotations))) # query to extract path of all needed database files and their respective annotation exprs qry = """SELECT file_path, annotation, file_type, file_element, f.file_id FROM files AS f INNER JOIN annotations AS a ON f.file_id = a.file_id WHERE {}""".format(like) # run query and collect results in a file_path: expr dictionary results = curs.execute(qry, [x + '.%' for x in annotations] + annotations).fetchall() # all file_ids to be used file_ids = list(set([x[4] for x in results])) # parameter substitution string sub = ','.join('?' for x in file_ids) # query to fetch count of total annotations in each file qry = """SELECT file_path, COUNT(*) FROM files AS f INNER JOIN annotations AS a ON f.file_id = a.file_id WHERE f.file_id IN ({}) GROUP BY file_path""".format(sub) # collect counts in file_id: count dictionary cnts = {x[0]: x[1] for x in curs.execute(qry, file_ids).fetchall()} # close database connection conn.close() # collect dictionary of file_path: expr entries file_exprs = {} for r in results: expr = r[1] + '=' + 'table' if r[2] == 'table' and cnts[r[0]] < 2 else r[1] + '=' + r[2] + '.' + r[3] try: file_exprs[r[0]] += ',' + expr except KeyError: file_exprs[r[0]] = expr # are there any gene annotations? are_genes = 'gs://annotationdb/gene/gene.kt' in file_exprs #any([x.startswith('gs://annotationdb/gene/') for x in file_exprs]) # subset to VEP annotations veps = any([x == 'vep' for x in file_exprs]) # if VEP annotations are selected, or if gene-level annotations are selected with no specified gene_key, annotate with VEP if veps or (are_genes and not gene_key): # VEP annotate the VDS self = self.vep(config='/vep/vep-gcloud.properties', root='va.vep') # extract 1 gene symbol per variant from VEP annotations if a gene_key parameter isn't provided if are_genes: # hierarchy of possible variant consequences, from most to least severe csq_terms = [ 'transcript_ablation', 'splice_acceptor_variant', 'splice_donor_variant', 'stop_gained', 'frameshift_variant', 'stop_lost', 'start_lost', 'transcript_amplification', 'inframe_insertion', 'inframe_deletion', 'missense_variant', 'protein_altering_variant', 'incomplete_terminal_codon_variant', 'stop_retained_variant', 'synonymous_variant', 'splice_region_variant', 'coding_sequence_variant', 'mature_miRNA_variant', '5_prime_UTR_variant', '3_prime_UTR_variant', 'non_coding_transcript_exon_variant', 'intron_variant', 'NMD_transcript_variant', 'non_coding_transcript_variant', 'upstream_gene_variant', 'downstream_gene_variant', 'TFBS_ablation', 'TFBS_amplification', 'TF_binding_site_variant', 'regulatory_region_ablation', 'regulatory_region_amplification', 'feature_elongation', 'regulatory_region_variant', 'feature_truncation', 'intergenic_variant' ] # add consequence terms as a global annotation self = self.annotate_global('global.csq_terms', csq_terms, TArray(TString())) # find 1 transcript/gene per variant using the following method: # 1. define the most severe consequence for each variant according to hierarchy # 2. subset to transcripts with that most severe consequence # 3. 
if one of the transcripts in the subset is canonical, take that gene/transcript, # else just take the first gene/transcript in the subset self = ( self .annotate_variants_expr( """ va.gene.most_severe_consequence = let canonical_consequences = va.vep.transcript_consequences.filter(t => t.canonical == 1).flatMap(t => t.consequence_terms).toSet() in if (isDefined(canonical_consequences)) orElse(global.csq_terms.find(c => canonical_consequences.contains(c)), va.vep.most_severe_consequence) else va.vep.most_severe_consequence """ ) .annotate_variants_expr( """ va.gene.transcript = let tc = va.vep.transcript_consequences.filter(t => t.consequence_terms.toSet.contains(va.gene.most_severe_consequence)) in orElse(tc.find(t => t.canonical == 1), tc[0]) """ ) ) # drop VEP annotations if not specifically requested if not veps: self = self.annotate_variants_expr('va = drop(va, vep)') # subset VEP annotations if needed subset = ','.join([x.rsplit('.')[-1] for x in annotations if x.startswith('va.vep.')]) if subset: self = self.annotate_variants_expr('va.vep = select(va.vep, {})'.format(subset)) # iterate through files, selected annotations from each file for db_file, expr in file_exprs.iteritems(): # if database file is a VDS if db_file.endswith('.vds'): # annotate analysis VDS with database VDS self = self.annotate_variants_vds(self.hc.read(db_file), expr=expr) # if database file is a keytable elif db_file.endswith('.kt'): # join on gene symbol for gene annotations if db_file == 'gs://annotationdb/gene/gene.kt': if gene_key: vds_key = gene_key else: vds_key = 'va.gene.transcript.gene_symbol' else: vds_key = None # annotate analysis VDS with database keytable self = self.annotate_variants_table(self.hc.read_table(db_file), expr=expr, vds_key=vds_key) else: continue return self
    @handle_py4j
    def cache(self):
        """Mark this variant dataset to be cached in memory.

        :py:meth:`~hail.VariantDataset.cache` is the same as
        :func:`persist("MEMORY_ONLY") <hail.VariantDataset.persist>`.

        :rtype: :class:`.VariantDataset`
        """

        return VariantDataset(self.hc, self._jvdf.cache())
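    # Hypothetical usage sketch: caching pays off when the same dataset feeds
    # several downstream actions, so each one reuses the in-memory copy instead
    # of recomputing the pipeline (the output path below is made up).
    #
    #   cached = vds.cache()
    #   n_samples, n_variants = cached.count()
    #   cached.export_vcf('output/example.vcf.bgz')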
[docs] @handle_py4j @requireTGenotype @typecheck_method(right=vds_type) def concordance(self, right): """Calculate call concordance with another variant dataset. .. include:: requireTGenotype.rst **Example** >>> comparison_vds = hc.read('data/example2.vds') >>> summary, samples, variants = vds.concordance(comparison_vds) **Notes** This method computes the genotype call concordance between two bialellic variant datasets. It performs an inner join on samples (only samples in both datasets will be considered), and an outer join on variants. If a variant is only in one dataset, then each genotype is treated as "no data" in the other. This method returns a tuple of three objects: a nested list of list of int with global concordance summary statistics, a key table with sample concordance statistics, and a key table with variant concordance statistics. **Using the global summary result** The global summary is a list of list of int (conceptually a 5 by 5 matrix), where the indices have special meaning: 0. No Data (missing variant) 1. No Call (missing genotype call) 2. Hom Ref 3. Heterozygous 4. Hom Var The first index is the state in the left dataset (the one on which concordance was called), and the second index is the state in the right dataset (the argument to the concordance method call). Typical uses of the summary list are shown below. >>> summary, samples, variants = vds.concordance(hc.read('data/example2.vds')) >>> left_homref_right_homvar = summary[2][4] >>> left_het_right_missing = summary[3][1] >>> left_het_right_something_else = sum(summary[3][:]) - summary[3][3] >>> total_concordant = summary[2][2] + summary[3][3] + summary[4][4] >>> total_discordant = sum([sum(s[2:]) for s in summary[2:]]) - total_concordant **Using the key table results** Columns of the sample key table: - **s** (*String*) -- Sample ID, key column. - **nDiscordant** (*Long*) -- Count of discordant calls (see below for full definition). - **concordance** (*Array[Array[Long]]*) -- Array of concordance per state on left and right, matching the structure of the global summary defined above. Columns of the variant key table: - **v** (*Variant*) -- Key column. - **nDiscordant** (*Long*) -- Count of discordant calls (see below for full definition). - **concordance** (*Array[Array[Long]]*) -- Array of concordance per state on left and right, matches the structure of the global summary defined above. The two key tables produced by the concordance method can be queried with :py:meth:`.KeyTable.query`, exported to text with :py:meth:`.KeyTable.export`, and used to annotate a variant dataset with :py:meth:`.VariantDataset.annotate_variants_table`, among other things. In these tables, the column **nDiscordant** is provided as a convenience, because this is often one of the most useful concordance statistics. This value is the number of genotypes which were called (homozygous reference, heterozygous, or homozygous variant) in both datasets, but where the call did not match between the two. The column **concordance** matches the structure of the global summmary, which is detailed above. Once again, the first index into this array is the state on the left, and the second index is the state on the right. For example, ``concordance[1][4]`` is the number of "no call" genotypes on the left that were called homozygous variant on the right. 
:param right: right hand variant dataset for concordance :type right: :class:`.VariantDataset` :return: The global concordance statistics, a key table with sample concordance statistics, and a key table with variant concordance statistics. :rtype: (list of list of int, :py:class:`.KeyTable`, :py:class:`.KeyTable`) """ r = self._jvdf.concordance(right._jvds) j_global_concordance = r._1() sample_kt = KeyTable(self.hc, r._2()) variant_kt = KeyTable(self.hc, r._3()) global_concordance = [[j_global_concordance.apply(j).apply(i) for i in xrange(5)] for j in xrange(5)] return global_concordance, sample_kt, variant_kt
    @handle_py4j
    def count(self):
        """Returns number of samples and variants in the dataset.

        **Examples**

        >>> samples, variants = vds.count()

        **Notes**

        This is also the fastest way to force evaluation of a Hail pipeline.

        :returns: The sample and variant counts.
        :rtype: (int, int)
        """

        r = self._jvds.count()
        return r._1(), r._2()
    @handle_py4j
    def deduplicate(self):
        """Remove duplicate variants.

        :return: Deduplicated variant dataset.
        :rtype: :py:class:`.VariantDataset`
        """

        return VariantDataset(self.hc, self._jvds.deduplicate())
    @handle_py4j
    @typecheck_method(fraction=numeric,
                      seed=integral)
    def sample_variants(self, fraction, seed=1):
        """Downsample variants to a given fraction of the dataset.

        **Examples**

        >>> small_vds = vds.sample_variants(0.01)

        **Notes**

        This method may not sample exactly ``(fraction * n_variants)``
        variants from the dataset.

        :param float fraction: (Expected) fraction of variants to keep.

        :param int seed: Random seed.

        :return: Downsampled variant dataset.
        :rtype: :py:class:`.VariantDataset`
        """

        return VariantDataset(self.hc, self._jvds.sampleVariants(fraction, seed))
[docs] @handle_py4j @requireTGenotype @typecheck_method(output=strlike, precision=integral) def export_gen(self, output, precision=4): """Export variant dataset as GEN and SAMPLE file. .. include:: requireTGenotype.rst **Examples** Import genotype probability data, filter variants based on INFO score, and export data to a GEN and SAMPLE file: >>> vds3 = hc.import_bgen("data/example3.bgen", sample_file="data/example3.sample") >>> (vds3.filter_variants_expr("gs.infoScore().score >= 0.9") ... .export_gen("output/infoscore_filtered")) **Notes** Writes out the internal VDS to a GEN and SAMPLE fileset in the `Oxford spec <http://www.stats.ox.ac.uk/%7Emarchini/software/gwas/file_format.html>`__. The first 6 columns of the resulting GEN file are the following: - Chromosome (``v.contig``) - Variant ID (``va.varid`` if defined, else Chromosome:Position:Ref:Alt) - rsID (``va.rsid`` if defined, else ".") - position (``v.start``) - reference allele (``v.ref``) - alternate allele (``v.alt``) Genotype probabilities: - 3 probabilities per sample ``(pHomRef, pHet, pHomVar)``. - Any filtered genotypes will be output as ``(0.0, 0.0, 0.0)``. - If the input data contained Phred-scaled likelihoods, the probabilities in the GEN file will be the normalized genotype probabilities assuming a uniform prior. - If the input data did not have genotype probabilities such as data imported using :py:meth:`~hail.HailContext.import_plink`, all genotype probabilities will be ``(0.0, 0.0, 0.0)``. The sample file has 3 columns: - ID_1 and ID_2 are identical and set to the sample ID (``s``). - The third column ("missing") is set to 0 for all samples. :param str output: Output file base. Will write GEN and SAMPLE files. :param int precision: Number of digits after the decimal point each probability is truncated to. """ self._jvdf.exportGen(output, precision)
[docs] @handle_py4j @typecheck_method(output=strlike, expr=strlike, types=bool, export_ref=bool, export_missing=bool, parallel=bool) def export_genotypes(self, output, expr, types=False, export_ref=False, export_missing=False, parallel=False): """Export genotype-level information to delimited text file. **Examples** Export genotype information with identifiers that form the header: >>> vds.export_genotypes('output/genotypes.tsv', 'SAMPLE=s, VARIANT=v, GQ=g.gq, DP=g.dp, ANNO1=va.anno1, ANNO2=va.anno2') Export the same information without identifiers, resulting in a file with no header: >>> vds.export_genotypes('output/genotypes.tsv', 's, v, g.gq, g.dp, va.anno1, va.anno2') **Notes** :py:meth:`~hail.VariantDataset.export_genotypes` outputs one line per cell (genotype) in the data set, though HomRef and missing genotypes are not output by default if the genotype schema is equal to :py:class:`~hail.expr.TGenotype`. Use the ``export_ref`` and ``export_missing`` parameters to force export of HomRef and missing genotypes, respectively. The ``expr`` argument is a comma-separated list of fields or expressions, all of which must be of the form ``IDENTIFIER = <expression>``, or else of the form ``<expression>``. If some fields have identifiers and some do not, Hail will throw an exception. The accessible namespace includes ``g``, ``s``, ``sa``, ``v``, ``va``, and ``global``. .. warning:: If the genotype schema does not have the type :py:class:`~hail.expr.TGenotype`, all genotypes will be exported unless the value of ``g`` is missing. Use :py:meth:`~hail.VariantDataset.filter_genotypes` to filter out genotypes based on an expression before exporting. :param str output: Output path. :param str expr: Export expression for values to export. :param bool types: Write types of exported columns to a file at (output + ".types") :param bool export_ref: If true, export reference genotypes. Only applicable if the genotype schema is :py:class:`~hail.expr.TGenotype`. :param bool export_missing: If true, export missing genotypes. :param bool parallel: If true, writes a set of files (one per partition) rather than serially concatenating these files. """ if self._is_generic_genotype: self._jvdf.exportGenotypes(output, expr, types, export_missing, parallel) else: self._jvdf.exportGenotypes(output, expr, types, export_ref, export_missing, parallel)
    @handle_py4j
    @typecheck_method(output=strlike,
                      expr=strlike,
                      types=bool)
    def export_samples(self, output, expr, types=False):
        """Export sample information to delimited text file.

        **Examples**

        Export some sample QC metrics:

        >>> (vds.sample_qc()
        ...     .export_samples('output/samples.tsv', 'SAMPLE = s, CALL_RATE = sa.qc.callRate, NHET = sa.qc.nHet'))

        This will produce a file with a header and three columns. To produce a
        file with no header, just leave off the assignment to the column identifier:

        >>> (vds.sample_qc()
        ...     .export_samples('output/samples.tsv', 's, sa.qc.rTiTv'))

        **Notes**

        One line per sample will be exported. As
        :py:meth:`~hail.VariantDataset.export_samples` runs in sample context,
        the following symbols are in scope:

        - ``s`` (*Sample*): sample
        - ``sa``: sample annotations
        - ``global``: global annotations
        - ``gs`` (*Aggregable[Genotype]*): aggregable of :ref:`genotype` for sample ``s``

        :param str output: Output file.

        :param str expr: Export expression for values to export.

        :param bool types: Write types of exported columns to a file at (output + ".types").
        """

        self._jvds.exportSamples(output, expr, types)
[docs] @handle_py4j @typecheck_method(output=strlike, expr=strlike, types=bool, parallel=bool) def export_variants(self, output, expr, types=False, parallel=False): """Export variant information to delimited text file. **Examples** Export a four column TSV with ``v``, ``va.pass``, ``va.filters``, and one computed field: ``1 - va.qc.callRate``. >>> vds.export_variants('output/file.tsv', ... 'VARIANT = v, PASS = va.pass, FILTERS = va.filters, MISSINGNESS = 1 - va.qc.callRate') It is also possible to export without identifiers, which will result in a file with no header. In this case, the expressions should look like the examples below: >>> vds.export_variants('output/file.tsv', 'v, va.pass, va.qc.AF') .. note:: If any field is named, all fields must be named. In the common case that a group of annotations needs to be exported (for example, the annotations produced by ``variantqc``), one can use the ``struct.*`` syntax. This syntax produces one column per field in the struct, and names them according to the struct field name. For example, the following invocation (assuming ``va.qc`` was generated by :py:meth:`.variant_qc`): >>> vds.export_variants('output/file.tsv', 'variant = v, va.qc.*') will produce the following set of columns: .. code-block:: text variant callRate AC AF nCalled ... Note that using the ``.*`` syntax always results in named arguments, so it is not possible to export header-less files in this manner. However, naming the "splatted" struct will apply the name in front of each column like so: >>> vds.export_variants('output/file.tsv', 'variant = v, QC = va.qc.*') which produces these columns: .. code-block:: text variant QC.callRate QC.AC QC.AF QC.nCalled ... **Notes** This module takes a comma-delimited list of fields or expressions to print. These fields will be printed in the order they appear in the expression in the header and on each line. One line per variant in the VDS will be printed. The accessible namespace includes: - ``v`` (*Variant*): :ref:`variant` - ``va``: variant annotations - ``global``: global annotations - ``gs`` (*Aggregable[Genotype]*): aggregable of :ref:`genotype` for variant ``v`` **Designating output with an expression** Much like the filtering methods, this method uses the Hail expression language. While the filtering methods expect an expression that evaluates to true or false, this method expects a comma-separated list of fields to print. These fields take the form ``IDENTIFIER = <expression>``. :param str output: Output file. :param str expr: Export expression for values to export. :param bool types: Write types of exported columns to a file at (output + ".types") :param bool parallel: If true, writes a set of files (one per partition) rather than serially concatenating these files. """ self._jvds.exportVariants(output, expr, types, parallel)
[docs] @handle_py4j @typecheck_method(output=strlike, append_to_header=nullable(strlike), export_pp=bool, parallel=bool) def export_vcf(self, output, append_to_header=None, export_pp=False, parallel=False): """Export variant dataset as a .vcf or .vcf.bgz file. **Examples** Export to VCF as a block-compressed file: >>> vds.export_vcf('output/example.vcf.bgz') **Notes** :py:meth:`~hail.VariantDataset.export_vcf` writes the VDS to disk in VCF format as described in the `VCF 4.2 spec <https://samtools.github.io/hts-specs/VCFv4.2.pdf>`__. Use the ``.vcf.bgz`` extension rather than ``.vcf`` in the output file name for `blocked GZIP <http://www.htslib.org/doc/tabix.html>`__ compression. .. note:: We strongly recommended compressed (``.bgz`` extension) and parallel output (``parallel=True``) when exporting large VCFs. Consider the workflow of importing VCF to VDS and immediately exporting VDS to VCF: >>> vds.export_vcf('output/example_out.vcf') The *example_out.vcf* header will contain the FORMAT, FILTER, and INFO lines present in *example.vcf*. However, it will *not* contain CONTIG lines or lines added by external tools (such as bcftools and GATK) unless they are explicitly inserted using the ``append_to_header`` option. Hail only exports the contents of ``va.info`` to the INFO field. No other annotations besides ``va.info`` are exported. The genotype schema must have the type :py:class:`~hail.expr.TGenotype` or :py:class:`~hail.expr.TStruct`. If the type is :py:class:`~hail.expr.TGenotype`, then the FORMAT fields will be GT, AD, DP, GQ, and PL (or PP if ``export_pp`` is True). If the type is :py:class:`~hail.expr.TStruct`, then the exported FORMAT fields will be the names of each field of the Struct. Each field must have a type of String, Char, Int, Double, or Call. Arrays and Sets are also allowed as long as they are not nested. For example, a field with type ``Array[Int]`` can be exported but not a field with type ``Array[Array[Int]]``. Nested Structs are also not allowed. .. caution:: If samples or genotypes are filtered after import, the value stored in ``va.info.AC`` value may no longer reflect the number of called alternate alleles in the filtered VDS. If the filtered VDS is then exported to VCF, downstream tools may produce erroneous results. The solution is to create new annotations in ``va.info`` or overwrite existing annotations. For example, in order to produce an accurate ``AC`` field, one can run :py:meth:`~hail.VariantDataset.variant_qc` and copy the ``va.qc.AC`` field to ``va.info.AC``: >>> (vds.filter_genotypes('g.gq >= 20') ... .variant_qc() ... .annotate_variants_expr('va.info.AC = va.qc.AC') ... .export_vcf('output/example.vcf.bgz')) :param str output: Path of .vcf file to write. :param append_to_header: Path of file to append to VCF header. :type append_to_header: str or None :param bool export_pp: If true, export linear-scaled probabilities (Hail's `pp` field on genotype) as the VCF PP FORMAT field. :param bool parallel: If true, return a set of VCF files (one per partition) rather than serially concatenating these files. """ self._jvdf.exportVCF(output, joption(append_to_header), export_pp, parallel)
    @handle_py4j
    @convertVDS
    @typecheck_method(output=strlike,
                      overwrite=bool,
                      parquet_genotypes=bool)
    def write(self, output, overwrite=False, parquet_genotypes=False):
        """Write variant dataset as VDS file.

        **Examples**

        Import data from a VCF file and then write the data to a VDS file:

        >>> vds.write("output/sample.vds")

        :param str output: Path of VDS file to write.

        :param bool overwrite: If true, overwrite any existing VDS file. Cannot be used
               to read from and write to the same path.

        :param bool parquet_genotypes: If true, store genotypes as Parquet rather than
               Hail's serialization. The resulting VDS will be larger and slower in Hail
               but the genotypes will be accessible from other tools that support Parquet.
        """

        if self._is_generic_genotype:
            self._jvdf.write(output, overwrite)
        else:
            self._jvdf.write(output, overwrite, parquet_genotypes)
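    # Hypothetical usage sketch: a written VDS is re-opened with ``hc.read`` (the
    # path below is made up). Writing back to the path a dataset was read from is
    # not supported, so a distinct output path is used.
    #
    #   vds.write('output/roundtrip.vds', overwrite=True)
    #   vds2 = hc.read('output/roundtrip.vds')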
[docs] @handle_py4j @requireTGenotype @typecheck_method(expr=strlike, annotation=strlike, subset=bool, keep=bool, filter_altered_genotypes=bool, max_shift=integral, keep_star=bool) def filter_alleles(self, expr, annotation='va = va', subset=True, keep=True, filter_altered_genotypes=False, max_shift=100, keep_star=False): """Filter a user-defined set of alternate alleles for each variant. If all alternate alleles of a variant are filtered, the variant itself is filtered. The expr expression is evaluated for each alternate allele, but not for the reference allele (i.e. ``aIndex`` will never be zero). .. include:: requireTGenotype.rst **Examples** To remove alternate alleles with zero allele count and update the alternate allele count annotation with the new indices: >>> vds_result = vds.filter_alleles('va.info.AC[aIndex - 1] == 0', ... annotation='va.info.AC = aIndices[1:].map(i => va.info.AC[i - 1])', ... keep=False) Note that we skip the first element of ``aIndices`` because we are mapping between the old and new *allele* indices, not the *alternate allele* indices. **Notes** If ``filter_altered_genotypes`` is true, genotypes that contain filtered-out alleles are set to missing. :py:meth:`~hail.VariantDataset.filter_alleles` implements two algorithms for filtering alleles: subset and downcode. We will illustrate their behavior on the example genotype below when filtering the first alternate allele (allele 1) at a site with 1 reference allele and 2 alternate alleles. .. code-block:: text GT: 1/2 GQ: 10 AD: 0,50,35 0 | 1000 1 | 1000 10 2 | 1000 0 20 +----------------- 0 1 2 **Subset algorithm** The subset algorithm (the default, ``subset=True``) subsets the AD and PL arrays (i.e. removes entries corresponding to filtered alleles) and then sets GT to the genotype with the minimum PL. Note that if the genotype changes (as in the example), the PLs are re-normalized (shifted) so that the most likely genotype has a PL of 0. Qualitatively, subsetting corresponds to the belief that the filtered alleles are not real so we should discard any probability mass associated with them. The subset algorithm would produce the following: .. code-block:: text GT: 1/1 GQ: 980 AD: 0,50 0 | 980 1 | 980 0 +----------- 0 1 In summary: - GT: Set to most likely genotype based on the PLs ignoring the filtered allele(s). - AD: The filtered alleles' columns are eliminated, e.g., filtering alleles 1 and 2 transforms ``25,5,10,20`` to ``25,20``. - DP: No change. - PL: The filtered alleles' columns are eliminated and the remaining columns shifted so the minimum value is 0. - GQ: The second-lowest PL (after shifting). **Downcode algorithm** The downcode algorithm (``subset=False``) recodes occurances of filtered alleles to occurances of the reference allele (e.g. 1 -> 0 in our example). So the depths of filtered alleles in the AD field are added to the depth of the reference allele. Where downcodeing filtered alleles merges distinct genotypes, the minimum PL is used (since PL is on a log scale, this roughly corresponds to adding probabilities). The PLs are then re-normalized (shifted) so that the most likely genotype has a PL of 0, and GT is set to this genotype. If an allele is filtered, this algorithm acts similarly to :py:meth:`~hail.VariantDataset.split_multi`. The downcoding algorithm would produce the following: .. code-block:: text GT: 0/1 GQ: 10 AD: 35,50 0 | 20 1 | 0 10 +----------- 0 1 In summary: - GT: Downcode filtered alleles to reference. 
- AD: The filtered alleles' columns are eliminated and their value is added to the reference, e.g., filtering alleles 1 and 2 transforms ``25,5,10,20`` to ``40,20``. - DP: No change. - PL: Downcode filtered alleles to reference, combine PLs using minimum for each overloaded genotype, and shift so the overall minimum PL is 0. - GQ: The second-lowest PL (after shifting). **Expression Variables** The following symbols are in scope for ``expr``: - ``v`` (*Variant*): :ref:`variant` - ``va``: variant annotations - ``aIndex`` (*Int*): the index of the allele being tested The following symbols are in scope for ``annotation``: - ``v`` (*Variant*): :ref:`variant` - ``va``: variant annotations - ``aIndices`` (*Array[Int]*): the array of old indices (such that ``aIndices[newIndex] = oldIndex`` and ``aIndices[0] = 0``) :param str expr: Boolean filter expression involving v (variant), va (variant annotations), and aIndex (allele index) :param str annotation: Annotation modifying expression involving v (new variant), va (old variant annotations), and aIndices (maps from new to old indices) :param bool subset: If true, subsets PL and AD, otherwise downcodes the PL and AD. Genotype and GQ are set based on the resulting PLs. :param bool keep: If true, keep variants matching expr :param bool filter_altered_genotypes: If true, genotypes that contain filtered-out alleles are set to missing. :param int max_shift: maximum number of base pairs by which a split variant can move. Affects memory usage, and will cause Hail to throw an error if a variant that moves further is encountered. :param bool keep_star: If true, keep variants where the only allele left is a ``*`` allele. :return: Filtered variant dataset. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvdf.filterAlleles(expr, annotation, filter_altered_genotypes, keep, subset, max_shift, keep_star) return VariantDataset(self.hc, jvds)
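For completeness, an illustrative call using the downcode algorithm together with genotype filtering, reusing the same hypothetical ``va.info.AC`` annotation as the example above:

>>> vds_result = vds.filter_alleles('va.info.AC[aIndex - 1] == 0',
...                                 annotation='va.info.AC = aIndices[1:].map(i => va.info.AC[i - 1])',
...                                 subset=False,
...                                 filter_altered_genotypes=True,
...                                 keep=False)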
[docs] @handle_py4j @typecheck_method(expr=strlike, keep=bool) def filter_genotypes(self, expr, keep=True): """Filter genotypes based on expression. **Examples** Filter genotypes by allele balance dependent on genotype call: >>> vds_result = vds.filter_genotypes('let ab = g.ad[1] / g.ad.sum() in ' + ... '((g.isHomRef() && ab <= 0.1) || ' + ... '(g.isHet() && ab >= 0.25 && ab <= 0.75) || ' + ... '(g.isHomVar() && ab >= 0.9))') **Notes** ``expr`` is in genotype context so the following symbols are in scope: - ``s`` (*Sample*): sample - ``v`` (*Variant*): :ref:`variant` - ``sa``: sample annotations - ``va``: variant annotations - ``global``: global annotations For more information, see the documentation on `data representation, annotations <overview.html#>`__, and the `expression language <exprlang.html>`__. .. caution:: When ``expr`` evaluates to missing, the genotype will be removed regardless of whether ``keep=True`` or ``keep=False``. :param str expr: Boolean filter expression. :param bool keep: Keep genotypes where ``expr`` evaluates to true. :return: Filtered variant dataset. :rtype: :class:`.VariantDataset` """ jvds = self._jvdf.filterGenotypes(expr, keep) return VariantDataset(self.hc, jvds)
[docs] @handle_py4j @requireTGenotype def filter_multi(self): """Filter out multi-allelic sites. .. include:: requireTGenotype.rst This method is much less computationally expensive than :py:meth:`.split_multi`, and can also be used to produce a variant dataset that can be used with methods that do not support multiallelic variants. :return: Dataset with no multiallelic sites, which can be used for biallelic-only methods. :rtype: :class:`.VariantDataset` """ return VariantDataset(self.hc, self._jvdf.filterMulti())
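A minimal usage sketch, with ``vds`` as in the surrounding examples; the result can be passed to biallelic-only methods:

>>> vds_biallelic = vds.filter_multi()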
[docs] @handle_py4j def drop_samples(self): """Removes all samples from the variant dataset. The variants, variant annotations, and global annotations will remain, producing a sites-only variant dataset. :return: Sites-only variant dataset. :rtype: :py:class:`.VariantDataset` """ return VariantDataset(self.hc, self._jvds.dropSamples())
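A minimal usage sketch, producing a sites-only copy for variant-level work:

>>> sites_vds = vds.drop_samples()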
[docs] @handle_py4j @typecheck_method(expr=strlike, keep=bool) def filter_samples_expr(self, expr, keep=True): """Filter samples with the expression language. **Examples** Filter samples by phenotype (assumes sample annotation *sa.isCase* exists and is a Boolean variable): >>> vds_result = vds.filter_samples_expr("sa.isCase") Remove samples with an ID that matches a regular expression: >>> vds_result = vds.filter_samples_expr('"^NA" ~ s' , keep=False) Filter samples from sample QC metrics and write output to a new variant dataset: >>> (vds.sample_qc() ... .filter_samples_expr('sa.qc.callRate >= 0.99 && sa.qc.dpMean >= 10') ... .write("output/filter_samples.vds")) **Notes** ``expr`` is in sample context so the following symbols are in scope: - ``s`` (*Sample*): sample - ``sa``: sample annotations - ``global``: global annotations - ``gs`` (*Aggregable[Genotype]*): aggregable of :ref:`genotype` for sample ``s`` For more information, see the documentation on `data representation, annotations <overview.html#>`__, and the `expression language <exprlang.html>`__. .. caution:: When ``expr`` evaluates to missing, the sample will be removed regardless of whether ``keep=True`` or ``keep=False``. :param str expr: Boolean filter expression. :param bool keep: Keep samples where ``expr`` evaluates to true. :return: Filtered variant dataset. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvds.filterSamplesExpr(expr, keep) return VariantDataset(self.hc, jvds)
[docs] @handle_py4j @typecheck_method(samples=listof(strlike), keep=bool) def filter_samples_list(self, samples, keep=True): """Filter samples with a list of samples. **Examples** >>> to_remove = ['NA12878', 'NA12891', 'NA12892'] >>> vds_result = vds.filter_samples_list(to_remove, keep=False) Read list from a file: >>> to_remove = [s.strip() for s in open('data/exclude_samples.txt')] >>> vds_result = vds.filter_samples_list(to_remove, keep=False) :param samples: List of samples to keep or remove. :type samples: list of str :param bool keep: If true, keep samples in ``samples``, otherwise remove them. :return: Filtered variant dataset. :rtype: :py:class:`.VariantDataset` """ return VariantDataset(self.hc, self._jvds.filterSamplesList(samples, keep))
[docs] @handle_py4j @typecheck_method(table=KeyTable, keep=bool) def filter_samples_table(self, table, keep=True): """Filter samples with a table keyed by sample ID. **Examples** Keep samples in a text file: >>> table = hc.import_table('data/samples1.tsv').key_by('Sample') >>> vds_filtered = vds.filter_samples_table(table, keep=True) Remove samples in a text file with 1 field, and no header: >>> to_remove = hc.import_table('data/exclude_samples.txt', no_header=True).key_by('f0') >>> vds_filtered = vds.filter_samples_table(to_remove, keep=False) **Notes** This method filters out or filters to the keys of a table. The table must have a key of type ``String``. :param table: Key table. :type table: :class:`.KeyTable` :param bool keep: If true, keep only the keys in ``table``, otherwise remove them. :return: Filtered dataset. :rtype: :class:`.VariantDataset` """ return VariantDataset(self.hc, self._jvds.filterSamplesTable(table._jkt, keep))
[docs] @handle_py4j def drop_variants(self): """Discard all variants, variant annotations and genotypes. Samples, sample annotations and global annotations are retained. This is the same as :func:`filter_variants_expr('false') <hail.VariantDataset.filter_variants_expr>`, but much faster. **Examples** >>> vds_result = vds.drop_variants() :return: Samples-only variant dataset. :rtype: :py:class:`.VariantDataset` """ return VariantDataset(self.hc, self._jvds.dropVariants())
[docs] @handle_py4j @typecheck_method(expr=strlike, keep=bool) def filter_variants_expr(self, expr, keep=True): """Filter variants with the expression language. **Examples** Keep variants in the gene CHD8 (assumes the variant annotation ``va.gene`` exists): >>> vds_result = vds.filter_variants_expr('va.gene == "CHD8"') Remove all variants on chromosome 1: >>> vds_result = vds.filter_variants_expr('v.contig == "1"', keep=False) .. caution:: The double quotes on ``"1"`` are necessary because ``v.contig`` is of type String. **Notes** The following symbols are in scope for ``expr``: - ``v`` (*Variant*): :ref:`variant` - ``va``: variant annotations - ``global``: global annotations - ``gs`` (*Aggregable[Genotype]*): aggregable of :ref:`genotype` for variant ``v`` For more information, see the `Overview <overview.html#>`__ and the `Expression Language <exprlang.html>`__. .. caution:: When ``expr`` evaluates to missing, the variant will be removed regardless of whether ``keep=True`` or ``keep=False``. :param str expr: Boolean filter expression. :param bool keep: Keep variants where ``expr`` evaluates to true. :return: Filtered variant dataset. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvds.filterVariantsExpr(expr, keep) return VariantDataset(self.hc, jvds)
[docs] @handle_py4j @typecheck_method(intervals=oneof(Interval, listof(Interval)), keep=bool) def filter_intervals(self, intervals, keep=True): """Filter variants with an interval or list of intervals. **Examples** Filter to one interval: >>> vds_result = vds.filter_intervals(Interval.parse('17:38449840-38530994')) Another way of writing this same query: >>> vds_result = vds.filter_intervals(Interval(Locus('17', 38449840), Locus('17', 38530994))) Two identical ways of parsing a list of intervals: >>> intervals = map(Interval.parse, ['1:50M-75M', '2:START-400000', '3-22']) >>> intervals = [Interval.parse(x) for x in ['1:50M-75M', '2:START-400000', '3-22']] Use this interval list to filter: >>> vds_result = vds.filter_intervals(intervals) **Notes** This method takes an argument of :class:`.Interval` or list of :class:`.Interval`. Based on the ``keep`` argument, this method will either restrict to variants in the supplied interval ranges, or remove all variants in those ranges. Note that intervals are left-inclusive, and right-exclusive. The below interval includes the locus ``15:100000`` but not ``15:101000``. >>> interval = Interval.parse('15:100000-101000') This method performs predicate pushdown when ``keep=True``, meaning that data shards that don't overlap any supplied interval will not be loaded at all. This property enables ``filter_intervals`` to be used for reasonably low-latency queries of small ranges of the genome, even on large datasets. Suppose we are interested in variants on chromosome 15 between 100000 and 200000. This implementation with :py:meth:`.filter_variants_expr` may come to mind first: >>> vds_filtered = vds.filter_variants_expr('v.contig == "15" && v.start >= 100000 && v.start < 200000') However, it is **much** faster (and easier!) to use this method: >>> vds_filtered = vds.filter_intervals(Interval.parse('15:100000-200000')) .. note:: A :py:class:`.KeyTable` keyed by interval can be used to filter a dataset efficiently as well. See the documentation for :py:meth:`.filter_variants_table` for an example. This is useful for using interval files to filter a dataset. :param intervals: Interval(s) to keep or remove. :type intervals: :class:`.Interval` or list of :class:`.Interval` :param bool keep: Keep variants overlapping an interval if ``True``, remove variants overlapping an interval if ``False``. :return: Filtered variant dataset. :rtype: :py:class:`.VariantDataset` """ intervals = wrap_to_list(intervals) jvds = self._jvds.filterIntervals([x._jrep for x in intervals], keep) return VariantDataset(self.hc, jvds)
[docs] @handle_py4j @typecheck_method(variants=listof(Variant), keep=bool) def filter_variants_list(self, variants, keep=True): """Filter variants with a list of variants. **Examples** Filter VDS down to a list of variants: >>> vds_filtered = vds.filter_variants_list([Variant.parse('20:10626633:G:GC'), ... Variant.parse('20:10019093:A:G')], keep=True) **Notes** This method performs predicate pushdown when ``keep=True``, meaning that data shards that don't overlap with any supplied variant will not be loaded at all. This property enables ``filter_variants_list`` to be used for reasonably low-latency queries of one or more variants, even on large datasets. :param variants: List of variants to keep or remove. :type variants: list of :py:class:`~hail.representation.Variant` :param bool keep: If true, keep variants in ``variants``, otherwise remove them. :return: Filtered variant dataset. :rtype: :py:class:`.VariantDataset` """ return VariantDataset( self.hc, self._jvds.filterVariantsList( [TVariant()._convert_to_j(v) for v in variants], keep))
[docs] @handle_py4j @typecheck_method(table=KeyTable, keep=bool) def filter_variants_table(self, table, keep=True): """Filter variants with a Variant keyed key table. **Example** Filter variants of a VDS to those appearing in a text file: >>> kt = hc.import_table('data/sample_variants.txt', key='Variant', impute=True) >>> filtered_vds = vds.filter_variants_table(kt, keep=True) Keep all variants whose chromosome and position (locus) appear in a file with a chromosome:position column: >>> kt = hc.import_table('data/locus-table.tsv', impute=True).key_by('Locus') >>> filtered_vds = vds.filter_variants_table(kt, keep=True) Remove all variants which overlap an interval in a UCSC BED file: >>> kt = KeyTable.import_bed('data/file2.bed') >>> filtered_vds = vds.filter_variants_table(kt, keep=False) **Notes** This method takes a key table as an argument, which must be keyed by one of the following: - ``Interval`` - ``Locus`` - ``Variant`` If the key is a ``Variant``, then a variant in the dataset will be kept or removed based on finding a complete match in the table. Be careful, however: ``1:1:A:T`` does not match ``1:1:A:T,C``, and vice versa. If the key is a ``Locus``, then a variant in the dataset will be kept or removed based on finding a locus in the table that matches by chromosome and position. If the key is an ``Interval``, then a variant in the dataset will be kept or removed based on finding an interval in the table that contains the variant's chromosome and position. :param table: Key table object. :type table: :py:class:`.KeyTable` :param bool keep: If true, keep only matches in ``table``, otherwise remove them. :return: Filtered variant dataset. :rtype: :py:class:`.VariantDataset` """ return VariantDataset( self.hc, self._jvds.filterVariantsTable(table._jkt, keep))
@property @handle_py4j def globals(self): """Return global annotations as a Python object. :return: Dataset global annotations. :rtype: :py:class:`~hail.representation.Struct` """ if self._globals is None: self._globals = self.global_schema._convert_to_py(self._jvds.globalAnnotation()) return self._globals
[docs] @handle_py4j @requireTGenotype def grm(self): """Compute the Genetic Relatedness Matrix (GRM). .. include:: requireTGenotype.rst **Examples** >>> km = vds.grm() **Notes** The genetic relationship matrix (GRM) :math:`G` encodes genetic correlation between each pair of samples. It is defined by :math:`G = MM^T` where :math:`M` is a standardized version of the genotype matrix, computed as follows. Let :math:`C` be the :math:`n \\times m` matrix of raw genotypes in the variant dataset, with rows indexed by :math:`n` samples and columns indexed by :math:`m` bialellic autosomal variants; :math:`C_{ij}` is the number of alternate alleles of variant :math:`j` carried by sample :math:`i`, which can be 0, 1, 2, or missing. For each variant :math:`j`, the sample alternate allele frequency :math:`p_j` is computed as half the mean of the non-missing entries of column :math:`j`. Entries of :math:`M` are then mean-centered and variance-normalized as .. math:: M_{ij} = \\frac{C_{ij}-2p_j}{\sqrt{2p_j(1-p_j)m}}, with :math:`M_{ij} = 0` for :math:`C_{ij}` missing (i.e. mean genotype imputation). This scaling normalizes genotype variances to a common value :math:`1/m` for variants in Hardy-Weinberg equilibrium and is further motivated in the paper `Patterson, Price and Reich, 2006 <http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.0020190>`__. (The resulting amplification of signal from the low end of the allele frequency spectrum will also introduce noise for rare variants; common practice is to filter out variants with minor allele frequency below some cutoff.) The factor :math:`1/m` gives each sample row approximately unit total variance (assuming linkage equilibrium) so that the diagonal entries of the GRM are approximately 1. Equivalently, .. math:: G_{ik} = \\frac{1}{m} \\sum_{j=1}^m \\frac{(C_{ij}-2p_j)(C_{kj}-2p_j)}{2 p_j (1-p_j)} :return: Genetic Relatedness Matrix for all samples. :rtype: :py:class:`KinshipMatrix` """ jkm = self._jvdf.grm() return KinshipMatrix(jkm)
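For intuition, the standardization and matrix product above can be reproduced outside Hail on a toy genotype matrix. The NumPy sketch below illustrates the formula only, with hard-coded data and mean imputation via zeroing; it is not how Hail computes the GRM internally.

.. code-block:: python

    import numpy as np

    # toy genotype matrix C: 3 samples x 4 variants; NaN marks a missing call
    C = np.array([[0, 1, 2, 1],
                  [1, 1, 0, np.nan],
                  [2, 0, 1, 1]], dtype=float)

    n, m = C.shape
    p = np.nanmean(C, axis=0) / 2.0                      # per-variant alternate allele frequency
    M = (C - 2.0 * p) / np.sqrt(2.0 * p * (1.0 - p) * m) # mean-centered, variance-normalized
    M = np.nan_to_num(M)                                 # mean imputation: missing entries become 0
    G = M.dot(M.T)                                       # genetic relatedness matrix, n x n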
[docs] @handle_py4j @requireTGenotype def hardcalls(self): """Drop all genotype fields except the GT field. .. include:: requireTGenotype.rst A hard-called variant dataset is about two orders of magnitude smaller than a standard sequencing dataset. Use this method to create a smaller, faster representation for downstream processing that only requires the GT field. :return: Variant dataset with no genotype metadata. :rtype: :py:class:`.VariantDataset` """ return VariantDataset(self.hc, self._jvdf.hardCalls())
[docs] @handle_py4j @requireTGenotype @typecheck_method(maf=nullable(strlike), bounded=bool, min=nullable(numeric), max=nullable(numeric)) def ibd(self, maf=None, bounded=True, min=None, max=None): """Compute matrix of identity-by-descent estimations. .. include:: requireTGenotype.rst **Examples** To calculate a full IBD matrix, using minor allele frequencies computed from the variant dataset itself: >>> vds.ibd() To calculate an IBD matrix containing only pairs of samples with ``PI_HAT`` in [0.2, 0.9], using minor allele frequencies stored in ``va.panel_maf``: >>> vds.ibd(maf='va.panel_maf', min=0.2, max=0.9) **Notes** The implementation is based on the IBD algorithm described in the `PLINK paper <http://www.ncbi.nlm.nih.gov/pmc/articles/PMC1950838>`__. :py:meth:`~hail.VariantDataset.ibd` requires the dataset to be bi-allelic (otherwise run :py:meth:`~hail.VariantDataset.split_multi` or otherwise run :py:meth:`~hail.VariantDataset.filter_multi`) and does not perform LD pruning. Linkage disequilibrium may bias the result so consider filtering variants first. The resulting :py:class:`.KeyTable` entries have the type: *{ i: String, j: String, ibd: { Z0: Double, Z1: Double, Z2: Double, PI_HAT: Double }, ibs0: Long, ibs1: Long, ibs2: Long }*. The key list is: `*i: String, j: String*`. Conceptually, the output is a symmetric, sample-by-sample matrix. The output key table has the following form .. code-block:: text i j ibd.Z0 ibd.Z1 ibd.Z2 ibd.PI_HAT ibs0 ibs1 ibs2 sample1 sample2 1.0000 0.0000 0.0000 0.0000 ... sample1 sample3 1.0000 0.0000 0.0000 0.0000 ... sample1 sample4 0.6807 0.0000 0.3193 0.3193 ... sample1 sample5 0.1966 0.0000 0.8034 0.8034 ... :param maf: Expression for the minor allele frequency. :type maf: str or None :param bool bounded: Forces the estimations for Z0, Z1, Z2, and PI_HAT to take on biologically meaningful values (in the range [0,1]). :param min: Sample pairs with a PI_HAT below this value will not be included in the output. Must be in [0,1]. :type min: float or None :param max: Sample pairs with a PI_HAT above this value will not be included in the output. Must be in [0,1]. :type max: float or None :return: A :py:class:`.KeyTable` mapping pairs of samples to their IBD statistics :rtype: :py:class:`.KeyTable` """ return KeyTable(self.hc, self._jvdf.ibd(joption(maf), bounded, joption(min), joption(max)))
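For downstream filtering or plotting, the returned key table can be written to a TSV; the sketch below assumes the key table's ``export`` method and uses arbitrary thresholds:

>>> vds.ibd(min=0.2, max=0.9).export('output/ibd.tsv')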
[docs] @handle_py4j @requireTGenotype @typecheck_method(threshold=numeric, tiebreaking_expr=nullable(strlike), maf=nullable(strlike), bounded=bool) def ibd_prune(self, threshold, tiebreaking_expr=None, maf=None, bounded=True): """ Prune samples from the :py:class:`.VariantDataset` based on :py:meth:`~hail.VariantDataset.ibd` PI_HAT measures of relatedness. .. include:: requireTGenotype.rst **Examples** Prune samples so that no two have a PI_HAT value greater than or equal to 0.6. >>> pruned_vds = vds.ibd_prune(0.6) Prune samples so that no two have a PI_HAT value greater than or equal to 0.5, with a tiebreaking expression that selects cases over controls: >>> pruned_vds = vds.ibd_prune( ... 0.5, ... tiebreaking_expr="if (sa1.isCase && !sa2.isCase) -1 else if (!sa1.isCase && sa2.isCase) 1 else 0") **Notes** The variant dataset returned may change in near future as a result of algorithmic improvements. The current algorithm is very efficient on datasets with many small families, less so on datasets with large families. Currently, the algorithm works by deleting the person from each family who has the highest number of relatives, and iterating until no two people have a PI_HAT value greater than that specified. If two people within a family have the same number of relatives, the tiebreaking_expr given will be used to determine which sample gets deleted. The tiebreaking_expr namespace has the following variables available: - ``s1``: The first sample id. - ``sa1``: The annotations associated with s1. - ``s2``: The second sample id. - ``sa2``: The annotations associated with s2. The tiebreaking_expr returns an integer expressing the preference for one sample over the other. Any negative integer expresses a preference for keeping ``s1``. Any positive integer expresses a preference for keeping ``s2``. A zero expresses no preference. This function must induce a `preorder <https://en.wikipedia.org/wiki/Preorder>`__ on the samples, in particular: - ``tiebreaking_expr(sample1, sample2)`` must equal ``-1 * tie breaking_expr(sample2, sample1)``, which evokes the common sense understanding that if ``x < y`` then `y > x``. - ``tiebreaking_expr(sample1, sample1)`` must equal 0, i.e. ``x = x`` - if sample1 is preferred to sample2 and sample2 is preferred to sample3, then sample1 must also be preferred to sample3 The last requirement is only important if you have three related samples with the same number of relatives and all three are related to one another. In cases like this one, it is important that either: - one of the three is preferred to **both** other ones, or - there is no preference among the three samples :param threshold: The desired maximum PI_HAT value between any pair of samples. :param tiebreaking_expr: Expression used to choose between two samples with the same number of relatives. :param maf: Expression for the minor allele frequency. :param bounded: Forces the estimations for Z0, Z1, Z2, and PI_HAT to take on biologically meaningful values (in the range [0,1]). :return: A :py:class:`.VariantDataset` containing no samples with a PI_HAT greater than threshold. :rtype: :py:class:`.VariantDataset` """ return VariantDataset(self.hc, self._jvdf.ibdPrune(threshold, joption(tiebreaking_expr), joption(maf), bounded))
[docs] @handle_py4j @requireTGenotype @typecheck_method(maf_threshold=numeric, include_par=bool, female_threshold=numeric, male_threshold=numeric, pop_freq=nullable(strlike)) def impute_sex(self, maf_threshold=0.0, include_par=False, female_threshold=0.2, male_threshold=0.8, pop_freq=None): """Impute sex of samples by calculating inbreeding coefficient on the X chromosome. .. include:: requireTGenotype.rst **Examples** Remove samples where imputed sex does not equal reported sex: >>> imputed_sex_vds = (vds.impute_sex() ... .annotate_samples_expr('sa.sexcheck = sa.pheno.isFemale == sa.imputesex.isFemale') ... .filter_samples_expr('sa.sexcheck || isMissing(sa.sexcheck)')) **Notes** We have used the same implementation as `PLINK v1.7 <http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#sexcheck>`__. 1. X chromosome variants are selected from the VDS: ``v.contig == "X" || v.contig == "23"`` 2. Variants with a minor allele frequency less than the threshold given by ``maf-threshold`` are removed 3. Variants in the pseudoautosomal region `(X:60001-2699520) || (X:154931044-155260560)` are included if the ``include_par`` optional parameter is set to true. 4. The minor allele frequency (maf) per variant is calculated. 5. For each variant and sample with a non-missing genotype call, :math:`E`, the expected number of homozygotes (from population MAF), is computed as :math:`1.0 - (2.0*maf*(1.0-maf))`. 6. For each variant and sample with a non-missing genotype call, :math:`O`, the observed number of homozygotes, is computed as `0 = heterozygote; 1 = homozygote` 7. For each variant and sample with a non-missing genotype call, :math:`N` is incremented by 1 8. For each sample, :math:`E`, :math:`O`, and :math:`N` are combined across variants 9. :math:`F` is calculated by :math:`(O - E) / (N - E)` 10. A sex is assigned to each sample with the following criteria: `F < 0.2 => Female; F > 0.8 => Male`. Use ``female-threshold`` and ``male-threshold`` to change this behavior. **Annotations** The below annotations can be accessed with ``sa.imputesex``. - **isFemale** (*Boolean*) -- True if the imputed sex is female, false if male, missing if undetermined - **Fstat** (*Double*) -- Inbreeding coefficient - **nTotal** (*Long*) -- Total number of variants considered - **nCalled** (*Long*) -- Number of variants with a genotype call - **expectedHoms** (*Double*) -- Expected number of homozygotes - **observedHoms** (*Long*) -- Observed number of homozygotes :param float maf_threshold: Minimum minor allele frequency threshold. :param bool include_par: Include pseudoautosomal regions. :param float female_threshold: Samples are called females if F < femaleThreshold :param float male_threshold: Samples are called males if F > maleThreshold :param str pop_freq: Variant annotation for estimate of MAF. If None, MAF will be computed. :return: Annotated dataset. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvdf.imputeSex(maf_threshold, include_par, female_threshold, male_threshold, joption(pop_freq)) return VariantDataset(self.hc, jvds)
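Steps 5-9 amount to an observed-versus-expected homozygosity count per sample. A toy Python calculation with made-up per-variant MAFs and homozygosity indicators (purely illustrative, not the implementation):

.. code-block:: python

    # per-variant minor allele frequencies and 0/1 homozygosity flags for one sample
    mafs = [0.10, 0.25, 0.40, 0.05]
    obs_hom = [1, 0, 1, 1]                  # 1 = homozygous call, 0 = heterozygous call

    E = sum(1.0 - 2.0 * maf * (1.0 - maf) for maf in mafs)   # expected homozygotes
    O = sum(obs_hom)                                         # observed homozygotes
    N = len(mafs)                                            # variants with a call
    F = (O - E) / (N - E)                                    # ~0.115 here; F < 0.2 would be called female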
[docs] @handle_py4j @typecheck_method(right=vds_type) def join(self, right): """Join two variant datasets. **Notes** This method performs an inner join on variants, concatenates samples, and takes variant and global annotations from the left dataset (self). The datasets must have distinct samples, the same sample schema, and the same split status (both split or both multi-allelic). :param right: right-hand variant dataset :type right: :py:class:`.VariantDataset` :return: Joined variant dataset :rtype: :py:class:`.VariantDataset` """ return VariantDataset(self.hc, self._jvds.join(right._jvds))
[docs] @handle_py4j @typecheck(datasets=tupleof(vds_type)) def union(*datasets): """Take the union of datasets vertically (include all variants). **Examples** .. testsetup:: vds_autosomal = vds vds_chromX = vds vds_chromY = vds Union two datasets: >>> vds_union = vds_autosomal.union(vds_chromX) Given a list of datasets, union them all: >>> all_vds = [vds_autosomal, vds_chromX, vds_chromY] The following three syntaxes are equivalent: >>> vds_union1 = vds_autosomal.union(vds_chromX, vds_chromY) >>> vds_union2 = all_vds[0].union(*all_vds[1:]) >>> vds_union3 = VariantDataset.union(*all_vds) **Notes** In order to combine two datasets, these requirements must be met: - the samples must match. - the variant annotation schemas must match (field order within structs matters). - the cell (genotype) schemas must match (field order within structs matters). The column annotations in the resulting dataset are simply the column annotations from the first dataset; the column annotation schemas do not need to match. This method can trigger a shuffle if partitions from two datasets overlap. :param datasets: Datasets to combine. :type datasets: tuple of :class:`.VariantDataset` :return: Dataset with variants from all datasets. :rtype: :class:`.VariantDataset` """ if len(datasets) == 0: raise ValueError('Expected at least one argument') elif len(datasets) == 1: return datasets[0] else: return VariantDataset(Env.hc(), Env.hail().variant.VariantSampleMatrix.union([d._jvds for d in datasets]))
[docs] @handle_py4j @requireTGenotype @typecheck_method(r2=numeric, window=integral, memory_per_core=integral, num_cores=integral) def ld_prune(self, r2=0.2, window=1000000, memory_per_core=256, num_cores=1): """Prune variants in linkage disequilibrium (LD). .. include:: requireTGenotype.rst Requires :py:class:`~hail.VariantDataset.was_split` equals True. **Examples** Export the set of common LD pruned variants to a file: >>> vds_result = (vds.variant_qc() ... .filter_variants_expr("va.qc.AF >= 0.05 && va.qc.AF <= 0.95") ... .ld_prune() ... .export_variants("output/ldpruned.variants", "v")) **Notes** Variants are pruned in each contig from smallest to largest start position. The LD pruning algorithm is as follows: .. code-block:: python pruned_set = [] for v1 in contig: keep = True for v2 in pruned_set: if ((v1.position - v2.position) <= window and correlation(v1, v2) >= r2): keep = False if keep: pruned_set.append(v1) The parameter ``window`` defines the maximum distance in base pairs between two variants to check whether the variants are independent (:math:`R^2` < ``r2``) where ``r2`` is the maximum :math:`R^2` allowed. :math:`R^2` is defined as the square of `Pearson's correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`__ :math:`{\\rho}_{x,y}` between the two genotype vectors :math:`{\\mathbf{x}}` and :math:`{\\mathbf{y}}`. .. math:: {\\rho}_{x,y} = \\frac{\\mathrm{Cov}(X,Y)}{\\sigma_X \\sigma_Y} :py:meth:`.ld_prune` with default arguments is equivalent to ``plink --indep-pairwise 1000kb 1 0.2``. The list of pruned variants returned by Hail and PLINK will differ because Hail mean-imputes missing values and tests pairs of variants in a different order than PLINK. Be sure to provide enough disk space per worker because :py:meth:`.ld_prune` `persists <http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence>`__ up to 3 copies of the data to both memory and disk. The amount of disk space required will depend on the size and minor allele frequency of the input data and the prune parameters ``r2`` and ``window``. The number of bytes stored in memory per variant is about ``nSamples / 4 + 50``. .. warning:: The variants in the pruned set are not guaranteed to be identical each time :py:meth:`.ld_prune` is run. We recommend running :py:meth:`.ld_prune` once and exporting the list of LD pruned variants using :py:meth:`.export_variants` for future use. :param float r2: Maximum :math:`R^2` threshold between two variants in the pruned set within a given window. :param int window: Width of window in base-pairs for computing pair-wise :math:`R^2` values. :param int memory_per_core: Total amount of memory available for each core in MB. If unsure, use the default value. :param int num_cores: The number of cores available. Equivalent to the total number of workers times the number of cores per worker. :return: Variant dataset filtered to those variants which remain after LD pruning. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvdf.ldPrune(r2, window, num_cores, memory_per_core) return VariantDataset(self.hc, jvds)
[docs] @handle_py4j @requireTGenotype @typecheck_method(force_local=bool) def ld_matrix(self, force_local=False): """Computes the linkage disequilibrium (correlation) matrix for the variants in this VDS. **Examples** >>> ld_mat = vds.ld_matrix() **Notes** Each entry (i, j) in the LD matrix gives the :math:`r` value between variants i and j, defined as `Pearson's correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`__ :math:`\\rho_{x_i,x_j}` between the two genotype vectors :math:`x_i` and :math:`x_j`. .. math:: \\rho_{x_i,x_j} = \\frac{\\mathrm{Cov}(X_i,X_j)}{\\sigma_{X_i} \\sigma_{X_j}} Also note that variants with zero variance (:math:`\\sigma = 0`) will be dropped from the matrix. .. caution:: The matrix returned by this function can easily be very large with most entries near zero (for example, entries between variants on different chromosomes in a homogenous population). Most likely you'll want to reduce the number of variants with methods like :py:meth:`.sample_variants`, :py:meth:`.filter_variants_expr`, or :py:meth:`.ld_prune` before calling this unless your dataset is very small. :param bool force_local: If true, the LD matrix is computed using local matrix multiplication on the Spark driver. This may improve performance when the genotype matrix is small enough to easily fit in local memory. If false, the LD matrix is computed using distributed matrix multiplication if the number of genotypes exceeds :math:`5000^2` and locally otherwise. :return: Matrix of r values between pairs of variants. :rtype: :py:class:`LDMatrix` """ jldm = self._jvdf.ldMatrix(force_local) return LDMatrix(jldm)
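The entry-wise definition can be sanity-checked directly: each entry is the Pearson correlation of two genotype vectors. A toy NumPy check (illustrative vectors, not Hail's distributed computation):

.. code-block:: python

    import numpy as np

    x_i = np.array([0., 1., 2., 1., 0., 2.])   # genotype counts at variant i
    x_j = np.array([0., 1., 1., 1., 0., 2.])   # genotype counts at variant j

    r = np.corrcoef(x_i, x_j)[0, 1]            # the (i, j) entry of the LD matrix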
[docs] @handle_py4j @requireTGenotype @typecheck_method(y=strlike, covariates=listof(strlike), root=strlike, use_dosages=bool, min_ac=integral, min_af=numeric) def linreg(self, y, covariates=[], root='va.linreg', use_dosages=False, min_ac=1, min_af=0.0): r"""Test each variant for association using linear regression. .. include:: requireTGenotype.rst **Examples** Run linear regression per variant using a phenotype and two covariates stored in sample annotations: >>> vds_result = vds.linreg('sa.pheno.height', covariates=['sa.pheno.age', 'sa.pheno.isFemale']) **Notes** The :py:meth:`.linreg` method computes, for each variant, statistics of the :math:`t`-test for the genotype coefficient of the linear function of best fit from sample genotype and covariates to quantitative phenotype or case-control status. Hail only includes samples for which phenotype and all covariates are defined. For each variant, missing genotypes are imputed as the mean of called genotypes. By default, genotype values are given by hard call genotypes (``g.gt``). If ``use_dosages=True``, then genotype values are defined by the dosage :math:`\mathrm{P}(\mathrm{Het}) + 2 \cdot \mathrm{P}(\mathrm{HomVar})`. For Phred-scaled values, :math:`\mathrm{P}(\mathrm{Het})` and :math:`\mathrm{P}(\mathrm{HomVar})` are calculated by normalizing the PL likelihoods (converted from the Phred-scale) to sum to 1. Assuming there are sample annotations ``sa.pheno.height``, ``sa.pheno.age``, ``sa.pheno.isFemale``, and ``sa.cov.PC1``, the code: >>> vds_result = vds.linreg('sa.pheno.height', covariates=['sa.pheno.age', 'sa.pheno.isFemale', 'sa.cov.PC1']) considers a model of the form .. math:: \mathrm{height} = \beta_0 + \beta_1 \, \mathrm{gt} + \beta_2 \, \mathrm{age} + \beta_3 \, \mathrm{isFemale} + \beta_4 \, \mathrm{PC1} + \varepsilon, \quad \varepsilon \sim \mathrm{N}(0, \sigma^2) where the genotype :math:`\mathrm{gt}` is coded as :math:`0` for HomRef, :math:`1` for Het, and :math:`2` for HomVar, and the Boolean covariate :math:`\mathrm{isFemale}` is coded as :math:`1` for true (female) and :math:`0` for false (male). The null model sets :math:`\beta_1 = 0`. Those variants that don't vary across the included samples (e.g., all genotypes are HomRef) will have missing annotations. One can further restrict computation to those variants with at least :math:`k` observed alternate alleles (AC) or alternate allele frequency (AF) at least :math:`p` in the included samples using the options ``min_ac=k`` or ``min_af=p``, respectively. Unlike the :py:meth:`.filter_variants_expr` method, these filters do not remove variants from the underlying variant dataset; rather the linear regression annotations for variants with low AC or AF are set to missing. Adding both filters is equivalent to applying the more stringent of the two. Phenotype and covariate sample annotations may also be specified using `programmatic expressions <exprlang.html>`__ without identifiers, such as: >>> vds_result = vds.linreg('if (sa.pheno.isFemale) sa.pheno.age else (2 * sa.pheno.age + 10)') For Boolean covariate types, true is coded as 1 and false as 0. In particular, for the sample annotation ``sa.fam.isCase`` added by importing a FAM file with case-control phenotype, case is 1 and control is 0. The standard least-squares linear regression model is derived in Section 3.2 of `The Elements of Statistical Learning, 2nd Edition <http://statweb.stanford.edu/~tibs/ElemStatLearn/printings/ESLII_print10.pdf>`__.
See equation 3.12 for the t-statistic which follows the t-distribution with :math:`n - k - 2` degrees of freedom, under the null hypothesis of no effect, with :math:`n` samples and :math:`k` covariates in addition to genotype and intercept. **Annotations** With the default root, the following four variant annotations are added. - **va.linreg.beta** (*Double*) -- fit genotype coefficient, :math:`\hat\beta_1` - **va.linreg.se** (*Double*) -- estimated standard error, :math:`\widehat{\mathrm{se}}` - **va.linreg.tstat** (*Double*) -- :math:`t`-statistic, equal to :math:`\hat\beta_1 / \widehat{\mathrm{se}}` - **va.linreg.pval** (*Double*) -- :math:`p`-value :param str y: Response expression. :param covariates: list of covariate expressions. :type covariates: list of str :param str root: Variant annotation path to store result of linear regression. :param bool use_dosages: If true, use dosage genotypes rather than hard call genotypes. :param int min_ac: Minimum alternate allele count. :param float min_af: Minimum alternate allele frequency. :return: Variant dataset with linear regression variant annotations. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvdf.linreg(y, jarray(Env.jvm().java.lang.String, covariates), root, use_dosages, min_ac, min_af) return VariantDataset(self.hc, jvds)
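The per-variant test is ordinary least squares with genotype as one column of the design matrix. The sketch below recovers beta, se, tstat, and pval for a single made-up variant with one covariate; it assumes NumPy and SciPy are available and is meant only to illustrate the statistics described above, not Hail's implementation.

.. code-block:: python

    import numpy as np
    from scipy import stats

    y   = np.array([1.2, 0.8, 2.5, 1.9, 3.1, 2.2])   # phenotype for n = 6 samples
    gt  = np.array([0.0, 0.0, 1.0, 1.0, 2.0, 1.0])   # genotype counts
    cov = np.array([0.1, -0.5, 0.3, 0.0, 0.9, 0.4])  # one covariate

    X = np.column_stack([np.ones_like(y), gt, cov])  # intercept, genotype, covariate
    beta, rss, rank, sv = np.linalg.lstsq(X, y, rcond=None)

    d = len(y) - 1 - 2                               # n - k - 2 degrees of freedom (k = 1 covariate)
    se = np.sqrt((rss[0] / d) * np.linalg.inv(X.T.dot(X))[1, 1])
    tstat = beta[1] / se                             # genotype t-statistic
    pval = 2 * stats.t.sf(abs(tstat), d)             # two-sided p-value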
[docs] @handle_py4j @typecheck_method(key_name=strlike, variant_keys=strlike, single_key=bool, agg_expr=strlike, y=strlike, covariates=listof(strlike)) def linreg_burden(self, key_name, variant_keys, single_key, agg_expr, y, covariates=[]): r"""Test each keyed group of variants for association by aggregating (collapsing) genotypes and applying the linear regression model. .. include:: requireTGenotype.rst **Examples** Run a gene burden test using linear regression on the maximum genotype per gene. Here ``va.genes`` is a variant annotation of type Set[String] giving the set of genes containing the variant (see **Extended example** below for a deep dive): >>> linreg_kt, sample_kt = (hc.read('data/example_burden.vds') ... .linreg_burden(key_name='gene', ... variant_keys='va.genes', ... single_key=False, ... agg_expr='gs.map(g => g.gt).max()', ... y='sa.burden.pheno', ... covariates=['sa.burden.cov1', 'sa.burden.cov2'])) Run a gene burden test using linear regression on the weighted sum of genotypes per gene. Here ``va.gene`` is a variant annotation of type String giving a single gene per variant (or no gene if missing), and ``va.weight`` is a numeric variant annotation: >>> linreg_kt, sample_kt = (hc.read('data/example_burden.vds') ... .linreg_burden(key_name='gene', ... variant_keys='va.gene', ... single_key=True, ... agg_expr='gs.map(g => va.weight * g.gt).sum()', ... y='sa.burden.pheno', ... covariates=['sa.burden.cov1', 'sa.burden.cov2'])) To use a weighted sum of genotypes with missing genotypes mean-imputed rather than ignored, set ``agg_expr='gs.map(g => va.weight * orElse(g.gt.toDouble, 2 * va.qc.AF)).sum()'`` where ``va.qc.AF`` is the allele frequency over those samples that have no missing phenotype or covariates. .. caution:: With ``single_key=False``, ``variant_keys`` expects a variant annotation of Set or Array type, in order to allow each variant to have zero, one, or more keys (for example, the same variant may appear in multiple genes). Unlike with type Set, if the same key appears twice in a variant annotation of type Array, then that variant will be counted twice in that key's group. With ``single_key=True``, ``variant_keys`` expects a variant annotation whose value is itself the key of interest. In both cases, variants with missing keys are ignored. **Notes** This method modifies :py:meth:`.linreg` by replacing the genotype covariate per variant and sample with an aggregated (i.e., collapsed) score per key and sample. This numeric score is computed from the sample's genotypes and annotations over all variants with that key. Conceptually, the method proceeds as follows: 1) Filter to the set of samples for which the phenotype and all covariates are defined. 2) For each key and sample, aggregate genotypes across variants with that key to produce a numeric score. ``agg_expr`` must be of numeric type and the following symbols are in scope: - ``s`` (*Sample*): sample - ``sa``: sample annotations - ``global``: global annotations - ``gs`` (*Aggregable[Genotype]*): aggregable of :ref:`genotype` for sample ``s`` Note that ``v``, ``va``, and ``g`` are accessible through `Aggregable methods <https://hail.is/hail/types.html#aggregable>`_ on ``gs``. The resulting **sample key table** has key column ``key_name`` and a numeric column of scores for each sample named by the sample ID. 3) For each key, fit the linear regression model using the supplied phenotype and covariates. The model is that of :py:meth:`.linreg` with sample genotype ``gt`` replaced by the score in the sample key table.
For each key, missing scores are mean-imputed across all samples. The resulting **linear regression key table** has the following columns: - value of ``key_name`` (*String*) -- descriptor of variant group key (key column) - **beta** (*Double*) -- fit coefficient, :math:`\hat\beta_1` - **se** (*Double*) -- estimated standard error, :math:`\widehat{\mathrm{se}}` - **tstat** (*Double*) -- :math:`t`-statistic, equal to :math:`\hat\beta_1 / \widehat{\mathrm{se}}` - **pval** (*Double*) -- :math:`p`-value :py:meth:`.linreg_burden` returns both the linear regression key table and the sample key table. **Extended example** Let's walk through these steps in the ``max()`` toy example above. There are six samples with the following annotations: +--------+-------+------+------+ | Sample | pheno | cov1 | cov2 | +========+=======+======+======+ | A | 0 | 0 | -1 | +--------+-------+------+------+ | B | 0 | 2 | 3 | +--------+-------+------+------+ | C | 1 | 1 | 5 | +--------+-------+------+------+ | D | 1 | -2 | 0 | +--------+-------+------+------+ | E | 1 | -2 | -4 | +--------+-------+------+------+ | F | 1 | 4 | 3 | +--------+-------+------+------+ There are three variants with the following ``gt`` values: +---------+---+---+---+---+---+---+ | Variant | A | B | C | D | E | F | +=========+===+===+===+===+===+===+ | 1:1:A:C | 0 | 1 | 0 | 0 | 0 | 1 | +---------+---+---+---+---+---+---+ | 1:2:C:T | . | 2 | . | 2 | 0 | 0 | +---------+---+---+---+---+---+---+ | 1:3:G:C | 0 | . | 1 | 1 | 1 | . | +---------+---+---+---+---+---+---+ The ``va.genes`` annotation of type Set[String] on ``example_burden.vds`` was created using :py:meth:`.annotate_variants_table` with ``product=True`` on the interval list: .. literalinclude:: data/genes.interval_list So there are three overlapping genes: gene A contains two variants, gene B contains one variant, and gene C contains all three variants. +--------+---------+---------+---------+ | gene | 1:1:A:C | 1:2:C:T | 1:3:G:C | +========+=========+=========+=========+ | geneA | X | X | | +--------+---------+---------+---------+ | geneB | | X | | +--------+---------+---------+---------+ | geneC | X | X | X | +--------+---------+---------+---------+ Therefore :py:meth:`.annotate_variants_table` with ``product=True`` creates a variant annotation of type Set[String] with values ``Set('geneA', 'geneB')``, ``Set('geneB')``, and ``Set('geneA', 'geneB', 'geneC')``. So the sample aggregation key table is: +-----+---+---+---+---+---+---+ | gene| A | B | C | D | E | F | +=====+===+===+===+===+===+===+ |geneA| 0| 2| 0| 2| 0| 1| +-----+---+---+---+---+---+---+ |geneB| NA| 2| NA| 2| 0| 0| +-----+---+---+---+---+---+---+ |geneC| 0| 2| 1| 2| 1| 1| +-----+---+---+---+---+---+---+ Linear regression is done for each row using the supplied phenotype, covariates, and implicit intercept. The resulting linear regression key table is: +------+-------+------+-------+------+ | gene | beta | se | tstat | pval | +======+=======+======+=======+======+ | geneA| -0.084| 0.368| -0.227| 0.841| +------+-------+------+-------+------+ | geneB| -0.542| 0.335| -1.617| 0.247| +------+-------+------+-------+------+ | geneC| 0.075| 0.515| 0.145| 0.898| +------+-------+------+-------+------+ :param str key_name: Name to assign to key column of returned key tables. :param str variant_keys: Variant annotation path for the TArray or TSet of keys associated to each variant. :param bool single_key: if true, ``variant_keys`` is interpreted as a single (or missing) key per variant, rather than as a collection of keys. 
:param str agg_expr: Sample aggregation expression (per key). :param str y: Response expression. :param covariates: list of covariate expressions. :type covariates: list of str :return: Tuple of linear regression key table and sample aggregation key table. :rtype: (:py:class:`.KeyTable`, :py:class:`.KeyTable`) """ r = self._jvdf.linregBurden(key_name, variant_keys, single_key, agg_expr, y, jarray(Env.jvm().java.lang.String, covariates)) linreg_kt = KeyTable(self.hc, r._1()) sample_kt = KeyTable(self.hc, r._2()) return linreg_kt, sample_kt
[docs] @handle_py4j @requireTGenotype @typecheck_method(ys=listof(strlike), covariates=listof(strlike), root=strlike, use_dosages=bool, min_ac=integral, min_af=numeric) def linreg_multi_pheno(self, ys, covariates=[], root='va.linreg', use_dosages=False, min_ac=1, min_af=0.0): r"""Test each variant for association with multiple phenotypes using linear regression. This method runs linear regression for multiple phenotypes more efficiently than looping over :py:meth:`.linreg`. .. warning:: :py:meth:`.linreg_multi_pheno` uses the same set of samples for each phenotype, namely the set of samples for which **all** phenotypes and covariates are defined. **Annotations** With the default root, the following four variant annotations are added. The indexing of these annotations corresponds to that of ``ys``. - **va.linreg.beta** (*Array[Double]*) -- array of fit genotype coefficients, :math:`\hat\beta_1` - **va.linreg.se** (*Array[Double]*) -- array of estimated standard errors, :math:`\widehat{\mathrm{se}}` - **va.linreg.tstat** (*Array[Double]*) -- array of :math:`t`-statistics, equal to :math:`\hat\beta_1 / \widehat{\mathrm{se}}` - **va.linreg.pval** (*Array[Double]*) -- array of :math:`p`-values :param ys: list of one or more response expressions. :type ys: list of str :param covariates: list of covariate expressions. :type covariates: list of str :param str root: Variant annotation path to store result of linear regression. :param bool use_dosages: If true, use dosage genotypes rather than hard call genotypes. :param int min_ac: Minimum alternate allele count. :param float min_af: Minimum alternate allele frequency. :return: Variant dataset with linear regression variant annotations. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvdf.linregMultiPheno(jarray(Env.jvm().java.lang.String, ys), jarray(Env.jvm().java.lang.String, covariates), root, use_dosages, min_ac, min_af) return VariantDataset(self.hc, jvds)
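A minimal usage sketch, assuming hypothetical sample annotations ``sa.pheno.height`` and ``sa.pheno.bmi`` in the style of the other examples:

>>> vds_result = vds.linreg_multi_pheno(['sa.pheno.height', 'sa.pheno.bmi'],
...                                     covariates=['sa.pheno.age', 'sa.pheno.isFemale'])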
[docs] @handle_py4j @requireTGenotype @typecheck_method(ys=listof(strlike), covariates=listof(strlike), root=strlike, use_dosages=bool, variant_block_size=integral) def linreg3(self, ys, covariates=[], root='va.linreg', use_dosages=False, variant_block_size=16): r"""Test each variant for association with multiple phenotypes using linear regression. This method runs linear regression for multiple phenotypes more efficiently than looping over :py:meth:`.linreg`. This method is more efficient than :py:meth:`.linreg_multi_pheno` but doesn't implicitly filter on allele count or allele frequency. .. warning:: :py:meth:`.linreg3` uses the same set of samples for each phenotype, namely the set of samples for which **all** phenotypes and covariates are defined. **Annotations** With the default root, the following four variant annotations are added. The indexing of the array annotations corresponds to that of ``ys``. - **va.linreg.nCompleteSamples** (*Int*) -- number of samples used - **va.linreg.AC** (*Double*) -- sum of the genotype values ``x`` - **va.linreg.ytx** (*Array[Double]*) -- array of dot products of each phenotype vector ``y`` with the genotype vector ``x`` - **va.linreg.beta** (*Array[Double]*) -- array of fit genotype coefficients, :math:`\hat\beta_1` - **va.linreg.se** (*Array[Double]*) -- array of estimated standard errors, :math:`\widehat{\mathrm{se}}` - **va.linreg.tstat** (*Array[Double]*) -- array of :math:`t`-statistics, equal to :math:`\hat\beta_1 / \widehat{\mathrm{se}}` - **va.linreg.pval** (*Array[Double]*) -- array of :math:`p`-values :param ys: list of one or more response expressions. :type ys: list of str :param covariates: list of covariate expressions. :type covariates: list of str :param str root: Variant annotation path to store result of linear regression. :param bool use_dosages: If true, use dosage genotypes rather than hard call genotypes. :param int variant_block_size: Number of variant regressions to perform simultaneously. Larger block size requires more memory. :return: Variant dataset with linear regression variant annotations. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvdf.linreg3(jarray(Env.jvm().java.lang.String, ys), jarray(Env.jvm().java.lang.String, covariates), root, use_dosages, variant_block_size) return VariantDataset(self.hc, jvds)
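A minimal usage sketch with the same hypothetical annotations, raising the block size to run more variant regressions simultaneously at the cost of memory:

>>> vds_result = vds.linreg3(['sa.pheno.height', 'sa.pheno.bmi'],
...                          covariates=['sa.pheno.age', 'sa.pheno.isFemale'],
...                          variant_block_size=32)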
[docs] @handle_py4j @requireTGenotype @typecheck_method(kinshipMatrix=KinshipMatrix, y=strlike, covariates=listof(strlike), global_root=strlike, va_root=strlike, run_assoc=bool, use_ml=bool, delta=nullable(numeric), sparsity_threshold=numeric, use_dosages=bool, n_eigs=nullable(integral), dropped_variance_fraction=(nullable(float))) def lmmreg(self, kinshipMatrix, y, covariates=[], global_root="global.lmmreg", va_root="va.lmmreg", run_assoc=True, use_ml=False, delta=None, sparsity_threshold=1.0, use_dosages=False, n_eigs=None, dropped_variance_fraction=None): """Use a kinship-based linear mixed model to estimate the genetic component of phenotypic variance (narrow-sense heritability) and optionally test each variant for association. .. include:: requireTGenotype.rst **Examples** Suppose the variant dataset saved at *data/example_lmmreg.vds* has a Boolean variant annotation ``va.useInKinship`` and numeric or Boolean sample annotations ``sa.pheno``, ``sa.cov1``, ``sa.cov2``. Then the :py:meth:`.lmmreg` function in >>> assoc_vds = hc.read("data/example_lmmreg.vds") >>> kinship_matrix = assoc_vds.filter_variants_expr('va.useInKinship').rrm() >>> lmm_vds = assoc_vds.lmmreg(kinship_matrix, 'sa.pheno', ['sa.cov1', 'sa.cov2']) will execute the following four steps in order: 1) filter the samples in the given kinship matrix to those for which ``sa.pheno``, ``sa.cov1``, and ``sa.cov2`` are all defined 2) compute the eigendecomposition :math:`K = USU^T` of the kinship matrix 3) fit covariate coefficients and variance parameters in the sample-covariates-only (global) model using restricted maximum likelihood (`REML <https://en.wikipedia.org/wiki/Restricted_maximum_likelihood>`__), storing results in global annotations under ``global.lmmreg`` 4) test each variant for association, storing results under ``va.lmmreg`` in variant annotations This plan can be modified as follows: - Set ``run_assoc=False`` to not test any variants for association, i.e. skip Step 5. - Set ``use_ml=True`` to use maximum likelihood instead of REML in Steps 4 and 5. - Set the ``delta`` argument to manually set the value of :math:`\delta` rather than fitting :math:`\delta` in Step 4. - Set the ``global_root`` argument to change the global annotation root in Step 4. - Set the ``va_root`` argument to change the variant annotation root in Step 5. :py:meth:`.lmmreg` adds 9 or 13 global annotations in Step 4, depending on whether :math:`\delta` is set or fit.
+----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | Annotation | Type | Value | +==============================================+======================+======================================================================================================================================================+ | ``global.lmmreg.useML`` | Boolean | true if fit by ML, false if fit by REML | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``global.lmmreg.beta`` | Dict[String, Double] | map from *intercept* and the given ``covariates`` expressions to the corresponding fit :math:`\\beta` coefficients | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``global.lmmreg.sigmaG2`` | Double | fit coefficient of genetic variance, :math:`\\hat{\sigma}_g^2` | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``global.lmmreg.sigmaE2`` | Double | fit coefficient of environmental variance :math:`\\hat{\sigma}_e^2` | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``global.lmmreg.delta`` | Double | fit ratio of variance component coefficients, :math:`\\hat{\delta}` | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``global.lmmreg.h2`` | Double | fit narrow-sense heritability, :math:`\\hat{h}^2` | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``global.lmmreg.nEigs`` | Int | number of eigenvectors of kinship matrix used to fit model | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``global.lmmreg.dropped_variance_fraction`` | Double | specified value of ``dropped_variance_fraction`` | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``global.lmmreg.evals`` | Array[Double] | all eigenvalues of the kinship matrix in descending order | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``global.lmmreg.fit.seH2`` | Double | standard error of :math:`\\hat{h}^2` under asymptotic normal 
approximation | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``global.lmmreg.fit.normLkhdH2`` | Array[Double] | likelihood function of :math:`h^2` normalized on the discrete grid ``0.01, 0.02, ..., 0.99``. Index ``i`` is the likelihood for percentage ``i``. | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``global.lmmreg.fit.maxLogLkhd`` | Double | (restricted) maximum log likelihood corresponding to :math:`\\hat{\delta}` | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``global.lmmreg.fit.logDeltaGrid`` | Array[Double] | values of :math:`\\mathrm{ln}(\delta)` used in the grid search | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``global.lmmreg.fit.logLkhdVals`` | Array[Double] | (restricted) log likelihood of :math:`y` given :math:`X` and :math:`\\mathrm{ln}(\delta)` at the (RE)ML fit of :math:`\\beta` and :math:`\sigma_g^2` | +----------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+ These global annotations are also added to ``hail.log``, with the ranked evals and :math:`\delta` grid with values in .tsv tabular form. Use ``grep 'lmmreg:' hail.log`` to find the lines just above each table. If Step 5 is performed, :py:meth:`.lmmreg` also adds four linear regression variant annotations. +------------------------+--------+-------------------------------------------------------------------------+ | Annotation | Type | Value | +========================+========+=========================================================================+ | ``va.lmmreg.beta`` | Double | fit genotype coefficient, :math:`\hat\\beta_0` | +------------------------+--------+-------------------------------------------------------------------------+ | ``va.lmmreg.sigmaG2`` | Double | fit coefficient of genetic variance component, :math:`\hat{\sigma}_g^2` | +------------------------+--------+-------------------------------------------------------------------------+ | ``va.lmmreg.chi2`` | Double | :math:`\chi^2` statistic of the likelihood ratio test | +------------------------+--------+-------------------------------------------------------------------------+ | ``va.lmmreg.pval`` | Double | :math:`p`-value | +------------------------+--------+-------------------------------------------------------------------------+ Those variants that don't vary across the included samples (e.g., all genotypes are HomRef) will have missing annotations. The simplest way to export all resulting annotations is: >>> lmm_vds.export_variants('output/lmmreg.tsv.bgz', 'variant = v, va.lmmreg.*') >>> lmmreg_results = lmm_vds.globals['lmmreg'] By default, genotypes values are given by hard call genotypes (``g.gt``). 
If ``use_dosages=True``, then genotype values for per-variant association are defined by the dosage :math:`\mathrm{P}(\mathrm{Het}) + 2 \cdot \mathrm{P}(\mathrm{HomVar})`. For Phred-scaled values, :math:`\mathrm{P}(\mathrm{Het})` and :math:`\mathrm{P}(\mathrm{HomVar})` are calculated by normalizing the PL likelihoods (converted from the Phred-scale) to sum to 1. **Performance** Hail's initial version of :py:meth:`.lmmreg` scales beyond 15k samples and to an essentially unbounded number of variants, making it particularly well-suited to modern sequencing studies and complementary to tools designed for SNP arrays. Analysts have used :py:meth:`.lmmreg` in research to compute kinship from 100k common variants and test 32 million non-rare variants on 8k whole genomes in about 10 minutes on `Google cloud <http://discuss.hail.is/t/using-hail-on-the-google-cloud-platform/80>`__. While the kinship matrix :math:`K` is computed using distributed matrix multiplication (Step 1, here by :py:meth:`.rrm`), the full `eigendecomposition <https://en.wikipedia.org/wiki/Eigendecomposition_of_a_matrix>`__ (Step 3) is currently run on a single core of the master node using the `LAPACK routine DSYEVD <http://www.netlib.org/lapack/explore-html/d2/d8a/group__double_s_yeigen_ga694ddc6e5527b6223748e3462013d867.html>`__, which we empirically find to be the most performant of the four available routines; laptop performance plots showing cubic complexity in :math:`n` are available `here <https://github.com/hail-is/hail/pull/906>`__. On Google cloud, eigendecomposition takes about 2 seconds for 2535 samples and 1 minute for 8185 samples. If you see worse performance, check that LAPACK natives are being properly loaded (see "BLAS and LAPACK" in Getting Started). Given the eigendecomposition, fitting the global model (Step 4) takes on the order of a few seconds on the master node. Association testing (Step 5) is fully distributed by variant with per-variant time complexity that is completely independent of the number of sample covariates and dominated by multiplication of the genotype vector :math:`v` by the matrix of eigenvectors :math:`U^T` as described below, which we accelerate with a sparse representation of :math:`v`. The matrix :math:`U^T` has size about :math:`8n^2` bytes and is currently broadcast to each Spark executor. For example, with 15k samples, storing :math:`U^T` consumes about 3.6GB of memory on a 16-core worker node with two 8-core executors. So for large :math:`n`, we recommend using a high-memory configuration such as ``highmem`` workers. **Linear mixed model** :py:meth:`.lmmreg` estimates the genetic proportion of residual phenotypic variance (narrow-sense heritability) under a kinship-based linear mixed model, and then optionally tests each variant for association using the likelihood ratio test. Inference is exact. We first describe the sample-covariates-only model used to estimate heritability, which we simply refer to as the *global model*. 
With :math:`n` samples and :math:`c` sample covariates, we define: - :math:`y = n \\times 1` vector of phenotypes - :math:`X = n \\times c` matrix of sample covariates and intercept column of ones - :math:`K = n \\times n` kinship matrix - :math:`I = n \\times n` identity matrix - :math:`\\beta = c \\times 1` vector of covariate coefficients - :math:`\sigma_g^2 =` coefficient of genetic variance component :math:`K` - :math:`\sigma_e^2 =` coefficient of environmental variance component :math:`I` - :math:`\delta = \\frac{\sigma_e^2}{\sigma_g^2} =` ratio of environmental and genetic variance component coefficients - :math:`h^2 = \\frac{\sigma_g^2}{\sigma_g^2 + \sigma_e^2} = \\frac{1}{1 + \delta} =` genetic proportion of residual phenotypic variance Under a linear mixed model, :math:`y` is sampled from the :math:`n`-dimensional `multivariate normal distribution <https://en.wikipedia.org/wiki/Multivariate_normal_distribution>`__ with mean :math:`X \\beta` and variance components that are scalar multiples of :math:`K` and :math:`I`: .. math:: y \sim \mathrm{N}\\left(X\\beta, \sigma_g^2 K + \sigma_e^2 I\\right) Thus the model posits that the residuals :math:`y_i - X_{i,:}\\beta` and :math:`y_j - X_{j,:}\\beta` have covariance :math:`\sigma_g^2 K_{ij}` and approximate correlation :math:`h^2 K_{ij}`. Informally: phenotype residuals are correlated as the product of overall heritability and pairwise kinship. By contrast, standard (unmixed) linear regression is equivalent to fixing :math:`\sigma_g^2` (equivalently, :math:`h^2`) at 0 above, so that all phenotype residuals are independent. **Caution:** while it is tempting to interpret :math:`h^2` as the `narrow-sense heritability <https://en.wikipedia.org/wiki/Heritability#Definition>`__ of the phenotype alone, note that its value depends not only on the phenotype and genetic data, but also on the choice of sample covariates. **Fitting the global model** The core algorithm is essentially a distributed implementation of the spectral approach taken in `FastLMM <https://www.microsoft.com/en-us/research/project/fastlmm/>`__. Let :math:`K = USU^T` be the `eigendecomposition <https://en.wikipedia.org/wiki/Eigendecomposition_of_a_matrix#Real_symmetric_matrices>`__ of the real symmetric matrix :math:`K`. That is: - :math:`U = n \\times n` orthonormal matrix whose columns are the eigenvectors of :math:`K` - :math:`S = n \\times n` diagonal matrix of eigenvalues of :math:`K` in descending order. :math:`S_{ii}` is the eigenvalue of eigenvector :math:`U_{:,i}` - :math:`U^T = n \\times n` orthonormal matrix, the transpose (and inverse) of :math:`U` A bit of matrix algebra on the multivariate normal density shows that the linear mixed model above is mathematically equivalent to the model .. math:: U^Ty \\sim \mathrm{N}\\left(U^TX\\beta, \sigma_g^2 (S + \delta I)\\right) for which the covariance is diagonal (i.e., unmixed). That is, rotating the phenotype vector (:math:`y`) and covariate vectors (columns of :math:`X`) in :math:`\mathbb{R}^n` by :math:`U^T` transforms the model to one with independent residuals. For any particular value of :math:`\delta`, the restricted maximum likelihood (REML) solution for the latter model can be computed exactly in time complexity that is linear rather than cubic in :math:`n`. 
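The effect of this rotation can be checked numerically outside of Hail. The sketch below is illustrative only: it is not part of the Hail API, and the kinship matrix and variance parameters are made up. It eigendecomposes a small symmetric matrix and confirms that conjugating the mixed-model covariance by :math:`U^T` yields the diagonal matrix :math:`\sigma_g^2 (S + \delta I)`.

.. code-block:: python

    import numpy as np

    n = 5
    A = np.random.randn(n, n)
    K = A.dot(A.T) / n                        # small positive semi-definite stand-in for a kinship matrix
    sigma_g2, delta = 0.8, 0.5                # hypothetical variance parameters

    S, U = np.linalg.eigh(K)                  # K = U diag(S) U^T (eigh returns eigenvalues in ascending order)

    cov = sigma_g2 * (K + delta * np.eye(n))  # mixed-model covariance sigma_g^2 (K + delta I)
    rotated = U.T.dot(cov).dot(U)             # covariance after rotating by U^T
    assert np.allclose(rotated, sigma_g2 * np.diag(S + delta))   # diagonal form sigma_g^2 (S + delta I)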
In particular, having rotated, we can run a very efficient 1-dimensional optimization procedure over :math:`\delta` to find the REML estimate :math:`(\hat{\delta}, \\hat{\\beta}, \\hat{\sigma}_g^2)` of the triple :math:`(\delta, \\beta, \sigma_g^2)`, which in turn determines :math:`\\hat{\sigma}_e^2` and :math:`\\hat{h}^2`. We first compute the maximum log likelihood on a :math:`\delta`-grid that is uniform on the log scale, with :math:`\\mathrm{ln}(\delta)` running from -8 to 8 by 0.01, corresponding to :math:`h^2` decreasing from 0.9995 to 0.0005. If :math:`h^2` is maximized at the lower boundary then standard linear regression would be more appropriate and Hail will exit; more generally, consider using standard linear regression when :math:`\\hat{h}^2` is very small. A maximum at the upper boundary is highly suspicious and will also cause Hail to exit. In any case, the log file records the table of grid values for further inspection, beginning under the info line containing "lmmreg: table of delta". If the optimal grid point falls in the interior of the grid as expected, we then use `Brent's method <https://en.wikipedia.org/wiki/Brent%27s_method>`__ to find the precise location of the maximum over the same range, with initial guess given by the optimal grid point and a tolerance on :math:`\\mathrm{ln}(\delta)` of 1e-6. If this location differs from the optimal grid point by more than 0.01, a warning will be displayed and logged, and one would be wise to investigate by plotting the values over the grid. Note that :math:`h^2` is related to :math:`\\mathrm{ln}(\delta)` through the `sigmoid function <https://en.wikipedia.org/wiki/Sigmoid_function>`_. More precisely, .. math:: h^2 = 1 - \mathrm{sigmoid}(\\mathrm{ln}(\delta)) = \mathrm{sigmoid}(-\\mathrm{ln}(\delta)) Hence one can change variables to extract a high-resolution discretization of the likelihood function of :math:`h^2` over :math:`[0,1]` at the corresponding REML estimators for :math:`\\beta` and :math:`\sigma_g^2`, as well as integrate over the normalized likelihood function using `change of variables <https://en.wikipedia.org/wiki/Integration_by_substitution>`_ and the `sigmoid differential equation <https://en.wikipedia.org/wiki/Sigmoid_function#Properties>`_. For convenience, ``global.lmmreg.fit.normLkhdH2`` records the likelihood function of :math:`h^2` normalized over the discrete grid ``0.01, 0.02, ..., 0.98, 0.99``. The length of the array is 101 so that index ``i`` contains the likelihood at percentage ``i``. The values at indices 0 and 100 are left undefined. By the theory of maximum likelihood estimation, this normalized likelihood function is approximately normally distributed near the maximum likelihood estimate. So we estimate the standard error of the estimator of :math:`h^2` as follows. Let :math:`x_2` be the maximum likelihood estimate of :math:`h^2` and let :math:`x_1` and :math:`x_3` be just to the left and right of :math:`x_2`. Let :math:`y_1`, :math:`y_2`, and :math:`y_3` be the corresponding values of the (unnormalized) log likelihood function. Setting equal the leading coefficient of the unique parabola through these points (as given by Lagrange interpolation) and the leading coefficient of the log of the normal distribution, we have: .. math:: \\frac{x_3 (y_2 - y_1) + x_2 (y_1 - y_3) + x_1 (y_3 - y_2)}{(x_2 - x_1)(x_1 - x_3)(x_3 - x_2)} = -\\frac{1}{2 \sigma^2} The standard error :math:`\\hat{\sigma}` is then estimated by solving for :math:`\sigma`. 
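As a small worked example of the curvature calculation above (with made-up grid values rather than actual :py:meth:`.lmmreg` output), the following sketch recovers :math:`\hat{\sigma}` from three neighboring grid points of the unnormalized log likelihood:

.. code-block:: python

    import math

    # hypothetical values: x2 is the grid point maximizing the log likelihood,
    # x1 and x3 are its neighbors, y1..y3 the corresponding log likelihoods
    x1, x2, x3 = 0.24, 0.25, 0.26
    y1, y2, y3 = -10.30, -10.25, -10.28

    # leading coefficient of the parabola through the three points (Lagrange interpolation),
    # set equal to -1 / (2 sigma^2), the leading coefficient of the log of a normal density
    a = (x3 * (y2 - y1) + x2 * (y1 - y3) + x1 * (y3 - y2)) / \
        ((x2 - x1) * (x1 - x3) * (x3 - x2))

    sigma_hat = math.sqrt(-1.0 / (2 * a))    # estimated standard error of the h^2 estimate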
Note that the mean and standard deviation of the (discretized or continuous) distribution held in ``global.lmmreg.fit.normLkhdH2`` will not coincide with :math:`\\hat{h}^2` and :math:`\\hat{\sigma}`, since this distribution only becomes normal in the infinite sample limit. One can visually assess normality by plotting this distribution against a normal distribution with the same mean and standard deviation, or use this distribution to approximate credible intervals under a flat prior on :math:`h^2`. **Testing each variant for association** Fixing a single variant, we define: - :math:`v = n \\times 1` vector of genotypes, with missing genotypes imputed as the mean of called genotypes - :math:`X_v = \\left[v | X \\right] = n \\times (1 + c)` matrix concatenating :math:`v` and :math:`X` - :math:`\\beta_v = (\\beta^0_v, \\beta^1_v, \\ldots, \\beta^c_v) = (1 + c) \\times 1` vector of covariate coefficients Fixing :math:`\delta` at the global REML estimate :math:`\\hat{\delta}`, we find the REML estimate :math:`(\\hat{\\beta}_v, \\hat{\sigma}_{g,v}^2)` via rotation of the model .. math:: y \\sim \\mathrm{N}\\left(X_v\\beta_v, \sigma_{g,v}^2 (K + \\hat{\delta} I)\\right) Note that the only new rotation to compute here is :math:`U^T v`. To test the null hypothesis that the genotype coefficient :math:`\\beta^0_v` is zero, we consider the restricted model with parameters :math:`((0, \\beta^1_v, \ldots, \\beta^c_v), \sigma_{g,v}^2)` within the full model with parameters :math:`((\\beta^0_v, \\beta^1_v, \\ldots, \\beta^c_v), \sigma_{g,v}^2)`, with :math:`\delta` fixed at :math:`\\hat\delta` in both. The former fit is simply that of the global model, :math:`((0, \\hat{\\beta}^1, \\ldots, \\hat{\\beta}^c), \\hat{\sigma}_g^2)`. The likelihood ratio test statistic is given by .. math:: \chi^2 = n \\, \\mathrm{ln}\left(\\frac{\hat{\sigma}^2_g}{\\hat{\sigma}_{g,v}^2}\\right) and follows a chi-squared distribution with one degree of freedom. Here the ratio :math:`\\hat{\sigma}^2_g / \\hat{\sigma}_{g,v}^2` captures the degree to which adding the variant :math:`v` to the global model reduces the residual phenotypic variance. **Kinship Matrix** FastLMM uses the Realized Relationship Matrix (RRM) for kinship. This can be computed with :py:meth:`~hail.VariantDataset.rrm`. However, any instance of :py:class:`KinshipMatrix` may be used, so long as ``sample_list`` contains the complete list of samples of the caller variant dataset in the same order. **Low-rank approximation of kinship for improved performance** :py:meth:`.lmmreg` can implicitly use a low-rank approximation of the kinship matrix to more rapidly fit delta and the statistics for each variant. The computational complexity per variant is proportional to the number of eigenvectors used. This number can be specified in two ways. Specify the parameter ``n_eigs`` to use only the top ``n_eigs`` eigenvectors. Alternatively, specify ``dropped_variance_fraction`` to use as many eigenvectors as necessary to capture all but at most this fraction of the sample variance (also known as the trace, or the sum of the eigenvalues). For example, ``dropped_variance_fraction=0.01`` will use the minimal number of eigenvectors to account for 99% of the sample variance. Specifying both parameters will apply the more stringent (fewest eigenvectors) of the two. 
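For example, continuing the example at the top of this section, one could fit the model with only enough eigenvectors to capture 99% of the sample variance, or cap the number of eigenvectors directly (a usage sketch reusing the dataset, kinship matrix, and annotations from that example):

.. code-block:: python

    # keep eigenvectors accounting for all but 1% of the sample variance
    lmm_vds = assoc_vds.lmmreg(kinship_matrix, 'sa.pheno', ['sa.cov1', 'sa.cov2'],
                               dropped_variance_fraction=0.01)

    # or cap the number of eigenvectors explicitly
    lmm_vds = assoc_vds.lmmreg(kinship_matrix, 'sa.pheno', ['sa.cov1', 'sa.cov2'],
                               n_eigs=100)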
**Further background** For the history and mathematics of linear mixed models in genetics, including `FastLMM <https://www.microsoft.com/en-us/research/project/fastlmm/>`__, see `Christoph Lippert's PhD thesis <https://publikationen.uni-tuebingen.de/xmlui/bitstream/handle/10900/50003/pdf/thesis_komplett.pdf>`__. For an investigation of various approaches to defining kinship, see `Comparison of Methods to Account for Relatedness in Genome-Wide Association Studies with Family-Based Data <http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1004445>`__. :param kinshipMatrix: Kinship matrix to be used. :type kinshipMatrix: :class:`KinshipMatrix` :param str y: Response sample annotation. :param covariates: List of covariate sample annotations. :type covariates: list of str :param str global_root: Global annotation root, a period-delimited path starting with `global`. :param str va_root: Variant annotation root, a period-delimited path starting with `va`. :param bool run_assoc: If true, run association testing in addition to fitting the global model. :param bool use_ml: Use ML instead of REML throughout. :param delta: Fixed delta value to use in the global model, overrides fitting delta. :type delta: float or None :param float sparsity_threshold: Genotype vector sparsity at or below which to use sparse genotype vector in rotation (advanced). :param bool use_dosages: If true, use dosages rather than hard call genotypes. :param int n_eigs: Number of eigenvectors of the kinship matrix used to fit the model. :param float dropped_variance_fraction: Upper bound on fraction of sample variance lost by dropping eigenvectors with small eigenvalues. :return: Variant dataset with linear mixed regression annotations. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvdf.lmmreg(kinshipMatrix._jkm, y, jarray(Env.jvm().java.lang.String, covariates), use_ml, global_root, va_root, run_assoc, joption(delta), sparsity_threshold, use_dosages, joption(n_eigs), joption(dropped_variance_fraction)) return VariantDataset(self.hc, jvds)
[docs] @handle_py4j @requireTGenotype @typecheck_method(test=strlike, y=strlike, covariates=listof(strlike), root=strlike, use_dosages=bool) def logreg(self, test, y, covariates=[], root='va.logreg', use_dosages=False): """Test each variant for association using logistic regression. .. include:: requireTGenotype.rst **Examples** Run the logistic regression Wald test per variant using a Boolean phenotype and two covariates stored in sample annotations: >>> vds_result = vds.logreg('wald', 'sa.pheno.isCase', covariates=['sa.pheno.age', 'sa.pheno.isFemale']) **Notes** The :py:meth:`~hail.VariantDataset.logreg` method performs, for each variant, a significance test of the genotype in predicting a binary (case-control) phenotype based on the logistic regression model. The phenotype type must either be numeric (with all present values 0 or 1) or Boolean, in which case true and false are coded as 1 and 0, respectively. Hail supports the Wald test ('wald'), likelihood ratio test ('lrt'), Rao score test ('score'), and Firth test ('firth'). Hail only includes samples for which the phenotype and all covariates are defined. For each variant, Hail imputes missing genotypes as the mean of called genotypes. By default, genotypes values are given by hard call genotypes (``g.gt``). If ``use_dosages=True``, then genotype values are defined by the dosage :math:`\mathrm{P}(\mathrm{Het}) + 2 \cdot \mathrm{P}(\mathrm{HomVar})`. For Phred-scaled values, :math:`\mathrm{P}(\mathrm{Het})` and :math:`\mathrm{P}(\mathrm{HomVar})` are calculated by normalizing the PL likelihoods (converted from the Phred-scale) to sum to 1. The example above considers a model of the form .. math:: \mathrm{Prob}(\mathrm{isCase}) = \mathrm{sigmoid}(\\beta_0 + \\beta_1 \, \mathrm{gt} + \\beta_2 \, \mathrm{age} + \\beta_3 \, \mathrm{isFemale} + \\varepsilon), \quad \\varepsilon \sim \mathrm{N}(0, \sigma^2) where :math:`\mathrm{sigmoid}` is the `sigmoid function <https://en.wikipedia.org/wiki/Sigmoid_function>`__, the genotype :math:`\mathrm{gt}` is coded as 0 for HomRef, 1 for Het, and 2 for HomVar, and the Boolean covariate :math:`\mathrm{isFemale}` is coded as 1 for true (female) and 0 for false (male). The null model sets :math:`\\beta_1 = 0`. The resulting variant annotations depend on the test statistic as shown in the tables below. ========== =================== ====== ===== Test Annotation Type Value ========== =================== ====== ===== Wald ``va.logreg.beta`` Double fit genotype coefficient, :math:`\hat\\beta_1` Wald ``va.logreg.se`` Double estimated standard error, :math:`\widehat{\mathrm{se}}` Wald ``va.logreg.zstat`` Double Wald :math:`z`-statistic, equal to :math:`\hat\\beta_1 / \widehat{\mathrm{se}}` Wald ``va.logreg.pval`` Double Wald p-value testing :math:`\\beta_1 = 0` LRT, Firth ``va.logreg.beta`` Double fit genotype coefficient, :math:`\hat\\beta_1` LRT, Firth ``va.logreg.chi2`` Double deviance statistic LRT, Firth ``va.logreg.pval`` Double LRT / Firth p-value testing :math:`\\beta_1 = 0` Score ``va.logreg.chi2`` Double score statistic Score ``va.logreg.pval`` Double score p-value testing :math:`\\beta_1 = 0` ========== =================== ====== ===== For the Wald and likelihood ratio tests, Hail fits the logistic model for each variant using Newton iteration and only emits the above annotations when the maximum likelihood estimate of the coefficients converges. The Firth test uses a modified form of Newton iteration. 
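For instance, after running the Wald test as in the example above, the per-variant results can be exported in the same way as other variant annotations (a usage sketch; the output path is arbitrary):

.. code-block:: python

    vds_result = vds.logreg('wald', 'sa.pheno.isCase',
                            covariates=['sa.pheno.age', 'sa.pheno.isFemale'])
    vds_result.export_variants('output/logreg_wald.tsv.bgz', 'variant = v, va.logreg.*')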
To help diagnose convergence issues, Hail also emits three variant annotations which summarize the iterative fitting process: ================ =========================== ======= ===== Test Annotation Type Value ================ =========================== ======= ===== Wald, LRT, Firth ``va.logreg.fit.nIter`` Int number of iterations until convergence, explosion, or reaching the max (25 for Wald, LRT; 100 for Firth) Wald, LRT, Firth ``va.logreg.fit.converged`` Boolean true if iteration converged Wald, LRT, Firth ``va.logreg.fit.exploded`` Boolean true if iteration exploded ================ =========================== ======= ===== We consider iteration to have converged when every coordinate of :math:`\\beta` changes by less than :math:`10^{-6}`. For Wald and LRT, up to 25 iterations are attempted; in testing we find 4 or 5 iterations nearly always suffice. Convergence may also fail due to explosion, which refers to low-level numerical linear algebra exceptions caused by manipulating ill-conditioned matrices. Explosion may result from (nearly) linearly dependent covariates or complete `separation <https://en.wikipedia.org/wiki/Separation_(statistics)>`__. A more common situation in genetics is quasi-complete separation, e.g. variants that are observed only in cases (or controls). Such variants inevitably arise when testing millions of variants with very low minor allele count. The maximum likelihood estimate of :math:`\\beta` under logistic regression is then undefined but convergence may still occur after a large number of iterations due to a very flat likelihood surface. In testing, we find that such variants produce a secondary bump from 10 to 15 iterations in the histogram of number of iterations per variant. We also find that this faux convergence produces large standard errors and large (insignificant) p-values. To not miss such variants, consider using Firth logistic regression, linear regression, or group-based tests. Here's a concrete illustration of quasi-complete separation in R. Suppose we have 2010 samples distributed as follows for a particular variant: ======= ====== === ====== Status HomRef Het HomVar ======= ====== === ====== Case 1000 10 0 Control 1000 0 0 ======= ====== === ====== The following R code fits the (standard) logistic, Firth logistic, and linear regression models to this data, where ``x`` is genotype, ``y`` is phenotype, and ``logistf`` is from the logistf package: .. code-block:: R x <- c(rep(0,1000), rep(0,1000), rep(1,10)) y <- c(rep(0,1000), rep(1,1000), rep(1,10)) logfit <- glm(y ~ x, family=binomial()) firthfit <- logistf(y ~ x) linfit <- lm(y ~ x) The resulting p-values for the genotype coefficient are 0.991, 0.00085, and 0.0016, respectively. The erroneous value 0.991 is due to quasi-complete separation. Moving one of the 10 hets from case to control eliminates this quasi-complete separation; the p-values from R are then 0.0373, 0.0111, and 0.0116, respectively, as expected for a less significant association. The Firth test reduces bias from small counts and resolves the issue of separation by penalizing maximum likelihood estimation by the `Jeffreys invariant prior <https://en.wikipedia.org/wiki/Jeffreys_prior>`__. This test is slower, as both the null and full model must be fit per variant, and convergence of the modified Newton method is linear rather than quadratic. For Firth, 100 iterations are attempted for the null model and, if that is successful, for the full model as well. In testing we find 20 iterations nearly always suffice. 
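Using the fit annotations above, one can, for example, restrict downstream analysis to variants whose fit converged (a sketch; ``vds_result`` is the output of the Wald example above):

.. code-block:: python

    converged_vds = vds_result.filter_variants_expr('va.logreg.fit.converged')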
If the null model fails to converge, then the ``va.logreg.fit`` annotations reflect the null model; otherwise, they reflect the full model. See `Recommended joint and meta-analysis strategies for case-control association testing of single low-count variants <http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4049324/>`__ for an empirical comparison of the logistic Wald, LRT, score, and Firth tests. The theoretical foundations of the Wald, likelihood ratio, and score tests may be found in Chapter 3 of Gesine Reinert's notes `Statistical Theory <http://www.stats.ox.ac.uk/~reinert/stattheory/theoryshort09.pdf>`__. Firth introduced his approach in `Bias reduction of maximum likelihood estimates, 1993 <http://www2.stat.duke.edu/~scs/Courses/Stat376/Papers/GibbsFieldEst/BiasReductionMLE.pdf>`__. Heinze and Schemper further analyze Firth's approach in `A solution to the problem of separation in logistic regression, 2002 <https://cemsiis.meduniwien.ac.at/fileadmin/msi_akim/CeMSIIS/KB/volltexte/Heinze_Schemper_2002_Statistics_in_Medicine.pdf>`__. Those variants that don't vary across the included samples (e.g., all genotypes are HomRef) will have missing annotations. Phenotype and covariate sample annotations may also be specified using `programmatic expressions <exprlang.html>`__ without identifiers, such as: .. code-block:: text if (sa.isFemale) sa.cov.age else (2 * sa.cov.age + 10) For Boolean covariate types, true is coded as 1 and false as 0. In particular, for the sample annotation ``sa.fam.isCase`` added by importing a FAM file with case-control phenotype, case is 1 and control is 0. Hail's logistic regression tests correspond to the ``b.wald``, ``b.lrt``, and ``b.score`` tests in `EPACTS <http://genome.sph.umich.edu/wiki/EPACTS#Single_Variant_Tests>`__. For each variant, Hail imputes missing genotypes as the mean of called genotypes, whereas EPACTS subsets to those samples with called genotypes. Hence, Hail and EPACTS results will currently only agree for variants with no missing genotypes. :param str test: Statistical test, one of: 'wald', 'lrt', 'score', or 'firth'. :param str y: Response expression. Must evaluate to Boolean or numeric with all values 0 or 1. :param covariates: List of covariate expressions. :type covariates: list of str :param str root: Variant annotation path to store result of logistic regression. :param bool use_dosages: If true, use genotype dosage rather than hard call. :return: Variant dataset with logistic regression variant annotations. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvdf.logreg(test, y, jarray(Env.jvm().java.lang.String, covariates), root, use_dosages) return VariantDataset(self.hc, jvds)
[docs] @handle_py4j @typecheck_method(key_name=strlike, variant_keys=strlike, single_key=bool, agg_expr=strlike, test=strlike, y=strlike, covariates=listof(strlike)) def logreg_burden(self, key_name, variant_keys, single_key, agg_expr, test, y, covariates=[]): r"""Test each keyed group of variants for association by aggregating (collapsing) genotypes and applying the logistic regression model. .. include:: requireTGenotype.rst **Examples** Run a gene burden test using the logistic Wald test on the maximum genotype per gene. Here ``va.genes`` is a variant annotation of type Set[String] giving the set of genes containing the variant (see **Extended example** in :py:meth:`.linreg_burden` for a deeper dive in the context of linear regression): >>> logreg_kt, sample_kt = (hc.read('data/example_burden.vds') ... .logreg_burden(key_name='gene', ... variant_keys='va.genes', ... single_key=False, ... agg_expr='gs.map(g => g.gt).max()', ... test='wald', ... y='sa.burden.pheno', ... covariates=['sa.burden.cov1', 'sa.burden.cov2'])) Run a gene burden test using the logistic score test on the weighted sum of genotypes per gene. Here ``va.gene`` is a variant annotation of type String giving a single gene per variant (or no gene if missing), and ``va.weight`` is a numeric variant annotation: >>> logreg_kt, sample_kt = (hc.read('data/example_burden.vds') ... .logreg_burden(key_name='gene', ... variant_keys='va.gene', ... single_key=True, ... agg_expr='gs.map(g => va.weight * g.gt).sum()', ... test='score', ... y='sa.burden.pheno', ... covariates=['sa.burden.cov1', 'sa.burden.cov2'])) To use a weighted sum of genotypes with missing genotypes mean-imputed rather than ignored, set ``agg_expr='gs.map(g => va.weight * orElse(g.gt.toDouble, 2 * va.qc.AF)).sum()'`` where ``va.qc.AF`` is the allele frequency over those samples that have no missing phenotype or covariates. .. caution:: With ``single_key=False``, ``variant_keys`` expects a variant annotation of Set or Array type, in order to allow each variant to have zero, one, or more keys (for example, the same variant may appear in multiple genes). Unlike with type Set, if the same key appears twice in a variant annotation of type Array, then that variant will be counted twice in that key's group. With ``single_key=True``, ``variant_keys`` expects a variant annotation whose value is itself the key of interest. In both cases, variants with missing keys are ignored. **Notes** This method modifies :py:meth:`.logreg` by replacing the genotype covariate per variant and sample with an aggregated (i.e., collapsed) score per key and sample. This numeric score is computed from the sample's genotypes and annotations over all variants with that key. The phenotype type must either be numeric (with all present values 0 or 1) or Boolean, in which case true and false are coded as 1 and 0, respectively. Hail supports the Wald test ('wald'), likelihood ratio test ('lrt'), Rao score test ('score'), and Firth test ('firth') as the ``test`` parameter. Conceptually, the method proceeds as follows: 1) Filter to the set of samples for which all phenotype and covariates are defined. 2) For each key and sample, aggregate genotypes across variants with that key to produce a numeric score. 
``agg_expr`` must be of numeric type and has the following symbols in scope: - ``s`` (*Sample*): sample - ``sa``: sample annotations - ``global``: global annotations - ``gs`` (*Aggregable[Genotype]*): aggregable of :ref:`genotype` for sample ``s`` Note that ``v``, ``va``, and ``g`` are accessible through `Aggregable methods <https://hail.is/hail/types.html#aggregable>`_ on ``gs``. The resulting **sample key table** has key column ``key_name`` and a numeric column of scores for each sample named by the sample ID. 3) For each key, fit the logistic regression model using the supplied phenotype, covariates, and test. The model and tests are those of :py:meth:`.logreg` with sample genotype ``gt`` replaced by the score in the sample key table. For each key, missing scores are mean-imputed across all samples. The resulting **logistic regression key table** has key column of type String given by the ``key_name`` parameter and additional columns corresponding to the fields of the ``va.logreg`` schema given for ``test`` in :py:meth:`.logreg`. :py:meth:`.logreg_burden` returns both the logistic regression key table and the sample key table. :param str key_name: Name to assign to key column of returned key tables. :param str variant_keys: Variant annotation path for the TArray or TSet of keys associated to each variant. :param bool single_key: If true, ``variant_keys`` is interpreted as a single (or missing) key per variant, rather than as a collection of keys. :param str agg_expr: Sample aggregation expression (per key). :param str test: Statistical test, one of: 'wald', 'lrt', 'score', or 'firth'. :param str y: Response expression. :param covariates: list of covariate expressions. :type covariates: list of str :return: Tuple of logistic regression key table and sample aggregation key table. :rtype: (:py:class:`.KeyTable`, :py:class:`.KeyTable`) """ r = self._jvdf.logregBurden(key_name, variant_keys, single_key, agg_expr, test, y, jarray(Env.jvm().java.lang.String, covariates)) logreg_kt = KeyTable(self.hc, r._1()) sample_kt = KeyTable(self.hc, r._2()) return logreg_kt, sample_kt
[docs] @handle_py4j @requireTGenotype @typecheck_method(pedigree=Pedigree) def mendel_errors(self, pedigree): """Find Mendel errors; count per variant, individual and nuclear family. .. include:: requireTGenotype.rst **Examples** Find all violations of Mendelian inheritance in each (dad, mom, kid) trio in a pedigree and return four tables: >>> ped = Pedigree.read('data/trios.fam') >>> all, per_fam, per_sample, per_variant = vds.mendel_errors(ped) Export all mendel errors to a text file: >>> all.export('output/all_mendel_errors.tsv') Annotate samples with the number of Mendel errors: >>> annotated_vds = vds.annotate_samples_table(per_sample, root="sa.mendel") Annotate variants with the number of Mendel errors: >>> annotated_vds = vds.annotate_variants_table(per_variant, root="va.mendel") **Notes** This method assumes all contigs apart from X and Y are fully autosomal; mitochondria, decoys, etc. are not given special treatment. The example above returns four tables, which contain Mendelian violations grouped in various ways. These tables are modeled after the `PLINK mendel formats <https://www.cog-genomics.org/plink2/formats#mendel>`_. The four tables contain the following columns: **First table:** all Mendel errors. This table contains one row per Mendel error in the dataset; it is possible that a variant or sample may be found on more than one row. This table closely reflects the structure of the ".mendel" PLINK format detailed below. Columns: - **fid** (*String*) -- Family ID. - **s** (*String*) -- Proband ID. - **v** (*Variant*) -- Variant in which the error was found. - **code** (*Int*) -- Mendel error code, see below. - **error** (*String*) -- Readable representation of Mendel error. **Second table:** errors per nuclear family. This table contains one row per nuclear family in the dataset. This table closely reflects the structure of the ".fmendel" PLINK format detailed below. Columns: - **fid** (*String*) -- Family ID. - **father** (*String*) -- Paternal ID. - **mother** (*String*) -- Maternal ID. - **nChildren** (*Int*) -- Number of children in this nuclear family. - **nErrors** (*Int*) -- Number of Mendel errors in this nuclear family. - **nSNP** (*Int*) -- Number of Mendel errors at SNPs in this nuclear family. **Third table:** errors per individual. This table contains one row per individual in the dataset, including founders. This table closely reflects the structure of the ".imendel" PLINK format detailed below. Columns: - **s** (*String*) -- Sample ID (key column). - **fid** (*String*) -- Family ID. - **nErrors** (*Int*) -- Number of Mendel errors found involving this individual. - **nSNP** (*Int*) -- Number of Mendel errors found involving this individual at SNPs. - **error** (*String*) -- Readable representation of Mendel error. **Fourth table:** errors per variant. This table contains one row per variant in the dataset. Columns: - **v** (*Variant*) -- Variant (key column). - **nErrors** (*Int*) -- Number of Mendel errors in this variant. **PLINK Mendel error formats:** - ``*.mendel`` -- all mendel errors: FID KID CHR SNP CODE ERROR - ``*.fmendel`` -- error count per nuclear family: FID PAT MAT CHLD N - ``*.imendel`` -- error count per individual: FID IID N - ``*.lmendel`` -- error count per variant: CHR SNP N In the PLINK formats, **FID**, **KID**, **PAT**, **MAT**, and **IID** refer to family, kid, dad, mom, and individual ID, respectively, with missing values set to ``0``. SNP denotes the variant identifier ``chr:pos:ref:alt``. N is the error count. 
CHLD is the number of children in a nuclear family. The CODE of each Mendel error is determined by the table below, extending the `Plink classification <https://www.cog-genomics.org/plink2/basic_stats#mendel>`__. Those individuals implicated by each code are in bold. The copy state of a locus with respect to a trio is defined as follows, where PAR is the `pseudoautosomal region <https://en.wikipedia.org/wiki/Pseudoautosomal_region>`__ (PAR). - HemiX -- in non-PAR of X, male child - HemiY -- in non-PAR of Y, male child - Auto -- otherwise (in autosome or PAR, or female child) Any refers to :math:`\{ HomRef, Het, HomVar, NoCall \}` and ! denotes complement in this set. +--------+------------+------------+----------+------------------+ |Code | Dad | Mom | Kid | Copy State | +========+============+============+==========+==================+ | 1 | HomVar | HomVar | Het | Auto | +--------+------------+------------+----------+------------------+ | 2 | HomRef | HomRef | Het | Auto | +--------+------------+------------+----------+------------------+ | 3 | HomRef | ! HomRef | HomVar | Auto | +--------+------------+------------+----------+------------------+ | 4 | ! HomRef | HomRef | HomVar | Auto | +--------+------------+------------+----------+------------------+ | 5 | HomRef | HomRef | HomVar | Auto | +--------+------------+------------+----------+------------------+ | 6 | HomVar | ! HomVar | HomRef | Auto | +--------+------------+------------+----------+------------------+ | 7 | ! HomVar | HomVar | HomRef | Auto | +--------+------------+------------+----------+------------------+ | 8 | HomVar | HomVar | HomRef | Auto | +--------+------------+------------+----------+------------------+ | 9 | Any | HomVar | HomRef | HemiX | +--------+------------+------------+----------+------------------+ | 10 | Any | HomRef | HomVar | HemiX | +--------+------------+------------+----------+------------------+ | 11 | HomVar | Any | HomRef | HemiY | +--------+------------+------------+----------+------------------+ | 12 | HomRef | Any | HomVar | HemiY | +--------+------------+------------+----------+------------------+ This method only considers children with two parents and a defined sex. PAR is currently defined with respect to reference `GRCh37 <http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/human/>`__: - X: 60001 - 2699520, 154931044 - 155260560 - Y: 10001 - 2649520, 59034050 - 59363566 :param pedigree: Sample pedigree. :type pedigree: :class:`~hail.representation.Pedigree` :returns: Four tables with Mendel error statistics. :rtype: (:class:`.KeyTable`, :class:`.KeyTable`, :class:`.KeyTable`, :class:`.KeyTable`) """ kts = self._jvdf.mendelErrors(pedigree._jrep) return KeyTable(self.hc, kts._1()), KeyTable(self.hc, kts._2()), \ KeyTable(self.hc, kts._3()), KeyTable(self.hc, kts._4())
[docs] @handle_py4j @typecheck_method(max_shift=integral) def min_rep(self, max_shift=100): """ Gives minimal, left-aligned representation of alleles. Note that this can change the variant position. **Examples** 1. Simple trimming of a multi-allelic site, no change in variant position `1:10000:TAA:TA,AA` => `1:10000:TA:T,A` 2. Trimming of a bi-allelic site leading to a change in position `1:10000:AATAA:AAGAA` => `1:10002:T:G` :param int max_shift: maximum number of base pairs by which a split variant can move. Affects memory usage, and will cause Hail to throw an error if a variant that moves further is encountered. :rtype: :class:`.VariantDataset` """ jvds = self._jvds.minRep(max_shift) return VariantDataset(self.hc, jvds)
[docs] @handle_py4j @requireTGenotype @typecheck_method(scores=strlike, loadings=nullable(strlike), eigenvalues=nullable(strlike), k=integral, as_array=bool) def pca(self, scores, loadings=None, eigenvalues=None, k=10, as_array=False): """Run Principal Component Analysis (PCA) on the matrix of genotypes. .. include:: requireTGenotype.rst **Examples** Compute the top 5 principal component scores, stored as sample annotations ``sa.scores.PC1``, ..., ``sa.scores.PC5`` of type Double: >>> vds_result = vds.pca('sa.scores', k=5) Compute the top 5 principal component scores, loadings, and eigenvalues, stored as annotations ``sa.scores``, ``va.loadings``, and ``global.evals`` of type Array[Double]: >>> vds_result = vds.pca('sa.scores', 'va.loadings', 'global.evals', 5, as_array=True) **Notes** Hail supports principal component analysis (PCA) of genotype data, a now-standard procedure described in `Patterson, Price and Reich, 2006 <http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.0020190>`__. This method expects a variant dataset with biallelic autosomal variants. Scores are computed and stored as sample annotations of type Struct by default; variant loadings and eigenvalues can optionally be computed and stored in variant and global annotations, respectively. PCA is based on the singular value decomposition (SVD) of a standardized genotype matrix :math:`M`, computed as follows. An :math:`n \\times m` matrix :math:`C` records raw genotypes, with rows indexed by :math:`n` samples and columns indexed by :math:`m` biallelic autosomal variants; :math:`C_{ij}` is the number of alternate alleles of variant :math:`j` carried by sample :math:`i`, which can be 0, 1, 2, or missing. For each variant :math:`j`, the sample alternate allele frequency :math:`p_j` is computed as half the mean of the non-missing entries of column :math:`j`. Entries of :math:`M` are then mean-centered and variance-normalized as .. math:: M_{ij} = \\frac{C_{ij}-2p_j}{\sqrt{2p_j(1-p_j)m}}, with :math:`M_{ij} = 0` for :math:`C_{ij}` missing (i.e. mean genotype imputation). This scaling normalizes genotype variances to a common value :math:`1/m` for variants in Hardy-Weinberg equilibrium and is further motivated in the paper cited above. (The resulting amplification of signal from the low end of the allele frequency spectrum will also introduce noise for rare variants; common practice is to filter out variants with minor allele frequency below some cutoff.) The factor :math:`1/m` gives each sample row approximately unit total variance (assuming linkage equilibrium) and yields the sample correlation or genetic relationship matrix (GRM) as simply :math:`MM^T`. PCA then computes the SVD .. math:: M = USV^T where columns of :math:`U` are left singular vectors (orthonormal in :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors (orthonormal in :math:`\mathbb{R}^m`), and :math:`S=\mathrm{diag}(s_1, s_2, \ldots)` with ordered singular values :math:`s_1 \ge s_2 \ge \cdots \ge 0`. Typically one computes only the first :math:`k` singular vectors and values, yielding the best rank :math:`k` approximation :math:`U_k S_k V_k^T` of :math:`M`; the truncations :math:`U_k`, :math:`S_k` and :math:`V_k` are :math:`n \\times k`, :math:`k \\times k` and :math:`m \\times k` respectively. From the perspective of the samples or rows of :math:`M` as data, :math:`V_k` contains the variant loadings for the first :math:`k` PCs while :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each sample. 
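The relationship between the SVD, the scores, and the loadings can be sketched in a few lines of NumPy; this is an illustration with a random matrix standing in for the standardized genotype matrix :math:`M`, not the distributed implementation used by Hail:

.. code-block:: python

    import numpy as np

    n, m, k = 100, 1000, 10
    M = np.random.randn(n, m)                          # stand-in for the standardized genotype matrix

    U, s, Vt = np.linalg.svd(M, full_matrices=False)   # singular values returned in descending order
    scores = U[:, :k] * s[:k]                          # n x k sample scores, U_k S_k
    loadings = Vt[:k, :].T                             # m x k variant loadings, V_k
    eigenvalues = s[:k] ** 2                           # top k eigenvalues of the GRM M M^T

    assert np.allclose(scores, M.dot(loadings))        # M V_k = U_k S_k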
The loadings represent a new basis of features while the scores represent the projected data on those features. The eigenvalues of the GRM :math:`MM^T` are the squares of the singular values :math:`s_1^2, s_2^2, \ldots`, which represent the variances carried by the respective PCs. By default, Hail only computes the loadings if the ``loadings`` parameter is specified. *Note:* In PLINK/GCTA the GRM is taken as the starting point and it is computed slightly differently with regard to missing data. Here the :math:`ij` entry of :math:`MM^T` is simply the dot product of rows :math:`i` and :math:`j` of :math:`M`; in terms of :math:`C` it is .. math:: \\frac{1}{m}\sum_{l\in\mathcal{C}_i\cap\mathcal{C}_j}\\frac{(C_{il}-2p_l)(C_{jl} - 2p_l)}{2p_l(1-p_l)} where :math:`\mathcal{C}_i = \{l \mid C_{il} \\text{ is non-missing}\}`. In PLINK/GCTA the denominator :math:`m` is replaced with the number of terms in the sum :math:`\\lvert\mathcal{C}_i\cap\\mathcal{C}_j\\rvert`, i.e. the number of variants where both samples have non-missing genotypes. While this is arguably a better estimator of the true GRM (trading shrinkage for noise), it has the drawback that one loses the clean interpretation of the loadings and scores as features and projections. Separately, for the PCs PLINK/GCTA output the eigenvectors of the GRM; even ignoring the above discrepancy that means the left singular vectors :math:`U_k` instead of the component scores :math:`U_k S_k`. While this is just a matter of the scale on each PC, the scores have the advantage of representing true projections of the data onto features with the variance of a score reflecting the variance explained by the corresponding feature. (In PC bi-plots this amounts to a change in aspect ratio; for use of PCs as covariates in regression it is immaterial.) **Annotations** Given root ``scores='sa.scores'`` and ``as_array=False``, :py:meth:`~hail.VariantDataset.pca` adds a Struct to sample annotations: - **sa.scores** (*Struct*) -- Struct of sample scores With ``k=3``, the Struct has three fields: - **sa.scores.PC1** (*Double*) -- Score from first PC - **sa.scores.PC2** (*Double*) -- Score from second PC - **sa.scores.PC3** (*Double*) -- Score from third PC Analogous variant and global annotations of type Struct are added by specifying the ``loadings`` and ``eigenvalues`` arguments, respectively. Given roots ``scores='sa.scores'``, ``loadings='va.loadings'``, and ``eigenvalues='global.evals'``, with ``as_array=True``, :py:meth:`~hail.VariantDataset.pca` adds the following annotations: - **sa.scores** (*Array[Double]*) -- Array of sample scores from the top k PCs - **va.loadings** (*Array[Double]*) -- Array of variant loadings in the top k PCs - **global.evals** (*Array[Double]*) -- Array of the top k eigenvalues :param str scores: Sample annotation path to store scores. :param loadings: Variant annotation path to store site loadings. :type loadings: str or None :param eigenvalues: Global annotation path to store eigenvalues. :type eigenvalues: str or None :param k: Number of principal components. :type k: int or None :param bool as_array: Store annotations as type Array rather than Struct. :return: Dataset with new PCA annotations. :rtype: :class:`.VariantDataset` """ jvds = self._jvdf.pca(scores, k, joption(loadings), joption(eigenvalues), as_array) return VariantDataset(self.hc, jvds)
[docs] @handle_py4j @typecheck_method(k=integral, maf=numeric, block_size=integral, min_kinship=numeric, statistics=enumeration("phi", "phik2", "phik2k0", "all")) def pc_relate(self, k, maf, block_size=512, min_kinship=-float("inf"), statistics="all"): """Compute relatedness estimates between individuals using a variant of the PC-Relate method. .. include:: experimental.rst **Examples** Estimate kinship, identity-by-descent two, identity-by-descent one, and identity-by-descent zero for every pair of samples, using 5 principal components to correct for ancestral populations, and a minimum minor allele frequency filter of 0.01: >>> rel = vds.pc_relate(5, 0.01) Calculate values as above, but when performing distributed matrix multiplications use a matrix-block-size of 1024 by 1024. >>> rel = vds.pc_relate(5, 0.01, 1024) Calculate values as above, excluding sample-pairs with kinship less than 0.1. This is more efficient than producing the full key table and filtering using :py:meth:`~hail.KeyTable.filter`. >>> rel = vds.pc_relate(5, 0.01, min_kinship=0.1) **Method** The traditional estimator for kinship between a pair of individuals :math:`i` and :math:`j`, sharing the set :math:`S_{ij}` of single-nucleotide variants, from a population with allele frequencies :math:`p_s`, is given by: .. math:: \\widehat{\phi_{ij}} := \\frac{1}{|S_{ij}|}\\sum_{s \in S_{ij}}\\frac{(g_{is} - 2 p_s) (g_{js} - 2 p_s)}{4 \sum_{s \in S_{ij}} p_s (1 - p_s)} This estimator is motivated by the model that the sharing of common (relative to the population) alleles is not very informative to relatedness (because they're common) and the sharing of rare alleles suggests a recent common ancestor from which the allele was inherited by descent. When multiple ancestry groups are mixed in a sample, this model breaks down. Alleles that are rare in all but one ancestry group are treated as very informative to relatedness. However, these alleles are simply markers of the ancestry group. The PC-Relate method corrects for this situation and the related situation of admixed individuals. PC-Relate slightly modifies the usual estimator for relatedness: occurrences of population allele frequency are replaced with an "individual-specific allele frequency". This modification allows the method to correctly weight an allele according to an individual's unique ancestry profile. The "individual-specific allele frequency" at a given genetic locus is modeled by PC-Relate as a linear function of their first ``k`` principal component coordinates. As such, the efficacy of this method rests on two assumptions: - an individual's first ``k`` principal component coordinates fully describe their allele-frequency-relevant ancestry, and - the relationship between ancestry (as described by principal component coordinates) and population allele frequency is linear The estimators for kinship, and identity-by-descent zero, one, and two follow. 
Let: - :math:`S_{ij}` be the set of genetic loci at which both individuals :math:`i` and :math:`j` have a defined genotype - :math:`g_{is} \in \\{0, 1, 2\\}` be the number of alternate alleles that individual :math:`i` has at genetic locus :math:`s` - :math:`\\widehat{\\mu_{is}} \in [0, 1]` be the individual-specific allele frequency for individual :math:`i` at genetic locus :math:`s` - :math:`{\\widehat{\\sigma^2_{is}}} := \\widehat{\\mu_{is}} (1 - \\widehat{\\mu_{is}})`, the binomial variance of :math:`\\widehat{\\mu_{is}}` - :math:`\\widehat{\\sigma_{is}} := \sqrt{\\widehat{\\sigma^2_{is}}}`, the binomial standard deviation of :math:`\\widehat{\\mu_{is}}` - :math:`\\text{IBS}^{(0)}_{ij} := \\sum_{s \\in S_{ij}} \\mathbb{1}_{\\lvert g_{is} - g_{js} \\rvert = 2}`, the number of genetic loci at which individuals :math:`i` and :math:`j` share no alleles - :math:`\\widehat{f_i} := 2 \\widehat{\phi_{ii}} - 1`, the inbreeding coefficient for individual :math:`i` - :math:`g^D_{is}` be a dominance encoding of the genotype matrix, and :math:`X_{is}` be a normalized dominance-coded genotype matrix .. math:: g^D_{is} := \\begin{cases} \\widehat{\\mu_{is}} & g_{is} = 0 \\\\ 0 & g_{is} = 1 \\\\ 1 - \\widehat{\\mu_{is}} & g_{is} = 2 \\end{cases} X_{is} := g^D_{is} - \\widehat{\\sigma^2_{is}} (1 - \\widehat{f_i}) The estimator for kinship is given by: .. math:: \\widehat{\phi_{ij}} := \\frac{\sum_{s \in S_{ij}}(g - 2 \\mu)_{is} (g - 2 \\mu)_{js}}{4 \sum_{s \in S_{ij}}\\widehat{\\sigma_{is}} \\widehat{\\sigma_{js}}} The estimator for identity-by-descent two is given by: .. math:: \\widehat{k^{(2)}_{ij}} := \\frac{\sum_{s \in S_{ij}}X_{is} X_{js}}{\sum_{s \in S_{ij}}\\widehat{\\sigma^2_{is}} \\widehat{\\sigma^2_{js}}} The estimator for identity-by-descent zero is given by: .. math:: \\widehat{k^{(0)}_{ij}} := \\begin{cases} \\frac{\\text{IBS}^{(0)}_{ij}} {\sum_{s \in S_{ij}} \\widehat{\\mu_{is}}^2(1 - \\widehat{\\mu_{js}})^2 + (1 - \\widehat{\\mu_{is}})^2\\widehat{\\mu_{js}}^2} & \\widehat{\phi_{ij}} > 2^{-5/2} \\\\ 1 - 4 \\widehat{\phi_{ij}} + k^{(2)}_{ij} & \\widehat{\phi_{ij}} \le 2^{-5/2} \\end{cases} The estimator for identity-by-descent one is given by: .. math:: \\widehat{k^{(1)}_{ij}} := 1 - \\widehat{k^{(2)}_{ij}} - \\widehat{k^{(0)}_{ij}} **Details** The PC-Relate method is described in "Model-free Estimation of Recent Genetic Relatedness", Conomos MP, Reiner AP, Weir BS, Thornton TA, American Journal of Human Genetics, 2016 Jan 7. The reference implementation is available in the `GENESIS Bioconductor package <https://bioconductor.org/packages/release/bioc/html/GENESIS.html>`_. :py:meth:`~hail.VariantDataset.pc_relate` differs from the reference implementation in a couple of key ways: - the principal components analysis does not use an unrelated set of individuals - the estimators do not perform small sample correction - the algorithm does not provide an option to use population-wide allele frequency estimates - the algorithm does not provide an option to not use "overall standardization" (see R ``pcrelate`` documentation) **Notes** The ``block_size`` controls memory usage and parallelism. If it is large enough to hold an entire sample-by-sample matrix of 64-bit doubles in memory, then only one Spark worker node can be used to compute matrix operations. If it is too small, communication overhead will begin to dominate the computation's time. 
The author has found that on Google Dataproc (where each core has about 3.75GB of memory), setting ``block_size`` larger than 512 tends to cause memory exhaustion errors. The minimum allele frequency filter is applied per-pair: if either of the two individuals' individual-specific minor allele frequencies is below the threshold, then the variant's contribution to relatedness estimates is zero. Under the PC-Relate model, kinship, :math:`\phi_{ij}`, ranges from 0 to 0.5, and is precisely half of the fraction-of-genetic-material-shared. Listed below are the statistics for a few pairings: - Monozygotic twins share all their genetic material so their kinship statistic is 0.5 in expectation. - Parent-child and sibling pairs both have kinship 0.25 in expectation and are separated by the identity-by-descent-two statistic, :math:`k^{(2)}_{ij}`, which is zero for parent-child pairs and 0.25 for sibling pairs. - Avuncular pairs and grand-parent/-child pairs both have kinship 0.125 in expectation and both have identity-by-descent-zero 0.5 in expectation. - "Third degree relatives" are those pairs sharing :math:`2^{-3} = 12.5\%` of their genetic material; the results of PC-Relate are often too noisy to reliably distinguish these pairs from higher-degree-relative-pairs or unrelated pairs. The resulting :py:class:`.KeyTable` entries have the type: *{ i: String, j: String, kin: Double, k2: Double, k1: Double, k0: Double }*. The key list is: *i: String, j: String*. :param int k: The number of principal components to use to distinguish ancestries. :param float maf: The minimum individual-specific allele frequency for an allele used to measure relatedness. :param int block_size: the side length of the blocks of the block-distributed matrices; this should be set such that at least three of these matrices fit in memory (in addition to all other objects necessary for Spark and Hail). :param float min_kinship: Pairs of samples with kinship lower than ``min_kinship`` are excluded from the results. :param str statistics: the set of statistics to compute: 'phi' will only compute the kinship statistic, 'phik2' will compute the kinship and identity-by-descent two statistics, 'phik2k0' will compute the kinship statistics and both identity-by-descent two and zero, 'all' computes the kinship statistic and all three identity-by-descent statistics. :return: A :py:class:`.KeyTable` mapping pairs of samples to estimations of their kinship and identity-by-descent zero, one, and two. :rtype: :py:class:`.KeyTable` """ intstatistics = { "phi" : 0, "phik2" : 1, "phik2k0" : 2, "all" : 3 }[statistics] return KeyTable(self.hc, self._jvdf.pcRelate(k, maf, block_size, min_kinship, intstatistics))
[docs] @handle_py4j @typecheck_method(storage_level=strlike) def persist(self, storage_level="MEMORY_AND_DISK"): """Persist this variant dataset to memory and/or disk. **Examples** Persist the variant dataset to both memory and disk: >>> vds_result = vds.persist() **Notes** The :py:meth:`~hail.VariantDataset.persist` and :py:meth:`~hail.VariantDataset.cache` methods allow you to store the current dataset on disk or in memory to avoid redundant computation and improve the performance of Hail pipelines. :py:meth:`~hail.VariantDataset.cache` is an alias for :func:`persist("MEMORY_ONLY") <hail.VariantDataset.persist>`. Most users will want "MEMORY_AND_DISK". See the `Spark documentation <http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence>`__ for a more in-depth discussion of persisting data. .. warning :: Persist, like all other :class:`.VariantDataset` functions, is functional. Its output must be captured. This is wrong: >>> vds = vds.linreg('sa.phenotype') # doctest: +SKIP >>> vds.persist() # doctest: +SKIP The above code does NOT persist ``vds``. Instead, it copies ``vds`` and persists that result. The proper usage is this: >>> vds = vds.linreg('sa.phenotype').persist() # doctest: +SKIP :param storage_level: Storage level. One of: NONE, DISK_ONLY, DISK_ONLY_2, MEMORY_ONLY, MEMORY_ONLY_2, MEMORY_ONLY_SER, MEMORY_ONLY_SER_2, MEMORY_AND_DISK, MEMORY_AND_DISK_2, MEMORY_AND_DISK_SER, MEMORY_AND_DISK_SER_2, OFF_HEAP :rtype: :class:`.VariantDataset` """ return VariantDataset(self.hc, self._jvdf.persist(storage_level))
[docs] def unpersist(self): """ Unpersists this VDS from memory/disk. **Notes** This function has no effect on a VDS that was not previously persisted. A VDS may still be used after it has been unpersisted, but any subsequent computation will re-execute all of the steps needed to produce it, since the persisted copy is no longer available. Only unpersist a VDS when you are done with it. """ self._jvds.unpersist()
@property @handle_py4j def global_schema(self): """ Returns the signature of the global annotations contained in this VDS. **Examples** >>> print(vds.global_schema) The ``pprint`` module can be used to print the schema in a more human-readable format: >>> from pprint import pprint >>> pprint(vds.global_schema) :rtype: :class:`.Type` """ if self._global_schema is None: self._global_schema = Type._from_java(self._jvds.globalSignature()) return self._global_schema @property @handle_py4j def colkey_schema(self): """ Returns the signature of the column key (sample) contained in this VDS. **Examples** >>> print(vds.colkey_schema) The ``pprint`` module can be used to print the schema in a more human-readable format: >>> from pprint import pprint >>> pprint(vds.colkey_schema) :rtype: :class:`.Type` """ if self._colkey_schema is None: self._colkey_schema = Type._from_java(self._jvds.sSignature()) return self._colkey_schema @property @handle_py4j def sample_schema(self): """ Returns the signature of the sample annotations contained in this VDS. **Examples** >>> print(vds.sample_schema) The ``pprint`` module can be used to print the schema in a more human-readable format: >>> from pprint import pprint >>> pprint(vds.sample_schema) :rtype: :class:`.Type` """ if self._sa_schema is None: self._sa_schema = Type._from_java(self._jvds.saSignature()) return self._sa_schema @property @handle_py4j def rowkey_schema(self): """ Returns the signature of the row key (variant) contained in this VDS. **Examples** >>> print(vds.rowkey_schema) The ``pprint`` module can be used to print the schema in a more human-readable format: >>> from pprint import pprint >>> pprint(vds.rowkey_schema) :rtype: :class:`.Type` """ if self._rowkey_schema is None: self._rowkey_schema = Type._from_java(self._jvds.vSignature()) return self._rowkey_schema @property @handle_py4j def variant_schema(self): """ Returns the signature of the variant annotations contained in this VDS. **Examples** >>> print(vds.variant_schema) The ``pprint`` module can be used to print the schema in a more human-readable format: >>> from pprint import pprint >>> pprint(vds.variant_schema) :rtype: :class:`.Type` """ if self._va_schema is None: self._va_schema = Type._from_java(self._jvds.vaSignature()) return self._va_schema @property @handle_py4j def genotype_schema(self): """ Returns the signature of the genotypes contained in this VDS. **Examples** >>> print(vds.genotype_schema) The ``pprint`` module can be used to print the schema in a more human-readable format: >>> from pprint import pprint >>> pprint(vds.genotype_schema) :rtype: :class:`.Type` """ if self._genotype_schema is None: self._genotype_schema = Type._from_java(self._jvds.genotypeSignature()) return self._genotype_schema
[docs] @handle_py4j @typecheck_method(exprs=oneof(strlike, listof(strlike))) def query_samples_typed(self, exprs): """Performs aggregation queries over samples and sample annotations, and returns Python object(s) and type(s). **Examples** >>> low_callrate_samples, t = vds.query_samples_typed( ... 'samples.filter(s => sa.qc.callRate < 0.95).collect()') See :py:meth:`.query_samples` for more information. :param exprs: query expressions :type exprs: str or list of str :rtype: (annotation or list of annotation, :class:`.Type` or list of :class:`.Type`) """ if isinstance(exprs, list): result_list = self._jvds.querySamples(jarray(Env.jvm().java.lang.String, exprs)) ptypes = [Type._from_java(x._2()) for x in result_list] annotations = [ptypes[i]._convert_to_py(result_list[i]._1()) for i in xrange(len(ptypes))] return annotations, ptypes else: result = self._jvds.querySamples(exprs) t = Type._from_java(result._2()) return t._convert_to_py(result._1()), t
[docs] @handle_py4j @typecheck_method(exprs=oneof(strlike, listof(strlike))) def query_samples(self, exprs): """Performs aggregation queries over samples and sample annotations, and returns Python object(s). **Examples** >>> low_callrate_samples = vds.query_samples('samples.filter(s => sa.qc.callRate < 0.95).collect()') **Notes** This method evaluates Hail expressions over samples and sample annotations. The ``exprs`` argument requires either a single string or a list of strings. If a single string was passed, then a single result is returned. If a list is passed, a list is returned. The namespace of the expressions includes: - ``global``: global annotations - ``samples`` (*Aggregable[Sample]*): aggregable of sample Map and filter expressions on this aggregable have the additional namespace: - ``global``: global annotations - ``s``: sample - ``sa``: sample annotations :param exprs: query expressions :type exprs: str or list of str :rtype: annotation or list of annotation """ r, t = self.query_samples_typed(exprs) return r
[docs] @handle_py4j @typecheck_method(exprs=oneof(strlike, listof(strlike))) def query_variants_typed(self, exprs): """Performs aggregation queries over variants and variant annotations, and returns Python object(s) and type(s). **Examples** >>> lof_variant_count, t = vds.query_variants_typed( ... 'variants.filter(v => va.consequence == "LOF").count()') >>> [lof_variant_count, missense_count], [t1, t2] = vds.query_variants_typed([ ... 'variants.filter(v => va.consequence == "LOF").count()', ... 'variants.filter(v => va.consequence == "Missense").count()']) See :py:meth:`.query_variants` for more information. :param exprs: query expressions :type exprs: str or list of str :rtype: (annotation or list of annotation, :class:`.Type` or list of :class:`.Type`) """ if isinstance(exprs, list): result_list = self._jvds.queryVariants(jarray(Env.jvm().java.lang.String, exprs)) ptypes = [Type._from_java(x._2()) for x in result_list] annotations = [ptypes[i]._convert_to_py(result_list[i]._1()) for i in xrange(len(ptypes))] return annotations, ptypes else: result = self._jvds.queryVariants(exprs) t = Type._from_java(result._2()) return t._convert_to_py(result._1()), t
[docs] @handle_py4j @typecheck_method(exprs=oneof(strlike, listof(strlike))) def query_variants(self, exprs): """Performs aggregation queries over variants and variant annotations, and returns Python object(s). **Examples** >>> lof_variant_count = vds.query_variants('variants.filter(v => va.consequence == "LOF").count()') >>> [lof_variant_count, missense_count] = vds.query_variants([ ... 'variants.filter(v => va.consequence == "LOF").count()', ... 'variants.filter(v => va.consequence == "Missense").count()']) **Notes** This method evaluates Hail expressions over variants and variant annotations. The ``exprs`` argument requires either a single string or a list of strings. If a single string was passed, then a single result is returned. If a list is passed, a list is returned. The namespace of the expressions includes: - ``global``: global annotations - ``variants`` (*Aggregable[Variant]*): aggregable of :ref:`variant` Map and filter expressions on this aggregable have the additional namespace: - ``global``: global annotations - ``v``: :ref:`variant` - ``va``: variant annotations **Performance Note** It is far faster to execute multiple queries in one method than to execute multiple query methods. The combined query: >>> exprs = ['variants.count()', 'variants.filter(v => v.altAllele.isSNP()).count()'] >>> [num_variants, num_snps] = vds.query_variants(exprs) will be nearly twice as fast as the split query: >>> result1 = vds.query_variants('variants.count()') >>> result2 = vds.query_variants('variants.filter(v => v.altAllele.isSNP()).count()') :param exprs: query expressions :type exprs: str or list of str :rtype: annotation or list of annotation """ r, t = self.query_variants_typed(exprs) return r
[docs] @handle_py4j @typecheck_method(exprs=oneof(strlike, listof(strlike))) def query_genotypes_typed(self, exprs): """Performs aggregation queries over genotypes, and returns Python object(s) and type(s). **Examples** >>> gq_hist, t = vds.query_genotypes_typed('gs.map(g => g.gq).hist(0, 100, 100)') >>> [gq_hist, dp_hist], [t1, t2] = vds.query_genotypes_typed(['gs.map(g => g.gq).hist(0, 100, 100)', ... 'gs.map(g => g.dp).hist(0, 60, 60)']) See :py:meth:`.query_genotypes` for more information. This method evaluates Hail expressions over genotypes, along with all variant and sample metadata for that genotype. The ``exprs`` argument requires either a list of strings or a single string. If a list is passed, the method returns a list of results and a list of types; if a single string is passed, it returns a single result and a single type. The namespace of the expressions includes: - ``global``: global annotations - ``gs`` (*Aggregable[Genotype]*): aggregable of :ref:`genotype` Map and filter expressions on this aggregable have the following namespace: - ``global``: global annotations - ``g``: :ref:`genotype` - ``v``: :ref:`variant` - ``va``: variant annotations - ``s``: sample - ``sa``: sample annotations **Performance Note** It is far faster to execute multiple queries in one method than to execute multiple query methods. This: >>> result1 = vds.query_genotypes('gs.count()') >>> result2 = vds.query_genotypes('gs.filter(g => v.altAllele.isSNP() && g.isHet).count()') will be nearly twice as slow as this: >>> exprs = ['gs.count()', 'gs.filter(g => v.altAllele.isSNP() && g.isHet).count()'] >>> [geno_count, snp_hets] = vds.query_genotypes(exprs) :param exprs: query expressions :type exprs: str or list of str :rtype: (annotation or list of annotation, :class:`.Type` or list of :class:`.Type`) """ if isinstance(exprs, list): result_list = self._jvds.queryGenotypes(jarray(Env.jvm().java.lang.String, exprs)) ptypes = [Type._from_java(x._2()) for x in result_list] annotations = [ptypes[i]._convert_to_py(result_list[i]._1()) for i in xrange(len(ptypes))] return annotations, ptypes else: result = self._jvds.queryGenotypes(exprs) t = Type._from_java(result._2()) return t._convert_to_py(result._1()), t
[docs] @handle_py4j @typecheck_method(exprs=oneof(strlike, listof(strlike))) def query_genotypes(self, exprs): """Performs aggregation queries over genotypes, and returns Python object(s). **Examples** Compute global GQ histogram >>> gq_hist = vds.query_genotypes('gs.map(g => g.gq).hist(0, 100, 100)') Compute call rate >>> call_rate = vds.query_genotypes('gs.fraction(g => g.isCalled)') Compute GQ and DP histograms >>> [gq_hist, dp_hist] = vds.query_genotypes(['gs.map(g => g.gq).hist(0, 100, 100)', ... 'gs.map(g => g.dp).hist(0, 60, 60)']) :param exprs: query expressions :type exprs: str or list of str :rtype: annotation or list of annotation """ r, t = self.query_genotypes_typed(exprs) return r
[docs] @handle_py4j @typecheck_method(mapping=dictof(strlike, strlike)) def rename_samples(self, mapping): """Rename samples. **Examples** >>> vds_result = vds.rename_samples({'ID1': 'id1', 'ID2': 'id2'}) Use a file with an "old_id" and "new_id" column to rename samples: >>> mapping_table = hc.import_table('data/sample_mapping.txt') >>> mapping_dict = {row.old_id: row.new_id for row in mapping_table.collect()} >>> vds_result = vds.rename_samples(mapping_dict) :param dict mapping: Mapping from old to new sample IDs. :return: Dataset with remapped sample IDs. :rtype: :class:`.VariantDataset` """ jvds = self._jvds.renameSamples(mapping) return VariantDataset(self.hc, jvds)
[docs] @handle_py4j @typecheck_method(num_partitions=integral, shuffle=bool) def repartition(self, num_partitions, shuffle=True): """Increase or decrease the number of variant dataset partitions. **Examples** Repartition the variant dataset to have 500 partitions: >>> vds_result = vds.repartition(500) **Notes** Check the current number of partitions with :py:meth:`.num_partitions`. The data in a variant dataset is divided into chunks called partitions, which may be stored together or across a network, so that each partition may be read and processed in parallel by available cores. When a variant dataset with :math:`M` variants is first imported, each of the :math:`k` partitions will contain about :math:`M/k` of the variants. Since each partition has some computational overhead, decreasing the number of partitions can improve performance after significant filtering. Since it's recommended to have at least 2 - 4 partitions per core, increasing the number of partitions can allow one to take advantage of more cores. Partitions are a core concept of distributed computation in Spark; see `here <http://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds>`__ for details. With ``shuffle=True``, Hail does a full shuffle of the data and creates equal sized partitions. With ``shuffle=False``, Hail combines existing partitions to avoid a full shuffle. These algorithms correspond to the ``repartition`` and ``coalesce`` commands in Spark, respectively. In particular, when ``shuffle=False``, ``num_partitions`` cannot exceed the current number of partitions. :param int num_partitions: Desired number of partitions, must be less than the current number if ``shuffle=False`` :param bool shuffle: If true, use full shuffle to repartition. :return: Variant dataset with the number of partitions equal to at most ``num_partitions`` :rtype: :class:`.VariantDataset` """ jvds = self._jvdf.coalesce(num_partitions, shuffle) return VariantDataset(self.hc, jvds)
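As an illustration of the 2 - 4 partitions-per-core guideline (a minimal sketch, not part of the original documentation; it assumes the :py:class:`.HailContext` exposes the underlying SparkContext as ``hc.sc``):

>>> n_cores = hc.sc.defaultParallelism        # doctest: +SKIP
>>> vds_result = vds.repartition(4 * n_cores) # doctest: +SKIP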
[docs] @handle_py4j @typecheck_method(max_partitions=integral) def naive_coalesce(self, max_partitions): """Naively decrease the number of partitions. .. warning :: :py:meth:`~hail.VariantDataset.naive_coalesce` simply combines adjacent partitions to achieve the desired number. It does not attempt to rebalance, unlike :py:meth:`~hail.VariantDataset.repartition`, so it can produce a heavily unbalanced dataset. An unbalanced dataset can be inefficient to operate on because the work is not evenly distributed across partitions. :param int max_partitions: Desired number of partitions. If the current number of partitions is less than ``max_partitions``, do nothing. :return: Variant dataset with the number of partitions equal to at most ``max_partitions`` :rtype: :class:`.VariantDataset` """ jvds = self._jvds.naiveCoalesce(max_partitions) return VariantDataset(self.hc, jvds)
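For example (a minimal sketch, not part of the original documentation; it assumes the ``va.consequence`` annotation used elsewhere in these examples and an illustrative target of 10 partitions), heavy filtering followed by a cheap partition merge:

>>> vds_result = (vds.filter_variants_expr('va.consequence == "LOF"', keep=True)
...               .naive_coalesce(10))  # doctest: +SKIP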
[docs] @handle_py4j @typecheck_method(force_block=bool, force_gramian=bool) def rrm(self, force_block=False, force_gramian=False): """Computes the Realized Relationship Matrix (RRM). **Examples** >>> kinship_matrix = vds.rrm() **Notes** The Realized Relationship Matrix is defined as follows. Consider the :math:`n \\times m` matrix :math:`C` of raw genotypes, with rows indexed by :math:`n` samples and columns indexed by the :math:`m` biallelic autosomal variants; :math:`C_{ij}` is the number of alternate alleles of variant :math:`j` carried by sample :math:`i`, which can be 0, 1, 2, or missing. For each variant :math:`j`, the sample alternate allele frequency :math:`p_j` is computed as half the mean of the non-missing entries of column :math:`j`. Entries of :math:`C` are then mean-centered and variance-normalized to give :math:`M`, as .. math:: M_{ij} = \\frac{C_{ij}-2p_j}{\sqrt{\\frac{m}{n} \sum_{k=1}^n (C_{kj}-2p_j)^2}}, with :math:`M_{ij} = 0` for :math:`C_{ij}` missing (i.e. mean genotype imputation). This scaling normalizes each variant column to have empirical variance :math:`1/m`, which gives each sample row approximately unit total variance (assuming linkage equilibrium) and yields the :math:`n \\times n` sample correlation or realized relationship matrix (RRM) :math:`K` as simply .. math:: K = MM^T Note that the only difference between the Realized Relationship Matrix and the Genetic Relationship Matrix (GRM) used in :py:meth:`~hail.VariantDataset.grm` is the variant (column) normalization: where RRM uses empirical variance, GRM uses expected variance under Hardy-Weinberg Equilibrium. :param bool force_block: Force using Spark's BlockMatrix to compute kinship (advanced). :param bool force_gramian: Force using Spark's RowMatrix.computeGramian to compute kinship (advanced). :return: Realized Relationship Matrix for all samples. :rtype: :py:class:`KinshipMatrix` """ return KinshipMatrix(self._jvdf.rrm(force_block, force_gramian))
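The normalization above can be written out directly. The following is a minimal NumPy sketch of the same computation for a small in-memory genotype matrix; it illustrates the formula and is not Hail's distributed implementation, and the monomorphic-column guard is an added assumption.

.. code-block:: python

    import numpy as np

    def realized_relationship_matrix(C):
        """RRM for an n x m float genotype matrix C with entries 0, 1, 2, or np.nan."""
        n, m = C.shape
        p = np.nanmean(C, axis=0) / 2.0        # per-variant alternate allele frequency
        M = C - 2.0 * p                        # mean-center each column
        M[np.isnan(M)] = 0.0                   # mean-impute missing genotypes
        scale = np.sqrt((float(m) / n) * np.sum(M ** 2, axis=0))
        scale[scale == 0.0] = 1.0              # guard against monomorphic columns
        M = M / scale                          # each column now has empirical variance 1/m
        return M.dot(M.T)                      # K = M M^T, an n x n matrix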
[docs] @handle_py4j @typecheck_method(other=vds_type, tolerance=numeric) def same(self, other, tolerance=1e-6): """True if the two variant datasets have the same variants, samples, genotypes, and annotation schemata and values. **Examples** This will return True: >>> vds.same(vds) **Notes** The ``tolerance`` parameter sets the tolerance for equality when comparing floating-point fields. More precisely, :math:`x` and :math:`y` are equal if .. math:: |x - y| \leq tolerance \cdot \max(|x|, |y|) :param other: variant dataset to compare against :type other: :class:`.VariantDataset` :param float tolerance: floating-point tolerance for equality :rtype: bool """ return self._jvds.same(other._jvds, tolerance)
[docs] @handle_py4j @requireTGenotype @typecheck_method(root=strlike, keep_star=bool) def sample_qc(self, root='sa.qc', keep_star=False): """Compute per-sample QC metrics. .. include:: requireTGenotype.rst **Annotations** :py:meth:`~hail.VariantDataset.sample_qc` computes 20 sample statistics from the genotype data and stores the results as sample annotations that can be accessed with ``sa.qc.<identifier>`` (or ``<root>.<identifier>`` if a non-default root was passed): +---------------------------+--------+----------------------------------------------------------+ | Name | Type | Description | +===========================+========+==========================================================+ | ``callRate`` | Double | Fraction of genotypes called | +---------------------------+--------+----------------------------------------------------------+ | ``nHomRef`` | Int | Number of homozygous reference genotypes | +---------------------------+--------+----------------------------------------------------------+ | ``nHet`` | Int | Number of heterozygous genotypes | +---------------------------+--------+----------------------------------------------------------+ | ``nHomVar`` | Int | Number of homozygous alternate genotypes | +---------------------------+--------+----------------------------------------------------------+ | ``nCalled`` | Int | Sum of ``nHomRef`` + ``nHet`` + ``nHomVar`` | +---------------------------+--------+----------------------------------------------------------+ | ``nNotCalled`` | Int | Number of uncalled genotypes | +---------------------------+--------+----------------------------------------------------------+ | ``nSNP`` | Int | Number of SNP alternate alleles | +---------------------------+--------+----------------------------------------------------------+ | ``nInsertion`` | Int | Number of insertion alternate alleles | +---------------------------+--------+----------------------------------------------------------+ | ``nDeletion`` | Int | Number of deletion alternate alleles | +---------------------------+--------+----------------------------------------------------------+ | ``nSingleton`` | Int | Number of private alleles | +---------------------------+--------+----------------------------------------------------------+ | ``nTransition`` | Int | Number of transition (A-G, C-T) alternate alleles | +---------------------------+--------+----------------------------------------------------------+ | ``nTransversion`` | Int | Number of transversion alternate alleles | +---------------------------+--------+----------------------------------------------------------+ | ``nNonRef`` | Int | Sum of ``nHet`` and ``nHomVar`` | +---------------------------+--------+----------------------------------------------------------+ | ``rTiTv`` | Double | Transition/Transversion ratio | +---------------------------+--------+----------------------------------------------------------+ | ``rHetHomVar`` | Double | Het/HomVar genotype ratio | +---------------------------+--------+----------------------------------------------------------+ | ``rInsertionDeletion`` | Double | Insertion/Deletion ratio | +---------------------------+--------+----------------------------------------------------------+ | ``dpMean`` | Double | Depth mean across all genotypes | +---------------------------+--------+----------------------------------------------------------+ | ``dpStDev`` | Double | Depth standard deviation across all genotypes | 
+---------------------------+--------+----------------------------------------------------------+ | ``gqMean`` | Double | The average genotype quality across all genotypes | +---------------------------+--------+----------------------------------------------------------+ | ``gqStDev`` | Double | Genotype quality standard deviation across all genotypes | +---------------------------+--------+----------------------------------------------------------+ Missing values ``NA`` may result (for example, due to division by zero) and are handled properly in filtering and written as "NA" in export modules. The empirical standard deviation is computed with zero degrees of freedom. :param str root: Sample annotation root for the computed struct. :param bool keep_star: Count star alleles as non-reference alleles :return: Annotated variant dataset with new sample qc annotations. :rtype: :class:`.VariantDataset` """ return VariantDataset(self.hc, self._jvdf.sampleQC(root, keep_star))
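As a usage illustration (a minimal sketch, not part of the original documentation; the 0.95 call rate and mean depth of 20 are illustrative thresholds, not recommendations), the computed annotations are typically used to filter samples:

>>> vds_qc = vds.sample_qc()
>>> vds_filtered = vds_qc.filter_samples_expr(
...     'sa.qc.callRate >= 0.95 && sa.qc.dpMean >= 20', keep=True)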
[docs] @handle_py4j def storage_level(self): """Returns the storage (persistence) level of the variant dataset. **Notes** See the `Spark documentation <http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence>`__ for details on persistence levels. :rtype: str """ return self._jvds.storageLevel()
[docs] @handle_py4j @requireTGenotype def summarize(self): """Returns a summary of useful information about the dataset. .. include:: requireTGenotype.rst **Examples** >>> s = vds.summarize() >>> print(s.contigs) >>> print('call rate is %.2f' % s.call_rate) >>> s.report() The following information is contained in the summary: - **samples** (*int*) - Number of samples. - **variants** (*int*) - Number of variants. - **call_rate** (*float*) - Fraction of all genotypes called. - **contigs** (*list of str*) - List of all unique contigs found in the dataset. - **multiallelics** (*int*) - Number of multiallelic variants. - **snps** (*int*) - Number of SNP alternate alleles. - **mnps** (*int*) - Number of MNP alternate alleles. - **insertions** (*int*) - Number of insertion alternate alleles. - **deletions** (*int*) - Number of deletion alternate alleles. - **complex** (*int*) - Number of complex alternate alleles. - **star** (*int*) - Number of star (upstream deletion) alternate alleles. - **max_alleles** (*int*) - The highest number of alleles at any variant. :return: Object containing summary information. :rtype: :class:`~hail.utils.Summary` """ js = self._jvdf.summarize() return Summary._from_java(js)
[docs] @handle_py4j @typecheck_method(ann_path=strlike, attributes=dictof(strlike, strlike)) def set_va_attributes(self, ann_path, attributes): """Sets attributes for a variant annotation. Attributes are key/value pairs that can be attached to a variant annotation field. The following attributes are read from the VCF header when importing a VCF and written to the VCF header when exporting a VCF: - INFO field attributes (attached to (`va.info.*`)): - 'Number': The arity of the field. Can take values - `0` (Boolean flag), - `1` (single value), - `R` (one value per allele, including the reference), - `A` (one value per non-reference allele), - `G` (one value per genotype), and - `.` (any number of values) - When importing: The value is read from the VCF INFO field definition - When exporting: The default value is `0` for **Boolean**, `.` for **Arrays** and `1` for all other types - 'Description' (default is '') - FILTER entries in the VCF header are generated based on the attributes of `va.filters`. Each key/value pair in the attributes will generate a FILTER entry in the VCF with ID = key and Description = value. **Examples** Consider the following command, which adds an annotation and a filter to the VDS (we're assuming a split VDS for simplicity): 1) an INFO field `AC_HC`, which stores the allele count of high confidence genotypes (DP >= 10, GQ >= 20) for each non-reference allele, 2) a filter `HardFilter` that filters all sites with the `GATK suggested hard filters <http://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set>`__: - For SNVs: QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0 - For Indels (and other complex): QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0 >>> annotated_vds = vds.annotate_variants_expr([ ... 'va.info.AC_HC = gs.filter(g => g.dp >= 10 && g.gq >= 20).callStats(g => v).AC[1:]', ... 'va.filters = if((v.altAllele.isSNP && (va.info.QD < 2.0 || va.info.FS > 60.0 || va.info.MQ < 40.0 || ' + ... 'va.info.MQRankSum < -12.5 || va.info.ReadPosRankSum < -8.0)) || ' + ... '(!v.altAllele.isSNP && (va.info.QD < 2.0 || va.info.FS > 200.0 || va.info.ReadPosRankSum < -20.0))) va.filters.add("HardFilter") else va.filters']) If we now export this VDS as VCF, it would produce the following header (for these new fields): .. code-block:: text ##INFO=<ID=AC_HC,Number=.,Type=String,Description=""> This header doesn't contain all information that should be present in an optimal VCF header: 1) There is no FILTER entry for `HardFilter` 2) Since `AC_HC` has one entry per non-reference allele, its `Number` should be `A` 3) `AC_HC` should have a Description We can fix this by setting the attributes of these fields: >>> annotated_vds = (annotated_vds ... .set_va_attributes( ... 'va.info.AC_HC', ... {'Description': 'Allele count for high quality genotypes (DP >= 10, GQ >= 20)', ... 'Number': 'A'}) ... .set_va_attributes( ... 'va.filters', ... {'HardFilter': 'This site fails GATK suggested hard filters.'})) Exporting the VDS with the attributes now prints the following header lines: .. code-block:: text ##INFO=<ID=AC_HC,Number=A,Type=String,Description="Allele count for high quality genotypes (DP >= 10, GQ >= 20)" ##FILTER=<ID=HardFilter,Description="This site fails GATK suggested hard filters."> :param str ann_path: Path to variant annotation beginning with `va`. :param dict attributes: A str-str dict containing the attributes to set :return: Annotated dataset with the attribute added to the variant annotation.
:rtype: :class:`.VariantDataset` """ return VariantDataset(self.hc, self._jvds.setVaAttributes(ann_path, Env.jutils().javaMapToMap(attributes)))
[docs] @handle_py4j @typecheck_method(ann_path=strlike, attribute=strlike) def delete_va_attribute(self, ann_path, attribute): """Removes an attribute from a variant annotation field. Attributes are key/value pairs that can be attached to a variant annotation field. The following attributes are read from the VCF header when importing a VCF and written to the VCF header when exporting a VCF: - INFO field attributes (attached to (`va.info.*`)): - 'Number': The arity of the field. Can take values - `0` (Boolean flag), - `1` (single value), - `R` (one value per allele, including the reference), - `A` (one value per non-reference allele), - `G` (one value per genotype), and - `.` (any number of values) - When importing: The value is read from the VCF INFO field definition - When exporting: The default value is `0` for **Boolean**, `.` for **Arrays** and `1` for all other types - 'Description' (default is '') - FILTER entries in the VCF header are generated based on the attributes of `va.filters`. Each key/value pair in the attributes will generate a FILTER entry in the VCF with ID = key and Description = value. :param str ann_path: Variant annotation path starting with 'va', period-delimited. :param str attribute: The attribute to remove (key). :return: Annotated dataset with the updated variant annotation without the attribute. :rtype: :class:`.VariantDataset` """ return VariantDataset(self.hc, self._jvds.deleteVaAttribute(ann_path, attribute))
[docs] @handle_py4j @requireTGenotype @typecheck_method(propagate_gq=bool, keep_star_alleles=bool, max_shift=integral) def split_multi(self, propagate_gq=False, keep_star_alleles=False, max_shift=100): """Split multiallelic variants. .. include:: requireTGenotype.rst **Examples** >>> vds.split_multi().write('output/split.vds') **Notes** We will explain by example. Consider a hypothetical 3-allelic variant: .. code-block:: text A C,T 0/2:7,2,6:15:45:99,50,99,0,45,99 split_multi will create two biallelic variants (one for each alternate allele) at the same position .. code-block:: text A C 0/0:13,2:15:45:0,45,99 A T 0/1:9,6:15:50:50,0,99 Each multiallelic GT field is downcoded once for each alternate allele. A call for an alternate allele maps to 1 in the biallelic variant corresponding to itself and 0 otherwise. For example, in the example above, 0/2 maps to 0/0 and 0/1. The genotype 1/2 maps to 0/1 and 0/1. The biallelic alt AD entry is just the multiallelic AD entry corresponding to the alternate allele. The ref AD entry is the sum of the other multiallelic entries. The biallelic DP is the same as the multiallelic DP. The biallelic PL entry for a genotype g is the minimum over PL entries for multiallelic genotypes that downcode to g. For example, the PL for (A, T) at 0/1 is the minimum of the PLs for 0/1 (50) and 1/2 (45), and thus 45. Fixing an alternate allele and biallelic variant, downcoding gives a map from multiallelic to biallelic alleles and genotypes. The biallelic AD entry for an allele is just the sum of the multiallelic AD entries for alleles that map to that allele. Similarly, the biallelic PL entry for a genotype is the minimum over multiallelic PL entries for genotypes that map to that genotype. By default, GQ is recomputed from PL. If ``propagate_gq=True`` is passed, the biallelic GQ field is simply the multiallelic GQ field, that is, genotype qualities are unchanged. Here is a second example for a het non-ref .. code-block:: text A C,T 1/2:2,8,6:16:45:99,50,99,45,0,99 splits as .. code-block:: text A C 0/1:8,8:16:45:45,0,99 A T 0/1:10,6:16:50:50,0,99 **VCF Info Fields** Hail does not split annotations in the info field. This means that if a multiallelic site with ``info.AC`` value ``[10, 2]`` is split, each split site will contain the same array ``[10, 2]``. The provided allele index annotation ``va.aIndex`` can be used to select the value corresponding to the split allele's position: >>> vds_result = (vds.split_multi() ... .filter_variants_expr('va.info.AC[va.aIndex - 1] < 10', keep = False)) VCFs split by Hail and exported to new VCFs may be incompatible with other tools, if action is not taken first. Since the "Number" of the arrays in split multiallelic sites no longer matches the structure on import ("A" for 1 per allele, for example), Hail will export these fields with number ".". If the desired output is one value per site, then it is possible to use annotate_variants_expr to remap these values. Here is an example: >>> (vds.split_multi() ... .annotate_variants_expr('va.info.AC = va.info.AC[va.aIndex - 1]') ... .export_vcf('output/export.vcf')) The info field AC in *output/export.vcf* will have ``Number=1``. **Annotations** :py:meth:`~hail.VariantDataset.split_multi` adds the following annotations: - **va.wasSplit** (*Boolean*) -- true if this variant was originally multiallelic, otherwise false.
- **va.aIndex** (*Int*) -- The original index of this alternate allele in the multiallelic representation (NB: 1 is the first alternate allele or the only alternate allele in a biallelic variant). For example, 1:100:A:T,C splits into two variants: 1:100:A:T with ``aIndex = 1`` and 1:100:A:C with ``aIndex = 2``. :param bool propagate_gq: Set the GQ of output (split) genotypes to be the GQ of the input (multi-allelic) genotypes instead of recomputing GQ as the difference between the two smallest PL values. Intended to be used in conjunction with ``import_vcf(store_gq=True)``. This option will be obviated in the future by generic genotype schemas. Experimental. :param bool keep_star_alleles: Do not filter out * alleles. :param int max_shift: maximum number of base pairs by which a split variant can move. Affects memory usage, and will cause Hail to throw an error if a variant that moves further is encountered. :return: A biallelic variant dataset. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvdf.splitMulti(propagate_gq, keep_star_alleles, max_shift) return VariantDataset(self.hc, jvds)
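The PL downcoding rule described above can be illustrated in plain Python. This is a minimal sketch of the min-over-downcoded-genotypes rule for a single alternate allele, not Hail's implementation; the helper name and the assumption of standard VCF genotype ordering are mine.

.. code-block:: python

    from itertools import combinations_with_replacement

    def downcode_pl(pl, n_alts, alt):
        """Biallelic PLs for 1-based alternate allele `alt`, given multiallelic
        PLs in VCF genotype order (0/0, 0/1, 1/1, 0/2, 1/2, 2/2, ...)."""
        genotypes = combinations_with_replacement(range(n_alts + 1), 2)
        # VCF orders genotype j/k (j <= k) at index k*(k+1)/2 + j.
        vcf_order = sorted(genotypes, key=lambda g: (g[1], g[0]))
        out = [float('inf')] * 3
        for (j, k), p in zip(vcf_order, pl):
            dc = (j == alt) + (k == alt)   # downcoded alt-allele count: 0, 1, or 2
            out[dc] = min(out[dc], p)
        return out

    # Reproduces the first example above: A C,T with PL 99,50,99,0,45,99
    assert downcode_pl([99, 50, 99, 0, 45, 99], n_alts=2, alt=1) == [0, 45, 99]
    assert downcode_pl([99, 50, 99, 0, 45, 99], n_alts=2, alt=2) == [50, 0, 99]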
[docs] @handle_py4j @requireTGenotype @typecheck_method(pedigree=Pedigree, root=strlike) def tdt(self, pedigree, root='va.tdt'): """Find transmitted and untransmitted variants; count per variant and nuclear family. .. include:: requireTGenotype.rst **Examples** Compute TDT association results: >>> pedigree = Pedigree.read('data/trios.fam') >>> (vds.tdt(pedigree) ... .export_variants("output/tdt_results.tsv", "Variant = v, va.tdt.*")) **Notes** The transmission disequilibrium test tracks the number of times the alternate allele is transmitted (t) or not transmitted (u) from a heterozygous parent to an affected child under the null that the rate of such transmissions is 0.5. For variants where transmission is guaranteed (i.e., the Y chromosome, mitochondria, and paternal chromosome X variants outside of the PAR), the test cannot be used. The TDT statistic is given by .. math:: (t-u)^2 \over (t+u) and follows a 1 degree of freedom chi-squared distribution under the null hypothesis. The number of transmissions and untransmissions for each possible set of genotypes is determined from the table below. The copy state of a locus with respect to a trio is defined as follows, where PAR is the pseudoautosomal region. - HemiX -- in non-PAR of X and child is male - Auto -- otherwise (in autosome or PAR, or child is female) +--------+--------+--------+------------+---+---+ | Kid | Dad | Mom | Copy State | T | U | +========+========+========+============+===+===+ | HomRef | Het | Het | Auto | 0 | 2 | +--------+--------+--------+------------+---+---+ | HomRef | HomRef | Het | Auto | 0 | 1 | +--------+--------+--------+------------+---+---+ | HomRef | Het | HomRef | Auto | 0 | 1 | +--------+--------+--------+------------+---+---+ | Het | Het | Het | Auto | 1 | 1 | +--------+--------+--------+------------+---+---+ | Het | HomRef | Het | Auto | 1 | 0 | +--------+--------+--------+------------+---+---+ | Het | Het | HomRef | Auto | 1 | 0 | +--------+--------+--------+------------+---+---+ | Het | HomVar | Het | Auto | 0 | 1 | +--------+--------+--------+------------+---+---+ | Het | Het | HomVar | Auto | 0 | 1 | +--------+--------+--------+------------+---+---+ | HomVar | Het | Het | Auto | 2 | 0 | +--------+--------+--------+------------+---+---+ | HomVar | Het | HomVar | Auto | 1 | 0 | +--------+--------+--------+------------+---+---+ | HomVar | HomVar | Het | Auto | 1 | 0 | +--------+--------+--------+------------+---+---+ | HomRef | HomRef | Het | HemiX | 0 | 1 | +--------+--------+--------+------------+---+---+ | HomRef | HomVar | Het | HemiX | 0 | 1 | +--------+--------+--------+------------+---+---+ | HomVar | HomRef | Het | HemiX | 1 | 0 | +--------+--------+--------+------------+---+---+ | HomVar | HomVar | Het | HemiX | 1 | 0 | +--------+--------+--------+------------+---+---+ :py:meth:`~hail.VariantDataset.tdt` only considers complete trios (two parents and a proband) with defined sex. PAR is currently defined with respect to reference `GRCh37 <http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/human/>`__: - X: 60001-2699520 - X: 154931044-155260560 - Y: 10001-2649520 - Y: 59034050-59363566 :py:meth:`~hail.VariantDataset.tdt` assumes all contigs apart from X and Y are fully autosomal; decoys, etc. are not given special treatment. **Annotations** :py:meth:`~hail.VariantDataset.tdt` adds the following annotations: - **va.tdt.nTransmitted** (*Int*) -- Number of transmitted alternate alleles. - **va.tdt.nUntransmitted** (*Int*) -- Number of untransmitted alternate alleles.
- **va.tdt.chi2** (*Double*) -- TDT statistic. - **va.tdt.pval** (*Double*) -- p-value. :param pedigree: Sample pedigree. :type pedigree: :class:`~hail.representation.Pedigree` :param root: Variant annotation root to store TDT result. :return: Variant dataset with TDT association results added to variant annotations. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvdf.tdt(pedigree._jrep, root) return VariantDataset(self.hc, jvds)
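As a worked illustration of the statistic above (a minimal sketch, not part of Hail; it assumes SciPy is available and uses illustrative counts), the chi-squared statistic and p-value for a variant with observed transmission counts can be computed directly:

.. code-block:: python

    from scipy.stats import chi2

    def tdt_test(t, u):
        """TDT statistic (t - u)^2 / (t + u) and its 1-df chi-squared p-value."""
        if t + u == 0:
            return float('nan'), float('nan')   # no informative transmissions
        stat = (t - u) ** 2 / float(t + u)
        return stat, chi2.sf(stat, df=1)

    # e.g. 20 transmissions and 10 untransmissions of the alternate allele
    stat, pval = tdt_test(20, 10)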
@handle_py4j def _typecheck(self): """Check if all sample, variant and global annotations are consistent with the schema.""" self._jvds.typecheck()
[docs] @handle_py4j @requireTGenotype @typecheck_method(root=strlike) def variant_qc(self, root='va.qc'): """Compute common variant statistics (quality control metrics). .. include:: requireTGenotype.rst **Examples** >>> vds_result = vds.variant_qc() .. _variantqc_annotations: **Annotations** :py:meth:`~hail.VariantDataset.variant_qc` computes 18 variant statistics from the genotype data and stores the results as variant annotations that can be accessed with ``va.qc.<identifier>`` (or ``<root>.<identifier>`` if a non-default root was passed): +---------------------------+--------+--------------------------------------------------------+ | Name | Type | Description | +===========================+========+========================================================+ | ``callRate`` | Double | Fraction of samples with called genotypes | +---------------------------+--------+--------------------------------------------------------+ | ``AF`` | Double | Calculated alternate allele frequency (q) | +---------------------------+--------+--------------------------------------------------------+ | ``AC`` | Int | Count of alternate alleles | +---------------------------+--------+--------------------------------------------------------+ | ``rHeterozygosity`` | Double | Proportion of heterozygotes | +---------------------------+--------+--------------------------------------------------------+ | ``rHetHomVar`` | Double | Ratio of heterozygotes to homozygous alternates | +---------------------------+--------+--------------------------------------------------------+ | ``rExpectedHetFrequency`` | Double | Expected rHeterozygosity based on HWE | +---------------------------+--------+--------------------------------------------------------+ | ``pHWE`` | Double | p-value from Hardy Weinberg Equilibrium null model | +---------------------------+--------+--------------------------------------------------------+ | ``nHomRef`` | Int | Number of homozygous reference samples | +---------------------------+--------+--------------------------------------------------------+ | ``nHet`` | Int | Number of heterozygous samples | +---------------------------+--------+--------------------------------------------------------+ | ``nHomVar`` | Int | Number of homozygous alternate samples | +---------------------------+--------+--------------------------------------------------------+ | ``nCalled`` | Int | Sum of ``nHomRef``, ``nHet``, and ``nHomVar`` | +---------------------------+--------+--------------------------------------------------------+ | ``nNotCalled`` | Int | Number of uncalled samples | +---------------------------+--------+--------------------------------------------------------+ | ``nNonRef`` | Int | Sum of ``nHet`` and ``nHomVar`` | +---------------------------+--------+--------------------------------------------------------+ | ``rHetHomVar`` | Double | Het/HomVar ratio across all samples | +---------------------------+--------+--------------------------------------------------------+ | ``dpMean`` | Double | Depth mean across all samples | +---------------------------+--------+--------------------------------------------------------+ | ``dpStDev`` | Double | Depth standard deviation across all samples | +---------------------------+--------+--------------------------------------------------------+ | ``gqMean`` | Double | The average genotype quality across all samples | +---------------------------+--------+--------------------------------------------------------+ | ``gqStDev`` | Double | Genotype quality standard 
deviation across all samples | +---------------------------+--------+--------------------------------------------------------+ Missing values ``NA`` may result (for example, due to division by zero) and are handled properly in filtering and written as "NA" in export modules. The empirical standard deviation is computed with zero degrees of freedom. :param str root: Variant annotation root for computed struct. :return: Annotated variant dataset with new variant QC annotations. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvdf.variantQC(root) return VariantDataset(self.hc, jvds)
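As a usage illustration (a minimal sketch, not part of the original documentation; the 0.95 call rate and 1e-6 Hardy-Weinberg cutoffs are illustrative, not recommendations), the computed annotations are typically used to filter variants:

>>> vds_qc = vds.variant_qc()
>>> vds_filtered = vds_qc.filter_variants_expr(
...     'va.qc.callRate >= 0.95 && va.qc.pHWE > 1e-6', keep=True)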
[docs] @handle_py4j @typecheck_method(config=strlike, block_size=integral, root=strlike, csq=bool) def vep(self, config, block_size=1000, root='va.vep', csq=False): """Annotate variants with VEP. :py:meth:`~hail.VariantDataset.vep` runs `Variant Effect Predictor <http://www.ensembl.org/info/docs/tools/vep/index.html>`__ with the `LOFTEE plugin <https://github.com/konradjk/loftee>`__ on the current variant dataset and adds the result as a variant annotation. **Examples** Add VEP annotations to the dataset: >>> vds_result = vds.vep("data/vep.properties") # doctest: +SKIP **Configuration** :py:meth:`~hail.VariantDataset.vep` needs a configuration file to tell it how to run VEP. The format is a `.properties file <https://en.wikipedia.org/wiki/.properties>`__. Roughly, each line defines a property as a key-value pair of the form `key = value`. `vep` supports the following properties: - **hail.vep.perl** -- Location of Perl. Optional, default: perl. - **hail.vep.perl5lib** -- Value for the PERL5LIB environment variable when invoking VEP. Optional, by default PERL5LIB is not set. - **hail.vep.path** -- Value of the PATH environment variable when invoking VEP. Optional, by default PATH is not set. - **hail.vep.location** -- Location of the VEP Perl script. Required. - **hail.vep.cache_dir** -- Location of the VEP cache dir, passed to VEP with the `--dir` option. Required. - **hail.vep.fasta** -- Location of the FASTA file to use to look up the reference sequence, passed to VEP with the `--fasta` option. Required. - **hail.vep.assembly** -- Genome assembly version to use. Optional, default: GRCh37 - **hail.vep.plugin** -- VEP plugin, passed to VEP with the `--plugin` option. Optional. Overrides `hail.vep.lof.human_ancestor` and `hail.vep.lof.conservation_file`. - **hail.vep.lof.human_ancestor** -- Location of the human ancestor file for the LOFTEE plugin. Ignored if `hail.vep.plugin` is set. Required otherwise. - **hail.vep.lof.conservation_file** -- Location of the conservation file for the LOFTEE plugin. Ignored if `hail.vep.plugin` is set. Required otherwise. Here is an example `vep.properties` configuration file .. code-block:: text hail.vep.perl = /usr/bin/perl hail.vep.path = /usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin hail.vep.location = /path/to/vep/ensembl-tools-release-81/scripts/variant_effect_predictor/variant_effect_predictor.pl hail.vep.cache_dir = /path/to/vep hail.vep.lof.human_ancestor = /path/to/loftee_data/human_ancestor.fa.gz hail.vep.lof.conservation_file = /path/to/loftee_data//phylocsf.sql **VEP Invocation** .. code-block:: text <hail.vep.perl> <hail.vep.location> --format vcf --json --everything --allele_number --no_stats --cache --offline --dir <hail.vep.cache_dir> --fasta <hail.vep.fasta> --minimal --assembly <hail.vep.assembly> --plugin LoF,human_ancestor_fa:$<hail.vep.lof.human_ancestor>,filter_position:0.05,min_intron_size:15,conservation_file:<hail.vep.lof.conservation_file> -o STDOUT **Annotations** Annotations with the following schema are placed in the location specified by ``root``. The full resulting dataset schema can be queried with :py:attr:`~hail.VariantDataset.variant_schema`. .. 
code-block:: text Struct{ assembly_name: String, allele_string: String, colocated_variants: Array[Struct{ aa_allele: String, aa_maf: Double, afr_allele: String, afr_maf: Double, allele_string: String, amr_allele: String, amr_maf: Double, clin_sig: Array[String], end: Int, eas_allele: String, eas_maf: Double, ea_allele: String,, ea_maf: Double, eur_allele: String, eur_maf: Double, exac_adj_allele: String, exac_adj_maf: Double, exac_allele: String, exac_afr_allele: String, exac_afr_maf: Double, exac_amr_allele: String, exac_amr_maf: Double, exac_eas_allele: String, exac_eas_maf: Double, exac_fin_allele: String, exac_fin_maf: Double, exac_maf: Double, exac_nfe_allele: String, exac_nfe_maf: Double, exac_oth_allele: String, exac_oth_maf: Double, exac_sas_allele: String, exac_sas_maf: Double, id: String, minor_allele: String, minor_allele_freq: Double, phenotype_or_disease: Int, pubmed: Array[Int], sas_allele: String, sas_maf: Double, somatic: Int, start: Int, strand: Int }], end: Int, id: String, input: String, intergenic_consequences: Array[Struct{ allele_num: Int, consequence_terms: Array[String], impact: String, minimised: Int, variant_allele: String }], most_severe_consequence: String, motif_feature_consequences: Array[Struct{ allele_num: Int, consequence_terms: Array[String], high_inf_pos: String, impact: String, minimised: Int, motif_feature_id: String, motif_name: String, motif_pos: Int, motif_score_change: Double, strand: Int, variant_allele: String }], regulatory_feature_consequences: Array[Struct{ allele_num: Int, biotype: String, consequence_terms: Array[String], impact: String, minimised: Int, regulatory_feature_id: String, variant_allele: String }], seq_region_name: String, start: Int, strand: Int, transcript_consequences: Array[Struct{ allele_num: Int, amino_acids: String, biotype: String, canonical: Int, ccds: String, cdna_start: Int, cdna_end: Int, cds_end: Int, cds_start: Int, codons: String, consequence_terms: Array[String], distance: Int, domains: Array[Struct{ db: String name: String }], exon: String, gene_id: String, gene_pheno: Int, gene_symbol: String, gene_symbol_source: String, hgnc_id: String, hgvsc: String, hgvsp: String, hgvs_offset: Int, impact: String, intron: String, lof: String, lof_flags: String, lof_filter: String, lof_info: String, minimised: Int, polyphen_prediction: String, polyphen_score: Double, protein_end: Int, protein_start: Int, protein_id: String, sift_prediction: String, sift_score: Double, strand: Int, swissprot: String, transcript_id: String, trembl: String, uniparc: String, variant_allele: String }], variant_class: String } :param str config: Path to VEP configuration file. :param block_size: Number of variants to annotate per VEP invocation. :type block_size: int :param str root: Variant annotation path to store VEP output. :param bool csq: If ``True``, annotates VCF CSQ field as a String. If ``False``, annotates with the full nested struct schema :return: An annotated with variant annotations from VEP. :rtype: :py:class:`.VariantDataset` """ jvds = self._jvds.vep(config, root, csq, block_size) return VariantDataset(self.hc, jvds)
[docs] @handle_py4j def variants_table(self): """Convert variants and variant annotations to a KeyTable. The resulting KeyTable has schema: .. code-block:: text Struct { v: Variant va: variant annotations } with a single key ``v``. :return: Key table with variants and variant annotations. :rtype: :class:`.KeyTable` """ return KeyTable(self.hc, self._jvds.variantsKT())
[docs] @handle_py4j def samples_table(self): """Convert samples and sample annotations to KeyTable. The resulting KeyTable has schema: .. code-block:: text Struct { s: Sample sa: sample annotations } with a single key ``s``. :return: Key table with samples and sample annotations. :rtype: :class:`.KeyTable` """ return KeyTable(self.hc, self._jvds.samplesKT())
[docs] @handle_py4j def genotypes_table(self): """Generate a fully expanded genotype table. **Examples** >>> gs = vds.genotypes_table() **Notes** This produces a (massive) flat table from all the genotypes in the dataset. The table has columns: - **v** (*Variant*) - Variant (key column). - **va** (*Variant annotation schema*) - Variant annotations. - **s** (*String*) - Sample ID (key column). - **sa** (*Sample annotation schema*) - Sample annotations. - **g** (*Genotype schema*) - Genotype or generic genotype. .. caution:: This table has a row for each variant/sample pair. The genotype key table for a dataset with 10M variants and 10K samples will contain 100 billion rows. Writing or exporting this table will produce a file **much** larger than the equivalent VDS. :return: Key table with a row for each genotype. :rtype: :class:`.KeyTable` """ return KeyTable(self.hc, self._jvds.genotypeKT())
[docs] @handle_py4j @typecheck_method(variant_expr=oneof(strlike, listof(strlike)), genotype_expr=oneof(strlike, listof(strlike)), key=oneof(strlike, listof(strlike)), separator=strlike) def make_table(self, variant_expr, genotype_expr, key=[], separator='.'): """Produce a key table with one row per variant and one or more columns per sample. **Examples** Consider a :py:class:`VariantDataset` ``vds`` with 2 variants and 3 samples: .. code-block:: text Variant FORMAT A B C 1:1:A:T GT:GQ 0/1:99 ./. 0/0:99 1:2:G:C GT:GQ 0/1:89 0/1:99 1/1:93 Then >>> kt = vds.make_table('v = v', ['gt = g.gt', 'gq = g.gq']) returns a :py:class:`KeyTable` with schema .. code-block:: text v: Variant A.gt: Int A.gq: Int B.gt: Int B.gq: Int C.gt: Int C.gq: Int and values .. code-block:: text v A.gt A.gq B.gt B.gq C.gt C.gq 1:1:A:T 1 99 NA NA 0 99 1:2:G:C 1 89 1 99 2 93 The above table can be generated and exported as a TSV using :class:`.KeyTable` :py:meth:`~hail.KeyTable.export`, as shown below. **Notes** Per-sample field names in the result are formed by concatenating the sample ID with the ``genotype_expr`` left hand side with ``separator``. If the left hand side is empty:: `` = expr then the dot (.) is omitted. :param variant_expr: Variant annotation expressions. :type variant_expr: str or list of str :param genotype_expr: Genotype annotation expressions. :type genotype_expr: str or list of str :param key: List of key columns. :type key: str or list of str :param str separator: Separator to use between sample IDs and genotype expression left-hand side identifiers. :rtype: :py:class:`.KeyTable` """ if isinstance(variant_expr, list): variant_expr = ','.join(variant_expr) if isinstance(genotype_expr, list): genotype_expr = ','.join(genotype_expr) jkt = self._jvds.makeKT(variant_expr, genotype_expr, jarray(Env.jvm().java.lang.String, wrap_to_list(key)), separator) return KeyTable(self.hc, jkt)
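For example (a minimal sketch, not part of the original documentation; the output path is illustrative), the generated table can be written to a TSV:

>>> kt = vds.make_table('v = v', ['gt = g.gt', 'gq = g.gq'], key='v')
>>> kt.export('output/genotype_matrix.tsv')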
vds_type.set(VariantDataset)