Source code for hail.methods.statgen

import itertools
import math
import numpy as np
from typing import *

import hail as hl
import hail.expr.aggregators as agg
from hail.expr.expressions import *
from hail.expr.types import *
from hail.ir import *
from hail.genetics.reference_genome import reference_genome_type
from hail.linalg import BlockMatrix
from hail.matrixtable import MatrixTable
from hail.methods.misc import require_biallelic, require_row_key_variant, require_col_key_str
from hail.stats import LinearMixedModel
from hail.table import Table
from hail.typecheck import *
from hail.utils import wrap_to_list, new_temp_file
from hail.utils.java import *


[docs]@typecheck(dataset=MatrixTable, maf=nullable(expr_float64), bounded=bool, min=nullable(numeric), max=nullable(numeric)) def identity_by_descent(dataset, maf=None, bounded=True, min=None, max=None) -> Table: """Compute matrix of identity-by-descent estimates. .. include:: ../_templates/req_tstring.rst .. include:: ../_templates/req_tvariant.rst .. include:: ../_templates/req_biallelic.rst Examples -------- To calculate a full IBD matrix, using minor allele frequencies computed from the dataset itself: >>> hl.identity_by_descent(dataset) To calculate an IBD matrix containing only pairs of samples with ``PI_HAT`` in :math:`[0.2, 0.9]`, using minor allele frequencies stored in the row field `panel_maf`: >>> hl.identity_by_descent(dataset, maf=dataset['panel_maf'], min=0.2, max=0.9) Notes ----- The dataset must have a column field named `s` which is a :class:`.StringExpression` and which uniquely identifies a column. The implementation is based on the IBD algorithm described in the `PLINK paper <http://www.ncbi.nlm.nih.gov/pmc/articles/PMC1950838>`__. :func:`.identity_by_descent` requires the dataset to be biallelic and does not perform LD pruning. Linkage disequilibrium may bias the result so consider filtering variants first. The resulting :class:`.Table` entries have the type: *{ i: String, j: String, ibd: { Z0: Double, Z1: Double, Z2: Double, PI_HAT: Double }, ibs0: Long, ibs1: Long, ibs2: Long }*. The key list is: `*i: String, j: String*`. Conceptually, the output is a symmetric, sample-by-sample matrix. The output table has the following form .. code-block:: text i j ibd.Z0 ibd.Z1 ibd.Z2 ibd.PI_HAT ibs0 ibs1 ibs2 sample1 sample2 1.0000 0.0000 0.0000 0.0000 ... sample1 sample3 1.0000 0.0000 0.0000 0.0000 ... sample1 sample4 0.6807 0.0000 0.3193 0.3193 ... sample1 sample5 0.1966 0.0000 0.8034 0.8034 ... Parameters ---------- dataset : :class:`.MatrixTable` Variant-keyed and sample-keyed :class:`.MatrixTable` containing genotype information. maf : :class:`.Float64Expression`, optional Row-indexed expression for the minor allele frequency. bounded : :obj:`bool` Forces the estimations for `Z0``, ``Z1``, ``Z2``, and ``PI_HAT`` to take on biologically meaningful values (in the range [0,1]). min : :obj:`float` or :obj:`None` Sample pairs with a ``PI_HAT`` below this value will not be included in the output. Must be in :math:`[0,1]`. max : :obj:`float` or :obj:`None` Sample pairs with a ``PI_HAT`` above this value will not be included in the output. Must be in :math:`[0,1]`. Returns ------- :class:`.Table` """ require_col_key_str(dataset, 'identity_by_descent') if maf is not None: analyze('identity_by_descent/maf', maf, dataset._row_indices) dataset = dataset.select_rows(__maf = maf) else: dataset = dataset.select_rows() dataset = dataset.select_cols().select_globals().select_entries('GT') dataset = require_biallelic(dataset, 'ibd') return Table(MatrixToTableApply(dataset._mir, { 'name': 'IBD', 'mafFieldName': '__maf' if maf is not None else None, 'bounded': bounded, 'min': min, 'max': max, }))
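As an illustrative sketch (not part of the original docstring), the resulting pair table can be used to prune one member of each closely related pair; the 0.4 cutoff below is an arbitrary example value, not a recommendation:

>>> ibd = hl.identity_by_descent(dataset, min=0.4)
>>> related_samples = ibd.aggregate(hl.agg.collect_as_set(ibd.j))
>>> dataset_pruned = dataset.filter_cols(hl.literal(related_samples).contains(dataset.s), keep=False)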
[docs]@typecheck(call=expr_call, aaf_threshold=numeric, include_par=bool, female_threshold=numeric, male_threshold=numeric, aaf=nullable(str)) def impute_sex(call, aaf_threshold=0.0, include_par=False, female_threshold=0.2, male_threshold=0.8, aaf=None) -> Table: """Impute sex of samples by calculating inbreeding coefficient on the X chromosome. .. include:: ../_templates/req_tvariant.rst .. include:: ../_templates/req_biallelic.rst Examples -------- Remove samples where imputed sex does not equal reported sex: >>> imputed_sex = hl.impute_sex(dataset.GT) >>> dataset_result = dataset.filter_cols(imputed_sex[dataset.s].is_female != dataset.pheno.is_female, ... keep=False) Notes ----- We have used the same implementation as `PLINK v1.7 <http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#sexcheck>`__. Let `gr` be the reference genome of the type of the `locus` key (as given by :meth:`.TLocus.reference_genome`). 1. Filter the dataset to loci on the X contig defined by `gr`. 2. Calculate alternate allele frequency (AAF) for each row from the dataset. 3. Filter to variants with AAF above `aaf_threshold`. 4. Remove loci in the pseudoautosomal region, as defined by `gr`, unless `include_par` is ``True`` (it defaults to ``False``) 5. For each row and column with a non-missing genotype call, :math:`E`, the expected number of homozygotes (from population AAF), is computed as :math:`1.0 - (2.0*\mathrm{maf}*(1.0-\mathrm{maf}))`. 6. For each row and column with a non-missing genotype call, :math:`O`, the observed number of homozygotes, is computed interpreting ``0`` as heterozygote and ``1`` as homozygote. 7. For each row and column with a non-missing genotype call, :math:`N` is incremented by 1 8. For each column, :math:`E`, :math:`O`, and :math:`N` are combined across variants 9. For each column, :math:`F` is calculated by :math:`(O - E) / (N - E)` 10. A sex is assigned to each sample with the following criteria: - Female when ``F < 0.2`` - Male when ``F > 0.8`` Use `female_threshold` and `male_threshold` to change this behavior. **Annotations** The returned column-key indexed :class:`.Table` has the following fields in addition to the matrix table's column keys: - **is_female** (:py:data:`.tbool`) -- True if the imputed sex is female, false if male, missing if undetermined. - **f_stat** (:py:data:`.tfloat64`) -- Inbreeding coefficient. - **n_called** (:py:data:`.tint64`) -- Number of variants with a genotype call. - **expected_homs** (:py:data:`.tfloat64`) -- Expected number of homozygotes. - **observed_homs** (:py:data:`.tint64`) -- Observed number of homozygotes. Parameters ---------- call : :class:`.CallExpression` A genotype call for each row and column. The source dataset's row keys must be [[locus], alleles] with types :class:`.tlocus` and :class:`.ArrayStringExpression`. Moreover, the alleles array must have exactly two elements (i.e. the variant must be biallelic). aaf_threshold : :obj:`float` Minimum alternate allele frequency threshold. include_par : :obj:`bool` Include pseudoautosomal regions. female_threshold : :obj:`float` Samples are called females if F < female_threshold. male_threshold : :obj:`float` Samples are called males if F > male_threshold. aaf : :obj:`str` or :obj:`None` A field defining the alternate allele frequency for each row. If ``None``, AAF will be computed from `call`. Returns ------- :class:`.Table` Sex imputation statistics per sample. """
if aaf_threshold < 0.0 or aaf_threshold > 1.0: raise FatalError("Invalid argument for `aaf_threshold`. Must be in range [0, 1].") mt = call._indices.source mt, _ = mt._process_joins(call) mt = mt.annotate_entries(call=call) mt = require_biallelic(mt, 'impute_sex') if (aaf is None): mt = mt.annotate_rows(aaf=agg.call_stats(mt.call, mt.alleles).AF[1]) aaf = 'aaf' rg = mt.locus.dtype.reference_genome mt = hl.filter_intervals(mt, hl.map(lambda x_contig: hl.parse_locus_interval(x_contig, rg), rg.x_contigs), keep=True) if not include_par: interval_type = hl.tarray(hl.tinterval(hl.tlocus(rg))) mt = hl.filter_intervals(mt, hl.literal(rg.par, interval_type), keep=False) mt = mt.filter_rows((mt[aaf] > aaf_threshold) & (mt[aaf] < (1 - aaf_threshold))) mt = mt.annotate_cols(ib=agg.inbreeding(mt.call, mt[aaf])) kt = mt.select_cols( is_female=hl.cond(mt.ib.f_stat < female_threshold, True, hl.cond(mt.ib.f_stat > male_threshold, False, hl.null(tbool))), **mt.ib).cols() return kt
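A brief usage sketch (using the same toy ``dataset`` as the docstring example): the returned table is keyed by the column key, so it can be joined back onto the columns and summarized:

>>> imputed = hl.impute_sex(dataset.GT)
>>> dataset_sex = dataset.annotate_cols(sex_check=imputed[dataset.s])
>>> dataset_sex.aggregate_cols(hl.agg.counter(dataset_sex.sex_check.is_female))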
def _get_regression_row_fields(mt, pass_through, method) -> Dict[str, str]: row_fields = dict(zip(mt.row_key.keys(), mt.row_key.keys())) for f in pass_through: if isinstance(f, str): if f not in mt.row: raise ValueError(f"'{method}/pass_through': MatrixTable has no row field {repr(f)}") if f in row_fields: # allow silent pass through of key fields if f in mt.row_key: pass else: raise ValueError(f"'{method}/pass_through': found duplicated field {repr(f)}") row_fields[f] = mt[f] else: assert isinstance(f, Expression) if not f._ir.is_nested_field: raise ValueError(f"'{method}/pass_through': expect fields or nested fields, not complex expressions") if not f._indices == mt._row_indices: raise ExpressionException(f"'{method}/pass_through': require row-indexed fields, found indices {f._indices.axes}") name = f._ir.name if name in row_fields: # allow silent pass through of key fields if not (name in mt.row_key and f._ir == mt[name]._ir): raise ValueError(f"'{method}/pass_through': found duplicated field {repr(name)}") row_fields[name] = f for k in mt.row_key: del row_fields[k] return row_fields
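For reference, a hypothetical call exercising both `pass_through` forms this helper accepts (plain field names and row-indexed field expressions); it assumes the dataset carries ``rsid`` and ``qual`` row fields:

>>> ht = hl.linear_regression_rows(y=dataset.pheno.height,
...                                x=dataset.GT.n_alt_alleles(),
...                                covariates=[1],
...                                pass_through=['rsid', dataset.qual])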
[docs]@typecheck(y=oneof(expr_float64, sequenceof(expr_float64), sequenceof(sequenceof(expr_float64))), x=expr_float64, covariates=sequenceof(expr_float64), block_size=int, pass_through=sequenceof(oneof(str, Expression))) def linear_regression_rows(y, x, covariates, block_size=16, pass_through=()) -> hail.Table: r"""For each row, test an input variable for association with response variables using linear regression. Examples -------- >>> result_ht = hl.linear_regression_rows( ... y=dataset.pheno.height, ... x=dataset.GT.n_alt_alleles(), ... covariates=[1, dataset.pheno.age, dataset.pheno.is_female]) Warning ------- As in the example, the intercept covariate ``1`` must be included **explicitly** if desired. Warning ------- If `y` is a single value or a list, :func:`.linear_regression_rows` considers the same set of columns (i.e., samples, points) for every response variable and row, namely those columns for which **all** response variables and covariates are defined. If `y` is a list of lists, then each inner list is treated as an independent group, subsetting columns for missingness separately. Notes ----- With the default root and `y` a single expression, the following row-indexed fields are added. - **<row key fields>** (Any) -- Row key fields. - **<pass_through fields>** (Any) -- Row fields in `pass_through`. - **n** (:py:data:`.tint32`) -- Number of columns used. - **sum_x** (:py:data:`.tfloat64`) -- Sum of input values `x`. - **y_transpose_x** (:py:data:`.tfloat64`) -- Dot product of response vector `y` with the input vector `x`. - **beta** (:py:data:`.tfloat64`) -- Fit effect coefficient of `x`, :math:`\hat\beta_1` below. - **standard_error** (:py:data:`.tfloat64`) -- Estimated standard error, :math:`\widehat{\mathrm{se}}_1`. - **t_stat** (:py:data:`.tfloat64`) -- :math:`t`-statistic, equal to :math:`\hat\beta_1 / \widehat{\mathrm{se}}_1`. - **p_value** (:py:data:`.tfloat64`) -- :math:`p`-value. If `y` is a list of expressions, then the last five fields instead have type :py:data:`.tarray` of :py:data:`.tfloat64`, with corresponding indexing of the list and each array. If `y` is a list of lists of expressions, then `n` and `sum_x` are of type ``array<float64>``, and the last five fields are of type ``array<array<float64>>``. Index into these arrays with ``a[index_in_outer_list, index_in_inner_list]``. For example, if ``y=[[a], [b, c]]`` then the p-value for ``b`` is ``p_value[1][0]``. In the statistical genetics example above, the input variable `x` encodes genotype as the number of alternate alleles (0, 1, or 2). For each variant (row), genotype is tested for association with height controlling for age and sex, by fitting the linear regression model: .. math:: \mathrm{height} = \beta_0 + \beta_1 \, \mathrm{genotype} + \beta_2 \, \mathrm{age} + \beta_3 \, \mathrm{is\_female} + \varepsilon, \quad \varepsilon \sim \mathrm{N}(0, \sigma^2) Boolean covariates like :math:`\mathrm{is\_female}` are encoded as 1 for ``True`` and 0 for ``False``. The null model sets :math:`\beta_1 = 0`. The standard least-squares linear regression model is derived in Section 3.2 of `The Elements of Statistical Learning, 2nd Edition <http://statweb.stanford.edu/~tibs/ElemStatLearn/printings/ESLII_print10.pdf>`__. See equation 3.12 for the t-statistic which follows the t-distribution with :math:`n - k - 1` degrees of freedom, under the null hypothesis of no effect, with :math:`n` samples and :math:`k` covariates in addition to ``x``. 
Note ---- Use the `pass_through` parameter to include additional row fields from matrix table underlying ``x``. For example, to include an "rsid" field, set ``pass_through=['rsid']`` or ``pass_through=[mt.rsid]``. Parameters ---------- y : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more column-indexed response expressions. x : :class:`.Float64Expression` Entry-indexed expression for input variable. covariates : :obj:`list` of :class:`.Float64Expression` List of column-indexed covariate expressions. block_size : :obj:`int` Number of row regressions to perform simultaneously per core. Larger blocks require more memory but may improve performance. pass_through : :obj:`list` of :obj:`str` or :class:`.Expression` Additional row fields to include in the resulting table. Returns ------- :class:`.Table` """ mt = matrix_table_source('linear_regression_rows/x', x) check_entry_indexed('linear_regression_rows/x', x) y_is_list = isinstance(y, list) if y_is_list and len(y) == 0: raise ValueError(f"'linear_regression_rows': found no values for 'y'") is_chained = y_is_list and isinstance(y[0], list) if is_chained and any(len(l) == 0 for l in y): raise ValueError(f"'linear_regression_rows': found empty inner list for 'y'") y = wrap_to_list(y) for e in (itertools.chain.from_iterable(y) if is_chained else y): analyze('linear_regression_rows/y', e, mt._col_indices) for e in covariates: analyze('linear_regression_rows/covariates', e, mt._col_indices) _warn_if_no_intercept('linear_regression_rows', covariates) x_field_name = Env.get_uid() if is_chained: y_field_names = [[f'__y_{i}_{j}' for j in range(len(y[i]))] for i in range(len(y))] y_dict = dict(zip(itertools.chain.from_iterable(y_field_names), itertools.chain.from_iterable(y))) func = 'LinearRegressionRowsChained' else: y_field_names = list(f'__y_{i}' for i in range(len(y))) y_dict = dict(zip(y_field_names, y)) func = 'LinearRegressionRowsSingle' cov_field_names = list(f'__cov{i}' for i in range(len(covariates))) row_fields = _get_regression_row_fields(mt, pass_through, 'linear_regression_rows') # FIXME: selecting an existing entry field should be emitted as a SelectFields mt = mt._select_all(col_exprs=dict(**y_dict, **dict(zip(cov_field_names, covariates))), row_exprs=row_fields, col_key=[], entry_exprs={x_field_name: x}) config = { 'name': func, 'yFields': y_field_names, 'xField': x_field_name, 'covFields': cov_field_names, 'rowBlockSize': block_size, 'passThrough': [x for x in row_fields if x not in mt.row_key] } ht_result = Table(MatrixToTableApply(mt._mir, config)) if not y_is_list: fields = ['y_transpose_x', 'beta', 'standard_error', 't_stat', 'p_value'] ht_result = ht_result.annotate(**{f: ht_result[f][0] for f in fields}) return ht_result.persist()
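To make the list-of-lists behavior described above concrete, here is a small sketch; the grouping of phenotypes is arbitrary and only meant to illustrate the nested indexing of the result arrays:

>>> grouped_ht = hl.linear_regression_rows(
...     y=[[dataset.pheno.height], [dataset.pheno.height, dataset.pheno.age]],
...     x=dataset.GT.n_alt_alleles(),
...     covariates=[1, dataset.pheno.is_female])
>>> grouped_ht = grouped_ht.annotate(age_p_value=grouped_ht.p_value[1][1])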
[docs]@typecheck(test=enumeration('wald', 'lrt', 'score', 'firth'), y=oneof(expr_float64, sequenceof(expr_float64), sequenceof(sequenceof(expr_float64))), x=expr_float64, covariates=sequenceof(expr_float64), pass_through=sequenceof(oneof(str, Expression))) def logistic_regression_rows(test, y, x, covariates, pass_through=()) -> hail.Table: r"""For each row, test an input variable for association with a binary response variable using logistic regression. Examples -------- Run the logistic regression Wald test per variant using a Boolean phenotype, intercept and two covariates stored in column-indexed fields: >>> result_ht = hl.logistic_regression_rows( ... test='wald', ... y=dataset.pheno.is_case, ... x=dataset.GT.n_alt_alleles(), ... covariates=[1, dataset.pheno.age, dataset.pheno.is_female]) Run the logistic regression Wald test per variant using a list of binary (0/1) phenotypes, intercept and two covariates stored in column-indexed fields: >>> result_ht = hl.logistic_regression_rows( ... test='wald', ... y=[dataset.pheno.is_case, dataset.pheno.is_case], # where pheno values are 0, 1, or missing ... x=dataset.GT.n_alt_alleles(), ... covariates=[1, dataset.pheno.age, dataset.pheno.is_female]) Warning ------- :func:`.logistic_regression_rows` considers the same set of columns (i.e., samples, points) for every row, namely those columns for which **all** response variables and covariates are defined. For each row, missing values of `x` are mean-imputed over these columns. As in the example, the intercept covariate ``1`` must be included **explicitly** if desired. Notes ----- This method performs, for each row, a significance test of the input variable in predicting a binary (case-control) response variable based on the logistic regression model. The response variable type must either be numeric (with all present values 0 or 1) or Boolean, in which case true and false are coded as 1 and 0, respectively. Hail supports the Wald test ('wald'), likelihood ratio test ('lrt'), Rao score test ('score'), and Firth test ('firth'). Hail only includes columns for which the response variable and all covariates are defined. For each row, Hail imputes missing input values as the mean of the non-missing values. The example above considers a model of the form .. math:: \mathrm{Prob}(\mathrm{is\_case}) = \mathrm{sigmoid}(\beta_0 + \beta_1 \, \mathrm{gt} + \beta_2 \, \mathrm{age} + \beta_3 \, \mathrm{is\_female} + \varepsilon), \quad \varepsilon \sim \mathrm{N}(0, \sigma^2) where :math:`\mathrm{sigmoid}` is the `sigmoid function`_, the genotype :math:`\mathrm{gt}` is coded as 0 for HomRef, 1 for Het, and 2 for HomVar, and the Boolean covariate :math:`\mathrm{is\_female}` is coded as for ``True`` (female) and 0 for ``False`` (male). The null model sets :math:`\beta_1 = 0`. .. _sigmoid function: https://en.wikipedia.org/wiki/Sigmoid_function The structure of the emitted row field depends on the test statistic as shown in the tables below. 
========== ================== ======= ============================================ Test Field Type Value ========== ================== ======= ============================================ Wald `beta` float64 fit effect coefficient, :math:`\hat\beta_1` Wald `standard_error` float64 estimated standard error, :math:`\widehat{\mathrm{se}}` Wald `z_stat` float64 Wald :math:`z`-statistic, equal to :math:`\hat\beta_1 / \widehat{\mathrm{se}}` Wald `p_value` float64 Wald p-value testing :math:`\beta_1 = 0` LRT, Firth `beta` float64 fit effect coefficient, :math:`\hat\beta_1` LRT, Firth `chi_sq_stat` float64 deviance statistic LRT, Firth `p_value` float64 LRT / Firth p-value testing :math:`\beta_1 = 0` Score `chi_sq_stat` float64 score statistic Score `p_value` float64 score p-value testing :math:`\beta_1 = 0` ========== ================== ======= ============================================ For the Wald and likelihood ratio tests, Hail fits the logistic model for each row using Newton iteration and only emits the above fields when the maximum likelihood estimate of the coefficients converges. The Firth test uses a modified form of Newton iteration. To help diagnose convergence issues, Hail also emits three fields which summarize the iterative fitting process: ================ =================== ======= =============================== Test Field Type Value ================ =================== ======= =============================== Wald, LRT, Firth `fit.n_iterations` int32 number of iterations until convergence, explosion, or reaching the max (25 for Wald, LRT; 100 for Firth) Wald, LRT, Firth `fit.converged` bool ``True`` if iteration converged Wald, LRT, Firth `fit.exploded` bool ``True`` if iteration exploded ================ =================== ======= =============================== We consider iteration to have converged when every coordinate of :math:`\beta` changes by less than :math:`10^{-6}`. For Wald and LRT, up to 25 iterations are attempted; in testing we find 4 or 5 iterations nearly always suffice. Convergence may also fail due to explosion, which refers to low-level numerical linear algebra exceptions caused by manipulating ill-conditioned matrices. Explosion may result from (nearly) linearly dependent covariates or complete separation_. .. _separation: https://en.wikipedia.org/wiki/Separation_(statistics) A more common situation in genetics is quasi-complete separation, e.g. variants that are observed only in cases (or controls). Such variants inevitably arise when testing millions of variants with very low minor allele count. The maximum likelihood estimate of :math:`\beta` under logistic regression is then undefined but convergence may still occur after a large number of iterations due to a very flat likelihood surface. In testing, we find that such variants produce a secondary bump from 10 to 15 iterations in the histogram of number of iterations per variant. We also find that this faux convergence produces large standard errors and large (insignificant) p-values. To not miss such variants, consider using Firth logistic regression, linear regression, or group-based tests. Here's a concrete illustration of quasi-complete separation in R.
Suppose we have 2010 samples distributed as follows for a particular variant: ======= ====== === ====== Status HomRef Het HomVar ======= ====== === ====== Case 1000 10 0 Control 1000 0 0 ======= ====== === ====== The following R code fits the (standard) logistic, Firth logistic, and linear regression models to this data, where ``x`` is genotype, ``y`` is phenotype, and ``logistf`` is from the logistf package: .. code-block:: R x <- c(rep(0,1000), rep(0,1000), rep(1,10)) y <- c(rep(0,1000), rep(1,1000), rep(1,10)) logfit <- glm(y ~ x, family=binomial()) firthfit <- logistf(y ~ x) linfit <- lm(y ~ x) The resulting p-values for the genotype coefficient are 0.991, 0.00085, and 0.0016, respectively. The erroneous value 0.991 is due to quasi-complete separation. Moving one of the 10 hets from case to control eliminates this quasi-complete separation; the p-values from R are then 0.0373, 0.0111, and 0.0116, respectively, as expected for a less significant association. The Firth test reduces bias from small counts and resolves the issue of separation by penalizing maximum likelihood estimation by the `Jeffreys invariant prior <https://en.wikipedia.org/wiki/Jeffreys_prior>`__. This test is slower, as both the null and full model must be fit per variant, and convergence of the modified Newton method is linear rather than quadratic. For Firth, 100 iterations are attempted for the null model and, if that is successful, for the full model as well. In testing we find 20 iterations nearly always suffices. If the null model fails to converge, then the `logreg.fit` fields reflect the null model; otherwise, they reflect the full model. See `Recommended joint and meta-analysis strategies for case-control association testing of single low-count variants <http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4049324/>`__ for an empirical comparison of the logistic Wald, LRT, score, and Firth tests. The theoretical foundations of the Wald, likelihood ratio, and score tests may be found in Chapter 3 of Gesine Reinert's notes `Statistical Theory <http://www.stats.ox.ac.uk/~reinert/stattheory/theoryshort09.pdf>`__. Firth introduced his approach in `Bias reduction of maximum likelihood estimates, 1993 <http://www2.stat.duke.edu/~scs/Courses/Stat376/Papers/GibbsFieldEst/BiasReductionMLE.pdf>`__. Heinze and Schemper further analyze Firth's approach in `A solution to the problem of separation in logistic regression, 2002 <https://cemsiis.meduniwien.ac.at/fileadmin/msi_akim/CeMSIIS/KB/volltexte/Heinze_Schemper_2002_Statistics_in_Medicine.pdf>`__. Hail's logistic regression tests correspond to the ``b.wald``, ``b.lrt``, and ``b.score`` tests in `EPACTS`_. For each variant, Hail imputes missing input values as the mean of non-missing input values, whereas EPACTS subsets to those samples with called genotypes. Hence, Hail and EPACTS results will currently only agree for variants with no missing genotypes. .. _EPACTS: http://genome.sph.umich.edu/wiki/EPACTS#Single_Variant_Tests Note ---- Use the `pass_through` parameter to include additional row fields from matrix table underlying ``x``. For example, to include an "rsid" field, set ``pass_through=['rsid']`` or ``pass_through=[mt.rsid]``. Parameters ---------- test : {'wald', 'lrt', 'score', 'firth'} Statistical test. y : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more column-indexed response expressions. All non-missing values must evaluate to 0 or 1.
Note that a :class:`.BooleanExpression` will be implicitly converted to a :class:`.Float64Expression` with this property. x : :class:`.Float64Expression` Entry-indexed expression for input variable. covariates : :obj:`list` of :class:`.Float64Expression` Non-empty list of column-indexed covariate expressions. pass_through : :obj:`list` of :obj:`str` or :class:`.Expression` Additional row fields to include in the resulting table. Returns ------- :class:`.Table` """ if len(covariates) == 0: raise ValueError('logistic regression requires at least one covariate expression') mt = matrix_table_source('logistic_regresion_rows/x', x) check_entry_indexed('logistic_regresion_rows/x', x) y_is_list = isinstance(y, list) if y_is_list and len(y) == 0: raise ValueError(f"'logistic_regression_rows': found no values for 'y'") y = wrap_to_list(y) for e in covariates: analyze('logistic_regression_rows/covariates', e, mt._col_indices) _warn_if_no_intercept('logistic_regression_rows', covariates) x_field_name = Env.get_uid() y_field = [f'__y_{i}' for i in range(len(y))] y_dict = dict(zip(y_field, y)) cov_field_names = [f'__cov{i}' for i in range(len(covariates))] row_fields = _get_regression_row_fields(mt, pass_through, 'logistic_regression_rows') # FIXME: selecting an existing entry field should be emitted as a SelectFields mt = mt._select_all(col_exprs=dict(**y_dict, **dict(zip(cov_field_names, covariates))), row_exprs=row_fields, col_key=[], entry_exprs={x_field_name: x}) config = { 'name': 'LogisticRegression', 'test': test, 'yFields': y_field, 'xField': x_field_name, 'covFields': cov_field_names, 'passThrough': [x for x in row_fields if x not in mt.row_key] } result = Table(MatrixToTableApply(mt._mir, config)) if not y_is_list: result = result.transmute(**result.logistic_regression[0]) return result.persist()
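As a usage sketch following the convergence notes above, one might keep only rows whose fit converged before thresholding on p-values; the 5e-8 cutoff is just the conventional genome-wide line, not part of this API:

>>> wald_ht = hl.logistic_regression_rows(test='wald',
...                                       y=dataset.pheno.is_case,
...                                       x=dataset.GT.n_alt_alleles(),
...                                       covariates=[1, dataset.pheno.age, dataset.pheno.is_female])
>>> converged_ht = wald_ht.filter(wald_ht.fit.converged & ~wald_ht.fit.exploded)
>>> hits = converged_ht.filter(converged_ht.p_value < 5e-8)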
[docs]@typecheck(test=enumeration('wald', 'lrt', 'score'), y=expr_float64, x=expr_float64, covariates=sequenceof(expr_float64), pass_through=sequenceof(oneof(str, Expression))) def poisson_regression_rows(test, y, x, covariates, pass_through=()) -> Table: r"""For each row, test an input variable for association with a count response variable using `Poisson regression <https://en.wikipedia.org/wiki/Poisson_regression>`__. Notes ----- See :func:`.logistic_regression_rows` for more info on statistical tests of general linear models. Note ---- Use the `pass_through` parameter to include additional row fields from matrix table underlying ``x``. For example, to include an "rsid" field, set ``pass_through=['rsid']`` or ``pass_through=[mt.rsid]``. Parameters ---------- y : :class:`.Float64Expression` Column-indexed response expression. All non-missing values must evaluate to a non-negative integer. x : :class:`.Float64Expression` Entry-indexed expression for input variable. covariates : :obj:`list` of :class:`.Float64Expression` Non-empty list of column-indexed covariate expressions. pass_through : :obj:`list` of :obj:`str` or :class:`.Expression` Additional row fields to include in the resulting table. Returns ------- :class:`.Table` """ if len(covariates) == 0: raise ValueError('Poisson regression requires at least one covariate expression') mt = matrix_table_source('poisson_regression_rows/x', x) check_entry_indexed('poisson_regression_rows/x', x) analyze('poisson_regression_rows/y', y, mt._col_indices) all_exprs = [y] for e in covariates: all_exprs.append(e) analyze('poisson_regression_rows/covariates', e, mt._col_indices) _warn_if_no_intercept('poisson_regression_rows', covariates) x_field_name = Env.get_uid() y_field_name = '__y' cov_field_names = list(f'__cov{i}' for i in range(len(covariates))) row_fields = _get_regression_row_fields(mt, pass_through, 'poisson_regression_rows') # FIXME: selecting an existing entry field should be emitted as a SelectFields mt = mt._select_all(col_exprs=dict(**{y_field_name: y}, **dict(zip(cov_field_names, covariates))), row_exprs=row_fields, col_key=[], entry_exprs={x_field_name: x}) config = { 'name': 'PoissonRegression', 'test': test, 'yField': y_field_name, 'xField': x_field_name, 'covFields': cov_field_names, 'passThrough': [x for x in row_fields if x not in mt.row_key] } return Table(MatrixToTableApply(mt._mir, config)).persist()
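A hypothetical sketch of a Poisson regression call; ``n_episodes`` stands in for any non-negative integer-valued column field and is not a field of the example dataset:

>>> pois_ht = hl.poisson_regression_rows(test='wald',
...                                      y=dataset.pheno.n_episodes,   # hypothetical count phenotype
...                                      x=dataset.GT.n_alt_alleles(),
...                                      covariates=[1, dataset.pheno.age, dataset.pheno.is_female])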
[docs]@typecheck(y=expr_float64, x=sequenceof(expr_float64), z_t=nullable(expr_float64), k=nullable(np.ndarray), p_path=nullable(str), overwrite=bool, standardize=bool, mean_impute=bool) def linear_mixed_model(y, x, z_t=None, k=None, p_path=None, overwrite=False, standardize=True, mean_impute=True): r"""Initialize a linear mixed model from a matrix table. Examples -------- Initialize a model using three fixed effects (including intercept) and genetic marker random effects: >>> marker_ds = dataset.filter_rows(dataset.use_as_marker) # doctest: +SKIP >>> model, _ = hl.linear_mixed_model( # doctest: +SKIP ... y=marker_ds.pheno.height, ... x=[1, marker_ds.pheno.age, marker_ds.pheno.is_female], ... z_t=marker_ds.GT.n_alt_alleles(), ... p_path='output/p.bm') Fit the model and examine :math:`h^2`: >>> model.fit() # doctest: +SKIP >>> model.h_sq # doctest: +SKIP Sanity-check the normalized likelihood of :math:`h^2` over the percentile grid: >>> import matplotlib.pyplot as plt # doctest: +SKIP >>> plt.plot(range(101), model.h_sq_normalized_lkhd()) # doctest: +SKIP For this value of :math:`h^2`, test each variant for association: >>> result_table = hl.linear_mixed_regression_rows(dataset.GT.n_alt_alleles(), model) # doctest: +SKIP Alternatively, one can define a full-rank model using a pre-computed kinship matrix :math:`K` in ndarray form. When :math:`K` is the realized relationship matrix defined by the genetic markers, we obtain the same model as above with :math:`P` written as a block matrix but returned as an ndarray: >>> rrm = hl.realized_relationship_matrix(marker_ds.GT).to_numpy() # doctest: +SKIP >>> model, p = hl.linear_mixed_model( # doctest: +SKIP ... y=dataset.pheno.height, ... x=[1, dataset.pheno.age, dataset.pheno.is_female], ... k=rrm, ... p_path='output/p.bm', ... overwrite=True) Notes ----- See :class:`.LinearMixedModel` for details on the model and notation. Exactly one of `z_t` and `k` must be set. If `z_t` is set, the model is low-rank if the number of samples :math:`n` exceeds the number of random effects :math:`m`. At least one dimension must be less than or equal to 46300. If `standardize` is true, each random effect is first standardized to have mean 0 and variance :math:`\frac{1}{m}`, so that the diagonal values of the kinship matrix :math:`K = ZZ^T` are 1.0 in expectation. This kinship matrix corresponds to the :meth:`realized_relationship_matrix` in genetics. See :meth:`.LinearMixedModel.from_random_effects` and :meth:`.BlockMatrix.svd` for more details. If `k` is set, the model is full-rank. For correct results, the indices of `k` **must be aligned** with columns of the source of `y`. Set `p_path` if you plan to use the model in :meth:`.linear_mixed_regression_rows`. `k` must be positive semi-definite; symmetry is not checked as only the lower triangle is used. See :meth:`.LinearMixedModel.from_kinship` for more details. Missing, nan, or infinite values in `y` or `x` will raise an error. If set, `z_t` may only have missing values if `mean_impute` is true, in which case missing values of are set to the row mean. We recommend setting `mean_impute` to false if you expect no missing values, both for performance and as a sanity check. Warning ------- If the rows of the matrix table have been filtered to a small fraction, then :meth:`.MatrixTable.repartition` before this method to improve performance. Parameters ---------- y: :class:`.Float64Expression` Column-indexed expression for the observations (rows of :math:`y`). Must have no missing values. 
x: :obj:`list` of :class:`.Float64Expression` Non-empty list of column-indexed expressions for the fixed effects (rows of :math:`X`). Each expression must have the same source as `y` or no source (e.g., the intercept ``1.0``). Must have no missing values. z_t: :class:`.Float64Expression`, optional Entry-indexed expression for each mixed effect. These values are row-standardized to variance :math:`1 / m` to form the entries of :math:`Z^T`. If `mean_impute` is false, must have no missing values. Exactly one of `z_t` and `k` must be set. k: :class:`ndarray`, optional Kinship matrix :math:`K`. Exactly one of `z_t` and `k` must be set. p_path: :obj:`str`, optional Path at which to write the projection :math:`P` as a block matrix. Required if `z_t` is set. overwrite: :obj:`bool` If ``True``, overwrite an existing file at `p_path`. standardize: :obj:`bool` If ``True``, standardize `z_t` by row to mean 0 and variance :math:`\frac{1}{m}`. mean_impute: :obj:`bool` If ``True``, mean-impute missing values of `z_t` by row. Returns ------- model: :class:`.LinearMixedModel` Linear mixed model ready to be fit. p: :class:`ndarray` or :class:`.BlockMatrix` Matrix :math:`P` whose rows are the eigenvectors of :math:`K`. The type is block matrix if the model is low rank (i.e., if `z_t` is set and :math:`n > m`). """ source = matrix_table_source('linear_mixed_model/y', y) if ((z_t is None and k is None) or (z_t is not None and k is not None)): raise ValueError("linear_mixed_model: set exactly one of 'z_t' and 'k'") if len(x) == 0: raise ValueError("linear_mixed_model: 'x' must include at least one fixed effect") _warn_if_no_intercept('linear_mixed_model', x) # collect x and y in one pass mt = source.select_cols(xy=hl.array(x + [y])).key_cols_by() xy = np.array(mt.xy.collect(), dtype=np.float64) xy = xy.reshape(xy.size // (len(x) + 1), len(x) + 1) x_nd = np.copy(xy[:, :-1]) y_nd = np.copy(xy[:, -1]) n = y_nd.size del xy if not np.all(np.isfinite(y_nd)): raise ValueError("linear_mixed_model: 'y' has missing, nan, or infinite values") if not np.all(np.isfinite(x_nd)): raise ValueError("linear_mixed_model: 'x' has missing, nan, or infinite values") if z_t is None: model, p = LinearMixedModel.from_kinship(y_nd, x_nd, k, p_path, overwrite) else: check_entry_indexed('from_matrix_table: z_t', z_t) if matrix_table_source('linear_mixed_model/z_t', z_t) != source: raise ValueError("linear_mixed_model: 'y' and 'z_t' must " "have the same source") z_bm = BlockMatrix.from_entry_expr(z_t, mean_impute=mean_impute, center=standardize, normalize=standardize).T # variance is 1 / n m = z_bm.shape[1] model, p = LinearMixedModel.from_random_effects(y_nd, x_nd, z_bm, p_path, overwrite) if standardize: model.s = model.s * (n / m) # now variance is 1 / m if model.low_rank and isinstance(p, np.ndarray): assert n > m p = BlockMatrix.read(p_path) return model, p
[docs]@typecheck(entry_expr=expr_float64, model=LinearMixedModel, pa_t_path=nullable(str), a_t_path=nullable(str), mean_impute=bool, partition_size=nullable(int), pass_through=sequenceof(oneof(str, Expression))) def linear_mixed_regression_rows(entry_expr, model, pa_t_path=None, a_t_path=None, mean_impute=True, partition_size=None, pass_through=()): """For each row, test an input variable for association using a linear mixed model. Examples -------- See the example in :meth:`linear_mixed_model` and the section below on efficiently testing multiple responses or sets of fixed effects. Notes ----- See :class:`.LinearMixedModel` for details on the model and notation. This method packages up several steps for convenience: 1. Read the transformation :math:`P` from ``model.p_path``. 2. Write `entry_expr` at `a_t_path` as the block matrix :math:`A^T` with block size that of :math:`P`. The parallelism is ``n_rows / block_size``. 3. Multiply and write :math:`A^T P^T` at `pa_t_path`. The parallelism is the number of blocks in :math:`(PA)^T`, which equals ``(n_rows / block_size) * (model.r / block_size)``. 4. Compute regression results per row with :meth:`.LinearMixedModel.fit_alternatives`. The parallelism is ``n_rows / partition_size``. If `pa_t_path` and `a_t_path` are not set, temporary files are used. `entry_expr` may only have missing values if `mean_impute` is true, in which case missing values are set to the row mean. We recommend setting `mean_impute` to false if you expect no missing values, both for performance and as a sanity check. **Efficiently varying the response or set of fixed effects** Computing :math:`K`, :math:`P`, :math:`S`, :math:`A^T`, and especially the product :math:`(PA)^T` may require significant compute when :math:`n` and/or :math:`m` is large. However these quantities are all independent of the response :math:`y` or fixed effects :math:`X`! And with the model diagonalized, Step 4 above is fast and scalable. So having run linear mixed regression once, we can compute :math:`h^2` and regression statistics for another response or set of fixed effects on the **same samples** at roughly the speed of :func:`.linear_regression_rows`. For example, having collected another `y` and `x` as ndarrays, one can construct a new linear mixed model directly. Supposing the model is full-rank and `p` is an ndarray: >>> model = hl.stats.LinearMixedModel(p @ y, p @ x, s) # doctest: +SKIP >>> model.fit() # doctest: +SKIP >>> result_ht = model.fit_alternatives(pa_t_path) # doctest: +SKIP Supposing the model is low-rank and `p` is a block matrix: >>> p = BlockMatrix.read(p_path) # doctest: +SKIP >>> py, px = (p @ y).to_numpy(), (p @ x).to_numpy() # doctest: +SKIP >>> model = LinearMixedModel(py, px, s, y, x) # doctest: +SKIP >>> model.fit() # doctest: +SKIP >>> result_ht = model.fit_alternatives(pa_t_path, a_t_path) # doctest: +SKIP In either case, one can easily loop through many responses or conditional analyses. To join results back to the matrix table: >>> dataset = dataset.add_row_index() # doctest: +SKIP >>> dataset = dataset.annotate_rows(lmmreg=result_ht[dataset.row_idx]) # doctest: +SKIP Warning ------- For correct results, the column-index of `entry_expr` must correspond to the sample index of the model. This will be true, for example, if `model` was created with :func:`.linear_mixed_model` using (a possibly row-filtered version of) the source of `entry_expr`, or if `y` and `x` were collected to arrays from this source.
Hail will raise an error if the number of columns does not match ``model.n``, but will not detect, for example, permuted samples. The warning on :meth:`.BlockMatrix.write_from_entry_expr` applies to this method when the number of samples is large. Note ---- Use the `pass_through` parameter to include additional row fields from matrix table underlying ``entry_expr``. For example, to include an "rsid" field, set ``pass_through=['rsid']`` or ``pass_through=[mt.rsid]``. Parameters ---------- entry_expr: :class:`.Float64Expression` Entry-indexed expression for input variable. If mean_impute is false, must have no missing values. model: :class:`.LinearMixedModel` Fit linear mixed model with ``p_path`` set. pa_t_path: :obj:`str`, optional Path at which to store the transpose of :math:`PA`. If not set, a temporary file is used. a_t_path: :obj:`str`, optional Path at which to store the transpose of :math:`A`. If not set, a temporary file is used. mean_impute: :obj:`bool` Mean-impute missing values of `entry_expr` by row. partition_size: :obj:`int` Number of rows to process per partition. Default given by block size of :math:`P`. pass_through : :obj:`list` of :obj:`str` or :class:`.Expression` Additional row fields to include in the resulting table. Returns ------- :class:`.Table` """ mt = matrix_table_source('linear_mixed_regression_rows', entry_expr) n = mt.count_cols() check_entry_indexed('linear_mixed_regression_rows', entry_expr) if not model._fitted: raise ValueError("linear_mixed_regression_rows: 'model' has not been fit " "using 'fit()'") if model.p_path is None: raise ValueError("linear_mixed_regression_rows: 'model' property 'p_path' " "was not set at initialization") if model.n != n: raise ValueError(f"linear_mixed_regression_rows: linear mixed model expects {model.n} samples, " f"\n but 'entry_expr' source has {n} columns.") pa_t_path = new_temp_file() if pa_t_path is None else pa_t_path a_t_path = new_temp_file() if a_t_path is None else a_t_path p = BlockMatrix.read(model.p_path) BlockMatrix.write_from_entry_expr(entry_expr, a_t_path, mean_impute=mean_impute, block_size=p.block_size) a_t = BlockMatrix.read(a_t_path) (a_t @ p.T).write(pa_t_path, force_row_major=True) ht = model.fit_alternatives(pa_t_path, a_t_path if model.low_rank else None, partition_size) row_fields = _get_regression_row_fields(mt, pass_through, 'linear_mixed_regression_rows') mt_keys = mt.select_rows(**row_fields).add_row_index('__row_idx').rows().add_index('__row_idx').key_by('__row_idx') return mt_keys.annotate(**ht[mt_keys['__row_idx']]).key_by(*mt.row_key).drop('__row_idx')
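Putting the pieces above together, a sketch of the end-to-end flow; the paths are illustrative, ``rsid`` is assumed to be a row field, and the final sort assumes the result table carries the `p_value` field produced by :meth:`.LinearMixedModel.fit_alternatives`:

>>> model, _ = hl.linear_mixed_model(                     # doctest: +SKIP
...     y=dataset.pheno.height,
...     x=[1, dataset.pheno.age, dataset.pheno.is_female],
...     z_t=dataset.GT.n_alt_alleles(),
...     p_path='output/p.bm')
>>> model.fit()                                           # doctest: +SKIP
>>> lmm_ht = hl.linear_mixed_regression_rows(             # doctest: +SKIP
...     dataset.GT.n_alt_alleles(), model, pass_through=['rsid'])
>>> top_hits = lmm_ht.order_by(lmm_ht.p_value).head(20)   # doctest: +SKIP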
[docs]@typecheck(key_expr=expr_any, weight_expr=expr_float64, y=expr_float64, x=expr_float64, covariates=sequenceof(expr_float64), logistic=bool, max_size=int, accuracy=numeric, iterations=int) def skat(key_expr, weight_expr, y, x, covariates, logistic=False, max_size=46340, accuracy=1e-6, iterations=10000) -> Table: r"""Test each keyed group of rows for association by linear or logistic SKAT test. Examples -------- Test each gene for association using the linear sequence kernel association test: >>> skat_table = hl.skat(key_expr=burden_ds.gene, ... weight_expr=burden_ds.weight, ... y=burden_ds.burden.pheno, ... x=burden_ds.GT.n_alt_alleles(), ... covariates=[1, burden_ds.burden.cov1, burden_ds.burden.cov2]) .. caution:: By default, the Davies algorithm iterates up to 10k times until an accuracy of 1e-6 is achieved. Hence a reported p-value of zero with no issues may truly be as large as 1e-6. The accuracy and maximum number of iterations may be controlled by the corresponding function parameters. In general, higher accuracy requires more iterations. .. caution:: To process a group with :math:`m` rows, several copies of an :math:`m \times m` matrix of doubles must fit in worker memory. Groups with tens of thousands of rows may exhaust worker memory causing the entire job to fail. In this case, use the `max_size` parameter to skip groups larger than `max_size`. Warning ------- :func:`.skat` considers the same set of columns (i.e., samples, points) for every group, namely those columns for which **all** covariates are defined. For each row, missing values of `x` are mean-imputed over these columns. As in the example, the intercept covariate ``1`` must be included **explicitly** if desired. Notes ----- This method provides a scalable implementation of the score-based variance-component test originally described in `Rare-Variant Association Testing for Sequencing Data with the Sequence Kernel Association Test <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3135811/>`__. Row weights must be non-negative. Rows with missing weights are ignored. In the R package ``skat``---which assumes rows are variants---default weights are given by evaluating the Beta(1, 25) density at the minor allele frequency. To replicate these weights in Hail using alternate allele frequencies stored in a row-indexed field `AF`, one can use the expression: >>> hl.dbeta(hl.min(ds2.AF), 1.0, 25.0) ** 2 In the logistic case, the response `y` must either be numeric (with all present values 0 or 1) or Boolean, in which case true and false are coded as 1 and 0, respectively. The resulting :class:`.Table` provides the group's key (`id`), the number of rows in the group (`size`), the variance component score `q_stat`, the SKAT `p_value`, and a `fault` flag. For the toy example above, the table has the form: +-------+------+--------+---------+-------+ | id | size | q_stat | p_value | fault | +=======+======+========+=========+=======+ | geneA | 2 | 4.136 | 0.205 | 0 | +-------+------+--------+---------+-------+ | geneB | 1 | 5.659 | 0.195 | 0 | +-------+------+--------+---------+-------+ | geneC | 3 | 4.122 | 0.192 | 0 | +-------+------+--------+---------+-------+ Groups larger than `max_size` appear with missing `q_stat`, `p_value`, and `fault`. The hard limit on the number of rows in a group is 46340.
Note that the variance component score `q_stat` agrees with ``Q`` in the R package ``skat``, but both differ from :math:`Q` in the paper by the factor :math:`\frac{1}{2\sigma^2}` in the linear case and :math:`\frac{1}{2}` in the logistic case, where :math:`\sigma^2` is the unbiased estimator of residual variance for the linear null model. The R package also applies a "small-sample adjustment" to the null distribution in the logistic case when the sample size is less than 2000. Hail does not apply this adjustment. The fault flag is an integer indicating whether any issues occurred when running the Davies algorithm to compute the p-value as the right tail of a weighted sum of :math:`\chi^2(1)` distributions. +-------------+-----------------------------------------+ | fault value | Description | +=============+=========================================+ | 0 | no issues | +-------------+-----------------------------------------+ | 1 | accuracy NOT achieved | +-------------+-----------------------------------------+ | 2 | round-off error possibly significant | +-------------+-----------------------------------------+ | 3 | invalid parameters | +-------------+-----------------------------------------+ | 4 | unable to locate integration parameters | +-------------+-----------------------------------------+ | 5 | out of memory | +-------------+-----------------------------------------+ Parameters ---------- key_expr : :class:`.Expression` Row-indexed expression for key associated to each row. weight_expr : :class:`.Float64Expression` Row-indexed expression for row weights. y : :class:`.Float64Expression` Column-indexed response expression. If `logistic` is ``True``, all non-missing values must evaluate to 0 or 1. Note that a :class:`.BooleanExpression` will be implicitly converted to a :class:`.Float64Expression` with this property. x : :class:`.Float64Expression` Entry-indexed expression for input variable. covariates : :obj:`list` of :class:`.Float64Expression` List of column-indexed covariate expressions. logistic : :obj:`bool` If true, use the logistic test rather than the linear test. max_size : :obj:`int` Maximum size of group on which to run the test. accuracy : :obj:`float` Accuracy achieved by the Davies algorithm if fault value is zero. iterations : :obj:`int` Maximum number of iterations attempted by the Davies algorithm. Returns ------- :class:`.Table` Table of SKAT results.
""" mt = matrix_table_source('skat/x', x) check_entry_indexed('skat/x', x) analyze('skat/key_expr', key_expr, mt._row_indices) analyze('skat/weight_expr', weight_expr, mt._row_indices) analyze('skat/y', y, mt._col_indices) all_exprs = [key_expr, weight_expr, y] for e in covariates: all_exprs.append(e) analyze('skat/covariates', e, mt._col_indices) _warn_if_no_intercept('skat', covariates) # FIXME: remove this logic when annotation is better optimized if x in mt._fields_inverse: x_field_name = mt._fields_inverse[x] entry_expr = {} else: x_field_name = Env.get_uid() entry_expr = {x_field_name: x} y_field_name = '__y' weight_field_name = '__weight' key_field_name = '__key' cov_field_names = list(f'__cov{i}' for i in range(len(covariates))) mt = mt._annotate_all(col_exprs=dict(**{y_field_name: y}, **dict(zip(cov_field_names, covariates))), row_exprs={weight_field_name: weight_expr, key_field_name: key_expr}, entry_exprs=entry_expr) config = { 'name': 'Skat', 'keyField': key_field_name, 'weightField': weight_field_name, 'xField': x_field_name, 'yField': y_field_name, 'covFields': cov_field_names, 'logistic': logistic, 'maxSize': max_size, 'accuracy': accuracy, 'iterations': iterations } return Table(MatrixToTableApply(mt._mir, config))
[docs]@typecheck(p_value=expr_numeric, approximate=bool) def lambda_gc(p_value, approximate=True): """ Compute genomic inflation factor (lambda GC) from an Expression of p-values. .. include:: ../_templates/experimental.rst Parameters ---------- p_value : :class:`.NumericExpression` Row-indexed numeric expression of p-values. approximate : :obj:`bool` If False, computes exact lambda GC (slower and uses more memory). Returns ------- :obj:`float` Genomic inflation factor (lambda genomic control). """ check_row_indexed('lambda_gc', p_value) t = table_source('lambda_gc', p_value) med_chisq = _lambda_gc_agg(p_value, approximate) return t.aggregate(med_chisq)
@typecheck(p_value=expr_numeric, approximate=bool) def _lambda_gc_agg(p_value, approximate=True): chisq = hl.qchisqtail(p_value, 1) if approximate: med_chisq = hl.agg.approx_quantiles(chisq, 0.5) else: med_chisq = hl.median(hl.agg.collect(chisq)) return med_chisq / hl.qchisqtail(0.5, 1)
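For context, a sketch of computing the inflation factor on a regression result table, showing both the approximate (default) and exact variants implemented above:

>>> gwas_ht = hl.linear_regression_rows(y=dataset.pheno.height,
...                                     x=dataset.GT.n_alt_alleles(),
...                                     covariates=[1])
>>> lgc = hl.lambda_gc(gwas_ht.p_value)
>>> lgc_exact = hl.lambda_gc(gwas_ht.p_value, approximate=False)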
[docs]@typecheck(call_expr=expr_call, k=int, compute_loadings=bool) def hwe_normalized_pca(call_expr, k=10, compute_loadings=False) -> Tuple[List[float], Table, Table]: r"""Run principal component analysis (PCA) on the Hardy-Weinberg-normalized genotype call matrix. Examples -------- >>> eigenvalues, scores, loadings = hl.hwe_normalized_pca(dataset.GT, k=5) Notes ----- This method specializes :func:`.pca` for the common use case of PCA in statistical genetics, that of projecting samples to a small number of ancestry coordinates. Variants that are all homozygous reference or all homozygous alternate are unnormalizable and removed before evaluation. See :func:`.pca` for more details. Users of PLINK/GCTA should be aware that Hail computes the GRM slightly differently with regard to missing data. In Hail, the :math:`ij` entry of the GRM :math:`MM^T` is simply the dot product of rows :math:`i` and :math:`j` of :math:`M`; in terms of :math:`C` it is .. math:: \frac{1}{m}\sum_{l\in\mathcal{C}_i\cap\mathcal{C}_j}\frac{(C_{il}-2p_l)(C_{jl} - 2p_l)}{2p_l(1-p_l)} where :math:`\mathcal{C}_i = \{l \mid C_{il} \text{ is non-missing}\}`. In PLINK/GCTA the denominator :math:`m` is replaced with the number of terms in the sum :math:`\lvert\mathcal{C}_i\cap\mathcal{C}_j\rvert`, i.e. the number of variants where both samples have non-missing genotypes. While this is arguably a better estimator of the true GRM (trading shrinkage for noise), it has the drawback that one loses the clean interpretation of the loadings and scores as features and projections Separately, for the PCs PLINK/GCTA output the eigenvectors of the GRM, i.e. the left singular vectors :math:`U_k` instead of the component scores :math:`U_k S_k`. The scores have the advantage of representing true projections of the data onto features with the variance of a score reflecting the variance explained by the corresponding feature. In PC bi-plots this amounts to a change in aspect ratio; for use of PCs as covariates in regression it is immaterial. Parameters ---------- call_expr : :class:`.CallExpression` Entry-indexed call expression. k : :obj:`int` Number of principal components. compute_loadings : :obj:`bool` If ``True``, compute row loadings. Returns ------- (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`) List of eigenvalues, table with column scores, table with row loadings. """ mt = matrix_table_source('hwe_normalized_pca/call_expr', call_expr) mt = mt.select_entries(__gt=call_expr.n_alt_alleles()) mt = mt.annotate_rows(__AC=agg.sum(mt.__gt), __n_called=agg.count_where(hl.is_defined(mt.__gt))) mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called)) n_variants = mt.count_rows() if n_variants == 0: raise FatalError("hwe_normalized_pca: found 0 variants after filtering out monomorphic sites.") info("hwe_normalized_pca: running PCA using {} variants.".format(n_variants)) mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called) mt = mt.annotate_rows( __hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt * (2 - mt.__mean_gt) * n_variants / 2)) mt = mt.unfilter_entries() normalized_gt = hl.or_else((mt.__gt - mt.__mean_gt) / mt.__hwe_scaled_std_dev, 0.0) return pca(normalized_gt, k, compute_loadings)
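A common follow-on use of the scores table, sketched here: join the scores onto the columns and use the leading PCs as regression covariates. The choice of three PCs is arbitrary:

>>> _, scores_ht, _ = hl.hwe_normalized_pca(dataset.GT, k=5)
>>> dataset_pcs = dataset.annotate_cols(scores=scores_ht[dataset.col_key].scores)
>>> pc_gwas_ht = hl.linear_regression_rows(
...     y=dataset_pcs.pheno.height,
...     x=dataset_pcs.GT.n_alt_alleles(),
...     covariates=[1] + [dataset_pcs.scores[i] for i in range(3)])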
[docs]@typecheck(entry_expr=expr_float64, k=int, compute_loadings=bool) def pca(entry_expr, k=10, compute_loadings=False) -> Tuple[List[float], Table, Table]: r"""Run principal component analysis (PCA) on numeric columns derived from a matrix table. Examples -------- For a matrix table with variant rows, sample columns, and genotype entries, compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and 1s encoding missingness of genotype calls. >>> eigenvalues, scores, _ = hl.pca(hl.int(hl.is_defined(dataset.GT)), ... k=2) Warning ------- This method does **not** automatically mean-center or normalize each column. If desired, such transformations should be incorporated in `entry_expr`. Hail will return an error if `entry_expr` evaluates to missing, nan, or infinity on any entry. Notes ----- PCA is run on the columns of the numeric matrix obtained by evaluating `entry_expr` on each entry of the matrix table, or equivalently on the rows of the **transposed** numeric matrix :math:`M` referenced below. PCA computes the SVD .. math:: M = USV^T where columns of :math:`U` are left singular vectors (orthonormal in :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors (orthonormal in :math:`\mathbb{R}^m`), and :math:`S=\mathrm{diag}(s_1, s_2, \ldots)` with ordered singular values :math:`s_1 \ge s_2 \ge \cdots \ge 0`. Typically one computes only the first :math:`k` singular vectors and values, yielding the best rank :math:`k` approximation :math:`U_k S_k V_k^T` of :math:`M`; the truncations :math:`U_k`, :math:`S_k` and :math:`V_k` are :math:`n \times k`, :math:`k \times k` and :math:`m \times k` respectively. From the perspective of the rows of :math:`M` as samples (data points), :math:`V_k` contains the loadings for the first :math:`k` PCs while :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each sample. The loadings represent a new basis of features while the scores represent the projected data on those features. The eigenvalues of the Gramian :math:`MM^T` are the squares of the singular values :math:`s_1^2, s_2^2, \ldots`, which represent the variances carried by the respective PCs. By default, Hail only computes the loadings if the ``loadings`` parameter is specified. Scores are stored in a :class:`.Table` with the column key of the matrix table as key and a field `scores` of type ``array<float64>`` containing the principal component scores. Loadings are stored in a :class:`.Table` with the row key of the matrix table as key and a field `loadings` of type ``array<float64>`` containing the principal component loadings. The eigenvalues are returned in descending order, with scores and loadings given the corresponding array order. Parameters ---------- entry_expr : :class:`.Expression` Numeric expression for matrix entries. k : :obj:`int` Number of principal components. compute_loadings : :obj:`bool` If ``True``, compute row loadings. Returns ------- (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`) List of eigenvalues, table with column scores, table with row loadings. 
""" check_entry_indexed('pca/entry_expr', entry_expr) mt = matrix_table_source('pca/entry_expr', entry_expr) # FIXME: remove once select_entries on a field is free if entry_expr in mt._fields_inverse: field = mt._fields_inverse[entry_expr] else: field = Env.get_uid() mt = mt.select_entries(**{field: entry_expr}) mt = mt.select_cols().select_rows().select_globals() t = (Table(MatrixToTableApply(mt._mir, { 'name': 'PCA', 'entryField': field, 'k': k, 'computeLoadings': compute_loadings })).persist()) g = t.index_globals() scores = hl.Table.parallelize(g.scores, key=list(mt.col_key)) if not compute_loadings: t = None return hl.eval(g.eigenvalues), scores, None if t is None else t.drop('eigenvalues', 'scores')
[docs]@typecheck(call_expr=expr_call, min_individual_maf=numeric, k=nullable(int), scores_expr=nullable(expr_array(expr_float64)), min_kinship=nullable(numeric), statistics=enumeration('kin', 'kin2', 'kin20', 'all'), block_size=nullable(int)) def pc_relate(call_expr, min_individual_maf, *, k=None, scores_expr=None, min_kinship=None, statistics="all", block_size=None) -> Table: r"""Compute relatedness estimates between individuals using a variant of the PC-Relate method. .. include:: ../_templates/req_diploid_gt.rst Examples -------- Estimate kinship, identity-by-descent two, identity-by-descent one, and identity-by-descent zero for every pair of samples, using a minimum minor allele frequency filter of 0.01 and 10 principal components to control for population structure. >>> rel = hl.pc_relate(dataset.GT, 0.01, k=10) Only compute the kinship statistic. This is more efficient than computing all statistics. >>> rel = hl.pc_relate(dataset.GT, 0.01, k=10, statistics='kin') Compute all statistics, excluding sample-pairs with kinship less than 0.1. This is more efficient than producing the full table and then filtering using :meth:`.Table.filter`. >>> rel = hl.pc_relate(dataset.GT, 0.01, k=10, min_kinship=0.1) One can also pass in pre-computed principal component scores. To produce the same results as in the previous example: >>> _, scores_table, _ = hl.hwe_normalized_pca(dataset.GT, ... k=10, ... compute_loadings=False) >>> rel = hl.pc_relate(dataset.GT, ... 0.01, ... scores_expr=scores_table[dataset.col_key].scores, ... min_kinship=0.1) Notes ----- The traditional estimator for kinship between a pair of individuals :math:`i` and :math:`j`, sharing the set :math:`S_{ij}` of single-nucleotide variants, from a population with allele frequencies :math:`p_s`, is given by: .. math:: \widehat{\phi_{ij}} \coloneqq \frac{1}{|S_{ij}|} \sum_{s \in S_{ij}} \frac{(g_{is} - 2 p_s) (g_{js} - 2 p_s)} {4 \sum_{s \in S_{ij}} p_s (1 - p_s)} This estimator is true under the model that the sharing of common (relative to the population) alleles is not very informative to relatedness (because they're common) and the sharing of rare alleles suggests a recent common ancestor from which the allele was inherited by descent. When multiple ancestry groups are mixed in a sample, this model breaks down. Alleles that are rare in all but one ancestry group are treated as very informative to relatedness. However, these alleles are simply markers of the ancestry group. The PC-Relate method corrects for this situation and the related situation of admixed individuals. PC-Relate slightly modifies the usual estimator for relatedness: occurrences of population allele frequency are replaced with an "individual-specific allele frequency". This modification allows the method to correctly weight an allele according to an individual's unique ancestry profile. The "individual-specific allele frequency" at a given genetic locus is modeled by PC-Relate as a linear function of a sample's first ``k`` principal component coordinates. As such, the efficacy of this method rests on two assumptions: - an individual's first `k` principal component coordinates fully describe their allele-frequency-relevant ancestry, and - the relationship between ancestry (as described by principal component coordinates) and population allele frequency is linear The estimators for kinship, and identity-by-descent zero, one, and two follow. 
Let: - :math:`S_{ij}` be the set of genetic loci at which both individuals :math:`i` and :math:`j` have a defined genotype - :math:`g_{is} \in \{0, 1, 2\}` be the number of alternate alleles that individual :math:`i` has at genetic locus :math:`s` - :math:`\widehat{\mu_{is}} \in [0, 1]` be the individual-specific allele frequency for individual :math:`i` at genetic locus :math:`s` - :math:`{\widehat{\sigma^2_{is}}} \coloneqq \widehat{\mu_{is}} (1 - \widehat{\mu_{is}})`, the binomial variance of :math:`\widehat{\mu_{is}}` - :math:`\widehat{\sigma_{is}} \coloneqq \sqrt{\widehat{\sigma^2_{is}}}`, the binomial standard deviation of :math:`\widehat{\mu_{is}}` - :math:`\text{IBS}^{(0)}_{ij} \coloneqq \sum_{s \in S_{ij}} \mathbb{1}_{\{|g_{is} - g_{js}| = 2\}}`, the number of genetic loci at which individuals :math:`i` and :math:`j` share no alleles - :math:`\widehat{f_i} \coloneqq 2 \widehat{\phi_{ii}} - 1`, the inbreeding coefficient for individual :math:`i` - :math:`g^D_{is}` be a dominance encoding of the genotype matrix, and :math:`X_{is}` be a normalized dominance-coded genotype matrix .. math:: g^D_{is} \coloneqq \begin{cases} \widehat{\mu_{is}} & g_{is} = 0 \\ 0 & g_{is} = 1 \\ 1 - \widehat{\mu_{is}} & g_{is} = 2 \end{cases} \qquad X_{is} \coloneqq g^D_{is} - \widehat{\sigma^2_{is}} (1 - \widehat{f_i}) The estimator for kinship is given by: .. math:: \widehat{\phi_{ij}} \coloneqq \frac{\sum_{s \in S_{ij}}(g - 2 \mu)_{is} (g - 2 \mu)_{js}} {4 \sum_{s \in S_{ij}} \widehat{\sigma_{is}} \widehat{\sigma_{js}}} The estimator for identity-by-descent two is given by: .. math:: \widehat{k^{(2)}_{ij}} \coloneqq \frac{\sum_{s \in S_{ij}}X_{is} X_{js}}{\sum_{s \in S_{ij}} \widehat{\sigma^2_{is}} \widehat{\sigma^2_{js}}} The estimator for identity-by-descent zero is given by: .. math:: \widehat{k^{(0)}_{ij}} \coloneqq \begin{cases} \frac{\text{IBS}^{(0)}_{ij}} {\sum_{s \in S_{ij}} \widehat{\mu_{is}}^2(1 - \widehat{\mu_{js}})^2 + (1 - \widehat{\mu_{is}})^2\widehat{\mu_{js}}^2} & \widehat{\phi_{ij}} > 2^{-5/2} \\ 1 - 4 \widehat{\phi_{ij}} + k^{(2)}_{ij} & \widehat{\phi_{ij}} \le 2^{-5/2} \end{cases} The estimator for identity-by-descent one is given by: .. math:: \widehat{k^{(1)}_{ij}} \coloneqq 1 - \widehat{k^{(2)}_{ij}} - \widehat{k^{(0)}_{ij}} Note that, even if present, phase information is ignored by this method. The PC-Relate method is described in "Model-free Estimation of Recent Genetic Relatedness". Conomos MP, Reiner AP, Weir BS, Thornton TA. American Journal of Human Genetics. 2016 Jan 7. The reference implementation is available in the `GENESIS Bioconductor package <https://bioconductor.org/packages/release/bioc/html/GENESIS.html>`_ . :func:`.pc_relate` differs from the reference implementation in a few ways: - if `k` is supplied, sample scores are computed via PCA on all samples, not a specified subset of genetically unrelated samples. The latter can be achieved by filtering samples, computing PCA variant loadings, and using these loadings to compute and pass in scores for all samples. - the estimators do not perform small sample correction - the algorithm does not provide an option to use population-wide allele frequency estimates - the algorithm does not provide an option to not use "overall standardization" (see R ``pcrelate`` documentation) Under the PC-Relate model, kinship, :math:`\phi_{ij}`, ranges from 0 to 0.5, and is precisely half of the fraction-of-genetic-material-shared. 
Listed below are the statistics for a few pairings: - Monozygotic twins share all their genetic material so their kinship statistic is 0.5 in expectation. - Parent-child and sibling pairs both have kinship 0.25 in expectation and are separated by the identity-by-descent-two, :math:`k^{(2)}_{ij}`, statistic which is zero for parent-child pairs and 0.25 for sibling pairs. - Avuncular pairs and grand-parent/-child pairs both have kinship 0.125 in expectation and both have identity-by-descent-zero 0.5 in expectation. - "Third degree relatives" are those pairs sharing :math:`2^{-3} = 12.5\%` of their genetic material. The results of PC-Relate are often too noisy to reliably distinguish these pairs from higher-degree-relative-pairs or unrelated pairs. Note that :math:`g_{is}` is the number of alternate alleles. Hence, for multi-allelic variants, a value of 2 may indicate two distinct alternate alleles rather than a homozygous variant genotype. To enforce the latter, either filter or split multi-allelic variants first. The resulting table has the first 3, 4, 5, or 6 fields below, depending on the `statistics` parameter: - `i` (``col_key.dtype``) -- First sample. (key field) - `j` (``col_key.dtype``) -- Second sample. (key field) - `kin` (:py:data:`.tfloat64`) -- Kinship estimate, :math:`\widehat{\phi_{ij}}`. - `ibd2` (:py:data:`.tfloat64`) -- IBD2 estimate, :math:`\widehat{k^{(2)}_{ij}}`. - `ibd0` (:py:data:`.tfloat64`) -- IBD0 estimate, :math:`\widehat{k^{(0)}_{ij}}`. - `ibd1` (:py:data:`.tfloat64`) -- IBD1 estimate, :math:`\widehat{k^{(1)}_{ij}}`. Here ``col_key`` refers to the column key of the source matrix table, and ``col_key.dtype`` is a struct containing the column key fields. There is one row for each pair of distinct samples (columns), where `i` corresponds to the column of smaller column index. In particular, if the same column key value exists for :math:`n` columns, then the resulting table will have :math:`\binom{n}{2}` rows with both key fields equal to that column key value. This may result in unexpected behavior in downstream processing. Parameters ---------- call_expr : :class:`.CallExpression` Entry-indexed call expression. min_individual_maf : :obj:`float` The minimum individual-specific minor allele frequency. If either individual-specific minor allele frequency for a pair of individuals is below this threshold, then the variant will not be used to estimate relatedness for the pair. k : :obj:`int`, optional If set, `k` principal component scores are computed and used. Exactly one of `k` and `scores_expr` must be specified. scores_expr : :class:`.ArrayNumericExpression`, optional Column-indexed expression of principal component scores, with the same source as `call_expr`. All array values must have the same positive length, corresponding to the number of principal components, and all scores must be non-missing. Exactly one of `k` and `scores_expr` must be specified. min_kinship : :obj:`float`, optional If set, pairs of samples with kinship lower than `min_kinship` are excluded from the results. statistics : :obj:`str` Set of statistics to compute. If ``'kin'``, only estimate the kinship statistic. If ``'kin2'``, estimate the above and IBD2. If ``'kin20'``, estimate the above and IBD0. If ``'all'``, estimate the above and IBD1. block_size : :obj:`int`, optional Block size of block matrices used in the algorithm. Default given by :meth:`.BlockMatrix.default_block_size`. Returns ------- :class:`.Table` A :class:`.Table` mapping pairs of samples to their pair-wise statistics. 
""" mt = matrix_table_source('pc_relate/call_expr', call_expr) if k and scores_expr is None: _, scores, _ = hwe_normalized_pca(call_expr, k, compute_loadings=False) scores_expr = scores[mt.col_key].scores elif not k and scores_expr is not None: analyze('pc_relate/scores_expr', scores_expr, mt._col_indices) elif k and scores_expr is not None: raise ValueError("pc_relate: exactly one of 'k' and 'scores_expr' must be set, found both") else: raise ValueError("pc_relate: exactly one of 'k' and 'scores_expr' must be set, found neither") scores_table = mt.select_cols(__scores=scores_expr)\ .key_cols_by().select_cols('__scores').cols() n_missing = scores_table.aggregate(agg.count_where(hl.is_missing(scores_table.__scores))) if n_missing > 0: raise ValueError(f'Found {n_missing} columns with missing scores array.') mt = mt.select_entries(__gt=call_expr.n_alt_alleles()).unfilter_entries() mt = mt.annotate_rows(__mean_gt=agg.mean(mt.__gt)) mean_imputed_gt = hl.or_else(hl.float64(mt.__gt), mt.__mean_gt) if not block_size: block_size = BlockMatrix.default_block_size() g = BlockMatrix.from_entry_expr(mean_imputed_gt, block_size=block_size) pcs = scores_table.collect(_localize=False).map(lambda x: x.__scores) ht = Table(BlockMatrixToTableApply(g._bmir, pcs._ir, { 'name': 'PCRelate', 'maf': min_individual_maf, 'blockSize': block_size, 'minKinship': min_kinship, 'statistics': {'kin': 0, 'kin2': 1, 'kin20': 2, 'all': 3}[statistics] })) if statistics == 'kin': ht = ht.drop('ibd0', 'ibd1', 'ibd2') elif statistics == 'kin2': ht = ht.drop('ibd0', 'ibd1') elif statistics == 'kin20': ht = ht.drop('ibd1') col_keys = hl.literal(mt.select_cols().key_cols_by().cols().collect(), dtype=tarray(mt.col_key.dtype)) return ht.key_by(i=col_keys[ht.i], j=col_keys[ht.j])
[docs]@typecheck(ds=oneof(Table, MatrixTable), keep_star=bool, left_aligned=bool, permit_shuffle=bool) def split_multi(ds, keep_star=False, left_aligned=False, *, permit_shuffle=False): """Split multiallelic variants. The resulting dataset will be keyed by the split locus and alleles. :func:`.split_multi` adds the following fields: - `was_split` (*bool*) -- ``True`` if this variant was originally multiallelic, otherwise ``False``. - `a_index` (*int*) -- The original index of this alternate allele in the multiallelic representation (NB: 1 is the first alternate allele or the only alternate allele in a biallelic variant). For example, 1:100:A:T,C splits into two variants: 1:100:A:T with ``a_index = 1`` and 1:100:A:C with ``a_index = 2``. - `old_locus` (*locus*) -- The original, unsplit locus. - `old_alleles` (*array<str>*) -- The original, unsplit alleles. All other fields are left unchanged. Example ------- :func:`.split_multi_hts`, which splits multiallelic variants for the HTS genotype schema and updates the entry fields by downcoding the genotype, is implemented as: >>> sm = hl.split_multi(ds) >>> pl = hl.or_missing( ... hl.is_defined(sm.PL), ... (hl.range(0, 3).map(lambda i: hl.min(hl.range(0, hl.len(sm.PL)) ... .filter(lambda j: hl.downcode(hl.unphased_diploid_gt_index_call(j), sm.a_index) == hl.unphased_diploid_gt_index_call(i)) ... .map(lambda j: sm.PL[j]))))) >>> split_ds = sm.annotate_entries( ... GT=hl.downcode(sm.GT, sm.a_index), ... AD=hl.or_missing(hl.is_defined(sm.AD), ... [hl.sum(sm.AD) - sm.AD[sm.a_index], sm.AD[sm.a_index]]), ... DP=sm.DP, ... PL=pl, ... GQ=hl.gq_from_pl(pl)).drop('old_locus', 'old_alleles') Warning ------- In order to support a wide variety of data types, this function splits only the variants on a :class:`.MatrixTable`, but **not the genotypes**. Use :func:`.split_multi_hts` if possible, or split the genotypes yourself using one of the entry modification methods: :meth:`.MatrixTable.annotate_entries`, :meth:`.MatrixTable.select_entries`, :meth:`.MatrixTable.transmute_entries`. See Also -------- :func:`.split_multi_hts` Parameters ---------- ds : :class:`.MatrixTable` or :class:`.Table` An unsplit dataset. keep_star : :obj:`bool` Do not filter out * alleles. left_aligned : :obj:`bool` If ``True``, variants are assumed to be left aligned and have unique loci. This avoids a shuffle. If the assumption is violated, an error is generated. permit_shuffle : :obj:`bool` If ``True``, permit a data shuffle to sort out-of-order split results. This will only be required if input data has duplicate loci, one of which contains more than one alternate allele. 
Returns ------- :class:`.MatrixTable` or :class:`.Table` """ require_row_key_variant(ds, "split_multi") new_id = Env.get_uid() is_table = isinstance(ds, Table) old_row = ds.row if is_table else ds._rvrow kept_alleles = hl.range(1, hl.len(old_row.alleles)) if not keep_star: kept_alleles = kept_alleles.filter(lambda i: old_row.alleles[i] != "*") def new_struct(variant, i): return hl.struct(alleles=variant.alleles, locus=variant.locus, a_index=i, was_split=hl.len(old_row.alleles) > 2) def split_rows(expr, rekey): if isinstance(ds, MatrixTable): mt = (ds.annotate_rows(**{new_id: expr}) .explode_rows(new_id)) if rekey: mt = mt.key_rows_by() else: mt = mt.key_rows_by('locus') new_row_expr = mt._rvrow.annotate(locus=mt[new_id]['locus'], alleles=mt[new_id]['alleles'], a_index=mt[new_id]['a_index'], was_split=mt[new_id]['was_split'], old_locus=mt.locus, old_alleles=mt.alleles).drop(new_id) mt = mt._select_rows('split_multi', new_row_expr) if rekey: return mt.key_rows_by('locus', 'alleles') else: return MatrixTable(MatrixKeyRowsBy(mt._mir, ['locus', 'alleles'], is_sorted=True)) else: assert isinstance(ds, Table) ht = (ds.annotate(**{new_id: expr}) .explode(new_id)) if rekey: ht = ht.key_by() else: ht = ht.key_by('locus') new_row_expr = ht.row.annotate(locus=ht[new_id]['locus'], alleles=ht[new_id]['alleles'], a_index=ht[new_id]['a_index'], was_split=ht[new_id]['was_split'], old_locus=ht.locus, old_alleles=ht.alleles).drop(new_id) ht = ht._select('split_multi', new_row_expr) if rekey: return ht.key_by('locus', 'alleles') else: return Table(TableKeyBy(ht._tir, ['locus', 'alleles'], is_sorted=True)) if left_aligned: def make_struct(i): def error_on_moved(v): return (hl.case() .when(v.locus == old_row.locus, new_struct(v, i)) .or_error("Found non-left-aligned variant in split_multi")) return hl.bind(error_on_moved, hl.min_rep(old_row.locus, [old_row.alleles[0], old_row.alleles[i]])) return split_rows(hl.sorted(kept_alleles.map(make_struct)), permit_shuffle) else: def make_struct(i, cond): def struct_or_empty(v): return (hl.case() .when(cond(v.locus), hl.array([new_struct(v, i)])) .or_missing()) return hl.bind(struct_or_empty, hl.min_rep(old_row.locus, [old_row.alleles[0], old_row.alleles[i]])) def make_array(cond): return hl.sorted(kept_alleles.flatmap(lambda i: make_struct(i, cond))) left = split_rows(make_array(lambda locus: locus == ds['locus']), permit_shuffle) moved = split_rows(make_array(lambda locus: locus != ds['locus']), True) return left.union(moved) if is_table else left.union_rows(moved, _check_cols=False)
[docs]@typecheck(ds=oneof(Table, MatrixTable), keep_star=bool, left_aligned=bool, vep_root=str, permit_shuffle=bool) def split_multi_hts(ds, keep_star=False, left_aligned=False, vep_root='vep', *, permit_shuffle=False): """Split multiallelic variants for datasets that contain one or more fields from a standard high-throughput sequencing entry schema. .. code-block:: text struct { GT: call, AD: array<int32>, DP: int32, GQ: int32, PL: array<int32>, PGT: call, PID: str } For other entry fields, write your own splitting logic using :meth:`.MatrixTable.annotate_entries`. Examples -------- >>> hl.split_multi_hts(dataset).write('output/split.vds') Notes ----- We will explain by example. Consider a hypothetical 3-allelic variant: .. code-block:: text A C,T 0/2:7,2,6:15:45:99,50,99,0,45,99 :func:`.split_multi_hts` will create two biallelic variants (one for each alternate allele) at the same position .. code-block:: text A C 0/0:13,2:15:45:0,45,99 A T 0/1:9,6:15:50:50,0,99 Each multiallelic `GT` or `PGT` field is downcoded once for each alternate allele. A call for an alternate allele maps to 1 in the biallelic variant corresponding to itself and 0 otherwise. In the example above, 0/2 maps to 0/0 and 0/1. The genotype 1/2 maps to 0/1 and 0/1. The biallelic alt `AD` entry is just the multiallelic `AD` entry corresponding to the alternate allele. The ref AD entry is the sum of the other multiallelic entries. The biallelic `DP` is the same as the multiallelic `DP`. The biallelic `PL` entry for a genotype g is the minimum over `PL` entries for multiallelic genotypes that downcode to g. For example, the `PL` for (A, C) at 0/1 is the minimum of the PLs for 0/1 (50) and 1/2 (45), and thus 45. Fixing an alternate allele and biallelic variant, downcoding gives a map from multiallelic to biallelic alleles and genotypes. The biallelic `AD` entry for an allele is just the sum of the multiallelic `AD` entries for alleles that map to that allele. Similarly, the biallelic `PL` entry for a genotype is the minimum over multiallelic `PL` entries for genotypes that map to that genotype. `GQ` is recomputed from `PL` if `PL` is provided and is not missing. If not, it is copied from the original GQ. Here is a second example for a het non-ref: .. code-block:: text A C,T 1/2:2,8,6:16:45:99,50,99,45,0,99 splits as .. code-block:: text A C 0/1:8,8:16:45:45,0,99 A T 0/1:10,6:16:50:50,0,99 **VCF Info Fields** Hail does not split fields in the info field. This means that if a multiallelic site with `info.AC` value ``[10, 2]`` is split, each split site will contain the same array ``[10, 2]``. The provided allele index field `a_index` can be used to select the value corresponding to the split allele's position: >>> split_ds = hl.split_multi_hts(dataset) >>> split_ds = split_ds.filter_rows(split_ds.info.AC[split_ds.a_index - 1] < 10, ... keep = False) VCFs split by Hail and exported to new VCFs may be incompatible with other tools, if action is not taken first. Since the "Number" of the arrays in split multiallelic sites no longer matches the structure on import ("A" for 1 per allele, for example), Hail will export these fields with number ".". If the desired output is one value per site, then it is possible to use :meth:`.MatrixTable.annotate_rows` to remap these values. Here is an example: >>> split_ds = hl.split_multi_hts(dataset) >>> split_ds = split_ds.annotate_rows(info = hl.struct(AC=split_ds.info.AC[split_ds.a_index - 1], ... 
**split_ds.info)) # doctest: +SKIP >>> hl.export_vcf(split_ds, 'output/export.vcf') # doctest: +SKIP The info field AC in *data/export.vcf* will have ``Number=1``. **New Fields** :func:`.split_multi_hts` adds the following fields: - `was_split` (*bool*) -- ``True`` if this variant was originally multiallelic, otherwise ``False``. - `a_index` (*int*) -- The original index of this alternate allele in the multiallelic representation (NB: 1 is the first alternate allele or the only alternate allele in a biallelic variant). For example, 1:100:A:T,C splits into two variants: 1:100:A:T with ``a_index = 1`` and 1:100:A:C with ``a_index = 2``. See Also -------- :func:`.split_multi` Parameters ---------- ds : :class:`.MatrixTable` or :class:`.Table` An unsplit dataset. keep_star : :obj:`bool` Do not filter out * alleles. left_aligned : :obj:`bool` If ``True``, variants are assumed to be left aligned and have unique loci. This avoids a shuffle. If the assumption is violated, an error is generated. vep_root : :obj:`str` Top-level location of vep data. All variable-length VEP fields (intergenic_consequences, motif_feature_consequences, regulatory_feature_consequences, and transcript_consequences) will be split properly (i.e. a_index corresponding to the VEP allele_num). permit_shuffle : :obj:`bool` If ``True``, permit a data shuffle to sort out-of-order split results. This will only be required if input data has duplicate loci, one of which contains more than one alternate allele. Returns ------- :class:`.MatrixTable` or :class:`.Table` A biallelic variant dataset. """ split = split_multi(ds, keep_star=keep_star, left_aligned=left_aligned, permit_shuffle=permit_shuffle) row_fields = set(ds.row) update_rows_expression = {} if vep_root in row_fields: update_rows_expression[vep_root] = split[vep_root].annotate(**{ x: split[vep_root][x].filter(lambda csq: csq.allele_num == split.a_index) for x in ('intergenic_consequences', 'motif_feature_consequences', 'regulatory_feature_consequences', 'transcript_consequences')}) if isinstance(ds, Table): return split.annotate(**update_rows_expression).drop('old_locus', 'old_alleles') split = split.annotate_rows(**update_rows_expression) entry_fields = ds.entry expected_field_types = { 'GT': hl.tcall, 'AD': hl.tarray(hl.tint), 'DP': hl.tint, 'GQ': hl.tint, 'PL': hl.tarray(hl.tint), 'PGT': hl.tcall, 'PID': hl.tstr } bad_fields = [] for field in entry_fields: if field in expected_field_types and entry_fields[field].dtype != expected_field_types[field]: bad_fields.append((field, entry_fields[field].dtype, expected_field_types[field])) if bad_fields: msg = '\n '.join([f"'{x[0]}'\tfound: {x[1]}\texpected: {x[2]}" for x in bad_fields]) raise TypeError("'split_multi_hts': Found invalid types for the following fields:\n " + msg) update_entries_expression = {} if 'GT' in entry_fields: update_entries_expression['GT'] = hl.downcode(split.GT, split.a_index) if 'DP' in entry_fields: update_entries_expression['DP'] = split.DP if 'AD' in entry_fields: update_entries_expression['AD'] = hl.or_missing(hl.is_defined(split.AD), [hl.sum(split.AD) - split.AD[split.a_index], split.AD[split.a_index]]) if 'PL' in entry_fields: pl = hl.or_missing( hl.is_defined(split.PL), (hl.range(0, 3).map(lambda i: hl.min((hl.range(0, hl.triangle(split.old_alleles.length())) .filter(lambda j: hl.downcode(hl.unphased_diploid_gt_index_call(j), split.a_index).unphased_diploid_gt_index() == i ).map(lambda j: split.PL[j])))))) if 'GQ' in entry_fields: update_entries_expression['PL'] = pl 
update_entries_expression['GQ'] = hl.or_else(hl.gq_from_pl(pl), split.GQ) else: update_entries_expression['PL'] = pl else: if 'GQ' in entry_fields: update_entries_expression['GQ'] = split.GQ if 'PGT' in entry_fields: update_entries_expression['PGT'] = hl.downcode(split.PGT, split.a_index) if 'PID' in entry_fields: update_entries_expression['PID'] = split.PID return split.annotate_entries(**update_entries_expression).drop('old_locus', 'old_alleles')
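# Illustrative sketch (not part of the Hail source): after splitting, the `was_split` field can be
# used to audit the result, e.g. to count how many output rows came from multiallelic sites. Assumes
# `dataset` is a MatrixTable with the HTS entry schema; the helper name is hypothetical.
def _example_count_split_rows(dataset):
    split_ds = hl.split_multi_hts(dataset)
    return split_ds.aggregate_rows(hl.agg.count_where(split_ds.was_split))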
[docs]@typecheck(call_expr=expr_call) def genetic_relatedness_matrix(call_expr) -> BlockMatrix: r"""Compute the genetic relatedness matrix (GRM). Examples -------- >>> grm = hl.genetic_relatedness_matrix(dataset.GT) Notes ----- The genetic relationship matrix (GRM) :math:`G` encodes genetic correlation between each pair of samples. It is defined by :math:`G = MM^T` where :math:`M` is a standardized version of the genotype matrix, computed as follows. Let :math:`C` be the :math:`n \times m` matrix of raw genotypes in the variant dataset, with rows indexed by :math:`n` samples and columns indexed by :math:`m` biallelic autosomal variants; :math:`C_{ij}` is the number of alternate alleles of variant :math:`j` carried by sample :math:`i`, which can be 0, 1, 2, or missing. For each variant :math:`j`, the sample alternate allele frequency :math:`p_j` is computed as half the mean of the non-missing entries of column :math:`j`. Entries of :math:`M` are then mean-centered and variance-normalized as .. math:: M_{ij} = \frac{C_{ij}-2p_j}{\sqrt{2p_j(1-p_j)m}}, with :math:`M_{ij} = 0` for :math:`C_{ij}` missing (i.e. mean genotype imputation). This scaling normalizes genotype variances to a common value :math:`1/m` for variants in Hardy-Weinberg equilibrium and is further motivated in the paper `Patterson, Price and Reich, 2006 <http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.0020190>`__. (The resulting amplification of signal from the low end of the allele frequency spectrum will also introduce noise for rare variants; common practice is to filter out variants with minor allele frequency below some cutoff.) The factor :math:`1/m` gives each sample row approximately unit total variance (assuming linkage equilibrium) so that the diagonal entries of the GRM are approximately 1. Equivalently, .. math:: G_{ik} = \frac{1}{m} \sum_{j=1}^m \frac{(C_{ij}-2p_j)(C_{kj}-2p_j)}{2 p_j (1-p_j)} This method drops variants with :math:`p_j = 0` or :math:`p_j = 1` before computing kinship. Parameters ---------- call_expr : :class:`.CallExpression` Entry-indexed call expression with columns corresponding to samples. Returns ------- :class:`.BlockMatrix` Genetic relatedness matrix for all samples. Row and column indices correspond to matrix table column index. """ mt = matrix_table_source('genetic_relatedness_matrix/call_expr', call_expr) check_entry_indexed('genetic_relatedness_matrix/call_expr', call_expr) mt = mt.select_entries(__gt=call_expr.n_alt_alleles()).unfilter_entries() mt = mt.select_rows(__AC=agg.sum(mt.__gt), __n_called=agg.count_where(hl.is_defined(mt.__gt))) mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called)) mt = mt.select_rows(__mean_gt=mt.__AC / mt.__n_called) mt = mt.annotate_rows(__hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt * (2 - mt.__mean_gt))) normalized_gt = hl.or_else((mt.__gt - mt.__mean_gt) / mt.__hwe_scaled_std_dev, 0.0) bm = BlockMatrix.from_entry_expr(normalized_gt) return (bm.T @ bm) / (bm.n_rows / 2.0)
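# Illustrative sketch (not part of the Hail source): the GRM normalization above written out with
# NumPy for a small dense genotype matrix with no missing entries. The matrix values below are made
# up; rows are samples and columns are variants, and the helper name is hypothetical.
def _example_grm(C):
    n, m = C.shape
    p = C.mean(axis=0) / 2                          # sample alternate allele frequencies p_j
    M = (C - 2 * p) / np.sqrt(2 * p * (1 - p) * m)  # mean-centered, HWE variance-normalized
    return M @ M.T                                  # n x n GRM
# _example_grm(np.array([[0., 1., 2.], [1., 1., 0.], [2., 0., 1.], [0., 2., 1.]]))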
[docs]@typecheck(call_expr=expr_call) def realized_relationship_matrix(call_expr) -> BlockMatrix: r"""Computes the realized relationship matrix (RRM). Examples -------- >>> rrm = hl.realized_relationship_matrix(dataset.GT) Notes ----- The realized relationship matrix (RRM) is defined as follows. Consider the :math:`n \times m` matrix :math:`C` of raw genotypes, with rows indexed by :math:`n` samples and columns indexed by the :math:`m` biallelic autosomal variants; :math:`C_{ij}` is the number of alternate alleles of variant :math:`j` carried by sample :math:`i`, which can be 0, 1, 2, or missing. For each variant :math:`j`, the sample alternate allele frequency :math:`p_j` is computed as half the mean of the non-missing entries of column :math:`j`. Entries of :math:`M` are then mean-centered and variance-normalized as .. math:: M_{ij} = \frac{C_{ij}-2p_j} {\sqrt{\frac{m}{n} \sum_{k=1}^n (C_{kj}-2p_j)^2}}, with :math:`M_{ij} = 0` for :math:`C_{ij}` missing (i.e. mean genotype imputation). This scaling normalizes each variant column to have empirical variance :math:`1/m`, which gives each sample row approximately unit total variance (assuming linkage equilibrium) and yields the :math:`n \times n` sample correlation or realized relationship matrix (RRM) :math:`K` as simply .. math:: K = MM^T Note that the only difference between the realized relationship matrix and the genetic relatedness matrix (GRM) used in :func:`.genetic_relatedness_matrix` is the variant (column) normalization: where RRM uses empirical variance, GRM uses expected variance under Hardy-Weinberg Equilibrium. This method drops variants with zero variance before computing kinship. Parameters ---------- call_expr : :class:`.CallExpression` Entry-indexed call expression on matrix table with columns corresponding to samples. Returns ------- :class:`.BlockMatrix` Realized relationship matrix for all samples. Row and column indices correspond to matrix table column index. """ mt = matrix_table_source('realized_relationship_matrix/call_expr', call_expr) check_entry_indexed('realized_relationship_matrix/call_expr', call_expr) mt = mt.select_entries(__gt=call_expr.n_alt_alleles()).unfilter_entries() mt = mt.select_rows(__AC=agg.sum(mt.__gt), __ACsq=agg.sum(mt.__gt * mt.__gt), __n_called=agg.count_where(hl.is_defined(mt.__gt))) mt = mt.select_rows(__mean_gt=mt.__AC / mt.__n_called, __centered_length=hl.sqrt(mt.__ACsq - (mt.__AC ** 2) / mt.__n_called)) mt = mt.filter_rows(mt.__centered_length > 0.1) # truly non-zero values are at least sqrt(0.5) normalized_gt = hl.or_else((mt.__gt - mt.__mean_gt) / mt.__centered_length, 0.0) bm = BlockMatrix.from_entry_expr(normalized_gt) return (bm.T @ bm) / (bm.n_rows / bm.n_cols)
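# Illustrative sketch (not part of the Hail source): the RRM normalization above in NumPy, for
# comparison with the GRM sketch; only the per-variant denominator changes, from the expected
# Hardy-Weinberg variance to the empirical variance. The helper name is hypothetical.
def _example_rrm(C):
    n, m = C.shape
    p = C.mean(axis=0) / 2                          # sample alternate allele frequencies p_j
    resid = C - 2 * p
    M = resid / np.sqrt((m / n) * (resid ** 2).sum(axis=0))
    return M @ M.T                                  # n x n RRM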
[docs]@typecheck(entry_expr=expr_float64, block_size=nullable(int)) def row_correlation(entry_expr, block_size=None) -> BlockMatrix: """Computes the correlation matrix between row vectors. Examples -------- Consider the following dataset with three variants and four samples: >>> data = [{'v': '1:1:A:C', 's': 'a', 'GT': hl.Call([0, 0])}, ... {'v': '1:1:A:C', 's': 'b', 'GT': hl.Call([0, 0])}, ... {'v': '1:1:A:C', 's': 'c', 'GT': hl.Call([0, 1])}, ... {'v': '1:1:A:C', 's': 'd', 'GT': hl.Call([1, 1])}, ... {'v': '1:2:G:T', 's': 'a', 'GT': hl.Call([0, 1])}, ... {'v': '1:2:G:T', 's': 'b', 'GT': hl.Call([1, 1])}, ... {'v': '1:2:G:T', 's': 'c', 'GT': hl.Call([0, 1])}, ... {'v': '1:2:G:T', 's': 'd', 'GT': hl.Call([0, 0])}, ... {'v': '1:3:C:G', 's': 'a', 'GT': hl.Call([0, 1])}, ... {'v': '1:3:C:G', 's': 'b', 'GT': hl.Call([0, 0])}, ... {'v': '1:3:C:G', 's': 'c', 'GT': hl.Call([1, 1])}, ... {'v': '1:3:C:G', 's': 'd', 'GT': hl.null(hl.tcall)}] >>> ht = hl.Table.parallelize(data, hl.dtype('struct{v: str, s: str, GT: call}')) >>> mt = ht.to_matrix_table(row_key=['v'], col_key=['s']) Compute genotype correlation between all pairs of variants: >>> ld = hl.row_correlation(mt.GT.n_alt_alleles()) >>> ld.to_numpy() array([[ 1. , -0.85280287, 0.42640143], [-0.85280287, 1. , -0.5 ], [ 0.42640143, -0.5 , 1. ]]) Compute genotype correlation between consecutively-indexed variants: >>> ld.sparsify_band(lower=0, upper=1).to_numpy() array([[ 1. , -0.85280287, 0. ], [ 0. , 1. , -0.5 ], [ 0. , 0. , 1. ]]) Warning ------- Rows with a constant value (i.e., zero variance) will result in ``nan`` correlation values. To avoid this, first check that all rows vary or filter out constant rows (for example, with the help of :func:`.aggregators.stats`). Notes ----- In this method, each row of entries is regarded as a vector with elements defined by `entry_expr` and missing values mean-imputed per row. The ``(i, j)`` element of the resulting block matrix is the correlation between rows ``i`` and ``j`` (as 0-indexed by order in the matrix table; see :meth:`add_row_index`). The correlation of two vectors is defined as the `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`__ between the corresponding empirical distributions of elements, or equivalently as the cosine of the angle between the vectors. This method has two stages: - writing the row-normalized block matrix to a temporary file on persistent disk with :meth:`BlockMatrix.from_entry_expr`. The parallelism is ``n_rows / block_size``. - reading and multiplying this block matrix by its transpose. The parallelism is ``(n_rows / block_size)^2`` if all blocks are computed. Warning ------- See all warnings on :meth:`BlockMatrix.from_entry_expr`. In particular, for large matrices, it may be preferable to run the two stages separately, saving the row-normalized block matrix to a file on external storage with :meth:`BlockMatrix.write_from_entry_expr`. The resulting number of matrix elements is the square of the number of rows in the matrix table, so computing the full matrix may be infeasible. For example, ten million rows would produce 800TB of float64 values. The block-sparse representation of :class:`.BlockMatrix` may be used to work efficiently with regions of such matrices, as in the second example above and :func:`.ld_matrix`. To prevent excessive re-computation, be sure to write and read the (possibly block-sparsified) result before multiplication by another matrix. 
Parameters ---------- entry_expr : :class:`.Float64Expression` Entry-indexed numeric expression on matrix table. block_size : :obj:`int`, optional Block size. Default given by :meth:`.BlockMatrix.default_block_size`. Returns ------- :class:`.BlockMatrix` Correlation matrix between row vectors. Row and column indices correspond to matrix table row index. """ bm = BlockMatrix.from_entry_expr(entry_expr, mean_impute=True, center=True, normalize=True, block_size=block_size) return bm @ bm.T
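# Illustrative sketch (not part of the Hail source): the same correlation computed directly with
# NumPy for the three variants in the example above (the missing call of sample 'd' at 1:3:C:G is
# mean-imputed to 1.0). Centering each row and scaling it to unit norm, then multiplying by the
# transpose, reproduces the matrix shown. The helper name is hypothetical.
def _example_row_correlation(X):
    Xc = X - X.mean(axis=1, keepdims=True)               # center each row (variant)
    Xn = Xc / np.linalg.norm(Xc, axis=1, keepdims=True)   # scale each row to unit norm
    return Xn @ Xn.T
# _example_row_correlation(np.array([[0., 0., 1., 2.], [1., 2., 1., 0.], [1., 0., 2., 1.]]))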
[docs]@typecheck(entry_expr=expr_float64, locus_expr=expr_locus(), radius=oneof(int, float), coord_expr=nullable(expr_float64), block_size=nullable(int)) def ld_matrix(entry_expr, locus_expr, radius, coord_expr=None, block_size=None) -> BlockMatrix: """Computes the windowed correlation (linkage disequilibrium) matrix between variants. Examples -------- Consider the following dataset consisting of three variants with centimorgan coordinates and four samples: >>> data = [{'v': '1:1:A:C', 'cm': 0.1, 's': 'a', 'GT': hl.Call([0, 0])}, ... {'v': '1:1:A:C', 'cm': 0.1, 's': 'b', 'GT': hl.Call([0, 0])}, ... {'v': '1:1:A:C', 'cm': 0.1, 's': 'c', 'GT': hl.Call([0, 1])}, ... {'v': '1:1:A:C', 'cm': 0.1, 's': 'd', 'GT': hl.Call([1, 1])}, ... {'v': '1:2000000:G:T', 'cm': 0.9, 's': 'a', 'GT': hl.Call([0, 1])}, ... {'v': '1:2000000:G:T', 'cm': 0.9, 's': 'b', 'GT': hl.Call([1, 1])}, ... {'v': '1:2000000:G:T', 'cm': 0.9, 's': 'c', 'GT': hl.Call([0, 1])}, ... {'v': '1:2000000:G:T', 'cm': 0.9, 's': 'd', 'GT': hl.Call([0, 0])}, ... {'v': '2:1:C:G', 'cm': 0.2, 's': 'a', 'GT': hl.Call([0, 1])}, ... {'v': '2:1:C:G', 'cm': 0.2, 's': 'b', 'GT': hl.Call([0, 0])}, ... {'v': '2:1:C:G', 'cm': 0.2, 's': 'c', 'GT': hl.Call([1, 1])}, ... {'v': '2:1:C:G', 'cm': 0.2, 's': 'd', 'GT': hl.null(hl.tcall)}] >>> ht = hl.Table.parallelize(data, hl.dtype('struct{v: str, s: str, cm: float64, GT: call}')) >>> ht = ht.transmute(**hl.parse_variant(ht.v)) >>> mt = ht.to_matrix_table(row_key=['locus', 'alleles'], col_key=['s'], row_fields=['cm']) Compute linkage disequilibrium between all pairs of variants on the same contig and within two megabases: >>> ld = hl.ld_matrix(mt.GT.n_alt_alleles(), mt.locus, radius=2e6) >>> ld.to_numpy() array([[ 1. , -0.85280287, 0. ], [-0.85280287, 1. , 0. ], [ 0. , 0. , 1. ]]) Within one megabase: >>> ld = hl.ld_matrix(mt.GT.n_alt_alleles(), mt.locus, radius=1e6) >>> ld.to_numpy() array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]) Within one centimorgan: >>> ld = hl.ld_matrix(mt.GT.n_alt_alleles(), mt.locus, radius=1.0, coord_expr=mt.cm) >>> ld.to_numpy() array([[ 1. , -0.85280287, 0. ], [-0.85280287, 1. , 0. ], [ 0. , 0. , 1. ]]) Within one centimorgan, and only calculate the upper triangle: >>> ld = hl.ld_matrix(mt.GT.n_alt_alleles(), mt.locus, radius=1.0, coord_expr=mt.cm) >>> ld = ld.sparsify_triangle() >>> ld.to_numpy() array([[ 1. , -0.85280287, 0. ], [ 0. , 1. , 0. ], [ 0. , 0. , 1. ]]) Notes ----- This method sparsifies the result of :func:`.row_correlation` using :func:`.linalg.utils.locus_windows` and :meth:`.BlockMatrix.sparsify_row_intervals` in order to only compute linkage disequilibrium between nearby variants. Use :func:`.row_correlation` directly to calculate correlation without windowing. More precisely, variants are 0-indexed by their order in the matrix table (see :meth:`add_row_index`). Each variant is regarded as a vector of elements defined by `entry_expr`, typically the number of alternate alleles or genotype dosage. Missing values are mean-imputed within variant. The method produces a symmetric block-sparse matrix supported in a neighborhood of the diagonal. If variants :math:`i` and :math:`j` are on the same contig and within `radius` base pairs (inclusive) then the :math:`(i, j)` element is their `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`__. Otherwise, the :math:`(i, j)` element is ``0.0``. Rows with a constant value (i.e., zero variance) will result in ``nan`` correlation values. 
To avoid this, first check that all variants vary or filter out constant variants (for example, with the help of :func:`.aggregators.stats`). If the :meth:`.global_position` on `locus_expr` is not in ascending order, this method will fail. Ascending order should hold for a matrix table keyed by locus or variant (and the associated row table), or for a table that's been ordered by `locus_expr`. Set `coord_expr` to use a value other than position to define the windows. This row-indexed numeric expression must be non-missing, non-``nan``, on the same source as `locus_expr`, and ascending with respect to locus position for each contig; otherwise the method will raise an error. Warning ------- See the warnings in :meth:`row_correlation`. In particular, for large matrices it may be preferable to run its stages separately. `entry_expr` and `locus_expr` are implicitly aligned by row-index, though they need not be on the same source. If their sources differ in the number of rows, an error will be raised; otherwise, unintended misalignment may silently produce unexpected results. Parameters ---------- entry_expr : :class:`.Float64Expression` Entry-indexed numeric expression on matrix table. locus_expr : :class:`.LocusExpression` Row-indexed locus expression on a table or matrix table that is row-aligned with the matrix table of `entry_expr`. radius: :obj:`int` or :obj:`float` Radius of window for row values. coord_expr: :class:`.Float64Expression`, optional Row-indexed numeric expression for the row value on the same table or matrix table as `locus_expr`. By default, the row value is given by the locus position. block_size : :obj:`int`, optional Block size. Default given by :meth:`.BlockMatrix.default_block_size`. Returns ------- :class:`.BlockMatrix` Windowed correlation matrix between variants. Row and column indices correspond to matrix table variant index. """ starts_and_stops = hl.linalg.utils.locus_windows(locus_expr, radius, coord_expr, _localize=False) ld = hl.row_correlation(entry_expr, block_size) return BlockMatrix._from_java(ld._jbm.filterRowIntervalsIR( Env.backend()._to_java_ir(starts_and_stops._ir), False))
[docs]@typecheck(n_populations=int, n_samples=int, n_variants=int, n_partitions=nullable(int), pop_dist=nullable(sequenceof(numeric)), fst=nullable(sequenceof(numeric)), af_dist=expr_any, reference_genome=reference_genome_type, mixture=bool) def balding_nichols_model(n_populations, n_samples, n_variants, n_partitions=None, pop_dist=None, fst=None, af_dist=hl.rand_unif(0.1, 0.9, seed=0), reference_genome='default', mixture=False) -> MatrixTable: r"""Generate a matrix table of variants, samples, and genotypes using the Balding-Nichols or Pritchard-Stephens-Donnelly model. Examples -------- Generate a matrix table of genotypes with 1000 variants and 100 samples across 3 populations: >>> bn_ds = hl.balding_nichols_model(3, 100, 1000, reference_genome='GRCh37') Generate a matrix table using 4 populations, 40 samples, 150 variants, 3 partitions, population distribution ``[0.1, 0.2, 0.3, 0.4]``, :math:`F_{ST}` values ``[.02, .06, .04, .12]``, ancestral allele frequencies drawn from a truncated beta distribution with ``a = 0.01`` and ``b = 2.0`` over the interval ``[0.05, 1]``, and random seed 1: >>> hl.set_global_seed(1) >>> bn_ds = hl.balding_nichols_model(4, 40, 150, 3, ... pop_dist=[0.1, 0.2, 0.3, 0.4], ... fst=[.02, .06, .04, .12], ... af_dist=hl.rand_beta(a=0.01, b=2.0, lower=0.05, upper=1.0)) To guarantee reproducibility, we set the Hail global seed with :func:`.set_global_seed` immediately prior to generating the dataset. Notes ----- This method simulates a matrix table of variants, samples, and genotypes using the Balding-Nichols model, which we now define. - :math:`K` populations are labeled by integers :math:`0, 1, \dots, K - 1`. - :math:`N` samples are labeled by integers :math:`0, 1, \dots, N - 1`. - :math:`M` variants are defined as ``1:1:A:C``, ``1:2:A:C``, ..., ``1:M:A:C``. - The default distribution for population assignment :math:`\pi` is uniform. - The default ancestral frequency distribution :math:`P_0` is uniform on :math:`[0.1, 0.9]`. - The default :math:`F_{ST}` values are all :math:`0.1`. The Balding-Nichols model models genotypes of individuals from a structured population comprising :math:`K` homogeneous modern populations that have each diverged from a single ancestral population (a `star phylogeny`). Each sample is assigned a population by sampling from the categorical distribution :math:`\pi`. Note that the actual size of each population is random. Variants are modeled as biallelic and unlinked. Ancestral allele frequencies are drawn independently for each variant from a frequency spectrum :math:`P_0`. The extent of genetic drift of each modern population from the ancestral population is defined by the corresponding :math:`F_{ST}` parameter :math:`F_k` (here and below, lowercase indices run over a range bounded by the corresponding uppercase parameter, e.g. :math:`k = 1, \ldots, K`). For each variant and population, allele frequencies are drawn from a `beta distribution <https://en.wikipedia.org/wiki/Beta_distribution>`__ whose parameters are determined by the ancestral allele frequency and :math:`F_{ST}` parameter. The beta distribution gives a continuous approximation of the effect of genetic drift. We denote sample population assignments by :math:`k_n`, ancestral allele frequencies by :math:`p_m`, population allele frequencies by :math:`p_{k, m}`, and diploid, unphased genotype calls by :math:`g_{n, m}` (0, 1, and 2 correspond to homozygous reference, heterozygous, and homozygous variant, respectively). 
The generative model is then given by: .. math:: \begin{aligned} k_n \,&\sim\, \pi \\ p_m \,&\sim\, P_0 \\ p_{k,m} \mid p_m\,&\sim\, \mathrm{Beta}(\mu = p_m,\, \sigma^2 = F_k p_m (1 - p_m)) \\ g_{n,m} \mid k_n, p_{k, m} \,&\sim\, \mathrm{Binomial}(2, p_{k_n, m}) \end{aligned} The beta distribution is parameterized by its mean and variance above; the usual parameters are :math:`a = p \frac{1 - F}{F}` and :math:`b = (1 - p) \frac{1 - F}{F}` with :math:`F = F_k` and :math:`p = p_m`. The resulting dataset has the following fields. Global fields: - `bn.n_populations` (:py:data:`.tint32`) -- Number of populations. - `bn.n_samples` (:py:data:`.tint32`) -- Number of samples. - `bn.n_variants` (:py:data:`.tint32`) -- Number of variants. - `bn.n_partitions` (:py:data:`.tint32`) -- Number of partitions. - `bn.pop_dist` (:class:`.tarray` of :py:data:`.tfloat64`) -- Population distribution indexed by population. - `bn.fst` (:class:`.tarray` of :py:data:`.tfloat64`) -- :math:`F_{ST}` values indexed by population. - `bn.seed` (:py:data:`.tint32`) -- Random seed. - `bn.mixture` (:py:data:`.tbool`) -- Value of `mixture` parameter. Row fields: - `locus` (:class:`.tlocus`) -- Variant locus (key field). - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Variant alleles (key field). - `ancestral_af` (:py:data:`.tfloat64`) -- Ancestral allele frequency. - `af` (:class:`.tarray` of :py:data:`.tfloat64`) -- Modern allele frequencies indexed by population. Column fields: - `sample_idx` (:py:data:`.tint32`) -- Sample index (key field). - `pop` (:py:data:`.tint32`) -- Population of sample. Entry fields: - `GT` (:py:data:`.tcall`) -- Genotype call (diploid, unphased). For the `Pritchard-Stephens-Donnelly model <http://www.genetics.org/content/155/2/945.long>`__, set `mixture` to ``True`` to treat `pop_dist` as the parameters of the Dirichlet distribution describing admixture between the modern populations. In this case, the type of `pop` is :class:`.tarray` of :py:data:`.tfloat64` and the value is the mixture proportions. Parameters ---------- n_populations : :obj:`int` Number of modern populations. n_samples : :obj:`int` Total number of samples. n_variants : :obj:`int` Number of variants. n_partitions : :obj:`int`, optional Number of partitions. Default is 1 partition per million entries or 8, whichever is larger. pop_dist : :obj:`list` of :obj:`float`, optional Unnormalized population distribution, a list of length `n_populations` with non-negative values. Default is ``[1, ..., 1]``. fst : :obj:`list` of :obj:`float`, optional :math:`F_{ST}` values, a list of length `n_populations` with values in (0, 1). Default is ``[0.1, ..., 0.1]``. af_dist : :class:`.Float64Expression` representing a random function. Ancestral allele frequency distribution. Default is :func:`.rand_unif` over the range `[0.1, 0.9]` with seed 0. reference_genome : :obj:`str` or :class:`.ReferenceGenome` Reference genome to use. mixture : :obj:`bool` Treat `pop_dist` as the parameters of a Dirichlet distribution, as in the Pritchard-Stephens-Donnelly model. Returns ------- :class:`.MatrixTable` Simulated matrix table of variants, samples, and genotypes. 
""" if pop_dist is None: pop_dist = [1 for _ in range(n_populations)] if fst is None: fst = [0.1 for _ in range(n_populations)] if n_partitions is None: n_partitions = max(8, int(n_samples * n_variants / 1000000)) # verify args for name, var in {"populations": n_populations, "samples": n_samples, "variants": n_variants, "partitions": n_partitions}.items(): if var < 1: raise ValueError("n_{} must be positive, got {}".format(name, var)) for name, var in {"pop_dist": pop_dist, "fst": fst}.items(): if len(var) != n_populations: raise ValueError("{} must be of length n_populations={}, got length {}" .format(name, n_populations, len(var))) if any(x < 0 for x in pop_dist): raise ValueError("pop_dist must be non-negative, got {}" .format(pop_dist)) if any(x <= 0 or x >= 1 for x in fst): raise ValueError("elements of fst must satisfy 0 < x < 1, got {}" .format(fst)) # verify af_dist if not af_dist._is_scalar: raise ExpressionException('balding_nichols_model expects af_dist to ' + 'have scalar arguments: found expression ' + 'from source {}' .format(af_dist._indices.source)) if af_dist.dtype != tfloat64: raise ValueError("af_dist must be a hail function with return type tfloat64.") info("balding_nichols_model: generating genotypes for {} populations, {} samples, and {} variants..." .format(n_populations, n_samples, n_variants)) # generate matrix table bn = hl.utils.range_matrix_table(n_variants, n_samples, n_partitions) bn = bn.annotate_globals( bn=hl.struct(n_populations=n_populations, n_samples=n_samples, n_variants=n_variants, n_partitions=n_partitions, pop_dist=pop_dist, fst=fst, mixture=mixture)) # col info pop_f = hl.rand_dirichlet if mixture else hl.rand_cat bn = bn.key_cols_by(sample_idx=bn.col_idx) bn = bn.select_cols(pop=pop_f(pop_dist)) # row info bn = bn.key_rows_by(locus=hl.locus_from_global_position(bn.row_idx, reference_genome=reference_genome), alleles=['A', 'C']) bn = bn.select_rows(ancestral_af=af_dist, af=hl.bind(lambda ancestral: hl.array([(1 - x) / x for x in fst]) .map(lambda x: hl.rand_beta(ancestral * x, (1 - ancestral) * x)), af_dist)) # entry info p = hl.sum(bn.pop * bn.af) if mixture else bn.af[bn.pop] idx = hl.rand_cat([(1 - p) ** 2, 2 * p * (1-p), p ** 2]) return bn.select_entries(GT=hl.unphased_diploid_gt_index_call(idx))
[docs]@typecheck(mt=MatrixTable, f=anytype) def filter_alleles(mt: MatrixTable, f: Callable) -> MatrixTable: """Filter alternate alleles. .. include:: ../_templates/req_tvariant.rst Examples -------- Keep SNPs: >>> ds_result = hl.filter_alleles(ds, lambda allele, i: hl.is_snp(ds.alleles[0], allele)) Keep alleles with AC > 0: >>> ds_result = hl.filter_alleles(ds, lambda a, allele_index: ds.info.AC[allele_index - 1] > 0) Update the AC field of the resulting dataset: >>> updated_info = ds_result.info.annotate(AC = ds_result.new_to_old.map(lambda i: ds_result.info.AC[i-1])) >>> ds_result = ds_result.annotate_rows(info = updated_info) Notes ----- The following new fields are generated: - `old_locus` (``locus``) -- The old locus, before filtering and computing the minimal representation. - `old_alleles` (``array<str>``) -- The old alleles, before filtering and computing the minimal representation. - `old_to_new` (``array<int32>``) -- An array that maps old allele index to new allele index. Its length is the same as `old_alleles`. Alleles that are filtered are missing. - `new_to_old` (``array<int32>``) -- An array that maps new allele index to the old allele index. Its length is the same as the modified `alleles` field. If all alternate alleles of a variant are filtered out, the variant itself is filtered out. **Using** `f` The `f` argument is a function or lambda evaluated per alternate allele to determine whether that allele is kept. If `f` evaluates to ``True``, the allele is kept. If `f` evaluates to ``False`` or missing, the allele is removed. `f` is a function that takes two arguments: the allele string (of type :class:`.StringExpression`) and the allele index (of type :class:`.Int32Expression`), and returns a boolean expression. This can be either a defined function or a lambda. For example, these two usages are equivalent: (with a lambda) >>> ds_result = hl.filter_alleles(ds, lambda allele, i: hl.is_snp(ds.alleles[0], allele)) (with a defined function) >>> def filter_f(allele, allele_index): ... return hl.is_snp(ds.alleles[0], allele) >>> ds_result = hl.filter_alleles(ds, filter_f) Warning ------- :func:`.filter_alleles` does not update any fields other than `locus` and `alleles`. This means that row fields like allele count (AC) and entry fields like allele depth (AD) can become meaningless unless they are also updated. You can update them with :meth:`.annotate_rows` and :meth:`.annotate_entries`. See Also -------- :func:`.filter_alleles_hts` Parameters ---------- mt : :class:`.MatrixTable` Dataset. 
f : callable Function from (allele: :class:`StringExpression`, allele_index: :class:`.Int32Expression`) to :class:`.BooleanExpression` Returns ------- :class:`.MatrixTable` """ require_row_key_variant(mt, 'filter_alleles') inclusion = hl.range(0, hl.len(mt.alleles)).map(lambda i: (i == 0) | hl.bind(lambda ii: f(mt.alleles[ii], ii), i)) # old locus, old alleles, new to old, old to new mt = mt.annotate_rows(__allele_inclusion=inclusion, old_locus=mt.locus, old_alleles=mt.alleles) new_to_old = (hl.zip_with_index(mt.__allele_inclusion) .filter(lambda elt: elt[1]) .map(lambda elt: elt[0])) old_to_new_dict = (hl.dict(hl.zip_with_index(hl.zip_with_index(mt.alleles) .filter(lambda elt: mt.__allele_inclusion[elt[0]])) .map(lambda elt: (elt[1][1], elt[0])))) old_to_new = hl.bind(lambda d: mt.alleles.map(lambda a: d.get(a)), old_to_new_dict) mt = mt.annotate_rows(old_to_new=old_to_new, new_to_old=new_to_old) new_locus_alleles = hl.min_rep(mt.locus, mt.new_to_old.map(lambda i: mt.alleles[i])) mt = mt.annotate_rows(__new_locus=new_locus_alleles.locus, __new_alleles=new_locus_alleles.alleles) mt = mt.filter_rows(hl.len(mt.__new_alleles) > 1) left = mt.filter_rows((mt.locus == mt.__new_locus) & (mt.alleles == mt.__new_alleles)) right = mt.filter_rows((mt.locus != mt.__new_locus) | (mt.alleles != mt.__new_alleles)) right = right.key_rows_by(locus=right.__new_locus, alleles=right.__new_alleles) return left.union_rows(right, _check_cols=False).drop('__allele_inclusion', '__new_locus', '__new_alleles')
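# Illustrative sketch (not part of the Hail source): one way to update an entry field after
# filter_alleles, here subsetting AD to the kept alleles via the `new_to_old` mapping (a
# "subset"-style update; see filter_alleles_hts below for full downcode/subset logic). Assumes the
# dataset has an `AD` entry field; the helper name is hypothetical.
def _example_subset_ad(ds, f):
    ds = hl.filter_alleles(ds, f)
    return ds.annotate_entries(
        AD=hl.or_missing(hl.is_defined(ds.AD),
                         ds.new_to_old.map(lambda i: ds.AD[i])))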
[docs]@typecheck(mt=MatrixTable, f=anytype, subset=bool) def filter_alleles_hts(mt: MatrixTable, f: Callable, subset: bool = False) -> MatrixTable: """Filter alternate alleles and update standard GATK entry fields. Examples -------- Filter to SNP alleles using the subset strategy: >>> ds_result = hl.filter_alleles_hts( ... ds, ... lambda allele, _: hl.is_snp(ds.alleles[0], allele), ... subset=True) Update the AC field of the resulting dataset: >>> updated_info = ds_result.info.annotate(AC = ds_result.new_to_old.map(lambda i: ds_result.info.AC[i-1])) >>> ds_result = ds_result.annotate_rows(info = updated_info) Notes ----- For usage of the `f` argument, see the :func:`.filter_alleles` documentation. :func:`.filter_alleles_hts` requires the dataset to have the GATK VCF schema, namely the following entry fields in this order: .. code-block:: text GT: call AD: array<int32> DP: int32 GQ: int32 PL: array<int32> Use :meth:`.MatrixTable.select_entries` to rearrange these fields if necessary. The following new fields are generated: - `old_locus` (``locus``) -- The old locus, before filtering and computing the minimal representation. - `old_alleles` (``array<str>``) -- The old alleles, before filtering and computing the minimal representation. - `old_to_new` (``array<int32>``) -- An array that maps old allele index to new allele index. Its length is the same as `old_alleles`. Alleles that are filtered are missing. - `new_to_old` (``array<int32>``) -- An array that maps new allele index to the old allele index. Its length is the same as the modified `alleles` field. **Downcode algorithm** We will illustrate the behavior on the example genotype below when filtering the first alternate allele (allele 1) at a site with 1 reference allele and 2 alternate alleles. .. code-block:: text GT: 1/2 GQ: 10 AD: 0,50,35 0 | 1000 1 | 1000 10 2 | 1000 0 20 +----------------- 0 1 2 The downcode algorithm recodes occurrences of filtered alleles to occurrences of the reference allele (e.g. 1 -> 0 in our example). So the depths of filtered alleles in the AD field are added to the depth of the reference allele. Where downcoding filtered alleles merges distinct genotypes, the minimum PL is used (since PL is on a log scale, this roughly corresponds to adding probabilities). The PLs are then re-normalized (shifted) so that the most likely genotype has a PL of 0, and GT is set to this genotype. If an allele is filtered, this algorithm acts similarly to :func:`.split_multi_hts`. The downcode algorithm would produce the following: .. code-block:: text GT: 0/1 GQ: 10 AD: 35,50 0 | 20 1 | 0 10 +----------- 0 1 In summary: - GT: Downcode filtered alleles to reference. - AD: Columns of filtered alleles are eliminated and their values are added to the reference column, e.g., filtering alleles 1 and 2 transforms ``25,5,10,20`` to ``40,20``. - DP: No change. - PL: Downcode filtered alleles to reference, combine PLs using minimum for each overloaded genotype, and shift so the overall minimum PL is 0. - GQ: The second-lowest PL (after shifting). **Subset algorithm** We will illustrate the behavior on the example genotype below when filtering the first alternate allele (allele 1) at a site with 1 reference allele and 2 alternate alleles. .. code-block:: text GT: 1/2 GQ: 10 AD: 0,50,35 0 | 1000 1 | 1000 10 2 | 1000 0 20 +----------------- 0 1 2 The subset algorithm subsets the AD and PL arrays (i.e. removes entries corresponding to filtered alleles) and then sets GT to the genotype with the minimum PL. 
Note that if the genotype changes (as in the example), the PLs are re-normalized (shifted) so that the most likely genotype has a PL of 0. Qualitatively, subsetting corresponds to the belief that the filtered alleles are not real so we should discard any probability mass associated with them. The subset algorithm would produce the following: .. code-block:: text GT: 1/1 GQ: 980 AD: 0,50 0 | 980 1 | 980 0 +----------- 0 1 In summary: - GT: Set to most likely genotype based on the PLs ignoring the filtered allele(s). - AD: The filtered alleles' columns are eliminated, e.g., filtering alleles 1 and 2 transforms ``25,5,10,20`` to ``25,20``. - DP: Unchanged. - PL: Columns involving filtered alleles are eliminated and the remaining columns' values are shifted so the minimum value is 0. - GQ: The second-lowest PL (after shifting). Warning ------- :func:`.filter_alleles_hts` does not update any row fields other than `locus` and `alleles`. This means that row fields like allele count (AC) can become meaningless unless they are also updated. You can update them with :meth:`.annotate_rows`. See Also -------- :func:`.filter_alleles` Parameters ---------- mt : :class:`.MatrixTable` f : callable Function from (allele: :class:`StringExpression`, allele_index: :class:`.Int32Expression`) to :class:`.BooleanExpression` subset : :obj:`.bool` Subset PL field if ``True``, otherwise downcode PL field. The calculation of GT and GQ also depend on whether one subsets or downcodes the PL. Returns ------- :class:`.MatrixTable` """ if mt.entry.dtype != hl.hts_entry_schema: raise FatalError("'filter_alleles_hts': entry schema must be the HTS entry schema:\n" " found: {}\n" " expected: {}\n" " Use 'hl.filter_alleles' to split entries with non-HTS entry fields.".format( mt.entry.dtype, hl.hts_entry_schema )) mt = filter_alleles(mt, f) if subset: newPL = hl.cond( hl.is_defined(mt.PL), hl.bind( lambda unnorm: unnorm - hl.min(unnorm), hl.range(0, hl.triangle(mt.alleles.length())).map( lambda newi: hl.bind( lambda newc: mt.PL[hl.call(mt.new_to_old[newc[0]], mt.new_to_old[newc[1]]).unphased_diploid_gt_index()], hl.unphased_diploid_gt_index_call(newi)))), hl.null(tarray(tint32))) return mt.annotate_entries( GT=hl.unphased_diploid_gt_index_call(hl.argmin(newPL, unique=True)), AD=hl.cond( hl.is_defined(mt.AD), hl.range(0, mt.alleles.length()).map( lambda newi: mt.AD[mt.new_to_old[newi]]), hl.null(tarray(tint32))), # DP unchanged GQ=hl.gq_from_pl(newPL), PL=newPL) # otherwise downcode else: mt = mt.annotate_rows(__old_to_new_no_na = mt.old_to_new.map(lambda x: hl.or_else(x, 0))) newPL = hl.cond( hl.is_defined(mt.PL), (hl.range(0, hl.triangle(hl.len(mt.alleles))) .map(lambda newi: hl.min(hl.range(0, hl.triangle(hl.len(mt.old_alleles))) .filter(lambda oldi: hl.bind( lambda oldc: hl.call(mt.__old_to_new_no_na[oldc[0]], mt.__old_to_new_no_na[oldc[1]]) == hl.unphased_diploid_gt_index_call(newi), hl.unphased_diploid_gt_index_call(oldi))) .map(lambda oldi: mt.PL[oldi])))), hl.null(tarray(tint32))) return mt.annotate_entries( GT=hl.call(mt.__old_to_new_no_na[mt.GT[0]], mt.__old_to_new_no_na[mt.GT[1]]), AD=hl.cond( hl.is_defined(mt.AD), (hl.range(0, hl.len(mt.alleles)) .map(lambda newi: hl.sum(hl.range(0, hl.len(mt.old_alleles)) .filter(lambda oldi: mt.__old_to_new_no_na[oldi] == newi) .map(lambda oldi: mt.AD[oldi])))), hl.null(tarray(tint32))), # DP unchanged GQ=hl.gq_from_pl(newPL), PL=newPL).drop('__old_to_new_no_na')
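# Illustrative sketch (not part of the Hail source): the downcode PL arithmetic described above in
# plain Python. `old_to_new` maps old allele indices to new ones; each new PL is the minimum over
# old genotypes that downcode to it, and the result is shifted so the smallest PL is 0. With the
# example genotype's PLs and the mapping {0: 0, 1: 1, 2: 0} (the filtered allele recoded to the
# reference allele), this returns {(0, 0): 20, (0, 1): 0, (1, 1): 10}, matching the downcode PL
# matrix shown above. The helper name and mapping are hypothetical.
def _example_downcode_pl(old_pl, old_to_new):
    new_pl = {}
    for (a, b), pl in old_pl.items():
        g = tuple(sorted((old_to_new[a], old_to_new[b])))
        new_pl[g] = min(new_pl.get(g, pl), pl)      # minimum over genotypes that downcode to g
    shift = min(new_pl.values())
    return {g: pl - shift for g, pl in new_pl.items()}
# _example_downcode_pl({(0, 0): 1000, (0, 1): 1000, (1, 1): 10, (0, 2): 1000, (1, 2): 0, (2, 2): 20},
#                      {0: 0, 1: 1, 2: 0})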
@typecheck(mt=MatrixTable, call_field=str, r2=numeric, bp_window_size=int, memory_per_core=int) def _local_ld_prune(mt, call_field, r2=0.2, bp_window_size=1000000, memory_per_core=256): bytes_per_core = memory_per_core * 1024 * 1024 fraction_memory_to_use = 0.25 variant_byte_overhead = 50 genotypes_per_pack = 32 n_samples = mt.count_cols() min_bytes_per_core = math.ceil((1 / fraction_memory_to_use) * 8 * n_samples + variant_byte_overhead) if bytes_per_core < min_bytes_per_core: raise ValueError("memory_per_core must be greater than {} MB".format(min_bytes_per_core // (1024 * 1024))) bytes_per_variant = math.ceil(8 * n_samples / genotypes_per_pack) + variant_byte_overhead bytes_available_per_core = bytes_per_core * fraction_memory_to_use max_queue_size = int(max(1.0, math.ceil(bytes_available_per_core / bytes_per_variant))) info(f'ld_prune: running local pruning stage with max queue size of {max_queue_size} variants') return Table(MatrixToTableApply(mt._mir, { 'name': 'LocalLDPrune', 'callField': call_field, 'r2Threshold': float(r2), 'windowSize': bp_window_size, 'maxQueueSize': max_queue_size }))
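# Illustrative sketch (not part of the Hail source): the local-pruning queue-size arithmetic above,
# worked for hypothetical inputs memory_per_core=256 (MB) and n_samples=10000.
def _example_max_queue_size(memory_per_core_mb=256, n_samples=10000):
    bytes_per_core = memory_per_core_mb * 1024 * 1024                              # 268435456
    bytes_per_variant = math.ceil(8 * n_samples / 32) + 50                         # 2550
    bytes_available_per_core = bytes_per_core * 0.25                               # 67108864
    return int(max(1.0, math.ceil(bytes_available_per_core / bytes_per_variant)))  # 26318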


[docs]@typecheck(call_expr=expr_call,
           r2=numeric,
           bp_window_size=int,
           memory_per_core=int,
           keep_higher_maf=bool,
           block_size=nullable(int))
def ld_prune(call_expr, r2=0.2, bp_window_size=1000000, memory_per_core=256, keep_higher_maf=True, block_size=None):
    """Returns a maximal subset of variants that are nearly uncorrelated within each window.

    .. include:: ../_templates/req_diploid_gt.rst

    .. include:: ../_templates/req_biallelic.rst

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------
    Prune variants in linkage disequilibrium by filtering a dataset to those variants
    returned by :func:`.ld_prune`. If the dataset contains multiallelic variants, the
    multiallelic variants must be filtered out or split before being passed to
    :func:`.ld_prune`.

    >>> biallelic_dataset = dataset.filter_rows(hl.len(dataset.alleles) == 2)
    >>> pruned_variant_table = hl.ld_prune(biallelic_dataset.GT, r2=0.2, bp_window_size=500000)
    >>> filtered_ds = dataset.filter_rows(hl.is_defined(pruned_variant_table[dataset.row_key]))

    Notes
    -----
    This method finds a maximal subset of variants such that the squared Pearson
    correlation coefficient :math:`r^2` of any pair at most `bp_window_size` base pairs
    apart is strictly less than `r2`. Each variant is represented as a vector over samples
    with elements given by the (mean-imputed) number of alternate alleles. In particular,
    even if present, **phase information is ignored**. Variants that do not vary across
    samples are dropped.

    The method prunes variants in linkage disequilibrium in three stages.

    - The first, "local pruning" stage prunes correlated variants within each partition,
      using a local variant queue whose size is determined by `memory_per_core`. A larger
      queue may facilitate more local pruning in this stage. Minor allele frequency is
      not taken into account. The parallelism is the number of matrix table partitions.

    - The second, "global correlation" stage uses block-sparse matrix multiplication to
      compute correlation between each pair of remaining variants within `bp_window_size`
      base pairs, and then forms a graph of correlated variants. The parallelism of
      writing the locally-pruned matrix table as a block matrix is
      ``n_locally_pruned_variants / block_size``.

    - The third, "global pruning" stage applies :func:`.maximal_independent_set` to prune
      variants from this graph until no edges remain. This algorithm iteratively removes
      the variant with the highest vertex degree. If `keep_higher_maf` is true, then in
      the case of a tie for highest degree, the variant with lowest minor allele
      frequency is removed.

    Warning
    -------
    The locally-pruned matrix table and block matrix are stored as temporary files on
    persistent disk. See the warnings on :meth:`.BlockMatrix.from_entry_expr` with regard
    to memory and Hadoop replication errors.

    Parameters
    ----------
    call_expr : :class:`.CallExpression`
        Entry-indexed call expression on a matrix table with row-indexed variants and
        column-indexed samples.
    r2 : :obj:`float`
        Squared correlation threshold (exclusive upper bound).
        Must be in the range [0.0, 1.0].
    bp_window_size : :obj:`int`
        Window size in base pairs (inclusive upper bound).
    memory_per_core : :obj:`int`
        Memory in MB per core for local pruning queue.
    keep_higher_maf : :obj:`bool`
        If ``True``, break ties at each step of the global pruning stage by preferring to
        keep variants with higher minor allele frequency.
    block_size : :obj:`int`, optional
        Block size for block matrices in the second stage.
        Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table of a maximal independent set of variants.
    """
    if block_size is None:
        block_size = BlockMatrix.default_block_size()

    if not 0.0 <= r2 <= 1:
        raise ValueError(f'r2 must be in the range [0.0, 1.0], found {r2}')

    if bp_window_size < 0:
        raise ValueError(f'bp_window_size must be non-negative, found {bp_window_size}')

    check_entry_indexed('ld_prune/call_expr', call_expr)
    mt = matrix_table_source('ld_prune/call_expr', call_expr)

    require_row_key_variant(mt, 'ld_prune')

    # FIXME: remove once select_entries on a field is free
    if call_expr in mt._fields_inverse:
        field = mt._fields_inverse[call_expr]
    else:
        field = Env.get_uid()
        mt = mt.select_entries(**{field: call_expr})
    mt = mt.select_rows().select_cols()
    mt = mt.distinct_by_row()

    # Stage 1: local pruning within each partition.
    locally_pruned_table_path = new_temp_file()
    (_local_ld_prune(require_biallelic(mt, 'ld_prune'), field, r2, bp_window_size, memory_per_core)
        .write(locally_pruned_table_path, overwrite=True))
    locally_pruned_table = hl.read_table(locally_pruned_table_path).add_index()

    mt = mt.annotate_rows(info=locally_pruned_table[mt.row_key])
    mt = mt.filter_rows(hl.is_defined(mt.info))

    # Stage 2: compute r^2 between the remaining variants via block-sparse matrix
    # multiplication of standardized (mean-centered, unit-length) genotype vectors.
    std_gt_bm = BlockMatrix.from_entry_expr(
        hl.or_else(
            (mt[field].n_alt_alleles() - mt.info.mean) * mt.info.centered_length_rec,
            0.0),
        block_size=block_size)
    r2_bm = (std_gt_bm @ std_gt_bm.T) ** 2

    _, stops = hl.linalg.utils.locus_windows(locally_pruned_table.locus, bp_window_size)

    entries = r2_bm.sparsify_row_intervals(range(stops.size), stops, blocks_only=True).entries(keyed=False)
    entries = entries.filter((entries.entry >= r2) & (entries.i < entries.j))
    entries = entries.select(i=hl.int32(entries.i), j=hl.int32(entries.j))

    if keep_higher_maf:
        fields = ['mean', 'locus']
    else:
        fields = ['locus']

    info = locally_pruned_table.aggregate(
        hl.agg.collect(locally_pruned_table.row.select('idx', *fields)), _localize=False)
    info = hl.sorted(info, key=lambda x: x.idx)

    entries = entries.annotate_globals(info=info)

    entries = entries.filter(
        (entries.info[entries.i].locus.contig == entries.info[entries.j].locus.contig) &
        (entries.info[entries.j].locus.position - entries.info[entries.i].locus.position <= bp_window_size))

    # Stage 3: global pruning with maximal_independent_set, optionally breaking degree
    # ties in favor of the variant with higher minor allele frequency (mean is the mean
    # alternate allele count, so twice the MAF is min(mean, 2 - mean)).
    if keep_higher_maf:
        entries = entries.annotate(
            i=hl.struct(idx=entries.i,
                        twice_maf=hl.min(entries.info[entries.i].mean, 2.0 - entries.info[entries.i].mean)),
            j=hl.struct(idx=entries.j,
                        twice_maf=hl.min(entries.info[entries.j].mean, 2.0 - entries.info[entries.j].mean)))

        def tie_breaker(l, r):
            return hl.sign(r.twice_maf - l.twice_maf)
    else:
        tie_breaker = None

    variants_to_remove = hl.maximal_independent_set(
        entries.i, entries.j, keep=False, tie_breaker=tie_breaker, keyed=False)

    locally_pruned_table = locally_pruned_table.annotate_globals(
        variants_to_remove=variants_to_remove.aggregate(
            hl.agg.collect_as_set(variants_to_remove.node.idx), _localize=False))
    return locally_pruned_table.filter(
        locally_pruned_table.variants_to_remove.contains(hl.int32(locally_pruned_table.idx)),
        keep=False
    ).select().persist()
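

# Illustrative sketch (not part of the library source): the second-stage correlation of
# ld_prune computed with plain NumPy for a small dense matrix of alternate-allele counts
# (rows are variants, columns are samples, missing genotypes as np.nan).  The function
# name is hypothetical; ld_prune itself performs the equivalent computation with
# BlockMatrix on standardized genotype vectors.
def _example_r2_matrix(alt_allele_counts: np.ndarray) -> np.ndarray:
    x = alt_allele_counts.astype(float)
    means = np.nanmean(x, axis=1, keepdims=True)
    x = np.where(np.isnan(x), means, x)  # mean-impute missing genotypes
    x = x - means                        # center each variant; imputed entries become 0
    norms = np.sqrt((x ** 2).sum(axis=1, keepdims=True))
    # Variants that do not vary have norm 0 and must be dropped beforehand,
    # matching ld_prune's behavior.
    x = x / norms                        # scale each variant to unit length
    return (x @ x.T) ** 2                # squared Pearson correlation between variant pairs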


def _warn_if_no_intercept(caller, covariates):
    if all([e._indices.axes for e in covariates]):
        warn(f'{caller}: model appears to have no intercept covariate.'
             '\n    To include an intercept, add 1.0 to the list of covariates.')
        return True
    return False
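

# Illustrative note (not part of the library source): a covariate expression with no index
# axes (e.g. the constant ``1.0``) is treated as an intercept.  Assuming a matrix table
# `mt` with a numeric column field `age`, and `caller` being only a label for the warning:
#
#     _warn_if_no_intercept('example_caller', [hl.literal(1.0), mt.age])  # no warning
#     _warn_if_no_intercept('example_caller', [mt.age])                   # warns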