import builtins
import functools
import itertools
import operator
from typing import Any, Callable, Iterable, Optional, TypeVar, Union
import numpy as np
import pandas as pd
from deprecated import deprecated
import hail
import hail as hl
from hail import ir
from hail.expr.expressions import (
ArrayExpression,
ArrayNumericExpression,
BooleanExpression,
CallExpression,
DictExpression,
Expression,
ExpressionException,
Float32Expression,
Float64Expression,
Int32Expression,
Int64Expression,
IntervalExpression,
LocusExpression,
NumericExpression,
SetExpression,
StreamExpression,
StringExpression,
StructExpression,
TupleExpression,
apply_expr,
cast_expr,
coercer_from_dtype,
construct_expr,
construct_variable,
expr_any,
expr_array,
expr_bool,
expr_call,
expr_dict,
expr_float32,
expr_float64,
expr_int32,
expr_int64,
expr_interval,
expr_locus,
expr_ndarray,
expr_numeric,
expr_oneof,
expr_set,
expr_str,
expr_stream,
expr_struct,
expr_tuple,
impute_type,
to_expr,
unify_all,
unify_exprs,
unify_types_limited,
)
from hail.expr.types import (
HailType,
hail_type,
is_float32,
is_float64,
is_int32,
is_int64,
is_numeric,
is_primitive,
tarray,
tbool,
tcall,
tdict,
tfloat32,
tfloat64,
tint32,
tint64,
tinterval,
tlocus,
tndarray,
trngstate,
tset,
tstr,
tstream,
tstruct,
ttuple,
)
from hail.genetics.allele_type import AlleleType
from hail.genetics.reference_genome import ReferenceGenome, reference_genome_type
from hail.typecheck import (
anyfunc,
anytype,
arg_check,
args_check,
enumeration,
func_spec,
nullable,
oneof,
sequenceof,
tupleof,
typecheck,
)
from hail.utils.java import Env, warning
from hail.utils.misc import plural
Coll_T = TypeVar('Coll_T', ArrayExpression, SetExpression)
Num_T = TypeVar('Num_T', Int32Expression, Int64Expression, Float32Expression, Float64Expression)
def _func(name, ret_type, *args, type_args=()):
indices, aggregations = unify_all(*args)
return construct_expr(
ir.Apply(name, ret_type, *(a._ir for a in args), type_args=type_args), ret_type, indices, aggregations
)
def _seeded_func(name, ret_type, seed, *args):
if seed is None:
static_rng_uid = Env.next_static_rng_uid()
else:
if Env._hc is None or not Env._hc._user_specified_rng_nonce:
warning(
'To ensure reproducible randomness across Hail sessions, '
'you must set the "global_seed" parameter in hl.init(), in '
'addition to the local seed in each random function.'
)
static_rng_uid = -seed - 1
indices, aggregations = unify_all(*args)
rng_state = ir.Ref('__rng_state', trngstate)
return construct_expr(
ir.ApplySeeded(name, static_rng_uid, rng_state, ret_type, *(a._ir for a in args)),
ret_type,
indices,
aggregations,
)
def ndarray_broadcasting(func):
def broadcast_or_not(x):
if isinstance(x.dtype, tndarray):
return x.map(func)
else:
return func(x)
return broadcast_or_not
@typecheck(a=expr_array(), x=expr_any)
def _lower_bound(a, x):
if a.dtype.element_type != x.dtype:
raise TypeError(f"_lower_bound: incompatible types: {a.dtype}, {x.dtype}")
indices, aggregations = unify_all(a, x)
return construct_expr(ir.LowerBoundOnOrderedCollection(a._ir, x._ir, on_key=False), tint32, indices, aggregations)
@typecheck(cdf=expr_struct(), q=expr_oneof(expr_float32, expr_float64))
def _quantile_from_cdf(cdf, q):
def compute(cdf):
n = cdf.ranks[cdf.ranks.length() - 1]
pos = hl.int64(q * n) + 1
idx = hl.max(0, hl.min(cdf['values'].length() - 1, _lower_bound(cdf.ranks, pos) - 1))
res = hl.if_else(n == 0, hl.missing(cdf['values'].dtype.element_type), cdf['values'][idx])
return res
return hl.rbind(cdf, compute)
@typecheck(raw_cdf=expr_struct())
def _result_from_raw_cdf(raw_cdf):
levels = raw_cdf.levels
item_weights = (
hl._stream_range(hl.len(levels) - 1)
.flatmap(
lambda l: hl._stream_range(levels[l], levels[l + 1]).map(
lambda i: hl.struct(level=l, value=raw_cdf['items'][i])
)
)
.aggregate(lambda x: hl.agg.group_by(x.value, hl.agg.sum(hl.bit_lshift(1, x.level))))
)
weights = item_weights.values()
ranks = weights.scan(lambda acc, weight: acc + weight, 0)
values = item_weights.keys()
return hl.struct(values=values, ranks=ranks, _compaction_counts=raw_cdf._compaction_counts)
@typecheck(k=expr_int32, left=expr_struct(), right=expr_struct())
def _cdf_combine(k, left, right):
t = tstruct(levels=tarray(tint32), items=tarray(tfloat64), _compaction_counts=tarray(tint32))
return _func('approxCDFCombine', t, k, left, right)
@typecheck(cdf=expr_struct(), failure_prob=expr_oneof(expr_float32, expr_float64), all_quantiles=bool)
def _error_from_cdf(cdf, failure_prob, all_quantiles=False):
"""Estimates error of approx_cdf aggregator, using Hoeffding's inequality.
Parameters
----------
cdf : :class:`.StructExpression`
Result of :func:`.approx_cdf` aggregator
failure_prob: :class:`.NumericExpression`
Upper bound on probability of true error being greater than estimated error.
all_quantiles: :obj:`bool`
If ``True``, with probability 1 - `failure_prob`, error estimate applies
to all quantiles simultaneously.
Returns
-------
:class:`.NumericExpression`
Upper bound on error of quantile estimates.
"""
def compute_sum(cdf):
s = hl.sum(
hl.range(0, hl.len(cdf._compaction_counts)).map(lambda i: cdf._compaction_counts[i] * (2 ** (2 * i)))
)
return s / (cdf.ranks[-1] ** 2)
def update_grid_size(p, s):
return 4 * hl.sqrt(hl.log(2 * p / failure_prob) / (2 * s))
def compute_grid_size(s):
return hl.fold(lambda p, i: update_grid_size(p, s), 1 / failure_prob, hl.range(0, 5))
def compute_single_error(s, failure_prob=failure_prob):
return hl.sqrt(hl.log(2 / failure_prob) * s / 2)
def compute_global_error(s):
return hl.rbind(compute_grid_size(s), lambda p: 1 / p + compute_single_error(s, failure_prob / p))
if all_quantiles:
return hl.rbind(cdf, lambda cdf: hl.rbind(compute_sum(cdf), compute_global_error))
else:
return hl.rbind(cdf, lambda cdf: hl.rbind(compute_sum(cdf), compute_single_error))
def _error_from_cdf_python(cdf, failure_prob, all_quantiles=False):
"""Estimates error of approx_cdf aggregator, using Hoeffding's inequality.
Parameters
----------
cdf : :obj:`dict`
Result of :func:`.approx_cdf` aggregator, evaluated to a python dict
failure_prob: :obj:`float`
Upper bound on probability of true error being greater than estimated error.
all_quantiles: :obj:`bool`
If ``True``, with probability 1 - `failure_prob`, error estimate applies
to all quantiles simultaneously.
Returns
-------
:obj:`float`
Upper bound on error of quantile estimates.
"""
import math
s = 0
for i in builtins.range(builtins.len(cdf._compaction_counts)):
s += cdf._compaction_counts[i] << (2 * i)
s = s / (cdf.ranks[-1] ** 2)
def update_grid_size(p):
return 4 * math.sqrt(math.log(2 * p / failure_prob) / (2 * s))
def compute_grid_size(s):
p = 1 / failure_prob
for _ in builtins.range(5):
p = update_grid_size(p)
return p
def compute_single_error(s, failure_prob=failure_prob):
return math.sqrt(math.log(2 / failure_prob) * s / 2)
if s == 0:
# no compactions ergo no error
return 0
elif all_quantiles:
p = compute_grid_size(s)
return 1 / p + compute_single_error(s, failure_prob / p)
else:
return compute_single_error(s, failure_prob)
[docs]@typecheck(t=hail_type)
def missing(t: Union[HailType, str]):
"""Creates an expression representing a missing value of a specified type.
Examples
--------
>>> hl.eval(hl.missing(hl.tarray(hl.tstr)))
None
>>> hl.eval(hl.missing('array<str>'))
None
Notes
-----
This method is useful for constructing an expression that includes missing
values, since :obj:`None` cannot be interpreted as an expression.
Parameters
----------
t : :class:`str` or :class:`.HailType`
Type of the missing expression.
Returns
-------
:class:`.Expression`
A missing expression of type `t`.
"""
return construct_expr(ir.NA(t), t)
[docs]@deprecated(version="0.2.62", reason="Replaced by hl.missing")
@typecheck(t=hail_type)
def null(t: Union[HailType, str]):
"""Deprecated in favor of :func:`.missing`.
Creates an expression representing a missing value of a specified type.
Examples
--------
>>> hl.eval(hl.null(hl.tarray(hl.tstr)))
None
>>> hl.eval(hl.null('array<str>'))
None
Notes
-----
This method is useful for constructing an expression that includes missing
values, since :obj:`None` cannot be interpreted as an expression.
Parameters
----------
t : :class:`str` or :class:`.HailType`
Type of the missing expression.
Returns
-------
:class:`.Expression`
A missing expression of type `t`.
"""
return missing(t)
[docs]@typecheck(x=anytype, dtype=nullable(hail_type))
def literal(x: Any, dtype: Optional[Union[HailType, str]] = None):
"""Captures and broadcasts a Python variable or object as an expression.
Examples
--------
>>> table = hl.utils.range_table(8)
>>> greetings = hl.literal({1: 'Good morning', 4: 'Good afternoon', 6 : 'Good evening'})
>>> table.annotate(greeting = greetings.get(table.idx)).show()
+-------+------------------+
| idx | greeting |
+-------+------------------+
| int32 | str |
+-------+------------------+
| 0 | NA |
| 1 | "Good morning" |
| 2 | NA |
| 3 | NA |
| 4 | "Good afternoon" |
| 5 | NA |
| 6 | "Good evening" |
| 7 | NA |
+-------+------------------+
Notes
-----
Use this function to capture large Python objects for use in expressions. This
function provides an alternative to adding an object as a global annotation on a
:class:`.Table` or :class:`.MatrixTable`.
Parameters
----------
x
Object to capture and broadcast as an expression.
Returns
-------
:class:`.Expression`
"""
wrapper = {'has_expr': False, 'has_free_vars': False}
def typecheck_expr(t, x):
if isinstance(x, Expression):
wrapper['has_expr'] = True
wrapper['has_free_vars'] |= (
builtins.len(x._ir.free_vars) > 0
or builtins.len(x._ir.free_agg_vars) > 0
or builtins.len(x._ir.free_scan_vars) > 0
)
if x.dtype != t:
raise TypeError(f"'literal': type mismatch: expected '{t}', found '{x.dtype}'")
elif x._indices.source is not None:
if x._indices.axes:
raise ExpressionException(
f"'literal' can only accept scalar or global expression arguments,"
f" found indices {x._indices.axes}"
)
return False
elif x is None or x is pd.NA:
return False
else:
t._typecheck_one_level(x)
return True
if dtype is None:
dtype = impute_type(x)
# Special handling of numpy. Have to extract from numpy scalars, do nothing on numpy arrays
if isinstance(x, np.generic):
x = x.item()
elif isinstance(x, np.ndarray):
pass
else:
try:
dtype._traverse(x, typecheck_expr)
except TypeError as e:
raise TypeError("'literal': object did not match the passed type '{}'".format(dtype)) from e
if wrapper['has_free_vars']:
raise ValueError(
"'literal' cannot be used with hail expressions that depend "
"on other expressions. Use expression 'x' directly "
"instead of passing it to 'literal'."
)
if wrapper['has_expr']:
return literal(hl.eval(to_expr(x, dtype)), dtype)
if x is None or x is pd.NA:
return hl.missing(dtype)
elif is_primitive(dtype):
if dtype == tint32:
assert is_int32(x)
assert tint32.min_value <= x <= tint32.max_value
return construct_expr(ir.I32(x), tint32)
elif dtype == tint64:
assert is_int64(x)
assert tint64.min_value <= x <= tint64.max_value
return construct_expr(ir.I64(x), tint64)
elif dtype == tfloat32:
assert is_float32(x)
return construct_expr(ir.F32(x), tfloat32)
elif dtype == tfloat64:
assert is_float64(x)
return construct_expr(ir.F64(x), tfloat64)
elif dtype == tbool:
assert isinstance(x, builtins.bool)
return construct_expr(ir.TrueIR() if x else ir.FalseIR(), tbool)
else:
assert dtype == tstr
assert isinstance(x, builtins.str)
return construct_expr(ir.Str(x), tstr)
else:
return construct_expr(ir.EncodedLiteral(dtype, x), dtype)
[docs]@deprecated(version="0.2.59", reason="Replaced by hl.if_else")
@typecheck(condition=expr_bool, consequent=expr_any, alternate=expr_any, missing_false=bool)
def cond(condition, consequent, alternate, missing_false: bool = False):
"""Deprecated in favor of :func:`.if_else`.
Expression for an if/else statement; tests a condition and returns one of two options based on the result.
Examples
--------
>>> x = 5
>>> hl.eval(hl.cond(x < 2, 'Hi', 'Bye'))
'Bye'
>>> a = hl.literal([1, 2, 3, 4])
>>> hl.eval(hl.cond(hl.len(a) > 0, 2.0 * a, a / 2.0))
[2.0, 4.0, 6.0, 8.0]
Notes
-----
If `condition` evaluates to ``True``, returns `consequent`. If `condition`
evaluates to ``False``, returns `alternate`. If `predicate` is missing, returns
missing.
Note
----
The type of `consequent` and `alternate` must be the same.
Parameters
----------
condition : :class:`.BooleanExpression`
Condition to test.
consequent : :class:`.Expression`
Branch to return if the condition is ``True``.
alternate : :class:`.Expression`
Branch to return if the condition is ``False``.
missing_false : :obj:`.bool`
If ``True``, treat missing `condition` as ``False``.
See Also
--------
:func:`.case`, :func:`.switch`, :func:`.if_else`
Returns
-------
:class:`.Expression`
One of `consequent`, `alternate`, or missing, based on `condition`.
"""
return if_else(condition, consequent, alternate, missing_false)
[docs]@typecheck(condition=expr_bool, consequent=expr_any, alternate=expr_any, missing_false=bool)
def if_else(condition, consequent, alternate, missing_false: bool = False):
"""Expression for an if/else statement; tests a condition and returns one of two options based on the result.
Examples
--------
>>> x = 5
>>> hl.eval(hl.if_else(x < 2, 'Hi', 'Bye'))
'Bye'
>>> a = hl.literal([1, 2, 3, 4])
>>> hl.eval(hl.if_else(hl.len(a) > 0, 2.0 * a, a / 2.0))
[2.0, 4.0, 6.0, 8.0]
Notes
-----
If `condition` evaluates to ``True``, returns `consequent`. If `condition`
evaluates to ``False``, returns `alternate`. If `predicate` is missing, returns
missing.
Note
----
The type of `consequent` and `alternate` must be the same.
Parameters
----------
condition : :class:`.BooleanExpression`
Condition to test.
consequent : :class:`.Expression`
Branch to return if the condition is ``True``.
alternate : :class:`.Expression`
Branch to return if the condition is ``False``.
missing_false : :obj:`.bool`
If ``True``, treat missing `condition` as ``False``.
See Also
--------
:func:`.case`, :func:`.switch`
Returns
-------
:class:`.Expression`
One of `consequent`, `alternate`, or missing, based on `condition`.
"""
if missing_false:
condition = hl.bind(lambda x: hl.is_defined(x) & x, condition)
indices, aggregations = unify_all(condition, consequent, alternate)
consequent, alternate, success = unify_exprs(consequent, alternate)
if not success:
raise TypeError(
f"'if_else' and 'cond' require the 'consequent' and 'alternate' arguments to have the same type\n"
f" consequent: type '{consequent.dtype}'\n"
f" alternate: type '{alternate.dtype}'"
)
assert consequent.dtype == alternate.dtype
return construct_expr(ir.If(condition._ir, consequent._ir, alternate._ir), consequent.dtype, indices, aggregations)
[docs]def case(missing_false: bool = False) -> 'hail.expr.builders.CaseBuilder':
"""Chain multiple if-else statements with a :class:`.CaseBuilder`.
Examples
--------
>>> x = hl.literal('foo bar baz')
>>> expr = (hl.case()
... .when(x[:3] == 'FOO', 1)
... .when(hl.len(x) == 11, 2)
... .when(x == 'secret phrase', 3)
... .default(0))
>>> hl.eval(expr)
2
Parameters
----------
missing_false : :obj:`bool`
Treat missing predicates as ``False``.
See Also
--------
:class:`.CaseBuilder`, :func:`.switch`, :func:`.cond`
Returns
-------
:class:`.CaseBuilder`.
"""
from .builders import CaseBuilder
return CaseBuilder(missing_false=missing_false)
[docs]@typecheck(expr=expr_any)
def switch(expr) -> 'hail.expr.builders.SwitchBuilder':
"""Build a conditional tree on the value of an expression.
Examples
--------
>>> csq = hl.literal('loss of function')
>>> expr = (hl.switch(csq)
... .when('synonymous', 1)
... .when('SYN', 1)
... .when('missense', 2)
... .when('MIS', 2)
... .when('loss of function', 3)
... .when('LOF', 3)
... .or_missing())
>>> hl.eval(expr)
3
See Also
--------
:class:`.SwitchBuilder`, :func:`.case`, :func:`.cond`
Parameters
----------
expr : :class:`.Expression`
Value to match against.
Returns
-------
:class:`.SwitchBuilder`
"""
from .builders import SwitchBuilder
return SwitchBuilder(expr)
[docs]@typecheck(f=anytype, exprs=expr_any, _ctx=nullable(str))
def bind(f: Callable, *exprs, _ctx=None):
"""Bind a temporary variable and use it in a function.
Examples
--------
>>> hl.eval(hl.bind(lambda x: x + 1, 1))
2
:func:`.bind` also can take multiple arguments:
>>> hl.eval(hl.bind(lambda x, y: x / y, x, x))
1.0
Parameters
----------
f : function ( (args) -> :class:`.Expression`)
Function of `exprs`.
exprs : variable-length args of :class:`.Expression`
Expressions to bind.
Returns
-------
:class:`.Expression`
Result of evaluating `f` with `exprs` as arguments.
"""
args = []
uids = []
irs = []
for expr in exprs:
uid = Env.get_uid(base=_ctx)
args.append(construct_variable(uid, expr._type, expr._indices, expr._aggregations))
uids.append(uid)
irs.append(expr._ir)
lambda_result = to_expr(f(*args))
if _ctx:
indices, aggregations = unify_all(lambda_result) # FIXME: hacky. May drop field refs from errors?
else:
indices, aggregations = unify_all(*exprs, lambda_result)
res_ir = lambda_result._ir
for uid, value_ir in builtins.zip(uids, irs):
if _ctx == 'agg':
res_ir = ir.AggLet(uid, value_ir, res_ir, is_scan=False)
elif _ctx == 'scan':
res_ir = ir.AggLet(uid, value_ir, res_ir, is_scan=True)
else:
res_ir = ir.Let(uid, value_ir, res_ir)
return construct_expr(res_ir, lambda_result.dtype, indices, aggregations)
[docs]def rbind(*exprs, _ctx=None):
"""Bind a temporary variable and use it in a function.
This is :func:`.bind` with flipped argument order.
Examples
--------
>>> hl.eval(hl.rbind(1, lambda x: x + 1))
2
:func:`.rbind` also can take multiple arguments:
>>> hl.eval(hl.rbind(4.0, 2.0, lambda x, y: x / y))
2.0
Parameters
----------
exprs : variable-length args of :class:`.Expression`
Expressions to bind.
f : function ( (args) -> :class:`.Expression`)
Function of `exprs`.
Returns
-------
:class:`.Expression`
Result of evaluating `f` with `exprs` as arguments.
"""
*args, f = exprs
args = [expr_any.check(arg, 'rbind', f'argument {index}') for index, arg in builtins.enumerate(args)]
return hl.bind(f, *args, _ctx=_ctx)
[docs]@typecheck(c1=expr_int32, c2=expr_int32, c3=expr_int32, c4=expr_int32)
def chi_squared_test(c1, c2, c3, c4) -> StructExpression:
"""Performs chi-squared test of independence on a 2x2 contingency table.
Examples
--------
>>> hl.eval(hl.chi_squared_test(10, 10, 10, 10))
Struct(p_value=1.0, odds_ratio=1.0)
>>> hl.eval(hl.chi_squared_test(51, 43, 22, 92))
Struct(p_value=1.4626257805267089e-07, odds_ratio=4.959830866807611)
Notes
-----
The odds ratio is given by ``(c1 / c2) / (c3 / c4)``.
Returned fields may be ``nan`` or ``inf``.
Parameters
----------
c1 : int or :class:`.Expression` of type :py:data:`.tint32`
Value for cell 1.
c2 : int or :class:`.Expression` of type :py:data:`.tint32`
Value for cell 2.
c3 : int or :class:`.Expression` of type :py:data:`.tint32`
Value for cell 3.
c4 : int or :class:`.Expression` of type :py:data:`.tint32`
Value for cell 4.
Returns
-------
:class:`.StructExpression`
A :class:`.tstruct` expression with two fields, `p_value`
(:py:data:`.tfloat64`) and `odds_ratio` (:py:data:`.tfloat64`).
"""
ret_type = tstruct(p_value=tfloat64, odds_ratio=tfloat64)
return _func("chi_squared_test", ret_type, c1, c2, c3, c4)
[docs]@typecheck(c1=expr_int32, c2=expr_int32, c3=expr_int32, c4=expr_int32, min_cell_count=expr_int32)
def contingency_table_test(c1, c2, c3, c4, min_cell_count) -> StructExpression:
"""Performs chi-squared or Fisher's exact test of independence on a 2x2
contingency table.
Examples
--------
>>> hl.eval(hl.contingency_table_test(51, 43, 22, 92, min_cell_count=22))
Struct(p_value=1.4626257805267089e-07, odds_ratio=4.959830866807611)
>>> hl.eval(hl.contingency_table_test(51, 43, 22, 92, min_cell_count=23))
Struct(p_value=2.1564999740157304e-07, odds_ratio=4.918058171469967)
Notes
-----
If all cell counts are at least `min_cell_count`, the chi-squared test is
used. Otherwise, Fisher's exact test is used.
Returned fields may be ``nan`` or ``inf``.
Parameters
----------
c1 : int or :class:`.Expression` of type :py:data:`.tint32`
Value for cell 1.
c2 : int or :class:`.Expression` of type :py:data:`.tint32`
Value for cell 2.
c3 : int or :class:`.Expression` of type :py:data:`.tint32`
Value for cell 3.
c4 : int or :class:`.Expression` of type :py:data:`.tint32`
Value for cell 4.
min_cell_count : int or :class:`.Expression` of type :py:data:`.tint32`
Minimum count in every cell to use the chi-squared test.
Returns
-------
:class:`.StructExpression`
A :class:`.tstruct` expression with two fields, `p_value`
(:py:data:`.tfloat64`) and `odds_ratio` (:py:data:`.tfloat64`).
"""
ret_type = tstruct(p_value=tfloat64, odds_ratio=tfloat64)
return _func("contingency_table_test", ret_type, c1, c2, c3, c4, min_cell_count)
# We use 64-bit integers.
# It is relatively easy to encounter an integer overflow bug with 32-bit integers.
[docs]@typecheck(a=expr_array(expr_int64), b=expr_array(expr_int64), c=expr_array(expr_int64), d=expr_array(expr_int64))
def cochran_mantel_haenszel_test(
a: Union[tarray, list], b: Union[tarray, list], c: Union[tarray, list], d: Union[tarray, list]
) -> StructExpression:
"""Perform the Cochran-Mantel-Haenszel test for association.
Examples
--------
>>> a = [56, 61, 73, 71]
>>> b = [69, 257, 65, 48]
>>> c = [40, 57, 71, 55]
>>> d = [77, 301, 79, 48]
>>> hl.eval(hl.cochran_mantel_haenszel_test(a, b, c, d))
Struct(test_statistic=5.0496881823306765, p_value=0.024630370456863417)
>>> mt = ds.filter_rows(mt.locus == hl.Locus(20, 10633237))
>>> mt.count_rows()
1
>>> a, b, c, d = mt.aggregate_entries(
... hl.tuple([
... hl.array([hl.agg.count_where(mt.GT.is_non_ref() & mt.pheno.is_case & mt.pheno.is_female), hl.agg.count_where(mt.GT.is_non_ref() & mt.pheno.is_case & ~mt.pheno.is_female)]),
... hl.array([hl.agg.count_where(mt.GT.is_non_ref() & ~mt.pheno.is_case & mt.pheno.is_female), hl.agg.count_where(mt.GT.is_non_ref() & ~mt.pheno.is_case & ~mt.pheno.is_female)]),
... hl.array([hl.agg.count_where(~mt.GT.is_non_ref() & mt.pheno.is_case & mt.pheno.is_female), hl.agg.count_where(~mt.GT.is_non_ref() & mt.pheno.is_case & ~mt.pheno.is_female)]),
... hl.array([hl.agg.count_where(~mt.GT.is_non_ref() & ~mt.pheno.is_case & mt.pheno.is_female), hl.agg.count_where(~mt.GT.is_non_ref() & ~mt.pheno.is_case & ~mt.pheno.is_female)])
... ])
... )
>>> hl.eval(hl.cochran_mantel_haenszel_test(a, b, c, d))
Struct(test_statistic=0.2188830334629822, p_value=0.6398923118508772)
Notes
-----
See the `Wikipedia article <https://en.m.wikipedia.org/wiki/Cochran%E2%80%93Mantel%E2%80%93Haenszel_statistics>`_
for more details.
Parameters
----------
a : :class:`.ArrayExpression` of type :py:data:`.tint64`
Values for the upper-left cell in the contingency tables.
b : :class:`.ArrayExpression` of type :py:data:`.tint64`
Values for the upper-right cell in the contingency tables.
c : :class:`.ArrayExpression` of type :py:data:`.tint64`
Values for the lower-left cell in the contingency tables.
d : :class:`.ArrayExpression` of type :py:data:`.tint64`
Values for the lower-right cell in the contingency tables.
Returns
-------
:class:`.StructExpression`
A :class:`.tstruct` expression with two fields, `test_statistic`
(:py:data:`.tfloat64`) and `p_value` (:py:data:`.tfloat64`).
"""
# The variable names below correspond to the notation used in the Wikipedia article.
# https://en.m.wikipedia.org/wiki/Cochran%E2%80%93Mantel%E2%80%93Haenszel_statistics
n1 = hl.zip(a, b).map(lambda ab: ab[0] + ab[1])
n2 = hl.zip(c, d).map(lambda cd: cd[0] + cd[1])
m1 = hl.zip(a, c).map(lambda ac: ac[0] + ac[1])
m2 = hl.zip(b, d).map(lambda bd: bd[0] + bd[1])
t = hl.zip(n1, n2).map(lambda nn: nn[0] + nn[1])
def numerator_term(a, n1, m1, t):
return a - n1 * m1 / t
# The numerator comes from the link below, not from the Wikipedia article.
# https://www.biostathandbook.com/cmh.html
numerator = (hl.abs(hl.sum(hl.zip(a, n1, m1, t).map(lambda tup: numerator_term(*tup)))) - 0.5) ** 2
def denominator_term(n1, n2, m1, m2, t):
return n1 * n2 * m1 * m2 / (t**3 - t**2)
denominator = hl.sum(hl.zip(n1, n2, m1, m2, t).map(lambda tup: denominator_term(*tup)))
test_statistic = numerator / denominator
p_value = pchisqtail(test_statistic, 1)
return struct(test_statistic=test_statistic, p_value=p_value)
[docs]@typecheck(
collection=expr_oneof(
expr_dict(), expr_set(expr_tuple([expr_any, expr_any])), expr_array(expr_tuple([expr_any, expr_any]))
)
)
def dict(collection) -> DictExpression:
"""Creates a dictionary.
Examples
--------
>>> hl.eval(hl.dict([('foo', 1), ('bar', 2), ('baz', 3)]))
{'bar': 2, 'baz': 3, 'foo': 1}
Notes
-----
This method expects arrays or sets with elements of type :class:`.ttuple`
with 2 fields. The first field of the tuple becomes the key, and the second
field becomes the value.
Parameters
----------
collection : :class:`.DictExpression` or :class:`.ArrayExpression` or :class:`.SetExpression`
Returns
-------
:class:`.DictExpression`
"""
if isinstance(collection.dtype, (tarray, tset)):
key_type, value_type = collection.dtype.element_type.types
return _func('dict', tdict(key_type, value_type), collection)
else:
assert isinstance(collection.dtype, tdict)
return collection
[docs]@typecheck(x=expr_float64, a=expr_float64, b=expr_float64)
def dbeta(x, a, b) -> Float64Expression:
"""
Returns the probability density at `x` of a `beta distribution
<https://en.wikipedia.org/wiki/Beta_distribution>`__ with parameters `a`
(alpha) and `b` (beta).
Examples
--------
>>> hl.eval(hl.dbeta(.2, 5, 20))
4.900377563180943
Parameters
----------
x : :obj:`float` or :class:`.Expression` of type :py:data:`.tfloat64`
Point in [0,1] at which to sample. If a < 1 then x must be positive.
If b < 1 then x must be less than 1.
a : :obj:`float` or :class:`.Expression` of type :py:data:`.tfloat64`
The alpha parameter in the beta distribution. The result is undefined
for non-positive a.
b : :obj:`float` or :class:`.Expression` of type :py:data:`.tfloat64`
The beta parameter in the beta distribution. The result is undefined
for non-positive b.
Returns
-------
:class:`.Float64Expression`
"""
return _func("dbeta", tfloat64, x, a, b)
[docs]@typecheck(x=expr_float64, df=expr_float64, ncp=nullable(expr_float64), log_p=expr_bool)
def dchisq(x, df, ncp=None, log_p=False) -> Float64Expression:
"""Compute the probability density at `x` of a chi-squared distribution with `df`
degrees of freedom.
Examples
--------
>>> hl.eval(hl.dchisq(1, 2))
0.3032653298563167
>>> hl.eval(hl.dchisq(1, 2, ncp=2))
0.17472016746112667
>>> hl.eval(hl.dchisq(1, 2, log_p=True))
-1.1931471805599454
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64`
Non-negative number at which to compute the probability density.
df : float or :class:`.Expression` of type :py:data:`.tfloat64`
Degrees of freedom.
ncp: float or :class:`.Expression` of type :py:data:`.tfloat64`
Noncentrality parameter, defaults to 0 if unspecified.
log_p : bool or :class:`.BooleanExpression`
If ``True``, the natural logarithm of the probability density is returned.
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
The probability density.
"""
if ncp is None:
return _func("dchisq", tfloat64, x, df, log_p)
else:
return _func("dnchisq", tfloat64, x, df, ncp, log_p)
[docs]@typecheck(x=expr_float64, mu=expr_float64, sigma=expr_float64, log_p=expr_bool)
def dnorm(x, mu=0, sigma=1, log_p=False) -> Float64Expression:
"""Compute the probability density at `x` of a normal distribution with mean
`mu` and standard deviation `sigma`. Returns density of standard normal
distribution by default.
Examples
--------
>>> hl.eval(hl.dnorm(1))
0.24197072451914337
>>> hl.eval(hl.dnorm(1, mu=1, sigma=2))
0.19947114020071635
>>> hl.eval(hl.dnorm(1, log_p=True))
-1.4189385332046727
Parameters
----------
x : :obj:`float` or :class:`.Expression` of type :py:data:`.tfloat64`
Real number at which to compute the probability density.
mu : float or :class:`.Expression` of type :py:data:`.tfloat64`
Mean (default = 0).
sigma: float or :class:`.Expression` of type :py:data:`.tfloat64`
Standard deviation (default = 1).
log_p : :obj:`bool` or :class:`.BooleanExpression`
If ``True``, the natural logarithm of the probability density is returned.
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
The probability density.
"""
return _func("dnorm", tfloat64, x, mu, sigma, log_p)
[docs]@typecheck(x=expr_float64, lamb=expr_float64, log_p=expr_bool)
def dpois(x, lamb, log_p=False) -> Float64Expression:
"""Compute the (log) probability density at x of a Poisson distribution with rate parameter `lamb`.
Examples
--------
>>> hl.eval(hl.dpois(5, 3))
0.10081881344492458
Parameters
----------
x : :obj:`float` or :class:`.Expression` of type :py:data:`.tfloat64`
Non-negative number at which to compute the probability density.
lamb : :obj:`float` or :class:`.Expression` of type :py:data:`.tfloat64`
Poisson rate parameter. Must be non-negative.
log_p : :obj:`bool` or :class:`.BooleanExpression`
If ``True``, the natural logarithm of the probability density is returned.
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
The (log) probability density.
"""
return _func("dpois", tfloat64, x, lamb, log_p)
[docs]@typecheck(x=oneof(expr_float64, expr_ndarray(expr_float64)))
@ndarray_broadcasting
def exp(x) -> Float64Expression:
"""Computes `e` raised to the power `x`.
Examples
--------
>>> hl.eval(hl.exp(2))
7.38905609893065
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64` or :class:`.NDArrayNumericExpression`
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64` or :class:`.NDArrayNumericExpression`
"""
return _func("exp", tfloat64, x)
[docs]@typecheck(c1=expr_int32, c2=expr_int32, c3=expr_int32, c4=expr_int32)
def fisher_exact_test(c1, c2, c3, c4) -> StructExpression:
"""Calculates the p-value, odds ratio, and 95% confidence interval using
Fisher's exact test for a 2x2 table.
Examples
--------
>>> hl.eval(hl.fisher_exact_test(10, 10, 10, 10))
Struct(p_value=1.0000000000000002, odds_ratio=1.0,
ci_95_lower=0.24385796914260355, ci_95_upper=4.100747675033819)
>>> hl.eval(hl.fisher_exact_test(51, 43, 22, 92))
Struct(p_value=2.1564999740157304e-07, odds_ratio=4.918058171469967,
ci_95_lower=2.5659373368248444, ci_95_upper=9.677929632035475)
Notes
-----
This method is identical to the version implemented in
`R <https://stat.ethz.ch/R-manual/R-devel/library/stats/html/fisher.test.html>`_ with default
parameters (two-sided, alpha = 0.05, null hypothesis that the odds ratio equals 1).
Returned fields may be ``nan`` or ``inf``.
Parameters
----------
c1 : int or :class:`.Expression` of type :py:data:`.tint32`
Value for cell 1.
c2 : int or :class:`.Expression` of type :py:data:`.tint32`
Value for cell 2.
c3 : int or :class:`.Expression` of type :py:data:`.tint32`
Value for cell 3.
c4 : int or :class:`.Expression` of type :py:data:`.tint32`
Value for cell 4.
Returns
-------
:class:`.StructExpression`
A :class:`.tstruct` expression with four fields, `p_value`
(:py:data:`.tfloat64`), `odds_ratio` (:py:data:`.tfloat64`),
`ci_95_lower (:py:data:`.tfloat64`), and `ci_95_upper`
(:py:data:`.tfloat64`).
"""
ret_type = tstruct(p_value=tfloat64, odds_ratio=tfloat64, ci_95_lower=tfloat64, ci_95_upper=tfloat64)
return _func("fisher_exact_test", ret_type, c1, c2, c3, c4)
[docs]@typecheck(x=expr_oneof(expr_float32, expr_float64, expr_ndarray(expr_float64)))
@ndarray_broadcasting
def floor(x):
"""The largest integral value that is less than or equal to `x`.
Examples
--------
>>> hl.eval(hl.floor(3.1))
3.0
Parameters
----------
x : :class:`.Float32Expression`, :class:`.Float64Expression`, or :class:`.NDArrayNumericExpression`
Returns
-------
:class:`.Float32Expression`, :class:`.Float64Expression`, or :class:`.NDArrayNumericExpression`
"""
return _func("floor", x.dtype, x)
[docs]@typecheck(x=expr_oneof(expr_float32, expr_float64, expr_ndarray(expr_float64)))
@ndarray_broadcasting
def ceil(x):
"""The smallest integral value that is greater than or equal to `x`.
Examples
--------
>>> hl.eval(hl.ceil(3.1))
4.0
Parameters
----------
x : :class:`.Float32Expression`,:class:`.Float64Expression` or :class:`.NDArrayNumericExpression`
Returns
-------
:class:`.Float32Expression`, :class:`.Float64Expression`, or :class:`.NDArrayNumericExpression`
"""
return _func("ceil", x.dtype, x)
[docs]@typecheck(n_hom_ref=expr_int32, n_het=expr_int32, n_hom_var=expr_int32, one_sided=expr_bool)
def hardy_weinberg_test(n_hom_ref, n_het, n_hom_var, one_sided=False) -> StructExpression:
"""Performs test of Hardy-Weinberg equilibrium.
Examples
--------
>>> hl.eval(hl.hardy_weinberg_test(250, 500, 250))
Struct(het_freq_hwe=0.5002501250625313, p_value=0.9747844394217698)
>>> hl.eval(hl.hardy_weinberg_test(37, 200, 85))
Struct(het_freq_hwe=0.48964964307448583, p_value=1.1337210383168987e-06)
Notes
-----
By default, this method performs a two-sided exact test with mid-p-value correction of
`Hardy-Weinberg equilibrium <https://en.wikipedia.org/wiki/Hardy%E2%80%93Weinberg_principle>`__
via an efficient implementation of the
`Levene-Haldane distribution <../_static/LeveneHaldane.pdf>`__,
which models the number of heterozygous individuals under equilibrium.
The mean of this distribution is ``(n_ref * n_var) / (2n - 1)``, where
``n_ref = 2*n_hom_ref + n_het`` is the number of reference alleles,
``n_var = 2*n_hom_var + n_het`` is the number of variant alleles,
and ``n = n_hom_ref + n_het + n_hom_var`` is the number of individuals.
So the expected frequency of heterozygotes under equilibrium,
`het_freq_hwe`, is this mean divided by ``n``.
To perform one-sided exact test of excess heterozygosity with mid-p-value
correction instead, set `one_sided=True` and the p-value returned will be
from the one-sided exact test.
Parameters
----------
n_hom_ref : int or :class:`.Expression` of type :py:data:`.tint32`
Number of homozygous reference genotypes.
n_het : int or :class:`.Expression` of type :py:data:`.tint32`
Number of heterozygous genotypes.
n_hom_var : int or :class:`.Expression` of type :py:data:`.tint32`
Number of homozygous variant genotypes.
one_sided : :obj:`bool`
``False`` by default. When ``True``, perform one-sided test for excess heterozygosity.
Returns
-------
:class:`.StructExpression`
A struct expression with two fields, `het_freq_hwe`
(:py:data:`.tfloat64`) and `p_value` (:py:data:`.tfloat64`).
"""
ret_type = tstruct(het_freq_hwe=tfloat64, p_value=tfloat64)
return _func("hardy_weinberg_test", ret_type, n_hom_ref, n_het, n_hom_var, one_sided)
[docs]@typecheck(contig=expr_str, pos=expr_int32, reference_genome=reference_genome_type)
def locus(contig, pos, reference_genome: Union[str, ReferenceGenome] = 'default') -> LocusExpression:
"""Construct a locus expression from a chromosome and position.
Examples
--------
>>> hl.eval(hl.locus("1", 10000, reference_genome='GRCh37'))
Locus(contig=1, position=10000, reference_genome=GRCh37)
Parameters
----------
contig : str or :class:`.StringExpression`
Chromosome.
pos : int or :class:`.Expression` of type :py:data:`.tint32`
Base position along the chromosome.
reference_genome : :class:`str` or :class:`.ReferenceGenome`
Reference genome to use.
Returns
-------
:class:`.LocusExpression`
"""
return _func('Locus', tlocus(reference_genome), contig, pos)
[docs]@typecheck(global_pos=expr_int64, reference_genome=reference_genome_type)
def locus_from_global_position(
global_pos, reference_genome: Union[str, ReferenceGenome] = 'default'
) -> LocusExpression:
"""Constructs a locus expression from a global position and a reference genome.
The inverse of :meth:`.LocusExpression.global_position`.
Examples
--------
>>> hl.eval(hl.locus_from_global_position(0))
Locus(contig=1, position=1, reference_genome=GRCh37)
>>> hl.eval(hl.locus_from_global_position(2824183054))
Locus(contig=21, position=42584230, reference_genome=GRCh37)
>>> hl.eval(hl.locus_from_global_position(2824183054, reference_genome='GRCh38'))
Locus(contig=chr22, position=1, reference_genome=GRCh38)
Parameters
----------
global_pos : int or :class:`.Expression` of type :py:data:`.tint64`
Global base position along the reference genome.
reference_genome : :class:`str` or :class:`.ReferenceGenome`
Reference genome to use for converting the global position to a contig and local position.
Returns
-------
:class:`.LocusExpression`
"""
return _func('globalPosToLocus', tlocus(reference_genome), global_pos)
[docs]@typecheck(s=expr_str, reference_genome=reference_genome_type)
def parse_locus(s, reference_genome: Union[str, ReferenceGenome] = 'default') -> LocusExpression:
"""Construct a locus expression by parsing a string or string expression.
Examples
--------
>>> hl.eval(hl.parse_locus('1:10000', reference_genome='GRCh37'))
Locus(contig=1, position=10000, reference_genome=GRCh37)
Notes
-----
This method expects strings of the form ``contig:position``, e.g. ``16:29500000``
or ``X:123456``.
Parameters
----------
s : str or :class:`.StringExpression`
String to parse.
reference_genome : :class:`str` or :class:`.ReferenceGenome`
Reference genome to use.
Returns
-------
:class:`.LocusExpression`
"""
return _func('Locus', tlocus(reference_genome), s)
[docs]@typecheck(s=expr_str, reference_genome=reference_genome_type)
def parse_variant(s, reference_genome: Union[str, ReferenceGenome] = 'default') -> StructExpression:
"""Construct a struct with a locus and alleles by parsing a string.
Examples
--------
>>> hl.eval(hl.parse_variant('1:100000:A:T,C', reference_genome='GRCh37'))
Struct(locus=Locus(contig=1, position=100000, reference_genome=GRCh37), alleles=['A', 'T', 'C'])
Notes
-----
This method returns an expression of type :class:`.tstruct` with the
following fields:
- `locus` (:class:`.tlocus`)
- `alleles` (:class:`.tarray` of :py:data:`.tstr`)
Parameters
----------
s : :class:`.StringExpression`
String to parse.
reference_genome: :class:`str` or :class:`.ReferenceGenome`
Reference genome to use.
Returns
-------
:class:`.StructExpression`
Struct with fields `locus` and `alleles`.
"""
t = tstruct(locus=tlocus(reference_genome), alleles=tarray(tstr))
return _func('LocusAlleles', t, s)
[docs]def variant_str(*args) -> 'StringExpression':
"""Create a variant colon-delimited string.
Parameters
----------
args
Arguments (see notes).
Returns
-------
:class:`.StringExpression`
Notes
-----
Expects either one argument of type
``struct{locus: locus<RG>, alleles: array<str>``, or two arguments of type
``locus<RG>`` and ``array<str>``. The function returns a string of the form
.. code-block:: text
CHR:POS:REF:ALT1,ALT2,...ALTN
e.g.
1:1:A:T
16:250125:AAA:A,CAA
Examples
--------
>>> hl.eval(hl.variant_str(hl.locus('1', 10000), ['A', 'T', 'C']))
'1:10000:A:T,C'
"""
args = [to_expr(arg) for arg in args]
def type_error():
raise ValueError(
f"'variant_str' expects arguments of the following types:\n"
f" Option 1: 1 argument of type 'struct{{locus: locus<RG>, alleles: array<str>}}\n"
f" Option 2: 2 arguments of type 'locus<RG>', 'array<str>'\n"
f" Found: {builtins.len(args)} {plural('argument', builtins.len(args))} "
f"of type {', '.join(builtins.str(x.dtype) for x in args)}"
)
if builtins.len(args) == 1:
[s] = args
t = s.dtype
if (
not isinstance(t, tstruct)
or not builtins.len(t) == 2
or not isinstance(t[0], tlocus)
or not t[1] == tarray(tstr)
):
type_error()
return hl.rbind(s, lambda x: hl.str(x[0]) + ":" + x[1][0] + ":" + hl.delimit(x[1][1:]))
elif builtins.len(args) == 2:
[locus, alleles] = args
if not isinstance(locus.dtype, tlocus) or not alleles.dtype == tarray(tstr):
type_error()
return hl.str(locus) + ":" + hl.rbind(alleles, lambda x: x[0] + ":" + hl.delimit(x[1:]))
else:
type_error()
[docs]@typecheck(gp=expr_array(expr_float64))
def gp_dosage(gp) -> Float64Expression:
"""
Return expected genotype dosage from array of genotype probabilities.
Examples
--------
>>> hl.eval(hl.gp_dosage([0.0, 0.5, 0.5]))
1.5
Notes
-----
This function is only defined for bi-allelic variants. The `gp` argument
must be length 3. The value is ``gp[1] + 2 * gp[2]``.
Parameters
----------
gp : :class:`.Expression` of type :class:`.tarray` of :obj:`.tfloat64`
Length 3 array of bi-allelic genotype probabilities
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
"""
return _func("dosage", tfloat64, gp)
[docs]@typecheck(pl=expr_array(expr_int32))
def pl_dosage(pl) -> Float64Expression:
r"""
Return expected genotype dosage from array of Phred-scaled genotype
likelihoods with uniform prior. Only defined for bi-allelic variants. The
`pl` argument must be length 3.
For a PL array ``[a, b, c]``, let:
.. math::
a^\prime = 10^{-a/10} \\
b^\prime = 10^{-b/10} \\
c^\prime = 10^{-c/10} \\
The genotype dosage is given by:
.. math::
\frac{b^\prime + 2 c^\prime}
{a^\prime + b^\prime +c ^\prime}
Examples
--------
>>> hl.eval(hl.pl_dosage([5, 10, 100]))
0.24025307377482674
Parameters
----------
pl : :class:`.ArrayNumericExpression` of type :py:data:`.tint32`
Length 3 array of bi-allelic Phred-scaled genotype likelihoods
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
"""
return hl.sum(pl_to_gp(pl) * hl.range(3), filter_missing=False)
@typecheck(pl=expr_array(expr_int32), _cache_size=int)
def pl_to_gp(pl, _cache_size=2048) -> ArrayNumericExpression:
"""
Return the linear-scaled genotype probabilities from an array of Phred-scaled genotype likelihoods.
Examples
--------
>>> hl.eval(hl.pl_to_gp([0, 10, 100]))
[0.9090909090082644, 0.09090909090082644, 9.090909090082645e-11]
Notes
-----
This function assumes a uniform prior on the possible genotypes.
Parameters
----------
pl : :class:`.ArrayNumericExpression` of type :py:data:`.tint32`
Array of Phred-scaled genotype likelihoods.
Returns
-------
:class:`.ArrayNumericExpression` of type :py:data:`.tfloat64`
"""
phred_table = hl.literal([10 ** (-x / 10.0) for x in builtins.range(_cache_size)])
gp = hl.bind(lambda pls: pls.map(lambda x: hl.if_else(x >= _cache_size, 10 ** (-x / 10.0), phred_table[x])), pl)
return hl.bind(lambda gp: gp / hl.sum(gp), gp)
[docs]@typecheck(start=expr_any, end=expr_any, includes_start=expr_bool, includes_end=expr_bool)
def interval(start, end, includes_start=True, includes_end=False) -> IntervalExpression:
"""Construct an interval expression.
Examples
--------
>>> hl.eval(hl.interval(5, 100))
Interval(start=5, end=100, includes_start=True, includes_end=False)
>>> hl.eval(hl.interval(hl.locus("1", 100), hl.locus("1", 1000)))
Interval(start=Locus(contig=1, position=100, reference_genome=GRCh37),
end=Locus(contig=1, position=1000, reference_genome=GRCh37),
includes_start=True,
includes_end=False)
Notes
-----
`start` and `end` must have the same type.
Parameters
----------
start : :class:`.Expression`
Start point.
end : :class:`.Expression`
End point.
includes_start : :class:`.BooleanExpression`
If ``True``, interval includes start point.
includes_end : :class:`.BooleanExpression`
If ``True``, interval includes end point.
Returns
-------
:class:`.IntervalExpression`
"""
if not start.dtype == end.dtype:
raise TypeError("Type mismatch of start and end points: '{}', '{}'".format(start.dtype, end.dtype))
return _func('Interval', tinterval(start.dtype), start, end, includes_start, includes_end)
[docs]@typecheck(
contig=expr_str,
start=expr_int32,
end=expr_int32,
includes_start=expr_bool,
includes_end=expr_bool,
reference_genome=reference_genome_type,
invalid_missing=expr_bool,
)
def locus_interval(
contig,
start,
end,
includes_start=True,
includes_end=False,
reference_genome: Union[str, ReferenceGenome] = 'default',
invalid_missing=False,
) -> IntervalExpression:
"""Construct a locus interval expression.
Examples
--------
>>> hl.eval(hl.locus_interval("1", 100, 1000, reference_genome='GRCh37'))
Interval(start=Locus(contig=1, position=100, reference_genome=GRCh37),
end=Locus(contig=1, position=1000, reference_genome=GRCh37),
includes_start=True,
includes_end=False)
Parameters
----------
contig : :class:`.StringExpression`
Contig name.
start : :class:`.Int32Expression`
Starting base position.
end : :class:`.Int32Expression`
End base position.
includes_start : :class:`.BooleanExpression`
If ``True``, interval includes start point.
includes_end : :class:`.BooleanExpression`
If ``True``, interval includes end point.
reference_genome : :class:`str` or :class:`.hail.genetics.ReferenceGenome`
Reference genome to use.
invalid_missing : :class:`.BooleanExpression`
If ``True``, invalid intervals are set to NA rather than causing an exception.
Returns
-------
:class:`.IntervalExpression`
"""
return _func(
'LocusInterval',
tinterval(tlocus(reference_genome)),
contig,
start,
end,
includes_start,
includes_end,
invalid_missing,
)
[docs]@typecheck(s=expr_str, reference_genome=reference_genome_type, invalid_missing=expr_bool)
def parse_locus_interval(
s, reference_genome: Union[str, ReferenceGenome] = 'default', invalid_missing=False
) -> IntervalExpression:
"""Construct a locus interval expression by parsing a string or string
expression.
Examples
--------
>>> hl.eval(hl.parse_locus_interval('1:1000-2000', reference_genome='GRCh37'))
Interval(start=Locus(contig=1, position=1000, reference_genome=GRCh37),
end=Locus(contig=1, position=2000, reference_genome=GRCh37),
includes_start=True,
includes_end=False)
>>> hl.eval(hl.parse_locus_interval('1:start-10M', reference_genome='GRCh37'))
Interval(start=Locus(contig=1, position=1, reference_genome=GRCh37),
end=Locus(contig=1, position=10000000, reference_genome=GRCh37),
includes_start=True,
includes_end=False)
Notes
-----
The start locus must precede the end locus. The default bounds of the
interval are left-inclusive and right-exclusive. To change this, add
one of ``[`` or ``(`` at the beginning of the string for left-inclusive
or left-exclusive respectively. Likewise, add one of ``]`` or ``)`` at
the end of the string for right-inclusive or right-exclusive
respectively.
There are several acceptable representations for `s`.
``CHR1:POS1-CHR2:POS2`` is the fully specified representation, and
we use this to define the various shortcut representations.
In a ``POS`` field, ``start`` (``Start``, ``START``) stands for 1.
In a ``POS`` field, ``end`` (``End``, ``END``) stands for the contig length.
In a ``POS`` field, the qualifiers ``m`` (``M``) and ``k`` (``K``) multiply
the given number by ``1,000,000`` and ``1,000``, respectively. ``1.6K`` is
short for 1600, and ``29M`` is short for 29000000.
``CHR:POS1-POS2`` stands for ``CHR:POS1-CHR:POS2``
``CHR1-CHR2`` stands for ``CHR1:START-CHR2:END``
``CHR`` stands for ``CHR:START-CHR:END``
Note
----
The bounds of the interval must be valid loci for the reference genome
(contig in reference genome and position is within the range [1-END])
except in the case where the position is ``0`` **AND** the interval is
**left-exclusive** which is normalized to be ``1`` and left-inclusive.
Likewise, in the case where the position is ``END + 1`` **AND**
the interval is **right-exclusive** which is normalized to be ``END``
and right-inclusive.
Parameters
----------
s : str or :class:`.StringExpression`
String to parse.
reference_genome : :class:`str` or :class:`.hail.genetics.ReferenceGenome`
Reference genome to use.
invalid_missing : :class:`.BooleanExpression`
If ``True``, invalid intervals are set to NA rather than causing an exception.
Returns
-------
:class:`.IntervalExpression`
"""
return _func('LocusInterval', tinterval(tlocus(reference_genome)), s, invalid_missing)
[docs]@typecheck(alleles=expr_int32, phased=expr_bool)
def call(*alleles, phased=False) -> CallExpression:
"""Construct a call expression.
Examples
--------
>>> hl.eval(hl.call(1, 0))
Call(alleles=[0, 1], phased=False)
Parameters
----------
alleles : variable-length args of :obj:`int` or :class:`.Expression` of type :py:data:`.tint32`
List of allele indices.
phased : :obj:`bool`
If ``True``, preserve the order of `alleles`.
Returns
-------
:class:`.CallExpression`
"""
if builtins.len(alleles) > 2:
raise NotImplementedError("'call' supports a maximum of 2 alleles.")
return _func('Call', tcall, *alleles, phased)
[docs]@typecheck(gt_index=expr_int32)
def unphased_diploid_gt_index_call(gt_index) -> CallExpression:
"""Construct an unphased, diploid call from a genotype index.
Examples
--------
>>> hl.eval(hl.unphased_diploid_gt_index_call(4))
Call(alleles=[1, 2], phased=False)
Parameters
----------
gt_index : :obj:`int` or :class:`.Expression` of type :py:data:`.tint32`
Unphased, diploid genotype index.
Returns
-------
:class:`.CallExpression`
"""
return _func('UnphasedDiploidGtIndexCall', tcall, to_expr(gt_index))
[docs]@typecheck(s=expr_str)
def parse_call(s) -> CallExpression:
"""Construct a call expression by parsing a string or string expression.
Examples
--------
>>> hl.eval(hl.parse_call('0|2'))
Call(alleles=[0, 2], phased=True)
Notes
-----
This method expects strings in the following format:
+--------+-----------------+-----------------+
| ploidy | Phased | Unphased |
+========+=================+=================+
| 0 | ``|-`` | ``-`` |
+--------+-----------------+-----------------+
| 1 | ``|i`` | ``i`` |
+--------+-----------------+-----------------+
| 2 | ``i|j`` | ``i/j`` |
+--------+-----------------+-----------------+
| 3 | ``i|j|k`` | ``i/j/k`` |
+--------+-----------------+-----------------+
| N | ``i|j|k|...|N`` | ``i/j/k/.../N`` |
+--------+-----------------+-----------------+
Parameters
----------
s : str or :class:`.StringExpression`
String to parse.
Returns
-------
:class:`.CallExpression`
"""
return _func('Call', tcall, s)
[docs]@typecheck(expression=expr_any)
def is_defined(expression) -> BooleanExpression:
"""Returns ``True`` if the argument is not missing.
Examples
--------
>>> hl.eval(hl.is_defined(5))
True
>>> hl.eval(hl.is_defined(hl.missing(hl.tstr)))
False
>>> hl.eval(hl.is_defined(hl.missing(hl.tbool) & True))
False
Parameters
----------
expression
Expression to test.
Returns
-------
:class:`.BooleanExpression`
``True`` if `expression` is not missing, ``False`` otherwise.
"""
return ~apply_expr(lambda x: ir.IsNA(x), tbool, expression)
[docs]@typecheck(expression=expr_any)
def is_missing(expression) -> BooleanExpression:
"""Returns ``True`` if the argument is missing.
Examples
--------
>>> hl.eval(hl.is_missing(5))
False
>>> hl.eval(hl.is_missing(hl.missing(hl.tstr)))
True
>>> hl.eval(hl.is_missing(hl.missing(hl.tbool) & True))
True
Parameters
----------
expression
Expression to test.
Returns
-------
:class:`.BooleanExpression`
``True`` if `expression` is missing, ``False`` otherwise.
"""
return apply_expr(lambda x: ir.IsNA(x), tbool, expression)
[docs]@typecheck(x=expr_oneof(expr_float32, expr_float64, expr_ndarray(expr_float64)))
@ndarray_broadcasting
def is_nan(x) -> BooleanExpression:
"""Returns ``True`` if the argument is ``nan`` (not a number).
Examples
--------
>>> hl.eval(hl.is_nan(0))
False
>>> hl.eval(hl.is_nan(hl.literal(0) / 0))
True
>>> hl.eval(hl.is_nan(hl.literal(0) / hl.missing(hl.tfloat64)))
None
Notes
-----
Note that :func:`~.is_missing` will return ``False`` on ``nan`` since ``nan``
is a defined value. Additionally, this method will return missing if `x` is
missing.
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64`
Expression to test or or :class:`.NDArrayNumericExpression`.
Returns
-------
:class:`.BooleanExpression`
``True`` if `x` is ``nan``, ``False`` otherwise or
:class:`.NDArrayNumericExpression` filled with such values
"""
return _func("isnan", tbool, x)
[docs]@typecheck(x=expr_oneof(expr_float32, expr_float64, expr_ndarray(expr_float64)))
@ndarray_broadcasting
def is_finite(x) -> BooleanExpression:
"""Returns ``True`` if the argument is a finite floating-point number.
Examples
--------
>>> hl.eval(hl.is_finite(0))
True
>>> hl.eval(hl.is_finite(float('nan')))
False
>>> hl.eval(hl.is_finite(float('inf')))
False
>>> hl.eval(hl.is_finite(hl.missing('float32')))
None
Notes
-----
This method will return missing, not ``True``, if `x` is missing.
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64` or :class:`.NDArrayNumericExpression`
Returns
-------
:class:`.BooleanExpression` or :class:`.NDArrayNumericExpression` filled with such expressions
"""
return _func("is_finite", tbool, x)
[docs]@typecheck(x=expr_oneof(expr_float32, expr_float64, expr_ndarray(expr_float64)))
@ndarray_broadcasting
def is_infinite(x) -> BooleanExpression:
"""Returns ``True`` if the argument is positive or negative infinity.
Examples
--------
>>> hl.eval(hl.is_infinite(0))
False
>>> hl.eval(hl.is_infinite(float('nan')))
False
>>> hl.eval(hl.is_infinite(float('inf')))
True
>>> hl.eval(hl.is_infinite(hl.missing('float32')))
None
Notes
-----
This method will return missing, not ``False``, if `x` is missing.
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64` or :class:`.NDArrayNumericExpression`
Returns
-------
:class:`.BooleanExpression` or :class:`.NDArrayNumericExpression` filled with such expressions
"""
return _func("is_infinite", tbool, x)
[docs]@typecheck(x=expr_any)
def json(x) -> StringExpression:
"""Convert an expression to a JSON string expression.
Examples
--------
>>> hl.eval(hl.json([1,2,3,4,5]))
'[1,2,3,4,5]'
>>> hl.eval(hl.json(hl.struct(a='Hello', b=0.12345, c=[1,2], d={'hi', 'bye'})))
'{"a":"Hello","b":0.12345,"c":[1,2],"d":["bye","hi"]}'
Parameters
----------
x
Expression to convert.
Returns
-------
:class:`.StringExpression`
String expression with JSON representation of `x`.
"""
return _func("json", tstr, x)
[docs]@typecheck(x=expr_str, dtype=hail_type)
def parse_json(x, dtype):
"""Convert a JSON string to a structured expression.
Examples
--------
>>> json_str = '{"a": 5, "b": 1.1, "c": "foo"}'
>>> parsed = hl.parse_json(json_str, dtype='struct{a: int32, b: float64, c: str}')
>>> hl.eval(parsed.a)
5
Parameters
----------
x : :class:`.StringExpression`
JSON string.
dtype
Type of value to parse.
Returns
-------
:class:`.Expression`
"""
return _func("parse_json", ttuple(dtype), x, type_args=(dtype,))[0]
[docs]@typecheck(x=oneof(expr_float64, expr_ndarray(expr_float64)), base=nullable(expr_float64))
def log(x, base=None) -> Float64Expression:
"""Take the logarithm of the `x` with base `base`.
Examples
--------
>>> hl.eval(hl.log(10))
2.302585092994046
>>> hl.eval(hl.log(10, 10))
1.0
>>> hl.eval(hl.log(1024, 2))
10.0
Notes
-----
If the `base` argument is not supplied, then the natural logarithm is used.
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64`
base : float or :class:`.Expression` of type :py:data:`.tfloat64`
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
"""
def scalar_log(x):
if base is not None:
return _func("log", tfloat64, x, to_expr(base))
else:
return _func("log", tfloat64, x)
x = to_expr(x)
if isinstance(x.dtype, tndarray):
return x.map(scalar_log)
return scalar_log(x)
[docs]@typecheck(x=oneof(expr_float64, expr_ndarray(expr_float64)))
@ndarray_broadcasting
def log10(x) -> Float64Expression:
"""Take the logarithm of the `x` with base 10.
Examples
--------
>>> hl.eval(hl.log10(1000))
3.0
>>> hl.eval(hl.log10(0.0001123))
-3.949620243738542
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64` or :class:`.NDArrayNumericExpression`
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64` or :class:`.NDArrayNumericExpression`
"""
return _func("log10", tfloat64, x)
[docs]@typecheck(x=oneof(expr_float64, expr_ndarray(expr_float64)))
@ndarray_broadcasting
def logit(x) -> Float64Expression:
"""The logistic function.
Examples
--------
>>> hl.eval(hl.logit(.01))
-4.59511985013459
>>> hl.eval(hl.logit(.5))
0.0
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64` or :class:`.NDArrayNumericExpression`
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64` or :class:`.NDArrayNumericExpression`
"""
return hl.log(x / (1 - x))
[docs]@typecheck(x=oneof(expr_float64, expr_ndarray(expr_float64)))
@ndarray_broadcasting
def expit(x) -> Float64Expression:
"""The logistic sigmoid function.
.. math::
\textrm{expit}(x) = \frac{1}{1 + e^{-x}}
Examples
--------
>>> hl.eval(hl.expit(.01))
0.5024999791668749
>>> hl.eval(hl.expit(0.0))
0.5
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64` or :class:`.NDArrayNumericExpression`
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64` or :class:`.NDArrayNumericExpression`
"""
return hl.if_else(x >= 0, 1 / (1 + hl.exp(-x)), hl.rbind(hl.exp(x), lambda exped: exped / (exped + 1)))
[docs]@typecheck(args=expr_any)
def coalesce(*args):
"""Returns the first non-missing value of `args`.
Examples
--------
>>> x1 = hl.missing('int')
>>> x2 = 2
>>> hl.eval(hl.coalesce(x1, x2))
2
Notes
-----
All arguments must have the same type, or must be convertible to a common
type (all numeric, for instance).
See Also
--------
:func:`.or_else`
Parameters
----------
args : variable-length args of :class:`.Expression`
Returns
-------
:class:`.Expression`
"""
if builtins.len(args) < 1:
raise ValueError("'coalesce' requires at least one expression argument")
*exprs, success = unify_exprs(*args)
if not success:
arg_types = ''.join([f"\n argument {i}: type '{arg.dtype}'" for i, arg in builtins.enumerate(exprs)])
raise TypeError(f"'coalesce' requires all arguments to have the same type or compatible types" f"{arg_types}")
indices, aggregations = unify_all(*exprs)
return construct_expr(ir.Coalesce(*(e._ir for e in exprs)), exprs[0].dtype, indices, aggregations)
[docs]@typecheck(a=expr_any, b=expr_any)
def or_else(a, b):
"""If `a` is missing, return `b`.
Examples
--------
>>> hl.eval(hl.or_else(5, 7))
5
>>> hl.eval(hl.or_else(hl.missing(hl.tint32), 7))
7
See Also
--------
:func:`.coalesce`
Parameters
----------
a: :class:`.Expression`
b: :class:`.Expression`
Returns
-------
:class:`.Expression`
"""
a, b, success = unify_exprs(a, b)
if not success:
raise TypeError(
f"'or_else' requires the 'a' and 'b' arguments to have the same type\n"
f" a: type '{a.dtype}'\n"
f" b: type '{b.dtype}'"
)
return coalesce(a, b)
[docs]@typecheck(predicate=expr_bool, value=expr_any)
def or_missing(predicate, value):
"""Returns `value` if `predicate` is ``True``, otherwise returns missing.
Examples
--------
>>> hl.eval(hl.or_missing(True, 5))
5
>>> hl.eval(hl.or_missing(False, 5))
None
Parameters
----------
predicate : :class:`.BooleanExpression`
value : :class:`.Expression`
Value to return if `predicate` is ``True``.
Returns
-------
:class:`.Expression`
This expression has the same type as `b`.
"""
return hl.if_else(predicate, value, hl.missing(value.dtype))
[docs]@typecheck(
x=expr_int32, n=expr_int32, p=expr_float64, alternative=enumeration("two.sided", "two-sided", "greater", "less")
)
def binom_test(x, n, p, alternative: str) -> Float64Expression:
"""Performs a binomial test on `p` given `x` successes in `n` trials.
Returns the p-value from the `exact binomial test
<https://en.wikipedia.org/wiki/Binomial_test>`__ of the null hypothesis that
success has probability `p`, given `x` successes in `n` trials.
The alternatives are interpreted as follows:
- ``'less'``: a one-tailed test of the significance of `x` or fewer successes,
- ``'greater'``: a one-tailed test of the significance of `x` or more successes, and
- ``'two-sided'``: a two-tailed test of the significance of `x` or any equivalent or more unlikely outcome.
Examples
--------
All the examples below use a fair coin as the null hypothesis. Zero is
interpreted as tail and one as heads.
Test if a coin is biased towards heads or tails after observing two heads
out of ten flips:
>>> hl.eval(hl.binom_test(2, 10, 0.5, 'two-sided'))
0.10937499999999994
Test if a coin is biased towards tails after observing four heads out of ten
flips:
>>> hl.eval(hl.binom_test(4, 10, 0.5, 'less'))
0.3769531250000001
Test if a coin is biased towards heads after observing thirty-two heads out
of fifty flips:
>>> hl.eval(hl.binom_test(32, 50, 0.5, 'greater'))
0.03245432353613613
Parameters
----------
x : int or :class:`.Expression` of type :py:data:`.tint32`
Number of successes.
n : int or :class:`.Expression` of type :py:data:`.tint32`
Number of trials.
p : float or :class:`.Expression` of type :py:data:`.tfloat64`
Probability of success, between 0 and 1.
alternative
: One of, "two-sided", "greater", "less", (deprecated: "two.sided").
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
p-value.
"""
if alternative == 'two.sided':
warning(
'"two.sided" is a deprecated and will be removed in a future '
'release, please use "two-sided" for the `alternative` parameter '
'to hl.binom_test'
)
alternative = 'two-sided'
alt_enum = {"two-sided": 0, "less": 1, "greater": 2}[alternative]
return _func("binomTest", tfloat64, x, n, p, to_expr(alt_enum))
[docs]@typecheck(x=expr_float64, df=expr_float64, ncp=nullable(expr_float64), lower_tail=expr_bool, log_p=expr_bool)
def pchisqtail(x, df, ncp=None, lower_tail=False, log_p=False) -> Float64Expression:
"""Returns the probability under the right-tail starting at x for a chi-squared
distribution with df degrees of freedom.
Examples
--------
>>> hl.eval(hl.pchisqtail(5, 1))
0.025347318677468304
>>> hl.eval(hl.pchisqtail(5, 1, ncp=2))
0.20571085634347097
>>> hl.eval(hl.pchisqtail(5, 1, lower_tail=True))
0.9746526813225317
>>> hl.eval(hl.pchisqtail(5, 1, log_p=True))
-3.6750823266311876
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64`
The value at which to evaluate the CDF.
df : float or :class:`.Expression` of type :py:data:`.tfloat64`
Degrees of freedom.
ncp: float or :class:`.Expression` of type :py:data:`.tfloat64`
Noncentrality parameter, defaults to 0 if unspecified.
lower_tail : bool or :class:`.BooleanExpression`
If ``True``, compute the probability of an outcome at or below `x`,
otherwise greater than `x`.
log_p : bool or :class:`.BooleanExpression`
Return the natural logarithm of the probability.
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
"""
if ncp is None:
return _func("pchisqtail", tfloat64, x, df, lower_tail, log_p)
else:
return _func("pnchisqtail", tfloat64, x, df, ncp, lower_tail, log_p)
PGENCHISQ_RETURN_TYPE = tstruct(value=tfloat64, n_iterations=tint32, converged=tbool, fault=tint32)
[docs]@typecheck(
x=expr_float64,
w=expr_array(expr_float64),
k=expr_array(expr_int32),
lam=expr_array(expr_float64),
mu=expr_float64,
sigma=expr_float64,
max_iterations=nullable(expr_int32),
min_accuracy=nullable(expr_float64),
)
def pgenchisq(x, w, k, lam, mu, sigma, *, max_iterations=None, min_accuracy=None) -> Float64Expression:
r"""The cumulative probability function of a `generalized chi-squared distribution
<https://en.wikipedia.org/wiki/Generalized_chi-squared_distribution>`__.
The generalized chi-squared distribution has many interpretations. We share here four
interpretations of the values of this distribution:
1. A linear combination of normal variables and squares of normal variables.
2. A weighted sum of sums of squares of normally distributed values plus a normally distributed
value.
3. A weighted sum of chi-squared distributed values plus a normally distributed value.
4. A `"quadratic form" <https://en.wikipedia.org/wiki/Quadratic_form_(statistics)>`__ in a vector
of uncorrelated `standard normal
<https://en.wikipedia.org/wiki/Normal_distribution#Standard_normal_distribution>`__ values.
The parameters of this function correspond to the parameters of the third interpretation.
.. math::
\begin{aligned}
w &: R^n \quad k : Z^n \quad lam : R^n \quad mu : R \quad sigma : R \\
\\
x &\sim N(mu, sigma^2) \\
y_i &\sim \mathrm{NonCentralChiSquared}(k_i, lam_i) \\
\\
Z &= x + w y^T \\
&= x + \sum_i w_i y_i \\
Z &\sim \mathrm{GeneralizedNonCentralChiSquared}(w, k, lam, mu, sigma)
\end{aligned}
The generalized chi-squared distribution often arises when working on linear models with standard
normal noise because the sum of the squares of the residuals should follow a generalized
chi-squared distribution.
Examples
--------
The following plot shows three examples of the generalized chi-squared cumulative distribution
function.
.. image:: https://upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Generalized_chi-square_cumulative_distribution_function.svg/1280px-Generalized_chi-square_cumulative_distribution_function.svg.png
:alt: Plots of examples of the generalized chi-square cumulative distribution function. Created by Dvidby0.
:target: https://commons.wikimedia.org/wiki/File:Generalized_chi-square_cumulative_distribution_function.svg
:width: 640px
The following examples are chosen from the three instances shown above. The curves appear in the
same order as the legend of the plot: blue, red, yellow.
>>> hl.eval(hl.pgenchisq(-80, w=[1, 2], k=[1, 4], lam=[1, 1], mu=0, sigma=0).value)
0.0
>>> hl.eval(hl.pgenchisq(-20, w=[1, 2], k=[1, 4], lam=[1, 1], mu=0, sigma=0).value)
0.0
>>> hl.eval(hl.pgenchisq(10 , w=[1, 2], k=[1, 4], lam=[1, 1], mu=0, sigma=0).value)
0.4670012373599629
>>> hl.eval(hl.pgenchisq(40 , w=[1, 2], k=[1, 4], lam=[1, 1], mu=0, sigma=0).value)
0.9958803111156718
>>> hl.eval(hl.pgenchisq(-80, w=[-2, -1], k=[5, 2], lam=[3, 1], mu=-3, sigma=0).value)
9.227056966837344e-05
>>> hl.eval(hl.pgenchisq(-20, w=[-2, -1], k=[5, 2], lam=[3, 1], mu=-3, sigma=0).value)
0.516439358616939
>>> hl.eval(hl.pgenchisq(10 , w=[-2, -1], k=[5, 2], lam=[3, 1], mu=-3, sigma=0).value)
1.0
>>> hl.eval(hl.pgenchisq(40 , w=[-2, -1], k=[5, 2], lam=[3, 1], mu=-3, sigma=0).value)
1.0
>>> hl.eval(hl.pgenchisq(-80, w=[1, -10, 2], k=[1, 2, 3], lam=[2, 3, 7], mu=-10, sigma=0).value)
0.14284718767288906
>>> hl.eval(hl.pgenchisq(-20, w=[1, -10, 2], k=[1, 2, 3], lam=[2, 3, 7], mu=-10, sigma=0).value)
0.5950150356303258
>>> hl.eval(hl.pgenchisq(10 , w=[1, -10, 2], k=[1, 2, 3], lam=[2, 3, 7], mu=-10, sigma=0).value)
0.923219534175858
>>> hl.eval(hl.pgenchisq(40 , w=[1, -10, 2], k=[1, 2, 3], lam=[2, 3, 7], mu=-10, sigma=0).value)
0.9971746768781656
Notes
-----
We follow Wikipedia's notational conventions. Some texts refer to the weight vector (our `w`) as
:math:`\lambda` or `lb` and the non-centrality vector (our `lam`) as `nc`.
We use the Davies' algorithm which was published as:
`Davies, Robert. "The distribution of a linear combination of chi-squared random variables."
Applied Statistics 29 323-333. 1980. <http://www.robertnz.net/pdf/lc_chisq.pdf>`__
Davies included Fortran source code in the original publication. Davies also released a `C
language port <http://www.robertnz.net/QF.htm>`__. Hail's implementation is a fairly direct port
of the C implementation to Scala. Davies provides 39 test cases with the source code. The Hail
tests include all 39 test cases as well as a few additional tests.
Davies' website cautions:
The method works well in most situations if you want only modest accuracy, say 0.0001. But
problems may arise if the sum is dominated by one or two terms with a total of only one or
two degrees of freedom and x is small.
For an accessible introduction the Generalized Chi-Squared Distribution, we strongly recommend
the introduction of this paper:
`Das, Abhranil; Geisler, Wilson (2020). "A method to integrate and classify normal
distributions". <https://arxiv.org/abs/2012.14331>`__
Parameters
----------
x : :obj:`float` or :class:`.Expression` of type :py:data:`.tfloat64`
The value at which to evaluate the cumulative distribution function (CDF).
w : :obj:`list` of :obj:`float` or :class:`.Expression` of type :py:class:`.tarray` of :py:data:`.tfloat64`
A weight for each non-central chi-square term.
k : :obj:`list` of :obj:`int` or :class:`.Expression` of type :py:class:`.tarray` of :py:data:`.tint32`
A degrees of freedom parameter for each non-central chi-square term.
lam : :obj:`list` of :obj:`float` or :class:`.Expression` of type :py:class:`.tarray` of :py:data:`.tfloat64`
A non-centrality parameter for each non-central chi-square term. We use `lam` instead
of `lambda` because the latter is a reserved word in Python.
mu : :obj:`float` or :class:`.Expression` of type :py:data:`.tfloat64`
The standard deviation of the normal term.
sigma : :obj:`float` or :class:`.Expression` of type :py:data:`.tfloat64`
The standard deviation of the normal term.
max_iterations : :obj:`int` or :class:`.Expression` of type :py:data:`.tint32`
The maximum number of iterations of the numerical integration before raising an error. The
default maximum number of iterations is ``1e5``.
min_accuracy : :obj:`int` or :class:`.Expression` of type :py:data:`.tint32`
The minimum accuracy of the returned value. If the minimum accuracy is not achieved, this
function will raise an error. The default minimum accuracy is ``1e-5``.
Returns
-------
:class:`.StructExpression`
This method returns a structure with the value as well as information about the numerical
integration.
- value : :class:`.Float64Expression`. If converged is true, the value of the CDF evaluated
at `x`. Otherwise, this is the last value the integration evaluated before aborting.
- n_iterations : :class:`.Int32Expression`. The number of iterations before stopping.
- converged : :class:`.BooleanExpression`. True if the `min_accuracy` was achieved and round
off error is not likely significant.
- fault : :class:`.Int32Expression`. If converged is true, fault is zero. If converged is
false, fault is either one or two. One indicates that the requried accuracy was not
achieved. Two indicates the round-off error is possibly significant.
"""
if max_iterations is None:
max_iterations = hl.literal(10_000)
if min_accuracy is None:
min_accuracy = hl.literal(1e-5)
return _func("pgenchisq", PGENCHISQ_RETURN_TYPE, x - mu, w, k, lam, sigma, max_iterations, min_accuracy)
[docs]@typecheck(x=expr_float64, mu=expr_float64, sigma=expr_float64, lower_tail=expr_bool, log_p=expr_bool)
def pnorm(x, mu=0, sigma=1, lower_tail=True, log_p=False) -> Float64Expression:
"""The cumulative probability function of a normal distribution with mean
`mu` and standard deviation `sigma`. Returns cumulative probability of
standard normal distribution by default.
Examples
--------
>>> hl.eval(hl.pnorm(0))
0.5
>>> hl.eval(hl.pnorm(1, mu=2, sigma=2))
0.30853753872598694
>>> hl.eval(hl.pnorm(2, lower_tail=False))
0.022750131948179212
>>> hl.eval(hl.pnorm(2, log_p=True))
-0.023012909328963493
Notes
-----
Returns the left-tail probability `p` = Prob(:math:`Z < x`) with :math:`Z`
a normal random variable. Defaults to a standard normal random variable.
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64`
mu : float or :class:`.Expression` of type :py:data:`.tfloat64`
Mean (default = 0).
sigma: float or :class:`.Expression` of type :py:data:`.tfloat64`
Standard deviation (default = 1).
lower_tail : bool or :class:`.BooleanExpression`
If ``True``, compute the probability of an outcome at or below `x`,
otherwise greater than `x`.
log_p : bool or :class:`.BooleanExpression`
Return the natural logarithm of the probability.
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
"""
return _func("pnorm", tfloat64, x, mu, sigma, lower_tail, log_p)
[docs]@typecheck(x=expr_float64, n=expr_float64, lower_tail=expr_bool, log_p=expr_bool)
def pT(x, n, lower_tail=True, log_p=False) -> Float64Expression:
r"""The cumulative probability function of a `t-distribution
<https://en.wikipedia.org/wiki/Student%27s_t-distribution>`__ with
`n` degrees of freedom.
Examples
--------
>>> hl.eval(hl.pT(0, 10))
0.5
>>> hl.eval(hl.pT(1, 10))
0.82955343384897
>>> hl.eval(hl.pT(1, 10, lower_tail=False))
0.17044656615103004
>>> hl.eval(hl.pT(1, 10, log_p=True))
-0.186867754489647
Notes
-----
If `lower_tail` is true, returns Prob(:math:`X \leq` `x`) where :math:`X` is
a t-distributed random variable with `n` degrees of freedom. If `lower_tail`
is false, returns Prob(:math:`X` > `x`).
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64`
n : float or :class:`.Expression` of type :py:data:`.tfloat64`
Degrees of freedom of the t-distribution.
lower_tail : bool or :class:`.BooleanExpression`
If ``True``, compute the probability of an outcome at or below `x`,
otherwise greater than `x`.
log_p : bool or :class:`.BooleanExpression`
Return the natural logarithm of the probability.
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
"""
return _func("pT", tfloat64, x, n, lower_tail, log_p)
[docs]@typecheck(x=expr_float64, df1=expr_float64, df2=expr_float64, lower_tail=expr_bool, log_p=expr_bool)
def pF(x, df1, df2, lower_tail=True, log_p=False) -> Float64Expression:
r"""The cumulative probability function of a `F-distribution
<https://en.wikipedia.org/wiki/F-distribution>`__ with parameters
`df1` and `df2`.
Examples
--------
>>> hl.eval(hl.pF(0, 3, 10))
0.0
>>> hl.eval(hl.pF(1, 3, 10))
0.5676627969783028
>>> hl.eval(hl.pF(1, 3, 10, lower_tail=False))
0.4323372030216972
>>> hl.eval(hl.pF(1, 3, 10, log_p=True))
-0.566227703842908
Notes
-----
If `lower_tail` is true, returns Prob(:math:`X \leq` `x`) where :math:`X` is
a random variable with distribution :math:`F`(df1, df2). If `lower_tail`
is false, returns Prob(:math:`X` > `x`).
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64`
df1 : float or :class:`.Expression` of type :py:data:`.tfloat64`
Parameter of the F-distribution
df2 : float or :class:`.Expression` of type :py:data:`.tfloat64`
Parameter of the F-distribution
lower_tail : bool or :class:`.BooleanExpression`
If ``True``, compute the probability of an outcome at or below `x`,
otherwise greater than `x`.
log_p : bool or :class:`.BooleanExpression`
Return the natural logarithm of the probability.
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
"""
return _func("pF", tfloat64, x, df1, df2, lower_tail, log_p)
[docs]@typecheck(x=expr_float64, lamb=expr_float64, lower_tail=expr_bool, log_p=expr_bool)
def ppois(x, lamb, lower_tail=True, log_p=False) -> Float64Expression:
r"""The cumulative probability function of a Poisson distribution.
Examples
--------
>>> hl.eval(hl.ppois(2, 1))
0.9196986029286058
Notes
-----
If `lower_tail` is true, returns Prob(:math:`X \leq` `x`) where :math:`X` is a
Poisson random variable with rate parameter `lamb`. If `lower_tail` is false,
returns Prob(:math:`X` > `x`).
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64`
lamb : float or :class:`.Expression` of type :py:data:`.tfloat64`
Rate parameter of Poisson distribution.
lower_tail : bool or :class:`.BooleanExpression`
If ``True``, compute the probability of an outcome at or below `x`,
otherwise greater than `x`.
log_p : bool or :class:`.BooleanExpression`
Return the natural logarithm of the probability.
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
"""
return _func("ppois", tfloat64, x, lamb, lower_tail, log_p)
[docs]@typecheck(p=expr_float64, df=expr_float64, ncp=nullable(expr_float64), lower_tail=expr_bool, log_p=expr_bool)
def qchisqtail(p, df, ncp=None, lower_tail=False, log_p=False) -> Float64Expression:
"""The quantile function of a chi-squared distribution with `df` degrees of
freedom, inverts :func:`~.pchisqtail`.
Examples
--------
>>> hl.eval(hl.qchisqtail(0.05, 2))
5.991464547107979
>>> hl.eval(hl.qchisqtail(0.05, 2, ncp=2))
10.838131614372958
>>> hl.eval(hl.qchisqtail(0.05, 2, lower_tail=True))
0.10258658877510107
>>> hl.eval(hl.qchisqtail(hl.log(0.05), 2, log_p=True))
5.991464547107979
Notes
-----
Returns right-quantile `x` for which `p` = Prob(:math:`Z^2` > x) with
:math:`Z^2` a chi-squared random variable with degrees of freedom specified
by `df`. The probability `p` must satisfy 0 < `p` < 1.
Parameters
----------
p : float or :class:`.Expression` of type :py:data:`.tfloat64`
Probability.
df : float or :class:`.Expression` of type :py:data:`.tfloat64`
Degrees of freedom.
ncp: float or :class:`.Expression` of type :py:data:`.tfloat64`
Corresponds to `ncp` parameter in :func:`.pchisqtail`.
lower_tail : bool or :class:`.BooleanExpression`
Corresponds to `lower_tail` parameter in :func:`.pchisqtail`.
log_p : bool or :class:`.BooleanExpression`
Exponentiate `p`, corresponds to `log_p` parameter in :func:`.pchisqtail`.
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
"""
if ncp is None:
return _func("qchisqtail", tfloat64, p, df, lower_tail, log_p)
else:
return _func("qnchisqtail", tfloat64, p, df, ncp, lower_tail, log_p)
[docs]@typecheck(p=expr_float64, mu=expr_float64, sigma=expr_float64, lower_tail=expr_bool, log_p=expr_bool)
def qnorm(p, mu=0, sigma=1, lower_tail=True, log_p=False) -> Float64Expression:
"""The quantile function of a normal distribution with mean `mu` and
standard deviation `sigma`, inverts :func:`~.pnorm`. Returns quantile of
standard normal distribution by default.
Examples
--------
>>> hl.eval(hl.qnorm(0.90))
1.2815515655446008
>>> hl.eval(hl.qnorm(0.90, mu=1, sigma=2))
3.5631031310892016
>>> hl.eval(hl.qnorm(0.90, lower_tail=False))
-1.2815515655446008
>>> hl.eval(hl.qnorm(hl.log(0.90), log_p=True))
1.2815515655446008
Notes
-----
Returns left-quantile `x` for which p = Prob(:math:`Z` < x) with :math:`Z`
a normal random variable with mean `mu` and standard deviation `sigma`.
Defaults to a standard normal random variable, and the probability `p` must
satisfy 0 < `p` < 1.
Parameters
----------
p : float or :class:`.Expression` of type :py:data:`.tfloat64`
Probability.
mu : float or :class:`.Expression` of type :py:data:`.tfloat64`
Mean (default = 0).
sigma: float or :class:`.Expression` of type :py:data:`.tfloat64`
Standard deviation (default = 1).
lower_tail : bool or :class:`.BooleanExpression`
Corresponds to `lower_tail` parameter in :func:`.pnorm`.
log_p : bool or :class:`.BooleanExpression`
Exponentiate `p`, corresponds to `log_p` parameter in :func:`.pnorm`.
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
"""
return _func("qnorm", tfloat64, p, mu, sigma, lower_tail, log_p)
[docs]@typecheck(p=expr_float64, lamb=expr_float64, lower_tail=expr_bool, log_p=expr_bool)
def qpois(p, lamb, lower_tail=True, log_p=False) -> Float64Expression:
r"""The quantile function of a Poisson distribution with rate parameter
`lamb`, inverts :func:`~.ppois`.
Examples
--------
>>> hl.eval(hl.qpois(0.99, 1))
4
Notes
-----
Returns the smallest integer :math:`x` such that Prob(:math:`X \leq x`) :math:`\geq` `p` where :math:`X`
is a Poisson random variable with rate parameter `lambda`.
Parameters
----------
p : float or :class:`.Expression` of type :py:data:`.tfloat64`
lamb : float or :class:`.Expression` of type :py:data:`.tfloat64`
Rate parameter of Poisson distribution.
lower_tail : bool or :class:`.BooleanExpression`
Corresponds to `lower_tail` parameter in inverse :func:`.ppois`.
log_p : bool or :class:`.BooleanExpression`
Exponentiate `p` before testing.
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
"""
return _func("qpois", tint32, p, lamb, lower_tail, log_p)
[docs]@typecheck(start=expr_int32, stop=nullable(expr_int32), step=expr_int32)
def range(start, stop=None, step=1) -> ArrayNumericExpression:
"""Returns an array of integers from `start` to `stop` by `step`.
Examples
--------
>>> hl.eval(hl.range(10))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> hl.eval(hl.range(3, 10))
[3, 4, 5, 6, 7, 8, 9]
>>> hl.eval(hl.range(0, 10, step=3))
[0, 3, 6, 9]
Notes
-----
The range includes `start`, but excludes `stop`.
If provided exactly one argument, the argument is interpreted as `stop` and
`start` is set to zero. This matches the behavior of Python's ``range``.
Parameters
----------
start : int or :class:`.Expression` of type :py:data:`.tint32`
Start of range.
stop : int or :class:`.Expression` of type :py:data:`.tint32`
End of range.
step : int or :class:`.Expression` of type :py:data:`.tint32`
Step of range.
Returns
-------
:class:`.ArrayNumericExpression`
"""
if stop is None:
stop = start
start = hl.literal(0)
return apply_expr(
lambda sta, sto, ste: ir.toArray(ir.StreamRange(sta, sto, ste)), tarray(tint32), start, stop, step
)
@typecheck(start=expr_int32, stop=nullable(expr_int32), step=expr_int32)
def _stream_range(start, stop=None, step=1) -> StreamExpression:
if stop is None:
stop = start
start = hl.literal(0)
return apply_expr(lambda sta, sto, ste: ir.StreamRange(sta, sto, ste), tstream(tint32), start, stop, step)
@typecheck(length=expr_int32)
def zeros(length) -> ArrayNumericExpression:
"""Returns an array of zeros of length `length`.
Examples
--------
>>> hl.eval(hl.zeros(4))
[0, 0, 0, 0]
Parameters
----------
length : int or :class:`.Expression` of type :py:data:`.tint32`
length of zeros array.
Returns
-------
:class:`.ArrayInt32Expression`
"""
return apply_expr(lambda z: ir.ArrayZeros(z), tarray(tint32), length)
[docs]@typecheck(p=expr_float64, seed=nullable(int))
def rand_bool(p, seed=None) -> BooleanExpression:
"""Returns ``True`` with probability `p`.
Examples
--------
>>> hl.reset_global_randomness()
>>> hl.eval(hl.rand_bool(0.5))
False
>>> hl.eval(hl.rand_bool(0.5))
True
Parameters
----------
p : :obj:`float` or :class:`.Float64Expression`
Probability between 0 and 1.
seed : :obj:`int`, optional
Random seed.
Returns
-------
:class:`.BooleanExpression`
"""
return _seeded_func("rand_bool", tbool, seed, p)
[docs]@typecheck(mean=expr_float64, sd=expr_float64, seed=nullable(int), size=nullable(tupleof(expr_int64)))
def rand_norm(mean=0, sd=1, seed=None, size=None) -> Float64Expression:
"""Samples from a normal distribution with mean `mean` and standard
deviation `sd`.
Examples
--------
>>> hl.reset_global_randomness()
>>> hl.eval(hl.rand_norm())
0.347110923255205
>>> hl.eval(hl.rand_norm())
-0.9281375348070483
Parameters
----------
mean : :obj:`float` or :class:`.Float64Expression`
Mean of normal distribution.
sd : float or :class:`.Expression` of type :py:data:`.tfloat64`
Standard deviation of normal distribution.
seed : :obj:`int`, optional
Random seed.
size : :obj:`int` or :obj:`tuple` of :obj:`int`, optional
Returns
-------
:class:`.Float64Expression`
"""
if size is None:
return _seeded_func("rand_norm", tfloat64, seed, mean, sd)
else:
(nrows, ncols) = size
return _seeded_func("rand_norm_nd", tndarray(tfloat64, 2), seed, nrows, ncols, mean, sd)
@typecheck(mean=nullable(expr_array(expr_float64)), cov=nullable(expr_array(expr_float64)), seed=nullable(int))
def rand_norm2d(mean=None, cov=None, seed=None) -> ArrayNumericExpression:
"""Samples from a normal distribution with mean `mean` and covariance matrix `cov`.
Examples
--------
>>> hl.reset_global_randomness()
>>> hl.eval(hl.rand_norm2d())
[-1.3909495945443346, 1.2805588680053859]
>>> hl.eval(hl.rand_norm2d())
[0.289520302334123, -1.1108917435930954]
Notes
-----
The covariance of a 2d normal distribution is a 2x2 symmetric matrix
[[a, b], [b, c]]. This is specified in `cov` as a length 3 array [a, b, c].
The covariance matrix must be positive semi-definite, i.e. a>0, c>0, and
a*c - b^2 > 0.
If `mean` and `cov` are both None, draws from the standard 2d normal
distribution.
Parameters
----------
mean : :class:`.ArrayNumericExpression`, optional
Mean of normal distribution. Array of length 2.
cov : :class:`.ArrayNumericExpression`, optional
Covariance of normal distribution. Array of length 3.
seed : :obj:`int`, optional
Random seed.
Returns
-------
:class:`.ArrayFloat64Expression`
"""
if mean is None:
mean = [0, 0]
if cov is None:
cov = [1, 0, 1]
def f(mean, cov):
m1 = mean[0]
m2 = mean[1]
s11 = cov[0]
s12 = cov[1]
s22 = cov[2]
x = hl.range(0, 2).map(lambda i: rand_norm(seed=seed))
return hl.rbind(
hl.sqrt(s11),
lambda root_s11: hl.array([
m1 + root_s11 * x[0],
m2 + (s12 / root_s11) * x[0] + hl.sqrt(s22 - s12 * s12 / s11) * x[1],
]),
)
return hl.rbind(mean, cov, f)
[docs]@typecheck(lamb=expr_float64, seed=nullable(int))
def rand_pois(lamb, seed=None) -> Float64Expression:
"""Samples from a `Poisson distribution
<https://en.wikipedia.org/wiki/Poisson_distribution>`__ with rate parameter
`lamb`.
Examples
--------
>>> hl.reset_global_randomness()
>>> hl.eval(hl.rand_pois(1))
4.0
>>> hl.eval(hl.rand_pois(1))
4.0
Parameters
----------
lamb : :obj:`float` or :class:`.Float64Expression`
Rate parameter for Poisson distribution.
seed : :obj:`int`, optional
Random seed.
Returns
-------
:class:`.Float64Expression`
"""
return _seeded_func("rand_pois", tfloat64, seed, lamb)
[docs]@typecheck(lower=expr_float64, upper=expr_float64, seed=nullable(int), size=nullable(tupleof(expr_int64)))
def rand_unif(lower=0.0, upper=1.0, seed=None, size=None) -> Float64Expression:
"""Samples from a uniform distribution within the interval
[`lower`, `upper`].
Examples
--------
>>> hl.reset_global_randomness()
>>> hl.eval(hl.rand_unif())
0.9828239225846387
>>> hl.eval(hl.rand_unif(0, 1))
0.49094525115847415
>>> hl.eval(hl.rand_unif(0, 1))
0.3972543766997359
Parameters
----------
lower : :obj:`float` or :class:`.Float64Expression`
Left boundary of range. Defaults to 0.0.
upper : :obj:`float` or :class:`.Float64Expression`
Right boundary of range. Defaults to 1.0.
seed : :obj:`int`, optional
Random seed.
size : :obj:`int` or :obj:`tuple` of :obj:`int`, optional
Returns
-------
:class:`.Float64Expression`
"""
if size is None:
return _seeded_func("rand_unif", tfloat64, seed, lower, upper)
else:
(nrows, ncols) = size
return _seeded_func("rand_unif_nd", tndarray(tfloat64, 2), seed, nrows, ncols, lower, upper)
[docs]@typecheck(a=expr_int32, b=nullable(expr_int32), seed=nullable(int))
def rand_int32(a, b=None, *, seed=None) -> Int32Expression:
"""Samples from a uniform distribution of 32-bit integers.
If b is `None`, samples from the uniform distribution over [0, a). Otherwise, sample from the
uniform distribution over [a, b).
Examples
--------
>>> hl.reset_global_randomness()
>>> hl.eval(hl.rand_int32(10))
9
>>> hl.eval(hl.rand_int32(10, 15))
14
>>> hl.eval(hl.rand_int32(10, 15))
12
Parameters
----------
a : :obj:`int` or :class:`.Int32Expression`
If b is `None`, the right boundary of the range; otherwise, the left boundary of range.
b : :obj:`int` or :class:`.Int32Expression`
If specified, the right boundary of the range.
seed : :obj:`int`, optional
Random seed.
Returns
-------
:class:`.Int32Expression`
"""
if b is None:
return _seeded_func("rand_int32", tint32, seed, a)
return _seeded_func("rand_int32", tint32, seed, b - a) + a
[docs]@typecheck(a=nullable(expr_int64), b=nullable(expr_int64), seed=nullable(int))
def rand_int64(a=None, b=None, *, seed=None) -> Int64Expression:
"""Samples from a uniform distribution of 64-bit integers.
If a and b are both specified, samples from the uniform distribution over [a, b).
If b is `None`, samples from the uniform distribution over [0, a).
If both a and b are `None` samples from the uniform distribution over all
64-bit integers.
Examples
--------
>>> hl.reset_global_randomness()
>>> hl.eval(hl.rand_int64(10))
9
>>> hl.eval(hl.rand_int64(1 << 33, 1 << 35))
33089740109
>>> hl.eval(hl.rand_int64(1 << 33, 1 << 35))
18195458570
Parameters
----------
a : :obj:`int` or :class:`.Int64Expression`
If b is `None`, the right boundary of the range; otherwise, the left boundary of range.
b : :obj:`int` or :class:`.Int64Expression`
If specified, the right boundary of the range.
seed : :obj:`int`, optional
Random seed.
Returns
-------
:class:`.Int64Expression`
"""
if a is None:
return _seeded_func("rand_int64", tint64, seed)
if b is None:
return _seeded_func("rand_int64", tint64, seed, a)
return _seeded_func("rand_int64", tint64, seed, b - a) + a
[docs]@typecheck(
a=expr_float64, b=expr_float64, lower=nullable(expr_float64), upper=nullable(expr_float64), seed=nullable(int)
)
def rand_beta(a, b, lower=None, upper=None, seed=None) -> Float64Expression:
"""Samples from a `beta distribution
<https://en.wikipedia.org/wiki/Beta_distribution>`__ with parameters `a`
(alpha) and `b` (beta).
Notes
-----
The optional parameters `lower` and `upper` represent a truncated beta
distribution with parameters a and b and support `[lower, upper]`. Draws are
made via rejection sampling, i.e. returning the first draw from Beta(a,b)
that falls in range `[lower, upper]`. This procedure may be slow if the
probability mass of Beta(a,b) over `[lower, upper]` is small.
Examples
--------
>>> hl.reset_global_randomness()
>>> hl.eval(hl.rand_beta(0.5, 0.5))
0.30607924177641355
>>> hl.eval(hl.rand_beta(2, 5))
0.1103872607301062
Parameters
----------
a : :obj:`float` or :class:`.Float64Expression`
b : :obj:`float` or :class:`.Float64Expression`
lower : :obj:`float` or :class:`.Float64Expression`, optional
Lower boundary of truncated beta distribution.
upper : :obj:`float` or :class:`.Float64Expression`, optional
Upper boundary of truncated beta distribution.
seed : :obj:`int`, optional
Random seed.
Returns
-------
:class:`.Float64Expression`
"""
if lower is None and upper is None:
return _seeded_func("rand_beta", tfloat64, seed, a, b)
if lower is None:
lower = hl.literal(0)
if upper is None:
upper = hl.literal(1)
return _seeded_func("rand_beta", tfloat64, seed, a, b, lower, upper)
[docs]@typecheck(shape=expr_float64, scale=expr_float64, seed=nullable(int))
def rand_gamma(shape, scale, seed=None) -> Float64Expression:
"""Samples from a `gamma distribution
<https://en.wikipedia.org/wiki/Gamma_distribution>`__
with parameters `shape` and `scale`.
Examples
--------
>>> hl.reset_global_randomness()
>>> hl.eval(hl.rand_gamma(1, 1))
3.115449479063202
>>> hl.eval(hl.rand_gamma(1, 1))
3.077698059931638
Parameters
----------
shape : :obj:`float` or :class:`.Float64Expression`
scale : :obj:`float` or :class:`.Float64Expression`
seed : :obj:`int`, optional
Random seed.
Returns
-------
:class:`.Float64Expression`
"""
return _seeded_func("rand_gamma", tfloat64, seed, shape, scale)
[docs]@typecheck(prob=expr_array(expr_float64), seed=nullable(int))
def rand_cat(prob, seed=None) -> Int32Expression:
"""Samples from a `categorical distribution
<https://en.wikipedia.org/wiki/Categorical_distribution>`__.
Notes
-----
The categories correspond to the indices of `prob`, an unnormalized
probability mass function. The probability of drawing index ``i`` is
``prob[i]/sum(prob)``.
Warning
-------
This function may be slow when the number of categories is large.
Examples
--------
>>> hl.reset_global_randomness()
>>> hl.eval(hl.rand_cat([0, 1.7, 2]))
2
>>> hl.eval(hl.rand_cat([0, 1.7, 2]))
2
Parameters
----------
prob : :obj:`list` of float or :class:`.ArrayExpression` of type :py:data:`.tfloat64`
seed : :obj:`int` or `None`
If not `None`, function will be seeded with provided seed.
Returns
-------
:class:`.Int32Expression`
"""
return _seeded_func("rand_cat", tint32, seed, prob)
[docs]@typecheck(a=expr_array(expr_float64), seed=nullable(int))
def rand_dirichlet(a, seed=None) -> ArrayExpression:
"""Samples from a `Dirichlet distribution
<https://en.wikipedia.org/wiki/Dirichlet_distribution>`__.
Examples
--------
>>> hl.reset_global_randomness()
>>> hl.eval(hl.rand_dirichlet([1, 1, 1]))
[0.6987619676833735, 0.287566556865261, 0.013671475451365567]
>>> hl.eval(hl.rand_dirichlet([1, 1, 1]))
[0.16299928555608242, 0.04393664153526524, 0.7930640729086523]
Parameters
----------
a : :obj:`list` of float or :class:`.ArrayExpression` of type :py:data:`.tfloat64`
Array of non-negative concentration parameters.
seed : :obj:`int`, optional
Random seed.
Returns
-------
:class:`.Float64Expression`
"""
return hl.bind(lambda x: x / hl.sum(x), a.map(lambda p: hl.if_else(p == 0.0, 0.0, hl.rand_gamma(p, 1, seed=seed))))
[docs]@typecheck(x=oneof(expr_float64, expr_ndarray(expr_float64)))
@ndarray_broadcasting
def sqrt(x) -> Float64Expression:
"""Returns the square root of `x`.
Examples
--------
>>> hl.eval(hl.sqrt(3))
1.7320508075688772
Notes
-----
It is also possible to exponentiate expression with standard Python syntax,
e.g. ``x ** 0.5``.
Parameters
----------
x : float or :class:`.Expression` of type :py:data:`.tfloat64` or :class:`.NDArrayNumericExpression`
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64` or :class:`.NDArrayNumericExpression`
"""
return _func("sqrt", tfloat64, x)
[docs]@typecheck(x=expr_array(expr_float64), y=expr_array(expr_float64))
def corr(x, y) -> Float64Expression:
"""Compute the
`Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`__
between `x` and `y`.
Examples
--------
>>> hl.eval(hl.corr([1, 2, 4], [2, 3, 1]))
-0.6546536707079772
Notes
-----
Only indices where both `x` and `y` are non-missing will be included in the
calculation.
If `x` and `y` have length zero, then the result is missing.
Parameters
----------
x : :class:`.Expression` of type ``array<tfloat64>``
y : :class:`.Expression` of type ``array<tfloat64>``
Returns
-------
:class:`.Float64Expression`
"""
return _func("corr", tfloat64, x, y)
[docs]@typecheck(ref=expr_str, alt=expr_str)
@ir.udf(tstr, tstr)
def numeric_allele_type(ref, alt) -> Int32Expression:
"""Returns the type of the polymorphism as an integer. The value returned
is the integer value of :class:`.AlleleType` representing that kind of
polymorphism.
Examples
--------
>>> hl.eval(hl.numeric_allele_type('A', 'T')) == AlleleType.SNP
True
Notes
-----
The values of :class:`.AlleleType` are not stable and thus should not be
relied upon across hail versions.
"""
_base_regex = "^([ACGTNM])+$"
_symbolic_regex = r"(^\.)|(\.$)|(^<)|(>$)|(\[)|(\])"
return hl.bind(
lambda r, a: hl.if_else(
r.matches(_base_regex),
hl.case()
.when(
a.matches(_base_regex),
hl.case()
.when(
r.length() == a.length(),
hl.if_else(
r.length() == 1,
hl.if_else(r != a, AlleleType.SNP, AlleleType.UNKNOWN),
hl.if_else(hamming(r, a) == 1, AlleleType.SNP, AlleleType.MNP),
),
)
.when((r.length() < a.length()) & (r[0] == a[0]) & a.endswith(r[1:]), AlleleType.INSERTION)
.when((r[0] == a[0]) & r.endswith(a[1:]), AlleleType.DELETION)
.default(AlleleType.COMPLEX),
)
.when(a == '*', AlleleType.STAR)
.when(a.matches(_symbolic_regex), AlleleType.SYMBOLIC)
.default(AlleleType.UNKNOWN),
AlleleType.UNKNOWN,
),
ref,
alt,
)
@deprecated(version='0.2.129', reason="Replaced by the public numeric_allele_type")
@typecheck(ref=expr_str, alt=expr_str)
def _num_allele_type(ref, alt) -> Int32Expression:
"""Provided for backwards compatibility, don't use it in new code, or
within the hail library itself
"""
return numeric_allele_type(ref, alt)
[docs]@typecheck(ref=expr_str, alt=expr_str)
def is_snp(ref, alt) -> BooleanExpression:
"""Returns ``True`` if the alleles constitute a single nucleotide polymorphism.
Examples
--------
>>> hl.eval(hl.is_snp('A', 'T'))
True
Parameters
----------
ref : :class:`.StringExpression`
Reference allele.
alt : :class:`.StringExpression`
Alternate allele.
Returns
-------
:class:`.BooleanExpression`
"""
return numeric_allele_type(ref, alt) == AlleleType.SNP
[docs]@typecheck(ref=expr_str, alt=expr_str)
def is_mnp(ref, alt) -> BooleanExpression:
"""Returns ``True`` if the alleles constitute a multiple nucleotide polymorphism.
Examples
--------
>>> hl.eval(hl.is_mnp('AA', 'GT'))
True
Parameters
----------
ref : :class:`.StringExpression`
Reference allele.
alt : :class:`.StringExpression`
Alternate allele.
Returns
-------
:class:`.BooleanExpression`
"""
return numeric_allele_type(ref, alt) == AlleleType.MNP
[docs]@typecheck(ref=expr_str, alt=expr_str)
def is_transition(ref, alt) -> BooleanExpression:
"""Returns ``True`` if the alleles constitute a transition.
Examples
--------
>>> hl.eval(hl.is_transition('A', 'T'))
False
>>> hl.eval(hl.is_transition('AAA', 'AGA'))
True
Parameters
----------
ref : :class:`.StringExpression`
Reference allele.
alt : :class:`.StringExpression`
Alternate allele.
Returns
-------
:class:`.BooleanExpression`
"""
return is_snp(ref, alt) & _is_snp_transition(ref, alt)
[docs]@typecheck(ref=expr_str, alt=expr_str)
def is_transversion(ref, alt) -> BooleanExpression:
"""Returns ``True`` if the alleles constitute a transversion.
Examples
--------
>>> hl.eval(hl.is_transversion('A', 'T'))
True
>>> hl.eval(hl.is_transversion('AAA', 'AGA'))
False
Parameters
----------
ref : :class:`.StringExpression`
Reference allele.
alt : :class:`.StringExpression`
Alternate allele.
Returns
-------
:class:`.BooleanExpression`
"""
return is_snp(ref, alt) & (~(_is_snp_transition(ref, alt)))
@typecheck(ref=expr_str, alt=expr_str)
@ir.udf(tstr, tstr)
def _is_snp_transition(ref, alt) -> BooleanExpression:
indices = hl.range(0, ref.length())
return hl.any(
lambda i: (
(ref[i] != alt[i])
& (
((ref[i] == 'A') & (alt[i] == 'G'))
| ((ref[i] == 'G') & (alt[i] == 'A'))
| ((ref[i] == 'C') & (alt[i] == 'T'))
| ((ref[i] == 'T') & (alt[i] == 'C'))
)
),
indices,
)
[docs]@typecheck(ref=expr_str, alt=expr_str)
def is_insertion(ref, alt) -> BooleanExpression:
"""Returns ``True`` if the alleles constitute an insertion.
Examples
--------
>>> hl.eval(hl.is_insertion('A', 'ATT'))
True
Parameters
----------
ref : :class:`.StringExpression`
Reference allele.
alt : :class:`.StringExpression`
Alternate allele.
Returns
-------
:class:`.BooleanExpression`
"""
return numeric_allele_type(ref, alt) == AlleleType.INSERTION
[docs]@typecheck(ref=expr_str, alt=expr_str)
def is_deletion(ref, alt) -> BooleanExpression:
"""Returns ``True`` if the alleles constitute a deletion.
Examples
--------
>>> hl.eval(hl.is_deletion('ATT', 'A'))
True
Parameters
----------
ref : :class:`.StringExpression`
Reference allele.
alt : :class:`.StringExpression`
Alternate allele.
Returns
-------
:class:`.BooleanExpression`
"""
return numeric_allele_type(ref, alt) == AlleleType.DELETION
[docs]@typecheck(ref=expr_str, alt=expr_str)
def is_indel(ref, alt) -> BooleanExpression:
"""Returns ``True`` if the alleles constitute an insertion or deletion.
Examples
--------
>>> hl.eval(hl.is_indel('ATT', 'A'))
True
Parameters
----------
ref : :class:`.StringExpression`
Reference allele.
alt : :class:`.StringExpression`
Alternate allele.
Returns
-------
:class:`.BooleanExpression`
"""
return hl.bind(lambda t: (t == AlleleType.INSERTION) | (t == AlleleType.DELETION), numeric_allele_type(ref, alt))
[docs]@typecheck(ref=expr_str, alt=expr_str)
def is_star(ref, alt) -> BooleanExpression:
"""Returns ``True`` if the alleles constitute an upstream deletion.
Examples
--------
>>> hl.eval(hl.is_star('A', '*'))
True
Parameters
----------
ref : :class:`.StringExpression`
Reference allele.
alt : :class:`.StringExpression`
Alternate allele.
Returns
-------
:class:`.BooleanExpression`
"""
return numeric_allele_type(ref, alt) == AlleleType.STAR
[docs]@typecheck(ref=expr_str, alt=expr_str)
def is_complex(ref, alt) -> BooleanExpression:
"""Returns ``True`` if the alleles constitute a complex polymorphism.
Examples
--------
>>> hl.eval(hl.is_complex('ATT', 'GCAC'))
True
Parameters
----------
ref : :class:`.StringExpression`
Reference allele.
alt : :class:`.StringExpression`
Alternate allele.
Returns
-------
:class:`.BooleanExpression`
"""
return numeric_allele_type(ref, alt) == AlleleType.COMPLEX
[docs]@typecheck(ref=expr_str, alt=expr_str)
def is_strand_ambiguous(ref, alt) -> BooleanExpression:
"""Returns ``True`` if the alleles are strand ambiguous.
Strand ambiguous allele pairs are ``A/T``, ``T/A``,
``C/G``, and ``G/C`` where the first allele is `ref`
and the second allele is `alt`.
Examples
--------
>>> hl.eval(hl.is_strand_ambiguous('A', 'T'))
True
Parameters
----------
ref : :class:`.StringExpression`
Reference allele.
alt : :class:`.StringExpression`
Alternate allele.
Returns
-------
:class:`.BooleanExpression`
"""
alleles = hl.literal({('A', 'T'), ('T', 'A'), ('G', 'C'), ('C', 'G')})
return alleles.contains((ref, alt))
[docs]@typecheck(ref=expr_str, alt=expr_str)
def allele_type(ref, alt) -> StringExpression:
"""Returns the type of the polymorphism as a string.
Examples
--------
>>> hl.eval(hl.allele_type('A', 'T'))
'SNP'
>>> hl.eval(hl.allele_type('ATT', 'A'))
'Deletion'
Notes
-----
The possible return values are:
- ``"SNP"``
- ``"MNP"``
- ``"Insertion"``
- ``"Deletion"``
- ``"Complex"``
- ``"Star"``
- ``"Symbolic"``
- ``"Unknown"``
Parameters
----------
ref : :class:`.StringExpression`
Reference allele.
alt : :class:`.StringExpression`
Alternate allele.
Returns
-------
:class:`.StringExpression`
"""
return hl.literal(AlleleType.strings())[numeric_allele_type(ref, alt)]
[docs]@typecheck(s1=expr_str, s2=expr_str)
def hamming(s1, s2) -> Int32Expression:
"""Returns the Hamming distance between the two strings.
Examples
--------
>>> hl.eval(hl.hamming('ATATA', 'ATGCA'))
2
>>> hl.eval(hl.hamming('abcdefg', 'zzcdefz'))
3
Notes
-----
This method will fail if the two strings have different length.
Parameters
----------
s1 : :class:`.StringExpression`
First string.
s2 : :class:`.StringExpression`
Second string.
Returns
-------
:class:`.Expression` of type :py:data:`.tint32`
"""
return _func("hamming", tint32, s1, s2)
[docs]@typecheck(s=expr_str)
def entropy(s) -> Float64Expression:
r"""Returns the `Shannon entropy <https://en.wikipedia.org/wiki/Entropy_(information_theory)>`__
of the character distribution defined by the string.
Examples
--------
>>> hl.eval(hl.entropy('ac'))
1.0
>>> hl.eval(hl.entropy('accctg'))
1.7924812503605778
Notes
-----
For a string of length :math:`n` with :math:`k` unique characters
:math:`\{ c_1, \dots, c_k \}`, let :math:`p_i` be the probability that
a randomly chosen character is :math:`c_i`, e.g. the number of instances
of :math:`c_i` divided by :math:`n`. Then the base-2 Shannon entropy is
given by
.. math::
H = \sum_{i=1}^k p_i \log_2(p_i).
Parameters
----------
s : :class:`.StringExpression`
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
"""
return _func("entropy", tfloat64, s)
@typecheck(x=expr_any, trunc=nullable(expr_int32))
def _showstr(x, trunc=None):
if trunc is None:
return _func("showStr", tstr, x)
return _func("showStr", tstr, x, trunc)
[docs]@typecheck(x=expr_any)
def str(x) -> StringExpression:
"""Returns the string representation of `x`.
Examples
--------
>>> hl.eval(hl.str(hl.struct(a=5, b=7)))
'{"a":5,"b":7}'
Parameters
----------
x
Returns
-------
:class:`.StringExpression`
"""
if x.dtype == tstr:
return x
else:
return _func("str", tstr, x)
[docs]@typecheck(c=expr_call, i=expr_int32)
def downcode(c, i) -> CallExpression:
"""Create a new call by setting all alleles other than i to ref
Examples
--------
Preserve the third allele and downcode all other alleles to reference.
>>> hl.eval(hl.downcode(hl.call(1, 2), 2))
Call(alleles=[0, 1], phased=False)
>>> hl.eval(hl.downcode(hl.call(2, 2), 2))
Call(alleles=[1, 1], phased=False)
>>> hl.eval(hl.downcode(hl.call(0, 1), 2))
Call(alleles=[0, 0], phased=False)
Parameters
----------
c : :class:`.CallExpression`
A call.
i : :class:`.Expression` of type :py:data:`.tint32`
The index of the allele that will be sent to the alternate allele. All
other alleles will be downcoded to reference.
Returns
-------
:class:`.CallExpression`
"""
return _func("downcode", tcall, c, i)
@typecheck(pl=expr_array(expr_int32))
def gq_from_pl(pl) -> Int32Expression:
"""Compute genotype quality from Phred-scaled probability likelihoods.
Examples
--------
>>> hl.eval(hl.gq_from_pl([0, 69, 1035]))
69
Parameters
----------
pl : :class:`.Expression` of type :class:`.tarray` of :obj:`.tint32`.
Returns
-------
:class:`.Expression` of type :py:data:`.tint32`
"""
return _func("gqFromPL", tint32, pl)
[docs]@typecheck(n=expr_int32)
def triangle(n) -> Int32Expression:
"""Returns the triangle number of `n`.
Examples
--------
>>> hl.eval(hl.triangle(3))
6
Notes
-----
The calculation is ``n * (n + 1) / 2``.
Parameters
----------
n : :class:`.Expression` of type :py:data:`.tint32`
Returns
-------
:class:`.Expression` of type :py:data:`.tint32`
"""
return _func("triangle", tint32, n)
[docs]@typecheck(f=func_spec(1, expr_bool), collection=expr_oneof(expr_set(), expr_array()))
def filter(f: Callable, collection):
"""Returns a new collection containing elements where `f` returns ``True``.
Examples
--------
>>> a = [1, 2, 3, 4]
>>> s = {'Alice', 'Bob', 'Charlie'}
>>> hl.eval(hl.filter(lambda x: x % 2 == 0, a))
[2, 4]
>>> hl.eval(hl.filter(lambda x: ~(x[-1] == 'e'), s))
{'Bob'}
Notes
-----
Returns a same-type expression; evaluated on a :class:`.SetExpression`, returns a
:class:`.SetExpression`. Evaluated on an :class:`.ArrayExpression`,
returns an :class:`.ArrayExpression`.
Parameters
----------
f : function ( (arg) -> :class:`.BooleanExpression`)
Function to evaluate for each element of the collection. Must return a
:class:`.BooleanExpression`.
collection : :class:`.ArrayExpression` or :class:`.SetExpression`.
Array or set expression to filter.
Returns
-------
:class:`.ArrayExpression` or :class:`.SetExpression`
Expression of the same type as `collection`.
"""
return collection.filter(f)
collection_type = expr_oneof(expr_set(), expr_array())
any_to_bool_type = func_spec(1, expr_bool)
[docs]def any(*args) -> BooleanExpression:
"""Check for any ``True`` in boolean expressions or collections of booleans.
:func:`~.any` comes in three forms:
1. ``hl.any(boolean, ...)``. Is at least one argument ``True``?
2. ``hl.any(collection)``. Is at least one element of this collection ``True``?
3. ``hl.any(function, collection)``. Does ``function`` return ``True`` for at
least one value in this collection?
Examples
--------
The first form:
>>> hl.eval(hl.any())
False
>>> hl.eval(hl.any(True))
True
>>> hl.eval(hl.any(False))
False
>>> hl.eval(hl.any(False, False, True, False))
True
The second form:
>>> hl.eval(hl.any([False, True, False]))
True
>>> hl.eval(hl.any([False, False, False]))
False
The third form:
>>> a = ['The', 'quick', 'brown', 'fox']
>>> s = {1, 3, 5, 6, 7, 9}
>>> hl.eval(hl.any(lambda x: x[-1] == 'x', a))
True
>>> hl.eval(hl.any(lambda x: x % 4 == 0, s))
False
Notes
-----
:func:`~.any` returns ``False`` when given an empty array or empty argument list.
"""
base = hl.literal(False)
if builtins.len(args) == 0:
return base
if builtins.len(args) == 1:
arg = arg_check(args[0], 'any', 'collection', oneof(collection_type, expr_bool))
if arg.dtype == hl.tbool:
return arg
return arg.any(lambda x: x)
if builtins.len(args) == 2:
if callable(args[0]):
f = arg_check(args[0], 'any', 'f', any_to_bool_type)
collection = arg_check(args[1], 'any', 'collection', collection_type)
return collection.any(f)
n_args = builtins.len(args)
args = [args_check(x, 'any', 'exprs', i, n_args, expr_bool) for i, x in builtins.enumerate(args)]
return functools.reduce(operator.ior, args, base)
[docs]def all(*args) -> BooleanExpression:
"""Check for all ``True`` in boolean expressions or collections of booleans.
:func:`~.all` comes in three forms:
1. ``hl.all(boolean, ...)``. Are all arguments ``True``?
2. ``hl.all(collection)``. Are all elements of the collection ``True``?
3. ``hl.all(function, collection)``. Does ``function`` return ``True`` for
all values in this collection?
Examples
--------
The first form:
>>> hl.eval(hl.all())
True
>>> hl.eval(hl.all(True))
True
>>> hl.eval(hl.all(False))
False
>>> hl.eval(hl.all(True, True, True))
True
>>> hl.eval(hl.all(False, False, True, False))
False
The second form:
>>> hl.eval(hl.all([False, True, False]))
False
>>> hl.eval(hl.all([True, True, True]))
True
The third form:
>>> a = ['The', 'quick', 'brown', 'fox']
>>> s = {1, 3, 5, 6, 7, 9}
>>> hl.eval(hl.all(lambda x: hl.len(x) > 3, a))
False
>>> hl.eval(hl.all(lambda x: x < 10, s))
True
Notes
-----
:func:`~.all` returns ``True`` when given an empty array or empty argument list.
"""
base = hl.literal(True)
if builtins.len(args) == 0:
return base
if builtins.len(args) == 1:
arg = arg_check(args[0], 'any', 'collection', oneof(collection_type, expr_bool))
if arg.dtype == hl.tbool:
return arg
return arg.all(lambda x: x)
if builtins.len(args) == 2:
if callable(args[0]):
f = arg_check(args[0], 'all', 'f', any_to_bool_type)
collection = arg_check(args[1], 'all', 'collection', collection_type)
return collection.all(f)
n_args = builtins.len(args)
args = [args_check(x, 'all', 'exprs', i, n_args, expr_bool) for i, x in builtins.enumerate(args)]
return functools.reduce(operator.iand, args, base)
[docs]@typecheck(f=func_spec(1, expr_bool), collection=expr_oneof(expr_set(), expr_array()))
def find(f: Callable, collection):
"""Returns the first element where `f` returns ``True``.
Examples
--------
>>> a = ['The', 'quick', 'brown', 'fox']
>>> s = {1, 3, 5, 6, 7, 9}
>>> hl.eval(hl.find(lambda x: x[-1] == 'x', a))
'fox'
>>> hl.eval(hl.find(lambda x: x % 4 == 0, s))
None
Notes
-----
If `f` returns ``False`` for every element, then the result is missing.
Sets are unordered. If `collection` is of type :class:`.tset`, then the
element returned comes from no guaranteed ordering.
Parameters
----------
f : function ( (arg) -> :class:`.BooleanExpression`)
Function to evaluate for each element of the collection. Must return a
:class:`.BooleanExpression`.
collection : :class:`.ArrayExpression` or :class:`.SetExpression`
Collection expression.
Returns
-------
:class:`.Expression`
Expression whose type is the element type of the collection.
"""
return collection.find(f)
[docs]@typecheck(f=func_spec(1, expr_any), collection=expr_oneof(expr_set(), expr_array()))
def flatmap(f: Callable, collection):
"""Map each element of the collection to a new collection, and flatten the results.
Examples
--------
>>> a = [[0, 1], [1, 2], [4, 5, 6, 7]]
>>> hl.eval(hl.flatmap(lambda x: x[1:], a))
[1, 2, 5, 6, 7]
Parameters
----------
f : function ( (arg) -> :class:`.CollectionExpression`)
Function from the element type of the collection to the type of the
collection. For instance, `flatmap` on a ``set<str>`` should take
a ``str`` and return a ``set``.
collection : :class:`.ArrayExpression` or :class:`.SetExpression`
Collection expression.
Returns
-------
:class:`.ArrayExpression` or :class:`.SetExpression`
"""
expected_type, s = (tarray, 'Array') if isinstance(collection.dtype, tarray) else (tset, 'Set')
def unify_ret(t):
if not isinstance(t, expected_type):
raise TypeError("'flatmap' expects 'f' to return an expression of type '{}', found '{}'".format(s, t))
return t
return collection.flatmap(f)
[docs]@typecheck(f=func_spec(1, expr_any), collection=expr_oneof(expr_set(), expr_array()))
def group_by(f: Callable, collection) -> DictExpression:
"""Group collection elements into a dict according to a lambda function.
Examples
--------
>>> a = ['The', 'quick', 'brown', 'fox']
>>> hl.eval(hl.group_by(lambda x: hl.len(x), a))
{3: ['The', 'fox'], 5: ['quick', 'brown']}
Parameters
----------
f : function ( (arg) -> :class:`.Expression`)
Function to evaluate for each element of the collection to produce a key for the
resulting dictionary.
collection : :class:`.ArrayExpression` or :class:`.SetExpression`
Collection expression.
Returns
-------
:class:`.DictExpression`.
Dictionary keyed by results of `f`.
"""
return collection.group_by(f)
[docs]@typecheck(f=func_spec(2, expr_any), zero=expr_any, collection=expr_oneof(expr_set(), expr_array()))
def fold(f: Callable, zero, collection) -> Expression:
"""Reduces a collection with the given function `f`, provided the initial value `zero`.
Examples
--------
>>> a = [0, 1, 2]
>>> hl.eval(hl.fold(lambda i, j: i + j, 0, a))
3
Parameters
----------
f : function ( (:class:`.Expression`, :class:`.Expression`) -> :class:`.Expression`)
Function which takes the cumulative value and the next element, and
returns a new value.
zero : :class:`.Expression`
Initial value to pass in as left argument of `f`.
collection : :class:`.ArrayExpression` or :class:`.SetExpression`
Returns
-------
:class:`.Expression`
"""
return collection.fold(lambda x, y: f(x, y), zero)
[docs]@typecheck(f=func_spec(2, expr_any), zero=expr_any, a=expr_array())
def array_scan(f: Callable, zero, a) -> ArrayExpression:
"""Map each element of `a` to cumulative value of function `f`, with initial value `zero`.
Examples
--------
>>> a = [0, 1, 2]
>>> hl.eval(hl.array_scan(lambda i, j: i + j, 0, a))
[0, 0, 1, 3]
Parameters
----------
f : function ( (:class:`.Expression`, :class:`.Expression`) -> :class:`.Expression`)
Function which takes the cumulative value and the next element, and
returns a new value.
zero : :class:`.Expression`
Initial value to pass in as left argument of `f`.
a : :class:`.ArrayExpression`
Returns
-------
:class:`.ArrayExpression`.
"""
return a.scan(lambda x, y: f(x, y), zero)
@typecheck(streams=expr_stream(), fill_missing=bool)
def _zip_streams(*streams, fill_missing: bool = False) -> StreamExpression:
n_streams = builtins.len(streams)
uids = [Env.get_uid() for _ in builtins.range(n_streams)]
types = [stream._type.element_type for stream in streams]
body_ir = ir.MakeTuple([ir.Ref(uid, type) for uid, type in builtins.zip(uids, types)])
indices, aggregations = unify_all(*streams)
behavior = 'ExtendNA' if fill_missing else 'TakeMinLength'
return construct_expr(
ir.StreamZip([s._ir for s in streams], uids, body_ir, behavior),
tstream(ttuple(*(s.dtype.element_type for s in streams))),
indices,
aggregations,
)
[docs]@typecheck(arrays=expr_array(), fill_missing=bool)
def zip(*arrays, fill_missing: bool = False) -> ArrayExpression:
"""Zip together arrays into a single array.
Examples
--------
>>> hl.eval(hl.zip([1, 2, 3], [4, 5, 6]))
[(1, 4), (2, 5), (3, 6)]
If the arrays are different lengths, the behavior is decided by the `fill_missing` parameter.
>>> hl.eval(hl.zip([1], [10, 20], [100, 200, 300]))
[(1, 10, 100)]
>>> hl.eval(hl.zip([1], [10, 20], [100, 200, 300], fill_missing=True))
[(1, 10, 100), (None, 20, 200), (None, None, 300)]
Notes
-----
The element type of the resulting array is a :class:`.ttuple` with a field
for each array.
Parameters
----------
arrays: : variable-length args of :class:`.ArrayExpression`
Array expressions.
fill_missing : :obj:`bool`
If ``False``, return an array with length equal to the shortest length
of the `arrays`. If ``True``, return an array equal to the longest
length of the `arrays`, by extending the shorter arrays with missing
values.
Returns
-------
:class:`.ArrayExpression`
"""
return _zip_streams(*(a._to_stream() for a in arrays), fill_missing=fill_missing).to_array()
def _zip_func(*arrays, fill_missing=False, f):
n_arrays = builtins.len(arrays)
uids = [Env.get_uid() for _ in builtins.range(n_arrays)]
refs = [
construct_expr(ir.Ref(uid, a.dtype.element_type), a.dtype.element_type, a._indices, a._aggregations)
for uid, a in builtins.zip(uids, arrays)
]
body_result = f(*refs)
indices, aggregations = unify_all(*arrays, body_result)
behavior = 'ExtendNA' if fill_missing else 'TakeMinLength'
return construct_expr(
ir.toArray(ir.StreamZip([ir.toStream(a._ir) for a in arrays], uids, body_result._ir, behavior)),
tarray(body_result.dtype),
indices,
aggregations,
)
[docs]@typecheck(a=expr_array(), start=expr_int32, index_first=bool)
def enumerate(a, start=0, *, index_first=True):
"""Returns an array of (index, element) tuples.
Examples
--------
>>> hl.eval(hl.enumerate(['A', 'B', 'C']))
[(0, 'A'), (1, 'B'), (2, 'C')]
>>> hl.eval(hl.enumerate(['A', 'B', 'C'], start=3))
[(3, 'A'), (4, 'B'), (5, 'C')]
>>> hl.eval(hl.enumerate(['A', 'B', 'C'], index_first=False))
[('A', 0), ('B', 1), ('C', 2)]
Parameters
----------
a : :class:`.ArrayExpression`
start : :class:`.Int32Expression`
The index value from which the counter is started, 0 by default.
index_first: :obj:`bool`
If ``True``, the index is the first value of the element tuples. If
``False``, the index is the second value.
Returns
-------
:class:`.ArrayExpression`
Array of (index, element) or (element, index) tuples.
"""
return a._to_stream().zip_with_index(start, index_first=index_first).to_array()
[docs]@deprecated(version='0.2.56', reason="Replaced by hl.enumerate")
@typecheck(a=expr_array(), index_first=bool)
def zip_with_index(a, index_first=True):
"""Deprecated in favor of :func:`.enumerate`.
Returns an array of (index, element) tuples.
Examples
--------
>>> hl.eval(hl.zip_with_index(['A', 'B', 'C']))
[(0, 'A'), (1, 'B'), (2, 'C')]
>>> hl.eval(hl.zip_with_index(['A', 'B', 'C'], index_first=False))
[('A', 0), ('B', 1), ('C', 2)]
Parameters
----------
a : :class:`.ArrayExpression`
index_first: :obj:`bool`
If ``True``, the index is the first value of the element tuples. If
``False``, the index is the second value.
Returns
-------
:class:`.ArrayExpression`
Array of (index, element) or (element, index) tuples.
"""
return enumerate(a, index_first=index_first)
[docs]@typecheck(f=anyfunc, collections=expr_oneof(expr_set(), expr_array(), expr_ndarray()))
def map(f: Callable, *collections):
r"""Transform each element of a collection.
Examples
--------
>>> a = ['The', 'quick', 'brown', 'fox']
>>> b = [2, 4, 6, 8]
>>> hl.eval(hl.map(lambda x: hl.len(x), a))
[3, 5, 5, 3]
>>> hl.eval(hl.map(lambda s, n: hl.len(s) + n, a, b))
[5, 9, 11, 11]
Parameters
----------
f : function ( (\*arg) -> :class:`.Expression`)
Function to transform each element of the collection.
\*collections : :class:`.ArrayExpression` or :class:`.SetExpression`
A single collection expression or multiple array expressions.
Returns
-------
:class:`.ArrayExpression` or :class:`.SetExpression`.
Collection where each element has been transformed by `f`.
"""
if builtins.len(collections) == 1:
return collections[0].map(f)
else:
return hl.zip(*collections).starmap(f)
@typecheck(expr=oneof(expr_any, func_spec(0, expr_any)), n=expr_int32)
def repeat(expr: 'Union[hl.Expression, Callable[[], hl.Expression]]', n: 'hl.tint32') -> 'hl.ArrayExpression':
"""Return array of `n` elements initialized by `expr`.
Examples
--------
>>> hl.reset_global_randomness()
>>> hl.eval(hl.repeat(hl.rand_int32(10), 5))
[9, 9, 9, 9, 9]
>>> hl.eval(hl.repeat(lambda: hl.rand_int32(10), 5))
[3, 4, 5, 4, 0]
Parameters
----------
n : :class:`.tint32`
Number of elements in the array
expr : :class:`.Expression` or :class:`Callable[[], .Expression]`
Array element initializer. If `expr` is an `.Expression`, every element
in the array will have the same value. Otherwise, if `expr` is a thunk
(ie. a callable with no arguments), the array will be populated by
evaluating `expr()` `n` times.
Returns
-------
:class:`.ArrayExpression`:
Array where each element has been initialized by `expr`
"""
mkarray = lambda x: hl.range(n).map(lambda _: x)
return hl.rbind(expr, mkarray) if isinstance(expr, hl.Expression) else mkarray(expr())
[docs]@typecheck(f=anyfunc, collection=expr_oneof(expr_set(), expr_array(), expr_ndarray()))
def starmap(f: Callable, collection):
r"""Transform each element of a collection of tuples.
Examples
--------
>>> a = [(1, 5), (3, 2), (7, 8)]
>>> hl.eval(hl.starmap(lambda x, y: hl.if_else(x < y, x, y), a))
[1, 2, 7]
Parameters
----------
f : function ( (\*args) -> :class:`.Expression`)
Function to transform each element of the collection.
collection : :class:`.ArrayExpression` or :class:`.SetExpression`
Collection expression.
Returns
-------
:class:`.ArrayExpression` or :class:`.SetExpression`.
Collection where each element has been transformed by `f`.
"""
return collection.starmap(f)
[docs]@typecheck(x=expr_oneof(expr_set(), expr_array(), expr_dict(), expr_str, expr_tuple(), expr_struct()))
def len(x) -> Int32Expression:
"""Returns the size of a collection or string.
Examples
--------
>>> a = ['The', 'quick', 'brown', 'fox']
>>> s = {1, 3, 5, 6, 7, 9}
>>> hl.eval(hl.len(a))
4
>>> hl.eval(hl.len(s))
6
>>> hl.eval(hl.len("12345"))
5
Parameters
----------
x : :class:`.ArrayExpression` or :class:`.SetExpression` or :class:`.DictExpression` or :class:`.StringExpression`
String or collection expression.
Returns
-------
:class:`.Expression` of type :py:data:`.tint32`
"""
if isinstance(x.dtype, (ttuple, tstruct)):
return hl.int32(builtins.len(x))
elif x.dtype == tstr:
return apply_expr(lambda x: ir.Apply("length", tint32, x), tint32, x)
else:
return apply_expr(lambda x: ir.ArrayLen(ir.CastToArray(x)), tint32, array(x))
[docs]@typecheck(x=expr_oneof(expr_array(), expr_str))
def reversed(x):
"""Reverses the elements of a collection.
Examples
--------
>>> a = ['The', 'quick', 'brown', 'fox']
>>> hl.eval(hl.reversed(a))
['fox', 'brown', 'quick', 'The']
Parameters
----------
x : :class:`.ArrayExpression` or :class:`.StringExpression`
Array or string expression.
Returns
-------
:class:`.Expression`
"""
typ = x.dtype
x = range(0, len(x)).map(lambda i: x[len(x) - 1 - i])
if typ == tstr:
x = hl.delimit(x, '')
return x
@typecheck(name=builtins.str, exprs=tupleof(Expression), filter_missing=builtins.bool, filter_nan=builtins.bool)
def _comparison_func(name, exprs, filter_missing, filter_nan):
if builtins.len(exprs) < 1:
raise ValueError(f"{name:!r} requires at least one argument")
if (
builtins.len(exprs) == 1
and (isinstance(exprs[0].dtype, (tarray, tset)))
and is_numeric(exprs[0].dtype.element_type)
):
[e] = exprs
if filter_nan and e.dtype.element_type in (tfloat32, tfloat64):
name = 'nan' + name
return array(e)._filter_missing_method(filter_missing, name, exprs[0].dtype.element_type)
else:
if not builtins.all(is_numeric(e.dtype) for e in exprs):
expr_types = ', '.join("'{}'".format(e.dtype) for e in exprs)
raise TypeError(
f"{name!r} expects a single numeric array expression or multiple numeric expressions\n"
f" Found {builtins.len(exprs)} arguments with types {expr_types}"
)
unified_typ = unify_types_limited(*(e.dtype for e in exprs))
ec = coercer_from_dtype(unified_typ)
indices, aggs = unify_all(*exprs)
func_name = name
if filter_missing:
func_name += '_ignore_missing'
if filter_nan and unified_typ in (tfloat32, tfloat64):
func_name = 'nan' + func_name
return construct_expr(
functools.reduce(lambda l, r: ir.Apply(func_name, unified_typ, l, r), [ec.coerce(e)._ir for e in exprs]),
unified_typ,
indices,
aggs,
)
[docs]@typecheck(
exprs=expr_oneof(expr_numeric, expr_set(expr_numeric), expr_array(expr_numeric)), filter_missing=builtins.bool
)
def nanmax(*exprs, filter_missing: builtins.bool = True) -> NumericExpression:
"""Returns the maximum value of a collection or of given arguments, excluding NaN.
Examples
--------
Compute the maximum value of an array:
>>> hl.eval(hl.nanmax([1.1, 50.1, float('nan')]))
50.1
Take the maximum value of arguments:
>>> hl.eval(hl.nanmax(1.1, 50.1, float('nan')))
50.1
Notes
-----
Like the Python builtin ``max`` function, this function can either take a
single iterable expression (an array or set of numeric elements), or
variable-length arguments of numeric expressions.
Note
----
If `filter_missing` is ``True``, then the result is the maximum of
non-missing arguments or elements. If `filter_missing` is ``False``, then
any missing argument or element causes the result to be missing.
NaN arguments / array elements are ignored; the maximum value of `NaN` and
any non-`NaN` value `x` is `x`.
See Also
--------
:func:`max`, :func:`min`, :func:`nanmin`
Parameters
----------
exprs : :class:`.ArrayExpression` or :class:`.SetExpression` or varargs of :class:`.NumericExpression`
Single numeric array or set, or multiple numeric values.
filter_missing : :obj:`bool`
Remove missing arguments/elements before computing maximum.
Returns
-------
:class:`.NumericExpression`
"""
return _comparison_func('max', exprs, filter_missing, filter_nan=True)
[docs]@typecheck(
exprs=expr_oneof(expr_numeric, expr_set(expr_numeric), expr_array(expr_numeric)), filter_missing=builtins.bool
)
def max(*exprs, filter_missing: builtins.bool = True) -> NumericExpression:
"""Returns the maximum element of a collection or of given numeric expressions.
Examples
--------
Take the maximum value of an array:
>>> hl.eval(hl.max([1, 3, 5, 6, 7, 9]))
9
Take the maximum value of values:
>>> hl.eval(hl.max(1, 50, 2))
50
Notes
-----
Like the Python builtin ``max`` function, this function can either take a
single iterable expression (an array or set of numeric elements), or
variable-length arguments of numeric expressions.
Note
----
If `filter_missing` is ``True``, then the result is the maximum of
non-missing arguments or elements. If `filter_missing` is ``False``, then
any missing argument or element causes the result to be missing.
If any element or argument is `NaN`, then the result is `NaN`.
See Also
--------
:func:`nanmax`, :func:`min`, :func:`nanmin`
Parameters
----------
exprs : :class:`.ArrayExpression` or :class:`.SetExpression` or varargs of :class:`.NumericExpression`
Single numeric array or set, or multiple numeric values.
filter_missing : :obj:`bool`
Remove missing arguments/elements before computing maximum.
Returns
-------
:class:`.NumericExpression`
"""
return _comparison_func('max', exprs, filter_missing, filter_nan=False)
[docs]@typecheck(
exprs=expr_oneof(expr_numeric, expr_set(expr_numeric), expr_array(expr_numeric)), filter_missing=builtins.bool
)
def nanmin(*exprs, filter_missing: builtins.bool = True) -> NumericExpression:
"""Returns the minimum value of a collection or of given arguments, excluding NaN.
Examples
--------
Compute the minimum value of an array:
>>> hl.eval(hl.nanmin([1.1, 50.1, float('nan')]))
1.1
Take the minimum value of arguments:
>>> hl.eval(hl.nanmin(1.1, 50.1, float('nan')))
1.1
Notes
-----
Like the Python builtin ``min`` function, this function can either take a
single iterable expression (an array or set of numeric elements), or
variable-length arguments of numeric expressions.
Note
----
If `filter_missing` is ``True``, then the result is the minimum of
non-missing arguments or elements. If `filter_missing` is ``False``, then
any missing argument or element causes the result to be missing.
NaN arguments / array elements are ignored; the minimum value of `NaN` and
any non-`NaN` value `x` is `x`.
See Also
--------
:func:`min`, :func:`max`, :func:`nanmax`
Parameters
----------
exprs : :class:`.ArrayExpression` or :class:`.SetExpression` or varargs of :class:`.NumericExpression`
Single numeric array or set, or multiple numeric values.
filter_missing : :obj:`bool`
Remove missing arguments/elements before computing minimum.
Returns
-------
:class:`.NumericExpression`
"""
return _comparison_func('min', exprs, filter_missing, filter_nan=True)
[docs]@typecheck(
exprs=expr_oneof(expr_numeric, expr_set(expr_numeric), expr_array(expr_numeric)), filter_missing=builtins.bool
)
def min(*exprs, filter_missing: builtins.bool = True) -> NumericExpression:
"""Returns the minimum element of a collection or of given numeric expressions.
Examples
--------
Take the minimum value of an array:
>>> hl.eval(hl.min([1, 3, 5, 6, 7, 9]))
1
Take the minimum value of arguments:
>>> hl.eval(hl.min(1, 50, 2))
1
Notes
-----
Like the Python builtin ``min`` function, this function can either take a
single iterable expression (an array or set of numeric elements), or
variable-length arguments of numeric expressions.
Note
----
If `filter_missing` is ``True``, then the result is the minimum of
non-missing arguments or elements. If `filter_missing` is ``False``, then
any missing argument or element causes the result to be missing.
If any element or argument is `NaN`, then the result is `NaN`.
See Also
--------
:func:`nanmin`, :func:`max`, :func:`nanmax`
Parameters
----------
exprs : :class:`.ArrayExpression` or :class:`.SetExpression` or varargs of :class:`.NumericExpression`
Single numeric array or set, or multiple numeric values.
filter_missing : :obj:`bool`
Remove missing arguments/elements before computing minimum.
Returns
-------
:class:`.NumericExpression`
"""
return _comparison_func('min', exprs, filter_missing, filter_nan=False)
[docs]@typecheck(x=expr_oneof(expr_numeric, expr_array(expr_numeric), expr_ndarray(expr_numeric)))
def abs(x):
"""Take the absolute value of a numeric value, array or ndarray.
Examples
--------
>>> hl.eval(hl.abs(-5))
5
>>> hl.eval(hl.abs([1.0, -2.5, -5.1]))
[1.0, 2.5, 5.1]
Parameters
----------
x : :class:`.NumericExpression`, :class:`.ArrayNumericExpression` or :class:`.NDArrayNumericExpression`
Returns
-------
:class:`.NumericExpression`, :class:`.ArrayNumericExpression` or :class:`.NDArrayNumericExpression`.
"""
if isinstance(x.dtype, (tarray, tndarray)):
return map(abs, x)
else:
return x._method('abs', x.dtype)
[docs]@typecheck(x=expr_oneof(expr_numeric, expr_array(expr_numeric), expr_ndarray(expr_numeric)))
def sign(x):
"""Returns the sign of a numeric value, array or ndarray.
Examples
--------
>>> hl.eval(hl.sign(-1.23))
-1.0
>>> hl.eval(hl.sign([-4, 0, 5]))
[-1, 0, 1]
>>> hl.eval(hl.sign([0.0, 3.14]))
[0.0, 1.0]
>>> hl.eval(hl.sign(float('nan')))
nan
Notes
-----
The sign function preserves type and maps ``nan`` to ``nan``.
Parameters
----------
x : :class:`.NumericExpression`, :class:`.ArrayNumericExpression` or :class:`.NDArrayNumericExpression`
Returns
-------
:class:`.NumericExpression`, :class:`.ArrayNumericExpression` or :class:`.NDArrayNumericExpression`.
"""
if isinstance(x.dtype, (tarray, tndarray)):
return map(sign, x)
else:
return x._method('sign', x.dtype)
[docs]@typecheck(collection=expr_oneof(expr_set(expr_numeric), expr_array(expr_numeric)), filter_missing=bool)
def mean(collection, filter_missing: bool = True) -> Float64Expression:
"""Returns the mean of all values in the collection.
Examples
--------
>>> a = [1, 3, 5, 6, 7, 9]
>>> hl.eval(hl.mean(a))
5.166666666666667
Note
----
Missing elements are ignored if `filter_missing` is ``True``. If `filter_missing`
is ``False``, then any missing element causes the result to be missing.
Parameters
----------
collection : :class:`.ArrayExpression` or :class:`.SetExpression`
Collection expression with numeric element type.
filter_missing : :obj:`bool`
Remove missing elements from the collection before computing product.
Returns
-------
:class:`.Expression` of type :py:data:`.tfloat64`
"""
return array(collection)._filter_missing_method(filter_missing, "mean", tfloat64)
[docs]@typecheck(collection=expr_oneof(expr_set(expr_numeric), expr_array(expr_numeric)), filter_missing=bool)
def product(collection, filter_missing: bool = True) -> NumericExpression:
"""Returns the product of values in the collection.
Examples
--------
>>> a = [1, 3, 5, 6, 7, 9]
>>> hl.eval(hl.product(a))
5670
Note
----
Missing elements are ignored if `filter_missing` is ``True``. If `filter_missing`
is ``False``, then any missing element causes the result to be missing.
Parameters
----------
collection : :class:`.ArrayExpression` or :class:`.SetExpression`
Collection expression with numeric element type.
filter_missing : :obj:`bool`
Remove missing elements from the collection before computing product.
Returns
-------
:class:`.NumericExpression`
"""
return array(collection)._filter_missing_method(filter_missing, "product", collection.dtype.element_type)
[docs]@typecheck(collection=expr_oneof(expr_set(expr_numeric), expr_array(expr_numeric)), filter_missing=bool)
def sum(collection, filter_missing: bool = True) -> NumericExpression:
"""Returns the sum of values in the collection.
Examples
--------
>>> a = [1, 3, 5, 6, 7, 9]
>>> hl.eval(hl.sum(a))
31
Note
----
Missing elements are ignored if `filter_missing` is ``True``. If `filter_missing`
is ``False``, then any missing element causes the result to be missing.
Parameters
----------
collection : :class:`.ArrayExpression` or :class:`.SetExpression`
Collection expression with numeric element type.
filter_missing : :obj:`bool`
Remove missing elements from the collection before computing product.
Returns
-------
:class:`.NumericExpression`
"""
return array(collection)._filter_missing_method(filter_missing, "sum", collection.dtype.element_type)
[docs]@typecheck(a=expr_array(expr_numeric), filter_missing=bool)
def cumulative_sum(a, filter_missing: bool = True) -> ArrayNumericExpression:
"""Returns an array of the cumulative sum of values in the array.
Examples
--------
>>> a = [1, 3, 5, 6, 7, 9]
>>> hl.eval(hl.cumulative_sum(a))
[1, 4, 9, 15, 22, 31]
Note
----
Missing elements are ignored if `filter_missing` is ``True``. If `filter_missing`
is ``False``, then any missing element causes the result to be missing.
Parameters
----------
a : :class:`.ArrayNumericExpression`
Array expression with numeric element type.
filter_missing : :obj:`bool`
Remove missing elements from the collection before computing product.
Returns
-------
:class:`.ArrayNumericExpression`
"""
if filter_missing:
a = a.filter(hl.is_defined)
return a.scan(lambda accum, elt: accum + elt, 0)[1:]
[docs]@typecheck(kwargs=expr_any)
def struct(**kwargs) -> StructExpression:
"""Construct a struct expression.
Examples
--------
>>> s = hl.struct(a=5, b='Foo')
>>> hl.eval(s.a)
5
Returns
-------
:class:`.StructExpression`
Keyword arguments as a struct.
"""
return StructExpression._from_fields(kwargs)
[docs]def tuple(iterable: Iterable) -> TupleExpression:
"""Construct a tuple expression.
Examples
--------
>>> t = hl.tuple([1, 2, '3'])
>>> hl.eval(t)
(1, 2, '3')
>>> hl.eval(t[2])
'3'
Parameters
----------
iterable : an iterable of :class:`.Expression`
Tuple elements.
Returns
-------
:class:`.TupleExpression`
"""
t = builtins.tuple(iterable)
return to_expr(t)
[docs]@typecheck(collection=expr_oneof(expr_set(), expr_array()))
def set(collection) -> SetExpression:
"""Convert a set expression.
Examples
--------
>>> s = hl.set(['Bob', 'Charlie', 'Alice', 'Bob', 'Bob'])
>>> hl.eval(s) # doctest: +SKIP
{'Alice', 'Bob', 'Charlie'}
Returns
-------
:class:`.SetExpression`
Set of all unique elements.
"""
if isinstance(collection.dtype, tset):
return collection
return apply_expr(lambda c: ir.ToSet(ir.toStream(c)), tset(collection.dtype.element_type), collection)
[docs]@typecheck(t=hail_type)
def empty_set(t: Union[HailType, builtins.str]) -> SetExpression:
"""Returns an empty set of elements of a type `t`.
Examples
--------
>>> hl.eval(hl.empty_set(hl.tstr))
set()
Parameters
----------
t : :class:`str` or :class:`.HailType`
Type of the set elements.
Returns
-------
:class:`.SetExpression`
"""
return hl.set(empty_array(t))
[docs]@typecheck(collection=expr_oneof(expr_set(), expr_array(), expr_dict(), expr_ndarray()))
def array(collection) -> ArrayExpression:
"""Construct an array expression.
Examples
--------
>>> s = {'Bob', 'Charlie', 'Alice'}
>>> hl.eval(hl.array(s))
['Alice', 'Bob', 'Charlie']
Parameters
----------
collection : :class:`.ArrayExpression` or :class:`.SetExpression` or :class:`.DictExpression`
Returns
-------
:class:`.ArrayExpression`
"""
if isinstance(collection.dtype, tarray):
return collection
elif isinstance(collection.dtype, tset):
return apply_expr(lambda c: ir.CastToArray(c), tarray(collection.dtype.element_type), collection)
elif isinstance(collection.dtype, tndarray):
if collection.dtype.ndim != 1:
raise ValueError(f'array: only one dimensional ndarrays are supported: {collection.dtype}')
return collection._data_array()
else:
assert isinstance(collection.dtype, tdict)
return _func('dictToArray', tarray(ttuple(collection.dtype.key_type, collection.dtype.value_type)), collection)
[docs]@typecheck(t=hail_type)
def empty_array(t: Union[HailType, builtins.str]) -> ArrayExpression:
"""Returns an empty array of elements of a type `t`.
Examples
--------
>>> hl.eval(hl.empty_array(hl.tint32))
[]
Parameters
----------
t : :class:`str` or :class:`.HailType`
Type of the array elements.
Returns
-------
:class:`.ArrayExpression`
"""
array_t = hl.tarray(t)
a = ir.MakeArray([], array_t)
return construct_expr(a, array_t)
def _ndarray(collection, row_major=None, dtype=None):
"""Construct a Hail ndarray from either a flat Hail array, a `NumPy` ndarray or python value/nested lists.
Parameters
----------
collection : :class:`numpy.ndarray` or :obj:`numeric` or :obj: `list` of `numeric`
Type of the array elements.
row_major : :obj: `bool` or None
Returns
-------
:class:`.NDArrayExpression`
"""
def list_shape(x):
if isinstance(x, (list, builtins.tuple)):
dim_len = builtins.len(x)
if dim_len != 0:
first, rest = x[0], x[1:]
inner_shape = list_shape(first)
for e in rest:
other_inner_shape = list_shape(e)
if inner_shape != other_inner_shape:
raise ValueError(f'inner dimensions do not match: {inner_shape}, {other_inner_shape}')
return [dim_len, *inner_shape]
else:
return [dim_len]
else:
return []
def deep_flatten(xs: Iterable) -> Iterable:
return [y for x in xs for y in (deep_flatten(x) if isinstance(x, (list, builtins.tuple)) else [x])]
def flatten_expr_assert_shape(shape):
def recur(dim):
return lambda xs: hl.bind(
lambda actual: hl.bind(
lambda expected: hl.case()
.when(
actual == expected,
xs if dim == builtins.len(shape) - 1 else xs.flatmap(recur(dim + 1)),
)
.or_error(
f'ndarray dimension {dim} did not match.\n'
+ (' Expected len(dimension) == ' + hl.str(expected) + '\n')
+ (' Actual: ' + hl.str(actual) + '.')
),
shape[dim],
),
hl.len(xs),
)
return recur(0)
def from_data_and_shape(data: ArrayExpression, shape: TupleExpression):
data = data.map(lambda value: cast_expr(value, dtype))
ndir = ir.MakeNDArray(data._ir, shape._ir, hl.bool(True)._ir)
new_indices, new_aggregations = unify_all(data, shape)
ndim = builtins.len(shape)
return construct_expr(ndir, tndarray(data.dtype.element_type, ndim), new_indices, new_aggregations)
if isinstance(collection, NumericExpression):
return from_data_and_shape(array([collection]), hl.tuple([]))
elif isinstance(collection, ArrayExpression):
recursive_type = collection.dtype
ndim = 0
while isinstance(recursive_type, (tarray, tndarray)):
recursive_type = recursive_type._element_type
ndim += 1
return hl.bind(
lambda arr: hl.bind(
lambda shape: hl.bind(
lambda data: from_data_and_shape(data, shape),
flatten_expr_assert_shape(shape)(arr),
),
hl.tuple(
hl.int64(hl.len(dim))
for dim in itertools.accumulate(
builtins.range(ndim - 1),
lambda xs, _: xs[0],
initial=arr,
)
),
),
collection,
)
elif isinstance(collection, Expression):
raise ValueError(f"{collection} cannot be converted into an ndarray")
elif isinstance(collection, np.ndarray):
return hl.literal(collection)
else:
if isinstance(collection, (list, builtins.tuple)):
shape = list_shape(collection)
data = deep_flatten(collection)
else:
shape = []
data = [collection]
shape_expr = to_expr(tuple([hl.int64(i) for i in shape]), ttuple(*[tint64 for _ in shape]))
data_expr = hl.array(data) if data else hl.empty_array("float64")
ndim = builtins.len(shape)
return from_data_and_shape(data_expr, shape_expr)
[docs]@typecheck(key_type=hail_type, value_type=hail_type)
def empty_dict(key_type: Union[HailType, builtins.str], value_type: Union[HailType, builtins.str]) -> DictExpression:
"""Returns an empty dictionary with key type `key_type` and value type
`value_type`.
Examples
--------
>>> hl.eval(hl.empty_dict(hl.tstr, hl.tint32))
{}
Parameters
----------
key_type : :class:`str` or :class:`.HailType`
Type of the keys.
value_type : :class:`str` or :class:`.HailType`
Type of the values.
Returns
-------
:class:`.DictExpression`
"""
return hl.dict(hl.empty_array(hl.ttuple(key_type, value_type)))
[docs]@typecheck(collection=expr_oneof(expr_set(expr_set()), expr_array(expr_array())))
def flatten(collection):
"""Flatten a nested collection by concatenating sub-collections.
Examples
--------
>>> a = [[1, 2], [2, 3]]
>>> hl.eval(hl.flatten(a))
[1, 2, 2, 3]
Parameters
----------
collection : :class:`.ArrayExpression` or :class:`.SetExpression`
Collection with element type :class:`.tarray` or :class:`.tset`.
Returns
-------
collection : :class:`.ArrayExpression` or :class:`.SetExpression`
"""
return collection.flatmap(lambda x: x)
def _union_intersection_base(name, arrays, key, join_f, result_f):
if builtins.len(arrays) == 0:
raise ValueError(f"{name}: require at least one input array")
t = arrays[0].dtype.element_type
if not isinstance(t, tstruct):
raise ValueError(f"{name}: expect a struct element type, found {t}")
for k in key:
if k not in t:
raise ValueError(f"{name}: key field {k!r} not in element type {t}")
for i, a in builtins.enumerate(arrays):
if a.dtype.element_type != t:
raise ValueError(
f"{name}: input {i} has a different element type than input 0:"
f"\n input 0: {t}"
f"\n input {i}: {a.dtype.element_type}"
)
key_typ = hl.tstruct(**{k: t[k] for k in key})
vals_typ = hl.tarray(t)
key_uid = Env.get_uid()
vals_uid = Env.get_uid()
key_var = construct_variable(key_uid, key_typ)
vals_var = construct_variable(vals_uid, vals_typ)
join_ir = join_f(key_var, vals_var)
irs = []
for a in arrays:
if isinstance(a.dtype, hl.tarray):
irs.append(ir.toStream(a._ir))
else:
irs.append(a._ir)
indices, aggs = unify_all(*arrays)
zj = ir.ToArray(ir.StreamZipJoin(irs, key, key_uid, vals_uid, join_ir._ir))
return result_f(construct_expr(zj, zj.typ, indices, aggs))
def _zip_join_producers(contexts, stream_f, key, join_f):
ctx_uid = Env.get_uid()
ctx_var = construct_variable(ctx_uid, contexts.dtype.element_type)
stream_req = stream_f(ctx_var)
make_prod_ir = stream_req._ir
if isinstance(make_prod_ir.typ, hl.tarray):
make_prod_ir = ir.ToStream(make_prod_ir)
t = stream_req.dtype.element_type
key_typ = hl.tstruct(**{k: t[k] for k in key})
vals_typ = hl.tarray(t)
key_uid = Env.get_uid()
vals_uid = Env.get_uid()
key_var = construct_variable(key_uid, key_typ)
vals_var = construct_variable(vals_uid, vals_typ)
join_ir = join_f(key_var, vals_var)
zj = ir.ToArray(ir.StreamZipJoinProducers(contexts._ir, ctx_uid, make_prod_ir, key, key_uid, vals_uid, join_ir._ir))
indices, aggs = unify_all(contexts, stream_req, join_ir)
return construct_expr(zj, zj.typ, indices, aggs)
[docs]@typecheck(arrays=expr_oneof(expr_stream(expr_any), expr_array(expr_any)), key=sequenceof(builtins.str))
def keyed_intersection(*arrays, key):
"""Compute the intersection of sorted arrays on a given key.
Requires sorted arrays with distinct keys.
Warning
-------
Experimental. Does not support downstream randomness.
Parameters
----------
arrays
key
Returns
-------
:class:`.ArrayExpression`
"""
return _union_intersection_base(
'keyed_intersection',
arrays,
key,
lambda key_var, vals_var: hl.tuple((key_var, vals_var)),
lambda res: res.filter(lambda x: hl.fold(lambda acc, elt: acc & hl.is_defined(elt), True, x[1])).map(
lambda x: x[1].first()
),
)
[docs]@typecheck(arrays=expr_oneof(expr_stream(expr_any), expr_array(expr_any)), key=sequenceof(builtins.str))
def keyed_union(*arrays, key):
"""Compute the distinct union of sorted arrays on a given key.
Requires sorted arrays with distinct keys.
Warning
-------
Experimental. Does not support downstream randomness.
Parameters
----------
exprs
key
Returns
-------
:class:`.ArrayExpression`
"""
return _union_intersection_base(
'keyed_union',
arrays,
key,
lambda keys_var, vals_var: hl.fold(
lambda acc, elt: hl.coalesce(acc, elt), hl.missing(vals_var.dtype.element_type), vals_var
),
lambda res: res,
)
[docs]@typecheck(collection=expr_oneof(expr_array(), expr_set()), delimiter=expr_str)
def delimit(collection, delimiter=',') -> StringExpression:
"""Joins elements of `collection` into single string delimited by `delimiter`.
Examples
--------
>>> a = ['Bob', 'Charlie', 'Alice', 'Bob', 'Bob']
>>> hl.eval(hl.delimit(a))
'Bob,Charlie,Alice,Bob,Bob'
Notes
-----
If the element type of `collection` is not :py:data:`.tstr`, then the
:func:`str` function will be called on each element before joining with
the delimiter.
Parameters
----------
collection : :class:`.ArrayExpression` or :class:`.SetExpression`
Collection.
delimiter : str or :class:`.StringExpression`
Field delimiter.
Returns
-------
:class:`.StringExpression`
Joined string expression.
"""
if not collection.dtype.element_type == tstr:
collection = map(str, collection)
return collection._method("mkString", tstr, delimiter)
@typecheck(left=expr_any, right=expr_any)
def _compare(left, right):
if left.dtype != right.dtype:
raise TypeError(
f"'compare' expected 'left' and 'right' to have the same type: found {left.dtype} vs {right.dtype}"
)
indices, aggregations = unify_all(left, right)
return construct_expr(ir.ApplyComparisonOp("Compare", left._ir, right._ir), tint32, indices, aggregations)
@typecheck(collection=expr_array(), less_than=nullable(func_spec(2, expr_bool)))
def _sort_by(collection, less_than):
left_id = Env.get_uid()
right_id = Env.get_uid()
elt_type = collection.dtype.element_type
left = construct_expr(ir.Ref(left_id, elt_type), elt_type, collection._indices, collection._aggregations)
right = construct_expr(ir.Ref(right_id, elt_type), elt_type, collection._indices, collection._aggregations)
return construct_expr(
ir.ArraySort(ir.toStream(collection._ir), left_id, right_id, less_than(left, right)._ir),
collection.dtype,
collection._indices,
collection._aggregations,
)
[docs]@typecheck(
collection=expr_oneof(expr_array(), expr_dict(), expr_set()),
key=nullable(func_spec(1, expr_any)),
reverse=expr_bool,
)
def sorted(collection, key: Optional[Callable] = None, reverse=False) -> ArrayExpression:
"""Returns a sorted array.
Examples
--------
>>> a = ['Charlie', 'Alice', 'Bob']
>>> hl.eval(hl.sorted(a))
['Alice', 'Bob', 'Charlie']
>>> hl.eval(hl.sorted(a, reverse=True))
['Charlie', 'Bob', 'Alice']
>>> hl.eval(hl.sorted(a, key=lambda x: hl.len(x)))
['Bob', 'Alice', 'Charlie']
Notes
-----
The ordered types are :py:data:`.tstr` and numeric types.
Parameters
----------
collection : :class:`.ArrayExpression` or :class:`.SetExpression` or :class:`.DictExpression`
Collection to sort.
key: function ( (arg) -> :class:`.Expression`), optional
Function to evaluate for each element to compute sort key.
reverse : :class:`.BooleanExpression`
Sort in descending order.
Returns
-------
:class:`.ArrayExpression`
Sorted array.
"""
if not isinstance(collection, ArrayExpression):
collection = hl.array(collection)
def comp(left, right):
return (
hl.case()
.when(hl.is_missing(left), False)
.when(hl.is_missing(right), True)
.when(reverse, hl._compare(right, left) < 0)
.default(hl._compare(left, right) < 0)
)
if key is None:
return _sort_by(collection, comp)
else:
with_key = collection.map(lambda elt: hl.tuple([key(elt), elt]))
return _sort_by(with_key, lambda l, r: comp(l[0], r[0])).map(lambda elt: elt[1])
[docs]@typecheck(array=expr_array(expr_numeric), unique=bool)
def argmin(array, unique: bool = False) -> Int32Expression:
"""Return the index of the minimum value in the array.
Examples
--------
>>> hl.eval(hl.argmin([0.2, 0.3, 0.6]))
0
>>> hl.eval(hl.argmin([0.4, 0.2, 0.2]))
1
>>> hl.eval(hl.argmin([0.4, 0.2, 0.2], unique=True))
None
Notes
-----
Returns the index of the minimum value in the array.
If two or more elements are tied for minimum, then the `unique` parameter
will determine the result. If `unique` is ``False``, then the first index
will be returned. If `unique` is ``True``, then the result is missing.
If the array is empty, then the result is missing.
Note
----
Missing elements are ignored.
Parameters
----------
array : :class:`.ArrayNumericExpression`
unique : bool
Returns
-------
:class:`.Expression` of type :py:data:`.tint32`
"""
if unique:
return array._method("uniqueMinIndex", tint32)
else:
return array._method("argmin", tint32)
[docs]@typecheck(array=expr_array(expr_numeric), unique=bool)
def argmax(array, unique: bool = False) -> Int32Expression:
"""Return the index of the maximum value in the array.
Examples
--------
>>> hl.eval(hl.argmax([0.2, 0.2, 0.6]))
2
>>> hl.eval(hl.argmax([0.4, 0.4, 0.2]))
0
>>> hl.eval(hl.argmax([0.4, 0.4, 0.2], unique=True))
None
Notes
-----
Returns the index of the maximum value in the array.
If two or more elements are tied for maximum, then the `unique` parameter
will determine the result. If `unique` is ``False``, then the first index
will be returned. If `unique` is ``True``, then the result is missing.
If the array is empty, then the result is missing.
Note
----
Missing elements are ignored.
Parameters
----------
array : :class:`.ArrayNumericExpression`
unique: bool
Returns
-------
:class:`.Expression` of type :py:data:`.tint32`
"""
if unique:
return array._method("uniqueMaxIndex", tint32)
else:
return array._method("argmax", tint32)
[docs]@typecheck(x=expr_oneof(expr_numeric, expr_bool, expr_str))
def float64(x) -> Float64Expression:
"""Convert to a 64-bit floating point expression.
Examples
--------
>>> hl.eval(hl.float64('1.1'))
1.1
>>> hl.eval(hl.float64(1))
1.0
>>> hl.eval(hl.float64(True))
1.0
Parameters
----------
x : :class:`.NumericExpression` or :class:`.BooleanExpression` or :class:`.StringExpression`
Returns
-------
:class:`.NumericExpression` of type :py:data:`.tfloat64`
"""
if x.dtype == tfloat64:
return x
else:
return x._method("toFloat64", tfloat64)
[docs]@typecheck(x=expr_str)
def parse_float64(x) -> Float64Expression:
"""Parse a string as a 64-bit floating point number.
Examples
--------
>>> hl.eval(hl.parse_float64('1.1'))
1.1
>>> hl.eval(hl.parse_float64('asdf'))
None
Notes
-----
If the input is an invalid floating point number, then result of this call will be missing.
Parameters
----------
x : :class:`.StringExpression`
Returns
-------
:class:`.NumericExpression` of type :py:data:`.tfloat64`
"""
return x._method("toFloat64OrMissing", tfloat64)
[docs]@typecheck(x=expr_oneof(expr_numeric, expr_bool, expr_str))
def float32(x) -> Float32Expression:
"""Convert to a 32-bit floating point expression.
Examples
--------
>>> hl.eval(hl.float32('1.1'))
1.100000023841858
>>> hl.eval(hl.float32(1))
1.0
>>> hl.eval(hl.float32(True))
1.0
Parameters
----------
x : :class:`.NumericExpression` or :class:`.BooleanExpression` or :class:`.StringExpression`
Returns
-------
:class:`.NumericExpression` of type :py:data:`.tfloat32`
"""
if x.dtype == tfloat32:
return x
else:
return x._method("toFloat32", tfloat32)
[docs]@typecheck(x=expr_str)
def parse_float32(x) -> Float32Expression:
"""Parse a string as a 32-bit floating point number.
Examples
--------
>>> hl.eval(hl.parse_float32('1.1'))
1.100000023841858
>>> hl.eval(hl.parse_float32('asdf'))
None
Notes
-----
If the input is an invalid floating point number, then result of this call will be missing.
Parameters
----------
x : :class:`.StringExpression`
Returns
-------
:class:`.NumericExpression` of type :py:data:`.tfloat32`
"""
return x._method("toFloat32OrMissing", tfloat32)
[docs]@typecheck(x=expr_oneof(expr_numeric, expr_bool, expr_str))
def int64(x) -> Int64Expression:
"""Convert to a 64-bit integer expression.
Examples
--------
>>> hl.eval(hl.int64('1'))
1
>>> hl.eval(hl.int64(1.5))
1
>>> hl.eval(hl.int64(True))
1
Parameters
----------
x : :class:`.NumericExpression` or :class:`.BooleanExpression` or :class:`.StringExpression`
Returns
-------
:class:`.NumericExpression` of type :py:data:`.tint64`
"""
if x.dtype == tint64:
return x
else:
return x._method("toInt64", tint64)
[docs]@typecheck(x=expr_str)
def parse_int64(x) -> Int64Expression:
"""Parse a string as a 64-bit integer.
Examples
--------
>>> hl.eval(hl.parse_int64('154'))
154
>>> hl.eval(hl.parse_int64('15.4'))
None
>>> hl.eval(hl.parse_int64('asdf'))
None
Notes
-----
If the input is an invalid integer, then result of this call will be missing.
Parameters
----------
x : :class:`.StringExpression`
Returns
-------
:class:`.NumericExpression` of type :py:data:`.tint64`
"""
return x._method("toInt64OrMissing", tint64)
[docs]@typecheck(x=expr_oneof(expr_numeric, expr_bool, expr_str))
def int32(x) -> Int32Expression:
"""Convert to a 32-bit integer expression.
Examples
--------
>>> hl.eval(hl.int32('1'))
1
>>> hl.eval(hl.int32(1.5))
1
>>> hl.eval(hl.int32(True))
1
Parameters
----------
x : :class:`.NumericExpression` or :class:`.BooleanExpression` or :class:`.StringExpression`
Returns
-------
:class:`.NumericExpression` of type :py:data:`.tint32`
"""
if x.dtype == tint32:
return x
else:
return x._method("toInt32", tint32)
[docs]@typecheck(x=expr_str)
def parse_int32(x) -> Int32Expression:
"""Parse a string as a 32-bit integer.
Examples
--------
>>> hl.eval(hl.parse_int32('154'))
154
>>> hl.eval(hl.parse_int32('15.4'))
None
>>> hl.eval(hl.parse_int32('asdf'))
None
Notes
-----
If the input is an invalid integer, then result of this call will be missing.
Parameters
----------
x : :class:`.StringExpression`
Returns
-------
:class:`.NumericExpression` of type :py:data:`.tint32`
"""
return x._method("toInt32OrMissing", tint32)
[docs]@typecheck(x=expr_oneof(expr_numeric, expr_bool, expr_str))
def int(x) -> Int32Expression:
"""Convert to a 32-bit integer expression.
Examples
--------
>>> hl.eval(hl.int('1'))
1
>>> hl.eval(hl.int(1.5))
1
>>> hl.eval(hl.int(True))
1
Note
----
Alias for :func:`.int32`.
Parameters
----------
x : :class:`.NumericExpression` or :class:`.BooleanExpression` or :class:`.StringExpression`
Returns
-------
:class:`.NumericExpression` of type :py:data:`.tint32`
"""
return int32(x)
[docs]@typecheck(x=expr_str)
def parse_int(x) -> Int32Expression:
"""Parse a string as a 32-bit integer.
Examples
--------
>>> hl.eval(hl.parse_int('154'))
154
>>> hl.eval(hl.parse_int('15.4'))
None
>>> hl.eval(hl.parse_int('asdf'))
None
Notes
-----
If the input is an invalid integer, then result of this call will be missing.
Parameters
----------
x : :class:`.StringExpression`
Returns
-------
:class:`.NumericExpression` of type :py:data:`.tint32`
"""
return parse_int32(x)
[docs]@typecheck(x=expr_oneof(expr_numeric, expr_bool, expr_str))
def float(x) -> Float64Expression:
"""Convert to a 64-bit floating point expression.
Examples
--------
>>> hl.eval(hl.float('1.1'))
1.1
>>> hl.eval(hl.float(1))
1.0
>>> hl.eval(hl.float(True))
1.0
Note
----
Alias for :func:`.float64`.
Parameters
----------
x : :class:`.NumericExpression` or :class:`.BooleanExpression` or :class:`.StringExpression`
Returns
-------
:class:`.NumericExpression` of type :py:data:`.tfloat64`
"""
return float64(x)
[docs]@typecheck(x=expr_str)
def parse_float(x) -> Float64Expression:
"""Parse a string as a 64-bit floating point number.
Examples
--------
>>> hl.eval(hl.parse_float('1.1'))
1.1
>>> hl.eval(hl.parse_float('asdf'))
None
Notes
-----
If the input is an invalid floating point number, then result of this call will be missing.
Parameters
----------
x : :class:`.StringExpression`
Returns
-------
:class:`.NumericExpression` of type :py:data:`.tfloat64`
"""
return parse_float64(x)
[docs]@typecheck(x=expr_oneof(expr_numeric, expr_bool, expr_str))
def bool(x) -> BooleanExpression:
"""Convert to a Boolean expression.
Examples
--------
>>> hl.eval(hl.bool('TRUE'))
True
>>> hl.eval(hl.bool(1.5))
True
Notes
-----
Numeric expressions return ``True`` if they are non-zero, and ``False``
if they are zero.
Acceptable string values are: ``'True'``, ``'true'``, ``'TRUE'``,
``'False'``, ``'false'``, and ``'FALSE'``.
Parameters
----------
x : :class:`.NumericExpression` or :class:`.BooleanExpression` or :class:`.StringExpression`
Returns
-------
:class:`.BooleanExpression`
"""
if x.dtype == tbool:
return x
elif is_numeric(x.dtype):
return x != 0
else:
return x._method("toBoolean", tbool)
[docs]@typecheck(s=expr_str, rna=builtins.bool)
def reverse_complement(s, rna=False):
"""Reverses the string and translates base pairs into their complements
Examples
--------
>>> bases = hl.literal('NNGATTACA')
>>> hl.eval(hl.reverse_complement(bases))
'TGTAATCNN'
Parameters
----------
s : :class:`.StringExpression`
Base string.
rna : :obj:`bool`
If ``True``, pair adenine (A) with uracil (U) instead of thymine (T).
Returns
-------
:class:`.StringExpression`
"""
s = s.reverse()
if rna:
pairs = [('A', 'U'), ('U', 'A'), ('T', 'A'), ('G', 'C'), ('C', 'G')]
else:
pairs = [('A', 'T'), ('T', 'A'), ('G', 'C'), ('C', 'G')]
d = {}
for b1, b2 in pairs:
d[b1] = b2
d[b1.lower()] = b2.lower()
return s.translate(d)
[docs]@typecheck(
contig=expr_str, position=expr_int32, before=expr_int32, after=expr_int32, reference_genome=reference_genome_type
)
def get_sequence(contig, position, before=0, after=0, reference_genome='default') -> StringExpression:
"""Return the reference sequence at a given locus.
Examples
--------
Return the reference allele for ``'GRCh37'`` at the locus ``'1:45323'``:
>>> hl.eval(hl.get_sequence('1', 45323, reference_genome='GRCh37')) # doctest: +SKIP
"T"
Notes
-----
This function requires `reference genome` has an attached
reference sequence. Use :meth:`.ReferenceGenome.add_sequence` to
load and attach a reference sequence to a reference genome.
Returns ``None`` if `contig` and `position` are not valid coordinates in
`reference_genome`.
Parameters
----------
contig : :class:`.Expression` of type :py:data:`.tstr`
Locus contig.
position : :class:`.Expression` of type :py:data:`.tint32`
Locus position.
before : :class:`.Expression` of type :py:data:`.tint32`, optional
Number of bases to include before the locus of interest. Truncates at
contig boundary.
after : :class:`.Expression` of type :py:data:`.tint32`, optional
Number of bases to include after the locus of interest. Truncates at
contig boundary.
reference_genome : :class:`str` or :class:`.ReferenceGenome`
Reference genome to use. Must have a reference sequence available.
Returns
-------
:class:`.StringExpression`
"""
if not reference_genome.has_sequence():
raise TypeError(
"Reference genome '{}' does not have a sequence loaded. Use 'add_sequence' to load the sequence from a FASTA file.".format(
reference_genome.name
)
)
return _func("getReferenceSequence", tstr, contig, position, before, after, type_args=(tlocus(reference_genome),))
[docs]@typecheck(contig=expr_str, reference_genome=reference_genome_type)
def is_valid_contig(contig, reference_genome='default') -> BooleanExpression:
"""Returns ``True`` if `contig` is a valid contig name in `reference_genome`.
Examples
--------
>>> hl.eval(hl.is_valid_contig('1', reference_genome='GRCh37'))
True
>>> hl.eval(hl.is_valid_contig('chr1', reference_genome='GRCh37'))
False
Parameters
----------
contig : :class:`.Expression` of type :py:data:`.tstr`
reference_genome : :class:`str` or :class:`.ReferenceGenome`
Returns
-------
:class:`.BooleanExpression`
"""
return _func("isValidContig", tbool, contig, type_args=(tlocus(reference_genome),))
[docs]@typecheck(contig=expr_str, reference_genome=reference_genome_type)
def contig_length(contig, reference_genome='default') -> Int32Expression:
"""Returns the length of `contig` in `reference_genome`.
Examples
--------
>>> hl.eval(hl.contig_length('5', reference_genome='GRCh37'))
180915260
Parameters
----------
contig : :class:`.Expression` of type :py:data:`.tstr`
reference_genome : :class:`str` or :class:`.ReferenceGenome`
Returns
-------
:class:`.Int32Expression`
"""
return _func("contigLength", tint32, contig, type_args=(tlocus(reference_genome),))
[docs]@typecheck(contig=expr_str, position=expr_int32, reference_genome=reference_genome_type)
def is_valid_locus(contig, position, reference_genome='default') -> BooleanExpression:
"""Returns ``True`` if `contig` and `position` is a valid site in `reference_genome`.
Examples
--------
>>> hl.eval(hl.is_valid_locus('1', 324254, 'GRCh37'))
True
>>> hl.eval(hl.is_valid_locus('chr1', 324254, 'GRCh37'))
False
Parameters
----------
contig : :class:`.Expression` of type :py:data:`.tstr`
position : :class:`.Expression` of type :py:data:`.tint`
reference_genome : :class:`str` or :class:`.ReferenceGenome`
Returns
-------
:class:`.BooleanExpression`
"""
return _func("isValidLocus", tbool, contig, position, type_args=(tlocus(reference_genome),))
[docs]@typecheck(locus=expr_locus(), is_female=expr_bool, father=expr_call, mother=expr_call, child=expr_call)
def mendel_error_code(locus, is_female, father, mother, child):
r"""Compute a Mendelian violation code for genotypes.
>>> father = hl.call(0, 0)
>>> mother = hl.call(1, 1)
>>> child1 = hl.call(0, 1) # consistent
>>> child2 = hl.call(0, 0) # Mendel error
>>> locus = hl.locus('2', 2000000)
>>> hl.eval(hl.mendel_error_code(locus, True, father, mother, child1))
None
>>> hl.eval(hl.mendel_error_code(locus, True, father, mother, child2))
7
Note
----
Ignores call phasing, and assumes diploid and biallelic. Haploid calls for
hemiploid samples on sex chromosomes also are acceptable input.
Notes
-----
In the table below, the copy state of a locus with respect to a trio is
defined as follows, where PAR is the `pseudoautosomal region
<https://en.wikipedia.org/wiki/Pseudoautosomal_region>`__ (PAR) of X and Y
defined by the reference genome and the autosome is defined by
:meth:`.LocusExpression.in_autosome`:
- Auto -- in autosome or in PAR, or in non-PAR of X and female child
- HemiX -- in non-PAR of X and male child
- HemiY -- in non-PAR of Y and male child
`Any` refers to the set \{ HomRef, Het, HomVar, NoCall \} and `~`
denotes complement in this set.
+------+---------+---------+--------+------------+---------------+
| Code | Dad | Mom | Kid | Copy State | Implicated |
+======+=========+=========+========+============+===============+
| 1 | HomVar | HomVar | Het | Auto | Dad, Mom, Kid |
+------+---------+---------+--------+------------+---------------+
| 2 | HomRef | HomRef | Het | Auto | Dad, Mom, Kid |
+------+---------+---------+--------+------------+---------------+
| 3 | HomRef | ~HomRef | HomVar | Auto | Dad, Kid |
+------+---------+---------+--------+------------+---------------+
| 4 | ~HomRef | HomRef | HomVar | Auto | Mom, Kid |
+------+---------+---------+--------+------------+---------------+
| 5 | HomRef | HomRef | HomVar | Auto | Kid |
+------+---------+---------+--------+------------+---------------+
| 6 | HomVar | ~HomVar | HomRef | Auto | Dad, Kid |
+------+---------+---------+--------+------------+---------------+
| 7 | ~HomVar | HomVar | HomRef | Auto | Mom, Kid |
+------+---------+---------+--------+------------+---------------+
| 8 | HomVar | HomVar | HomRef | Auto | Kid |
+------+---------+---------+--------+------------+---------------+
| 9 | Any | HomVar | HomRef | HemiX | Mom, Kid |
+------+---------+---------+--------+------------+---------------+
| 10 | Any | HomRef | HomVar | HemiX | Mom, Kid |
+------+---------+---------+--------+------------+---------------+
| 11 | HomVar | Any | HomRef | HemiY | Dad, Kid |
+------+---------+---------+--------+------------+---------------+
| 12 | HomRef | Any | HomVar | HemiY | Dad, Kid |
+------+---------+---------+--------+------------+---------------+
Parameters
----------
locus : :class:`.LocusExpression`
is_female : :class:`.BooleanExpression`
father : :class:`.CallExpression`
mother : :class:`.CallExpression`
child : :class:`.CallExpression`
Returns
-------
:class:`.Int32Expression`
"""
father_n = father.n_alt_alleles()
mother_n = mother.n_alt_alleles()
child_n = child.n_alt_alleles()
auto_cond = (
hl.case(missing_false=True)
.when((father_n == 2) & (mother_n == 2) & (child_n == 1), 1)
.when((father_n == 0) & (mother_n == 0) & (child_n == 1), 2)
.when((father_n == 0) & (mother_n == 0) & (child_n == 2), 5)
.when((father_n == 2) & (mother_n == 2) & (child_n == 0), 8)
.when((father_n == 0) & (child_n == 2), 3)
.when((mother_n == 0) & (child_n == 2), 4)
.when((father_n == 2) & (child_n == 0), 6)
.when((mother_n == 2) & (child_n == 0), 7)
.or_missing()
)
hemi_x_cond = (
hl.case(missing_false=True)
.when((mother_n == 2) & (child_n == 0), 9)
.when((mother_n == 0) & (child_n > 0), 10)
.or_missing()
)
hemi_y_cond = (
hl.case(missing_false=True)
.when((father_n > 0) & (child_n == 0), 11)
.when((father_n == 0) & (child_n > 0), 12)
.or_missing()
)
return (
hl.case()
.when(locus.in_autosome_or_par() | is_female, auto_cond)
.when(locus.in_x_nonpar() & (~is_female), hemi_x_cond)
.when(locus.in_y_nonpar() & (~is_female), hemi_y_cond)
.or_missing()
)
[docs]@typecheck(locus=expr_locus(), alleles=expr_array(expr_str))
def min_rep(locus, alleles):
"""Computes the minimal representation of a (locus, alleles) polymorphism.
Examples
--------
>>> hl.eval(hl.min_rep(hl.locus('1', 100000), ['TAA', 'TA']))
Struct(locus=Locus(contig=1, position=100000, reference_genome=GRCh37), alleles=['TA', 'T'])
>>> hl.eval(hl.min_rep(hl.locus('1', 100000), ['AATAA', 'AACAA']))
Struct(locus=Locus(contig=1, position=100002, reference_genome=GRCh37), alleles=['T', 'C'])
Notes
-----
Computing the minimal representation can cause the locus shift right (the
position can increase).
Parameters
----------
locus : :class:`.LocusExpression`
alleles : :class:`.ArrayExpression` of type :py:data:`.tstr`
Returns
-------
:class:`.StructExpression`
A :class:`.tstruct` expression with two fields, `locus`
(:class:`.LocusExpression`) and `alleles`
(:class:`.ArrayExpression` of type :py:data:`.tstr`).
"""
ret_type = tstruct(locus=locus.dtype, alleles=alleles.dtype)
return _func('min_rep', ret_type, locus, alleles)
[docs]@typecheck(
x=oneof(expr_locus(), expr_interval(expr_locus())),
dest_reference_genome=reference_genome_type,
min_match=builtins.float,
include_strand=builtins.bool,
)
def liftover(x, dest_reference_genome, min_match=0.95, include_strand=False):
"""Lift over coordinates to a different reference genome.
Examples
--------
Lift over the locus coordinates from reference genome ``'GRCh37'`` to
``'GRCh38'``:
>>> hl.eval(hl.liftover(hl.locus('1', 1034245, 'GRCh37'), 'GRCh38')) # doctest: +SKIP
Locus(contig='chr1', position=1098865, reference_genome='GRCh38')
Lift over the locus interval coordinates from reference genome ``'GRCh37'``
to ``'GRCh38'``:
>>> hl.eval(hl.liftover(hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'), 'GRCh38')) # doctest: +SKIP
Interval(Locus(contig='chr20', position=79360, reference_genome='GRCh38'),
Locus(contig='chr20', position=101815, reference_genome='GRCh38'),
True,
True)
See :ref:`liftover_howto` for more instructions on lifting over a Table
or MatrixTable.
Notes
-----
This function requires the reference genome of `x` has a chain file loaded
for `dest_reference_genome`. Use :meth:`.ReferenceGenome.add_liftover` to
load and attach a chain file to a reference genome.
Returns ``None`` if `x` could not be converted.
Warning
-------
Before using the result of :func:`.liftover` as a new row key or column
key, be sure to filter out missing values.
Parameters
----------
x : :class:`.Expression` of type :class:`.tlocus` or :class:`.tinterval` of :class:`.tlocus`
Locus or locus interval to lift over.
dest_reference_genome : :class:`str` or :class:`.ReferenceGenome`
Reference genome to convert to.
min_match : :obj:`float`
Minimum ratio of bases that must remap.
include_strand : :obj:`bool`
If True, output the result as a :class:`.StructExpression` with the first field `result` being
the locus or locus interval and the second field `is_negative_strand` is a boolean indicating
whether the locus or locus interval has been mapped to the negative strand of the destination
reference genome. Otherwise, output the converted locus or locus interval.
Returns
-------
:class:`.Expression`
A locus or locus interval converted to `dest_reference_genome`.
"""
if not 0.0 <= min_match <= 1.0:
raise TypeError("'liftover' requires 'min_match' is in the range [0, 1]. Got {}".format(min_match))
if isinstance(x.dtype, tlocus):
rg = x.dtype.reference_genome
method_name = "liftoverLocus"
rtype = tstruct(result=tlocus(dest_reference_genome), is_negative_strand=tbool)
else:
rg = x.dtype.point_type.reference_genome
method_name = "liftoverLocusInterval"
rtype = tstruct(result=tinterval(tlocus(dest_reference_genome)), is_negative_strand=tbool)
if not rg.has_liftover(dest_reference_genome.name):
raise TypeError(
"""Reference genome '{}' does not have liftover to '{}'.
Use 'add_liftover' to load a liftover chain file.""".format(rg.name, dest_reference_genome.name)
)
expr = _func(method_name, rtype, x, to_expr(min_match, tfloat64))
if not include_strand:
expr = expr.result
return expr
[docs]@typecheck(
f=func_spec(1, expr_float64),
min=expr_float64,
max=expr_float64,
max_iter=builtins.int,
epsilon=builtins.float,
tolerance=builtins.float,
)
def uniroot(f: Callable, min, max, *, max_iter=1000, epsilon=2.2204460492503131e-16, tolerance=1.220703e-4):
"""Finds a root of the function `f` within the interval `[min, max]`.
Examples
--------
>>> hl.eval(hl.uniroot(lambda x: x - 1, -5, 5))
1.0
Notes
-----
`f(min)` and `f(max)` must not have the same sign.
If no root can be found, the result of this call will be `NA` (missing).
:func:`.uniroot` returns an estimate for a root with accuracy
`4 * epsilon * abs(x) + tolerance`.
4*EPSILON*abs(x) + tol
Parameters
----------
f : function ( (arg) -> :class:`.Float64Expression`)
Must return a :class:`.Float64Expression`.
min : :class:`.Float64Expression`
max : :class:`.Float64Expression`
max_iter : `int`
The maximum number of iterations before giving up.
epsilon : `float`
The scaling factor in the accuracy of the root found.
tolerance : `float`
The constant factor in approximate accuracy of the root found.
Returns
-------
:class:`.Float64Expression`
The root of the function `f`.
"""
# Based on:
# https://github.com/wch/r-source/blob/e5b21d0397c607883ff25cca379687b86933d730/src/library/stats/src/zeroin.c
def error_if_missing(x):
res = f(x)
return case().when(is_defined(res), res).or_error(format("'uniroot': value of f(x) is missing for x = %.1e", x))
wrapped_f = hl.experimental.define_function(error_if_missing, 'float')
def uniroot(recur, a, b, c, fa, fb, fc, prev, iterations_remaining):
tol = 2 * epsilon * abs(b) + tolerance / 2
cb = c - b
t1 = fb / fc
t2 = fb / fa
q1 = fa / fc # = t1 / t2
pq = if_else(
a == c,
(cb * t1) / (t1 - 1.0), # linear
-t2 * (cb * q1 * (q1 - t1) - (b - a) * (t1 - 1.0)) / ((q1 - 1.0) * (t1 - 1.0) * (t2 - 1.0)),
) # quadratic
interpolated = if_else(
(sign(pq) == sign(cb))
& (0.75 * abs(cb) > abs(pq) + tol / 2) # b + pq within [b, c]
& (abs(pq) < abs(prev / 2)), # pq not too large
pq,
cb / 2,
)
new_step = if_else((abs(prev) >= tol) & (abs(fa) > abs(fb)), interpolated, cb / 2) # try interpolation
new_b = b + if_else(new_step < 0, hl.min(new_step, -tol), hl.max(new_step, tol))
new_fb = wrapped_f(new_b)
return if_else(
iterations_remaining == 0,
missing('float'),
if_else(
abs(fc) < abs(fb),
recur(b, c, b, fb, fc, fb, prev, iterations_remaining),
if_else(
(abs(cb / 2) <= tol) | (fb == 0),
b, # acceptable approximation found
if_else(
sign(new_fb) == sign(fc), # use c = b for next iteration if signs match
recur(b, new_b, b, fb, new_fb, fb, new_step, iterations_remaining - 1),
recur(b, new_b, c, fb, new_fb, fc, new_step, iterations_remaining - 1),
),
),
),
)
fmin = wrapped_f(min)
fmax = wrapped_f(max)
run_loop = hl.experimental.define_function(
lambda min, max, fmin, fmax: hl.experimental.loop(
uniroot, 'float', min, max, min, fmin, fmax, fmin, max - min, max_iter
),
'float',
'float',
'float',
'float',
)
return (
case()
.when(
min < max,
case()
.when(fmin * fmax <= 0, run_loop(min, max, fmin, fmax))
.or_error(
format(
"'uniroot': sign of endpoints must have opposite signs, got: f(min) = %.1e, f(max) = %.1e",
fmin,
fmax,
)
),
)
.or_error(format("'uniroot': min must be less than max in call to uniroot, got: min %.1e, max %.1e", min, max))
)
[docs]@typecheck(x=expr_float64, y=expr_float64, tolerance=expr_float64, absolute=expr_bool, nan_same=expr_bool)
def approx_equal(x, y, tolerance=1e-6, absolute=False, nan_same=False):
"""Tests whether two numbers are approximately equal.
Examples
--------
>>> hl.eval(hl.approx_equal(0.25, 0.2500001))
True
>>> hl.eval(hl.approx_equal(0.25, 0.251, tolerance=1e-3, absolute=True))
False
Parameters
----------
x : :class:`.NumericExpression`
y : :class:`.NumericExpression`
tolerance : :class:`.NumericExpression`
absolute : :class:`.BooleanExpression`
If True, compute ``abs(x - y) <= tolerance``. Otherwise, compute
``abs(x - y) <= max(tolerance * max(abs(x), abs(y)), 2 ** -1022)``.
nan_same : :class:`.BooleanExpression`
If True, then ``NaN == NaN`` will evaluate to True. Otherwise,
it will return False.
Returns
-------
:class:`.BooleanExpression`
"""
return _func("approxEqual", hl.tbool, x, y, tolerance, absolute, nan_same)
def _shift_op(x, y, op):
assert op in ('<<', '>>', '>>>')
t = x.dtype
if t == hl.tint64:
word_size = 64
zero = hl.int64(0)
else:
word_size = 32
zero = hl.int32(0)
indices, aggregations = unify_all(x, y)
return hl.bind(
lambda x, y: (
hl.case()
.when(y >= word_size, hl.sign(x) if op == '>>' else zero)
.when(y >= 0, construct_expr(ir.ApplyBinaryPrimOp(op, x._ir, y._ir), t, indices, aggregations))
.or_error('cannot shift by a negative value: ' + hl.str(x) + f" {op} " + hl.str(y))
),
x,
y,
)
def _bit_op(x, y, op):
if x.dtype == hl.tint32 and y.dtype == hl.tint32:
t = hl.tint32
else:
t = hl.tint64
coercer = coercer_from_dtype(t)
x = coercer.coerce(x)
y = coercer.coerce(y)
indices, aggregations = unify_all(x, y)
return construct_expr(ir.ApplyBinaryPrimOp(op, x._ir, y._ir), t, indices, aggregations)
[docs]@typecheck(x=expr_oneof(expr_int32, expr_int64), y=expr_oneof(expr_int32, expr_int64))
def bit_and(x, y):
"""Bitwise and `x` and `y`.
Examples
--------
>>> hl.eval(hl.bit_and(5, 3))
1
Notes
-----
See `the Python wiki <https://wiki.python.org/moin/BitwiseOperators>`__
for more information about bit operators.
Parameters
----------
x : :class:`.Int32Expression` or :class:`.Int64Expression`
y : :class:`.Int32Expression` or :class:`.Int64Expression`
Returns
-------
:class:`.Int32Expression` or :class:`.Int64Expression`
"""
return _bit_op(x, y, '&')
[docs]@typecheck(x=expr_oneof(expr_int32, expr_int64), y=expr_oneof(expr_int32, expr_int64))
def bit_or(x, y):
"""Bitwise or `x` and `y`.
Examples
--------
>>> hl.eval(hl.bit_or(5, 3))
7
Notes
-----
See `the Python wiki <https://wiki.python.org/moin/BitwiseOperators>`__
for more information about bit operators.
Parameters
----------
x : :class:`.Int32Expression` or :class:`.Int64Expression`
y : :class:`.Int32Expression` or :class:`.Int64Expression`
Returns
-------
:class:`.Int32Expression` or :class:`.Int64Expression`
"""
return _bit_op(x, y, '|')
[docs]@typecheck(x=expr_oneof(expr_int32, expr_int64), y=expr_oneof(expr_int32, expr_int64))
def bit_xor(x, y):
"""Bitwise exclusive-or `x` and `y`.
Examples
--------
>>> hl.eval(hl.bit_xor(5, 3))
6
Notes
-----
See `the Python wiki <https://wiki.python.org/moin/BitwiseOperators>`__
for more information about bit operators.
Parameters
----------
x : :class:`.Int32Expression` or :class:`.Int64Expression`
y : :class:`.Int32Expression` or :class:`.Int64Expression`
Returns
-------
:class:`.Int32Expression` or :class:`.Int64Expression`
"""
return _bit_op(x, y, '^')
[docs]@typecheck(x=expr_oneof(expr_int32, expr_int64), y=expr_int32)
def bit_lshift(x, y):
"""Bitwise left-shift `x` by `y`.
Examples
--------
>>> hl.eval(hl.bit_lshift(5, 3))
40
>>> hl.eval(hl.bit_lshift(1, 8))
256
Unlike Python, Hail integers are fixed-size (32 or 64 bits),
and bits extended beyond will be ignored:
>>> hl.eval(hl.bit_lshift(1, 31))
-2147483648
>>> hl.eval(hl.bit_lshift(1, 32))
0
>>> hl.eval(hl.bit_lshift(hl.int64(1), 32))
4294967296
>>> hl.eval(hl.bit_lshift(hl.int64(1), 64))
0
Notes
-----
See `the Python wiki <https://wiki.python.org/moin/BitwiseOperators>`__
for more information about bit operators.
Parameters
----------
x : :class:`.Int32Expression` or :class:`.Int64Expression`
y : :class:`.Int32Expression` or :class:`.Int64Expression`
Returns
-------
:class:`.Int32Expression` or :class:`.Int64Expression`
"""
return _shift_op(x, y, '<<')
[docs]@typecheck(x=expr_oneof(expr_int32, expr_int64), y=expr_int32, logical=builtins.bool)
def bit_rshift(x, y, logical=False):
"""Bitwise right-shift `x` by `y`.
Examples
--------
>>> hl.eval(hl.bit_rshift(256, 3))
32
With ``logical=False`` (default), the sign is preserved:
>>> hl.eval(hl.bit_rshift(-1, 1))
-1
With ``logical=True``, the sign bit is treated as any other:
>>> hl.eval(hl.bit_rshift(-1, 1, logical=True))
2147483647
Notes
-----
If `logical` is ``False``, then the shift is a sign-preserving right shift.
If `logical` is ``True``, then the shift is logical, with the sign bit
treated as any other bit.
See `the Python wiki <https://wiki.python.org/moin/BitwiseOperators>`__
for more information about bit operators.
Parameters
----------
x : :class:`.Int32Expression` or :class:`.Int64Expression`
y : :class:`.Int32Expression` or :class:`.Int64Expression`
logical : :obj:`bool`
Returns
-------
:class:`.Int32Expression` or :class:`.Int64Expression`
"""
if logical:
return _shift_op(x, y, '>>>')
else:
return _shift_op(x, y, '>>')
[docs]@typecheck(x=expr_oneof(expr_int32, expr_int64))
def bit_not(x):
"""Bitwise invert `x`.
Examples
--------
>>> hl.eval(hl.bit_not(0))
-1
Notes
-----
See `the Python wiki <https://wiki.python.org/moin/BitwiseOperators>`__
for more information about bit operators.
Parameters
----------
x : :class:`.Int32Expression` or :class:`.Int64Expression`
Returns
-------
:class:`.Int32Expression` or :class:`.Int64Expression`
"""
return construct_expr(ir.ApplyUnaryPrimOp('~', x._ir), x.dtype, x._indices, x._aggregations)
[docs]@typecheck(x=expr_oneof(expr_int32, expr_int64))
def bit_count(x):
"""Count the number of 1s in the in the `two's complement <https://en.wikipedia.org/wiki/Two%27s_complement>`__ binary representation of `x`.
Examples
--------
The binary representation of `7` is `111`, so:
>>> hl.eval(hl.bit_count(7))
3
Parameters
----------
x : :class:`.Int32Expression` or :class:`.Int64Expression`
Returns
----------
:class:`.Int32Expression`
"""
return construct_expr(ir.ApplyUnaryPrimOp('BitCount', x._ir), tint32, x._indices, x._aggregations)
[docs]@typecheck(array=expr_array(expr_numeric), elem=expr_numeric)
def binary_search(array, elem) -> Int32Expression:
"""Binary search `array` for the insertion point of `elem`.
Parameters
----------
array : :class:`.Expression` of type :class:`.tarray`
elem : :class:`.Expression`
Returns
-------
:class:`.Int32Expression`
Notes
-----
This function assumes that `array` is sorted in ascending order, and does
not perform any sortedness check. Missing values sort last.
The returned index is the lower bound on the insertion point of `elem` into
the ordered array, or the index of the first element in `array` not smaller
than `elem`. This is a value between 0 and the length of `array`, inclusive
(if all elements in `array` are smaller than `elem`, the returned value is
the length of `array` or the index of the first missing value, if one
exists).
If either `elem` or `array` is missing, the result is missing.
Examples
--------
>>> a = hl.array([0, 2, 4, 8])
>>> hl.eval(hl.binary_search(a, -1))
0
>>> hl.eval(hl.binary_search(a, 1))
1
>>> hl.eval(hl.binary_search(a, 10))
4
"""
c = coercer_from_dtype(array.dtype.element_type)
if not c.can_coerce(elem.dtype):
raise TypeError(
f"'binary_search': cannot search an array of type {array.dtype} for a value of type {elem.dtype}"
)
elem = c.coerce(elem)
return hl.switch(elem).when_missing(hl.missing(hl.tint32)).default(_lower_bound(array, elem))
@typecheck(s=expr_str)
def _escape_string(s):
return _func("escapeString", hl.tstr, s)
@typecheck(left=expr_any, right=expr_any, tolerance=expr_float64, absolute=expr_bool)
def _values_similar(left, right, tolerance=1e-6, absolute=False):
assert left.dtype == right.dtype
return (is_missing(left) & is_missing(right)) | (
(is_defined(left) & is_defined(right)) & _func("valuesSimilar", hl.tbool, left, right, tolerance, absolute)
)
@typecheck(coords=expr_array(expr_array(expr_float64)), radius=expr_float64)
def _locus_windows_per_contig(coords, radius):
rt = hl.ttuple(hl.tarray(hl.tint32), hl.tarray(hl.tint32))
return _func("locus_windows_per_contig", rt, coords, radius)
[docs]@typecheck(a=expr_array(), seed=nullable(builtins.int))
def shuffle(a, seed: Optional[builtins.int] = None) -> ArrayExpression:
"""Randomly permute an array
Example
-------
>>> hl.reset_global_randomness()
>>> hl.eval(hl.shuffle(hl.range(5)))
[4, 0, 2, 1, 3]
Parameters
----------
a : :class:`.ArrayExpression`
Array to permute.
seed : :obj:`int`, optional
Random seed.
Returns
-------
:class:`.ArrayExpression`
"""
return sorted(a, key=lambda _: hl.rand_unif(0.0, 1.0))
[docs]@typecheck(path=builtins.str, point_or_interval=expr_any)
def query_table(path, point_or_interval):
"""Query records from a table corresponding to a given point or range of keys.
Notes
-----
This function does not dispatch to a distributed runtime; it can be used inside
already-distributed queries such as in :meth:`.Table.annotate`.
Warning
-------
This function contains no safeguards against reading large amounts of data
using a single thread.
Parameters
----------
path : :class:`str`
Table path.
point_or_interval
Point or interval to query.
Returns
-------
:class:`.ArrayExpression`
"""
table = hl.read_table(path)
row_typ = table.row.dtype
key_typ = table.key.dtype
key_names = list(key_typ)
len = builtins.len
if len(key_typ) == 0:
raise ValueError("query_table: cannot query unkeyed table")
def coerce_endpoint(point):
if point.dtype == key_typ[0]:
point = hl.struct(**{key_names[0]: point})
ts = point.dtype
if isinstance(ts, tstruct):
i = 0
while i < len(ts):
if i >= len(key_typ):
raise ValueError(
f"query_table: queried with {len(ts)} key field(s), but table only has {len(key_typ)} key field(s)"
)
if key_typ[i] != ts[i]:
raise ValueError(
f"query_table: key mismatch at key field {i} ({list(ts.keys())[i]!r}): query type is {ts[i]}, table key type is {key_typ[i]}"
)
i += 1
if i == 0:
raise ValueError("query_table: cannot query with empty key")
point_size = builtins.len(point.dtype)
return hl.tuple([
hl.struct(**{
key_names[i]: (point[i] if i < point_size else hl.missing(key_typ[i]))
for i in builtins.range(builtins.len(key_typ))
}),
hl.int32(point_size),
])
else:
raise ValueError(
f"query_table: key mismatch: cannot query a table with key "
f"({', '.join(builtins.str(x) for x in key_typ.values())}) with query point type {point.dtype}"
)
if point_or_interval.dtype != key_typ[0] and isinstance(point_or_interval.dtype, hl.tinterval):
partition_interval = hl.interval(
start=coerce_endpoint(point_or_interval.start),
end=coerce_endpoint(point_or_interval.end),
includes_start=point_or_interval.includes_start,
includes_end=point_or_interval.includes_end,
)
else:
point = coerce_endpoint(point_or_interval)
partition_interval = hl.interval(start=point, end=point, includes_start=True, includes_end=True)
return construct_expr(
ir.ToArray(ir.ReadPartition(partition_interval._ir, reader=ir.PartitionNativeIntervalReader(path, row_typ))),
type=hl.tarray(row_typ),
indices=partition_interval._indices,
aggregations=partition_interval._aggregations,
)
@typecheck(msg=expr_str, result=expr_any)
def _console_log(msg, result):
indices, aggregations = unify_all(msg, result)
return construct_expr(ir.ConsoleLog(msg._ir, result._ir), result.dtype, indices, aggregations)