import re
from collections import Counter
from hail.typecheck import nullable, sequenceof, typecheck_method
from hail.utils.java import Env, FatalError, warning
class Trio(object):
    """A proband together with its optional family, parent, and sex information.

    :param str s: Sample ID of proband.
    :param fam_id: Family ID.
    :type fam_id: str or None
    :param pat_id: Sample ID of father.
    :type pat_id: str or None
    :param mat_id: Sample ID of mother.
    :type mat_id: str or None
    :param is_female: Sex of proband.
    :type is_female: bool or None
    """

    @typecheck_method(s=str, fam_id=nullable(str), pat_id=nullable(str), mat_id=nullable(str), is_female=nullable(bool))
    def __init__(self, s, fam_id=None, pat_id=None, mat_id=None, is_female=None):
        self._s = s
        self._fam_id = fam_id
        self._pat_id = pat_id
        self._mat_id = mat_id
        self._is_female = is_female

    def __repr__(self):
        # Every field is rendered with repr(), so strings appear quoted.
        shown = (self.s, self.fam_id, self.pat_id, self.mat_id, self.is_female)
        return 'Trio(s=%s, fam_id=%s, pat_id=%s, mat_id=%s, is_female=%s)' % tuple(repr(v) for v in shown)

    def __str__(self):
        # Same layout as __repr__, but every field is rendered with str().
        shown = (self.s, self.fam_id, self.pat_id, self.mat_id, self.is_female)
        return 'Trio(s=%s, fam_id=%s, pat_id=%s, mat_id=%s, is_female=%s)' % tuple(str(v) for v in shown)

    def __eq__(self, other):
        # Two trios are equal only when all five stored fields match.
        if not isinstance(other, Trio):
            return False
        mine = (self._s, self._mat_id, self._pat_id, self._fam_id, self._is_female)
        theirs = (other._s, other._mat_id, other._pat_id, other._fam_id, other._is_female)
        return mine == theirs

    def __hash__(self):
        # Hashes the same five fields that __eq__ compares.
        return hash((self._s, self._pat_id, self._mat_id, self._fam_id, self._is_female))

    @property
    def s(self):
        """ID of proband in trio, never missing.

        :rtype: str
        """
        return self._s

    @property
    def pat_id(self):
        """ID of father in trio, may be missing.

        :rtype: str or None
        """
        return self._pat_id

    @property
    def mat_id(self):
        """ID of mother in trio, may be missing.

        :rtype: str or None
        """
        return self._mat_id

    @property
    def fam_id(self):
        """Family ID, may be missing.

        :rtype: str or None
        """
        return self._fam_id

    @property
    def is_male(self):
        """``True`` if the proband is a reported male, ``False`` if reported
        female, and ``None`` if no sex is defined.

        :rtype: bool or None
        """
        sex = self._is_female
        return None if sex is None else sex is False

    @property
    def is_female(self):
        """``True`` if the proband is a reported female, ``False`` if reported
        male, and ``None`` if no sex is defined.

        :rtype: bool or None
        """
        sex = self._is_female
        return None if sex is None else sex is True

    def is_complete(self):
        """Returns True if the trio has a defined mother and father.

        Only :meth:`mat_id` and :meth:`pat_id` are considered. ``s`` may
        never be missing, and :meth:`fam_id` and :meth:`is_female` may be
        missing in a complete trio.

        :rtype: bool
        """
        return not (self._pat_id is None or self._mat_id is None)

    def _restrict_to(self, ids):
        # Returns None when the proband is not in ``ids``; otherwise a copy of
        # this trio in which any parent not found in ``ids`` is set to None.
        if self._s in ids:
            father = self._pat_id if self._pat_id in ids else None
            mother = self._mat_id if self._mat_id in ids else None
            return Trio(self._s, self._fam_id, father, mother, self._is_female)
        return None

    def _sex_as_numeric_string(self):
        # Numeric sex code: "0" when undefined, "2" for female, "1" for male.
        if self._is_female is None:
            return "0"
        if self.is_female:
            return "2"
        return "1"

    def _to_fam_file_line(self):
        # Builds one tab-separated record: family, proband, father, mother,
        # sex code, and a constant "0" in the sixth column. Missing IDs are
        # written as "0".
        fam_id, pat_id, mat_id = (
            "0" if value is None else value for value in (self._fam_id, self._pat_id, self._mat_id)
        )
        return "\t".join((fam_id, self._s, pat_id, mat_id, self._sex_as_numeric_string(), "0"))
class Pedigree(object):
    """Class containing a list of trios, with extra functionality.

    :param trios: list of trio objects to include in pedigree
    :type trios: list of :class:`.Trio`
    """

    @typecheck_method(trios=sequenceof(Trio))
    def __init__(self, trios):
        # Stored as a tuple, which is what __hash__ and __eq__ operate on.
        self._trios = tuple(trios)

    def __eq__(self, other):
        return isinstance(other, Pedigree) and self._trios == other._trios

    def __hash__(self):
        return hash(self._trios)

    def __iter__(self):
        return iter(self._trios)

    @classmethod
    @typecheck_method(fam_path=str, delimiter=str)
    def read(cls, fam_path, delimiter='\\s+') -> 'Pedigree':
        """Read a PLINK .fam file and return a pedigree object.

        **Examples**

        >>> ped = hl.Pedigree.read('data/test.fam')

        Notes
        -------

        See `PLINK .fam file <https://www.cog-genomics.org/plink2/formats#fam>`_ for
        the required format.

        Every line must split into exactly six fields. A value of ``0`` in
        the family, father, or mother column is read as missing. A sex value
        other than ``1`` (male) or ``2`` (female) is read as missing and
        reported in a warning. The sixth field is read but discarded.

        :param str fam_path: path to .fam file.

        :param str delimiter: Field delimiter, interpreted as a regular expression.

        :rtype: :class:`.Pedigree`

        :raises FatalError: if a line does not have six fields, or if a
            proband ID occurs more than once.
        """
        # Compile once up front instead of resolving the pattern on every line.
        splitter = re.compile(delimiter)

        trios = []
        missing_sex_ids = []
        with Env.fs().open(fam_path) as file:
            for line in file:
                fields = splitter.split(line.strip())
                num_fields = len(fields)
                if num_fields != 6:
                    raise FatalError(
                        "Require 6 fields per line in .fam, but this line has {}: {}".format(num_fields, line)
                    )
                fam, kid, dad, mom, sex, _ = fields
                # 1 is male, 2 is female, 0 is unknown.
                is_female = sex == "2" if sex in {'1', '2'} else None
                if is_female is None:
                    missing_sex_ids.append(kid)
                trios.append(
                    Trio(
                        kid,
                        fam if fam != "0" else None,
                        dad if dad != "0" else None,
                        mom if mom != "0" else None,
                        is_female,
                    )
                )

        proband_counts = Counter(trio.s for trio in trios)
        duplicate_ids = [sample_id for sample_id, count in proband_counts.items() if count > 1]
        if duplicate_ids:
            raise FatalError("Invalid pedigree: found duplicate proband IDs\n{}".format(duplicate_ids))

        if missing_sex_ids:
            # Proband IDs are unique at this point (duplicates raised above),
            # so the list length is the sample count. IDs are listed in file
            # order rather than as the repr of a set.
            warning(
                "Found {} samples with missing sex information (not 1 or 2).\n Missing samples: [{}]".format(
                    len(missing_sex_ids), ", ".join(missing_sex_ids)
                )
            )
        return Pedigree(trios)

    @property
    def trios(self):
        """Trio objects in this pedigree.

        :rtype: tuple of :class:`.Trio`
        """
        return self._trios

    def complete_trios(self):
        """List of trio objects that have a defined father and mother.

        :rtype: list of :class:`.Trio`
        """
        return [trio for trio in self.trios if trio.is_complete()]

    @typecheck_method(samples=sequenceof(nullable(str)))
    def filter_to(self, samples):
        """Filter the pedigree to a given list of sample IDs.

        **Notes**

        For any trio, the following steps will be applied:

        - If the proband is not in the list of samples provided, the trio is removed.
        - If the father is not in the list of samples provided, `pat_id` is set to ``None``.
        - If the mother is not in the list of samples provided, `mat_id` is set to ``None``.

        Parameters
        ----------
        samples: :obj:`list` [:obj:`str`]
            Sample IDs to keep.

        Returns
        -------
        :class:`.Pedigree`
        """
        # Build the set once so each membership test is a constant-time lookup.
        sample_set = set(samples)
        restricted = (trio._restrict_to(sample_set) for trio in self._trios)
        return Pedigree([trio for trio in restricted if trio is not None])

    @typecheck_method(path=str)
    def write(self, path):
        """Write a .fam file to the given path.

        **Examples**

        >>> ped = hl.Pedigree.read('data/test.fam')
        >>> ped.write('output/out.fam')

        **Notes**

        This method writes a `PLINK .fam file <https://www.cog-genomics.org/plink2/formats#fam>`_.

        .. caution::

            Phenotype information is not preserved in the Pedigree data
            structure in Hail. Reading and writing a PLINK .fam file will
            result in loss of this information. Use :func:`~.import_fam` to
            manipulate this information.

        :param path: output path
        :type path: str
        """
        # Render every line before opening the output for writing.
        lines = [trio._to_fam_file_line() for trio in self._trios]
        with Env.fs().open(path, mode="w") as file:
            for line in lines:
                file.write(line + "\n")