from typing import Optional, Union

import hail as hl
from hail.matrixtable import MatrixTable
from hail.table import Table

from .datasets_metadata import get_datasets_metadata


def _read_dataset(path: str) -> Union[Table, MatrixTable, hl.linalg.BlockMatrix]:
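    """Read a Hail object from ``path``, dispatching on the file extension.

    ``.ht`` paths are read as :class:`.Table`, ``.mt`` paths as :class:`.MatrixTable`,
    and ``.bm`` paths as :class:`.BlockMatrix`; any other extension raises ``ValueError``.
    """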
    if path.endswith('.ht'):
        return hl.read_table(path)
    elif path.endswith('.mt'):
        return hl.read_matrix_table(path)
    elif path.endswith('.bm'):
        return hl.linalg.BlockMatrix.read(path)
    raise ValueError(f'Invalid path: {path}. Can only load datasets with .ht, .mt, or .bm extensions.')


def load_dataset(
    name: str, version: Optional[str], reference_genome: Optional[str], region: str = 'us-central1', cloud: str = 'gcp'
) -> Union[Table, MatrixTable, hl.linalg.BlockMatrix]:
"""Load a genetic dataset from Hail's repository.
Example
-------
>>> # Load the gnomAD "HGDP + 1000 Genomes" dense MatrixTable with GRCh38 coordinates.
>>> mt = hl.experimental.load_dataset(name='gnomad_hgdp_1kg_subset_dense',
... version='3.1.2',
... reference_genome='GRCh38',
... region='us-central1',
... cloud='gcp')
Parameters
----------
name : :class:`str`
Name of the dataset to load.
version : :class:`str`, optional
Version of the named dataset to load (see available versions in
documentation). Possibly ``None`` for some datasets.
reference_genome : :class:`str`, optional
Reference genome build, ``'GRCh37'`` or ``'GRCh38'``. Possibly ``None``
for some datasets.
region : :class:`str`
Specify region for bucket, ``'us'``, ``'us-central1'``, or ``'europe-west1'``, (default is
``'us-central1'``).
cloud : :class:`str`
Specify if using Google Cloud Platform or Amazon Web Services,
``'gcp'`` or ``'aws'`` (default is ``'gcp'``).
Note
----
The ``'aws'`` `cloud` platform is currently only available for the ``'us'``
`region`.
Returns
-------
:class:`.Table`, :class:`.MatrixTable`, or :class:`.BlockMatrix`
"""
    valid_regions = {'us', 'us-central1', 'europe-west1'}
    if region not in valid_regions:
        raise ValueError(
            f'Specify valid region parameter, received: region={region!r}.\n'
            f'Valid region values are {valid_regions}.'
        )

    valid_clouds = {'gcp', 'aws'}
    if cloud not in valid_clouds:
        raise ValueError(
            f'Specify valid cloud parameter, received: cloud={cloud!r}.\n'
            f'Valid cloud platforms are {valid_clouds}.'
        )
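    # Look up the requested dataset in the packaged metadata and validate name, version,
    # and reference genome build.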
    datasets = get_datasets_metadata()
    names = set(datasets.keys())
    if name not in names:
        raise ValueError(f'{name} is not a dataset available in the repository.')

    versions = set(dataset['version'] for dataset in datasets[name]['versions'])
    if version not in versions:
        raise ValueError(
            f'Version {version!r} not available for dataset {name!r}.\n'
            f'Available versions: {versions}.'
        )

    reference_genomes = set(dataset['reference_genome'] for dataset in datasets[name]['versions'])
    if reference_genome not in reference_genomes:
        raise ValueError(
            f'Reference genome build {reference_genome!r} not available for dataset {name!r}.\n'
            f'Available reference genome builds: {reference_genomes}.'
        )
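    # Confirm the dataset is hosted on the requested cloud platform and in the requested region.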
    clouds = set(k for dataset in datasets[name]['versions'] for k in dataset['url'].keys())
    if cloud not in clouds:
        raise ValueError(f'Cloud platform {cloud!r} not available for dataset {name}.\nAvailable platforms: {clouds}.')

    regions = set(k for dataset in datasets[name]['versions'] for k in dataset['url'][cloud].keys())
    if region not in regions:
        raise ValueError(
            f'Region {region!r} not available for dataset {name!r} on cloud platform {cloud!r}.\n'
            f'Available regions: {regions}.'
        )
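    # Resolve the unique URL for the requested version, reference genome build, cloud, and region.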
    path = [
        dataset['url'][cloud][region]
        for dataset in datasets[name]['versions']
        if all([dataset['version'] == version, dataset['reference_genome'] == reference_genome])
    ]
    assert len(path) == 1
    path = path[0]
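    # If reading an AWS 's3://' path fails, retry with the 's3a://' scheme used by the Hadoop S3A connector.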
    if path.startswith('s3://'):
        try:
            dataset = _read_dataset(path)
        except hl.utils.java.FatalError:
            dataset = _read_dataset(path.replace('s3://', 's3a://'))
    else:
        dataset = _read_dataset(path)
    return dataset