Source code for hail.experimental.datasets

import json
import os
from typing import Optional, Union

import hail as hl
import pkg_resources


def _read_dataset(path: str) -> Union[hl.Table, hl.MatrixTable, hl.linalg.BlockMatrix]:
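    """Read a Hail object from `path`, dispatching on the file extension:
    ``.ht`` -> :class:`.Table`, ``.mt`` -> :class:`.MatrixTable`,
    ``.bm`` -> :class:`.BlockMatrix`."""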
    if path.endswith('.ht'):
        return hl.read_table(path)
    elif path.endswith('.mt'):
        return hl.read_matrix_table(path)
    elif path.endswith('.bm'):
        return hl.linalg.BlockMatrix.read(path)
    raise ValueError(f'Invalid path: {path}. Can only load datasets with .ht, .mt, or .bm extensions.')


def load_dataset(name: str,
                 version: Optional[str],
                 reference_genome: Optional[str],
                 region: str = 'us',
                 cloud: str = 'gcp') -> Union[hl.Table, hl.MatrixTable, hl.linalg.BlockMatrix]:
    """Load a genetic dataset from Hail's repository.

    Example
    -------
    >>> # Load the gnomAD "HGDP + 1000 Genomes" dense MatrixTable with GRCh38 coordinates.
    >>> mt = hl.experimental.load_dataset(name='gnomad_hgdp_1kg_subset_dense',
    ...                                   version='3.1.2',
    ...                                   reference_genome='GRCh38',
    ...                                   region='us',
    ...                                   cloud='gcp')

    Parameters
    ----------
    name : :class:`str`
        Name of the dataset to load.
    version : :class:`str`, optional
        Version of the named dataset to load (see available versions in
        documentation). Possibly ``None`` for some datasets.
    reference_genome : :class:`str`, optional
        Reference genome build, ``'GRCh37'`` or ``'GRCh38'``. Possibly ``None``
        for some datasets.
    region : :class:`str`
        Specify region for bucket, ``'us'`` or ``'eu'`` (default is ``'us'``).
    cloud : :class:`str`
        Specify if using Google Cloud Platform or Amazon Web Services,
        ``'gcp'`` or ``'aws'`` (default is ``'gcp'``).

    Note
    ----
    The ``'aws'`` `cloud` platform is currently only available for the
    ``'us'`` `region`. If `region` is ``'eu'``, `cloud` must be set to
    ``'gcp'``.

    Returns
    -------
    :class:`.Table`, :class:`.MatrixTable`, or :class:`.BlockMatrix`
    """
    valid_regions = {'us', 'eu'}
    if region not in valid_regions:
        raise ValueError(f'Specify valid region parameter,'
                         f' received: region={repr(region)}.\n'
                         f'Valid region values are {valid_regions}.')

    valid_clouds = {'gcp', 'aws'}
    if cloud not in valid_clouds:
        raise ValueError(f'Specify valid cloud parameter,'
                         f' received: cloud={repr(cloud)}.\n'
                         f'Valid cloud platforms are {valid_clouds}.')

    config_path = pkg_resources.resource_filename(__name__, 'datasets.json')
    assert os.path.exists(config_path), f'{config_path} does not exist'
    with open(config_path) as f:
        datasets = json.load(f)

    names = set(datasets.keys())
    if name not in names:
        raise ValueError(f'{name} is not a dataset available in the'
                         f' repository.')

    versions = set(dataset['version'] for dataset in datasets[name]['versions'])
    if version not in versions:
        raise ValueError(f'Version {repr(version)} not available for dataset'
                         f' {repr(name)}.\n'
                         f'Available versions: {versions}.')

    reference_genomes = set(dataset['reference_genome']
                            for dataset in datasets[name]['versions'])
    if reference_genome not in reference_genomes:
        raise ValueError(f'Reference genome build {repr(reference_genome)} not'
                         f' available for dataset {repr(name)}.\n'
                         f'Available reference genome builds:'
                         f' {reference_genomes}.')

    clouds = set(k for dataset in datasets[name]['versions'] for k in dataset['url'].keys())
    if cloud not in clouds:
        raise ValueError(f'Cloud platform {repr(cloud)} not available for'
                         f' dataset {name}.\n'
                         f'Available platforms: {clouds}.')

    regions = set(k for dataset in datasets[name]['versions'] for k in dataset['url'][cloud].keys())
    if region not in regions:
        raise ValueError(f'Region {repr(region)} not available for dataset'
                         f' {repr(name)} on cloud platform {repr(cloud)}.\n'
                         f'Available regions: {regions}.')

    path = [dataset['url'][cloud][region]
            for dataset in datasets[name]['versions']
            if all([dataset['version'] == version,
                    dataset['reference_genome'] == reference_genome])]
    assert len(path) == 1
    path = path[0]

    if path.startswith('s3://'):
        try:
            dataset = _read_dataset(path)
        except hl.utils.java.FatalError:
            dataset = _read_dataset(path.replace('s3://', 's3a://'))
    else:
        dataset = _read_dataset(path)
    return dataset
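

# A sketch of the entry layout that the lookups in `load_dataset` assume for
# 'datasets.json': each dataset name maps to a list of versions, and each
# version records its reference genome and a URL per cloud and region. The
# dataset name, version, and build below come from the docstring example; the
# bucket paths are hypothetical placeholders, not real URLs.
#
# {
#     "gnomad_hgdp_1kg_subset_dense": {
#         "versions": [
#             {
#                 "version": "3.1.2",
#                 "reference_genome": "GRCh38",
#                 "url": {
#                     "gcp": {
#                         "us": "gs://<us-bucket>/gnomad_hgdp_1kg_subset_dense.mt",
#                         "eu": "gs://<eu-bucket>/gnomad_hgdp_1kg_subset_dense.mt"
#                     },
#                     "aws": {
#                         "us": "s3://<us-bucket>/gnomad_hgdp_1kg_subset_dense.mt"
#                     }
#                 }
#             }
#         ]
#     }
# }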