Source code for hail.experimental.datasets

import json
import hail as hl

def load_dataset(name, version, reference_genome, config_file='gs://hail-datasets/datasets.json'):
    """Load a genetic dataset from Hail's repository.

    Example
    -------

    >>> # Load 1000 Genomes MatrixTable with GRCh38 coordinates
    >>> mt_1kg = hl.experimental.load_dataset(name='1000_genomes',   # doctest: +SKIP
    ...                                       version='phase3',
    ...                                       reference_genome='GRCh38')

    Parameters
    ----------
    name : :obj:`str`
        Name of the dataset to load.
    version : :obj:`str`
        Version of the named dataset to load
        (see available versions in documentation).
    reference_genome : `GRCh37` or `GRCh38`
        Reference genome build.
    config_file : :obj:`str`
        Path of the JSON file describing the available datasets. It is opened
        with :func:`hl.hadoop_open` and must parse to a list of entries, each
        with ``name``, ``version``, ``reference_genome`` and ``path`` keys.

    Returns
    -------
    :class:`.Table` or :class:`.MatrixTable`

    Raises
    ------
    ValueError
        If `name` is not listed in the configuration, if `version` is not
        listed for `name`, if `reference_genome` is not listed for that
        name/version pair, or if the resolved path ends in neither ``.ht``
        nor ``.mt``.
    """
    with hl.hadoop_open(config_file, 'r') as f:
        datasets = json.load(f)

    names = {entry['name'] for entry in datasets}
    if name not in names:
        raise ValueError('{} is not a dataset available in the repository.'.format(repr(name)))

    versions = {entry['version'] for entry in datasets if entry['name'] == name}
    if version not in versions:
        # Sorted so the message is deterministic; quoting mirrors the
        # reference-genome message below.
        raise ValueError(
            "Version {0} not available for dataset {1}. "
            "Available versions: {{'{2}'}}.".format(
                repr(version), repr(name), "','".join(sorted(versions))))

    # Restrict to the requested version as well as the name: a build that only
    # exists for a different version must be rejected here, otherwise the path
    # lookup below would find no matching entry.
    reference_genomes = {entry['reference_genome']
                         for entry in datasets
                         if entry['name'] == name and entry['version'] == version}
    if reference_genome not in reference_genomes:
        raise ValueError(
            "Reference genome build {0} not available for dataset {1}. "
            "Available reference genome builds: {{'{2}'}}.".format(
                repr(reference_genome), repr(name), "','".join(sorted(reference_genomes))))

    # The three checks above guarantee at least one matching entry; take the
    # first. Only trailing slashes are removed so that the extension test works
    # on directory-style paths while a leading '/' of an absolute path is kept.
    path = next(entry['path']
                for entry in datasets
                if entry['name'] == name
                and entry['version'] == version
                and entry['reference_genome'] == reference_genome).rstrip('/')

    if path.endswith('.ht'):
        return hl.read_table(path)
    if path.endswith('.mt'):
        return hl.read_matrix_table(path)
    raise ValueError('Invalid path {}: can only load datasets with .ht or .mt extensions.'.format(repr(path)))