Source code for hail.experimental.export_entries_by_col

import hail as hl
from hail.matrixtable import MatrixTable
from hail.typecheck import typecheck


@typecheck(
    mt=MatrixTable,
    path=str,
    batch_size=int,
    bgzip=bool,
    header_json_in_file=bool,
    use_string_key_as_file_name=bool,
)
def export_entries_by_col(
    mt: MatrixTable,
    path: str,
    batch_size: int = 256,
    bgzip: bool = True,
    header_json_in_file: bool = True,
    use_string_key_as_file_name: bool = False,
):
    """Export entries of the `mt` by column as separate text files.

    Examples
    --------
    >>> range_mt = hl.utils.range_matrix_table(10, 10)
    >>> range_mt = range_mt.annotate_entries(x = hl.rand_unif(0, 1))
    >>> hl.experimental.export_entries_by_col(range_mt, 'output/cols_files')

    Notes
    -----
    This function writes a directory with one file per column in `mt`. The
    files contain one tab-separated field (with header) for each row field
    and entry field in `mt`. The column fields of `mt` are written as JSON
    in the first line of each file, prefixed with a ``#``.

    The above will produce a directory at ``output/cols_files`` with the
    following files:

    .. code-block:: text

        $ ls -l output/cols_files
        total 80
        -rw-r--r-- 1 hail-dev wheel 712 Jan 25 17:19 index.tsv
        -rw-r--r-- 1 hail-dev wheel 712 Jan 25 17:19 part-00.tsv.bgz
        -rw-r--r-- 1 hail-dev wheel 712 Jan 25 17:19 part-01.tsv.bgz
        -rw-r--r-- 1 hail-dev wheel 712 Jan 25 17:19 part-02.tsv.bgz
        -rw-r--r-- 1 hail-dev wheel 712 Jan 25 17:19 part-03.tsv.bgz
        -rw-r--r-- 1 hail-dev wheel 712 Jan 25 17:19 part-04.tsv.bgz
        -rw-r--r-- 1 hail-dev wheel 712 Jan 25 17:19 part-05.tsv.bgz
        -rw-r--r-- 1 hail-dev wheel 712 Jan 25 17:19 part-06.tsv.bgz
        -rw-r--r-- 1 hail-dev wheel 712 Jan 25 17:19 part-07.tsv.bgz
        -rw-r--r-- 1 hail-dev wheel 712 Jan 25 17:19 part-08.tsv.bgz
        -rw-r--r-- 1 hail-dev wheel 712 Jan 25 17:19 part-09.tsv.bgz

        $ zcat output/cols_files/part-00.tsv.bgz
        #{"col_idx":0}
        row_idx x
        0       6.2501e-02
        1       7.0083e-01
        2       3.6452e-01
        3       4.4170e-01
        4       7.9177e-02
        5       6.2392e-01
        6       5.9920e-01
        7       9.7540e-01
        8       8.4848e-01
        9       3.7423e-01

    Due to overhead and file system limits related to having large numbers
    of open files, this function will iteratively export groups of columns.
    The `batch_size` parameter can control the size of these groups.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
    path : :class:`str`
        Path (directory) to write to.
    batch_size : :obj:`int`
        Number of columns to write per iteration.
    bgzip : :obj:`bool`
        BGZip output files.
    header_json_in_file : :obj:`bool`
        Include JSON header in each component file (if False, only written
        to index.tsv)
    use_string_key_as_file_name : :obj:`bool`
        Requires `mt` to have exactly one column key field, of type
        :py:data:`.tstr`. The flag is forwarded to the export backend,
        which presumably names each output file after that key value
        (naming is not visible from this function; confirm against the
        backend).

    Raises
    ------
    ValueError
        If `use_string_key_as_file_name` is True and `mt` is not keyed by
        a single string column key.
    """
    # Only inspect the column key when the caller opted into key-based naming.
    if use_string_key_as_file_name:
        col_key = mt.col_key
        keyed_by_single_string = len(col_key) == 1 and col_key[0].dtype == hl.tstr
        if not keyed_by_single_string:
            raise ValueError(
                f'parameter "use_string_key_as_file_name" requires a single string column key, found {list(col_key.dtype.values())}'
            )

    # Resolve the backend entry point before building the IR node, matching
    # the evaluation order of a single nested call expression.
    run_on_backend = hl.utils.java.Env.backend().execute

    # `batch_size` is handed to the backend under the name 'parallelism'.
    exporter_config = {
        'name': 'MatrixExportEntriesByCol',
        'parallelism': batch_size,
        'path': path,
        'bgzip': bgzip,
        'headerJsonInFile': header_json_in_file,
        'useStringKeyAsFileName': use_string_key_as_file_name,
    }
    run_on_backend(hl.ir.MatrixToValueApply(mt._mir, exporter_config))