Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[query/vds combiner] Change sanity checks on combiner construction #14087

Merged
merged 3 commits into from
Dec 9, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 36 additions & 25 deletions hail/python/hail/vds/combiner/variant_dataset_combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@

import hail as hl
from hail.expr import HailType, tmatrix
from hail.utils import Interval
from hail.utils import FatalError, Interval
from hail.utils.java import info, warning
from .combine import combine_variant_datasets, transform_gvcf, defined_entry_fields, make_variant_stream, \
make_reference_stream, combine_r, calculate_even_genome_partitioning, \
calculate_new_intervals, combine
from ..variant_dataset import VariantDataset


class VDSMetadata(NamedTuple):
Expand Down Expand Up @@ -223,8 +224,6 @@ def __init__(self,
gvcf_info_to_keep: Optional[Collection[str]] = None,
gvcf_reference_entry_fields_to_keep: Optional[Collection[str]] = None,
):
if not (vdses or gvcfs):
raise ValueError("one of 'vdses' or 'gvcfs' must be nonempty")
if gvcf_import_intervals:
interval = gvcf_import_intervals[0]
if not isinstance(interval.point_type, hl.tlocus):
Expand Down Expand Up @@ -345,12 +344,21 @@ def load(path) -> 'VariantDatasetCombiner':
fs = hl.current_backend().fs
with fs.open(path) as stream:
combiner = json.load(stream, cls=Decoder)
combiner._raise_if_output_exists()
if combiner._save_path != path:
warning('path/save_path mismatch in loaded VariantDatasetCombiner, using '
f'{path} as the new save_path for this combiner')
combiner._save_path = path
return combiner

def _raise_if_output_exists(self):
    """Refuse to continue when output is already present at ``self._output_path``.

    Looks for a ``_SUCCESS`` file under both the reference-data path and the
    variant-data path that ``VariantDataset`` derives from the output path.

    Raises
    ------
    FatalError
        If both ``_SUCCESS`` files exist.
    """
    fs = hl.current_backend().fs
    output_path = self._output_path
    success_markers = (
        os.path.join(VariantDataset._reference_path(output_path), '_SUCCESS'),
        os.path.join(VariantDataset._variants_path(output_path), '_SUCCESS'),
    )
    # Both markers must be present to count as existing output; the check
    # stops at the first marker that is missing.
    if not all(fs.exists(marker) for marker in success_markers):
        return
    raise FatalError(f'combiner output already exists at {output_path}\n'
                     'move or delete it before continuing')

def to_dict(self) -> dict:
"""A serializable representation of this combiner."""
intervals_typ = hl.tarray(hl.tinterval(hl.tlocus(self._reference_genome)))
Expand Down Expand Up @@ -399,14 +407,14 @@ def step(self):
self._job_id += 1

def _write_final(self, vds):
fd = hl.vds.VariantDataset.ref_block_max_length_field
fd = VariantDataset.ref_block_max_length_field

if fd not in vds.reference_data.globals:
info("VDS combiner: computing reference block max length...")
max_len = vds.reference_data.aggregate_entries(
hl.agg.max(vds.reference_data.END + 1 - vds.reference_data.locus.position))
info(f"VDS combiner: max reference block length is {max_len}")
vds = hl.vds.VariantDataset(reference_data=vds.reference_data.annotate_globals(**{fd: max_len}),
vds = VariantDataset(reference_data=vds.reference_data.annotate_globals(**{fd: max_len}),
variant_data=vds.variant_data)

vds.write(self._output_path)
Expand Down Expand Up @@ -548,7 +556,7 @@ def _step_gvcfs(self):
globals=hl.struct(
g=hl.literal(ids).map(lambda s: hl.struct(__cols=[hl.struct(s=s)]))))
variant_ht = combine(variant_ht)
vds = hl.vds.VariantDataset(reference_ht._unlocalize_entries('__entries', '__cols', ['s']),
vds = VariantDataset(reference_ht._unlocalize_entries('__entries', '__cols', ['s']),
variant_ht._unlocalize_entries('__entries', '__cols',
['s'])._key_rows_by_assert_sorted('locus',
'alleles'))
Expand Down Expand Up @@ -654,8 +662,9 @@ def maybe_load_from_saved_path(save_path: str) -> Optional[VariantDatasetCombine
combiner._target_records = target_records
combiner._gvcf_batch_size = gvcf_batch_size
return combiner
except (ValueError, TypeError, OSError, KeyError):
warning(f'file exists at {save_path}, but it is not a valid combiner plan, overwriting')
except (ValueError, TypeError, OSError, KeyError) as e:
warning(f'file exists at {save_path}, but it is not a valid combiner plan, overwriting\n'
f' caused by: {e}')
return None

# We do the first save_path check now after validating the arguments
Expand Down Expand Up @@ -792,23 +801,25 @@ def maybe_load_from_saved_path(save_path: str) -> Optional[VariantDatasetCombine

vdses.sort(key=lambda x: x.n_samples, reverse=True)

return VariantDatasetCombiner(save_path=save_path,
output_path=output_path,
temp_path=temp_path,
reference_genome=reference_genome,
dataset_type=dataset_type,
branch_factor=branch_factor,
target_records=target_records,
gvcf_batch_size=gvcf_batch_size,
contig_recoding=contig_recoding,
call_fields=call_fields,
vdses=vdses,
gvcfs=gvcf_paths,
gvcf_import_intervals=intervals,
gvcf_external_header=gvcf_external_header,
gvcf_sample_names=gvcf_sample_names,
gvcf_info_to_keep=gvcf_info_to_keep,
gvcf_reference_entry_fields_to_keep=gvcf_reference_entry_fields_to_keep)
combiner = VariantDatasetCombiner(save_path=save_path,
output_path=output_path,
temp_path=temp_path,
reference_genome=reference_genome,
dataset_type=dataset_type,
branch_factor=branch_factor,
target_records=target_records,
gvcf_batch_size=gvcf_batch_size,
contig_recoding=contig_recoding,
call_fields=call_fields,
vdses=vdses,
gvcfs=gvcf_paths,
gvcf_import_intervals=intervals,
gvcf_external_header=gvcf_external_header,
gvcf_sample_names=gvcf_sample_names,
gvcf_info_to_keep=gvcf_info_to_keep,
gvcf_reference_entry_fields_to_keep=gvcf_reference_entry_fields_to_keep)
combiner._raise_if_output_exists()
return combiner


def load_combiner(path: str) -> VariantDatasetCombiner:
Expand Down