hail-is · danking · Dec 9, 2023 · Dec 8, 2023 · Dec 8, 2023 · Dec 9, 2023
diff --git a/hail/python/hail/vds/combiner/variant_dataset_combiner.py b/hail/python/hail/vds/combiner/variant_dataset_combiner.py
@@ -10,11 +10,12 @@
 
 import hail as hl
 from hail.expr import HailType, tmatrix
-from hail.utils import Interval
+from hail.utils import FatalError, Interval
 from hail.utils.java import info, warning
 from .combine import combine_variant_datasets, transform_gvcf, defined_entry_fields, make_variant_stream, \
     make_reference_stream, combine_r, calculate_even_genome_partitioning, \
     calculate_new_intervals, combine
+from ..variant_dataset import VariantDataset
 
 
 class VDSMetadata(NamedTuple):
@@ -223,8 +224,6 @@ def __init__(self,
                  gvcf_info_to_keep: Optional[Collection[str]] = None,
                  gvcf_reference_entry_fields_to_keep: Optional[Collection[str]] = None,
                  ):
-        if not (vdses or gvcfs):
-            raise ValueError("one of 'vdses' or 'gvcfs' must be nonempty")
         if gvcf_import_intervals:
             interval = gvcf_import_intervals[0]
             if not isinstance(interval.point_type, hl.tlocus):
@@ -345,12 +344,21 @@ def load(path) -> 'VariantDatasetCombiner':
         fs = hl.current_backend().fs
         with fs.open(path) as stream:
             combiner = json.load(stream, cls=Decoder)
+            combiner._raise_if_output_exists()
             if combiner._save_path != path:
                 warning('path/save_path mismatch in loaded VariantDatasetCombiner, using '
                         f'{path} as the new save_path for this combiner')
                 combiner._save_path = path
             return combiner
 
+    def _raise_if_output_exists(self):  # Raises FatalError if both parts of the VDS at _output_path have a _SUCCESS file.
+        fs = hl.current_backend().fs
+        ref_success_path = os.path.join(VariantDataset._reference_path(self._output_path), '_SUCCESS')
+        var_success_path = os.path.join(VariantDataset._variants_path(self._output_path), '_SUCCESS')
+        if fs.exists(ref_success_path) and fs.exists(var_success_path):  # a single _SUCCESS file alone does not raise
+            raise FatalError(f'combiner output already exists at {self._output_path}\n'
+                             'move or delete it before continuing')
+
     def to_dict(self) -> dict:
         """A serializable representation of this combiner."""
         intervals_typ = hl.tarray(hl.tinterval(hl.tlocus(self._reference_genome)))
@@ -399,14 +407,14 @@ def step(self):
             self._job_id += 1
 
     def _write_final(self, vds):
-        fd = hl.vds.VariantDataset.ref_block_max_length_field
+        fd = VariantDataset.ref_block_max_length_field
 
         if fd not in vds.reference_data.globals:
             info("VDS combiner: computing reference block max length...")
             max_len = vds.reference_data.aggregate_entries(
                 hl.agg.max(vds.reference_data.END + 1 - vds.reference_data.locus.position))
             info(f"VDS combiner: max reference block length is {max_len}")
-            vds = hl.vds.VariantDataset(reference_data=vds.reference_data.annotate_globals(**{fd: max_len}),
+            vds = VariantDataset(reference_data=vds.reference_data.annotate_globals(**{fd: max_len}),
                                         variant_data=vds.variant_data)
 
         vds.write(self._output_path)
@@ -548,7 +556,7 @@ def _step_gvcfs(self):
                                             globals=hl.struct(
                                                 g=hl.literal(ids).map(lambda s: hl.struct(__cols=[hl.struct(s=s)]))))
             variant_ht = combine(variant_ht)
-            vds = hl.vds.VariantDataset(reference_ht._unlocalize_entries('__entries', '__cols', ['s']),
+            vds = VariantDataset(reference_ht._unlocalize_entries('__entries', '__cols', ['s']),
                                         variant_ht._unlocalize_entries('__entries', '__cols',
                                                                        ['s'])._key_rows_by_assert_sorted('locus',
                                                                                                          'alleles'))
@@ -654,8 +662,9 @@ def maybe_load_from_saved_path(save_path: str) -> Optional[VariantDatasetCombine
                 combiner._target_records = target_records
                 combiner._gvcf_batch_size = gvcf_batch_size
                 return combiner
-            except (ValueError, TypeError, OSError, KeyError):
-                warning(f'file exists at {save_path}, but it is not a valid combiner plan, overwriting')
+            except (ValueError, TypeError, OSError, KeyError) as e:
+                warning(f'file exists at {save_path}, but it is not a valid combiner plan, overwriting\n'
+                        f'    caused by: {e}')
         return None
 
     # We do the first save_path check now after validating the arguments
@@ -792,23 +801,25 @@ def maybe_load_from_saved_path(save_path: str) -> Optional[VariantDatasetCombine
 
     vdses.sort(key=lambda x: x.n_samples, reverse=True)
 
-    return VariantDatasetCombiner(save_path=save_path,
-                                  output_path=output_path,
-                                  temp_path=temp_path,
-                                  reference_genome=reference_genome,
-                                  dataset_type=dataset_type,
-                                  branch_factor=branch_factor,
-                                  target_records=target_records,
-                                  gvcf_batch_size=gvcf_batch_size,
-                                  contig_recoding=contig_recoding,
-                                  call_fields=call_fields,
-                                  vdses=vdses,
-                                  gvcfs=gvcf_paths,
-                                  gvcf_import_intervals=intervals,
-                                  gvcf_external_header=gvcf_external_header,
-                                  gvcf_sample_names=gvcf_sample_names,
-                                  gvcf_info_to_keep=gvcf_info_to_keep,
-                                  gvcf_reference_entry_fields_to_keep=gvcf_reference_entry_fields_to_keep)
+    combiner = VariantDatasetCombiner(save_path=save_path,
+                                      output_path=output_path,
+                                      temp_path=temp_path,
+                                      reference_genome=reference_genome,
+                                      dataset_type=dataset_type,
+                                      branch_factor=branch_factor,
+                                      target_records=target_records,
+                                      gvcf_batch_size=gvcf_batch_size,
+                                      contig_recoding=contig_recoding,
+                                      call_fields=call_fields,
+                                      vdses=vdses,
+                                      gvcfs=gvcf_paths,
+                                      gvcf_import_intervals=intervals,
+                                      gvcf_external_header=gvcf_external_header,
+                                      gvcf_sample_names=gvcf_sample_names,
+                                      gvcf_info_to_keep=gvcf_info_to_keep,
+                                      gvcf_reference_entry_fields_to_keep=gvcf_reference_entry_fields_to_keep)
+    combiner._raise_if_output_exists()
+    return combiner
 
 
 def load_combiner(path: str) -> VariantDatasetCombiner: