Skip to content

Commit

Permalink
Check for duplicate alleles
Browse files (browse the repository at this point in the history)
  • Loading branch information
hyanwong committed Sep 4, 2024
1 parent a080e8f commit 2aa6c46
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 0 deletions.
17 changes: 17 additions & 0 deletions tests/test_variantdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,3 +754,20 @@ def test_empty_alleles_not_at_end(self, tmp_path):
samples = tsinfer.VariantData(path, "variant_ancestral_allele")
with pytest.raises(ValueError, match="Empty alleles must be at the end"):
tsinfer.infer(samples)

def test_unique_alleles(self, tmp_path):
    """Check that VariantData rejects a site whose allele list has a repeat.

    A three-site haploid dataset is simulated, its alleles are overwritten
    so that the last site (index 2) lists "A" twice, and the dataset is
    saved to a zarr store. Constructing ``tsinfer.VariantData`` from that
    store is expected to raise ``ValueError`` naming site 2.
    """
    zarr_path = tmp_path / "data.zarr"
    dataset = sgkit.simulate_genotype_call_dataset(
        n_variant=3, n_sample=3, n_ploidy=1
    )

    # Reuse the simulated array's dimension names so only the values change.
    allele_dims = dataset["variant_allele"].dims
    # Sites 0 and 1 are valid (the empty string pads unused allele slots);
    # site 2 repeats "A", which is the condition under test.
    allele_table = np.array(
        [["A", "C", "T"], ["A", "C", ""], ["A", "A", ""]], dtype="S1"
    )
    dataset["variant_allele"] = (allele_dims, allele_table)

    ancestral = np.array(["A", "A", "A"], dtype="S1")
    dataset["variant_ancestral_allele"] = (["variants"], ancestral)

    sgkit.save_dataset(dataset, zarr_path)

    expected_message = "Duplicate allele values provided at site 2"
    with pytest.raises(ValueError, match=expected_message):
        tsinfer.VariantData(zarr_path, "variant_ancestral_allele")
6 changes: 6 additions & 0 deletions tsinfer/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -2398,6 +2398,12 @@ def __init__(
f"The ancestral allele {ancestral_allele} was not"
f" found in the dataset."
)
for i, (alleles, num_alleles) in enumerate(
zip(self.sites_alleles, self.num_alleles())
):
if len(set(alleles) - {b"", "", None}) != num_alleles:
raise ValueError(f"Duplicate allele values provided at site {i}")

self._sites_ancestral_allele = self._sites_ancestral_allele.astype(str)
unknown_alleles = collections.Counter()
converted = np.zeros(self.num_sites, dtype=np.int8)
Expand Down

0 comments on commit 2aa6c46

Please sign in to comment.