Skip to content

Commit

Permalink
test: make graph variant tests more granular (#2540)
Browse files Browse the repository at this point in the history
The graph variant tests checked every variant of a graph in a single test,
so an issue with one format caused the test for the whole graph to fail.
The result was overly broad xfails.

This change splits the variant tests up into separate tests for each
variant.

This change is made in preparation for adding Python-based Graph variants.
  • Loading branch information
aucampia authored Aug 25, 2023
1 parent f34c476 commit 35e5543
Show file tree
Hide file tree
Showing 15 changed files with 195 additions and 86 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
227 changes: 156 additions & 71 deletions test/test_graph/test_variants.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,29 @@
from __future__ import annotations

import dataclasses
import itertools
import json
import logging
import os
import re
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path, PurePath
from test.data import TEST_DATA_DIR
from test.utils import GraphHelper
from test.utils.graph import GraphSource
from typing import (
ClassVar,
Collection,
DefaultDict,
Dict,
Iterable,
List,
Optional,
OrderedDict,
Pattern,
Tuple,
Type,
Union,
cast,
)
Expand All @@ -27,10 +33,11 @@

import rdflib.compare
import rdflib.util
from rdflib.graph import Dataset
from rdflib.graph import Dataset, _GraphT
from rdflib.namespace import XSD
from rdflib.term import URIRef
from rdflib.util import guess_format

MODULE_PATH = Path(__file__).parent

TEST_DIR = Path(__file__).parent.parent.absolute()
VARIANTS_DIR = TEST_DATA_DIR / "variants"
Expand All @@ -41,27 +48,23 @@
SUFFIX_FORMAT_MAP = {**rdflib.util.SUFFIX_FORMAT_MAP, "hext": "hext"}


@dataclass
@dataclass(frozen=True)
class GraphAsserts:
"""
A specification of asserts that must be checked against a graph. This is
read in from a JSON dict.
A specification of asserts that must be checked against a graph.
"""

quad_count: Optional[int] = None
exact_match: bool = False
has_subject_iris: Optional[List[str]] = None

def check(self, first_graph: Optional[Dataset], graph: Dataset) -> None:
def check(self, graph: Dataset) -> None:
"""
if `first_graph` is `None` then this is the first check before any
other graphs have been processed.
"""
if self.quad_count is not None:
assert self.quad_count == len(list(graph.quads()))
if first_graph is not None and self.exact_match:
GraphHelper.assert_quad_sets_equals(first_graph, graph)
if first_graph is None and self.has_subject_iris is not None:
if self.has_subject_iris is not None:
subjects_iris = {
f"{subject}"
for subject in graph.subjects()
Expand All @@ -70,25 +73,60 @@ def check(self, first_graph: Optional[Dataset], graph: Dataset) -> None:
assert set(self.has_subject_iris) == subjects_iris

@classmethod
def from_path(cls, path: Path) -> GraphAsserts:
def from_path(cls, path: Path):
with path.open("r") as f:
return cls(**json.load(f))
keys = dataclasses.fields(cls)
data = json.load(f)
return cls(**{key.name: data[key.name] for key in keys if key.name in data})


@dataclass(frozen=True)
class GraphVariantsMeta(GraphAsserts):
    """
    Meta information about a set of variants.

    Extends `GraphAsserts` with settings that apply to the whole set of
    variants of one graph. `GraphVariants.for_files` builds instances of
    this class from files whose variant key ends with ``-meta.json``.
    """

    # Public ID to use when loading the variants. When unset (or empty),
    # `GraphVariants.public_id` falls back to an ID generated from the key.
    public_id: Optional[str] = None
    # When true, variants are compared with
    # `GraphHelper.assert_quad_sets_equals` instead of the isomorphism check.
    exact_match: bool = False


# Preference rank for each serialization format; a lower rank is more
# preferred. `GraphVariants.__post_init__` sorts variants by this rank and
# gives formats that are not listed here a rank of 1000, so they sort last.
_VARIANT_PREFERENCE: Dict[str, int] = {
    format_name: rank
    for rank, format_name in enumerate(
        [
            "nquads",
            "nt",
            "ntriples",
            "turtle",
            "ttl",
            "trig",
            "xml",
            "hext",
        ]
    )
}


@dataclass(order=True)
class GraphVariants:
"""
Represents a graph with multiple variants in different files.
Represents multiple variants of a single graph in different files.
"""

key: str
variants: Dict[str, Path] = field(default_factory=OrderedDict)
asserts: GraphAsserts = field(default_factory=lambda: GraphAsserts())
variants: Dict[str, GraphSource] = field(default_factory=OrderedDict)
meta: GraphVariantsMeta = field(default_factory=GraphVariantsMeta)

_variant_regex: ClassVar[Pattern[str]] = re.compile(
r"^(.*?)(|[-]variant-[^/]+|[-]asserts)$"
r"^(.*?)(|[-]variant-[^/]+|[-]asserts|[-]meta)$"
)

def __post_init__(self) -> None:
    """
    Precompute ``ordered_variants``: the ``(variant_key, source)`` pairs
    from ``variants`` sorted from most to least preferred format.
    """

    def preference_rank(item):
        # item is a (variant_key, source) pair; formats without an explicit
        # rank get 1000 so that they sort after every ranked format.
        return _VARIANT_PREFERENCE.get(item[1].format, 1000)

    # sorted() is stable, so variants with equal rank keep the order in
    # which they appear in ``variants``.
    self.ordered_variants = sorted(self.variants.items(), key=preference_rank)

def pytest_param(
self,
marks: Optional[
Expand All @@ -97,9 +135,20 @@ def pytest_param(
) -> ParameterSet:
if marks is None:
marks = cast(Tuple[MarkDecorator], tuple())
logging.debug("self = %s", self)
return pytest.param(self, id=self.key, marks=marks)

@property
def public_id(self) -> str:
    """
    The public ID to use when loading variants: the one from ``meta`` if
    it is set and non-empty, otherwise one generated from ``key``.
    """
    explicit_id = self.meta.public_id
    if explicit_id:
        return explicit_id
    return f"example:rdflib:test:data:variant:{self.key}"

@property
def preferred_variant(self) -> Tuple[str, GraphSource]:
    """
    The ``(variant_key, source)`` pair that comes first in
    ``ordered_variants``.
    """
    most_preferred = self.ordered_variants[0]
    return most_preferred

def load(self, variant_key: str, graph_type: Type[_GraphT]) -> _GraphT:
    """
    Load the variant registered under ``variant_key``.

    The variant's source is loaded with this object's ``public_id`` and
    the requested ``graph_type``; whatever the source's ``load`` returns
    is returned unchanged. Raises `KeyError` if ``variant_key`` is not in
    ``variants``.
    """
    source = self.variants[variant_key]
    return source.load(public_id=self.public_id, graph_type=graph_type)

@classmethod
def _decompose_path(cls, file_path: Path, basedir: Optional[Path]):
if basedir:
Expand All @@ -117,47 +166,64 @@ def _decompose_path(cls, file_path: Path, basedir: Optional[Path]):
def for_files(
cls, file_paths: Iterable[Path], basedir: Optional[Path] = None
) -> Dict[str, "GraphVariants"]:
graph_varaint_dict: Dict[str, GraphVariants] = {}
graph_sources: DefaultDict[str, Dict[str, GraphSource]] = defaultdict(dict)
graph_meta: Dict[str, GraphVariantsMeta] = {}
for file_path in file_paths:
logging.debug("file_path = %s", file_path)
file_key, variant_key = cls._decompose_path(file_path, basedir)
# file_key = f"{file_path.parent / stem}"
if file_key not in graph_varaint_dict:
graph_variant = graph_varaint_dict[file_key] = GraphVariants(file_key)
file_graph_sources = graph_sources[file_key]
if variant_key.endswith("-meta.json"):
if file_key in graph_meta:
raise RuntimeError(f"Duplicate meta for {file_key} in {file_path}")
graph_meta[file_key] = GraphVariantsMeta.from_path(file_path)
else:
graph_variant = graph_varaint_dict[file_key]
if variant_key.endswith("-asserts.json"):
graph_variant.asserts = GraphAsserts.from_path(file_path)
if variant_key in file_graph_sources:
raise RuntimeError(
f"Duplicate variant {variant_key} for {file_key} in {file_path}"
)
file_graph_sources[variant_key] = GraphSource.from_path(file_path)
graph_variant_dict = {}
for file_key, variants in graph_sources.items():
if file_key in graph_meta:
meta = graph_meta[file_key]
del graph_meta[file_key]
else:
graph_variant.variants[variant_key] = file_path
return graph_varaint_dict
meta = GraphVariantsMeta()
if len(variants) < 2:
raise RuntimeError(f"Only one variant for {file_key}")
graph_variant_dict[file_key] = GraphVariants(file_key, variants, meta)
if graph_meta:
raise RuntimeError(f"Unmatched meta {graph_meta}")
return graph_variant_dict

@classmethod
def for_directory(
cls, directory: Path, basedir: Optional[Path] = None
) -> Dict[str, "GraphVariants"]:
file_paths = []
for file_path in directory.glob("**/*"):
for file_path in directory.glob("*"):
if not file_path.is_file():
continue
if file_path.name.endswith(".md"):
continue
file_paths.append(file_path)
logging.debug("file_paths = %s", file_paths)
return cls.for_files(file_paths, basedir)


GRAPH_VARIANT_DICT = {
GRAPH_VARIANTS_DICT = {
**GraphVariants.for_directory(VARIANTS_DIR, TEST_DATA_DIR),
**GraphVariants.for_files(EXTRA_FILES, TEST_DIR),
}

EXPECTED_FAILURES = {
("variants/schema_only_base"): pytest.mark.xfail(
EXPECTED_FAILURES: Dict[Tuple[str, Optional[str]], MarkDecorator] = {
("variants/schema_only_base", ".ttl"): pytest.mark.xfail(
reason="Some issue with handling base URI that does not end with a slash",
raises=ValueError,
),
("variants/schema_only_base", ".n3"): pytest.mark.xfail(
reason="Some issue with handling base URI that does not end with a slash",
raises=ValueError,
),
("variants/rdf11trig_eg2"): pytest.mark.xfail(
("variants/rdf11trig_eg2", ".hext"): pytest.mark.xfail(
reason="""
This fails randomly, passing less than 10% of the time, and always failing
with comparing hext against trig. Not clear why, it may be a big with hext
Expand All @@ -180,7 +246,7 @@ def for_directory(
""",
raises=AssertionError,
),
("variants/diverse_quads"): pytest.mark.xfail(
("variants/diverse_quads", ".trig"): pytest.mark.xfail(
reason="""
TriG parsing gets confused about what graph 'XSD string' appears in:
(rdflib.term.URIRef('example:subject'),
Expand All @@ -198,52 +264,71 @@ def for_directory(
def tests_found() -> None:
logging.debug("VARIANTS_DIR = %s", VARIANTS_DIR)
logging.debug("EXTRA_FILES = %s", EXTRA_FILES)
assert len(GRAPH_VARIANT_DICT) >= 1
logging.debug("ALL_VARIANT_GRAPHS = %s", GRAPH_VARIANT_DICT)
xml_literal = GRAPH_VARIANT_DICT.get("variants/xml_literal")
assert len(GRAPH_VARIANTS_DICT) >= 1
logging.debug("ALL_VARIANT_GRAPHS = %s", GRAPH_VARIANTS_DICT)
xml_literal = GRAPH_VARIANTS_DICT.get("variants/xml_literal")
assert xml_literal is not None
assert len(xml_literal.variants) >= 5
assert xml_literal.asserts.quad_count == 1
assert xml_literal.meta.quad_count == 1


@pytest.mark.parametrize(
"graph_variant",
[
graph_variant.pytest_param(EXPECTED_FAILURES.get(graph_variant.key))
for graph_variant in GRAPH_VARIANT_DICT.values()
],
)
def test_variants(graph_variant: GraphVariants) -> None:
# Cache of already-loaded preferred variants, keyed by `GraphVariants.key`,
# so that each preferred variant is loaded only once by `load_preferred`.
_PREFERRED_GRAPHS: Dict[str, Dataset] = {}


def load_preferred(graph_variants: GraphVariants) -> Dataset:
    """
    Return the dataset for the preferred variant of ``graph_variants``.

    The dataset is loaded on first use and cached in `_PREFERRED_GRAPHS`;
    later calls with the same key return the same `Dataset` object.
    """
    cache_key = graph_variants.key
    try:
        return _PREFERRED_GRAPHS[cache_key]
    except KeyError:
        pass

    preferred_key = graph_variants.preferred_variant[0]
    dataset = graph_variants.load(preferred_key, Dataset)
    # xsd:string datatypes are stripped before caching; presumably because
    # parsers disagree on when a bare string literal carries that datatype.
    GraphHelper.strip_literal_datatypes(dataset, {XSD.string})
    _PREFERRED_GRAPHS[cache_key] = dataset
    return dataset


def make_variant_source_cases() -> Iterable[ParameterSet]:
    """
    Yield one pytest parameter set per test case for `test_variant_source`.

    For every entry in `GRAPH_VARIANTS_DICT` this yields a case with a
    ``variant_key`` of `None` first, followed by one case for each variant
    after the preferred (first ordered) one. A case gets the mark from
    `EXPECTED_FAILURES` when its ``(key, variant_key)`` pair is listed there.
    """
    for graph_variants in GRAPH_VARIANTS_DICT.values():
        ordered = graph_variants.ordered_variants
        preferred_key = ordered[0][0]

        # None comes first, then the keys of all non-preferred variants.
        case_keys: List[Optional[str]] = [None]
        case_keys.extend(item[0] for item in ordered[1:])

        for variant_key in case_keys:
            expected_failure = EXPECTED_FAILURES.get(
                (graph_variants.key, variant_key)
            )
            marks = [] if expected_failure is None else [expected_failure]
            yield pytest.param(
                graph_variants,
                variant_key,
                marks=marks,
                id=f"{graph_variants.key}-{preferred_key}-{variant_key}",
            )


@pytest.mark.parametrize(["graph_variants", "variant_key"], make_variant_source_cases())
def test_variant_source(
graph_variants: GraphVariants, variant_key: Optional[str]
) -> None:
"""
All variants of a graph are isomorphic with the first variant, and thus
eachother.
"""
logging.debug("graph_variant = %s", graph_variant)
public_id = URIRef(f"example:{graph_variant.key}")
assert len(graph_variant.variants) > 0
first_graph: Optional[Dataset] = None
first_path: Optional[Path] = None
logging.debug("graph_variant.asserts = %s", graph_variant.asserts)

for variant_key, variant_path in graph_variant.variants.items():
logging.debug("variant_path = %s", variant_path)
format = guess_format(variant_path.name, fmap=SUFFIX_FORMAT_MAP)
assert format is not None, f"could not determine format for {variant_path.name}"
graph = Dataset()
graph.parse(variant_path, format=format, publicID=public_id)
# Stripping data types as different parsers (e.g. hext) have different
# opinions of when a bare string is of datatype XSD.string or not.
# Probably something that needs more investigation.
GraphHelper.strip_literal_datatypes(graph, {XSD.string})
graph_variant.asserts.check(first_graph, graph)
if first_graph is None:
first_graph = graph
first_path = variant_path
preferred_path = graph_variants.preferred_variant[1].path
preferred_graph: Dataset = load_preferred(graph_variants)

if variant_key is None:
# Only check asserts against the preferred variant, and only
# when not comparing variants.
graph_variants.meta.check(preferred_graph)
else:
variant_path = graph_variants.variants[variant_key].path
variant_graph = graph_variants.load(variant_key, Dataset)
GraphHelper.strip_literal_datatypes(variant_graph, {XSD.string})

if graph_variants.meta.exact_match:
GraphHelper.assert_quad_sets_equals(preferred_graph, variant_graph)
else:
assert first_path is not None
GraphHelper.assert_cgraph_isomorphic(
first_graph,
graph,
preferred_graph,
variant_graph,
False,
f"checking {variant_path.relative_to(VARIANTS_DIR)} against {first_path.relative_to(VARIANTS_DIR)}",
f"checking {variant_path.relative_to(VARIANTS_DIR)} against {preferred_path.relative_to(VARIANTS_DIR)}",
)
2 changes: 1 addition & 1 deletion test/utils/dawg_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def from_sources(
local_base,
public_id,
)
graph = source.load(public_id=public_id)
graph: Graph = source.load(public_id=public_id)
yield from cls.from_graph(
uri_mapper,
graph,
Expand Down
Loading

0 comments on commit 35e5543

Please sign in to comment.