Skip to content

Commit

Permalink
Merge pull request #347 from OP-TED/feature/TED-889
Browse files Browse the repository at this point in the history
Feature/ted 889
  • Loading branch information
CaptainOfHacks authored Nov 8, 2022
2 parents 31e393b + 2158191 commit b0e7ecf
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 40 deletions.
77 changes: 39 additions & 38 deletions ted_sws/master_data_registry/services/entity_deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,47 +49,39 @@ def generate_mdr_alignment_links(merged_rdf_fragments: rdflib.Graph, cet_uri: st
return alignment_graph


def clean_mdr_alignment_links(source_subjects: Set[rdflib.term.URIRef], alignment_graph: rdflib.Graph) -> rdflib.Graph:
"""
Clear redundant links from alignment graph.
:param source_subjects:
:param alignment_graph:
:return:
"""
for source_triple in alignment_graph.triples(triple=(None, OWL.sameAs, None)):
rdf_subject, rdf_predicate, rdf_object = source_triple
if (rdf_subject != rdf_object) and (rdf_subject in source_subjects):
for target_triple in alignment_graph.triples(triple=(rdf_object, OWL.sameAs, rdf_subject)):
alignment_graph.remove(triple=target_triple)
else:
alignment_graph.remove(triple=source_triple)

return alignment_graph


def reduce_link_set(alignment_graph: rdflib.Graph, source_subjects: set) -> rdflib.Graph:
def reduce_link_set(alignment_graph: rdflib.Graph, source_subjects: set, canonical_cets: set) -> rdflib.Graph:
"""
This function reduce number of alignment links from alignment graph.
:param alignment_graph:
:param source_subjects:
:param canonical_cets:
:return:
"""
reduced_graph = rdflib.Graph()

def copy_rdf_graph(graph: rdflib.Graph) -> rdflib.Graph:
copy_graph = rdflib.Graph()
for graph_triple in iter(graph):
copy_graph.add(graph_triple)
return copy_graph

reduced_graph = copy_rdf_graph(graph=alignment_graph)
found_transition = True
while found_transition:
found_transition = False
alignment_graph = clean_mdr_alignment_links(source_subjects=source_subjects,
alignment_graph=alignment_graph)
for triple in iter(alignment_graph):
save_current_triple = True
for transition_triple in alignment_graph.triples(triple=(triple[2], OWL.sameAs, None)):
save_current_triple = False
found_transition = True
alignment_graph.add((triple[0], triple[1], transition_triple[2]))
if save_current_triple:
reduced_graph.add(triple)
else:
alignment_graph.remove(triple)
for canonical_cet in canonical_cets:
for triple in alignment_graph.triples(triple=(None, OWL.sameAs, canonical_cet)):
if (triple[0] != triple[2]) and (triple[0] in source_subjects):
for transition_triple in alignment_graph.triples(triple=(None, OWL.sameAs, triple[0])):
found_transition = True
reduced_graph.remove(triple=transition_triple)
if triple[2] != transition_triple[0]:
reduced_graph.add(triple=(transition_triple[0], transition_triple[1], triple[2]))
else:
reduced_graph.remove(triple=triple)
if found_transition:
alignment_graph = copy_rdf_graph(graph=reduced_graph)
canonical_cets = [canonical_cet for canonical_cet in canonical_cets
if next(alignment_graph.triples(triple=(None,OWL.sameAs , canonical_cet)),None)]
return reduced_graph


Expand Down Expand Up @@ -154,10 +146,10 @@ def inject_similarity_links_in_notices(notices: List[Notice],
notice = notices_dict[notice_id]
inject_links = rdflib.Graph()
if inject_reflexive_links:
inject_links.add((root_uri, OWL.sameAs, root_uri))
else:
for triple in alignment_graph.triples(triple=(root_uri, OWL.sameAs, None)):
inject_links.add(triple)
else:
inject_links.add((root_uri, OWL.sameAs, root_uri))
notice.distilled_rdf_manifestation.object_data = '\n'.join([notice.distilled_rdf_manifestation.object_data,
str(inject_links.serialize(format="nt"))])

Expand All @@ -172,12 +164,21 @@ def create_mdr_alignment_links(cet_rdf_fragments: List[rdflib.Graph], cet_uri: s
:return:
"""
merged_rdf_fragments = merge_rdf_fragments_into_graph(rdf_fragments=cet_rdf_fragments)
alignment_graph = generate_mdr_alignment_links(merged_rdf_fragments=merged_rdf_fragments, cet_uri=cet_uri)
alignment_graph += generate_mdr_alignment_links(merged_rdf_fragments=merged_rdf_fragments, cet_uri=cet_uri,
mdr_sparql_endpoint=mdr_sparql_endpoint)
local_alignment_graph = generate_mdr_alignment_links(merged_rdf_fragments=merged_rdf_fragments, cet_uri=cet_uri)
mdr_alignment_graph = generate_mdr_alignment_links(merged_rdf_fragments=merged_rdf_fragments, cet_uri=cet_uri,
mdr_sparql_endpoint=mdr_sparql_endpoint)
canonical_cets = {triple[2] for triple in iter(mdr_alignment_graph)}
alignment_graph = local_alignment_graph + mdr_alignment_graph

source_subjects = {next(cet_rdf_fragment.triples(triple=(None, RDF.type, URIRef(cet_uri))))[0]
for cet_rdf_fragment in cet_rdf_fragments}
reduced_link_set = reduce_link_set(alignment_graph=alignment_graph, source_subjects=source_subjects)
if not canonical_cets:
canonical_cets = source_subjects
else:
source_subjects.difference_update(canonical_cets)
reduced_link_set = reduce_link_set(alignment_graph=alignment_graph, source_subjects=source_subjects,
canonical_cets=canonical_cets)

return reduced_link_set


Expand Down
25 changes: 23 additions & 2 deletions tests/e2e/master_data_registry/test_entity_deduplication.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import rdflib
from rdflib import OWL

from ted_sws.data_manager.adapters.triple_store import FusekiAdapter
from ted_sws.master_data_registry.services.entity_deduplication import deduplicate_entities_by_cet_uri

Expand All @@ -17,8 +20,9 @@

def test_deduplicate_entities_by_cet_uri(notice_with_rdf_manifestation, organisation_cet_uri):
fuseki_triple_store = FusekiAdapter()
if TEST_MDR_REPOSITORY not in fuseki_triple_store.list_repositories():
fuseki_triple_store.create_repository(repository_name=TEST_MDR_REPOSITORY)
if TEST_MDR_REPOSITORY in fuseki_triple_store.list_repositories():
fuseki_triple_store.delete_repository(repository_name=TEST_MDR_REPOSITORY)
fuseki_triple_store.create_repository(repository_name=TEST_MDR_REPOSITORY)
notice_with_rdf_manifestation.set_distilled_rdf_manifestation(
distilled_rdf_manifestation=notice_with_rdf_manifestation.rdf_manifestation.copy())
deduplicate_entities_by_cet_uri(notices=[notice_with_rdf_manifestation], cet_uri=organisation_cet_uri,
Expand All @@ -29,4 +33,21 @@ def test_deduplicate_entities_by_cet_uri(notice_with_rdf_manifestation, organisa
unique_cet_roots = sparql_endpoint.with_query(sparql_query=TEST_QUERY_UNIQUE_CET_ROOTS).fetch_tabular()
assert len(unique_names) == len(unique_cet_roots)

notice_rdf_content = notice_with_rdf_manifestation.distilled_rdf_manifestation.object_data
notice_rdf_graph = rdflib.Graph()
notice_rdf_graph.parse(data=notice_rdf_content, format="ttl")

non_canonical_same_as_triples = [triple for triple in notice_rdf_graph.triples(triple=(None, OWL.sameAs, None))]
canonical_cets_set = set(unique_cet_roots["s"].tolist())
for triple in non_canonical_same_as_triples:
assert str(triple[2]) in canonical_cets_set

canonical_cets_same_as_triples = []
for canonical_cet in canonical_cets_set:
for triple in notice_rdf_graph.triples(triple=(rdflib.URIRef(canonical_cet), OWL.sameAs, None)):
canonical_cets_same_as_triples.append(triple)

for triple in canonical_cets_same_as_triples:
assert str(triple[2]) in canonical_cets_set

fuseki_triple_store.delete_repository(repository_name=TEST_MDR_REPOSITORY)

0 comments on commit b0e7ecf

Please sign in to comment.