Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/ted 568 #310

Merged
merged 16 commits into from
Oct 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -113,4 +113,5 @@ tests/reports/allure/report/
tests/reports/allure/test_results/
package-lock.json
package.json
/infra/digest_api/digest_service/project_requirements.txt
/infra/digest_api/digest_service/project_requirements.txt
.limes/*
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ PROJECT_PATH = $(shell pwd)
AIRFLOW_INFRA_FOLDER ?= ${PROJECT_PATH}/.airflow
RML_MAPPER_PATH = ${PROJECT_PATH}/.rmlmapper/rmlmapper.jar
XML_PROCESSOR_PATH = ${PROJECT_PATH}/.saxon/saxon-he-10.6.jar
LIMES_ALIGNMENT_PATH = $(PROJECT_PATH)/.limes/limes.jar
HOSTNAME = $(shell hostname)


Expand Down Expand Up @@ -212,13 +213,17 @@ init-rml-mapper:
@ mkdir -p ./.rmlmapper
@ wget -c https://api.bitbucket.org/2.0/repositories/Dragos0000/rml-mapper/src/master/rmlmapper.jar -P ./.rmlmapper

# Download the LIMES 1.7.9 jar into ./.limes (wget -c continues a partially downloaded file).
init-limes:
	@ echo -e "Limes folder initialisation!"
	@ mkdir -p ./.limes
	@ wget -c https://github.com/dice-group/LIMES/releases/download/1.7.9/limes.jar -P ./.limes

# Download the Saxon-HE 10.6 zip into .saxon/, unpack it there and delete the archive.
init-saxon:
	@ echo -e "$(BUILD_PRINT)Saxon folder initialization $(END_BUILD_PRINT)"
	@ wget -c https://kumisystems.dl.sourceforge.net/project/saxon/Saxon-HE/10/Java/SaxonHE10-6J.zip -P .saxon/
	@ cd .saxon && unzip SaxonHE10-6J.zip && rm -rf SaxonHE10-6J.zip

start-project-services: | start-airflow start-mongo init-rml-mapper start-allegro-graph start-metabase
start-project-services: | start-airflow start-mongo init-rml-mapper init-limes start-allegro-graph start-metabase
stop-project-services: | stop-airflow stop-mongo stop-allegro-graph stop-metabase

#-----------------------------------------------------------------------------
Expand Down Expand Up @@ -246,6 +251,7 @@ staging-dotenv-file: guard-VAULT_ADDR guard-VAULT_TOKEN vault-installed
@ echo ENVIRONMENT=staging >> .env
@ echo SUBDOMAIN=staging. >> .env
@ echo RML_MAPPER_PATH=${RML_MAPPER_PATH} >> .env
@ echo LIMES_ALIGNMENT_PATH=${LIMES_ALIGNMENT_PATH} >> .env
@ echo XML_PROCESSOR_PATH=${XML_PROCESSOR_PATH} >> .env
@ echo AIRFLOW_INFRA_FOLDER=~/airflow-infra/staging >> .env
@ echo AIRFLOW_WORKER_HOSTNAME=${HOSTNAME} >> .env
Expand All @@ -265,6 +271,7 @@ dev-dotenv-file: guard-VAULT_ADDR guard-VAULT_TOKEN vault-installed
@ echo ENVIRONMENT=dev >> .env
@ echo SUBDOMAIN= >> .env
@ echo RML_MAPPER_PATH=${RML_MAPPER_PATH} >> .env
@ echo LIMES_ALIGNMENT_PATH=${LIMES_ALIGNMENT_PATH} >> .env
@ echo XML_PROCESSOR_PATH=${XML_PROCESSOR_PATH} >> .env
@ echo AIRFLOW_INFRA_FOLDER=${AIRFLOW_INFRA_FOLDER} >> .env
@ echo AIRFLOW_WORKER_HOSTNAME=${HOSTNAME} >> .env
Expand All @@ -284,6 +291,7 @@ prod-dotenv-file: guard-VAULT_ADDR guard-VAULT_TOKEN vault-installed
@ echo ENVIRONMENT=prod >> .env
@ echo SUBDOMAIN= >> .env
@ echo RML_MAPPER_PATH=${RML_MAPPER_PATH} >> .env
@ echo LIMES_ALIGNMENT_PATH=${LIMES_ALIGNMENT_PATH} >> .env
@ echo XML_PROCESSOR_PATH=${XML_PROCESSOR_PATH} >> .env
@ echo AIRFLOW_INFRA_FOLDER=~/airflow-infra/prod >> .env
@ echo AIRFLOW_WORKER_HOSTNAME=${HOSTNAME} >> .env
Expand Down
30 changes: 25 additions & 5 deletions ted_sws/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@

__version__ = "0.0.1"

import json
import os
import pathlib

import dotenv

Expand All @@ -25,12 +27,14 @@
RUN_TEST_ENV_VAL = "test"
os.environ[RUN_ENV_NAME] = RUN_ENV_VAL


# SECRET_PATHS = ['mongo-db', 'github']
# SECRET_MOUNT = f'ted-{ENV}'
# VaultSecretsStore.default_secret_mount = SECRET_MOUNT
# VaultSecretsStore.default_secret_paths = SECRET_PATHS

PROJECT_PATH = pathlib.Path(__file__).parent.resolve()
SPARQL_PREFIXES_PATH = PROJECT_PATH / "resources" / "prefixes" / "prefixes.json"


class MongoDBConfig:

Expand Down Expand Up @@ -71,6 +75,12 @@ def RML_MAPPER_PATH(self) -> str:
return EnvConfigResolver().config_resolve()


class LimesAlignmentConfig:
    """
    Configuration entry for the LIMES alignment engine.
    """
    @property
    def LIMES_ALIGNMENT_PATH(self) -> str:
        # The value is whatever EnvConfigResolver().config_resolve() yields.
        # NOTE(review): presumably this is the LIMES_ALIGNMENT_PATH entry that the Makefile
        # writes to .env (path of limes.jar) — confirm against EnvConfigResolver.
        return EnvConfigResolver().config_resolve()


class AllegroConfig:
@property
def AGRAPH_SUPER_USER(self) -> str:
Expand All @@ -85,7 +95,7 @@ def ALLEGRO_HOST(self) -> str:
return EnvConfigResolver().config_resolve()

@property
def TRIPLE_STORE_ENDPOINT_URL(self)->str:
def TRIPLE_STORE_ENDPOINT_URL(self) -> str:
return EnvConfigResolver().config_resolve()


Expand Down Expand Up @@ -160,6 +170,7 @@ class TedAPIConfig:
def TED_API_URL(self) -> str:
return EnvConfigResolver().config_resolve()


class FusekiConfig:
@property
def FUSEKI_ADMIN_USER(self) -> str:
Expand All @@ -173,6 +184,7 @@ def FUSEKI_ADMIN_PASSWORD(self) -> str:
def FUSEKI_ADMIN_HOST(self) -> str:
return EnvConfigResolver().config_resolve()


class SFTPConfig:
@property
def SFTP_HOST(self) -> str:
Expand All @@ -188,16 +200,24 @@ def SFTP_USER(self) -> str:
return EnvConfigResolver().config_resolve()

@property
def SFTP_PASSWORD(self)->str:
def SFTP_PASSWORD(self) -> str:
return EnvConfigResolver().config_resolve()

@property
def SFTP_PATH(self)->str:
def SFTP_PATH(self) -> str:
return EnvConfigResolver().config_resolve()


class SPARQLConfig:
    """
    Configuration entry exposing the SPARQL prefixes shipped with the project.
    """

    @property
    def SPARQL_PREFIXES(self) -> dict:
        """
        Read the JSON file at SPARQL_PREFIXES_PATH and return its prefix definitions.

        :return: the value stored under the "prefix_definitions" key of that file
        """
        raw_json = SPARQL_PREFIXES_PATH.read_text(encoding="utf-8")
        parsed_content = json.loads(raw_json)
        return parsed_content["prefix_definitions"]


class TedConfigResolver(MongoDBConfig, RMLMapperConfig, XMLProcessorConfig, ELKConfig, LoggingConfig,
GitHubArtefacts, API, AllegroConfig, TedAPIConfig, SFTPConfig, FusekiConfig):
GitHubArtefacts, API, AllegroConfig, TedAPIConfig, SFTPConfig, FusekiConfig,
SPARQLConfig, LimesAlignmentConfig):
"""
This class resolve the secrets of the ted-sws project.
"""
Expand Down
Empty file.
Empty file.
28 changes: 28 additions & 0 deletions ted_sws/alignment_oracle/adapters/limes_alignment_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pathlib
import subprocess
import tempfile

from ted_sws.alignment_oracle.model.limes_config import LimesConfigParams
from ted_sws.alignment_oracle.services.limes_configurator import generate_xml_config_from_limes_config


class LimesAlignmentEngine:
    """
    This is an adapter for the LIMES executable (a jar started with ``java -jar``).
    """

    def __init__(self, limes_executable_path: pathlib.Path):
        """
        :param limes_executable_path: path of the LIMES jar handed to ``java -jar``
        """
        self.limes_executable_path = limes_executable_path

    def execute(self, limes_config_params: LimesConfigParams) -> str:
        """
        This method generates alignment links based on limes_config_params.

        The parameters are rendered to a LIMES XML configuration, written to a temporary
        file, and that file is passed to ``java -jar <limes jar> <config file>``.

        :param limes_config_params: values rendered into the LIMES XML configuration
        :return: the standard error output of the LIMES process, decoded as UTF-8
        :raises FileNotFoundError: if the ``java`` executable cannot be found
        """
        limes_xml_config = generate_xml_config_from_limes_config(limes_config_params=limes_config_params)
        # The context manager guarantees the temporary file is removed even if the
        # subprocess call raises.
        with tempfile.NamedTemporaryFile() as temp_file:
            temp_file.write(limes_xml_config.encode(encoding="utf-8"))
            # Without a flush the configuration may still sit in Python's write buffer,
            # and the child process would read an empty or truncated file.
            temp_file.flush()
            # An argument list (no shell) keeps paths containing spaces or shell
            # metacharacters intact.
            command = ["java", "-jar", str(self.limes_executable_path), temp_file.name]
            script_result = subprocess.run(command, capture_output=True)
        return script_result.stderr.decode('utf-8')
Empty file.
36 changes: 36 additions & 0 deletions ted_sws/alignment_oracle/model/limes_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from typing import List

from pydantic import BaseModel


class LimesDataSource(BaseModel):
    """
    This class provides a model for a LIMES alignment engine data source.

    The same model is used for both the SOURCE and the TARGET section of the
    limes_config.jinja2 template.
    """
    # Rendered into the <ID> element.
    id: str
    # Rendered into the <ENDPOINT> element.
    sparql_endpoint: str
    # Rendered into the <VAR> element.
    sparql_variable: str
    # Each entry is rendered as one <RESTRICTION> element.
    sparql_restrictions: List[str]
    # Each entry is rendered as one <PROPERTY> element.
    sparql_properties: List[str]


class LimesDataResult(BaseModel):
    """
    This class provides a model for a LIMES alignment engine result.

    The same model is used for both the ACCEPTANCE and the REVIEW section of the
    limes_config.jinja2 template.
    """
    # Rendered into the <THRESHOLD> element.
    threshold: float
    # Rendered into the <FILE> element.
    result_file_path: str
    # Rendered into the <RELATION> element.
    relation: str


class LimesConfigParams(BaseModel):
    """
    This class provides a model for LIMES alignment engine configurations.

    Its fields are the variables consumed by the limes_config.jinja2 template.
    """
    # Iterated as (label, namespace) pairs; each pair is rendered as one <PREFIX> element.
    prefixes: dict
    # Rendered into the <SOURCE> section.
    source: LimesDataSource
    # Rendered into the <TARGET> section.
    target: LimesDataSource
    # Rendered into the <METRIC> element.
    alignment_metric: str
    # Rendered into the <ACCEPTANCE> section.
    acceptance: LimesDataResult
    # Rendered into the <REVIEW> section.
    review: LimesDataResult
    # Rendered into the <OUTPUT> element.
    result_file_format: str
Empty file.
Empty file.
51 changes: 51 additions & 0 deletions ted_sws/alignment_oracle/resources/templates/limes_config.jinja2
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE LIMES SYSTEM "limes.dtd">
{# LIMES configuration template. Variables used: prefixes, source, target, alignment_metric, acceptance, review, result_file_format. #}
<LIMES>
{% for label, namespace in prefixes.items() -%}
<PREFIX>
<NAMESPACE>{{namespace}}</NAMESPACE>
<LABEL>{{label}}</LABEL>
</PREFIX>
{%- endfor %}
<SOURCE>
<ID>{{ source.id }}</ID>
<ENDPOINT>{{ source.sparql_endpoint }}</ENDPOINT>
<VAR>{{ source.sparql_variable }}</VAR>
<PAGESIZE>2000</PAGESIZE>
{% for restriction in source.sparql_restrictions -%}
<RESTRICTION>{{ restriction }}</RESTRICTION>
{%- endfor %}
{% for property in source.sparql_properties -%}
<PROPERTY>{{ property }}</PROPERTY>
{%- endfor %}
</SOURCE>
<TARGET>
<ID>{{ target.id }}</ID>
<ENDPOINT>{{ target.sparql_endpoint }}</ENDPOINT>
<VAR>{{ target.sparql_variable }}</VAR>
<PAGESIZE>2000</PAGESIZE>
{% for restriction in target.sparql_restrictions -%}
<RESTRICTION>{{ restriction }}</RESTRICTION>
{%- endfor %}
{% for property in target.sparql_properties -%}
<PROPERTY>{{ property }}</PROPERTY>
{%- endfor %}
</TARGET>
<METRIC>{{ alignment_metric }}</METRIC>
<ACCEPTANCE>
<THRESHOLD>{{ acceptance.threshold }}</THRESHOLD>
<FILE>{{ acceptance.result_file_path }}</FILE>
<RELATION>{{ acceptance.relation }}</RELATION>
</ACCEPTANCE>
<REVIEW>
<THRESHOLD>{{ review.threshold }}</THRESHOLD>
<FILE>{{ review.result_file_path }}</FILE>
<RELATION>{{ review.relation }}</RELATION>
</REVIEW>
<EXECUTION>
<REWRITER>default</REWRITER>
<PLANNER>default</PLANNER>
<ENGINE>default</ENGINE>
</EXECUTION>
<OUTPUT>{{ result_file_format }}</OUTPUT>
</LIMES>
Empty file.
30 changes: 30 additions & 0 deletions ted_sws/alignment_oracle/services/generate_alignment_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import pathlib
from ted_sws import config
from ted_sws.alignment_oracle.adapters.limes_alignment_engine import LimesAlignmentEngine
from ted_sws.alignment_oracle.model.limes_config import LimesConfigParams

# Cap applied to the acceptance threshold (threshold + delta never exceeds this value).
DEFAULT_MAX_ACCEPTANCE_THRESHOLD = 1.0
# Cap applied to the review threshold.
DEFAULT_MAX_REVIEW_THRESHOLD = 0.95
# Default gap added to the requested threshold to obtain the acceptance threshold.
DEFAULT_DELTA_THRESHOLD = 0.05


def generate_alignment_links(limes_config_params: LimesConfigParams, threshold: float,
                             delta: float = DEFAULT_DELTA_THRESHOLD) -> str:
    """
    Run the LIMES engine and return the generated alignment links as text.

    The review and acceptance thresholds stored in limes_config_params are overwritten
    in place before the engine is executed.

    :param limes_config_params: LIMES configuration; its two thresholds are modified by this call
    :param threshold: requested review threshold, capped at DEFAULT_MAX_REVIEW_THRESHOLD
    :param delta: value added to threshold to obtain the acceptance threshold;
                  the sum is capped at DEFAULT_MAX_ACCEPTANCE_THRESHOLD
    :return: content of the review result file, followed by the content of the acceptance
             result file when the acceptance threshold equals DEFAULT_MAX_ACCEPTANCE_THRESHOLD
    """
    limes_config_params.review.threshold = min(threshold, DEFAULT_MAX_REVIEW_THRESHOLD)
    limes_config_params.acceptance.threshold = min(threshold + delta, DEFAULT_MAX_ACCEPTANCE_THRESHOLD)

    engine = LimesAlignmentEngine(limes_executable_path=config.LIMES_ALIGNMENT_PATH)
    engine.execute(limes_config_params=limes_config_params)

    review_file = pathlib.Path(limes_config_params.review.result_file_path)
    links = review_file.read_text(encoding="utf-8")

    # Acceptance links are only appended when the acceptance threshold sits at its cap.
    if limes_config_params.acceptance.threshold != DEFAULT_MAX_ACCEPTANCE_THRESHOLD:
        return links

    acceptance_file = pathlib.Path(limes_config_params.acceptance.result_file_path)
    links += acceptance_file.read_text(encoding="utf-8")
    return links
61 changes: 61 additions & 0 deletions ted_sws/alignment_oracle/services/limes_configurator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pathlib
from typing import List

from jinja2 import Environment, PackageLoader

from ted_sws import config
from ted_sws.alignment_oracle.model.limes_config import LimesConfigParams, LimesDataSource, LimesDataResult

# Jinja2 environment loading templates from the "templates" folder of the
# ted_sws.alignment_oracle.resources package.
TEMPLATES = Environment(loader=PackageLoader("ted_sws.alignment_oracle.resources", "templates"))
# Name of the template used to render the LIMES XML configuration.
LIMES_CONFIG_TEMPLATE = "limes_config.jinja2"
# SPARQL variable names used for the source and target data sources.
DEFAULT_SOURCE_SPARQL_VAR = "x"
DEFAULT_TARGET_SPARQL_VAR = "y"
# Thresholds placed in the default acceptance and review sections.
DEFAULT_ACCEPTANCE_THRESHOLD = 0.95
DEFAULT_REVIEW_THRESHOLD = 0.7
# File names (inside the caller-supplied result directory) for the two result sets.
DEFAULT_ACCEPTANCE_FILE_NAME = "acceptance.ttl"
DEFAULT_REVIEW_FILE_NAME = "review.ttl"
# Prefixes taken from the project configuration at import time.
DEFAULT_PREFIXES = config.SPARQL_PREFIXES
# Value placed in result_file_format of the default configuration.
DEFAULT_RESULT_FILE_FORMAT = "TTL"
# Relation used for both the acceptance and the review section.
DEFAULT_RELATION = "owl:sameAs"
# Identifiers given to the default source and target data sources.
DEFAULT_SOURCE_ID = "default_source_id"
DEFAULT_TARGET_ID = "default_target_id"


def generate_xml_config_from_limes_config(limes_config_params: LimesConfigParams) -> str:
    """
    Render the LIMES XML configuration for the given parameters.

    :param limes_config_params: parameters whose fields become the template variables
    :return: the rendered content of the LIMES_CONFIG_TEMPLATE template
    """
    limes_template = TEMPLATES.get_template(LIMES_CONFIG_TEMPLATE)
    template_variables = limes_config_params.dict()
    return limes_template.render(template_variables)


def generate_default_limes_config_params(sparql_endpoint: str,
                                         result_dir_path: pathlib.Path,
                                         alignment_metric: str,
                                         source_sparql_restrictions: List[str],
                                         target_sparql_restrictions: List[str],
                                         source_sparql_properties: List[str],
                                         target_sparql_properties: List[str],
                                         ) -> LimesConfigParams:
    """
    Build a LimesConfigParams instance filled with the module defaults.

    Source and target both query the same SPARQL endpoint; the acceptance and review
    result files are placed inside result_dir_path under their default file names.

    :param sparql_endpoint: endpoint used for both the source and the target data source
    :param result_dir_path: directory that will hold the acceptance and review result files
    :param alignment_metric: metric expression stored in the configuration
    :param source_sparql_restrictions: restrictions of the source data source
    :param target_sparql_restrictions: restrictions of the target data source
    :param source_sparql_properties: properties of the source data source
    :param target_sparql_properties: properties of the target data source
    :return: the assembled configuration
    """
    acceptance_path = str(result_dir_path / DEFAULT_ACCEPTANCE_FILE_NAME)
    review_path = str(result_dir_path / DEFAULT_REVIEW_FILE_NAME)

    source_data = LimesDataSource(
        id=DEFAULT_SOURCE_ID,
        sparql_endpoint=sparql_endpoint,
        sparql_variable=DEFAULT_SOURCE_SPARQL_VAR,
        sparql_restrictions=source_sparql_restrictions,
        sparql_properties=source_sparql_properties,
    )
    target_data = LimesDataSource(
        id=DEFAULT_TARGET_ID,
        sparql_endpoint=sparql_endpoint,
        sparql_variable=DEFAULT_TARGET_SPARQL_VAR,
        sparql_restrictions=target_sparql_restrictions,
        sparql_properties=target_sparql_properties,
    )
    acceptance_result = LimesDataResult(
        threshold=DEFAULT_ACCEPTANCE_THRESHOLD,
        result_file_path=acceptance_path,
        relation=DEFAULT_RELATION,
    )
    review_result = LimesDataResult(
        threshold=DEFAULT_REVIEW_THRESHOLD,
        result_file_path=review_path,
        relation=DEFAULT_RELATION,
    )

    return LimesConfigParams(
        prefixes=DEFAULT_PREFIXES,
        source=source_data,
        target=target_data,
        alignment_metric=alignment_metric,
        acceptance=acceptance_result,
        review=review_result,
        result_file_format=DEFAULT_RESULT_FILE_FORMAT,
    )
Empty file.
6 changes: 6 additions & 0 deletions tests/e2e/alignment_oracle/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import pytest


@pytest.fixture
def limes_sparql_endpoint() -> str:
    """
    Provide the SPARQL query endpoint URL used by the alignment oracle e2e tests.
    """
    endpoint_url = "https://fuseki.ted-data.eu/test_limes/query"
    return endpoint_url
23 changes: 23 additions & 0 deletions tests/e2e/alignment_oracle/test_alignment_links_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pathlib
import tempfile

from ted_sws.alignment_oracle.services.generate_alignment_links import generate_alignment_links
from ted_sws.alignment_oracle.services.limes_configurator import generate_default_limes_config_params


def test_alignment_links_generation(limes_sparql_endpoint):
    """
    Generate alignment links against the test endpoint and check the result is not empty.
    """
    metric = "ADD(Jaccard(x.epo:hasLegalName, y.epo:hasLegalName), Jaccard(x.street, y.street))"
    source_restrictions = ["?x a org:Organization"]
    target_restrictions = ["?y a org:Organization"]
    source_properties = ["epo:hasLegalName", "legal:registeredAddress/locn:thoroughfare RENAME street"]
    target_properties = ["epo:hasLegalName", "legal:registeredAddress/locn:thoroughfare RENAME street"]

    with tempfile.TemporaryDirectory() as tmp_dir_path:
        # Result files are written inside the temporary directory, so the links must be
        # generated (and read back) before the directory is removed.
        result_dir = pathlib.Path(tmp_dir_path)
        limes_config_params = generate_default_limes_config_params(
            sparql_endpoint=limes_sparql_endpoint,
            result_dir_path=result_dir,
            alignment_metric=metric,
            source_sparql_restrictions=source_restrictions,
            source_sparql_properties=source_properties,
            target_sparql_restrictions=target_restrictions,
            target_sparql_properties=target_properties,
        )
        result_links = generate_alignment_links(limes_config_params=limes_config_params, threshold=0.95)
        assert result_links
Empty file.
Loading