Merge pull request #24 from linkml/inline-docs

Additional inline docs and tests

cmungall authored Oct 26, 2022
2 parents 24e53cc + 969e216 commit b53947b
Showing 11 changed files with 237 additions and 51 deletions.
2 changes: 1 addition & 1 deletion src/prefixmaps/data/linked_data.curated.yaml
@@ -11,7 +11,7 @@ prefixes:
schema: http://schema.org/
bibo: http://purl.org/ontology/bibo/
swrl: http://www.w3.org/2003/11/swrl#
sh: https://w3id.org/shacl/
sh: http://www.w3.org/ns/shacl#
owl: http://www.w3.org/2002/07/owl#
qb: http://purl.org/linked-data/cube#
prov: http://www.w3.org/ns/prov#
107 changes: 88 additions & 19 deletions src/prefixmaps/datamodel/context.py
@@ -1,3 +1,5 @@
"""Classes for managing individual Contexts."""

import re
from dataclasses import dataclass, field
from enum import Enum
@@ -22,35 +24,76 @@

class StatusType(Enum):
"""
Classification of prefix expansions
Classification of prefix expansions.
Note that only canonical mappings are exposed to the users of the library. However,
it can be useful for prefixmap ETL pipelines to include non-canonical mappings for
purposes of debugging.
"""

canonical = "canonical"
"""The canonical prefix expansion for a prefix. The set of all canonical mappings must be bijective."""

prefix_alias = "prefix_alias"
"""The prefix is an alias for an existing canonical prefix."""

namespace_alias = "namespace_alias"
"""The prefix is an alias for an existing canonical namespace."""

multi_alias = "multi_alias"
"""Both the prefix and the namespace are aliases for existing canonical namespaces."""


@dataclass
class PrefixExpansion:
"""
An individual mapping between a prefix and a namespace
An individual mapping between a prefix and a namespace.
A PrefixExpansion corresponds to a SHACL PrefixDeclaration (https://www.w3.org/TR/shacl/#dfn-prefix-declarations)
"""

context: CONTEXT
"""Each PrefixExpansion is grouped into a context."""

prefix: PREFIX
"""Corresponds to http://www.w3.org/ns/shacl#prefix"""

namespace: NAMESPACE
"""Corresponds to http://www.w3.org/ns/shacl#namespace"""

status: StatusType
"""Indicates whether the expansion is canonical, a prefix alias, a namespace alias, or both."""

def canonical(self) -> bool:
"""
True if this is the canonical expansions
True if this is the canonical mapping in both directions.
:return:
Note that canonicality is always relative to a context:
- ("GEO", "http://purl.obolibrary.org/obo/geo/") is canonical in the OBO Foundry context
:return: True if the status is canonical
"""
return self.status == StatusType.canonical
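
# Illustrative sketch only (not part of the diff): exercising the StatusType and
# PrefixExpansion classes documented above, assuming this file maps to the
# module prefixmaps.datamodel.context.
from prefixmaps.datamodel.context import PrefixExpansion, StatusType

# A canonical expansion and a prefix alias for the same namespace.
pe = PrefixExpansion(
    context="obo",
    prefix="GO",
    namespace="http://purl.obolibrary.org/obo/GO_",
    status=StatusType.canonical,
)
alias = PrefixExpansion(
    context="obo",
    prefix="go",
    namespace="http://purl.obolibrary.org/obo/GO_",
    status=StatusType.prefix_alias,
)
assert pe.canonical() and not alias.canonical()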

def validate(self) -> List[str]:
"""
Validate the prefix expansion.
- Ensures that prefixes conform to W3C CURIE syntax
- Ensures that namespaces conform to a restricted subset of W3C URI syntax
Note that we use a highly restricted syntax in order to filter out pseudo-semantic
URIs. These include URLs for websites intended for humans that have http parameters
with `?`s, `=`s, etc.
These URLs are almost NEVER intended to be used as semantic URIs, i.e. as subjects of
RDF triples. It is almost always bad practice to use them as such.
In future, if we discover exceptions to this rule, we will add them here.
:return: list of validation errors
"""
messages = []
if not PREFIX_RE.match(self.prefix):
messages.append(f"prefix {self.prefix} does not match {PREFIX_RE}")
@@ -65,13 +108,27 @@ def validate(self) -> List[str]:
@dataclass
class Context:
"""
A context is a localized collection of prefix expansions
A context is a localized collection of prefix expansions.
A context should be internally consistent:
- the set of canonical PrefixExpansions should be bijective
However, there is no guarantee that a context is consistent with other contexts.
"""

name: CONTEXT
"""A unique stable handle for the context."""

description: Optional[str] = None
"""A human readable concise description of the context."""

prefix_expansions: List[PrefixExpansion] = field(default_factory=lambda: [])
"""All prefix expansions within that context. Corresponds to http://www.w3.org/ns/shacl#prefixes"""

comments: List[str] = None
"""Optional comments on the context."""

location: Optional[str] = None
format: Optional[str] = None
merged_from: Optional[List[str]] = None
@@ -80,9 +137,10 @@ class Context:

def combine(self, context: "Context"):
"""
Merge a context into this one
Merge a context into this one.
The current context stays primary
If there are conflicts, the current context takes precedence,
and the merged expansions are marked as non-canonical
:param context:
:return:
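
# Small, illustrative sketch of the merge semantics described in this docstring;
# names and namespaces are made up, and the assertion reflects the documented
# behaviour rather than a guaranteed API contract.
from prefixmaps.datamodel.context import Context, StatusType

primary = Context(name="primary")
primary.add_prefix("GO", "http://purl.obolibrary.org/obo/GO_", status=StatusType.canonical)

secondary = Context(name="secondary")
secondary.add_prefix("GO", "http://example.org/alt/GO_", status=StatusType.canonical)

# The current context takes precedence; the conflicting expansion from
# `secondary` is expected to be recorded as non-canonical.
primary.combine(secondary)
assert primary.as_dict()["GO"] == "http://purl.obolibrary.org/obo/GO_"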
@@ -98,18 +156,18 @@ def add_prefix(
preferred: bool = False,
):
"""
Adds a prefix expansion to this context
Adds a prefix expansion to this context.
The current context stays canonical. Additional prefixes
added may be classified as aliases
added may be classified as non-canonical.
If upper or lower is set for this context, the the
If upper or lower is set for this context, the
prefix will be auto-case normalized,
UNLESS preferred=True
:param prefix:
:param namespace:
:param status:
:param prefix: prefix to be added
:param namespace: namespace to be added
:param status: the status of the prefix being added
:param preferred:
:return:
"""
@@ -143,7 +201,7 @@ def add_prefix(

def filter(self, prefix: PREFIX = None, namespace: NAMESPACE = None):
"""
Returns namespaces matching query
Returns namespaces matching query.
:param prefix:
:param namespace:
@@ -160,8 +218,9 @@ def filter(self, prefix: PREFIX = None, namespace: NAMESPACE = None):

def prefixes(self, lower=False) -> List[str]:
"""
All unique prefixes in all prefix expansions
All unique prefixes in all prefix expansions.
:param lower: if True, the prefix is normalized to lowercase.
:return:
"""
if lower:
@@ -173,6 +232,7 @@ def namespaces(self, lower=False) -> List[str]:
"""
All unique namespaces in all prefix expansions
:param lower: if True, the namespace is normalized to lowercase.
:return:
"""
if lower:
@@ -182,21 +242,30 @@

def as_dict(self) -> PREFIX_EXPANSION_DICT:
"""
Returns a mapping between canonical prefixes and expansions
Returns a mapping between canonical prefixes and expansions.
:return:
This only includes canonical expansions. The results can be safely used
in the header of RDF syntax documents.
:return: Mappings between prefixes and namespaces
"""
return {pe.prefix: pe.namespace for pe in self.prefix_expansions if pe.canonical()}

def as_inverted_dict(self) -> INVERSE_PREFIX_EXPANSION_DICT:
"""
Returns a mapping between canonical expansions and prefixes
Returns a mapping between canonical expansions and prefixes.
:return:
:return: Mapping between namespaces and prefixes
"""
return {pe.namespace: pe.prefix for pe in self.prefix_expansions if pe.canonical()}
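
# For illustration: how the two dictionary views above might be used —
# as_dict() to emit Turtle @prefix declarations, and as_inverted_dict() to
# contract a URI to a CURIE. The prefix and URI values are invented.
ctx = Context(name="demo")
ctx.add_prefix("GO", "http://purl.obolibrary.org/obo/GO_", status=StatusType.canonical)

# Canonical mappings only, safe for an RDF document header.
for prefix, namespace in ctx.as_dict().items():
    print(f"@prefix {prefix}: <{namespace}> .")

# Reverse lookup: contract a URI using the inverted map.
uri = "http://purl.obolibrary.org/obo/GO_0008150"
for namespace, prefix in ctx.as_inverted_dict().items():
    if uri.startswith(namespace):
        print(f"{prefix}:{uri[len(namespace):]}")  # GO:0008150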

def validate(self, canonical_only=True) -> List[str]:
"""
Validates each prefix expansion in the context.
:param canonical_only:
:return:
"""
messages = []
for pe in self.prefix_expansions:
if canonical_only and not pe.canonical():
22 changes: 20 additions & 2 deletions src/prefixmaps/ingest/etl_runner.py
@@ -1,3 +1,4 @@
"""ETL logic for retrieving and normalizing upstream contexts."""
from pathlib import Path
from typing import Callable, Dict, Mapping, Union

@@ -24,17 +25,25 @@
"bioregistry": from_bioregistry,
"prefixcc": from_prefixcc,
}
"""Maps the name of a context to the python function that can generate it"""

COMBINED = {
"merged": ["obo", "go", "linked_data", "bioregistry.upper", "prefixcc"],
"merged.oak": ["obo", "go", "linked_data", "bioregistry.upper", "prefixcc"],
}
"""Contexts that remix other contexts. Order is significant, with the first listed having highest precedence."""


def load_context_from_source(context: CONTEXT) -> Context:
"""
Loads a context from upstream source
Loads a context from upstream source.
The context name should be a handle for either:
:param context:
- An atomic context (e.g. obo, linked_data)
- A combined context (which remixes existing contexts)
:param context: unique handle of the context
:return:
"""
if context in CONTEXTS:
@@ -49,6 +58,15 @@ def load_context_from_source(context: CONTEXT) -> Context:


def run_etl(output_directory: Union[str, Path]) -> None:
"""
Runs the complete ETL pipeline.
All contexts are refreshed from upstream sources, and written to the output directory,
as CSV.
:param output_directory:
:return:
"""
# contexts = load_contexts_meta()
output_directory = Path(output_directory).resolve()
output_directory.mkdir(exist_ok=True, parents=True)
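
# Illustrative sketch of the ETL entry points in this module; the context names
# are taken from CONTEXTS/COMBINED above, the output path is arbitrary, and
# network access to the upstream registries is required.
from prefixmaps.ingest.etl_runner import load_context_from_source, run_etl

obo_ctx = load_context_from_source("obo")        # an atomic context
merged_ctx = load_context_from_source("merged")  # a combined context, remixed per COMBINED
run_etl("output")                                # refresh all contexts and write them under ./output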
14 changes: 0 additions & 14 deletions src/prefixmaps/ingest/ingest.py

This file was deleted.

27 changes: 24 additions & 3 deletions src/prefixmaps/ingest/ingest_bioregistry.py
@@ -1,3 +1,4 @@
"""ETL from bioregistry to prefixmaps."""
import logging
import re

@@ -12,21 +13,39 @@
"ols",
"n2t",
]
"""Priority order for bioregistry."""


def from_bioregistry_upper(**kwargs) -> Context:
"""
As :ref:`from_bioregistry` with default uppercase normalization on
As :ref:`from_bioregistry`, with default uppercase normalization on
:param kwargs:
:param kwargs: pass-through to :ref:`from_bioregistry`
:return:
"""
return from_bioregistry(upper=True, **kwargs)


def from_bioregistry(upper=False, canonical_idorg=True, filter_dubious=True) -> Context:
"""
Creates a Context from the bioregistry
Creates a Context from the bioregistry.
This will transform bioregistry entries into semantic prefix expansions.
Note: in future some of the logic from this can migrate up to the main
bioregistries repository. For now, we deal with additional corner cases:
URLs that look like they are not intended to be used as semantic URIs are
filtered by default. This can be disabled with ``filter_dubious=False``.
This method also has special handling for the identifiers.org registry
(aka "miriam"). This is because a number of triplestores have historically
used URIs of the form "http://identifiers.org/Prefix/LocalId" as the
subject of their triples. While this is bad practice for "born semantic"
IDs such as those in OBO, a lot of the bio-semantic web community have
adopted this practice to provide semantic URIs for non-born-semantic databases.
In order to support this use case, we have an option to preserve these
original namespaces. This can be disabled with ``canonical_idorg=False``.
:param upper: if True, normalize prefix to uppercase
unless a preferred form is stated
@@ -38,6 +57,8 @@ def from_bioregistry(upper=False, canonical_idorg=True, filter_dubious=True) ->
from bioregistry import get_prefix_map

ctxt = Context("bioregistry", upper=upper)
# We always set use_preferred=True, which ensures that OBO prefixes
# are either capitalized (e.g. GO) or use the preferred form (e.g. FBbt)
prefix_map = get_prefix_map(priority=priority, use_preferred=True)
pm_non_preferred = get_prefix_map(priority=priority, use_preferred=False)
pm_miriam = get_prefix_map(priority=["miriam"])
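
# Sketch of how this ingest might be invoked (requires the bioregistry package
# to be installed; the flags are as documented above).
from prefixmaps.ingest.ingest_bioregistry import from_bioregistry, from_bioregistry_upper

ctx = from_bioregistry()              # defaults: canonical_idorg=True, filter_dubious=True
ctx_upper = from_bioregistry_upper()  # same, with uppercase normalization of prefixes
print(len(ctx.prefix_expansions), "prefix expansions ingested")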
7 changes: 6 additions & 1 deletion src/prefixmaps/ingest/ingest_go.py
@@ -1,3 +1,4 @@
"""Ingests the GO prefix registry."""
from typing import TextIO, Union

import requests
@@ -15,7 +16,11 @@ def parse_go_xrefs_from_remote() -> Context:

def parse_go_xrefs(input: Union[str, TextIO]) -> Context:
"""
Parse GO db-xrefs.yaml file
Parse GO db-xrefs.yaml file.
Note that most entries in the file are ignored. We only extract the
"embedded JSON-LD context", i.e. the entries marked rdf_uri_prefix,
which indicate the *semantic* expansions used in the triplestore.
:param input:
:return:
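
# Brief, illustrative sketch of both entry points in this module; the local
# file path is hypothetical.
from prefixmaps.ingest.ingest_go import parse_go_xrefs, parse_go_xrefs_from_remote

ctx = parse_go_xrefs_from_remote()
# or, from a local copy of db-xrefs.yaml:
# with open("db-xrefs.yaml") as stream:
#     ctx = parse_go_xrefs(stream)
print(ctx.as_dict().get("GO"))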