Merge pull request #24 from linkml/inline-docs

Additional inline docs and tests

cmungall authored Oct 26, 2022
2 parents 24e53cc + 969e216 commit b53947b
Showing 11 changed files with 237 additions and 51 deletions.
2 changes: 1 addition & 1 deletion src/prefixmaps/data/linked_data.curated.yaml
@@ -11,7 +11,7 @@ prefixes:
schema: http://schema.org/
bibo: http://purl.org/ontology/bibo/
swrl: http://www.w3.org/2003/11/swrl#
sh: https://w3id.org/shacl/
sh: http://www.w3.org/ns/shacl#
owl: http://www.w3.org/2002/07/owl#
qb: http://purl.org/linked-data/cube#
prov: http://www.w3.org/ns/prov#
107 changes: 88 additions & 19 deletions src/prefixmaps/datamodel/context.py
@@ -1,3 +1,5 @@
"""Classes for managing individual Contexts."""

import re
from dataclasses import dataclass, field
from enum import Enum
@@ -22,35 +24,76 @@

class StatusType(Enum):
"""
Classification of prefix expansions
Classification of prefix expansions.
Note that only canonical mappings are exposed to the users of the library. However,
it can be useful for prefixmap ETL pipelines to include non-canonical mappings for
purposes of debugging.
"""

canonical = "canonical"
"""The canonical prefix expansion for a prefix. The set of all canonical mappings must be bijective."""

prefix_alias = "prefix_alias"
"""The prefix is an alias for an existing canonical prefix."""

namespace_alias = "namespace_alias"
"""The prefix is an alias for an existing canonical namespace."""

multi_alias = "multi_alias"
"""Both the prefix and the namespace are aliases for existing canonical namespaces."""


@dataclass
class PrefixExpansion:
"""
An individual mapping between a prefix and a namespace
An individual mapping between a prefix and a namespace.
A PrefixExpansion corresponds to a SHACL PrefixDeclaration (https://www.w3.org/TR/shacl/#dfn-prefix-declarations)
"""

context: CONTEXT
"""Each PrefixExpansion is grouped into a context."""

prefix: PREFIX
"""Corresponds to http://www.w3.org/ns/shacl#prefix"""

namespace: NAMESPACE
"""Corresponds to http://www.w3.org/ns/shacl#namespace"""

status: StatusType
"""Indicates whether the expansion is canonical, a prefix alias, a namespace alias, or both."""

def canonical(self) -> bool:
"""
True if this is the canonical expansions
True if this is the canonical mapping in both directions.
:return:
Note that canonicality is always relative to a context:
- ("GEO", "http://purl.obolibrary.org/obo/geo/") is canonical in the OBO Foundry context
:return: True if the status is canonical
"""
return self.status == StatusType.canonical
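
# Illustrative sketch only (not part of the diff): exercising the StatusType and
# PrefixExpansion classes documented above, assuming this file maps to the
# module prefixmaps.datamodel.context.
from prefixmaps.datamodel.context import PrefixExpansion, StatusType

# A canonical expansion and a prefix alias for the same namespace.
pe = PrefixExpansion(
    context="obo",
    prefix="GO",
    namespace="http://purl.obolibrary.org/obo/GO_",
    status=StatusType.canonical,
)
alias = PrefixExpansion(
    context="obo",
    prefix="go",
    namespace="http://purl.obolibrary.org/obo/GO_",
    status=StatusType.prefix_alias,
)
assert pe.canonical() and not alias.canonical()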

def validate(self) -> List[str]:
"""
Validate the prefix expansion.
- Ensures that prefixes conform to W3C CURIE syntax
- Ensures that namespaces conform to a restricted subset of W3C URI syntax
Note that we use a highly restricted syntax in order to filter out pseudo-semantic
URIs. These include URLs for websites intended for humans that have http parameters
with `?`s, `=`s, etc.
These URLs are almost NEVER intended to be used as semantic URIs, i.e. as subjects of
RDF triples. It is almost always bad practice to use them as such.
In future, if we discover exceptions to this rule, we will add them here.
:return: list of validation errors
"""
messages = []
if not PREFIX_RE.match(self.prefix):
messages.append(f"prefix {self.prefix} does not match {PREFIX_RE}")
@@ -65,13 +108,27 @@ def validate(self) -> List[str]:
@dataclass
class Context:
"""
A context is a localized collection of prefix expansions
A context is a localized collection of prefix expansions.
A context should be internally consistent:
- the set of canonical PrefixExpansions should be bijective
However, there is no guarantee that a context is consistent with other contexts.
"""

name: CONTEXT
"""A unique stable handle for the context."""

description: Optional[str] = None
"""A human readable concise description of the context."""

prefix_expansions: List[PrefixExpansion] = field(default_factory=lambda: [])
"""All prefix expansions within that context. Corresponds to http://www.w3.org/ns/shacl#prefixes"""

comments: List[str] = None
"""Optional comments on the context."""

location: Optional[str] = None
format: Optional[str] = None
merged_from: Optional[List[str]] = None
@@ -80,9 +137,10 @@ class Context:

def combine(self, context: "Context"):
"""
Merge a context into this one
Merge a context into this one.
The current context stays primary
If there are conflicts, the current context takes precedence,
and the merged expansions are marked as non-canonical
:param context:
:return:
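
# Small, illustrative sketch of the merge semantics described in this docstring;
# names and namespaces are made up, and the assertion reflects the documented
# behaviour rather than a guaranteed API contract.
from prefixmaps.datamodel.context import Context, StatusType

primary = Context(name="primary")
primary.add_prefix("GO", "http://purl.obolibrary.org/obo/GO_", status=StatusType.canonical)

secondary = Context(name="secondary")
secondary.add_prefix("GO", "http://example.org/alt/GO_", status=StatusType.canonical)

# The current context takes precedence; the conflicting expansion from
# `secondary` is expected to be recorded as non-canonical.
primary.combine(secondary)
assert primary.as_dict()["GO"] == "http://purl.obolibrary.org/obo/GO_"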
@@ -98,18 +156,18 @@ def add_prefix(
preferred: bool = False,
):
"""
Adds a prefix expansion to this context
Adds a prefix expansion to this context.
The current context stays canonical. Additional prefixes
added may be classified as aliases
added may be classified as non-canonical.
If upper or lower is set for this context, the the
If upper or lower is set for this context, the
prefix will be auto-case normalized,
UNLESS preferred=True
:param prefix:
:param namespace:
:param status:
:param prefix: prefix to be added
:param namespace: namespace to be added
:param status: the status of the prefix being added
:param preferred:
:return:
"""
@@ -143,7 +201,7 @@ def add_prefix(

def filter(self, prefix: PREFIX = None, namespace: NAMESPACE = None):
"""
Returns namespaces matching query
Returns namespaces matching query.
:param prefix:
:param namespace:
@@ -160,8 +218,9 @@ def filter(self, prefix: PREFIX = None, namespace: NAMESPACE = None):

def prefixes(self, lower=False) -> List[str]:
"""
All unique prefixes in all prefix expansions
All unique prefixes in all prefix expansions.
:param lower: if True, the prefix is normalized to lowercase.
:return:
"""
if lower:
@@ -173,6 +232,7 @@ def namespaces(self, lower=False) -> List[str]:
"""
All unique namespaces in all prefix expansions
:param lower: if True, the namespace is normalized to lowercase.
:return:
"""
if lower:
@@ -182,21 +242,30 @@

def as_dict(self) -> PREFIX_EXPANSION_DICT:
"""
Returns a mapping between canonical prefixes and expansions
Returns a mapping between canonical prefixes and expansions.
:return:
This only includes canonical expansions. The results can be safely used
in the header of RDF syntax documents.
:return: Mappings between prefixes and namespaces
"""
return {pe.prefix: pe.namespace for pe in self.prefix_expansions if pe.canonical()}

def as_inverted_dict(self) -> INVERSE_PREFIX_EXPANSION_DICT:
"""
Returns a mapping between canonical expansions and prefixes
Returns a mapping between canonical expansions and prefixes.
:return:
:return: Mapping between namespaces and prefixes
"""
return {pe.namespace: pe.prefix for pe in self.prefix_expansions if pe.canonical()}
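
# For illustration: how the two dictionary views above might be used —
# as_dict() to emit Turtle @prefix declarations, and as_inverted_dict() to
# contract a URI to a CURIE. The prefix and URI values are invented.
ctx = Context(name="demo")
ctx.add_prefix("GO", "http://purl.obolibrary.org/obo/GO_", status=StatusType.canonical)

# Canonical mappings only, safe for an RDF document header.
for prefix, namespace in ctx.as_dict().items():
    print(f"@prefix {prefix}: <{namespace}> .")

# Reverse lookup: contract a URI using the inverted map.
uri = "http://purl.obolibrary.org/obo/GO_0008150"
for namespace, prefix in ctx.as_inverted_dict().items():
    if uri.startswith(namespace):
        print(f"{prefix}:{uri[len(namespace):]}")  # GO:0008150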

def validate(self, canonical_only=True) -> List[str]:
"""
Validates each prefix expansion in the context.
:param canonical_only:
:return:
"""
messages = []
for pe in self.prefix_expansions:
if canonical_only and not pe.canonical():
22 changes: 20 additions & 2 deletions src/prefixmaps/ingest/etl_runner.py
@@ -1,3 +1,4 @@
"""ETL logic for retrieving and normalizing upstream contexts."""
from pathlib import Path
from typing import Callable, Dict, Mapping, Union

@@ -24,17 +25,25 @@
"bioregistry": from_bioregistry,
"prefixcc": from_prefixcc,
}
"""Maps the name of a context to the python function that can generate it"""

COMBINED = {
"merged": ["obo", "go", "linked_data", "bioregistry.upper", "prefixcc"],
"merged.oak": ["obo", "go", "linked_data", "bioregistry.upper", "prefixcc"],
}
"""Contexts that remix other contexts. Order is significant, with the first listed having highest precedence."""


def load_context_from_source(context: CONTEXT) -> Context:
"""
Loads a context from upstream source
Loads a context from upstream source.
The context name should be a handle for either:
:param context:
- An atomic context (e.g. obo, linked_data)
- A combined context (which remixes existing contexts)
:param context: unique handle of the context
:return:
"""
if context in CONTEXTS:
@@ -49,6 +58,15 @@ def load_context_from_source(context: CONTEXT) -> Context:


def run_etl(output_directory: Union[str, Path]) -> None:
"""
Runs the complete ETL pipeline.
All contexts are refreshed from upstream sources, and written to the output directory,
as CSV.
:param output_directory:
:return:
"""
# contexts = load_contexts_meta()
output_directory = Path(output_directory).resolve()
output_directory.mkdir(exist_ok=True, parents=True)
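
# Illustrative sketch of the ETL entry points in this module; the context names
# are taken from CONTEXTS/COMBINED above, the output path is arbitrary, and
# network access to the upstream registries is required.
from prefixmaps.ingest.etl_runner import load_context_from_source, run_etl

obo_ctx = load_context_from_source("obo")        # an atomic context
merged_ctx = load_context_from_source("merged")  # a combined context, remixed per COMBINED
run_etl("output")                                # refresh all contexts and write them under ./output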
14 changes: 0 additions & 14 deletions src/prefixmaps/ingest/ingest.py

This file was deleted.

27 changes: 24 additions & 3 deletions src/prefixmaps/ingest/ingest_bioregistry.py
@@ -1,3 +1,4 @@
"""ETL from bioregistry to prefixmaps."""
import logging
import re

@@ -12,21 +13,39 @@
"ols",
"n2t",
]
"""Priority order for bioregistry."""


def from_bioregistry_upper(**kwargs) -> Context:
"""
As :ref:`from_bioregistry` with default uppercase normalization on
As :ref:`from_bioregistry`, with default uppercase normalization on
:param kwargs:
:param kwargs: pass-through to :ref:`from_bioregistry`
:return:
"""
return from_bioregistry(upper=True, **kwargs)


def from_bioregistry(upper=False, canonical_idorg=True, filter_dubious=True) -> Context:
"""
Creates a Context from the bioregistry
Creates a Context from the bioregistry.
This will transform bioregistry entries into semantic prefix expansions.
Note: in future some of the logic from this can migrate up to the main
bioregistries repository. For now, we deal with additional corner cases:
URLs that look like they are not intended to be used as semantic URIs are
filtered by default. This can be disabled with ``filter_dubious=False``.
This method also has special handling for the identifiers.org registry
(aka "miriam"). This is because a number of triplestores have historically
used URIs of the form "http://identifiers.org/Prefix/LocalId" as the
subject of their triples. While this is bad practice for "born semantic"
IDs such as those in OBO, a lot of the bio-semantic web community have
adopted this practice to provide semantic URIs for non-born-semantic databases.
In order to support this use case, we have an option to preserve these
original namespaces. This can be disabled with ``canonical_idorg=False``.
:param upper: if True, normalize prefix to uppercase
unless a preferred form is stated
@@ -38,6 +57,8 @@ def from_bioregistry(upper=False, canonical_idorg=True, filter_dubious=True) ->
from bioregistry import get_prefix_map

ctxt = Context("bioregistry", upper=upper)
# We always set use_preferred=True, which ensures that OBO prefixes
# are either capitalized (e.g. GO) or use the preferred form (e.g. FBbt)
prefix_map = get_prefix_map(priority=priority, use_preferred=True)
pm_non_preferred = get_prefix_map(priority=priority, use_preferred=False)
pm_miriam = get_prefix_map(priority=["miriam"])
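
# Sketch of how this ingest might be invoked (requires the bioregistry package
# to be installed; the flags are as documented above).
from prefixmaps.ingest.ingest_bioregistry import from_bioregistry, from_bioregistry_upper

ctx = from_bioregistry()              # defaults: canonical_idorg=True, filter_dubious=True
ctx_upper = from_bioregistry_upper()  # same, with uppercase normalization of prefixes
print(len(ctx.prefix_expansions), "prefix expansions ingested")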
7 changes: 6 additions & 1 deletion src/prefixmaps/ingest/ingest_go.py
@@ -1,3 +1,4 @@
"""Ingests the GO prefix registry."""
from typing import TextIO, Union

import requests
@@ -15,7 +16,11 @@ def parse_go_xrefs_from_remote() -> Context:

def parse_go_xrefs(input: Union[str, TextIO]) -> Context:
"""
Parse GO db-xrefs.yaml file
Parse GO db-xrefs.yaml file.
Note that most entries in the file are ignored. We only extract the
"embedded JSON-LD context", i.e. the entries marked rdf_uri_prefix,
which indicate the *semantic* expansions used in the triplestore.
:param input:
:return:
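
# Brief, illustrative sketch of both entry points in this module; the local
# file path is hypothetical.
from prefixmaps.ingest.ingest_go import parse_go_xrefs, parse_go_xrefs_from_remote

ctx = parse_go_xrefs_from_remote()
# or, from a local copy of db-xrefs.yaml:
# with open("db-xrefs.yaml") as stream:
#     ctx = parse_go_xrefs(stream)
print(ctx.as_dict().get("GO"))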