diff --git a/.env-example b/.env-example new file mode 100644 index 0000000..70f91e8 --- /dev/null +++ b/.env-example @@ -0,0 +1 @@ +STAC_CHECK_CONFIG="stac-check.config.yml" \ No newline at end of file diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml deleted file mode 100644 index 0cf627d..0000000 --- a/.github/workflows/publish-to-pypi.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Publish Python distributions to PyPI - -on: - push: - branches: - - main - -jobs: - build-n-publish: - name: Build and publish Python distributions to PyPI and TestPyPI - runs-on: ubuntu-18.04 - - steps: - - uses: actions/checkout@master - - name: Set up Python 3.8 - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - - name: Install pypa/build - run: >- - python -m - pip install - build - --user - - name: Build a binary wheel and a source tarball - run: >- - python -m - build - --sdist - --wheel - --outdir dist/ - - name: Publish distribution to PyPI - # if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@v1.4.2 - with: - password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/publish-to-test-pypi.yml b/.github/workflows/publish-to-test-pypi.yml deleted file mode 100644 index 20c0cea..0000000 --- a/.github/workflows/publish-to-test-pypi.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: Publish Python distributions to PyPI - -on: - push: - branches: - - dev - -jobs: - build-n-publish: - name: Build and publish Python distributions to PyPI and TestPyPI - runs-on: ubuntu-18.04 - - steps: - - uses: actions/checkout@master - - name: Set up Python 3.8 - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - - name: Install pypa/build - run: >- - python -m - pip install - build - --user - - name: Build a binary wheel and a source tarball - run: >- - python -m - build - --sdist - --wheel - --outdir dist/ - - name: Publish distribution to Test PyPI - # if: startsWith(github.ref, 
'refs/tags') - uses: pypa/gh-action-pypi-publish@v1.4.2 - with: - password: ${{ secrets.TEST_PYPI_API_TOKEN }} - repository_url: https://test.pypi.org/legacy/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 97137af..54f5c6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. The format is (loosely) based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). +## [Unreleased] - +### Added +- Option to include a configuration file to ignore selected checks +### Changed +- Change name from stac_check to stac-check in setup for cli + ## [v1.1.3] - 2022-03-03 - Fix thumbnail size check diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..d75e319 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include stac_check/stac-check.config.yml \ No newline at end of file diff --git a/README.md b/README.md index 7a1cc0d..40ca1e0 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ or for local development --- ### Usage ``` -Usage: stac_check [OPTIONS] FILE +Usage: stac-check [OPTIONS] FILE Options: --version Show the version and exit. @@ -33,7 +33,7 @@ $ make shell --- ### Examples -``` stac_check https://raw.githubusercontent.com/stac-utils/pystac/main/tests/data-files/examples/0.9.0/collection-spec/examples/landsat-collection.json --recursive ``` +``` stac-check https://raw.githubusercontent.com/stac-utils/pystac/main/tests/data-files/examples/0.9.0/collection-spec/examples/landsat-collection.json --recursive ``` ``` ____ ____ __ ___ ___ _ _ ____ ___ __ _ / ___)(_ _)/ _\ / __)___ / __)/ )( \( __)/ __)( / ) @@ -68,7 +68,7 @@ Recursive validation error message: This object has 4 links ``` -``` stac_check sample_files/0.9.0/landsat8-sample.json``` +``` stac-check sample_files/0.9.0/landsat8-sample.json```
stac-check: STAC spec validaton and linting tool
 
@@ -96,7 +96,7 @@ STAC Best Practices:
 This object has 4 links
 
-``` stac_check sample_files/1.0.0/core-item.json --assets``` +``` stac-check sample_files/1.0.0/core-item.json --assets```
 stac-check: STAC spec validaton and linting tool
 
@@ -131,7 +131,7 @@ This object has 4 links
 
 
    
-``` stac_check sample_files/1.0.0/core-item-bad-links.json --links --assets```    
+``` stac-check sample_files/1.0.0/core-item-bad-links.json --links --assets```    
 
 stac-check: STAC spec validaton and linting tool
 
@@ -173,7 +173,7 @@ LINK request errors:
 This object has 4 links
 
-``` stac_check sample_files/0.9.0/bad-item.json``` +``` stac-check sample_files/0.9.0/bad-item.json```
 stac-check: STAC spec validaton and linting tool
 
diff --git a/setup.py b/setup.py
index 9c6c629..c3fb239 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
     name="stac_check",
     version=__version__,
     description="Linting and validation tool for STAC assets",
-    url="https://github.com/jonhealy1/stac-check",
+    url="https://github.com/stac-utils/stac-check",
     packages=find_packages(exclude=("tests",)),
     include_package_data=True,
     install_requires=[
@@ -20,10 +20,12 @@
         "requests>=2.19.1",
         "jsonschema>=3.1.2b0",
         "pytest",
-        "stac-validator>=2.4.2"
+        "stac-validator>=2.4.2",
+        "PyYAML",
+        "python-dotenv",
     ],
     entry_points={
-        'console_scripts': ['stac_check=stac_check.cli:main']
+        'console_scripts': ['stac-check=stac_check.cli:main']
     },
     author="Jonathan Healy",
     author_email="jonathan.d.healy@gmail.com",
diff --git a/stac_check/lint.py b/stac_check/lint.py
index 257020a..1d2fd4f 100644
--- a/stac_check/lint.py
+++ b/stac_check/lint.py
@@ -1,14 +1,21 @@
+import pkg_resources
 from stac_validator.validate import StacValidate
 from stac_validator.utilities import is_valid_url
 import json
+import yaml
 import os
 from dataclasses import dataclass
 import pystac
 import requests
+from typing import Optional
+from dotenv import load_dotenv
+
+load_dotenv()
 
 @dataclass
 class Linter:
     item: str
+    config_file: Optional[str] = None
     assets: bool = False
     links: bool = False
     recursive: bool = False
@@ -16,6 +23,7 @@ class Linter:
     def __post_init__(self):
         self.data = self.load_data(self.item)
         self.message = self.validate_file(self.item)
+        self.config = self.parse_config(self.config_file)
         self.asset_type = self.message["asset_type"] if "asset_type" in self.message else ""
         self.version = self.message["version"] if "version" in self.message else ""
         self.validator_version = "2.3.0"
@@ -33,6 +41,22 @@ def __post_init__(self):
         self.file_name = os.path.basename(self.item).split('.')[0]
         self.best_practices_msg = self.create_best_practices_msg()
 
+    @staticmethod
+    def parse_config(config_file):
+        default_config_file = os.getenv("STAC_CHECK_CONFIG")
+        if default_config_file:
+            with open(default_config_file) as f:
+                default_config = yaml.load(f, Loader=yaml.FullLoader)
+        else:
+            with pkg_resources.resource_stream(__name__, "stac-check.config.yml") as f:
+                default_config = yaml.load(f, Loader=yaml.FullLoader)
+        if config_file:
+            with open(config_file) as f:
+                config = yaml.load(f, Loader=yaml.FullLoader)
+            default_config.update(config)
+            
+        return default_config
+
     def load_data(self, file):
         if is_valid_url(file):
             resp = requests.get(file)
@@ -90,13 +114,13 @@ def check_summaries(self):
         if self.asset_type == "COLLECTION":
             return "summaries" in self.data
 
-    def check_bloated_links(self):
+    def check_bloated_links(self, max_links: Optional[int] = 20):
         if "links" in self.data:
-            return len(self.data["links"]) > 20
+            return len(self.data["links"]) > max_links
 
-    def check_bloated_metadata(self):
+    def check_bloated_metadata(self, max_properties: Optional[int] = 20):
         if "properties" in self.data:
-            return len(self.data["properties"].keys()) > 20
+            return len(self.data["properties"].keys()) > max_properties
 
     def check_datetime_null(self):
         if "properties" in self.data:
@@ -169,72 +193,75 @@ def check_catalog_id_file_name(self):
 
     def create_best_practices_dict(self):
         best_practices_dict = {}
+        config = self.config["linting"]
+        max_links = self.config["settings"]["max_links"]
+        max_properties = self.config["settings"]["max_properties"]
 
         # best practices - item ids should only contain searchable identifiers
-        if self.check_searchable_identifiers() == False: 
+        if self.check_searchable_identifiers() == False and config["searchable_identifiers"] == True: 
             msg_1 = f"Item name '{self.object_id}' should only contain Searchable identifiers"
             msg_2 = f"Identifiers should consist of only lowercase characters, numbers, '_', and '-'"
             best_practices_dict["searchable_identifiers"] = [msg_1, msg_2]
 
         # best practices - item ids should not contain ':' or '/' characters
-        if self.check_percent_encoded():
+        if self.check_percent_encoded() and config["percent_encoded"] == True:
             msg_1 = f"Item name '{self.object_id}' should not contain ':' or '/'"
             msg_2 = f"https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#item-ids"
             best_practices_dict["percent_encoded"] = [msg_1, msg_2]
 
         # best practices - item ids should match file names
-        if not self.check_item_id_file_name():
+        if not self.check_item_id_file_name() and config["item_id_file_name"] == True:
             msg_1 = f"Item file names should match their ids: '{self.file_name}' not equal to '{self.object_id}"
             best_practices_dict["check_item_id"] = [msg_1]
 
         # best practices - collection and catalog file names should be collection.json and catalog.json 
-        if not self.check_catalog_id_file_name():
+        if not self.check_catalog_id_file_name() and config["catalog_id_file_name"] == True: 
             msg_1 = f"Object should be called '{self.asset_type.lower()}.json' not '{self.file_name}.json'"
             best_practices_dict["check_catalog_id"] = [msg_1]
 
         # best practices - collections should contain summaries
-        if self.check_summaries() == False:
+        if self.check_summaries() == False and config["check_summaries"] == True:
             msg_1 = f"A STAC collection should contain a summaries field"
             msg_2 = f"It is recommended to store information like eo:bands in summaries"
             best_practices_dict["check_summaries"] = [msg_1, msg_2]
 
-        # best practices - datetime files should not be set to null
-        if self.check_datetime_null():
+        # best practices - datetime fields should not be set to null
+        if self.check_datetime_null() and config["null_datetime"] == True:
             msg_1 = f"Please avoid setting the datetime field to null, many clients search on this field"
             best_practices_dict["datetime_null"] = [msg_1]
 
         # best practices - check unlocated items to make sure bbox field is not set
-        if self.check_unlocated():
+        if self.check_unlocated() and config["check_unlocated"] == True:
             msg_1 = f"Unlocated item. Please avoid setting the bbox field when geometry is set to null"
             best_practices_dict["check_unlocated"] = [msg_1]
 
         # best practices - recommend items have a geometry
-        if self.check_geometry_null():
+        if self.check_geometry_null() and config["check_geometry"] == True:
             msg_1 = f"All items should have a geometry field. STAC is not meant for non-spatial data"
             best_practices_dict["null_geometry"] = [msg_1]
 
         # check to see if there are too many links
-        if self.check_bloated_links():
+        if self.check_bloated_links(max_links=max_links) and config["bloated_links"] == True:
             msg_1 = f"You have {len(self.data['links'])} links. Please consider using sub-collections or sub-catalogs"
             best_practices_dict["bloated_links"] = [msg_1]
 
         # best practices - check for bloated metadata in properties
-        if self.check_bloated_metadata():
+        if self.check_bloated_metadata(max_properties=max_properties) and config["bloated_metadata"] == True:
             msg_1 = f"You have {len(self.data['properties'])} properties. Please consider using links to avoid bloated metadata"
             best_practices_dict["bloated_metadata"] = [msg_1]
 
         # best practices - ensure thumbnail is a small file size ["png", "jpeg", "jpg", "webp"]
-        if not self.check_thumbnail() and self.asset_type == "ITEM":
+        if not self.check_thumbnail() and self.asset_type == "ITEM" and config["check_thumbnail"] == True:
             msg_1 = f"A thumbnail should have a small file size ie. png, jpeg, jpg, webp"
             best_practices_dict["check_thumbnail"] = [msg_1]
 
         # best practices - ensure that links in catalogs and collections include a title field
-        if not self.check_links_title_field():
+        if not self.check_links_title_field() and config["links_title"] == True:
             msg_1 = f"Links in catalogs and collections should always have a 'title' field"
             best_practices_dict["check_links_title"] = [msg_1]
 
         # best practices - ensure that links in catalogs and collections include self link
-        if not self.check_links_self():
+        if not self.check_links_self() and config["links_self"] == True:
             msg_1 = f"A link to 'self' in links is strongly recommended"
             best_practices_dict["check_links_self"] = [msg_1]
 
diff --git a/stac_check/stac-check.config.yml b/stac_check/stac-check.config.yml
new file mode 100644
index 0000000..bccdfd9
--- /dev/null
+++ b/stac_check/stac-check.config.yml
@@ -0,0 +1,33 @@
+linting:
+  # Identifiers should consist of only lowercase characters, numbers, '_', and '-'
+  searchable_identifiers: true
+  # Item ids should not contain ':' or '/' characters
+  percent_encoded: true
+  # Item file names should match their ids
+  item_id_file_name: true
+  # Collections and catalogs should be named collection.json and catalog.json
+  catalog_id_file_name: true
+  # A STAC collection should contain a summaries field
+  check_summaries: true
+  # Datetime fields should not be set to null
+  null_datetime: true
+  # best practices - check unlocated items to make sure bbox field is not set
+  check_unlocated: true
+  # best practices - recommend items have a geometry
+  check_geometry: true
+  # check to see if there are too many links
+  bloated_links: true
+  # best practices - check for bloated metadata in properties
+  bloated_metadata: true
+  # best practices - ensure thumbnail is a small file size ["png", "jpeg", "jpg", "webp"]
+  check_thumbnail: true
+  # best practices - ensure that links in catalogs and collections include a title field
+  links_title: true
+  # best practices - ensure that links in catalogs and collections include self link
+  links_self: true
+
+settings:
+  # the bloated links warning is shown when an object has more than this many links
+  max_links: 20
+  # the bloated metadata warning is shown when an object has more than this many properties
+  max_properties: 20
\ No newline at end of file
diff --git a/tests/test.config.yml b/tests/test.config.yml
new file mode 100644
index 0000000..031ac62
--- /dev/null
+++ b/tests/test.config.yml
@@ -0,0 +1,33 @@
+linting:
+  # Identifiers should consist of only lowercase characters, numbers, '_', and '-'
+  searchable_identifiers: false
+  # Item ids should not contain ':' or '/' characters
+  percent_encoded: true
+  # Item file names should match their ids
+  item_id_file_name: true
+  # Collections and catalogs should be named collection.json and catalog.json
+  catalog_id_file_name: true
+  # A STAC collection should contain a summaries field
+  check_summaries: true
+  # Datetime fields should not be set to null
+  null_datetime: true
+  # best practices - check unlocated items to make sure bbox field is not set
+  check_unlocated: true
+  # best practices - recommend items have a geometry
+  check_geometry: true
+  # check to see if there are too many links
+  bloated_links: true
+  # best practices - check for bloated metadata in properties
+  bloated_metadata: true
+  # best practices - ensure thumbnail is a small file size ["png", "jpeg", "jpg", "webp"]
+  check_thumbnail: true
+  # best practices - ensure that links in catalogs and collections include a title field
+  links_title: true
+  # best practices - ensure that links in catalogs and collections include self link
+  links_self: true
+
+settings:
+  # number of links before the bloated links warning is shown
+  max_links: 200
+  # number of properties before the bloated metadata warning is shown
+  max_properties: 20
\ No newline at end of file
diff --git a/tests/test_config.py b/tests/test_config.py
new file mode 100644
index 0000000..ebbbfc4
--- /dev/null
+++ b/tests/test_config.py
@@ -0,0 +1,32 @@
+from stac_check.lint import Linter
+
+def test_linter_config_file():
+    file = "sample_files/1.0.0/core-item.json"
+    linter = Linter(file)
+
+    # Use default config
+    assert linter.config["linting"]["searchable_identifiers"] == True
+    assert linter.create_best_practices_dict()["searchable_identifiers"] == [
+        f"Item name '{linter.object_id}' should only contain Searchable identifiers",
+        "Identifiers should consist of only lowercase characters, numbers, '_', and '-'"
+    ]
+
+    # Load config file
+    linter = Linter(file, config_file="tests/test.config.yml")
+
+    assert linter.config["linting"]["searchable_identifiers"] == False
+    assert "searchable_identifiers" not in linter.create_best_practices_dict()
+
+def test_linter_max_links():
+    file = "sample_files/1.0.0/core-item-bloated.json"
+    linter = Linter(file)
+
+    assert linter.check_bloated_links() == True
+    assert len(linter.data["links"]) > 20
+
+    # Load config file
+    linter = Linter(file, config_file="tests/test.config.yml")
+    assert "bloated_links" not in linter.create_best_practices_dict()
+
+
+    
\ No newline at end of file