diff --git a/.env-example b/.env-example new file mode 100644 index 0000000..70f91e8 --- /dev/null +++ b/.env-example @@ -0,0 +1 @@ +STAC_CHECK_CONFIG="stac-check.config.yml" \ No newline at end of file diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml deleted file mode 100644 index 0cf627d..0000000 --- a/.github/workflows/publish-to-pypi.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Publish Python distributions to PyPI - -on: - push: - branches: - - main - -jobs: - build-n-publish: - name: Build and publish Python distributions to PyPI and TestPyPI - runs-on: ubuntu-18.04 - - steps: - - uses: actions/checkout@master - - name: Set up Python 3.8 - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - - name: Install pypa/build - run: >- - python -m - pip install - build - --user - - name: Build a binary wheel and a source tarball - run: >- - python -m - build - --sdist - --wheel - --outdir dist/ - - name: Publish distribution to PyPI - # if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@v1.4.2 - with: - password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/publish-to-test-pypi.yml b/.github/workflows/publish-to-test-pypi.yml deleted file mode 100644 index 20c0cea..0000000 --- a/.github/workflows/publish-to-test-pypi.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: Publish Python distributions to PyPI - -on: - push: - branches: - - dev - -jobs: - build-n-publish: - name: Build and publish Python distributions to PyPI and TestPyPI - runs-on: ubuntu-18.04 - - steps: - - uses: actions/checkout@master - - name: Set up Python 3.8 - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - - name: Install pypa/build - run: >- - python -m - pip install - build - --user - - name: Build a binary wheel and a source tarball - run: >- - python -m - build - --sdist - --wheel - --outdir dist/ - - name: Publish distribution to Test PyPI - # if: startsWith(github.ref, 
'refs/tags') - uses: pypa/gh-action-pypi-publish@v1.4.2 - with: - password: ${{ secrets.TEST_PYPI_API_TOKEN }} - repository_url: https://test.pypi.org/legacy/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 97137af..54f5c6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. The format is (loosely) based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). +## [Unreleased] - +### Added +- Option to include a configuration file to ignore selected checks +### Changed +- Change name from stac_check to stac-check in setup for cli + ## [v1.1.3] - 2022-03-03 - Fix thumbnail size check diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..d75e319 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include stac_check/stac-check.config.yml \ No newline at end of file diff --git a/README.md b/README.md index 7a1cc0d..40ca1e0 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ or for local development --- ### Usage ``` -Usage: stac_check [OPTIONS] FILE +Usage: stac-check [OPTIONS] FILE Options: --version Show the version and exit. @@ -33,7 +33,7 @@ $ make shell --- ### Examples -``` stac_check https://raw.githubusercontent.com/stac-utils/pystac/main/tests/data-files/examples/0.9.0/collection-spec/examples/landsat-collection.json --recursive ``` +``` stac-check https://raw.githubusercontent.com/stac-utils/pystac/main/tests/data-files/examples/0.9.0/collection-spec/examples/landsat-collection.json --recursive ``` ``` ____ ____ __ ___ ___ _ _ ____ ___ __ _ / ___)(_ _)/ _\ / __)___ / __)/ )( \( __)/ __)( / ) @@ -68,7 +68,7 @@ Recursive validation error message: This object has 4 links ``` -``` stac_check sample_files/0.9.0/landsat8-sample.json``` +``` stac-check sample_files/0.9.0/landsat8-sample.json```
stac-check: STAC spec validaton and linting tool @@ -96,7 +96,7 @@ STAC Best Practices: This object has 4 links-``` stac_check sample_files/1.0.0/core-item.json --assets``` +``` stac-check sample_files/1.0.0/core-item.json --assets```
stac-check: STAC spec validaton and linting tool @@ -131,7 +131,7 @@ This object has 4 links -``` stac_check sample_files/1.0.0/core-item-bad-links.json --links --assets``` +``` stac-check sample_files/1.0.0/core-item-bad-links.json --links --assets```stac-check: STAC spec validaton and linting tool @@ -173,7 +173,7 @@ LINK request errors: This object has 4 links-``` stac_check sample_files/0.9.0/bad-item.json``` +``` stac-check sample_files/0.9.0/bad-item.json```stac-check: STAC spec validaton and linting tool diff --git a/setup.py b/setup.py index 9c6c629..c3fb239 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ name="stac_check", version=__version__, description="Linting and validation tool for STAC assets", - url="https://github.com/jonhealy1/stac-check", + url="https://github.com/stac-utils/stac-check", packages=find_packages(exclude=("tests",)), include_package_data=True, install_requires=[ @@ -20,10 +20,12 @@ "requests>=2.19.1", "jsonschema>=3.1.2b0", "pytest", - "stac-validator>=2.4.2" + "stac-validator>=2.4.2", + "PyYAML", + "python-dotenv", ], entry_points={ - 'console_scripts': ['stac_check=stac_check.cli:main'] + 'console_scripts': ['stac-check=stac_check.cli:main'] }, author="Jonathan Healy", author_email="jonathan.d.healy@gmail.com", diff --git a/stac_check/lint.py b/stac_check/lint.py index 257020a..1d2fd4f 100644 --- a/stac_check/lint.py +++ b/stac_check/lint.py @@ -1,14 +1,21 @@ +import pkg_resources from stac_validator.validate import StacValidate from stac_validator.utilities import is_valid_url import json +import yaml import os from dataclasses import dataclass import pystac import requests +from typing import Optional +from dotenv import load_dotenv + +load_dotenv() @dataclass class Linter: item: str + config_file: Optional[str] = None assets: bool = False links: bool = False recursive: bool = False @@ -16,6 +23,7 @@ class Linter: def __post_init__(self): self.data = self.load_data(self.item) self.message = self.validate_file(self.item) 
+ self.config = self.parse_config(self.config_file) self.asset_type = self.message["asset_type"] if "asset_type" in self.message else "" self.version = self.message["version"] if "version" in self.message else "" self.validator_version = "2.3.0" @@ -33,6 +41,22 @@ def __post_init__(self): self.file_name = os.path.basename(self.item).split('.')[0] self.best_practices_msg = self.create_best_practices_msg() + @staticmethod + def parse_config(config_file): + default_config_file = os.getenv("STAC_CHECK_CONFIG") + if default_config_file: + with open(default_config_file) as f: + default_config = yaml.load(f, Loader=yaml.FullLoader) + else: + with pkg_resources.resource_stream(__name__, "stac-check.config.yml") as f: + default_config = yaml.load(f, Loader=yaml.FullLoader) + if config_file: + with open(config_file) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + default_config.update(config) + + return default_config + def load_data(self, file): if is_valid_url(file): resp = requests.get(file) @@ -90,13 +114,13 @@ def check_summaries(self): if self.asset_type == "COLLECTION": return "summaries" in self.data - def check_bloated_links(self): + def check_bloated_links(self, max_links: Optional[int] = 20): if "links" in self.data: - return len(self.data["links"]) > 20 + return len(self.data["links"]) > max_links - def check_bloated_metadata(self): + def check_bloated_metadata(self, max_properties: Optional[int] = 20): if "properties" in self.data: - return len(self.data["properties"].keys()) > 20 + return len(self.data["properties"].keys()) > max_properties def check_datetime_null(self): if "properties" in self.data: @@ -169,72 +193,75 @@ def check_catalog_id_file_name(self): def create_best_practices_dict(self): best_practices_dict = {} + config = self.config["linting"] + max_links = self.config["settings"]["max_links"] + max_properties = self.config["settings"]["max_properties"] # best practices - item ids should only contain searchable identifiers - if 
self.check_searchable_identifiers() == False: + if self.check_searchable_identifiers() == False and config["searchable_identifiers"] == True: msg_1 = f"Item name '{self.object_id}' should only contain Searchable identifiers" msg_2 = f"Identifiers should consist of only lowercase characters, numbers, '_', and '-'" best_practices_dict["searchable_identifiers"] = [msg_1, msg_2] # best practices - item ids should not contain ':' or '/' characters - if self.check_percent_encoded(): + if self.check_percent_encoded() and config["percent_encoded"] == True: msg_1 = f"Item name '{self.object_id}' should not contain ':' or '/'" msg_2 = f"https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#item-ids" best_practices_dict["percent_encoded"] = [msg_1, msg_2] # best practices - item ids should match file names - if not self.check_item_id_file_name(): + if not self.check_item_id_file_name() and config["item_id_file_name"] == True: msg_1 = f"Item file names should match their ids: '{self.file_name}' not equal to '{self.object_id}" best_practices_dict["check_item_id"] = [msg_1] # best practices - collection and catalog file names should be collection.json and catalog.json - if not self.check_catalog_id_file_name(): + if not self.check_catalog_id_file_name() and config["catalog_id_file_name"] == True: msg_1 = f"Object should be called '{self.asset_type.lower()}.json' not '{self.file_name}.json'" best_practices_dict["check_catalog_id"] = [msg_1] # best practices - collections should contain summaries - if self.check_summaries() == False: + if self.check_summaries() == False and config["check_summaries"] == True: msg_1 = f"A STAC collection should contain a summaries field" msg_2 = f"It is recommended to store information like eo:bands in summaries" best_practices_dict["check_summaries"] = [msg_1, msg_2] - # best practices - datetime files should not be set to null - if self.check_datetime_null(): + # best practices - datetime fields should not be set to null + if 
self.check_datetime_null() and config["null_datetime"] == True: msg_1 = f"Please avoid setting the datetime field to null, many clients search on this field" best_practices_dict["datetime_null"] = [msg_1] # best practices - check unlocated items to make sure bbox field is not set - if self.check_unlocated(): + if self.check_unlocated() and config["check_unlocated"] == True: msg_1 = f"Unlocated item. Please avoid setting the bbox field when geometry is set to null" best_practices_dict["check_unlocated"] = [msg_1] # best practices - recommend items have a geometry - if self.check_geometry_null(): + if self.check_geometry_null() and config["check_geometry"] == True: msg_1 = f"All items should have a geometry field. STAC is not meant for non-spatial data" best_practices_dict["null_geometry"] = [msg_1] # check to see if there are too many links - if self.check_bloated_links(): + if self.check_bloated_links(max_links=max_links) and config["bloated_links"] == True: msg_1 = f"You have {len(self.data['links'])} links. Please consider using sub-collections or sub-catalogs" best_practices_dict["bloated_links"] = [msg_1] # best practices - check for bloated metadata in properties - if self.check_bloated_metadata(): + if self.check_bloated_metadata(max_properties=max_properties) and config["bloated_metadata"] == True: msg_1 = f"You have {len(self.data['properties'])} properties. Please consider using links to avoid bloated metadata" best_practices_dict["bloated_metadata"] = [msg_1] # best practices - ensure thumbnail is a small file size ["png", "jpeg", "jpg", "webp"] - if not self.check_thumbnail() and self.asset_type == "ITEM": + if not self.check_thumbnail() and self.asset_type == "ITEM" and config["check_thumbnail"] == True: msg_1 = f"A thumbnail should have a small file size ie. 
png, jpeg, jpg, webp" best_practices_dict["check_thumbnail"] = [msg_1] # best practices - ensure that links in catalogs and collections include a title field - if not self.check_links_title_field(): + if not self.check_links_title_field() and config["links_title"] == True: msg_1 = f"Links in catalogs and collections should always have a 'title' field" best_practices_dict["check_links_title"] = [msg_1] # best practices - ensure that links in catalogs and collections include self link - if not self.check_links_self(): + if not self.check_links_self() and config["links_self"] == True: msg_1 = f"A link to 'self' in links is strongly recommended" best_practices_dict["check_links_self"] = [msg_1] diff --git a/stac_check/stac-check.config.yml b/stac_check/stac-check.config.yml new file mode 100644 index 0000000..bccdfd9 --- /dev/null +++ b/stac_check/stac-check.config.yml @@ -0,0 +1,33 @@ +linting: + # Identifiers should consist of only lowercase characters, numbers, '_', and '-' + searchable_identifiers: true + # Item name '{self.object_id}' should not contain ':' or '/' + percent_encoded: true + # Item file names should match their ids + item_id_file_name: true + # Collections and catalogs should be named collection.json and catalog.json + catalog_id_file_name: true + # A STAC collection should contain a summaries field + check_summaries: true + # Datetime fields should not be set to null + null_datetime: true + # best practices - check unlocated items to make sure bbox field is not set + check_unlocated: true + # best practices - recommend items have a geometry + check_geometry: true + # check to see if there are too many links + bloated_links: true + # best practices - check for bloated metadata in properties + bloated_metadata: true + # best practices - ensure thumbnail is a small file size ["png", "jpeg", "jpg", "webp"] + check_thumbnail: true + # best practices - ensure that links in catalogs and collections include a title field + links_title: true + # best 
practices - ensure that links in catalogs and collections include self link + links_self: true + +settings: + # number of links before the bloated links warning is shown + max_links: 20 + # number of properties before the bloated metadata warning is shown + max_properties: 20 \ No newline at end of file diff --git a/tests/test.config.yml b/tests/test.config.yml new file mode 100644 index 0000000..031ac62 --- /dev/null +++ b/tests/test.config.yml @@ -0,0 +1,33 @@ +linting: + # Identifiers should consist of only lowercase characters, numbers, '_', and '-' + searchable_identifiers: false + # Item name '{self.object_id}' should not contain ':' or '/' + percent_encoded: true + # Item file names should match their ids + item_id_file_name: true + # Collections and catalogs should be named collection.json and catalog.json + catalog_id_file_name: true + # A STAC collection should contain a summaries field + check_summaries: true + # Datetime fields should not be set to null + null_datetime: true + # best practices - check unlocated items to make sure bbox field is not set + check_unlocated: true + # best practices - recommend items have a geometry + check_geometry: true + # check to see if there are too many links + bloated_links: true + # best practices - check for bloated metadata in properties + bloated_metadata: true + # best practices - ensure thumbnail is a small file size ["png", "jpeg", "jpg", "webp"] + check_thumbnail: true + # best practices - ensure that links in catalogs and collections include a title field + links_title: true + # best practices - ensure that links in catalogs and collections include self link + links_self: true + +settings: + # number of links before the bloated links warning is shown + max_links: 200 + # number of properties before the bloated metadata warning is shown + max_properties: 20 \ No newline at end of file diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..ebbbfc4 --- /dev/null +++ 
b/tests/test_config.py @@ -0,0 +1,32 @@ +from stac_check.lint import Linter + +def test_linter_config_file(): + file = "sample_files/1.0.0/core-item.json" + linter = Linter(file) + + # Use default config + assert linter.config["linting"]["searchable_identifiers"] == True + assert linter.create_best_practices_dict()["searchable_identifiers"] == [ + f"Item name '{linter.object_id}' should only contain Searchable identifiers", + "Identifiers should consist of only lowercase characters, numbers, '_', and '-'" + ] + + # Load config file + linter = Linter(file, config_file="tests/test.config.yml") + + assert linter.config["linting"]["searchable_identifiers"] == False + assert "searchable_identifiers" not in linter.create_best_practices_dict() + +def test_linter_max_links(): + file = "sample_files/1.0.0/core-item-bloated.json" + linter = Linter(file) + + assert linter.check_bloated_links() == True + assert len(linter.data["links"]) > 20 + + # Load config file + linter = Linter(file, config_file="tests/test.config.yml") + assert "bloated_links" not in linter.create_best_practices_dict() + + + \ No newline at end of file