Skip to content

Commit

Permalink
Feature/add directory scan for ContentFileScanner. (#4)
Browse files Browse the repository at this point in the history
* Add async method for external column names scanner (#3)

* Add scan directory for the ContentFilesScanner
  • Loading branch information
fvaleye authored Nov 8, 2021
1 parent c5f7e32 commit 197c2b3
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 9 deletions.
2 changes: 1 addition & 1 deletion python/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "metadata_guardian-python"
version = "0.0.8"
version = "0.0.9"
authors = ["Florian Valeye <fvaleye@github.com>"]
homepage = "https://fvaleye.github.io/metadata-guardian/python"
license = "Apache-2.0"
Expand Down
6 changes: 3 additions & 3 deletions python/docs/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ Scan the column names of a local source:

Scan content of a file:

>>> from metadata_guardian import DataRules, ContentFileScanner, AvailableCategory
>>> from metadata_guardian import DataRules, ContentFilesScanner, AvailableCategory
>>>
>>> data_rules = DataRules.from_available_category(category=AvailableCategory.PII)
>>> column_scanner = ContentFileScanner(data_rules=data_rules)
>>> report = column_scanner.scan_local_file(path="path")
>>> content_file_scanner = ContentFilesScanner(data_rules=data_rules)
>>> report = content_file_scanner.scan_local_file(path="path")
>>> report.to_console()

23 changes: 21 additions & 2 deletions python/metadata_guardian/scanner.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional
Expand Down Expand Up @@ -203,7 +204,7 @@ async def async_validate_words(table_name: str) -> ReportResults:


@dataclass
class ContentFileScanner:
class ContentFilesScanner:
"""Content Files Scanner instance."""

data_rules: DataRules
Expand All @@ -212,7 +213,7 @@ def scan_local_file(self, path: str) -> MetadataGuardianReport:
"""
Scan a file with data rules.
:param path: the path of the file to scan
:return: Metadata Guardian report
:return: a Metadata Guardian report
"""
return MetadataGuardianReport(
report_results=[
Expand All @@ -221,3 +222,21 @@ def scan_local_file(self, path: str) -> MetadataGuardianReport:
)
]
)

def scan_directory(
    self, directory_path: str, file_names_extension: str
) -> MetadataGuardianReport:
    """
    Scan all the files found under a directory path that match a file name extension.

    The directory is traversed with ``os.walk``, so files located in nested
    sub-directories are scanned as well. Each matching file is scanned with
    ``scan_local_file`` and its report is appended to the returned report.
    :param directory_path: the directory path to scan
    :param file_names_extension: the file name extension to include (without the .)
    :return: a Metadata Guardian report
    """
    # Loop-invariant: build the suffix once instead of once per file.
    suffix = f".{file_names_extension}"
    report = MetadataGuardianReport()
    for root, _dirs, files in os.walk(directory_path):
        for name in files:
            if name.endswith(suffix):
                # os.path.join uses the platform separator and does not double it
                # when root already ends with one (e.g. directory_path="dir/").
                report.append(
                    other_report=self.scan_local_file(path=os.path.join(root, name))
                )
    return report
4 changes: 2 additions & 2 deletions python/tests/test_data_rules.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pytest

from metadata_guardian.data_rules import AvailableCategory, DataRules
from metadata_guardian.scanner import ColumnScanner, ContentFileScanner
from metadata_guardian.scanner import ColumnScanner, ContentFilesScanner
from metadata_guardian.source.local.avro_schema_source import AvroSchemaSource


Expand Down Expand Up @@ -49,7 +49,7 @@ def test_get_data_rules_from_category_inclusion_no_violation(local_file):
def test_get_data_rules_from_category_inclusion_violation_content(local_file):
data_rules = DataRules.from_available_category(category=AvailableCategory.INCLUSION)

md_results = ContentFileScanner(data_rules=data_rules).scan_local_file(local_file)
md_results = ContentFilesScanner(data_rules=data_rules).scan_local_file(local_file)

assert len(md_results.report_results[0].results) == 1
assert "resources/inclusion_violation.txt" in md_results.report_results[0].source
Expand Down
16 changes: 15 additions & 1 deletion python/tests/test_scanner.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import asyncio
import os
from unittest.mock import patch

from metadata_guardian.data_rules import AvailableCategory, DataRules
from metadata_guardian.report import MetadataGuardianReport, ReportResults
from metadata_guardian.scanner import ColumnScanner
from metadata_guardian.scanner import ColumnScanner, ContentFilesScanner
from metadata_guardian.source.external.snowflake_source import SnowflakeSource


Expand Down Expand Up @@ -125,3 +126,16 @@ def test_column_scanner_database_name_async(mock_connection):
)

assert report == expected


def test_local_directory_scan():
    """Scanning the tests resources directory for txt files reports the inclusion violation file."""
    resources_directory = os.path.join(os.path.dirname(__file__), "resources")
    inclusion_rules = DataRules.from_available_category(
        category=AvailableCategory.INCLUSION
    )
    scanner = ContentFilesScanner(data_rules=inclusion_rules)

    directory_report = scanner.scan_directory(
        directory_path=resources_directory, file_names_extension="txt"
    )

    assert "resources/inclusion_violation.txt" in str(directory_report)

0 comments on commit 197c2b3

Please sign in to comment.