Skip to content

Commit

Permalink
feature: First version of spacy plugin for piicatcher
Browse files Browse the repository at this point in the history
A piicatcher plugin that uses spacy to scan column data. By default it
downloads en_US_core_news_lg as the model to use.
  • Loading branch information
vrajat committed Dec 18, 2021
1 parent 91356e5 commit 25c90fd
Show file tree
Hide file tree
Showing 11 changed files with 3,083 additions and 0 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# This workflow installs the project's dependencies with Poetry and runs the
# test suite with pytest on each Python version listed in the matrix below.
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: piicatcher_spacy
on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.6', '3.7', '3.8']
    name: Python ${{ matrix.python-version }}
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Python Poetry
        uses: abatilo/actions-poetry@v2.1.4
      - name: View poetry version
        run: poetry --version
      - name: Install dependencies
        run: |
          python -m poetry install
      - name: Test with pytest
        # Coverage is collected for this repository's own package
        # (piicatcher_spacy), not for the upstream piicatcher dependency.
        run: |
          python -m poetry run pytest --junitxml=junit/test-results.xml --cov=piicatcher_spacy --cov-report=xml --cov-report=html tests/
50 changes: 50 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Creates a GitHub release and publishes the package to PyPI whenever a
# version tag (v*) is pushed.
name: Upload Python Package

on:
  push:
    # Sequence of patterns matched against refs/tags
    tags:
      - 'v*' # Push events to matching v*, i.e. v1.0, v20.15.10

jobs:
  release:
    name: Create Release
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        # Pinned to the same major version as the other checkout steps instead
        # of tracking the moving master branch of the action.
        uses: actions/checkout@v2
      - name: Build Changelog
        id: github_release
        uses: mikepenz/release-changelog-builder-action@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Create Release
        id: create_release
        uses: softprops/action-gh-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token
        with:
          # Release notes come from the changelog step above.
          body: ${{steps.github_release.outputs.changelog}}
          draft: false
          prerelease: false
  deploy:
    # Publish only after the GitHub release job has succeeded.
    needs: release
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: "3.8"
      - name: Install Python Poetry
        uses: abatilo/actions-poetry@v2.1.4
      - name: View poetry version
        run: poetry --version
      - name: Install dependencies
        run: |
          python -m poetry install
      - name: Build and publish
        env:
          PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }}
          PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          poetry publish --build --username "${PYPI_USERNAME}" --password "${PYPI_PASSWORD}"
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,5 @@ dmypy.json

# Pyre type checker
.pyre/

.idea
32 changes: 32 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Local pre-commit hooks. Every hook uses `language: system` and invokes the
# tool through `poetry run`.
repos:
  - repo: local
    hooks:
      # Import ordering.
      - id: isort
        name: isort
        stages:
          - commit
        language: system
        entry: poetry run isort
        types:
          - python

      # Code formatting.
      - id: black
        name: black
        stages:
          - commit
        language: system
        entry: poetry run black
        types:
          - python

      # Linting; setup.py is excluded.
      - id: flake8
        name: flake8
        stages:
          - commit
        language: system
        entry: poetry run flake8
        types:
          - python
        exclude: setup.py

      # Static type checking; file names are not passed to mypy.
      - id: mypy
        name: mypy
        stages:
          - commit
        language: system
        entry: poetry run mypy
        types:
          - python
        pass_filenames: false
Empty file added piicatcher_spacy/__init__.py
Empty file.
1 change: 1 addition & 0 deletions piicatcher_spacy/detectors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .spacy import SpacyDetector
96 changes: 96 additions & 0 deletions piicatcher_spacy/detectors/spacy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import importlib
import logging
from typing import Optional

import spacy
from dbcat.catalog import CatColumn
from dbcat.catalog.pii_types import PiiType
from piicatcher import Address, Person, BirthDate
from piicatcher.detectors import register_detector, DatumDetector


# Module-level logger, named after this module's import path.
LOGGER = logging.getLogger(__name__)


@register_detector
class SpacyDetector(DatumDetector):
    """Datum detector that classifies a value using spaCy named entities.

    The detector loads a spaCy pipeline once at construction time and, for
    each datum, returns the PII type mapped to the first recognised entity
    label found in ``pii_cls_map``.
    """

    # spaCy entity label -> PII type class returned by ``detect``.
    pii_cls_map = {
        'FAC': Address,  # Buildings, airports, highways, bridges, etc.
        'GPE': Address,  # Countries, cities, states.
        'LOC': Address,  # Non-GPE locations, mountain ranges, bodies of water.
        'PERSON': Person,  # People, including fictional.
        'PER': Person,  # Bug in french model
        'DATE': BirthDate,  # Any DATE entity; no age-range check is done here.
    }
    name = 'DatumSpacyDetector'

    def __init__(self, model: str = "en_US_core_news_lg"):
        """Load the spaCy pipeline named ``model``.

        Args:
            model: Name of the spaCy pipeline package to load. If it is not
                installed, a download is attempted.
                NOTE(review): the default does not look like a published
                spaCy package name (English pipelines are usually named
                ``en_core_web_*``) -- confirm before relying on it.

        Raises:
            ImportError: If the installed spaCy is not major version 3.
            ValueError: If the model cannot be found, or if its pipeline has
                no ``ner`` component.
        """
        super().__init__()

        # Fixes a warning message from transformers that is pulled in via spacy
        import os
        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
        self.check_spacy_version()

        # Remember the requested model; the error messages below need it.
        self.model = model

        if not self.check_spacy_model(model):
            raise ValueError("Unable to find spacy model '{}'. Is your language supported? "
                             "Check the list of models available here: "
                             "https://github.com/explosion/spacy-models ".format(model))

        self.nlp = spacy.load(model)

        # If the model doesn't support named entity recognition
        if 'ner' not in [step[0] for step in self.nlp.pipeline]:
            raise ValueError(
                "The spacy model '{}' doesn't support named entity recognition, "
                "please choose another model.".format(model)
            )

    @staticmethod
    def check_spacy_version() -> bool:
        """Ensure that the version of spaCy is v3.

        Returns:
            ``True`` when the installed spaCy has major version 3.

        Raises:
            ImportError: If the version is missing, unparsable, or not 3.x.
        """
        spacy_version = spacy.__version__

        if spacy_version is None:
            raise ImportError('Spacy v3 needs to be installed. Unable to detect spacy version.')
        try:
            spacy_major = int(spacy_version.split('.')[0])
        except (AttributeError, ValueError) as err:
            # Not a string, or the leading component is not an integer.
            raise ImportError(
                'Spacy v3 needs to be installed. Spacy version {} is unknown.'.format(spacy_version)
            ) from err
        if spacy_major != 3:
            raise ImportError('Spacy v3 needs to be installed. Detected version {}.'.format(spacy_version))

        return True

    @staticmethod
    def check_spacy_model(model) -> bool:
        """Ensure that the spaCy model is installed, downloading it if needed.

        Args:
            model: Name of the spaCy pipeline package to look for.

        Returns:
            ``True`` when ``model`` is among the known models after the check.

        Raises:
            ValueError: If the list of installed models cannot be read from
                ``spacy.info()``.
        """
        spacy_info = spacy.info()
        if isinstance(spacy_info, str):
            raise ValueError('Unable to detect spacy models.')

        # Look under 'pipelines' first and fall back to 'models'. Check for a
        # missing entry *before* calling .keys() so the intended ValueError is
        # raised rather than an AttributeError on None.
        installed = spacy_info.get('pipelines', spacy_info.get('models', None))
        if installed is None:
            raise ValueError('Unable to detect spacy models.')
        models = list(installed.keys())

        if model not in models:
            LOGGER.info("Downloading spacy model {}".format(model))
            spacy.cli.download(model)
            importlib.import_module(model)
            # spacy.info() doesn't update after a spacy.cli.download, so
            # there's no point checking it again.
            models.append(model)

        # Always returns true, if it fails to download, spacy sys.exit()s
        return model in models

    def detect(self, column: CatColumn, datum: str) -> Optional[PiiType]:
        """Return the PII type of the first mapped entity found in ``datum``.

        Args:
            column: Column the datum belongs to (not used by this detector).
            datum: Text value to run through the spaCy pipeline.

        Returns:
            An instance of the PII type mapped in ``pii_cls_map`` for the
            first entity whose label is present there, or ``None`` when no
            entity matches.
        """
        doc = self.nlp(datum)
        for ent in doc.ents:
            LOGGER.debug("Found %s", ent.label_)
            # Use the class-level map so every label listed there is honoured,
            # not just a hard-coded subset.
            pii_cls = self.pii_cls_map.get(ent.label_)
            if pii_cls is not None:
                return pii_cls()

        return None
Loading

0 comments on commit 25c90fd

Please sign in to comment.