Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Increase versions, speed & update model downloading #10

Merged
merged 12 commits into from
Jul 27, 2022
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

## Version 2.0.0 - Feature release - 2022-06-13
- Upgrade Flair, spaCy and model downloading functionality

## Version 1.3.4 - Feature release - 2022-01-04
- Add Japanese support

Expand Down
26 changes: 14 additions & 12 deletions code-env/python/spec/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
torch==1.6.0
flair==0.6.1
flair==0.11.3
flask>=2.0,<2.1
gensim==3.8.0
flask>=1.0,<1.1
numpy==1.19.5
spacy[ja]==3.3.0
nicolasdalsass marked this conversation as resolved.
Show resolved Hide resolved
tokenizers==0.10.3; python_version == '3.6'
sudachipy==0.6.0; python_version == '3.6'
tqdm==4.50.0
spacy[ja]==2.3.2
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.3.1/es_core_news_sm-2.3.1.tar.gz
https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-2.3.1/zh_core_web_sm-2.3.1.tar.gz
https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-2.3.0/pl_core_news_sm-2.3.0.tar.gz
https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-2.3.0/nb_core_news_sm-2.3.0.tar.gz
https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz
https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.3.0/de_core_news_sm-2.3.0.tar.gz
https://github.com/explosion/spacy-models/releases/download/ja_core_news_sm-2.3.0/ja_core_news_sm-2.3.0.tar.gz
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz
# https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0.tar.gz
# https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.3.0/zh_core_web_sm-3.3.0.tar.gz
# https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-3.3.0/pl_core_news_sm-3.3.0.tar.gz
# https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-3.3.0/nb_core_news_sm-3.3.0.tar.gz
# https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.3.0/fr_core_news_sm-3.3.0.tar.gz
# https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.3.0/de_core_news_sm-3.3.0.tar.gz
# https://github.com/explosion/spacy-models/releases/download/ja_core_news_sm-3.3.0/ja_core_news_sm-3.3.0.tar.gz
19 changes: 19 additions & 0 deletions code-env/python/spec/resources_init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
######################## Base imports #################################
from dataiku.code_env_resources import clear_all_env_vars
from dataiku.code_env_resources import set_env_path

######################## Download FLAIR Models ###########################
# Clear all environment variables defined by a previously run script
clear_all_env_vars()

# Set Flair cache directory
set_env_path("FLAIR_CACHE_ROOT", "flair")

from flair.models import SequenceTagger

# Download pretrained model: automatically managed by Flair,
# does not download anything if model is already in FLAIR_CACHE_ROOT
SequenceTagger.load('flair/ner-english-fast')
# Add any other models you want to download, check https://huggingface.co/flair for examples
# E.g. SequenceTagger.load('flair/ner-french')
# Make sure to modify the model used in recipe.py if you want to use a different model
12 changes: 1 addition & 11 deletions custom-recipes/named-entity-recognition-extract/recipe.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,6 @@
"arity": "UNARY",
"required": true,
"acceptsDataset": true
},
{
"name": "model_folder",
"label": "Flair model (optional)",
"description": "Folder containing Flair model weights",
"arity": "UNARY",
"required": false,
"acceptsManagedFolder": true,
"acceptsDataset": false,
"mustBeStrictlyType": "Filesystem"
}
],
"outputRoles": [
Expand Down Expand Up @@ -124,7 +114,7 @@
"name": "ner_model",
"label": "Model",
"type": "SELECT",
"description": "spaCy (multi-lingual, faster) or Flair (English only, slower)",
"description": "spaCy (faster) or Flair (slower)",
"selectChoices": [
{
"value": "spacy",
Expand Down
22 changes: 11 additions & 11 deletions custom-recipes/named-entity-recognition-extract/recipe.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
import multiprocessing

import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config

Expand Down Expand Up @@ -37,16 +39,10 @@

language = recipe_config.get("text_language_spacy", "en")
else:
from ner_utils_flair import extract_entities, CustomSequenceTagger

try:
model_folder = get_input_names_for_role("model_folder")[0]
except IndexError:
raise Exception(
"To use Flair, download the model using the macro and add the resulting folder as input to the recipe."
)
folder_path = dataiku.Folder(model_folder).get_path()
tagger = CustomSequenceTagger.load("ner-ontonotes-fast", folder_path)
from flair.models import SequenceTagger
from ner_utils_flair import extract_entities

tagger = SequenceTagger.load("flair/ner-english-fast")

#############################
# Main Loop
Expand All @@ -63,7 +59,11 @@ def compute_entities_df(df):
out_df = df.merge(out_df, left_index=True, right_index=True)
return out_df

if ner_model == "spacy":
chunksize = 200 * multiprocessing.cpu_count()
else:
chunksize = 100

process_dataset_chunks(
input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=100
input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=chunksize
)
2 changes: 1 addition & 1 deletion plugin.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"id": "named-entity-recognition",
"version": "1.3.4",
"version": "2.0.0",
"meta": {
"label": "Named Entity Recognition",
"category": "Natural Language Processing",
Expand Down
155 changes: 12 additions & 143 deletions python-lib/ner_utils_flair.py
Original file line number Diff line number Diff line change
@@ -1,134 +1,9 @@
# -*- coding: utf-8 -*-
import os
import logging
import re
import json
import requests
from urllib.parse import urlparse
from collections import defaultdict
import json

from flair.data import Sentence
from flair.models.sequence_tagger_model import SequenceTagger
import pandas as pd
from tqdm import tqdm

FLAIR_ENTITIES = [
"PERSON",
"NORP",
"FAC",
"ORG",
"GPE",
"LOC",
"PRODUCT",
"EVENT",
"WORK_OF_ART",
"LAW",
"LANGUAGE",
"DATE",
"TIME",
"PERCENT",
"MONEY",
"QUANTITY",
"ORDINAL",
"CARDINAL",
]


def get_from_cache(url: str, cache_dir: str = None) -> str:
"""
Given a URL, look for the corresponding dataset in the local cache.
If it's not there, download it. Then return the path to the cached file.
"""
os.makedirs(cache_dir, exist_ok=True)

filename = re.sub(r".+/", "", url)
# get cache path to put the file
cache_path = os.path.join(cache_dir, filename)
if os.path.exists(cache_path):
logging.info("File {} found in cache".format(filename))
return cache_path

# make HEAD request to check ETag
response = requests.head(url)
if response.status_code != 200:
raise IOError("HEAD request failed for url {}".format(url))

if not os.path.exists(cache_path):
logging.info("File {} not found in cache, downloading from URL {}...".format(filename, url))
req = requests.get(url, stream=True)
content_length = req.headers.get("Content-Length")
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total)
with open(cache_path, "wb") as temp_file:
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()

return cache_path


def cached_path(url_or_filename: str, cache_path: str, cache_dir: str) -> str:
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
"""
dataset_cache = os.path.join(cache_path, cache_dir)

parsed = urlparse(url_or_filename)

if parsed.scheme in ("http", "https"):
# URL, so get it from the cache (downloading if necessary)
return get_from_cache(url_or_filename, dataset_cache)
elif parsed.scheme == "" and os.path.exists(url_or_filename):
# File, and it exists.
return url_or_filename
elif parsed.scheme == "":
# File, but it doesn't exist.
raise FileNotFoundError("file {} not found".format(url_or_filename))
else:
# Something unknown
raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))


class CustomSequenceTagger(SequenceTagger):
@staticmethod
def load(model: str, cache_path: str):
model_file = None
aws_resource_path = "https://nlp.informatik.hu-berlin.de/resources/models"

if model.lower() == "ner":
base_path = "/".join([aws_resource_path, "ner", "en-ner-conll03-v0.4.pt"])
model_file = cached_path(base_path, cache_path, cache_dir="models")

if model.lower() == "ner-ontonotes-fast":
base_path = "/".join([aws_resource_path, "ner-ontonotes-fast", "en-ner-ontonotes-fast-v0.4.pt"])
model_file = cached_path(base_path, cache_path, cache_dir="models")

if model_file is not None:
tagger = SequenceTagger.load(model_file)
return tagger


#############################
# NER function
#############################

# Regex for matching either
PATTERN = r"({}|{})".format(
# Single-word entities
r"(?:\s*\S+ <S-[A-Z_]*>)", # (<S-TAG> format)
# Match multi-word entities
r"{}{}{}".format(
r"(?:\s*\S+ <B-[A-Z_]*>)", # A first tag in <B-TAG> format
r"(?:\s*\S+ <I-[A-Z_]*>)*", # Zero or more tags in <I-TAG> format
r"(?:\s*\S+ <E-[A-Z_]*>)", # A final tag in <E-TAG> format
),
)
matcher = re.compile(PATTERN)


def extract_entities(text_column, format, tagger):
Expand All @@ -138,33 +13,27 @@ def extract_entities(text_column, format, tagger):
# Tag Sentences
tagger.predict(sentences)

# Retrieve entities
if format:
entity_df = pd.DataFrame()
else:
entity_df = pd.DataFrame(columns=FLAIR_ENTITIES)

# Extract entities
rows = []
for sentence in sentences:
df_row = defaultdict(list)
entities = matcher.findall(sentence.to_tagged_string())
# Entities are in the following format: word1 <X-TAG> word2 <X-TAG> ...
for entity in entities:
# Extract entity text (word1, word2, ...)
text = " ".join(entity.split()[::2])
# Extract entity type (TAG)
tag = re.search(r"<.-(.+?)>", entity).group(1)
for entity in sentence.get_spans('ner'):
tag = entity.get_label("ner").value
text = entity.text
df_row[tag].append(text)

if format:
df_row = {"sentence": sentence.to_plain_string(), "entities": json.dumps(df_row)}
else:
for k, v in df_row.items():
df_row[k] = json.dumps(v)
df_row["sentence"] = sentence.to_plain_string()

entity_df = entity_df.append(df_row, ignore_index=True)
rows.append(df_row)

cols = [col for col in entity_df.columns.tolist() if col != "sentence"]
entity_df = entity_df[cols]
entity_df = pd.DataFrame(rows)

# Put 'sentence' column first
cols = sorted(list(entity_df.columns))
cols.insert(0, cols.pop(cols.index("sentence")))
entity_df = entity_df[cols]
return entity_df
27 changes: 20 additions & 7 deletions python-lib/ner_utils_spacy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import json
from collections import defaultdict
import json

import pandas as pd
import spacy

Expand All @@ -15,14 +16,25 @@
"nb": "nb_core_news_sm",
}

def get_spacy_model(language: str):
language_model = SPACY_LANGUAGE_MODELS.get(language, None)
if language_model is None:
raise ValueError(f"The language {language} is not available. \
You can add the language & corresponding model name by editing the code.")
try:
nlp = spacy.load(language_model, exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
except OSError:
# Raising ValueError instead of OSError so it shows up at the top of the log
raise ValueError(f"Could not find spaCy model for the language {language}. \
Maybe you need to edit the requirements.txt file to enable it.")
return nlp

def extract_entities(text_column, format: bool, language: str):
# Tag sentences
nlp = spacy.load(SPACY_LANGUAGE_MODELS[language])
docs = nlp.pipe(text_column.values)

nlp = get_spacy_model(language=language)
docs = nlp.pipe(text_column.values, n_process=-1, batch_size=100)
# Extract entities
entity_df = pd.DataFrame()
rows = []
for doc in docs:
df_row = defaultdict(list)
for entity in doc.ents:
Expand All @@ -35,11 +47,12 @@ def extract_entities(text_column, format: bool, language: str):
df_row[k] = json.dumps(v)
df_row["sentence"] = doc.text

entity_df = entity_df.append(df_row, ignore_index=True)
rows.append(df_row)

entity_df = pd.DataFrame(rows)

# Put 'sentence' column first
cols = sorted(list(entity_df.columns))
cols.insert(0, cols.pop(cols.index("sentence")))
entity_df = entity_df[cols]

return entity_df
25 changes: 0 additions & 25 deletions python-runnables/named-entity-recognition-download/runnable.json

This file was deleted.

Loading