Increase versions, speed & update model downloading (#10)
Co-authored-by: Nicolas Dalsass <nicolasdalsass@users.noreply.github.com>

- Move FLAIR to resources_init & make non-en spaCy models optional
- Upgrade to spaCy 3 & increment plugin version
- Disable unused pipeline algos - divides recipe time roughly by two
- Allow multi-CPU processing - on an 8-core machine, divides recipe time roughly by 3
Muennighoff authored Jul 27, 2022
1 parent 6fb143e commit 99cdf91
Showing 11 changed files with 84 additions and 268 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,8 @@
 # Changelog
 
+## Version 2.0.0 - Feature release - 2022-06-13
+- Upgrade Flair, spaCy and model downloading functionality
+
 ## Version 1.3.4 - Feature release - 2022-01-04
 - Add Japanese support
 
26 changes: 14 additions & 12 deletions code-env/python/spec/requirements.txt
@@ -1,14 +1,16 @@
 torch==1.6.0
-flair==0.6.1
-gensim==3.8.0
-flask>=1.0,<1.1
+flair==0.11.3
+flask>=2.0,<2.1
+numpy==1.19.5
+spacy[ja]==3.3.0
+tokenizers==0.10.3; python_version == '3.6'
+sudachipy==0.6.0; python_version == '3.6'
 tqdm==4.50.0
-spacy[ja]==2.3.2
-https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
-https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.3.1/es_core_news_sm-2.3.1.tar.gz
-https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-2.3.1/zh_core_web_sm-2.3.1.tar.gz
-https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-2.3.0/pl_core_news_sm-2.3.0.tar.gz
-https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-2.3.0/nb_core_news_sm-2.3.0.tar.gz
-https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz
-https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.3.0/de_core_news_sm-2.3.0.tar.gz
-https://github.com/explosion/spacy-models/releases/download/ja_core_news_sm-2.3.0/ja_core_news_sm-2.3.0.tar.gz
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.3.0/zh_core_web_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-3.3.0/pl_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-3.3.0/nb_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.3.0/fr_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.3.0/de_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/ja_core_news_sm-3.3.0/ja_core_news_sm-3.3.0.tar.gz
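
To re-enable one of the optional languages, uncomment its line above and rebuild the code env; the matching model name should already be listed in SPACY_LANGUAGE_MODELS in ner_utils_spacy.py (if not, add it there). For example, for French:

https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.3.0/fr_core_news_sm-3.3.0.tar.gz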
19 changes: 19 additions & 0 deletions code-env/python/spec/resources_init.py
@@ -0,0 +1,19 @@
######################## Base imports #################################
from dataiku.code_env_resources import clear_all_env_vars
from dataiku.code_env_resources import set_env_path

######################## Download FLAIR Models ###########################
# Clear all environment variables defined by a previously run script
clear_all_env_vars()

# Set Flair cache directory
set_env_path("FLAIR_CACHE_ROOT", "flair")

from flair.models import SequenceTagger

# Download pretrained model: automatically managed by Flair,
# does not download anything if model is already in FLAIR_CACHE_ROOT
SequenceTagger.load('flair/ner-english-fast')
# Add any other models you want to download, check https://huggingface.co/flair for examples
# E.g. SequenceTagger.load('flair/ner-french')
# Make sure to modify the model used in recipe.py if you want to use a different model
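
For context, a minimal sketch (assuming flair 0.11.x) of why the recipe can later load this model without network access: flair resolves model names under FLAIR_CACHE_ROOT, which set_env_path points at the code env's resource directory, so SequenceTagger.load in recipe.py is served from the pre-downloaded weights. The standalone check below is hypothetical:

import os
os.environ.setdefault("FLAIR_CACHE_ROOT", "flair")  # mirrors set_env_path above
from flair.models import SequenceTagger  # flair reads the cache root at import time

tagger = SequenceTagger.load("flair/ner-english-fast")  # cache hit, no re-download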
12 changes: 1 addition & 11 deletions custom-recipes/named-entity-recognition-extract/recipe.json
@@ -14,16 +14,6 @@
       "arity": "UNARY",
       "required": true,
       "acceptsDataset": true
-    },
-    {
-      "name": "model_folder",
-      "label": "Flair model (optional)",
-      "description": "Folder containing Flair model weights",
-      "arity": "UNARY",
-      "required": false,
-      "acceptsManagedFolder": true,
-      "acceptsDataset": false,
-      "mustBeStrictlyType": "Filesystem"
     }
   ],
   "outputRoles": [
@@ -124,7 +114,7 @@
       "name": "ner_model",
       "label": "Model",
       "type": "SELECT",
-      "description": "spaCy (multi-lingual, faster) or Flair (English only, slower)",
+      "description": "spaCy (faster) or Flair (slower)",
       "selectChoices": [
         {
           "value": "spacy",
22 changes: 11 additions & 11 deletions custom-recipes/named-entity-recognition-extract/recipe.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import multiprocessing
+
 import dataiku
 from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
 
@@ -37,16 +39,10 @@
 
     language = recipe_config.get("text_language_spacy", "en")
 else:
-    from ner_utils_flair import extract_entities, CustomSequenceTagger
-
-    try:
-        model_folder = get_input_names_for_role("model_folder")[0]
-    except IndexError:
-        raise Exception(
-            "To use Flair, download the model using the macro and add the resulting folder as input to the recipe."
-        )
-    folder_path = dataiku.Folder(model_folder).get_path()
-    tagger = CustomSequenceTagger.load("ner-ontonotes-fast", folder_path)
+    from flair.models import SequenceTagger
+    from ner_utils_flair import extract_entities
+
+    tagger = SequenceTagger.load("flair/ner-english-fast")
 
 #############################
 # Main Loop
@@ -63,7 +59,11 @@ def compute_entities_df(df):
     out_df = df.merge(out_df, left_index=True, right_index=True)
     return out_df
 
+if ner_model == "spacy":
+    chunksize = 200 * multiprocessing.cpu_count()
+else:
+    chunksize = 100
+
 process_dataset_chunks(
-    input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=100
+    input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=chunksize
 )
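
For reference, a rough, hypothetical stand-in for the plugin's process_dataset_chunks helper (the real implementation lives in python-lib and may differ): it applies compute_entities_df to successive row chunks, and larger chunks amortize the start-up cost of spaCy's worker processes, which is why the spaCy branch scales chunksize with the CPU count.

import multiprocessing
import pandas as pd

def process_in_chunks(df, func, chunksize):
    # Apply func to successive row slices and stitch the results back together.
    parts = [func(df.iloc[i:i + chunksize]) for i in range(0, len(df), chunksize)]
    return pd.concat(parts, ignore_index=True)

chunksize = 200 * multiprocessing.cpu_count()  # e.g. 1600 rows per chunk on 8 cores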
2 changes: 1 addition & 1 deletion plugin.json
@@ -1,6 +1,6 @@
 {
   "id": "named-entity-recognition",
-  "version": "1.3.4",
+  "version": "2.0.0",
   "meta": {
     "label": "Named Entity Recognition",
     "category": "Natural Language Processing",
155 changes: 12 additions & 143 deletions python-lib/ner_utils_flair.py
@@ -1,134 +1,9 @@
 # -*- coding: utf-8 -*-
-import os
-import logging
-import re
-import json
-import requests
-from urllib.parse import urlparse
 from collections import defaultdict
+import json
 
 from flair.data import Sentence
-from flair.models.sequence_tagger_model import SequenceTagger
 import pandas as pd
-from tqdm import tqdm
-
-FLAIR_ENTITIES = [
-    "PERSON",
-    "NORP",
-    "FAC",
-    "ORG",
-    "GPE",
-    "LOC",
-    "PRODUCT",
-    "EVENT",
-    "WORK_OF_ART",
-    "LAW",
-    "LANGUAGE",
-    "DATE",
-    "TIME",
-    "PERCENT",
-    "MONEY",
-    "QUANTITY",
-    "ORDINAL",
-    "CARDINAL",
-]
-
-
-def get_from_cache(url: str, cache_dir: str = None) -> str:
-    """
-    Given a URL, look for the corresponding dataset in the local cache.
-    If it's not there, download it. Then return the path to the cached file.
-    """
-    os.makedirs(cache_dir, exist_ok=True)
-
-    filename = re.sub(r".+/", "", url)
-    # get cache path to put the file
-    cache_path = os.path.join(cache_dir, filename)
-    if os.path.exists(cache_path):
-        logging.info("File {} found in cache".format(filename))
-        return cache_path
-
-    # make HEAD request to check ETag
-    response = requests.head(url)
-    if response.status_code != 200:
-        raise IOError("HEAD request failed for url {}".format(url))
-
-    if not os.path.exists(cache_path):
-        logging.info("File {} not found in cache, downloading from URL {}...".format(filename, url))
-        req = requests.get(url, stream=True)
-        content_length = req.headers.get("Content-Length")
-        total = int(content_length) if content_length is not None else None
-        progress = tqdm(unit="B", total=total)
-        with open(cache_path, "wb") as temp_file:
-            for chunk in req.iter_content(chunk_size=1024):
-                if chunk:  # filter out keep-alive new chunks
-                    progress.update(len(chunk))
-                    temp_file.write(chunk)
-        progress.close()
-
-    return cache_path
-
-
-def cached_path(url_or_filename: str, cache_path: str, cache_dir: str) -> str:
-    """
-    Given something that might be a URL (or might be a local path),
-    determine which. If it's a URL, download the file and cache it, and
-    return the path to the cached file. If it's already a local path,
-    make sure the file exists and then return the path.
-    """
-    dataset_cache = os.path.join(cache_path, cache_dir)
-
-    parsed = urlparse(url_or_filename)
-
-    if parsed.scheme in ("http", "https"):
-        # URL, so get it from the cache (downloading if necessary)
-        return get_from_cache(url_or_filename, dataset_cache)
-    elif parsed.scheme == "" and os.path.exists(url_or_filename):
-        # File, and it exists.
-        return url_or_filename
-    elif parsed.scheme == "":
-        # File, but it doesn't exist.
-        raise FileNotFoundError("file {} not found".format(url_or_filename))
-    else:
-        # Something unknown
-        raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
-
-
-class CustomSequenceTagger(SequenceTagger):
-    @staticmethod
-    def load(model: str, cache_path: str):
-        model_file = None
-        aws_resource_path = "https://nlp.informatik.hu-berlin.de/resources/models"
-
-        if model.lower() == "ner":
-            base_path = "/".join([aws_resource_path, "ner", "en-ner-conll03-v0.4.pt"])
-            model_file = cached_path(base_path, cache_path, cache_dir="models")
-
-        if model.lower() == "ner-ontonotes-fast":
-            base_path = "/".join([aws_resource_path, "ner-ontonotes-fast", "en-ner-ontonotes-fast-v0.4.pt"])
-            model_file = cached_path(base_path, cache_path, cache_dir="models")
-
-        if model_file is not None:
-            tagger = SequenceTagger.load(model_file)
-            return tagger
-
 
 #############################
 # NER function
 #############################
-
-# Regex for matching either
-PATTERN = r"({}|{})".format(
-    # Single-word entities
-    r"(?:\s*\S+ <S-[A-Z_]*>)",  # (<S-TAG> format)
-    # Match multi-word entities
-    r"{}{}{}".format(
-        r"(?:\s*\S+ <B-[A-Z_]*>)",  # A first tag in <B-TAG> format
-        r"(?:\s*\S+ <I-[A-Z_]*>)*",  # Zero or more tags in <I-TAG> format
-        r"(?:\s*\S+ <E-[A-Z_]*>)",  # A final tag in <E-TAG> format
-    ),
-)
-matcher = re.compile(PATTERN)
@@ -138,33 +13,27 @@ def extract_entities(text_column, format, tagger):
 
     # Tag Sentences
     tagger.predict(sentences)
 
-    # Retrieve entities
-    if format:
-        entity_df = pd.DataFrame()
-    else:
-        entity_df = pd.DataFrame(columns=FLAIR_ENTITIES)
-
+    # Extract entities
+    rows = []
     for sentence in sentences:
         df_row = defaultdict(list)
-        entities = matcher.findall(sentence.to_tagged_string())
-        # Entities are in the following format: word1 <X-TAG> word2 <X-TAG> ...
-        for entity in entities:
-            # Extract entity text (word1, word2, ...)
-            text = " ".join(entity.split()[::2])
-            # Extract entity type (TAG)
-            tag = re.search(r"<.-(.+?)>", entity).group(1)
+        for entity in sentence.get_spans('ner'):
+            tag = entity.get_label("ner").value
+            text = entity.text
             df_row[tag].append(text)
 
         if format:
             df_row = {"sentence": sentence.to_plain_string(), "entities": json.dumps(df_row)}
         else:
             for k, v in df_row.items():
                 df_row[k] = json.dumps(v)
             df_row["sentence"] = sentence.to_plain_string()
 
-        entity_df = entity_df.append(df_row, ignore_index=True)
+        rows.append(df_row)
 
-    cols = [col for col in entity_df.columns.tolist() if col != "sentence"]
-    entity_df = entity_df[cols]
+    entity_df = pd.DataFrame(rows)
 
+    # Put 'sentence' column first
+    cols = sorted(list(entity_df.columns))
+    cols.insert(0, cols.pop(cols.index("sentence")))
+    entity_df = entity_df[cols]
     return entity_df
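
The old regex parsing of to_tagged_string() is gone in favour of flair's span API. A minimal sketch of the new path, assuming flair 0.11.x and the model pinned in recipe.py:

from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load("flair/ner-english-fast")
sentence = Sentence("George Washington went to Washington.")
tagger.predict(sentence)
for span in sentence.get_spans("ner"):
    # span.text is the surface form; the "ner" label holds the predicted type
    print(span.text, span.get_label("ner").value)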
27 changes: 20 additions & 7 deletions python-lib/ner_utils_spacy.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
-import json
 from collections import defaultdict
+import json
 
 import pandas as pd
 import spacy
 
@@ -15,14 +16,25 @@
     "nb": "nb_core_news_sm",
 }
 
+def get_spacy_model(language: str):
+    language_model = SPACY_LANGUAGE_MODELS.get(language, None)
+    if language_model is None:
+        raise ValueError(f"The language {language} is not available. \
+            You can add the language & corresponding model name by editing the code.")
+    try:
+        nlp = spacy.load(language_model, exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
+    except OSError:
+        # Raising ValueError instead of OSError so it shows up at the top of the log
+        raise ValueError(f"Could not find spaCy model for the language {language}. \
+            Maybe you need to edit the requirements.txt file to enable it.")
+    return nlp
+
 def extract_entities(text_column, format: bool, language: str):
     # Tag sentences
-    nlp = spacy.load(SPACY_LANGUAGE_MODELS[language])
-    docs = nlp.pipe(text_column.values)
-
+    nlp = get_spacy_model(language=language)
+    docs = nlp.pipe(text_column.values, n_process=-1, batch_size=100)
     # Extract entities
-    entity_df = pd.DataFrame()
+    rows = []
     for doc in docs:
         df_row = defaultdict(list)
         for entity in doc.ents:
@@ -35,11 +47,12 @@ def extract_entities(text_column, format: bool, language: str):
                 df_row[k] = json.dumps(v)
             df_row["sentence"] = doc.text
 
-        entity_df = entity_df.append(df_row, ignore_index=True)
+        rows.append(df_row)
+
+    entity_df = pd.DataFrame(rows)
 
     # Put 'sentence' column first
     cols = sorted(list(entity_df.columns))
     cols.insert(0, cols.pop(cols.index("sentence")))
     entity_df = entity_df[cols]
 
     return entity_df
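
Both speed-ups from the commit message live in this file. A minimal sketch, assuming spaCy 3.3 and the en_core_web_sm model from requirements.txt: excluding the pipeline components NER does not use cuts the per-document work (the small English pipeline's ner component embeds its own tok2vec, so it keeps working), and n_process=-1 fans nlp.pipe out over every core.

import spacy

nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"],
)
texts = ["Apple is looking at buying a U.K. startup."] * 1000
# n_process=-1 spawns one worker per CPU; batch_size is documents per worker batch.
for doc in nlp.pipe(texts, n_process=-1, batch_size=100):
    entities = [(ent.text, ent.label_) for ent in doc.ents]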
25 changes: 0 additions & 25 deletions python-runnables/named-entity-recognition-download/runnable.json

This file was deleted.
