diff --git a/CHANGELOG.md b/CHANGELOG.md
index 297d5f0..750a1a8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,8 @@
 # Changelog
 
+## Version 2.0.0 - Feature release - 2022-06-13
+- Upgrade Flair, spaCy and model downloading functionality
+
 ## Version 1.3.4 - Feature release - 2022-01-04
 - Add Japanese support
 
diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt
index 4a93d27..a160eb4 100644
--- a/code-env/python/spec/requirements.txt
+++ b/code-env/python/spec/requirements.txt
@@ -1,14 +1,16 @@
-torch==1.6.0
-flair==0.6.1
+flair==0.11.3
+flask>=2.0,<2.1
 gensim==3.8.0
-flask>=1.0,<1.1
+numpy==1.19.5
+spacy[ja]==3.3.0
+tokenizers==0.10.3; python_version == '3.6'
+sudachipy==0.6.0; python_version == '3.6'
 tqdm==4.50.0
-spacy[ja]==2.3.2
-https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
-https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.3.1/es_core_news_sm-2.3.1.tar.gz
-https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-2.3.1/zh_core_web_sm-2.3.1.tar.gz
-https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-2.3.0/pl_core_news_sm-2.3.0.tar.gz
-https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-2.3.0/nb_core_news_sm-2.3.0.tar.gz
-https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz
-https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.3.0/de_core_news_sm-2.3.0.tar.gz
-https://github.com/explosion/spacy-models/releases/download/ja_core_news_sm-2.3.0/ja_core_news_sm-2.3.0.tar.gz
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.3.0/zh_core_web_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-3.3.0/pl_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-3.3.0/nb_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.3.0/fr_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.3.0/de_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/ja_core_news_sm-3.3.0/ja_core_news_sm-3.3.0.tar.gz
diff --git a/code-env/python/spec/resources_init.py b/code-env/python/spec/resources_init.py
new file mode 100644
index 0000000..f157002
--- /dev/null
+++ b/code-env/python/spec/resources_init.py
@@ -0,0 +1,19 @@
+######################## Base imports #################################
+from dataiku.code_env_resources import clear_all_env_vars
+from dataiku.code_env_resources import set_env_path
+
+######################## Download FLAIR Models ###########################
+# Clear all environment variables defined by a previously run script
+clear_all_env_vars()
+
+# Set Flair cache directory
+set_env_path("FLAIR_CACHE_ROOT", "flair")
+
+from flair.models import SequenceTagger
+
+# Download pretrained model: automatically managed by Flair,
+# does not download anything if model is already in FLAIR_CACHE_ROOT
+SequenceTagger.load('flair/ner-english-fast')
+# Add any other models you want to download, check https://huggingface.co/flair for examples
+# E.g. SequenceTagger.load('flair/ner-french')
+# Make sure to modify the model used in recipe.py if you want to use a different model
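
Note: with `FLAIR_CACHE_ROOT` set by `resources_init.py`, every process running in this code env resolves the same cached weights. A minimal smoke-test sketch (not part of the change), assuming the flair 0.11.3 pinned above:

```python
# Sketch: confirm the pretrained tagger resolves from the code-env cache.
from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load("flair/ner-english-fast")  # found in FLAIR_CACHE_ROOT, no re-download
sentence = Sentence("George Washington went to Washington.")
tagger.predict(sentence)
for span in sentence.get_spans("ner"):
    print(span.text, span.get_label("ner").value)  # e.g. ('George Washington', 'PER')
```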
diff --git a/custom-recipes/named-entity-recognition-extract/recipe.json b/custom-recipes/named-entity-recognition-extract/recipe.json
index 32c4ec9..94116fe 100644
--- a/custom-recipes/named-entity-recognition-extract/recipe.json
+++ b/custom-recipes/named-entity-recognition-extract/recipe.json
@@ -14,16 +14,6 @@
             "arity": "UNARY",
             "required": true,
             "acceptsDataset": true
-        },
-        {
-            "name": "model_folder",
-            "label": "Flair model (optional)",
-            "description": "Folder containing Flair model weights",
-            "arity": "UNARY",
-            "required": false,
-            "acceptsManagedFolder": true,
-            "acceptsDataset": false,
-            "mustBeStrictlyType": "Filesystem"
         }
     ],
     "outputRoles": [
@@ -124,7 +114,7 @@
             "name": "ner_model",
             "label": "Model",
             "type": "SELECT",
-            "description": "spaCy (multi-lingual, faster) or Flair (English only, slower)",
+            "description": "spaCy (faster) or Flair (slower)",
             "selectChoices": [
                 {
                     "value": "spacy",
diff --git a/custom-recipes/named-entity-recognition-extract/recipe.py b/custom-recipes/named-entity-recognition-extract/recipe.py
index b7cf5d2..43acb16 100644
--- a/custom-recipes/named-entity-recognition-extract/recipe.py
+++ b/custom-recipes/named-entity-recognition-extract/recipe.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import multiprocessing
+
 import dataiku
 from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
 
@@ -37,16 +39,10 @@
     language = recipe_config.get("text_language_spacy", "en")
 else:
-    from ner_utils_flair import extract_entities, CustomSequenceTagger
-
-    try:
-        model_folder = get_input_names_for_role("model_folder")[0]
-    except IndexError:
-        raise Exception(
-            "To use Flair, download the model using the macro and add the resulting folder as input to the recipe."
-        )
-    folder_path = dataiku.Folder(model_folder).get_path()
-    tagger = CustomSequenceTagger.load("ner-ontonotes-fast", folder_path)
+    from flair.models import SequenceTagger
+    from ner_utils_flair import extract_entities
+
+    tagger = SequenceTagger.load("flair/ner-english-fast")
 
 
 #############################
 # Main Loop
@@ -63,7 +59,11 @@ def compute_entities_df(df):
     out_df = df.merge(out_df, left_index=True, right_index=True)
     return out_df
 
+if ner_model == "spacy":
+    chunksize = 200 * multiprocessing.cpu_count()
+else:
+    chunksize = 100
 process_dataset_chunks(
-    input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=100
+    input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=chunksize
 )
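
The recipe now sizes chunks by backend: spaCy parallelizes inside `nlp.pipe` (see `ner_utils_spacy.py` below), so its chunks scale with core count, while Flair keeps a flat 100 rows per chunk since the tagger holds a whole chunk in memory at once. A standalone sketch of the heuristic; `process_dataframe_chunks` is a hypothetical stand-in for the plugin's `process_dataset_chunks` helper:

```python
# Hypothetical stand-in for process_dataset_chunks, showing how chunksize is consumed.
import multiprocessing

import pandas as pd


def process_dataframe_chunks(df, func, chunksize):
    # Apply func chunk by chunk, then stitch the partial results back together.
    outputs = [func(df.iloc[i : i + chunksize]) for i in range(0, len(df), chunksize)]
    return pd.concat(outputs, ignore_index=True)


ner_model = "spacy"  # as read from get_recipe_config() in the real recipe
chunksize = 200 * multiprocessing.cpu_count() if ner_model == "spacy" else 100
```

On an 8-core executor this gives 1,600-row chunks for spaCy versus a fixed 100 for Flair.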
diff --git a/plugin.json b/plugin.json
index c8708ea..db8f529 100644
--- a/plugin.json
+++ b/plugin.json
@@ -1,6 +1,6 @@
 {
     "id": "named-entity-recognition",
-    "version": "1.3.4",
+    "version": "2.0.0",
     "meta": {
         "label": "Named Entity Recognition",
         "category": "Natural Language Processing",
diff --git a/python-lib/ner_utils_flair.py b/python-lib/ner_utils_flair.py
index 6e5ed7f..b03d248 100644
--- a/python-lib/ner_utils_flair.py
+++ b/python-lib/ner_utils_flair.py
@@ -1,134 +1,9 @@
 # -*- coding: utf-8 -*-
-import os
-import logging
-import re
-import json
-import requests
-from urllib.parse import urlparse
 from collections import defaultdict
+import json
 
 from flair.data import Sentence
-from flair.models.sequence_tagger_model import SequenceTagger
 import pandas as pd
-from tqdm import tqdm
-
-FLAIR_ENTITIES = [
-    "PERSON",
-    "NORP",
-    "FAC",
-    "ORG",
-    "GPE",
-    "LOC",
-    "PRODUCT",
-    "EVENT",
-    "WORK_OF_ART",
-    "LAW",
-    "LANGUAGE",
-    "DATE",
-    "TIME",
-    "PERCENT",
-    "MONEY",
-    "QUANTITY",
-    "ORDINAL",
-    "CARDINAL",
-]
-
-
-def get_from_cache(url: str, cache_dir: str = None) -> str:
-    """
-    Given a URL, look for the corresponding dataset in the local cache.
-    If it's not there, download it. Then return the path to the cached file.
-    """
-    os.makedirs(cache_dir, exist_ok=True)
-
-    filename = re.sub(r".+/", "", url)
-    # get cache path to put the file
-    cache_path = os.path.join(cache_dir, filename)
-    if os.path.exists(cache_path):
-        logging.info("File {} found in cache".format(filename))
-        return cache_path
-
-    # make HEAD request to check ETag
-    response = requests.head(url)
-    if response.status_code != 200:
-        raise IOError("HEAD request failed for url {}".format(url))
-
-    if not os.path.exists(cache_path):
-        logging.info("File {} not found in cache, downloading from URL {}...".format(filename, url))
-        req = requests.get(url, stream=True)
-        content_length = req.headers.get("Content-Length")
-        total = int(content_length) if content_length is not None else None
-        progress = tqdm(unit="B", total=total)
-        with open(cache_path, "wb") as temp_file:
-            for chunk in req.iter_content(chunk_size=1024):
-                if chunk:  # filter out keep-alive new chunks
-                    progress.update(len(chunk))
-                    temp_file.write(chunk)
-        progress.close()
-
-    return cache_path
-
-
-def cached_path(url_or_filename: str, cache_path: str, cache_dir: str) -> str:
-    """
-    Given something that might be a URL (or might be a local path),
-    determine which. If it's a URL, download the file and cache it, and
-    return the path to the cached file. If it's already a local path,
-    make sure the file exists and then return the path.
-    """
-    dataset_cache = os.path.join(cache_path, cache_dir)
-
-    parsed = urlparse(url_or_filename)
-
-    if parsed.scheme in ("http", "https"):
-        # URL, so get it from the cache (downloading if necessary)
-        return get_from_cache(url_or_filename, dataset_cache)
-    elif parsed.scheme == "" and os.path.exists(url_or_filename):
-        # File, and it exists.
-        return url_or_filename
-    elif parsed.scheme == "":
-        # File, but it doesn't exist.
-        raise FileNotFoundError("file {} not found".format(url_or_filename))
-    else:
-        # Something unknown
-        raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
-
-
-class CustomSequenceTagger(SequenceTagger):
-    @staticmethod
-    def load(model: str, cache_path: str):
-        model_file = None
-        aws_resource_path = "https://nlp.informatik.hu-berlin.de/resources/models"
-
-        if model.lower() == "ner":
-            base_path = "/".join([aws_resource_path, "ner", "en-ner-conll03-v0.4.pt"])
-            model_file = cached_path(base_path, cache_path, cache_dir="models")
-
-        if model.lower() == "ner-ontonotes-fast":
-            base_path = "/".join([aws_resource_path, "ner-ontonotes-fast", "en-ner-ontonotes-fast-v0.4.pt"])
-            model_file = cached_path(base_path, cache_path, cache_dir="models")
-
-        if model_file is not None:
-            tagger = SequenceTagger.load(model_file)
-            return tagger
-
-
-#############################
-# NER function
-#############################
-
-# Regex for matching either
-PATTERN = r"({}|{})".format(
-    # Single-word entities
-    r"(?:\s*\S+ <S-\w*>)",  # (<S-TAG> format)
-    # Match multi-word entities
-    r"{}{}{}".format(
-        r"(?:\s*\S+ <B-\w*>)",  # A first tag in <B-TAG> format
-        r"(?:\s*\S+ <I-\w*>)*",  # Zero or more tags in <I-TAG> format
-        r"(?:\s*\S+ <E-\w*>)",  # A final tag in <E-TAG> format
-    ),
-)
-matcher = re.compile(PATTERN)
 
 
 def extract_entities(text_column, format, tagger):
@@ -138,23 +13,14 @@
     # Tag Sentences
     tagger.predict(sentences)
 
-    # Retrieve entities
-    if format:
-        entity_df = pd.DataFrame()
-    else:
-        entity_df = pd.DataFrame(columns=FLAIR_ENTITIES)
-
+    # Extract entities
+    rows = []
     for sentence in sentences:
         df_row = defaultdict(list)
-        entities = matcher.findall(sentence.to_tagged_string())
-        # Entities are in the following format: <B-TAG> word1 <I-TAG> word2 ...
-        for entity in entities:
-            # Extract entity text (word1, word2, ...)
-            text = " ".join(entity.split()[::2])
-            # Extract entity type (TAG)
-            tag = re.search(r"<.-(.+?)>", entity).group(1)
+        for entity in sentence.get_spans('ner'):
+            tag = entity.get_label("ner").value
+            text = entity.text
             df_row[tag].append(text)
-
         if format:
             df_row = {"sentence": sentence.to_plain_string(), "entities": json.dumps(df_row)}
         else:
@@ -162,9 +28,12 @@
                 df_row[k] = json.dumps(v)
             df_row["sentence"] = sentence.to_plain_string()
 
-        entity_df = entity_df.append(df_row, ignore_index=True)
+        rows.append(df_row)
 
-    cols = [col for col in entity_df.columns.tolist() if col != "sentence"]
-    entity_df = entity_df[cols]
+    entity_df = pd.DataFrame(rows)
+    # Put 'sentence' column first
+    cols = sorted(list(entity_df.columns))
+    cols.insert(0, cols.pop(cols.index("sentence")))
+    entity_df = entity_df[cols]
 
     return entity_df
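
The rewritten `extract_entities` keeps both output shapes but builds them from `Span` objects instead of regex matches over `to_tagged_string()`. A sketch of the two modes, assuming a tagger loaded as in `recipe.py` (the entity values shown are illustrative; `flair/ner-english-fast` emits PER/ORG/LOC/MISC tags):

```python
import pandas as pd
from flair.models import SequenceTagger

from ner_utils_flair import extract_entities

tagger = SequenceTagger.load("flair/ner-english-fast")
texts = pd.Series(["Apple hired Jane Doe in Paris."])

# format=True -> one JSON object per sentence:
#   sentence                       | entities
#   Apple hired Jane Doe in Paris. | {"ORG": ["Apple"], "PER": ["Jane Doe"], "LOC": ["Paris"]}
long_df = extract_entities(texts, format=True, tagger=tagger)

# format=False -> one JSON-encoded list column per entity type, 'sentence' column first:
#   sentence | LOC | ORG | PER
wide_df = extract_entities(texts, format=False, tagger=tagger)
```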
- """ - dataset_cache = os.path.join(cache_path, cache_dir) - - parsed = urlparse(url_or_filename) - - if parsed.scheme in ("http", "https"): - # URL, so get it from the cache (downloading if necessary) - return get_from_cache(url_or_filename, dataset_cache) - elif parsed.scheme == "" and os.path.exists(url_or_filename): - # File, and it exists. - return url_or_filename - elif parsed.scheme == "": - # File, but it doesn't exist. - raise FileNotFoundError("file {} not found".format(url_or_filename)) - else: - # Something unknown - raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) - - -class CustomSequenceTagger(SequenceTagger): - @staticmethod - def load(model: str, cache_path: str): - model_file = None - aws_resource_path = "https://nlp.informatik.hu-berlin.de/resources/models" - - if model.lower() == "ner": - base_path = "/".join([aws_resource_path, "ner", "en-ner-conll03-v0.4.pt"]) - model_file = cached_path(base_path, cache_path, cache_dir="models") - - if model.lower() == "ner-ontonotes-fast": - base_path = "/".join([aws_resource_path, "ner-ontonotes-fast", "en-ner-ontonotes-fast-v0.4.pt"]) - model_file = cached_path(base_path, cache_path, cache_dir="models") - - if model_file is not None: - tagger = SequenceTagger.load(model_file) - return tagger - - -############################# -# NER function -############################# - -# Regex for matching either -PATTERN = r"({}|{})".format( - # Single-word entities - r"(?:\s*\S+ )", # ( format) - # Match multi-word entities - r"{}{}{}".format( - r"(?:\s*\S+ )", # A first tag in format - r"(?:\s*\S+ )*", # Zero or more tags in format - r"(?:\s*\S+ )", # A final tag in format - ), -) -matcher = re.compile(PATTERN) def extract_entities(text_column, format, tagger): @@ -138,23 +13,14 @@ def extract_entities(text_column, format, tagger): # Tag Sentences tagger.predict(sentences) - # Retrieve entities - if format: - entity_df = pd.DataFrame() - else: - entity_df = pd.DataFrame(columns=FLAIR_ENTITIES) - + # Extract entities + rows = [] for sentence in sentences: df_row = defaultdict(list) - entities = matcher.findall(sentence.to_tagged_string()) - # Entities are in the following format: word1 word2 ... - for entity in entities: - # Extract entity text (word1, word2, ...) 
- text = " ".join(entity.split()[::2]) - # Extract entity type (TAG) - tag = re.search(r"<.-(.+?)>", entity).group(1) + for entity in sentence.get_spans('ner'): + tag = entity.get_label("ner").value + text = entity.text df_row[tag].append(text) - if format: df_row = {"sentence": sentence.to_plain_string(), "entities": json.dumps(df_row)} else: @@ -162,9 +28,12 @@ def extract_entities(text_column, format, tagger): df_row[k] = json.dumps(v) df_row["sentence"] = sentence.to_plain_string() - entity_df = entity_df.append(df_row, ignore_index=True) + rows.append(df_row) - cols = [col for col in entity_df.columns.tolist() if col != "sentence"] - entity_df = entity_df[cols] + entity_df = pd.DataFrame(rows) + # Put 'sentence' column first + cols = sorted(list(entity_df.columns)) + cols.insert(0, cols.pop(cols.index("sentence"))) + entity_df = entity_df[cols] return entity_df diff --git a/python-lib/ner_utils_spacy.py b/python-lib/ner_utils_spacy.py index 6835264..df66247 100644 --- a/python-lib/ner_utils_spacy.py +++ b/python-lib/ner_utils_spacy.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- -import json from collections import defaultdict +import json + import pandas as pd import spacy @@ -15,14 +16,25 @@ "nb": "nb_core_news_sm", } +def get_spacy_model(language: str): + language_model = SPACY_LANGUAGE_MODELS.get(language, None) + if language_model is None: + raise ValueError(f"The language {language} is not available. \ + You can add the language & corresponding model name by editing the code.") + try: + nlp = spacy.load(language_model, exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]) + except OSError: + # Raising ValueError instead of OSError so it shows up at the top of the log + raise ValueError(f"Could not find spaCy model for the language {language}. 
diff --git a/python-runnables/named-entity-recognition-download/runnable.py b/python-runnables/named-entity-recognition-download/runnable.py
deleted file mode 100644
index 5d1e71f..0000000
--- a/python-runnables/named-entity-recognition-download/runnable.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# -*- coding: utf-8 -*-
-import logging
-from time import time
-
-import dataiku
-from dataiku.runnables import Runnable
-
-from ner_utils_flair import CustomSequenceTagger
-
-
-class MyRunnable(Runnable):
-    def __init__(self, project_key, config, plugin_config):
-        """
-        :param project_key: the project in which the runnable executes
-        :param config: the dict of the configuration of the object
-        :param plugin_config: contains the plugin settings
-        """
-        self.project_key = project_key
-        self.config = config
-        self.plugin_config = plugin_config
-        self.client = dataiku.api_client()
-
-    def get_progress_target(self):
-        """
-        If the runnable will return some progress info, have this function return a tuple of
-        (target, unit) where unit is one of: SIZE, FILES, RECORDS, NONE
-        """
-        return (100, "NONE")
-
-    def run(self, progress_callback):
-
-        # Retrieving parameters
-        output_folder_name = self.config.get("folder_name", "")
-
-        # Creating new Managed Folder if needed
-        project = self.client.get_project(self.project_key)
-        output_folder_found = False
-        for folder in project.list_managed_folders():
-            if output_folder_name == folder["name"]:
-                output_folder = project.get_managed_folder(folder["id"])
-                output_folder_found = True
-                break
-        if not output_folder_found:
-            output_folder = project.create_managed_folder(output_folder_name)
-        output_folder = dataiku.Folder(output_folder.get_definition()["id"], project_key=self.project_key)
-        if output_folder.get_info().get("type") != "Filesystem":
-            raise TypeError("Please store the model on the server filesystem")
-        output_folder_path = output_folder.get_path()
-
-        logging.info("Downloading Flair model...")
-        start = time()
-        CustomSequenceTagger.load(model="ner-ontonotes-fast", cache_path=output_folder_path)
-        result_message = "Downloading Flair model: Done in {:.2f} seconds.".format(time() - start)
-        logging.info(result_message)
-        return result_message
diff --git a/webapps/named-entity-recognition-spacy/backend.py b/webapps/named-entity-recognition-spacy/backend.py
index 7596d96..023af1f 100644
--- a/webapps/named-entity-recognition-spacy/backend.py
+++ b/webapps/named-entity-recognition-spacy/backend.py
@@ -1,9 +1,9 @@
 import json
+
 from flask import request
-import spacy
 from spacy import displacy
 
-from ner_utils_spacy import SPACY_LANGUAGE_MODELS
+from ner_utils_spacy import get_spacy_model
 
 
 @app.route("/run_NER")  # noqa
@@ -11,7 +11,7 @@ def run_NER():
     text = request.args.get("input", "")
     language = request.args.get("language", "en")
     print("Processing text '{}' in language '{}'...".format(text, language))
-    nlp = spacy.load(SPACY_LANGUAGE_MODELS[language])
+    nlp = get_spacy_model(language=language)
     doc = nlp(text)
     html = displacy.render(doc, style="ent", page=False)
     return json.dumps(html)
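
With this change the webapp goes through the same `get_spacy_model` loader as the recipe, so its supported languages and failure modes match. A sketch of how the endpoint is exercised (the backend URL is deployment-specific, placeholder below; the real frontend calls it from JavaScript):

```python
import json

import requests

resp = requests.get(
    "http://<webapp-backend>/run_NER",  # placeholder URL, resolved by Dataiku at runtime
    params={"input": "Dataiku was founded in Paris.", "language": "en"},
)
html = json.loads(resp.text)  # displacy entity markup, ready to inject into the page
```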