Increase versions, speed & update model downloading (#10)

Co-authored-by: Nicolas Dalsass <nicolasdalsass@users.noreply.github.com> - Move FLAIR to resources_init & make non-en spaCy models optional - Upgrade to spaCy 3 & increment plugin version - Disable unused pipeline algos - divides recipe time roughly by two - Allow multi-cpu processing - on a 8 core machine, divives recipe time roughly by 3
dataiku · Jul 27, 2022 · 99cdf91 · 99cdf91
1 parent 6fb143e
commit 99cdf91
Show file tree

Hide file tree

Showing 11 changed files with 84 additions and 268 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,8 @@
 # Changelog
 
+## Version 2.0.0 - Feature release - 2022-06-13
+- Upgrade Flair, spaCy and model downloading functionality
+
 ## Version 1.3.4 - Feature release - 2022-01-04
 - Add Japanese support
 

diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt
@@ -1,14 +1,16 @@
-torch==1.6.0
-flair==0.6.1
+flair==0.11.3
+flask>=2.0,<2.1
 gensim==3.8.0
-flask>=1.0,<1.1
+numpy==1.19.5
+spacy[ja]==3.3.0
+tokenizers==0.10.3; python_version == '3.6'
+sudachipy==0.6.0; python_version == '3.6'
 tqdm==4.50.0
-spacy[ja]==2.3.2
-https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
-https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.3.1/es_core_news_sm-2.3.1.tar.gz
-https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-2.3.1/zh_core_web_sm-2.3.1.tar.gz
-https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-2.3.0/pl_core_news_sm-2.3.0.tar.gz
-https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-2.3.0/nb_core_news_sm-2.3.0.tar.gz
-https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz
-https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.3.0/de_core_news_sm-2.3.0.tar.gz
-https://github.com/explosion/spacy-models/releases/download/ja_core_news_sm-2.3.0/ja_core_news_sm-2.3.0.tar.gz
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.3.0/zh_core_web_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-3.3.0/pl_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-3.3.0/nb_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.3.0/fr_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.3.0/de_core_news_sm-3.3.0.tar.gz
+# https://github.com/explosion/spacy-models/releases/download/ja_core_news_sm-3.3.0/ja_core_news_sm-3.3.0.tar.gz
diff --git a/code-env/python/spec/resources_init.py b/code-env/python/spec/resources_init.py
@@ -0,0 +1,19 @@
+######################## Base imports #################################
+from dataiku.code_env_resources import clear_all_env_vars
+from dataiku.code_env_resources import set_env_path
+
+######################## Download FLAIR Models ###########################
+# Clear all environment variables defined by a previously run script
+clear_all_env_vars()
+
+# Set Flair cache directory
+set_env_path("FLAIR_CACHE_ROOT", "flair")
+
+from flair.models import SequenceTagger
+
+# Download pretrained model: automatically managed by Flair,
+# does not download anything if model is already in FLAIR_CACHE_ROOT
+SequenceTagger.load('flair/ner-english-fast')
+# Add any other models you want to download, check https://huggingface.co/flair for examples
+# E.g. SequenceTagger.load('flair/ner-french')
+# Make sure to modify the model used in recipe.py if you want to use a different model
diff --git a/custom-recipes/named-entity-recognition-extract/recipe.json b/custom-recipes/named-entity-recognition-extract/recipe.json
@@ -14,16 +14,6 @@
             "arity": "UNARY",
             "required": true,
             "acceptsDataset": true
-        },
-        {
-            "name": "model_folder",
-            "label": "Flair model (optional)",
-            "description": "Folder containing Flair model weights",
-            "arity": "UNARY",
-            "required": false,
-            "acceptsManagedFolder": true,
-            "acceptsDataset": false,
-            "mustBeStrictlyType": "Filesystem"
         }
     ],
     "outputRoles": [
@@ -124,7 +114,7 @@
             "name": "ner_model",
             "label": "Model",
             "type": "SELECT",
-            "description": "spaCy (multi-lingual, faster) or Flair (English only, slower)",
+            "description": "spaCy (faster) or Flair (slower)",
             "selectChoices": [
                 {
                     "value": "spacy",

diff --git a/custom-recipes/named-entity-recognition-extract/recipe.py b/custom-recipes/named-entity-recognition-extract/recipe.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import multiprocessing
+
 import dataiku
 from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
 
@@ -37,16 +39,10 @@
 
     language = recipe_config.get("text_language_spacy", "en")
 else:
-    from ner_utils_flair import extract_entities, CustomSequenceTagger
-
-    try:
-        model_folder = get_input_names_for_role("model_folder")[0]
-    except IndexError:
-        raise Exception(
-            "To use Flair, download the model using the macro and add the resulting folder as input to the recipe."
-        )
-    folder_path = dataiku.Folder(model_folder).get_path()
-    tagger = CustomSequenceTagger.load("ner-ontonotes-fast", folder_path)
+    from flair.models import SequenceTagger
+    from ner_utils_flair import extract_entities
+
+    tagger = SequenceTagger.load("flair/ner-english-fast")
 
 #############################
 # Main Loop
@@ -63,7 +59,11 @@ def compute_entities_df(df):
     out_df = df.merge(out_df, left_index=True, right_index=True)
     return out_df
 
+if ner_model == "spacy":
+    chunksize = 200 * multiprocessing.cpu_count()
+else:
+    chunksize = 100
 
 process_dataset_chunks(
-    input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=100
+    input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=chunksize
 )
diff --git a/plugin.json b/plugin.json
@@ -1,6 +1,6 @@
 {
     "id": "named-entity-recognition",
-    "version": "1.3.4",
+    "version": "2.0.0",
     "meta": {
         "label": "Named Entity Recognition",
         "category": "Natural Language Processing",

diff --git a/python-lib/ner_utils_flair.py b/python-lib/ner_utils_flair.py
@@ -1,134 +1,9 @@
 # -*- coding: utf-8 -*-
-import os
-import logging
-import re
-import json
-import requests
-from urllib.parse import urlparse
 from collections import defaultdict
+import json
 
 from flair.data import Sentence
-from flair.models.sequence_tagger_model import SequenceTagger
 import pandas as pd
-from tqdm import tqdm
-
-FLAIR_ENTITIES = [
-    "PERSON",
-    "NORP",
-    "FAC",
-    "ORG",
-    "GPE",
-    "LOC",
-    "PRODUCT",
-    "EVENT",
-    "WORK_OF_ART",
-    "LAW",
-    "LANGUAGE",
-    "DATE",
-    "TIME",
-    "PERCENT",
-    "MONEY",
-    "QUANTITY",
-    "ORDINAL",
-    "CARDINAL",
-]
-
-
-def get_from_cache(url: str, cache_dir: str = None) -> str:
-    """
-    Given a URL, look for the corresponding dataset in the local cache.
-    If it's not there, download it. Then return the path to the cached file.
-    """
-    os.makedirs(cache_dir, exist_ok=True)
-
-    filename = re.sub(r".+/", "", url)
-    # get cache path to put the file
-    cache_path = os.path.join(cache_dir, filename)
-    if os.path.exists(cache_path):
-        logging.info("File {} found in cache".format(filename))
-        return cache_path
-
-    # make HEAD request to check ETag
-    response = requests.head(url)
-    if response.status_code != 200:
-        raise IOError("HEAD request failed for url {}".format(url))
-
-    if not os.path.exists(cache_path):
-        logging.info("File {} not found in cache, downloading from URL {}...".format(filename, url))
-        req = requests.get(url, stream=True)
-        content_length = req.headers.get("Content-Length")
-        total = int(content_length) if content_length is not None else None
-        progress = tqdm(unit="B", total=total)
-        with open(cache_path, "wb") as temp_file:
-            for chunk in req.iter_content(chunk_size=1024):
-                if chunk:  # filter out keep-alive new chunks
-                    progress.update(len(chunk))
-                    temp_file.write(chunk)
-        progress.close()
-
-    return cache_path
-
-
-def cached_path(url_or_filename: str, cache_path: str, cache_dir: str) -> str:
-    """
-    Given something that might be a URL (or might be a local path),
-    determine which. If it's a URL, download the file and cache it, and
-    return the path to the cached file. If it's already a local path,
-    make sure the file exists and then return the path.
-    """
-    dataset_cache = os.path.join(cache_path, cache_dir)
-
-    parsed = urlparse(url_or_filename)
-
-    if parsed.scheme in ("http", "https"):
-        # URL, so get it from the cache (downloading if necessary)
-        return get_from_cache(url_or_filename, dataset_cache)
-    elif parsed.scheme == "" and os.path.exists(url_or_filename):
-        # File, and it exists.
-        return url_or_filename
-    elif parsed.scheme == "":
-        # File, but it doesn't exist.
-        raise FileNotFoundError("file {} not found".format(url_or_filename))
-    else:
-        # Something unknown
-        raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
-
-
-class CustomSequenceTagger(SequenceTagger):
-    @staticmethod
-    def load(model: str, cache_path: str):
-        model_file = None
-        aws_resource_path = "https://nlp.informatik.hu-berlin.de/resources/models"
-
-        if model.lower() == "ner":
-            base_path = "/".join([aws_resource_path, "ner", "en-ner-conll03-v0.4.pt"])
-            model_file = cached_path(base_path, cache_path, cache_dir="models")
-
-        if model.lower() == "ner-ontonotes-fast":
-            base_path = "/".join([aws_resource_path, "ner-ontonotes-fast", "en-ner-ontonotes-fast-v0.4.pt"])
-            model_file = cached_path(base_path, cache_path, cache_dir="models")
-
-        if model_file is not None:
-            tagger = SequenceTagger.load(model_file)
-            return tagger
-
-
-#############################
-# NER function
-#############################
-
-# Regex for matching either
-PATTERN = r"({}|{})".format(
-    # Single-word entities
-    r"(?:\s*\S+ <S-[A-Z_]*>)",  # (<S-TAG> format)
-    # Match multi-word entities
-    r"{}{}{}".format(
-        r"(?:\s*\S+ <B-[A-Z_]*>)",  # A first tag in <B-TAG> format
-        r"(?:\s*\S+ <I-[A-Z_]*>)*",  # Zero or more tags in <I-TAG> format
-        r"(?:\s*\S+ <E-[A-Z_]*>)",  # A final tag in <E-TAG> format
-    ),
-)
-matcher = re.compile(PATTERN)
 
 
 def extract_entities(text_column, format, tagger):
@@ -138,33 +13,27 @@ def extract_entities(text_column, format, tagger):
     # Tag Sentences
     tagger.predict(sentences)
 
-    # Retrieve entities
-    if format:
-        entity_df = pd.DataFrame()
-    else:
-        entity_df = pd.DataFrame(columns=FLAIR_ENTITIES)
-
+    # Extract entities
+    rows = []
     for sentence in sentences:
         df_row = defaultdict(list)
-        entities = matcher.findall(sentence.to_tagged_string())
-        # Entities are in the following format: word1 <X-TAG> word2 <X-TAG> ...
-        for entity in entities:
-            # Extract entity text (word1, word2, ...)
-            text = " ".join(entity.split()[::2])
-            # Extract entity type (TAG)
-            tag = re.search(r"<.-(.+?)>", entity).group(1)
+        for entity in sentence.get_spans('ner'):
+            tag = entity.get_label("ner").value
+            text = entity.text
             df_row[tag].append(text)
-
         if format:
             df_row = {"sentence": sentence.to_plain_string(), "entities": json.dumps(df_row)}
         else:
             for k, v in df_row.items():
                 df_row[k] = json.dumps(v)
             df_row["sentence"] = sentence.to_plain_string()
 
-        entity_df = entity_df.append(df_row, ignore_index=True)
+        rows.append(df_row)
 
-    cols = [col for col in entity_df.columns.tolist() if col != "sentence"]
-    entity_df = entity_df[cols]
+    entity_df = pd.DataFrame(rows)
 
+    # Put 'sentence' column first
+    cols = sorted(list(entity_df.columns))
+    cols.insert(0, cols.pop(cols.index("sentence")))
+    entity_df = entity_df[cols]
     return entity_df
diff --git a/python-lib/ner_utils_spacy.py b/python-lib/ner_utils_spacy.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
-import json
 from collections import defaultdict
+import json
+
 import pandas as pd
 import spacy
 
@@ -15,14 +16,25 @@
     "nb": "nb_core_news_sm",
 }
 
+def get_spacy_model(language: str):
+    language_model = SPACY_LANGUAGE_MODELS.get(language, None)
+    if language_model is None:
+        raise ValueError(f"The language {language} is not available. \
+                        You can add the language & corresponding model name by editing the code.")
+    try:
+        nlp = spacy.load(language_model, exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
+    except OSError:
+        # Raising ValueError instead of OSError so it shows up at the top of the log
+        raise ValueError(f"Could not find spaCy model for the language {language}. \
+                        Maybe you need to edit the requirements.txt file to enable it.")
+    return nlp
 
 def extract_entities(text_column, format: bool, language: str):
     # Tag sentences
-    nlp = spacy.load(SPACY_LANGUAGE_MODELS[language])
-    docs = nlp.pipe(text_column.values)
-
+    nlp = get_spacy_model(language=language)
+    docs = nlp.pipe(text_column.values, n_process=-1, batch_size=100)
     # Extract entities
-    entity_df = pd.DataFrame()
+    rows = []
     for doc in docs:
         df_row = defaultdict(list)
         for entity in doc.ents:
@@ -35,11 +47,12 @@ def extract_entities(text_column, format: bool, language: str):
                 df_row[k] = json.dumps(v)
             df_row["sentence"] = doc.text
 
-        entity_df = entity_df.append(df_row, ignore_index=True)
+        rows.append(df_row)
+
+    entity_df = pd.DataFrame(rows)
 
     # Put 'sentence' column first
     cols = sorted(list(entity_df.columns))
     cols.insert(0, cols.pop(cols.index("sentence")))
     entity_df = entity_df[cols]
-
     return entity_df
diff --git a/python-runnables/named-entity-recognition-download/runnable.json b/python-runnables/named-entity-recognition-download/runnable.json