Skip to content

Commit

Permalink
Merge pull request #578 from axsauze/sklearn_spacy_text_example
Browse files Browse the repository at this point in the history
Example using Seldon for text classification with SpaCy tokenizer
  • Loading branch information
axsaucedo authored May 21, 2019
2 parents be9e532 + 22f595c commit c62dbfa
Show file tree
Hide file tree
Showing 6 changed files with 1,219 additions and 0 deletions.
23 changes: 23 additions & 0 deletions examples/models/sklearn_spacy_text/RedditClassifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import dill

from ml_utils import CleanTextTransformer, SpacyTokenTransformer

class RedditClassifier(object):
    """Seldon model wrapper for Reddit comment classification.

    Pipeline: clean raw text -> spaCy tokenization -> TF-IDF features ->
    logistic-regression probabilities. The fitted vectorizer and classifier
    are deserialized from dill files sitting next to this module.
    """

    def __init__(self):
        # Stateless preprocessing steps (no fitted state to load).
        self._clean_text_transformer = CleanTextTransformer()
        self._spacy_tokenizer = SpacyTokenTransformer()
        # Fitted artifacts serialized with dill at training time.
        self._tfidf_vectorizer = self._load_artifact('tfidf_vectorizer.model')
        self._lr_model = self._load_artifact('lr.model')

    @staticmethod
    def _load_artifact(path):
        """Deserialize one dill-pickled artifact from *path*."""
        with open(path, 'rb') as model_file:
            return dill.load(model_file)

    def predict(self, X, feature_names):
        """Run the full pipeline on raw texts X.

        Seldon entry point; *feature_names* is accepted per the Seldon
        contract but unused. Returns per-class probabilities from the
        logistic-regression model.
        """
        stage = self._clean_text_transformer.transform(X)
        stage = self._spacy_tokenizer.transform(stage)
        stage = self._tfidf_vectorizer.transform(stage)
        return self._lr_model.predict_proba(stage)

Empty file.
75 changes: 75 additions & 0 deletions examples/models/sklearn_spacy_text/ml_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Standard library
import html
import os
import re
import sys
from html.parser import HTMLParser

# Third-party
import dill
import numpy as np
import spacy
from sklearn.base import TransformerMixin
from spacy.cli import download

# Ensure the small English spaCy model is available (downloads it on first
# run), then load it once at import time so every transformer call reuses
# the same pipeline. Parser and entity recognizer are disabled at load time;
# only tokenization/lemmatization is needed downstream.
download("en_core_web_sm")
nlp = spacy.load('en_core_web_sm', parser=False, entity=False)

class SpacyTokenTransformer(TransformerMixin):
    """Sklearn-style transformer mapping each raw text to a list of
    lower-cased spaCy lemmas, with URL tokens collapsed to the literal
    string "URL".
    """

    # Punctuation tokens that carry no signal for classification.
    __symbols = set("!$%^&*()_+|~-=`{}[]:\";'<>?,./-")

    def transform(self, X, **kwargs):
        """Tokenize every element of X; returns an object array of token lists."""
        tokenize = np.vectorize(
            SpacyTokenTransformer.transform_to_token, otypes=[object])
        return tokenize(X)

    def fit(self, X, y=None, **fit_params):
        """No fitted state; returns self unchanged."""
        return self

    @staticmethod
    def transform_to_token(text):
        """Convert one text into a filtered list of lemma tokens."""
        doc = nlp(str(text), disable=['parser', 'tagger', 'ner'])
        tokens = []
        for token in doc:
            if token.like_url:
                # URLs are normalized to a single placeholder token.
                tokens.append("URL")
                continue
            lemma = token.lemma_.lower().strip()
            # Drop empty lemmas and bare punctuation.
            if lemma and lemma not in SpacyTokenTransformer.__symbols:
                tokens.append(lemma)
        return tokens

class CleanTextTransformer(TransformerMixin):
    """Sklearn-style transformer that repairs mojibake, unescapes HTML
    entities, and normalizes markup artifacts in raw text.
    """

    # Matches spreadsheet-style unicode escapes such as "<U+00A0>".
    __uplus_pattern = re.compile(r"\<[uU]\+(?P<digit>[a-zA-Z0-9]+)\>")
    # Matches markdown links "[text](url)"; both parts are kept as plain text.
    __markup_link_pattern = re.compile(r"\[(.*)\]\((.*)\)")

    def transform(self, X, **kwargs):
        """Clean every element of X; returns an object array of strings.

        otypes=[object] stops np.vectorize from probing the first element
        to infer the output type, so an empty X no longer raises ValueError.
        """
        f = np.vectorize(
            CleanTextTransformer.transform_clean_text, otypes=[object])
        return f(X)

    def fit(self, X, y=None, **fit_params):
        """No fitted state; returns self unchanged."""
        return self

    @staticmethod
    def transform_clean_text(raw_text):
        """Return *raw_text* with encoding damage and markup cleaned up."""
        # Undo the common latin-1/utf-8 mojibake round trip; fall back to
        # cp1252 for genuinely Windows-encoded text. UnicodeError (instead
        # of the original bare except) so unrelated bugs still surface.
        try:
            decoded = raw_text.encode("ISO-8859-1").decode("utf-8")
        except UnicodeError:
            decoded = raw_text.encode("ISO-8859-1").decode("cp1252")
        # html.unescape replaces HTMLParser().unescape, which was
        # deprecated and then removed in Python 3.9.
        cleaned = html.unescape(decoded)
        cleaned = re.sub(r"\r\n", " ", cleaned)
        cleaned = re.sub(r"\r\r\n", " ", cleaned)
        cleaned = re.sub(r"\r", " ", cleaned)
        # These hit only double-escaped entities ("&amp;gt;") that the
        # unescape pass above reduced to a literal "&gt;"/"&lt;".
        cleaned = cleaned.replace("&gt;", " > ")
        cleaned = cleaned.replace("&lt;", " < ")
        cleaned = cleaned.replace("--", " - ")
        cleaned = CleanTextTransformer.__uplus_pattern.sub(
            r" U\g<digit> ", cleaned)
        # Raw string is the bug fix: the original " \1 \2 " was parsed by
        # Python as the control characters \x01 \x02, so the substitution
        # silently discarded the link text and URL instead of keeping them.
        cleaned = CleanTextTransformer.__markup_link_pattern.sub(
            r" \1 \2 ", cleaned)
        return cleaned.replace("\\", "")

53 changes: 53 additions & 0 deletions examples/models/sklearn_spacy_text/reddit_clf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"apiVersion": "machinelearning.seldon.io/v1alpha2",
"kind": "SeldonDeployment",
"metadata": {
"labels": {
"app": "seldon"
},
"name": "reddit-classifier"
},
"spec": {
"annotations": {
"project_name": "Reddit classifier",
"deployment_version": "v1"
},
"name": "reddit-classifier",
"oauth_key": "oauth-key",
"oauth_secret": "oauth-secret",
"predictors": [
{
"componentSpecs": [{
"spec": {
"containers": [
{
"image": "reddit-classifier:0.1",
"imagePullPolicy": "IfNotPresent",
"name": "classifier",
"resources": {
"requests": {
"memory": "1Mi"
}
}
}
],
"terminationGracePeriodSeconds": 20
}
}],
"graph": {
"children": [],
"name": "classifier",
"endpoint": {
"type" : "REST"
},
"type": "MODEL"
},
"name": "single-model",
"replicas": 1,
"annotations": {
"predictor_version" : "v1"
}
}
]
}
}
4 changes: 4 additions & 0 deletions examples/models/sklearn_spacy_text/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
scipy>=0.13.3
scikit-learn>=0.18
spacy==2.0.18
dill==0.2.9
Loading

0 comments on commit c62dbfa

Please sign in to comment.