From 49e1607630a78880f32c4254f5396128da1c7834 Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 29 Oct 2021 11:39:29 +0100 Subject: [PATCH 1/7] Modify tox --- setup.py | 65 ++++++++++++++++++++++++++++++++++---------------------- tox.ini | 2 +- 2 files changed, 41 insertions(+), 26 deletions(-) diff --git a/setup.py b/setup.py index d1b7ec88..a7fd87d2 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,45 @@ with open('README.md', 'r') as f: long_description = f.read() +extras = { + 'umap': [ + 'umap-learn' + ], + 'gensim': [ + 'gensim<=4.0.0' + ], + 'sklearn': [ + 'scikit-learn' + ], + 'transformers': [ + 'transformers', + 'tokenizers==0.10.1' + ], + 'tensorflow': [ + 'tensorflow==2.4.0', + 'tensorflow-addons', + 'numpy>=1.19.2,<1.20' + ], + 'torch': [ + 'torch' + ], + 'spacy': [ + 'spacy[lookups]==3.0.6' + ], + # All visualisation libraries + 'vis': [ + 'bokeh', + 'pandas' + ], + # All evaluation libraries + 'evaluate': [ + 'nervaluate' + ] +} + +# Allow users to install 'all' if they wish +extras['all'] = [dep for dep_list in extras.values() for dep in dep_list] + setuptools.setup( name=about['__name__'], version=about['__version__'], @@ -42,38 +81,14 @@ 'Operating System :: OS Independent', ], install_requires=[ - 'numpy>=1.19.2,<1.20', - 'pandas', 'boto3', - 'scikit-learn', - 'scipy==1.4.1', - 'click>=7.0,<8.0', - 'umap-learn', - 'nervaluate', 'twine', - 'gensim<=4.0.0', 'cython', 'flake8', 'black', - 'transformers', - 'tokenizers==0.10.1', 'tqdm' ], - extras_require={ - 'tensorflow': [ - 'tensorflow==2.4.0', - 'tensorflow-addons' - ], - 'torch': [ - 'torch' - ], - 'spacy': [ - 'spacy[lookups]==3.0.6' - ], - 'vis': [ - 'bokeh' - ] - }, + extras_require=extras, tests_require=[ 'pytest', 'pytest-cov', diff --git a/tox.ini b/tox.ini index bb6fe280..273db192 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ envlist = py37, py38 [testenv] deps = -r requirements_test.txt - .[spacy,torch,tensorflow,vis] + .[all] commands = python -m spacy download en_core_web_sm pytest -m '{env:TEST_SUITE:}' -s -v --durations=0 --disable-warnings --tb=line --cov=wellcomeml ./tests From b00b6e6a6755d19760c595b6bef0a4ea8c4cc0f5 Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 29 Oct 2021 11:55:40 +0100 Subject: [PATCH 2/7] Pin click --- setup.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/setup.py b/setup.py index a7fd87d2..dc537de5 100644 --- a/setup.py +++ b/setup.py @@ -27,14 +27,14 @@ long_description = f.read() extras = { - 'umap': [ - 'umap-learn' - ], - 'gensim': [ - 'gensim<=4.0.0' - ], - 'sklearn': [ - 'scikit-learn' + 'core': [ + 'scikit-learn', + 'scipy', + 'umap-learn', + 'gensim<=4.0.0', + 'bokeh', + 'pandas', + 'nervaluate' ], 'transformers': [ 'transformers', @@ -49,17 +49,9 @@ 'torch' ], 'spacy': [ - 'spacy[lookups]==3.0.6' + 'spacy[lookups]==3.0.6', + 'click>=7.0,<8.0' ], - # All visualisation libraries - 'vis': [ - 'bokeh', - 'pandas' - ], - # All evaluation libraries - 'evaluate': [ - 'nervaluate' - ] } # Allow users to install 'all' if they wish From c927c5d91b13fa932b53216f38fa85645a92780e Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 29 Oct 2021 12:43:58 +0100 Subject: [PATCH 3/7] Update all extra messages --- wellcomeml/ml/attention.py | 2 +- wellcomeml/ml/bert_classifier.py | 12 +++++------ wellcomeml/ml/bert_semantic_equivalence.py | 20 ++++++++++--------- wellcomeml/ml/bert_vectorizer.py | 11 +++++++---- wellcomeml/ml/bilstm.py | 14 ++++++------- wellcomeml/ml/clustering.py | 23 +++++++++++++++------- wellcomeml/ml/cnn.py | 14 +++++++------ wellcomeml/ml/doc2vec_vectorizer.py | 15 +++++++++++--- wellcomeml/ml/frequency_vectorizer.py | 9 ++++++--- wellcomeml/ml/keras_utils.py | 7 ++++--- wellcomeml/ml/keras_vectorizer.py | 12 ++++++----- wellcomeml/ml/sent2vec_vectorizer.py | 12 +++++++++-- wellcomeml/ml/similarity_entity_linking.py | 18 ++++++++++++----- wellcomeml/ml/spacy_classifier.py | 22 ++++++++++----------- wellcomeml/ml/spacy_entity_linking.py | 5 ++++- wellcomeml/ml/spacy_knowledge_base.py | 5 ++++- wellcomeml/ml/spacy_ner.py | 8 +++++--- wellcomeml/ml/transformers_tokenizer.py | 21 ++++++++++++++------ wellcomeml/ml/vectorizer.py | 11 ++++++++++- wellcomeml/ml/voting_classifier.py | 16 +++++++++++---- wellcomeml/spacy/spacy_doc_to_prodigy.py | 5 ++++- wellcomeml/utils.py | 8 ++++---- 22 files changed, 176 insertions(+), 94 deletions(-) diff --git a/wellcomeml/ml/attention.py b/wellcomeml/ml/attention.py index 52017018..4d860bcd 100644 --- a/wellcomeml/ml/attention.py +++ b/wellcomeml/ml/attention.py @@ -3,7 +3,7 @@ try: import tensorflow as tf except ImportError as e: - throw_extra_import_message(error=e, required_module='tensorflow', extra='tensorflow') + throw_extra_import_message(error=e, required_modules='tensorflow', extras='tensorflow') class SelfAttention(tf.keras.layers.Layer): diff --git a/wellcomeml/ml/bert_classifier.py b/wellcomeml/ml/bert_classifier.py index 21a0907d..5072258d 100644 --- a/wellcomeml/ml/bert_classifier.py +++ b/wellcomeml/ml/bert_classifier.py @@ -6,17 +6,17 @@ import math import os -from transformers import BertTokenizer, TFBertForSequenceClassification -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.model_selection import train_test_split -from sklearn.metrics import f1_score - from wellcomeml.utils import throw_extra_import_message try: + from transformers import BertTokenizer, TFBertForSequenceClassification + from sklearn.base import BaseEstimator, TransformerMixin + from sklearn.model_selection import train_test_split + from sklearn.metrics import f1_score import tensorflow as tf except ImportError as e: - throw_extra_import_message(error=e, required_module='tensorflow', extra='tensorflow') + throw_extra_import_message(error=e, required_modules='tensorflow,sklearn', + extras='tensorflow,sklearn,transformers') PRETRAINED_CONFIG = { diff --git a/wellcomeml/ml/bert_semantic_equivalence.py b/wellcomeml/ml/bert_semantic_equivalence.py index 0896b6f4..b97d53c3 100644 --- a/wellcomeml/ml/bert_semantic_equivalence.py +++ b/wellcomeml/ml/bert_semantic_equivalence.py @@ -3,22 +3,24 @@ import math import os -from transformers import BertConfig, BertTokenizer, \ - TFBertForSequenceClassification - -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.model_selection import train_test_split -from sklearn.utils.validation import check_is_fitted -from sklearn.exceptions import NotFittedError +from wellcomeml.utils import throw_extra_import_message from wellcomeml.ml.keras_utils import CategoricalMetrics # , MetricMiniBatchHistory from wellcomeml.logger import LOGGING_LEVEL, build_logger -from wellcomeml.utils import throw_extra_import_message + try: import tensorflow as tf + from transformers import BertConfig, BertTokenizer, \ + TFBertForSequenceClassification + + from sklearn.base import BaseEstimator, TransformerMixin + from sklearn.model_selection import train_test_split + from sklearn.utils.validation import check_is_fitted + from sklearn.exceptions import NotFittedError except ImportError as e: - throw_extra_import_message(error=e, required_module='tensorflow', extra='tensorflow') + throw_extra_import_message(error=e, required_modules='tensorflow,sklearn', + extras='core,tensorflow,transformers') class SemanticEquivalenceClassifier(BaseEstimator, TransformerMixin): diff --git a/wellcomeml/ml/bert_vectorizer.py b/wellcomeml/ml/bert_vectorizer.py index 5af96932..64b75890 100644 --- a/wellcomeml/ml/bert_vectorizer.py +++ b/wellcomeml/ml/bert_vectorizer.py @@ -4,17 +4,20 @@ """ import logging -from transformers import BertModel, BertTokenizer -from sklearn.base import BaseEstimator, TransformerMixin -import numpy as np import tqdm from wellcomeml.utils import check_cache_and_download, throw_extra_import_message +required_modules = "torch,sklearn,numpy" +required_extras = "torch,transformers,sklearn" try: + from transformers import BertModel, BertTokenizer + from sklearn.base import BaseEstimator, TransformerMixin + import numpy as np import torch except ImportError as e: - throw_extra_import_message(error=e, extra="torch", required_module="torch") + throw_extra_import_message(error=e, required_modules=required_modules, + extras=required_extras) logger = logging.getLogger(__name__) diff --git a/wellcomeml/ml/bilstm.py b/wellcomeml/ml/bilstm.py index cb9be420..2aaaa94e 100644 --- a/wellcomeml/ml/bilstm.py +++ b/wellcomeml/ml/bilstm.py @@ -1,19 +1,19 @@ from datetime import datetime import math -from sklearn.model_selection import train_test_split -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.metrics import f1_score -from scipy.sparse import csr_matrix, vstack -import numpy as np - from wellcomeml.ml.attention import HierarchicalAttention from wellcomeml.utils import throw_extra_import_message try: import tensorflow as tf + from sklearn.model_selection import train_test_split + from sklearn.base import BaseEstimator, ClassifierMixin + from sklearn.metrics import f1_score + from scipy.sparse import csr_matrix, vstack + import numpy as np except ImportError as e: - throw_extra_import_message(error=e, required_module='tensorflow', extra='tensorflow') + throw_extra_import_message(error=e, required_modules='tensorflow,scipy,numpy', + extras='tensorflow,core') TENSORBOARD_LOG_DIR = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S") CALLBACK_DICT = { diff --git a/wellcomeml/ml/clustering.py b/wellcomeml/ml/clustering.py index 0e981302..ebb9f79b 100644 --- a/wellcomeml/ml/clustering.py +++ b/wellcomeml/ml/clustering.py @@ -5,14 +5,23 @@ from wellcomeml.ml import vectorizer from wellcomeml.logger import logger +from wellcomeml.utils import throw_extra_import_message + +required_modules = 'tensorflow,sklearn,numpy' +extras = 'tensorflow,core' + +try: + import numpy as np + from sklearn.base import ClusterMixin + from sklearn.cluster import DBSCAN, OPTICS, KMeans + from sklearn.manifold import TSNE + from sklearn.model_selection import GridSearchCV + from sklearn.metrics import silhouette_score + from sklearn.pipeline import Pipeline +except ImportError as e: + throw_extra_import_message(error=e, required_modules=required_modules, + extras=extras) -import numpy as np -from sklearn.base import ClusterMixin -from sklearn.cluster import DBSCAN, OPTICS, KMeans -from sklearn.manifold import TSNE -from sklearn.model_selection import GridSearchCV -from sklearn.metrics import silhouette_score -from sklearn.pipeline import Pipeline try: from hdbscan import HDBSCAN diff --git a/wellcomeml/ml/cnn.py b/wellcomeml/ml/cnn.py index d16b6de5..c8a28ade 100644 --- a/wellcomeml/ml/cnn.py +++ b/wellcomeml/ml/cnn.py @@ -18,19 +18,21 @@ import logging import math -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.metrics import f1_score -from scipy.sparse import csr_matrix, vstack -import numpy as np - from wellcomeml.ml.attention import HierarchicalAttention from wellcomeml.utils import throw_extra_import_message +required_modules = 'tensorflow,sklearn,numpy,scipy' +required_extras = 'core,tensorflow' + try: import tensorflow_addons as tfa import tensorflow as tf + from sklearn.base import BaseEstimator, ClassifierMixin + from sklearn.metrics import f1_score + from scipy.sparse import csr_matrix, vstack + import numpy as np except ImportError as e: - throw_extra_import_message(error=e, required_module='tensorflow', extra='tensorflow') + throw_extra_import_message(error=e, required_modules=required_modules, extras=required_extras) logger = logging.getLogger(__name__) diff --git a/wellcomeml/ml/doc2vec_vectorizer.py b/wellcomeml/ml/doc2vec_vectorizer.py index 1a39b70c..b952262f 100644 --- a/wellcomeml/ml/doc2vec_vectorizer.py +++ b/wellcomeml/ml/doc2vec_vectorizer.py @@ -4,9 +4,18 @@ import statistics import logging -from sklearn.base import BaseEstimator, TransformerMixin -from gensim.models.doc2vec import Doc2Vec, TaggedDocument -import numpy as np +from wellcomeml.utils import throw_extra_import_message + +required_modules = 'sklearn,gensim,numpy' +required_extras = 'core' + +try: + from sklearn.base import BaseEstimator, TransformerMixin + from gensim.models.doc2vec import Doc2Vec, TaggedDocument + import numpy as np +except ImportError as e: + throw_extra_import_message(error=e, required_modules=required_modules, extras=required_extras) + logging.getLogger("gensim").setLevel(logging.WARNING) diff --git a/wellcomeml/ml/frequency_vectorizer.py b/wellcomeml/ml/frequency_vectorizer.py index 89bc2c2e..34f8c07c 100644 --- a/wellcomeml/ml/frequency_vectorizer.py +++ b/wellcomeml/ml/frequency_vectorizer.py @@ -7,15 +7,18 @@ import logging import re -from scipy import sparse -from sklearn.feature_extraction.text import TfidfVectorizer from wellcomeml.utils import throw_extra_import_message # Heavy dependencies go here +required_modules = 'spacy,sklearn,scipy' +required_extras = 'spacy,core' + try: import spacy + from scipy import sparse + from sklearn.feature_extraction.text import TfidfVectorizer except ImportError as e: - throw_extra_import_message(error=e, required_module='spacy', extra='spacy') + throw_extra_import_message(error=e, required_modules='spacy', extras='spacy') logger = logging.getLogger(__name__) diff --git a/wellcomeml/ml/keras_utils.py b/wellcomeml/ml/keras_utils.py index 32a88dc0..58887bab 100644 --- a/wellcomeml/ml/keras_utils.py +++ b/wellcomeml/ml/keras_utils.py @@ -1,14 +1,15 @@ import csv from collections import defaultdict -from sklearn.metrics import f1_score, precision_score, recall_score - from wellcomeml.utils import throw_extra_import_message +require_modules = 'sklearn,tensorflow' +required_extras = 'core,tensorflow' try: + from sklearn.metrics import f1_score, precision_score, recall_score import tensorflow as tf except ImportError as e: - throw_extra_import_message(error=e, required_module='tensorflow', extra='tensorflow') + throw_extra_import_message(error=e, required_modules=require_modules, extras=required_extras) class Metrics(tf.keras.callbacks.Callback): diff --git a/wellcomeml/ml/keras_vectorizer.py b/wellcomeml/ml/keras_vectorizer.py index 453f1da9..7133fbef 100644 --- a/wellcomeml/ml/keras_vectorizer.py +++ b/wellcomeml/ml/keras_vectorizer.py @@ -2,21 +2,23 @@ Implements KerasTokenizer that abstracts Keras tokenisation and encoding and KerasVectorizer that is sklearn compatible """ -from sklearn.base import BaseEstimator, TransformerMixin -import numpy as np -import gensim.downloader as api - from os import path import logging from wellcomeml.ml.transformers_tokenizer import TransformersTokenizer from wellcomeml.utils import throw_extra_import_message +required_modules = 'tensorflow,sklearn,numpy,gensim' +required_extras = 'tensorflow,core' + try: + from sklearn.base import BaseEstimator, TransformerMixin + import numpy as np + import gensim.downloader as api from tensorflow.keras.preprocessing.text import Tokenizer from tensorflow.keras.preprocessing.sequence import pad_sequences except ImportError as e: - throw_extra_import_message(error=e, required_module='tensorflow', extra='tensorflow') + throw_extra_import_message(error=e, required_modules=required_modules, extras=required_extras) logger = logging.getLogger(__name__) diff --git a/wellcomeml/ml/sent2vec_vectorizer.py b/wellcomeml/ml/sent2vec_vectorizer.py index a8b45a96..5c8b4884 100644 --- a/wellcomeml/ml/sent2vec_vectorizer.py +++ b/wellcomeml/ml/sent2vec_vectorizer.py @@ -2,9 +2,17 @@ Vectorizer that exposes sklearn interface to sent2vec paper and codebase. https://github.com/epfml/sent2vec """ -from sklearn.base import TransformerMixin, BaseEstimator +from wellcomeml.utils import check_cache_and_download, throw_extra_import_message + +required_modules = 'sklearn' +required_extras = 'core' + +try: + from sklearn.base import TransformerMixin, BaseEstimator +except ImportError as e: + throw_extra_import_message(error=e, required_modules=required_modules, + required_extras=required_extras) -from wellcomeml.utils import check_cache_and_download class Sent2VecVectorizer(BaseEstimator, TransformerMixin): diff --git a/wellcomeml/ml/similarity_entity_linking.py b/wellcomeml/ml/similarity_entity_linking.py index 778a3695..43f6541d 100644 --- a/wellcomeml/ml/similarity_entity_linking.py +++ b/wellcomeml/ml/similarity_entity_linking.py @@ -3,14 +3,22 @@ using the TFIDF vectors or a BERT embedding from the corpus documents. """ - -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.metrics import f1_score -from sklearn.metrics.pairwise import cosine_similarity -import numpy as np +from wellcomeml.utils import throw_extra_import_message from wellcomeml.ml.bert_vectorizer import BertVectorizer +required_modules = 'sklearn' +required_extras = 'core' + +try: + from sklearn.feature_extraction.text import TfidfVectorizer + from sklearn.metrics import f1_score + from sklearn.metrics.pairwise import cosine_similarity + import numpy as np +except ImportError as e: + throw_extra_import_message(error=e, required_modules=required_modules, + required_extras=required_extras) + class SimilarityEntityLinker: def __init__(self, stopwords, embedding="tf-idf"): diff --git a/wellcomeml/ml/spacy_classifier.py b/wellcomeml/ml/spacy_classifier.py index a9e561aa..3429afc0 100644 --- a/wellcomeml/ml/spacy_classifier.py +++ b/wellcomeml/ml/spacy_classifier.py @@ -5,30 +5,28 @@ of grants Adapted from https://github.com/explosion/spaCy/blob/master/examples/training/train_textcat.py """ -from sklearn.metrics import f1_score -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.model_selection import train_test_split -from sklearn.metrics import precision_recall_fscore_support -from scipy.sparse import csr_matrix -import numpy as np - import logging import random import time from wellcomeml.utils import throw_extra_import_message +required_modules = 'spacy,numpy,scipy,torch' +required_extras = 'spacy,core,torch' + try: + from sklearn.metrics import f1_score + from sklearn.base import BaseEstimator, ClassifierMixin + from sklearn.model_selection import train_test_split + from sklearn.metrics import precision_recall_fscore_support + from scipy.sparse import csr_matrix + import numpy as np from spacy.training import Example from spacy.util import minibatch, compounding import spacy -except ImportError as e: - throw_extra_import_message(error=e, required_module='spacy', extra='spacy') - -try: import torch except ImportError as e: - throw_extra_import_message(error=e, extra="torch", required_module="torch") + throw_extra_import_message(error=e, required_modules=required_modules, extras=required_extras) logger = logging.getLogger(__name__) diff --git a/wellcomeml/ml/spacy_entity_linking.py b/wellcomeml/ml/spacy_entity_linking.py index acde6665..b6a9c2cb 100644 --- a/wellcomeml/ml/spacy_entity_linking.py +++ b/wellcomeml/ml/spacy_entity_linking.py @@ -6,13 +6,16 @@ from wellcomeml.utils import throw_extra_import_message +required_modules = 'spacy' +required_extras = 'spacy' + try: from spacy.training import Example from spacy.util import minibatch, compounding import spacy from spacy.kb import KnowledgeBase except ImportError as e: - throw_extra_import_message(error=e, required_module='spacy', extra='spacy') + throw_extra_import_message(error=e, required_modules=required_modules, extras=required_extras) class SpacyEntityLinker(object): diff --git a/wellcomeml/ml/spacy_knowledge_base.py b/wellcomeml/ml/spacy_knowledge_base.py index 3fb52083..12cf0750 100644 --- a/wellcomeml/ml/spacy_knowledge_base.py +++ b/wellcomeml/ml/spacy_knowledge_base.py @@ -10,13 +10,16 @@ from wellcomeml.utils import throw_extra_import_message +required_modules = 'spacy' +required_extras = 'spacy' + try: from spacy.vocab import Vocab from spacy.kb import KnowledgeBase import spacy except ImportError as e: - throw_extra_import_message(error=e, required_module='spacy', extra='spacy') + throw_extra_import_message(error=e, required_modules=required_modules, extras=required_extras) class SpacyKnowledgeBase(object): diff --git a/wellcomeml/ml/spacy_ner.py b/wellcomeml/ml/spacy_ner.py index 78de5a04..b592bf0c 100644 --- a/wellcomeml/ml/spacy_ner.py +++ b/wellcomeml/ml/spacy_ner.py @@ -1,14 +1,16 @@ import random -from nervaluate import Evaluator - from wellcomeml.utils import throw_extra_import_message +required_modules = 'spacy,nervaluate' +required_extras = 'spacy,core' + try: from spacy.training import Example import spacy + from nervaluate import Evaluator except ImportError as e: - throw_extra_import_message(error=e, required_module='spacy', extra='spacy') + throw_extra_import_message(error=e, required_modules=required_modules, extras=required_extras) class SpacyNER: diff --git a/wellcomeml/ml/transformers_tokenizer.py b/wellcomeml/ml/transformers_tokenizer.py index c371ba67..ce9372c2 100644 --- a/wellcomeml/ml/transformers_tokenizer.py +++ b/wellcomeml/ml/transformers_tokenizer.py @@ -2,12 +2,21 @@ Implements Tokenizer that abstracts common tokenisation strategies used in Transformers """ -from tokenizers.models import WordPiece, BPE -from tokenizers.normalizers import Lowercase, Sequence -from tokenizers.trainers import BpeTrainer, WordPieceTrainer -from tokenizers.pre_tokenizers import ByteLevel, Whitespace -from tokenizers import Tokenizer -import numpy as np +from wellcomeml.utils import throw_extra_import_message + +required_modules = 'tokenizers,numpy' +required_extras = 'core,transformers' + +try: + from tokenizers.models import WordPiece, BPE + from tokenizers.normalizers import Lowercase, Sequence + from tokenizers.trainers import BpeTrainer, WordPieceTrainer + from tokenizers.pre_tokenizers import ByteLevel, Whitespace + from tokenizers import Tokenizer + import numpy as np +except ImportError as e: + throw_extra_import_message(error=e, required_modules=required_modules, + required_extras=required_extras) # TODO # - generalise to two sentences diff --git a/wellcomeml/ml/vectorizer.py b/wellcomeml/ml/vectorizer.py index 50776ba0..5787aad1 100644 --- a/wellcomeml/ml/vectorizer.py +++ b/wellcomeml/ml/vectorizer.py @@ -5,8 +5,17 @@ A generic vectorizer that can fallback to tdidf or bag of words from sklearn or embed using bert, doc2vec etc """ +from wellcomeml.utils import throw_extra_import_message + +required_modules = 'sklearn' +required_extras = 'core' + +try: + from sklearn.base import BaseEstimator, TransformerMixin +except ImportError as e: + throw_extra_import_message(error=e, required_modules=required_modules, + required_extras=required_extras) -from sklearn.base import BaseEstimator, TransformerMixin class Vectorizer(BaseEstimator, TransformerMixin): diff --git a/wellcomeml/ml/voting_classifier.py b/wellcomeml/ml/voting_classifier.py index a447fb07..4766c946 100644 --- a/wellcomeml/ml/voting_classifier.py +++ b/wellcomeml/ml/voting_classifier.py @@ -33,10 +33,18 @@ """ import logging -from sklearn.utils.validation import check_is_fitted -from sklearn.ensemble import VotingClassifier -from sklearn.exceptions import NotFittedError -import numpy as np +required_modules = 'sklearn,numpy' +required_extras = 'core' + +try: + from sklearn.utils.validation import check_is_fitted + from sklearn.ensemble import VotingClassifier + from sklearn.exceptions import NotFittedError + import numpy as np +except ImportError as e: + throw_extra_import_message(error=e, required_modules=required_modules, + required_extras=required_extras) + logger = logging.getLogger(__name__) diff --git a/wellcomeml/spacy/spacy_doc_to_prodigy.py b/wellcomeml/spacy/spacy_doc_to_prodigy.py index 7611cf89..e19ca125 100644 --- a/wellcomeml/spacy/spacy_doc_to_prodigy.py +++ b/wellcomeml/spacy/spacy_doc_to_prodigy.py @@ -2,10 +2,13 @@ # coding: utf-8 from wellcomeml.utils import throw_extra_import_message +required_modules = 'spacy' +required_extras = 'spacy' + try: import spacy except ImportError as e: - throw_extra_import_message(error=e, required_module='spacy', extra='spacy') + throw_extra_import_message(error=e, required_modules=required_modules, extras=required_extras) class SpacyDocToProdigy: diff --git a/wellcomeml/utils.py b/wellcomeml/utils.py index 56bd5759..3089dba1 100644 --- a/wellcomeml/utils.py +++ b/wellcomeml/utils.py @@ -62,10 +62,10 @@ def check_cache_and_download(model_name): return model_path -def throw_extra_import_message(error, extra, required_module): +def throw_extra_import_message(error, extras, required_modules): """Safely throws an import error if it due to missing extras, and re-raising it otherwise""" - if error.name == required_module: - raise ImportError(f"To use this class/module you need to install wellcomeml with {extra} " - f"extras, e.g. pip install wellcomeml[{extra}]") + if error.name in required_modules.split(','): + raise ImportError(f"To use this class/module you need to install wellcomeml with {extras} " + f"extras, e.g. pip install wellcomeml[{extras}]") else: raise error From 82f1ddd9fbead05b28fc97f594c10a3d43eb48c7 Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 29 Oct 2021 13:04:30 +0100 Subject: [PATCH 4/7] Update readme --- README.md | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 90919a22..79a6ad50 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ This package contains common utility functions for usual tasks at the Wellcome T * An intuitive sklearn-like API wrapping text vectorizers, such as Doc2vec, Bert, Scibert * Common API for off-the-shelf classifiers to allow quick iteration (e.g. Frequency Vectorizer, Bert, Scibert, basic CNN, BiLSTM, SemanticSimilarity) * Utils to download and convert academic text datasets for benchmark +* Utils to download data from the EPMC API For more information read the official [docs](http://wellcometrust.github.io/WellcomeML). @@ -23,39 +24,38 @@ Installing from PyPi pip install wellcomeml ``` -This will install the "vanilla" package. In order to install the deep-learning functionality -(torch/transformers/spacy transformers/visualisation functionalities): +This will install the "vanilla" package with very little functionality, such as io, dataset download etc. + +If space is not a problem, you can install the _full_ package (around 2.2GB): ```bash -pip install wellcomeml[spacy, tensorflow, torch, vis] +pip install wellcomeml[all] ``` -For a list of functionalities/classes and the dependencies on "extras", see [extras](#5-extras). - +The full package is relatively big, therefore we also have fine-grained installations if you only wish to use one specific module. +Those are `core, transformers, tensorflow, torch, spacy`. You can install one, or more of those you want, e.g.: -Installing from a release wheel +```bash +pip install wellcomeml[tensorflow, core] +``` -Download the wheel [from aws](https://datalabs-public.s3.eu-west-2.amazonaws.com/wellcomeml/wellcomeml-2020.1.0-py3-none-any.whl) -and pip install it: +To check that your installation allows you to use a specific module, try (for example): ```bash -pip install wellcomeml-2020.1.0-py3-none-any.whl -pip install wellcomeml-2020.1.0-py3-none-any.whl[deep-learning] +python -c "import wellcomeml.ml.bert_vectorizer" ``` -### 1.1 Installing wellcomeml[deep-learning] on windows +If you don't have the correct dependencies installed for a module, an error will appear +and point you to the right dependencies. + +### 1.1 Installing wellcomeml[all] on windows -Torch has a different installation for windows so it will not get automatically installed with wellcomeml[deeplearning]. +Torch has a different installation for windows so it will not get automatically installed with wellcomeml[all]. It needs to be installed first (this is for machines with no CUDA parallel computing platform for those that do look here https://pytorch.org/ for correct installation): ``` pip install torch==1.5.1+cpu torchvision==0.6.1+cpu -f https://download.pytorch.org/whl/torch_stable.html -``` - -Then install wellcomeml[deep-learning]: - -``` -pip install wellcomeml[deep-learning] +pip install wellcomeml[all] ``` ## 2. Development @@ -145,5 +145,4 @@ more information that might be informative to resolve the issue. | wellcomeml.ml.spacy_ner | Named entity recognition classifier based on spacy | spacy | | wellcomeml.ml.transformers_tokenizer | Bespoke tokenizer based on transformers | Transformers | | wellcomeml.ml.vectorizer | Abstract class for vectorizers | NA | -| wellcomeml.ml.voting_classifier | Meta-classifier based on majority voting | NA | - +| wellcomeml.ml.voting_classifier | Meta-classifier based on majority voting | NA | From 8609590e1d2782d5bd66777b81e6dcbdbee7a828 Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 29 Oct 2021 13:06:18 +0100 Subject: [PATCH 5/7] Add extras --- wellcomeml/metrics/f1.py | 12 ++++++++-- .../metrics/ner_classification_report.py | 9 ++++++- wellcomeml/viz/visualize_clusters.py | 24 ++++++++++++------- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/wellcomeml/metrics/f1.py b/wellcomeml/metrics/f1.py index 63e6295c..2483600a 100644 --- a/wellcomeml/metrics/f1.py +++ b/wellcomeml/metrics/f1.py @@ -2,8 +2,16 @@ Adapted from: https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric """ -import tensorflow as tf -import tensorflow.keras.backend as K +from wellcomeml.utils import throw_extra_import_message + +required_modules = 'tensorflow' +required_extras = 'tensorflow' + +try: + import tensorflow as tf + import tensorflow.keras.backend as K +except ImportError as e: + throw_extra_import_message(e, required_modules, required_extras) def f1_metric(y_true, y_pred): diff --git a/wellcomeml/metrics/ner_classification_report.py b/wellcomeml/metrics/ner_classification_report.py index 4819f7fe..9ceb3af2 100644 --- a/wellcomeml/metrics/ner_classification_report.py +++ b/wellcomeml/metrics/ner_classification_report.py @@ -1,5 +1,12 @@ +from wellcomeml.utils import throw_extra_import_message -from nervaluate import Evaluator +required_module = 'nervaluate' +required_extras = 'core' + +try: + from nervaluate import Evaluator +except ImportError as e: + throw_extra_import_message(error, required_module, required_extras) def ner_classification_report(y_true, y_pred, groups, tags): diff --git a/wellcomeml/viz/visualize_clusters.py b/wellcomeml/viz/visualize_clusters.py index efc715e1..719eb6a5 100644 --- a/wellcomeml/viz/visualize_clusters.py +++ b/wellcomeml/viz/visualize_clusters.py @@ -1,16 +1,24 @@ from typing import Optional, Union -import numpy as np -import pandas as pd - -from bokeh.io import output_notebook, reset_output -from bokeh.models import Legend, Dropdown, ColumnDataSource, CustomJS -from bokeh.plotting import figure, output_file, show -from bokeh.layouts import column -from bokeh.events import MenuItemClick +from wellcomeml.utils import throw_extra_import_message from wellcomeml.viz.palettes import (Wellcome33, WellcomeBackground, WellcomeNoData) +required_modules = 'numpy,pandas,bokeh' +required_extras = 'core' + +try: + import numpy as np + import pandas as pd + + from bokeh.io import output_notebook, reset_output + from bokeh.models import Legend, Dropdown, ColumnDataSource, CustomJS + from bokeh.plotting import figure, output_file, show + from bokeh.layouts import column + from bokeh.events import MenuItemClick +except ImportError as e: + throw_extra_import_message(error=e, required_modules=required_modules, extras=required_extras) + def visualize_clusters(clustering, filter_list: Optional[list] = None, texts: Optional[list] = None, From dfcbf912fb6cee470f708d611cc80d0ad7b2e2aa Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 29 Oct 2021 13:06:33 +0100 Subject: [PATCH 6/7] Fix extras error --- wellcomeml/metrics/ner_classification_report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wellcomeml/metrics/ner_classification_report.py b/wellcomeml/metrics/ner_classification_report.py index 9ceb3af2..535ccea2 100644 --- a/wellcomeml/metrics/ner_classification_report.py +++ b/wellcomeml/metrics/ner_classification_report.py @@ -6,7 +6,7 @@ try: from nervaluate import Evaluator except ImportError as e: - throw_extra_import_message(error, required_module, required_extras) + throw_extra_import_message(e, required_module, required_extras) def ner_classification_report(y_true, y_pred, groups, tags): From 03b32bdbae4d1936c25afe0a6a5dd502b644d7af Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 29 Oct 2021 13:08:12 +0100 Subject: [PATCH 7/7] Fix flake8 --- wellcomeml/ml/sent2vec_vectorizer.py | 1 - wellcomeml/ml/vectorizer.py | 1 - wellcomeml/ml/voting_classifier.py | 6 +++--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/wellcomeml/ml/sent2vec_vectorizer.py b/wellcomeml/ml/sent2vec_vectorizer.py index 5c8b4884..6f5fc12b 100644 --- a/wellcomeml/ml/sent2vec_vectorizer.py +++ b/wellcomeml/ml/sent2vec_vectorizer.py @@ -14,7 +14,6 @@ required_extras=required_extras) - class Sent2VecVectorizer(BaseEstimator, TransformerMixin): def __init__(self, pretrained=None): self.pretrained = pretrained diff --git a/wellcomeml/ml/vectorizer.py b/wellcomeml/ml/vectorizer.py index 5787aad1..cd7e0bc2 100644 --- a/wellcomeml/ml/vectorizer.py +++ b/wellcomeml/ml/vectorizer.py @@ -17,7 +17,6 @@ required_extras=required_extras) - class Vectorizer(BaseEstimator, TransformerMixin): """ Abstract class, sklearn-compatible, that can vectorize texts using diff --git a/wellcomeml/ml/voting_classifier.py b/wellcomeml/ml/voting_classifier.py index 4766c946..c21a956d 100644 --- a/wellcomeml/ml/voting_classifier.py +++ b/wellcomeml/ml/voting_classifier.py @@ -33,6 +33,8 @@ """ import logging +from wellcomeml.utils import throw_extra_import_message + required_modules = 'sklearn,numpy' required_extras = 'core' @@ -42,9 +44,7 @@ from sklearn.exceptions import NotFittedError import numpy as np except ImportError as e: - throw_extra_import_message(error=e, required_modules=required_modules, - required_extras=required_extras) - + throw_extra_import_message(error=e, required_modules=required_modules, extras=required_extras) logger = logging.getLogger(__name__)