Merge pull request #372 from wellcometrust/feature/add-more-extras-and-more-error-messages

Breakdown extras
wellcometrust · Oct 29, 2021 · 2ebbdc5 · 2ebbdc5
2 parents 2f2eb6c + 03b32bd
commit 2ebbdc5
Show file tree

Hide file tree

Showing 28 changed files with 260 additions and 151 deletions.
diff --git a/README.md b/README.md
@@ -11,6 +11,7 @@ This package contains common utility functions for usual tasks at the Wellcome T
 * An intuitive sklearn-like API wrapping text vectorizers, such as Doc2vec, Bert, Scibert
 * Common API for off-the-shelf classifiers to allow quick iteration (e.g. Frequency Vectorizer, Bert, Scibert, basic CNN, BiLSTM, SemanticSimilarity)
 * Utils to download and convert academic text datasets for benchmark
+* Utils to download data from the EPMC API
 
 For more information read the official [docs](http://wellcometrust.github.io/WellcomeML).
 
@@ -23,39 +24,38 @@ Installing from PyPi
 pip install wellcomeml
 ```
 
-This will install the "vanilla" package. In order to install the deep-learning functionality
-(torch/transformers/spacy transformers/visualisation functionalities):
+This will install the "vanilla" package, which provides only basic functionality such as I/O and dataset download.
+
+If space is not a problem, you can install the _full_ package (around 2.2GB):
 
 ```bash
-pip install wellcomeml[spacy, tensorflow, torch, vis]
+pip install wellcomeml[all]
 ```
 
-For a list of functionalities/classes and the dependencies on "extras", see [extras](#5-extras).
-
+The full package is relatively big, so we also provide fine-grained extras if you only need specific modules.
+These are `core`, `transformers`, `tensorflow`, `torch` and `spacy`. You can install one or more of them, e.g.:
 
-Installing from a release wheel
+```bash
+pip install "wellcomeml[tensorflow,core]"
+```
 
-Download the wheel [from aws](https://datalabs-public.s3.eu-west-2.amazonaws.com/wellcomeml/wellcomeml-2020.1.0-py3-none-any.whl)
-and pip install it:
+To check that your installation allows you to use a specific module, try (for example):
 
 ```bash
-pip install wellcomeml-2020.1.0-py3-none-any.whl
-pip install wellcomeml-2020.1.0-py3-none-any.whl[deep-learning]
+python -c "import wellcomeml.ml.bert_vectorizer"
 ```
 
-### 1.1 Installing wellcomeml[deep-learning] on windows 
+If you don't have the correct dependencies installed for a module, an error will appear 
+and point you to the right dependencies.
+
+### 1.1 Installing wellcomeml[all] on windows 
 
-Torch has a different installation for windows so it will not get automatically installed with wellcomeml[deeplearning].
+Torch has a different installation for Windows, so it will not be installed automatically with wellcomeml[all].
 It needs to be installed first (this is for machines with no CUDA parallel computing platform for those that do look here https://pytorch.org/ for correct installation):
 
 ```
 pip install torch==1.5.1+cpu torchvision==0.6.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
-```
-
-Then install wellcomeml[deep-learning]:
-
-```
-pip install wellcomeml[deep-learning]
+pip install wellcomeml[all]
 ```
 
 ## 2. Development
@@ -145,5 +145,4 @@ more information that might be informative to resolve the issue.
 | wellcomeml.ml.spacy_ner | Named entity recognition classifier based on spacy | spacy |
 | wellcomeml.ml.transformers_tokenizer | Bespoke tokenizer based on transformers | Transformers |
 | wellcomeml.ml.vectorizer | Abstract class for vectorizers | NA |
-| wellcomeml.ml.voting_classifier | Meta-classifier based on majority voting | NA | 
-
+| wellcomeml.ml.voting_classifier | Meta-classifier based on majority voting | NA |
diff --git a/setup.py b/setup.py
@@ -26,6 +26,37 @@
 with open('README.md', 'r') as f:
     long_description = f.read()
 
+extras = {
+        'core': [
+            'scikit-learn',
+            'scipy',
+            'umap-learn',
+            'gensim<=4.0.0',
+            'bokeh',
+            'pandas',
+            'nervaluate'
+        ],
+        'transformers': [
+            'transformers',
+            'tokenizers==0.10.1'
+        ],
+        'tensorflow': [
+            'tensorflow==2.4.0',
+            'tensorflow-addons',
+            'numpy>=1.19.2,<1.20'
+        ],
+        'torch': [
+            'torch'
+        ],
+        'spacy': [
+            'spacy[lookups]==3.0.6',
+            'click>=7.0,<8.0'
+        ],
+}
+
+# Allow users to install 'all' if they wish
+extras['all'] = [dep for dep_list in extras.values() for dep in dep_list]
+
 setuptools.setup(
     name=about['__name__'],
     version=about['__version__'],
@@ -42,38 +73,14 @@
         'Operating System :: OS Independent',
     ],
     install_requires=[
-        'numpy>=1.19.2,<1.20',
-        'pandas',
         'boto3',
-        'scikit-learn',
-        'scipy==1.4.1',
-        'click>=7.0,<8.0',
-        'umap-learn',
-        'nervaluate',
         'twine',
-        'gensim<=4.0.0',
         'cython',
         'flake8',
         'black',
-        'transformers',
-        'tokenizers==0.10.1',
         'tqdm'
     ],
-    extras_require={
-        'tensorflow': [
-            'tensorflow==2.4.0',
-            'tensorflow-addons'
-        ],
-        'torch': [
-            'torch'
-        ],
-        'spacy': [
-            'spacy[lookups]==3.0.6'
-        ],
-        'vis': [
-            'bokeh'
-        ]
-    },
+    extras_require=extras,
     tests_require=[
         'pytest',
         'pytest-cov',

diff --git a/tox.ini b/tox.ini
@@ -4,7 +4,7 @@ envlist = py37, py38
 [testenv]
 deps = 
 	-r requirements_test.txt
-	.[spacy,torch,tensorflow,vis]
+	.[all]
 
 commands = python -m spacy download en_core_web_sm
            pytest -m '{env:TEST_SUITE:}' -s -v --durations=0 --disable-warnings --tb=line --cov=wellcomeml ./tests

diff --git a/wellcomeml/metrics/f1.py b/wellcomeml/metrics/f1.py
@@ -2,8 +2,16 @@
 
 Adapted from: https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
 """
-import tensorflow as tf
-import tensorflow.keras.backend as K
+from wellcomeml.utils import throw_extra_import_message
+
+required_modules = 'tensorflow'
+required_extras = 'tensorflow'
+
+try:
+    import tensorflow as tf
+    import tensorflow.keras.backend as K
+except ImportError as e:
+    throw_extra_import_message(e, required_modules, required_extras)
 
 
 def f1_metric(y_true, y_pred):

diff --git a/wellcomeml/metrics/ner_classification_report.py b/wellcomeml/metrics/ner_classification_report.py
@@ -1,5 +1,12 @@
+from wellcomeml.utils import throw_extra_import_message
 
-from nervaluate import Evaluator
+required_module = 'nervaluate'
+required_extras = 'core'
+
+try:
+    from nervaluate import Evaluator
+except ImportError as e:
+    throw_extra_import_message(e, required_module, required_extras)
 
 
 def ner_classification_report(y_true, y_pred, groups, tags):

diff --git a/wellcomeml/ml/attention.py b/wellcomeml/ml/attention.py
@@ -3,7 +3,7 @@
 try:
     import tensorflow as tf
 except ImportError as e:
-    throw_extra_import_message(error=e, required_module='tensorflow', extra='tensorflow')
+    throw_extra_import_message(error=e, required_modules='tensorflow', extras='tensorflow')
 
 
 class SelfAttention(tf.keras.layers.Layer):

diff --git a/wellcomeml/ml/bert_classifier.py b/wellcomeml/ml/bert_classifier.py
@@ -6,17 +6,17 @@
 import math
 import os
 
-from transformers import BertTokenizer, TFBertForSequenceClassification
-from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import f1_score
-
 from wellcomeml.utils import throw_extra_import_message
 
 try:
+    from transformers import BertTokenizer, TFBertForSequenceClassification
+    from sklearn.base import BaseEstimator, TransformerMixin
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import f1_score
     import tensorflow as tf
 except ImportError as e:
-    throw_extra_import_message(error=e, required_module='tensorflow', extra='tensorflow')
+    throw_extra_import_message(error=e, required_modules='tensorflow,sklearn,transformers',
+                               extras='core,tensorflow,transformers')
 
 
 PRETRAINED_CONFIG = {

diff --git a/wellcomeml/ml/bert_semantic_equivalence.py b/wellcomeml/ml/bert_semantic_equivalence.py
@@ -3,22 +3,24 @@
 import math
 import os
 
-from transformers import BertConfig, BertTokenizer, \
-    TFBertForSequenceClassification
-
-from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.model_selection import train_test_split
-from sklearn.utils.validation import check_is_fitted
-from sklearn.exceptions import NotFittedError
+from wellcomeml.utils import throw_extra_import_message
 
 from wellcomeml.ml.keras_utils import CategoricalMetrics  # , MetricMiniBatchHistory
 from wellcomeml.logger import LOGGING_LEVEL, build_logger
-from wellcomeml.utils import throw_extra_import_message
+
 
 try:
     import tensorflow as tf
+    from transformers import BertConfig, BertTokenizer, \
+        TFBertForSequenceClassification
+
+    from sklearn.base import BaseEstimator, TransformerMixin
+    from sklearn.model_selection import train_test_split
+    from sklearn.utils.validation import check_is_fitted
+    from sklearn.exceptions import NotFittedError
 except ImportError as e:
-    throw_extra_import_message(error=e, required_module='tensorflow', extra='tensorflow')
+    throw_extra_import_message(error=e, required_modules='tensorflow,sklearn,transformers',
+                               extras='core,tensorflow,transformers')
 
 
 class SemanticEquivalenceClassifier(BaseEstimator, TransformerMixin):

diff --git a/wellcomeml/ml/bert_vectorizer.py b/wellcomeml/ml/bert_vectorizer.py
@@ -4,17 +4,20 @@
 """
 import logging
 
-from transformers import BertModel, BertTokenizer
-from sklearn.base import BaseEstimator, TransformerMixin
-import numpy as np
 import tqdm
 
 from wellcomeml.utils import check_cache_and_download, throw_extra_import_message
 
+required_modules = "torch,transformers,sklearn,numpy"
+required_extras = "torch,transformers,core"
 try:
+    from transformers import BertModel, BertTokenizer
+    from sklearn.base import BaseEstimator, TransformerMixin
+    import numpy as np
     import torch
 except ImportError as e:
-    throw_extra_import_message(error=e, extra="torch", required_module="torch")
+    throw_extra_import_message(error=e, required_modules=required_modules,
+                               extras=required_extras)
 
 logger = logging.getLogger(__name__)
 

diff --git a/wellcomeml/ml/bilstm.py b/wellcomeml/ml/bilstm.py
@@ -1,19 +1,19 @@
 from datetime import datetime
 import math
 
-from sklearn.model_selection import train_test_split
-from sklearn.base import BaseEstimator, ClassifierMixin
-from sklearn.metrics import f1_score
-from scipy.sparse import csr_matrix, vstack
-import numpy as np
-
 from wellcomeml.ml.attention import HierarchicalAttention
 from wellcomeml.utils import throw_extra_import_message
 
 try:
     import tensorflow as tf
+    from sklearn.model_selection import train_test_split
+    from sklearn.base import BaseEstimator, ClassifierMixin
+    from sklearn.metrics import f1_score
+    from scipy.sparse import csr_matrix, vstack
+    import numpy as np
 except ImportError as e:
-    throw_extra_import_message(error=e, required_module='tensorflow', extra='tensorflow')
+    throw_extra_import_message(error=e, required_modules='tensorflow,sklearn,scipy,numpy',
+                               extras='tensorflow,core')
 
 TENSORBOARD_LOG_DIR = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")
 CALLBACK_DICT = {

diff --git a/wellcomeml/ml/clustering.py b/wellcomeml/ml/clustering.py
@@ -5,14 +5,23 @@
 
 from wellcomeml.ml import vectorizer
 from wellcomeml.logger import logger
+from wellcomeml.utils import throw_extra_import_message
+
+required_modules = 'tensorflow,sklearn,numpy'
+extras = 'tensorflow,core'
+
+try:
+    import numpy as np
+    from sklearn.base import ClusterMixin
+    from sklearn.cluster import DBSCAN, OPTICS, KMeans
+    from sklearn.manifold import TSNE
+    from sklearn.model_selection import GridSearchCV
+    from sklearn.metrics import silhouette_score
+    from sklearn.pipeline import Pipeline
+except ImportError as e:
+    throw_extra_import_message(error=e, required_modules=required_modules,
+                               extras=extras)
 
-import numpy as np
-from sklearn.base import ClusterMixin
-from sklearn.cluster import DBSCAN, OPTICS, KMeans
-from sklearn.manifold import TSNE
-from sklearn.model_selection import GridSearchCV
-from sklearn.metrics import silhouette_score
-from sklearn.pipeline import Pipeline
 
 try:
     from hdbscan import HDBSCAN

diff --git a/wellcomeml/ml/cnn.py b/wellcomeml/ml/cnn.py
@@ -18,19 +18,21 @@
 import logging
 import math
 
-from sklearn.base import BaseEstimator, ClassifierMixin
-from sklearn.metrics import f1_score
-from scipy.sparse import csr_matrix, vstack
-import numpy as np
-
 from wellcomeml.ml.attention import HierarchicalAttention
 from wellcomeml.utils import throw_extra_import_message
 
+required_modules = 'tensorflow,sklearn,numpy,scipy'
+required_extras = 'core,tensorflow'
+
 try:
     import tensorflow_addons as tfa
     import tensorflow as tf
+    from sklearn.base import BaseEstimator, ClassifierMixin
+    from sklearn.metrics import f1_score
+    from scipy.sparse import csr_matrix, vstack
+    import numpy as np
 except ImportError as e:
-    throw_extra_import_message(error=e, required_module='tensorflow', extra='tensorflow')
+    throw_extra_import_message(error=e, required_modules=required_modules, extras=required_extras)
 
 logger = logging.getLogger(__name__)
 

diff --git a/wellcomeml/ml/doc2vec_vectorizer.py b/wellcomeml/ml/doc2vec_vectorizer.py
@@ -4,9 +4,18 @@
 import statistics
 import logging
 
-from sklearn.base import BaseEstimator, TransformerMixin
-from gensim.models.doc2vec import Doc2Vec, TaggedDocument
-import numpy as np
+from wellcomeml.utils import throw_extra_import_message
+
+required_modules = 'sklearn,gensim,numpy'
+required_extras = 'core'
+
+try:
+    from sklearn.base import BaseEstimator, TransformerMixin
+    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+    import numpy as np
+except ImportError as e:
+    throw_extra_import_message(error=e, required_modules=required_modules, extras=required_extras)
+
 
 logging.getLogger("gensim").setLevel(logging.WARNING)