diff --git a/Makefile b/Makefile
index 405392787e3d..f3ebb6135e25 100644
--- a/Makefile
+++ b/Makefile
@@ -121,12 +121,8 @@ endif
 	rm data/MITIE*.bz2
 
 prepare-transformers:
-	if [ $(OS) = "Windows_NT" ]; then HOME_DIR="$(HOMEDRIVE)$(HOMEPATH)"; else HOME_DIR=$(HOME); fi;\
-	CACHE_DIR=$$HOME_DIR/.cache/torch/transformers;\
-	mkdir -p "$$CACHE_DIR";\
-	i=0;\
-	while read -r URL; do read -r CACHE_FILE; if { [ $(CI) ] && [ $$i -gt 4 ]; } || ! [ $(CI) ]; then wget -nv $$URL -O $$CACHE_DIR/$$CACHE_FILE; fi; i=$$((i + 1)); done < "data/test/hf_transformers_models.txt"
-
+	while read -r MODEL; do poetry run python scripts/download_transformer_model.py $$MODEL ; done < data/test/hf_transformers_models.txt
+	if ! [ $(CI) ]; then poetry run python scripts/download_transformer_model.py rasa/LaBSE; fi
 
 prepare-tests-macos:
 	brew install wget graphviz || true
@@ -178,7 +174,7 @@ test-nlu-predictors: prepare-spacy prepare-mitie test-marker
 
 test-full-model-training: PYTEST_MARKER=category_full_model_training and (not flaky)
 test-full-model-training: DD_ARGS := $(or $(DD_ARGS),)
-test-full-model-training: prepare-spacy prepare-mitie test-marker
+test-full-model-training: prepare-spacy prepare-mitie prepare-transformers test-marker
 
 test-other-unit-tests: PYTEST_MARKER=category_other_unit_tests and (not flaky)
 test-other-unit-tests: DD_ARGS := $(or $(DD_ARGS),)
@@ -198,7 +194,7 @@ test-gh-actions:
 test-marker: clean
 	# OMP_NUM_THREADS can improve overall performance using one thread by process (on tensorflow), avoiding overload
 	# TF_CPP_MIN_LOG_LEVEL=2 sets C code log level for tensorflow to error suppressing lower log events
-	OMP_NUM_THREADS=1 TF_CPP_MIN_LOG_LEVEL=2 poetry run pytest tests -n $(JOBS) --dist loadscope -m "$(PYTEST_MARKER)" --cov rasa --ignore $(INTEGRATION_TEST_FOLDER) $(DD_ARGS)
+	TRANSFORMERS_OFFLINE=1 OMP_NUM_THREADS=1 TF_CPP_MIN_LOG_LEVEL=2 poetry run pytest tests -n $(JOBS) --dist loadscope -m "$(PYTEST_MARKER)" --cov rasa --ignore $(INTEGRATION_TEST_FOLDER) $(DD_ARGS)
 
 generate-pending-changelog:
 	poetry run python -c "from scripts import release; release.generate_changelog('major.minor.patch')"
diff --git a/data/test/hf_transformers_models.txt b/data/test/hf_transformers_models.txt
index 6c474cddb083..ab931e036c4f 100644
--- a/data/test/hf_transformers_models.txt
+++ b/data/test/hf_transformers_models.txt
@@ -1,54 +1,9 @@
-https://s3.amazonaws.com/models.huggingface.co/bert/rasa/LaBSE/vocab.txt
-21aa38329c730774d9f45df9ec5443a9bd4abd2191e1d510c27647c151c5437f.f2539f82b1008971c6ea6574f078d95c6eead57223fc74fdc420013fa9de391a
-https://s3.amazonaws.com/models.huggingface.co/bert/rasa/LaBSE/special_tokens_map.json
-99497d78492c90ab7d824d695b9a8d043369fbc2bf1112dcc7cdef9a6c4fa691.275045728fbf41c11d3dae08b8742c054377e18d92cc7b72b6351152a99b64e4
-https://s3.amazonaws.com/models.huggingface.co/bert/rasa/LaBSE/tokenizer_config.json
-527f618330e845c9d31826e7d9ce983aa816fafcf4f29f8c52f8ae1fdd097219.1c61d5d3dc67d88e0c74c64cda9b17bc30bdbd1c373cceeb740b9953729709aa
-https://s3.amazonaws.com/models.huggingface.co/bert/rasa/LaBSE/config.json
-90984a8da5021905af8679644b61bc5428ef16e9a307469152c163ec873db240.f1ba7080a92fc164a144311742f36dfb6a724bc9da532264b30d87040e15cc9d
-https://cdn.huggingface.co/rasa/LaBSE/tf_model.h5
-fd2ff7409cd4abbce31d54b8acebc305939787751dd697b6f38a3bf1f197a614.2589e15ea34b96d9bdcc478748ae77b629487da363566089fe6a8cdb1e6ea284.h5
-https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt
-8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00
-https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json
-8a3b1cfe5da58286e12a0f5d7d182b8d6eca88c08e26c332ee3817548cf7e60a.f12a4f986e43d8b328f5b067a641064d67b91597567a06c7b122d1ca7dfd9741
-https://cdn.huggingface.co/bert-base-chinese-tf_model.h5
-86a460b592673bcac3fe5d858ecf519e4890b4f6eddd1a46a077bd672dee6fe5.e6b974f59b54219496a89fd32be7afb020374df0976a796e5ccd3a1733d31537.h5
-https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json
-4ab93d0cd78ae80e746c27c9cd34e90b470abdabe0590c9ec742df61625ba310.b9628f6fe5519626534b82ce7ec72b22ce0ae79550325f45c604a25c0ad87fd6
-https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt
-0f8de0dbd6a2bb6bde7d758f4c120dd6dd20b46f2bf0a47bc899c89f46532fde.20808570f9a3169212a577f819c845330da870aeb14c40f7319819fce10c3b76
-https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json
-a27bb7c70e9002d7558d2682d5a95f3c0a8b31034616309459e0b51ef07ade09.bd0797be126548711309ad2174d2afb16e3c37e891707667603d85e35a4ad001
-https://cdn.huggingface.co/openai-gpt-tf_model.h5
-642cba239b8eca22b702e71e92d507b8af47ddd2df74dc7751e2a4f65d8d434c.f26918df904593cca2dbd78a3bb760376f4f8ce1ce8d3b13bb6ab28228ee65fc.h5
-https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
-https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
-https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json
-4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.db13c9bc9c7bdd738ec89e069621d88e05dc670366092d809a9cbcac6798e24e
-https://cdn.huggingface.co/gpt2-tf_model.h5
-132dec44f9ced4b20f1b1c88a426b1d3dab5ba9e5f24a82541833dae44d5b8db.afd2261c07481427cd087f622388c2c086be9c62875f5945922c7adb2239b63a.h5
-https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model
-dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8
-https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json
-c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.69e5e35e0b798cab5e473f253752f8bf4d280ee37682281a23eed80f6e2d09c6
-https://cdn.huggingface.co/xlnet-base-cased-tf_model.h5
-44ada4a49497a676c2d1fa2dbb7059df50f9cedb14f332862d2fca2c35d04a7d.42bc04b3944abffc38e9b60aadffb89f81aafa9c86473d157ed4b28953471ceb.h5
-https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt
-26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
-https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json
-4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
-https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json
-a41e817d5c0743e29e86ff85edc8c257e61bc8d88e4271bb1b243b6e7614c633.8949e27aafafa845a18d98a0e3a88bc2d248bbc32a1b75947366664658f23b1c
-https://cdn.huggingface.co/distilbert-base-uncased-tf_model.h5
-cce28882467f298a29fc905b9dd1683695d96198a83432fe707089dccd71c019.e02bd57e9d8507853eccc7c04ac2e938a6cdaff4b9bf941c10e781b61ddb9bbd.h5
-https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json
-d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
-https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt
-b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
-https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json
-e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.117c81977c5979de8c088352e74ec6e70f5c66096c28b61d3c50101609b39690
-https://cdn.huggingface.co/roberta-base-tf_model.h5
-2e18f106492efe1a8e6766e4d4e0bf4a82cee267c0cc52af431cf97005d4e3db.34733ed140f011f207fb07b32b443050356e99a9638db284a22d77bd3d5f54b3.h5
\ No newline at end of file
+bert-base-chinese
+openai-gpt
+gpt2
+xlnet-base-cased
+bert-base-uncased
+roberta-base
+distilbert-base-uncased
+camembert-base
+sentence-transformers/all-MiniLM-L6-v2
diff --git a/scripts/download_transformer_model.py b/scripts/download_transformer_model.py
new file mode 100644
index 000000000000..534befd4205d
--- /dev/null
+++ b/scripts/download_transformer_model.py
@@ -0,0 +1,7 @@
+from huggingface_hub import snapshot_download
+import sys
+
+print(f"Downloading model files for {sys.argv[1]}...")
+snapshot_download(
+    repo_id=sys.argv[1], allow_patterns=["*.txt", "*.json", "*.h5", "*.model"]
+)
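
For reference (not part of the patch), the new prepare-transformers flow amounts to roughly the following Python sketch. It assumes huggingface_hub is installed and simply combines the Makefile's while-read loop with the new download script; the file path and allow_patterns come straight from the diff above.

from pathlib import Path

from huggingface_hub import snapshot_download

# Rough equivalent of the `prepare-transformers` Makefile target: download every
# model listed in data/test/hf_transformers_models.txt into the local Hugging Face
# cache so the test suite can later run with TRANSFORMERS_OFFLINE=1.
# (Outside CI, the Makefile additionally downloads rasa/LaBSE the same way.)
for model in Path("data/test/hf_transformers_models.txt").read_text().splitlines():
    model = model.strip()
    if not model:
        continue
    print(f"Downloading model files for {model}...")
    snapshot_download(
        repo_id=model,
        allow_patterns=["*.txt", "*.json", "*.h5", "*.model"],
    )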