diff --git a/models/phishing-models/phishing-bert-20221115.onnx b/models/phishing-models/phishing-bert-20221115.onnx deleted file mode 100644 index 7c1e0ec716..0000000000 --- a/models/phishing-models/phishing-bert-20221115.onnx +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:80cc263dd7f9087dd19decfa687614635381540b29ba5aabc4ae1ffa7009e757 -size 438007870 diff --git a/models/phishing-models/phishing-bert-20221115.pt b/models/phishing-models/phishing-bert-20221115.pt deleted file mode 100644 index b28ef683ba..0000000000 --- a/models/phishing-models/phishing-bert-20221115.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0bca567a4bd840fb4b20b6b4a1fb98d9a5a79b2b2dc959b7b539ee35e10357b6 -size 438040521 diff --git a/models/phishing-models/phishing-bert-20230421.onnx b/models/phishing-models/phishing-bert-20230421.onnx new file mode 100644 index 0000000000..6365d2bc31 --- /dev/null +++ b/models/phishing-models/phishing-bert-20230421.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d6e89b2890ed8f7d89577f3b204a117f115c90116025280a5b164c55400e8d4 +size 438207850 diff --git a/models/phishing-models/phishing-bert-20230421.pt b/models/phishing-models/phishing-bert-20230421.pt new file mode 100644 index 0000000000..b27de845fb --- /dev/null +++ b/models/phishing-models/phishing-bert-20230421.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e0a4b6454dbf20625161e36337bf75e1175cc7fb9a70af22671379489818482 +size 438049721 diff --git a/models/sid-models/sid-minibert-20211021.onnx b/models/sid-models/sid-minibert-20211021.onnx deleted file mode 100644 index 0140aa2fa4..0000000000 --- a/models/sid-models/sid-minibert-20211021.onnx +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a8729582a6784121e78ba4305513c3099314a75ecf51501470d1244555588cc -size 44720790 diff --git a/models/sid-models/sid-minibert-20211021.pth b/models/sid-models/sid-minibert-20211021.pth deleted file mode 100644 index 3ce5fdd30a..0000000000 --- a/models/sid-models/sid-minibert-20211021.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d552a23031146cc76952989f0299b6b79090bf0e8998c0c40bb635966b5801e7 -size 44737609 diff --git a/models/sid-models/sid-minibert-20230424.onnx b/models/sid-models/sid-minibert-20230424.onnx new file mode 100644 index 0000000000..6e28a69796 --- /dev/null +++ b/models/sid-models/sid-minibert-20230424.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d12f31d6dcebd03dab1824c850ebe928b5f5b76d3e9029e9b160f593c12bea9e +size 44793456 diff --git a/models/sid-models/sid-minibert-20230424.pt b/models/sid-models/sid-minibert-20230424.pt new file mode 100644 index 0000000000..c63fdc8124 --- /dev/null +++ b/models/sid-models/sid-minibert-20230424.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4216cca489d02cfdf7d03f2c8c5f19e52d94de8661957f9449ba289329b6cb0 +size 44743365 diff --git a/models/training-tuning-scripts/log-parsing-models/log-parsing-training.ipynb b/models/training-tuning-scripts/log-parsing-models/log-parsing-training.ipynb index 74b5f16d2a..6185f737ff 100644 --- a/models/training-tuning-scripts/log-parsing-models/log-parsing-training.ipynb +++ b/models/training-tuning-scripts/log-parsing-models/log-parsing-training.ipynb @@ -24,7 +24,9 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], 
"source": [ "from seqeval.metrics import classification_report,accuracy_score,f1_score\n", @@ -66,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -111,56 +113,59 @@ " \n", " \n", " \n", - " 257\n", + " 4655\n", " <NA>\n", " <NA>\n", - " 158.69.5.181 - - [04/Apr/2018:23:06:49 +0200] ...\n", - " 158.69.5.181\n", - " -\n", + " 193.106.31.130 - - [11/Aug/2019:19:54:28 +0200...\n", + " 193.106.31.130\n", " -\n", " -\n", " -\n", - " Other\n", - " Other\n", - " <NA>\n", - " 1.1\n", + " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ...\n", + " IE\n", + " Windows\n", + " Vista\n", + " 1.0\n", " POST\n", " /administrator/index.php\n", - " 4498\n", + " 4481\n", " 200\n", - " [04/Apr/2018:23:06:49 +0200]\n", + " [11/Aug/2019:19:54:28 +0200]\n", " \n", " \n", "\n", "" ], "text/plain": [ - " error_level error_message \\\n", - "257 \n", + " error_level error_message \\\n", + "4655 \n", + "\n", + " raw remote_host \\\n", + "4655 193.106.31.130 - - [11/Aug/2019:19:54:28 +0200... 193.106.31.130 \n", "\n", - " raw remote_host \\\n", - "257 158.69.5.181 - - [04/Apr/2018:23:06:49 +0200] ... 158.69.5.181 \n", + " remote_logname remote_user request_header_referer \\\n", + "4655 - - - \n", "\n", - " remote_logname remote_user request_header_referer \\\n", - "257 - - - \n", + " request_header_user_agent \\\n", + "4655 Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ... \n", "\n", - " request_header_user_agent request_header_user_agent__browser__family \\\n", - "257 - Other \n", + " request_header_user_agent__browser__family \\\n", + "4655 IE \n", "\n", - " request_header_user_agent__os__family \\\n", - "257 Other \n", + " request_header_user_agent__os__family \\\n", + "4655 Windows \n", "\n", - " request_header_user_agent__os__version_string request_http_ver \\\n", - "257 1.1 \n", + " request_header_user_agent__os__version_string request_http_ver \\\n", + "4655 Vista 1.0 \n", "\n", - " request_method request_url response_bytes_clf status \\\n", - "257 POST /administrator/index.php 4498 200 \n", + " request_method request_url response_bytes_clf status \\\n", + "4655 POST /administrator/index.php 4481 200 \n", "\n", - " time_received \n", - "257 [04/Apr/2018:23:06:49 +0200] " + " time_received \n", + "4655 [11/Aug/2019:19:54:28 +0200] " ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -172,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -191,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -203,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -215,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -247,7 +252,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -274,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -292,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -323,15 +328,15 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ 
- "/opt/conda/envs/rapids/lib/python3.8/site-packages/cudf/core/subword_tokenizer.py:187: UserWarning: When truncation is not True, the behaviour currently differs from HuggingFace as cudf always returns overflowing tokens\n", - " warn(warning_msg)\n" + "/opt/conda/envs/morpheus/lib/python3.8/site-packages/cudf/core/subword_tokenizer.py:189: UserWarning: When truncation is not True, the behavior currently differs from HuggingFace as cudf always returns overflowing tokens\n", + " warnings.warn(warning_msg)\n" ] } ], @@ -349,7 +354,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -365,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -376,7 +381,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -395,7 +400,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -414,7 +419,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -424,7 +429,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -437,7 +442,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -456,17 +461,17 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']\n", + "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']\n", "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } @@ -491,7 +496,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -515,37 +520,37 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Epoch: 50%|█████ | 1/2 [00:35<00:35, 35.41s/it]" + "Epoch: 50%|█████ | 1/2 [00:38<00:38, 38.73s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Train loss: 0.18636336472931586\n" + "Train loss: 0.2076284834630277\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Epoch: 100%|██████████| 2/2 [01:10<00:00, 35.27s/it]" + "Epoch: 100%|██████████| 2/2 [01:17<00:00, 38.85s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Train loss: 0.0059268270875965185\n", - "CPU times: user 44.8 s, sys: 25.7 s, total: 1min 10s\n", - "Wall time: 1min 10s\n" + "Train loss: 0.008250679652531925\n", + "CPU times: user 1min 16s, sys: 896 ms, total: 1min 17s\n", + "Wall time: 1min 17s\n" ] }, { @@ -596,7 +601,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -606,33 +611,58 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 23, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/morpheus/lib/python3.8/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: [PAD] seems not to be NE tag.\n", + " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "f1 score: 0.997863\n", + "Accuracy score: 0.999263\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/morpheus/lib/python3.8/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/opt/conda/envs/morpheus/lib/python3.8/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "f1 score: 0.998655\n", - "Accuracy score: 0.999771\n", " precision recall f1-score support\n", "\n", - " error_level 1.000 1.000 1.000 100\n", - " error_message 1.000 1.000 1.000 100\n", - " remote_host 1.000 1.000 1.000 913\n", - " request_header_referer 1.000 1.000 1.000 508\n", - " request_header_user_agent 1.000 1.000 1.000 1002\n", - "request_header_user_agent__os__version_string 0.875 1.000 0.933 14\n", - " request_http_ver 1.000 1.000 1.000 913\n", - " request_method 1.000 1.000 1.000 913\n", - " request_url 0.997 0.981 0.989 913\n", - " response_bytes_clf 1.000 1.000 1.000 911\n", - " status 1.000 1.000 1.000 912\n", - " time_received 1.000 1.000 1.000 985\n", + " PAD] 0.000 0.000 0.000 0\n", + " error_level 1.000 1.000 1.000 90\n", + " error_message 1.000 1.000 1.000 90\n", + " remote_host 1.000 1.000 1.000 890\n", + " request_header_referer 1.000 0.996 0.998 476\n", + " request_header_user_agent 1.000 1.000 1.000 1005\n", + "request_header_user_agent__os__version_string 0.000 0.000 0.000 19\n", + " request_http_ver 1.000 1.000 1.000 890\n", + " request_method 1.000 1.000 1.000 890\n", + " request_url 1.000 0.990 0.995 890\n", + " response_bytes_clf 1.000 1.000 1.000 888\n", + " status 1.000 1.000 1.000 888\n", + " time_received 0.998 1.000 0.999 952\n", "\n", - " micro avg 0.999 0.998 0.999 8184\n", - " macro avg 0.989 0.998 0.994 8184\n", - " weighted avg 0.999 0.998 0.999 8184\n", + " micro avg 0.999 0.996 0.998 7968\n", + " macro avg 0.846 0.845 0.846 7968\n", + " weighted avg 0.997 0.996 0.997 7968\n", "\n" ] } @@ -700,7 +730,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -714,6 +744,65 @@ " model.save_pretrained('log_parsing_apache_morpheus')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export model to ONNX" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tokenizer_output = tokenizer(logs_df[\"raw_preprocess\"][0:3],\n", + " max_length=MAX_SEQ_LEN,\n", + " stride = STRIDE,\n", + " truncation=False,\n", + " max_num_rows = 3,\n", + " add_special_tokens=False,\n", + " return_tensors='pt'\n", + " )\n", + "sample_input_ids = tokenizer_output['input_ids'].type(torch.long)\n", + "sample_attention_masks = tokenizer_output['attention_mask'].type(torch.long)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sample_model_input = (sample_input_ids, sample_attention_masks)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "torch.onnx.export(model, \n", + " sample_model_input, \n", + " \"model.onnx\", # where to save the model\n", + " export_params=True, # store the trained parameter weights inside the model file\n", + " opset_version=10, # the ONNX version to export the model to\n", + " do_constant_folding=True, # whether to execute constant folding for optimization\n", + " input_names = ['input_ids','attention_mask'], # the model's input names\n", + " output_names = ['output'], # the model's output names\n", + " dynamic_axes={'input_ids' : {0 : 'batch_size'}, # variable length axes\n", + " 'attention_mask': {0: 'batch_size'}, \n", + " 'output' : {0 : 
'batch_size'}})" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -733,18 +822,11 @@ "\n", "https://medium.com/rapids-ai/cybert-28b35a4c81c4" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -758,7 +840,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.8.15" } }, "nbformat": 4, diff --git a/models/training-tuning-scripts/phishing-models/phish-bert-training.ipynb b/models/training-tuning-scripts/phishing-models/phish-bert-training.ipynb index 743ad313fc..3cd8eed7d7 100644 --- a/models/training-tuning-scripts/phishing-models/phish-bert-training.ipynb +++ b/models/training-tuning-scripts/phishing-models/phish-bert-training.ipynb @@ -43,19 +43,13 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/conda/envs/morpheus/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "import cudf\n", + "from cudf.core.subword_tokenizer import SubwordTokenizer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import f1_score\n", "import binary_sequence_classifier\n", @@ -91,7 +85,9 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "if not os.path.isfile(\"smsspamcollection.zip\"): \n", @@ -103,7 +99,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "name": "stdout", @@ -127,7 +125,9 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "if not os.path.isfile(\"SMSSpamCollection\"):\n", @@ -202,10 +202,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']\n", + "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']\n", "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } @@ -241,28 +241,28 @@ "name": "stderr", "output_type": "stream", "text": [ - "Epoch: 50%|█████ | 1/2 [00:35<00:35, 35.78s/it]" + "Epoch: 50%|█████ | 1/2 [00:34<00:34, 34.08s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Train loss: 0.09204745624946165\n" + "Train loss: 0.07317519100782062\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Epoch: 100%|██████████| 2/2 [01:11<00:00, 35.92s/it]" + "Epoch: 100%|██████████| 2/2 [01:09<00:00, 34.79s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Train loss: 0.01900260798949083\n" + "Train loss: 0.015208904994817982\n" ] }, { @@ -283,8 +283,8 @@ "metadata": {}, "outputs": [], "source": [ - "# save model file and configuration file in a directory\n", - "seq_classifier.save_model(\"./phish-bert-model\")" + "# save as pytorch model\n", + "torch.save(seq_classifier._model.module, \"phishing-bert.pt\")" ] }, { @@ -345,7 +345,7 @@ { "data": { "text/plain": [ - "0.9729729729729729" + "0.9731543624161074" ] }, "execution_count": 15, @@ -359,6 +359,59 @@ "f1_score(true_labels, tests)" ] }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Export Model to ONNX" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = SubwordTokenizer(\"./resources/bert-base-uncased-hash.txt\", do_lower_case=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer_output = tokenizer(df[\"message\"][0:3],\n", + " max_length=128,\n", + " max_num_rows=3,\n", + " truncation=True,\n", + " add_special_tokens=True,\n", + " return_tensors=\"pt\")\n", + "\n", + "sample_model_input = (tokenizer_output[\"input_ids\"].type(torch.long), tokenizer_output[\"attention_mask\"].type(torch.long))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "torch.onnx.export(seq_classifier._model.module, \n", + " sample_model_input, \n", + " \"model.onnx\", # where to save the model\n", + " export_params=True, # store the trained parameter weights inside the model file\n", + " opset_version=10, # the ONNX version to export the model to\n", + " do_constant_folding=True, # whether to execute constant folding for optimization\n", + " input_names = ['input_ids','attention_mask'], # the model's input names\n", + " output_names = ['output'], # the model's output names\n", + " dynamic_axes={'input_ids' : {0 : 'batch_size'}, # variable length axes\n", + " 'attention_mask': {0: 'batch_size'}, \n", + " 'output' : {0 : 'batch_size'}})" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -418,7 +471,7 
@@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9 (default, Jan 26 2021, 15:33:00) \n[GCC 8.4.0]" + "version": "3.8.15" }, "vscode": { "interpreter": { diff --git a/models/training-tuning-scripts/root-cause-models/root-cause-bert.ipynb b/models/training-tuning-scripts/root-cause-models/root-cause-bert.ipynb index 9d729610ac..37f09e6754 100644 --- a/models/training-tuning-scripts/root-cause-models/root-cause-bert.ipynb +++ b/models/training-tuning-scripts/root-cause-models/root-cause-bert.ipynb @@ -64,8 +64,9 @@ }, "outputs": [], "source": [ - "import cudf;\n", - "from binary_sequence_classifier import BinarySequenceClassifier;\n", + "import cudf\n", + "from cudf.core.subword_tokenizer import SubwordTokenizer\n", + "from binary_sequence_classifier import BinarySequenceClassifier\n", "from os import path;\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import f1_score\n", @@ -88,7 +89,9 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "random_seed=42" @@ -100,7 +103,8 @@ "metadata": { "colab": {}, "colab_type": "code", - "id": "_UkeC7SG2krJ" + "id": "_UkeC7SG2krJ", + "tags": [] }, "outputs": [], "source": [ @@ -187,7 +191,9 @@ { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "#X_train.to_csv(\"Rootcause-training-data.csv\",index=False)" @@ -217,7 +223,7 @@ "metadata": {}, "outputs": [], "source": [ - "X_test.to_json(\"Rootcause-validation-data.jsonlines\",orient='records',lines=True)" + "X_test.to_json(\"Rootcause-validation-data.jsonlines\", orient='records',lines=True)" ] }, { @@ -264,10 +270,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']\n", + "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']\n", "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } @@ -307,14 +313,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Epoch: 100%|██████████| 1/1 [00:08<00:00, 8.64s/it]" + "Epoch: 100%|██████████| 1/1 [00:09<00:00, 9.12s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Train loss: 0.6523606370795857\n" + "Train loss: 0.5870807089588859\n" ] }, { @@ -370,7 +376,7 @@ { "data": { "text/plain": [ - "0.9601666666666666" + "0.95" ] }, "execution_count": 21, @@ -423,7 +429,7 @@ { "data": { "text/plain": [ - "0.9765765765765766" + "0.9591474245115452" ] }, "execution_count": 24, @@ -457,8 +463,8 @@ { "data": { "text/plain": [ - "array([[189, 13],\n", - " [ 0, 271]])" + "array([[180, 22],\n", + " [ 1, 270]])" ] }, "execution_count": 25, @@ -487,28 +493,29 @@ { "data": { "text/plain": [ - "(array([ 0, 1, 7, 9, 11, 12, 13, 14, 16, 18, 19, 20, 22,\n", - " 23, 24, 25, 29, 31, 34, 42, 44, 45, 46, 47, 49, 50,\n", - " 51, 52, 53, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65,\n", - " 66, 67, 68, 69, 70, 71, 73, 74, 75, 80, 81, 82, 85,\n", - " 86, 88, 89, 90, 92, 93, 94, 95, 97, 98, 99, 101, 103,\n", - " 104, 105, 106, 107, 109, 110, 112, 113, 114, 115, 117, 119, 120,\n", - " 123, 124, 125, 126, 128, 129, 130, 131, 134, 136, 137, 138, 139,\n", - " 140, 141, 142, 143, 144, 145, 146, 148, 149, 150, 153, 154, 156,\n", - " 159, 160, 161, 162, 163, 164, 165, 166, 170, 172, 173, 174, 175,\n", - " 176, 178, 181, 182, 183, 187, 188, 191, 194, 196, 200, 201, 202,\n", - " 205, 206, 207, 209, 210, 214, 215, 216, 217, 218, 221, 222, 223,\n", - " 224, 226, 227, 228, 229, 231, 234, 235, 236, 237, 240, 244, 245,\n", - " 246, 250, 251, 254, 255, 256, 257, 262, 265, 266, 267, 268, 269,\n", - " 271, 274, 275, 276, 278, 284, 285, 287, 289, 290, 291, 292, 293,\n", - " 294, 295, 296, 299, 300, 301, 302, 303, 304, 309, 311, 312, 315,\n", - " 316, 317, 318, 319, 321, 322, 323, 328, 329, 330, 331, 332, 334,\n", - " 335, 340, 343, 347, 349, 350, 351, 352, 353, 354, 355, 356, 357,\n", - " 358, 361, 363, 365, 367, 369, 373, 374, 375, 378, 379, 380, 381,\n", - " 382, 383, 384, 385, 386, 388, 389, 390, 392, 394, 395, 396, 397,\n", - " 398, 399, 400, 401, 402, 404, 406, 407, 408, 409, 410, 411, 412,\n", - " 414, 415, 416, 418, 419, 420, 421, 425, 426, 429, 432, 434, 435,\n", - " 436, 437, 438, 441, 442, 443, 444, 450, 453, 460, 471]),)" + "(array([ 0, 1, 2, 7, 9, 11, 12, 13, 14, 16, 18, 19, 20,\n", + " 22, 23, 24, 25, 29, 31, 34, 42, 43, 44, 45, 46, 47,\n", + " 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 61, 62, 63,\n", + " 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 80, 81, 82,\n", + " 85, 86, 88, 89, 90, 92, 93, 94, 95, 97, 98, 99, 101,\n", + " 103, 104, 105, 106, 107, 109, 110, 112, 113, 114, 115, 117, 119,\n", + " 120, 123, 124, 125, 126, 127, 
128, 129, 130, 131, 134, 136, 137,\n", + " 138, 139, 140, 141, 142, 143, 144, 145, 146, 148, 149, 150, 153,\n", + " 154, 155, 156, 159, 160, 161, 162, 163, 164, 165, 166, 170, 171,\n", + " 172, 173, 174, 175, 176, 178, 181, 182, 183, 187, 188, 191, 194,\n", + " 196, 200, 201, 202, 205, 206, 207, 209, 210, 214, 215, 216, 217,\n", + " 218, 221, 222, 223, 224, 226, 227, 228, 229, 231, 234, 235, 236,\n", + " 237, 238, 240, 244, 245, 246, 250, 251, 254, 255, 256, 257, 262,\n", + " 263, 265, 266, 267, 268, 269, 271, 274, 275, 276, 278, 279, 284,\n", + " 285, 287, 289, 290, 291, 292, 293, 294, 295, 296, 297, 299, 300,\n", + " 301, 302, 303, 304, 309, 311, 312, 315, 316, 317, 318, 319, 321,\n", + " 322, 323, 328, 329, 330, 331, 332, 334, 340, 343, 344, 347, 349,\n", + " 350, 351, 352, 353, 354, 355, 356, 357, 358, 361, 363, 365, 367,\n", + " 369, 372, 373, 374, 375, 378, 379, 380, 381, 382, 383, 384, 385,\n", + " 386, 388, 389, 390, 392, 394, 395, 396, 397, 398, 399, 400, 401,\n", + " 402, 404, 406, 407, 408, 409, 410, 411, 412, 414, 415, 416, 418,\n", + " 419, 420, 421, 425, 426, 429, 432, 434, 435, 436, 437, 438, 441,\n", + " 442, 443, 444, 450, 452, 453]),)" ] }, "execution_count": 27, @@ -520,6 +527,57 @@ "(np.where(testpredseries == 1))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export model to ONNX" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = SubwordTokenizer(\"./resources/bert-base-uncased-hash.txt\", do_lower_case=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer_output = tokenizer(dflogs[\"log\"][0:3],\n", + " max_length=128,\n", + " max_num_rows=3,\n", + " truncation=True,\n", + " add_special_tokens=True,\n", + " return_tensors=\"pt\")\n", + "\n", + "sample_model_input = (tokenizer_output[\"input_ids\"], tokenizer_output[\"attention_mask\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "torch.onnx.export(seq_classifier._model.module, \n", + " sample_model_input, \n", + " \"model.onnx\", # where to save the model\n", + " export_params=True, # store the trained parameter weights inside the model file\n", + " opset_version=10, # the ONNX version to export the model to\n", + " do_constant_folding=True, # whether to execute constant folding for optimization\n", + " input_names = ['input_ids','attention_mask'], # the model's input names\n", + " output_names = ['output'], # the model's output names\n", + " dynamic_axes={'input_ids' : {0 : 'batch_size'}, # variable length axes\n", + " 'attention_mask': {0: 'batch_size'}, \n", + " 'output' : {0 : 'batch_size'}})" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -567,7 +625,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.15" } }, "nbformat": 4, diff --git a/models/training-tuning-scripts/sid-models/sid-minibert-20211021-script.py b/models/training-tuning-scripts/sid-models/sid-minibert-20230424-script.py similarity index 96% rename from models/training-tuning-scripts/sid-models/sid-minibert-20211021-script.py rename to models/training-tuning-scripts/sid-models/sid-minibert-20230424-script.py index 7ebf5a9c93..e283bd40f0 100644 --- a/models/training-tuning-scripts/sid-models/sid-minibert-20211021-script.py +++ b/models/training-tuning-scripts/sid-models/sid-minibert-20230424-script.py @@ -13,19 +13,19 @@ # 
limitations under the License. """ Example Usage: -python sid-minibert-20210614-script.py \ - --training-data /datasets/training-data/sid-sample-training-data.csv \ +python sid-minibert-20230424-script.py \ + --training-data ../../datasets/training-data/sid-sample-training-data.csv \ --model-dir google/bert_uncased_L-4_H-256_A-4 \ - --tokenizer-hash-filepath /resources/bert-base-uncased-hash.txt - --output-file /trained_models/model.pth + --tokenizer-hash-filepath /resources/bert-base-uncased-hash.txt \ + --output-file sid-minibert-model.pt """ import argparse - -import torch from sklearn.metrics import accuracy_score from sklearn.metrics import f1_score from sklearn.metrics import multilabel_confusion_matrix + +import torch from torch.nn import BCEWithLogitsLoss from torch.utils.data import DataLoader from torch.utils.data import TensorDataset @@ -62,9 +62,9 @@ def data_preprocessing(training_data): cased_tokenizer = SubwordTokenizer("resources/bert-base-uncased-hash.txt", do_lower_case=True) - tokenizer_output = cased_tokenizer(df.text, + tokenizer_output = cased_tokenizer(df.data, max_length=256, - max_num_rows=len(df.text), + max_num_rows=len(df.data), padding='max_length', return_tensors='pt', truncation=True, diff --git a/models/training-tuning-scripts/sid-models/sid-minibert-20211021.ipynb b/models/training-tuning-scripts/sid-models/sid-minibert-20230424.ipynb similarity index 82% rename from models/training-tuning-scripts/sid-models/sid-minibert-20211021.ipynb rename to models/training-tuning-scripts/sid-models/sid-minibert-20230424.ipynb index 5bf632bdf0..a5564de42b 100644 --- a/models/training-tuning-scripts/sid-models/sid-minibert-20211021.ipynb +++ b/models/training-tuning-scripts/sid-models/sid-minibert-20230424.ipynb @@ -63,7 +63,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = cudf.read_csv(\"../datasets/training-data/sid-sample-training-data.csv\")" + "df = cudf.read_csv(\"../../datasets/training-data/sid-sample-training-data.csv\")" ] }, { @@ -224,7 +224,7 @@ "# load the following model for mini-bert from huggingface\n", "# model = AutoModelForSequenceClassification.from_pretrained(\"google/bert_uncased_L-4_H-256_A-4\", num_labels=num_labels)\n", "\n", - "model = torch.load('repo_model/sid-minibert-20211021.pth')" + "model = torch.load('../../sid-models/sid-minibert-20230424.pt')" ] }, { @@ -266,7 +266,16 @@ "execution_count": 13, "id": "educational-channel", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/morpheus/lib/python3.8/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", "  warnings.warn(\n" ] } ], "source": [ "# using hyperparameters recommended in original BERT paper\n", "# the optimizer allows us to apply different hyperparameters for specific parameter groups\n", @@ -293,14 +302,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Epoch: 100%|██████████| 1/1 [00:24<00:00, 24.37s/it]" + "Epoch: 100%|██████████| 1/1 [00:02<00:00, 2.61s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Train loss: 0.0006268636239110492\n" + "Train loss: 0.000367460027046036\n" ] }, { @@ -373,8 +382,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "F1 Macro Validation Accuracy: 99.87012987012986\n", - "Flat Validation Accuracy: 99.75\n" + "F1 Macro Validation Accuracy: 100.0\n", + "Flat Validation Accuracy: 100.0\n" ] } ], @@ -427,32 +436,32 @@ "[[370 0]\n", " [ 0 30]]\n", "si_bank_acct\n", - "[[354 0]\n", - " [ 0 46]]\n", + "[[358 0]\n", + " [ 0 42]]\n", "si_credit_card\n", - "[[357 0]\n", - " [ 0 43]]\n", + "[[363 0]\n", + " [ 0 37]]\n", "si_email\n", - "[[362 0]\n", - " [ 0 38]]\n", + "[[369 0]\n", + " [ 0 31]]\n", "si_govt_id\n", - "[[361 0]\n", - " [ 0 39]]\n", - "si_name\n", - "[[361 1]\n", - " [ 0 38]]\n", - "si_password\n", "[[357 0]\n", " [ 0 43]]\n", + "si_name\n", + "[[369 0]\n", + " [ 0 31]]\n", + "si_password\n", + "[[358 0]\n", + " [ 0 42]]\n", "si_phone_num\n", - "[[355 0]\n", - " [ 0 45]]\n", + "[[368 0]\n", + " [ 0 32]]\n", "si_secret_keys\n", - "[[365 0]\n", - " [ 0 35]]\n", + "[[361 0]\n", + " [ 0 39]]\n", "si_user\n", - "[[365 0]\n", - " [ 0 35]]\n" + "[[361 0]\n", + " [ 0 39]]\n" ] } ], @@ -485,6 +494,55 @@ "# torch.save(model, output_file) " ] }, + { + "cell_type": "markdown", + "id": "96d4dc3b-54a4-4ca8-8a40-bb7dca765180", + "metadata": {}, + "source": [ + "## Export Model to ONNX" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "96f92112-cf40-4b1c-b796-b196ab9f3928", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tokenizer_output = bert_uncased_tokenizer(df[\"data\"][0:3],\n", + " max_length=256,\n", + " max_num_rows=3,\n", + " truncation=True,\n", + " add_special_tokens=True,\n", + " return_tensors=\"pt\")\n", + "\n", + "sample_model_input = (tokenizer_output[\"input_ids\"], tokenizer_output[\"attention_mask\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "beef0eab-e852-4fb1-8020-7fc02d475c1e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "torch.onnx.export(model, \n", + " sample_model_input, \n", + " \"model.onnx\", # where to save the model\n", + " export_params=True, # store the trained parameter weights inside the model file\n", + " opset_version=10, # the ONNX version to export the model to\n", + " do_constant_folding=True, # whether to execute constant folding for optimization\n", + " input_names = ['input_ids','attention_mask'], # the model's input names\n", + " output_names = ['output'], # the model's output names\n", + " dynamic_axes={'input_ids' : {0 : 'batch_size'}, # variable length axes\n", + " 'attention_mask': {0: 'batch_size'}, \n", + " 'output' : {0 : 'batch_size'}})" ] }, { "cell_type": "markdown", "id": "ideal-community", "metadata": {}, "source": [ @@ -498,7 +556,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -512,7 +570,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": 
"ipython3", - "version": "3.8.10" + "version": "3.8.15" } }, "nbformat": 4, diff --git a/models/triton-model-repo/phishing-bert-onnx/1/model.onnx b/models/triton-model-repo/phishing-bert-onnx/1/model.onnx index a3aacf0265..164a725392 120000 --- a/models/triton-model-repo/phishing-bert-onnx/1/model.onnx +++ b/models/triton-model-repo/phishing-bert-onnx/1/model.onnx @@ -1 +1 @@ -../../../phishing-models/phishing-bert-20221115.onnx \ No newline at end of file +../../../phishing-models/phishing-bert-20230421.onnx \ No newline at end of file diff --git a/models/triton-model-repo/sid-minibert-onnx/1/model.onnx b/models/triton-model-repo/sid-minibert-onnx/1/model.onnx index 7340667742..b46b5afbc1 120000 --- a/models/triton-model-repo/sid-minibert-onnx/1/model.onnx +++ b/models/triton-model-repo/sid-minibert-onnx/1/model.onnx @@ -1 +1 @@ -../../../sid-models/sid-minibert-20211021.onnx \ No newline at end of file +../../../sid-models/sid-minibert-20230424.onnx \ No newline at end of file