From 5554fb4197f730205456baafe960dae4676d0641 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Schr=C3=B6der?= Date: Sat, 20 Apr 2024 23:45:39 +0200 Subject: [PATCH 1/3] Update showcase section in docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Christopher Schröder --- docs/showcase.rst | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/docs/showcase.rst b/docs/showcase.rst index 687882a5..ed2980b6 100644 --- a/docs/showcase.rst +++ b/docs/showcase.rst @@ -2,9 +2,34 @@ Showcase ======== +In this section, we collect publications, tutorials, and other resources that have used small-text. + +---- + +.. contents:: Overview + :depth: 1 + :local: + :backlinks: none + +---- + Papers ------ +2023 +^^^^ + +- | David Kartchner, Irfan Al-Hussaini, Haydn Turner, Jennifer Deng, Shubham Lohiya, Prasanth Bathala, and Cassie S. Mitchell. 2023. + | `BioSift: A Dataset for Filtering Biomedical Abstracts for Drug Repurposing and Clinical Meta-Analysis. `_ + | In: Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval, pages 2913–2923. + +- | Klaus Schmidt, Andreas Niekler, Cathleen Kantner, and Manuel Burghardt. 2023. + | `Classifying Speech Acts in Political Communication: A Transformer-based Approach with Weak Supervision and Active Learning `_ + | In: Proceedings of the 18th Conference on Computer Science and Intelligence Systems, ACSIS, Vol. 35, pages 739–748. + +2022 +^^^^ + - | Hannah Kirk, Bertie Vidgen, and Scott Hale. 2022. | `Is More Data Better? Re-thinking the Importance of Efficiency in Abusive Language Detection with Transformers-Based Active Learning. `_ | In Proceedings of the Third Workshop on Threat, Aggression and Cyberbullying (TRAC 2022), pages 52–61, Gyeongju, Republic of Korea. Association for Computational Linguistics. 
From c67d91c3779c97912ed1d6f2215bd5b86c835291 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Schr=C3=B6der?= Date: Fri, 26 Apr 2024 21:48:58 +0200 Subject: [PATCH 2/3] Fix setfit seed control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Christopher Schröder --- CHANGELOG.md | 12 +++++++++++ .../transformers/classifiers/setfit.py | 2 ++ .../integrations/transformers/utils/setfit.py | 3 +++ .../transformers/classifiers/test_setfit.py | 20 +++++++++++++++++++ .../transformers/classifiers/test_setfit.py | 9 +++++++++ 5 files changed, 46 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3677a4af..19e77e2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Changelog +## Version 1.4.0 - unreleased + +### Fixed + +- Changed how the seed is controlled in `SetFitClassification`, since the seed was previously fixed unless explicitly set via the respective trainer keyword argument. + +### Changed + +- Documentation: Updated showcase section. 
+ +--- + ## Version 1.3.3 - 2023-12-29 ### Changed diff --git a/small_text/integrations/transformers/classifiers/setfit.py b/small_text/integrations/transformers/classifiers/setfit.py index d60ff668..9c2cebcb 100644 --- a/small_text/integrations/transformers/classifiers/setfit.py +++ b/small_text/integrations/transformers/classifiers/setfit.py @@ -257,11 +257,13 @@ def _get_train_and_valid_sets(self, x_train, y_train, x_valid, y_valid): return sub_train, sub_valid def _fit(self, sub_train, sub_valid, setfit_train_kwargs): + seed = np.random.randint(2**32-1) trainer = SetFitTrainer( self.model, sub_train, eval_dataset=sub_valid, batch_size=self.mini_batch_size, + seed=seed, **self.trainer_kwargs ) trainer.train(max_length=self.max_seq_len, **setfit_train_kwargs) diff --git a/small_text/integrations/transformers/utils/setfit.py b/small_text/integrations/transformers/utils/setfit.py index 2ff647fc..fea6126b 100644 --- a/small_text/integrations/transformers/utils/setfit.py +++ b/small_text/integrations/transformers/utils/setfit.py @@ -22,6 +22,9 @@ def _check_trainer_kwargs(trainer_kwargs): raise ValueError('Invalid keyword argument in trainer_kwargs: ' 'Argument "batch_size" can be set via "mini_batch_size" in ' 'SetFitClassification.') + if 'seed' in trainer_kwargs: + raise ValueError('Invalid keyword argument in trainer_kwargs: ' + 'Argument "seed" cannot be set via train_kwargs.') return trainer_kwargs diff --git a/tests/integration/small_text/integrations/transformers/classifiers/test_setfit.py b/tests/integration/small_text/integrations/transformers/classifiers/test_setfit.py index 48bff3ef..df6590c6 100644 --- a/tests/integration/small_text/integrations/transformers/classifiers/test_setfit.py +++ b/tests/integration/small_text/integrations/transformers/classifiers/test_setfit.py @@ -324,6 +324,26 @@ def test_fit_with_non_default_settings(self): self.assertEqual(1, train_mock.call_count) self.assertEqual(max_seq_len, 
train_mock.call_args_list[0].kwargs['max_length']) + def test_fit_prevent_fixed_seed(self): + ds = twenty_news_text(10, num_classes=self.num_classes, multi_label=self.multi_label) + num_classes = 5 + + setfit_model_args = SetFitModelArguments('sentence-transformers/all-MiniLM-L6-v2') + setfit_train_kwargs = {'show_progress_bar': False} + + with patch('setfit.trainer.set_seed') as set_seed_mock: + clf = SetFitClassification(setfit_model_args, num_classes, multi_label=self.multi_label) + + clf.fit(ds, setfit_train_kwargs=setfit_train_kwargs) + self.assertEqual(1, set_seed_mock.call_count) + first_seed = set_seed_mock.call_args_list[0][0] + + clf.fit(ds, setfit_train_kwargs=setfit_train_kwargs) + self.assertEqual(2, set_seed_mock.call_count) + second_seed = set_seed_mock.call_args_list[1][0] + + self.assertNotEqual(first_seed, second_seed) + @pytest.mark.pytorch @pytest.mark.optional diff --git a/tests/unit/small_text/integrations/transformers/classifiers/test_setfit.py b/tests/unit/small_text/integrations/transformers/classifiers/test_setfit.py index 701772f5..b01db75f 100644 --- a/tests/unit/small_text/integrations/transformers/classifiers/test_setfit.py +++ b/tests/unit/small_text/integrations/transformers/classifiers/test_setfit.py @@ -79,6 +79,15 @@ def test_init_with_misplaced_batch_size_kwargs(self): with self.assertRaisesRegex(ValueError, 'Invalid keyword argument in trainer_kwargs'): SetFitClassification(setfit_model_args, num_classes, trainer_kwargs=trainer_kwargs) + def test_init_with_misplaced_seed_kwargs(self): + setfit_model_args = SetFitModelArguments('sentence-transformers/all-MiniLM-L6-v2') + num_classes = 5 + + trainer_kwargs = {'seed': 4242} + + with self.assertRaisesRegex(ValueError, 'Invalid keyword argument in trainer_kwargs'): + SetFitClassification(setfit_model_args, num_classes, trainer_kwargs=trainer_kwargs) + class _SetFitClassification(object): From 668c326a6ea8d32ded5c661bf4f77769c2a22f15 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Christopher=20Schr=C3=B6der?= Date: Fri, 26 Apr 2024 21:50:22 +0200 Subject: [PATCH 3/3] Bump version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Christopher Schröder --- small_text/version.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/small_text/version.json b/small_text/version.json index e175b773..6032499f 100644 --- a/small_text/version.json +++ b/small_text/version.json @@ -1,6 +1,6 @@ { "major": 1, - "minor": 3, - "micro": 3, - "pre_release": "" + "minor": 4, + "micro": 0, + "pre_release": "dev1" } \ No newline at end of file