Removed invalid import from root init; Added out-of-memory warning for creating ContrastiveDataset in Trainer; Removed unnecessary code from Notebook
sebischair · Jan 7, 2025 · ae10b1c · ae10b1c
1 parent e99ea60
commit ae10b1c
Show file tree

Hide file tree

Showing 4 changed files with 23 additions and 108 deletions.
diff --git a/Evaluate_Description-Embedding_Body.ipynb b/Evaluate_Description-Embedding_Body.ipynb
@@ -28,7 +28,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "9ef91033",
    "metadata": {},
    "outputs": [],
@@ -78,7 +78,7 @@
     "        try:\n",
     "            dataset_split = load_dataset(f\"SetFit/{dataset_id}\", split=split)\n",
     "            datasets_original[dataset_id][split] = dataset_split\n",
-    "        except ValueError as e:\n",
+    "        except (ValueError, KeyError) as e:\n",
     "            print(f\"Could not load dataset '{dataset_id}'. An error occurred: {e}\")\n",
     "            datasets_original.pop(dataset_id)\n",
     "            break\n",
@@ -156,8 +156,9 @@
     "    if (not regenerate) and os.path.exists(description_file_path):\n",
     "        print(f\"Skipped label generation for '{dataset_id}' dataset (File already exists).\")\n",
     "        continue\n",
-    "    # Samples from SetFit/enron_spam are too large.\n",
-    "    if dataset_id == \"enron_spam\":\n",
+    "    if dataset_id == \"enron_spam\": # Samples from SetFit/enron_spam are too large.\n",
+    "        continue\n",
+    "    if dataset_id not in datasets_original.keys():\n",
     "        continue\n",
     "\n",
     "    # Process the dataset to get label-to-data mapping\n",
@@ -208,97 +209,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
    "id": "6519f3c6",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "1\n",
-      "[\"The label '1', denoted as 'positive', applies to data samples expressing favorable, satisfactory, or beneficial opinions, experiences, or outcomes.\"]\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "1\n",
-      "[\"The label '1', denoted as 'positive', applies to data samples expressing favorable, satisfactory, or beneficial opinions, experiences, or outcomes.\"]\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "1\n",
-      "['positive']\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "1\n",
-      "['positive']\n",
-      "Sucessfully formatted dataset 'CR'.\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[1 0 0 0 0 0]\n",
-      "[\"The essence of the label '0': 'sadness' is characterized by feelings of hopelessness, disappointment, melancholy, and vulnerability, often accompanied by a sense of isolation or being overwhelmed.\"]\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[1 0 0 0 0 0]\n",
-      "[\"The essence of the label '0': 'sadness' is characterized by feelings of hopelessness, disappointment, melancholy, and vulnerability, often accompanied by a sense of isolation or being overwhelmed.\"]\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[1 0 0 0 0 0]\n",
-      "['sadness']\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[1 0 0 0 0 0]\n",
-      "['sadness']\n",
-      "Sucessfully formatted dataset 'emotion'.\n",
-      "Skipping formatting dataset 'enron_spam': Description file not found.\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "1\n",
-      "[\"The label 'joy' encompasses examples demonstrating feelings of happiness, satisfaction, gladness, or positive emotional states experienced by individuals.\"]\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "1\n",
-      "[\"The label 'joy' encompasses examples demonstrating feelings of happiness, satisfaction, gladness, or positive emotional states experienced by individuals.\"]\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "1\n",
-      "['spam']\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "1\n",
-      "['spam']\n",
-      "Sucessfully formatted dataset 'enron_spam'.\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[0 0 0 0 1]\n",
-      "[\"The label '4': 'very positive' is used for data samples that express strong or intense positive sentiments, enthusiasm, or approval.\"]\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[0 1 0 0 0]\n",
-      "[\"The label '1', 'negative', is used for reviews or comments that express dissatisfaction, disapproval, or disappointment regarding a subject.\"]\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[0 0 0 0 1]\n",
-      "['very positive']\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[0 1 0 0 0]\n",
-      "['negative']\n",
-      "Sucessfully formatted dataset 'sst5'.\n",
-      "Skipping formatting dataset 'amazon_counterfactual': Key 'train' and/or 'test' not found.\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[1 0 0 0 0 0]\n",
-      "[\"The essence of the label '0': 'sadness' is characterized by feelings of hopelessness, disappointment, melancholy, and vulnerability, often accompanied by a sense of isolation or being overwhelmed.\"]\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[1 0 0 0 0 0]\n",
-      "[\"The essence of the label '0': 'sadness' is characterized by feelings of hopelessness, disappointment, melancholy, and vulnerability, often accompanied by a sense of isolation or being overwhelmed.\"]\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[1 0 0 0 0 0]\n",
-      "['sadness']\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[1 0 0 0 0 0]\n",
-      "['sadness']\n",
-      "Sucessfully formatted dataset 'emotion'.\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[0 0 1 0]\n",
-      "[\"The label '2': 'Business' encompasses news and information related to commerce, trade, financial markets, companies, and economic trends.\"]\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[0 0 1 0]\n",
-      "[\"The label '2': 'Business' encompasses news and information related to commerce, trade, financial markets, companies, and economic trends.\"]\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[0 0 1 0]\n",
-      "['Business']\n",
-      "Warning: Limiting dataset size to 250 elements for testing!\n",
-      "[0 0 1 0]\n",
-      "['Business']\n",
-      "Sucessfully formatted dataset 'ag_news'.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "formatted_datasets = {}\n",
     "def format_dataset(original_dataset, label_to_description=None) -> Dataset:\n",
@@ -324,17 +238,6 @@
     "    else:\n",
     "        label_descriptions = [[label_to_description[str(d['label'])]] for d in original_dataset]\n",
     "\n",
-    "    # Limit to 250 elements for testing.\n",
-    "    # TODO: Deal with error in setfit.\n",
-    "    #   Error occurrs in setfit.sampler, line 29: 'idxs = np.stack(np.triu_indices(n, k), axis=-1)'\n",
-    "    #   with n being the sample size, k=1 if sampled with replacedmed, 0 otherwise.\n",
-    "    #   Reason: Out-of memory. Latest numpy+setfit versions do not fix this.\n",
-    "    input_texts = input_texts[:250]\n",
-    "    labels = labels[:250]\n",
-    "    label_descriptions = label_descriptions[:250]\n",
-    "    print(\"Warning: Limiting dataset size to 250 elements for testing!\")\n",
-    "    print(labels[0])\n",
-    "    print(label_descriptions[0])\n",
     "    return Dataset.from_dict({\n",
     "        \"text\": input_texts,\n",
     "        \"label\": labels,\n",
@@ -386,7 +289,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": null,
    "id": "ffad2066",
    "metadata": {
     "scrolled": true
@@ -467,7 +370,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.15"
+   "version": "3.10.16"
   }
  },
  "nbformat": 4,

diff --git a/fusionsent/__init__.py b/fusionsent/__init__.py
@@ -1,4 +1,3 @@
-from ._version import __version__
 from .modeling import FusionSentModel
 from .trainer import Trainer
 from .training_args import TrainingArguments
diff --git a/fusionsent/trainer.py b/fusionsent/trainer.py
@@ -19,6 +19,7 @@
 from setfit.losses import SupConLoss
 import gc
 import json
+import psutil
 
 from .training_args import TrainingArguments
 from .modeling import FusionSentModel
@@ -195,6 +196,18 @@ def _get_setfit_dataloader(
                     margin=args.setfit_margin,
                 )
         else:
+            # Estimate the memory required for sampling idxs within setfit.sampler.shuffle_combinations
+            # and log a warning in case available memory might not be sufficient.
+            n = len(input_data)
+            num_combinations = n * (n + 1) // 2
+            memory_required = num_combinations * 2 * np.dtype(np.int64).itemsize  # Two arrays of int64 indices
+            available_memory = psutil.virtual_memory().available
+            if memory_required > available_memory * 0.8:  # Reserve 20% for system processes
+                logger.warning(
+                    f" Likley insufficient memory to create ContrastiveDataset (n={n}). Process might be killed."
+                    f" Estimated: {memory_required / (1024**3):.2f} GB, Available: {available_memory / (1024**3):.2f} GB. "
+                )
+
             data_sampler = ContrastiveDataset(
                 examples=input_data,
                 multilabel=Trainer._has_any_multilabel(input_data),

diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 
 setup(
     name="fusionsent",
-    version="0.0.8",
+    version="0.0.9",
     author="Tim Schopf, Alexander Blatzheim",
     author_email="tim.schopf@tum.de, alexander.blatzheim@tum.de",
     description="FusionSent: A Fusion-Based Multi-Task Sentence Embedding Model",