Skip to content

Commit

Permalink
Removed invalid import from root init; Added out-of-memory warning fo…
Browse files Browse the repository at this point in the history
…r creating ContrastiveDataset in Trainer; Removed unnecessary code from Notebook
  • Loading branch information
Blatzheim committed Jan 7, 2025
1 parent e99ea60 commit ae10b1c
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 108 deletions.
115 changes: 9 additions & 106 deletions Evaluate_Description-Embedding_Body.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "9ef91033",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -78,7 +78,7 @@
" try:\n",
" dataset_split = load_dataset(f\"SetFit/{dataset_id}\", split=split)\n",
" datasets_original[dataset_id][split] = dataset_split\n",
" except ValueError as e:\n",
" except (ValueError, KeyError) as e:\n",
" print(f\"Could not load dataset '{dataset_id}'. An error occurred: {e}\")\n",
" datasets_original.pop(dataset_id)\n",
" break\n",
Expand Down Expand Up @@ -156,8 +156,9 @@
" if (not regenerate) and os.path.exists(description_file_path):\n",
" print(f\"Skipped label generation for '{dataset_id}' dataset (File already exists).\")\n",
" continue\n",
" # Samples from SetFit/enron_spam are too large.\n",
" if dataset_id == \"enron_spam\":\n",
" if dataset_id == \"enron_spam\": # Samples from SetFit/enron_spam are too large.\n",
" continue\n",
" if dataset_id not in datasets_original.keys():\n",
" continue\n",
"\n",
" # Process the dataset to get label-to-data mapping\n",
Expand Down Expand Up @@ -208,97 +209,10 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"id": "6519f3c6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: Limiting dataset size to 250 elements for testing!\n",
"1\n",
"[\"The label '1', denoted as 'positive', applies to data samples expressing favorable, satisfactory, or beneficial opinions, experiences, or outcomes.\"]\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"1\n",
"[\"The label '1', denoted as 'positive', applies to data samples expressing favorable, satisfactory, or beneficial opinions, experiences, or outcomes.\"]\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"1\n",
"['positive']\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"1\n",
"['positive']\n",
"Sucessfully formatted dataset 'CR'.\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[1 0 0 0 0 0]\n",
"[\"The essence of the label '0': 'sadness' is characterized by feelings of hopelessness, disappointment, melancholy, and vulnerability, often accompanied by a sense of isolation or being overwhelmed.\"]\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[1 0 0 0 0 0]\n",
"[\"The essence of the label '0': 'sadness' is characterized by feelings of hopelessness, disappointment, melancholy, and vulnerability, often accompanied by a sense of isolation or being overwhelmed.\"]\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[1 0 0 0 0 0]\n",
"['sadness']\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[1 0 0 0 0 0]\n",
"['sadness']\n",
"Sucessfully formatted dataset 'emotion'.\n",
"Skipping formatting dataset 'enron_spam': Description file not found.\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"1\n",
"[\"The label 'joy' encompasses examples demonstrating feelings of happiness, satisfaction, gladness, or positive emotional states experienced by individuals.\"]\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"1\n",
"[\"The label 'joy' encompasses examples demonstrating feelings of happiness, satisfaction, gladness, or positive emotional states experienced by individuals.\"]\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"1\n",
"['spam']\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"1\n",
"['spam']\n",
"Sucessfully formatted dataset 'enron_spam'.\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[0 0 0 0 1]\n",
"[\"The label '4': 'very positive' is used for data samples that express strong or intense positive sentiments, enthusiasm, or approval.\"]\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[0 1 0 0 0]\n",
"[\"The label '1', 'negative', is used for reviews or comments that express dissatisfaction, disapproval, or disappointment regarding a subject.\"]\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[0 0 0 0 1]\n",
"['very positive']\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[0 1 0 0 0]\n",
"['negative']\n",
"Sucessfully formatted dataset 'sst5'.\n",
"Skipping formatting dataset 'amazon_counterfactual': Key 'train' and/or 'test' not found.\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[1 0 0 0 0 0]\n",
"[\"The essence of the label '0': 'sadness' is characterized by feelings of hopelessness, disappointment, melancholy, and vulnerability, often accompanied by a sense of isolation or being overwhelmed.\"]\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[1 0 0 0 0 0]\n",
"[\"The essence of the label '0': 'sadness' is characterized by feelings of hopelessness, disappointment, melancholy, and vulnerability, often accompanied by a sense of isolation or being overwhelmed.\"]\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[1 0 0 0 0 0]\n",
"['sadness']\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[1 0 0 0 0 0]\n",
"['sadness']\n",
"Sucessfully formatted dataset 'emotion'.\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[0 0 1 0]\n",
"[\"The label '2': 'Business' encompasses news and information related to commerce, trade, financial markets, companies, and economic trends.\"]\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[0 0 1 0]\n",
"[\"The label '2': 'Business' encompasses news and information related to commerce, trade, financial markets, companies, and economic trends.\"]\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[0 0 1 0]\n",
"['Business']\n",
"Warning: Limiting dataset size to 250 elements for testing!\n",
"[0 0 1 0]\n",
"['Business']\n",
"Sucessfully formatted dataset 'ag_news'.\n"
]
}
],
"outputs": [],
"source": [
"formatted_datasets = {}\n",
"def format_dataset(original_dataset, label_to_description=None) -> Dataset:\n",
Expand All @@ -324,17 +238,6 @@
" else:\n",
" label_descriptions = [[label_to_description[str(d['label'])]] for d in original_dataset]\n",
"\n",
" # Limit to 250 elements for testing.\n",
" # TODO: Deal with error in setfit.\n",
" # Error occurrs in setfit.sampler, line 29: 'idxs = np.stack(np.triu_indices(n, k), axis=-1)'\n",
" # with n being the sample size, k=1 if sampled with replacedmed, 0 otherwise.\n",
" # Reason: Out-of memory. Latest numpy+setfit versions do not fix this.\n",
" input_texts = input_texts[:250]\n",
" labels = labels[:250]\n",
" label_descriptions = label_descriptions[:250]\n",
" print(\"Warning: Limiting dataset size to 250 elements for testing!\")\n",
" print(labels[0])\n",
" print(label_descriptions[0])\n",
" return Dataset.from_dict({\n",
" \"text\": input_texts,\n",
" \"label\": labels,\n",
Expand Down Expand Up @@ -386,7 +289,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"id": "ffad2066",
"metadata": {
"scrolled": true
Expand Down Expand Up @@ -467,7 +370,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
"version": "3.10.16"
}
},
"nbformat": 4,
Expand Down
1 change: 0 additions & 1 deletion fusionsent/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from ._version import __version__
from .modeling import FusionSentModel
from .trainer import Trainer
from .training_args import TrainingArguments
13 changes: 13 additions & 0 deletions fusionsent/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from setfit.losses import SupConLoss
import gc
import json
import psutil

from .training_args import TrainingArguments
from .modeling import FusionSentModel
Expand Down Expand Up @@ -195,6 +196,18 @@ def _get_setfit_dataloader(
margin=args.setfit_margin,
)
else:
# Estimate memory requirement for sampling idxs within setfit.sampler.shuffle_combinations
# and log a warning in case memory might not be sufficient.
n = len(input_data)
num_combinations = n * (n + 1) // 2
memory_required = num_combinations * 2 * np.dtype(np.int64).itemsize # Two arrays of int64 indices
available_memory = psutil.virtual_memory().available
if memory_required > available_memory * 0.8: # Reserve 20% for system processes
logger.warning(
f" Likley insufficient memory to create ContrastiveDataset (n={n}). Process might be killed."
f" Estimated: {memory_required / (1024**3):.2f} GB, Available: {available_memory / (1024**3):.2f} GB. "
)

data_sampler = ContrastiveDataset(
examples=input_data,
multilabel=Trainer._has_any_multilabel(input_data),
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setup(
name="fusionsent",
version="0.0.8",
version="0.0.9",
author="Tim Schopf, Alexander Blatzheim",
author_email="tim.schopf@tum.de, alexander.blatzheim@tum.de",
description="FusionSent: A Fusion-Based Multi-Task Sentence Embedding Model",
Expand Down

0 comments on commit ae10b1c

Please sign in to comment.