eval_natural_ins.py
"""
Author: Qianxi Li
Date: June 20, 2024
Description: This script evaluates a transformer model with adapters on the Natural Instructions dataset and includes a custom batched inference routine for generating predictions.
"""
import transformers
import torch
import os
import tqdm
import json
import sys
import logging
from torchmetrics.text.rouge import ROUGEScore
from utils import (
    log_method, ClearCache, load_model_with_adapters, load_tokenizer, split_into_batches
)

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


def inference(model, tokenizer, batch_input_text):
    """
    Perform inference on a batch of input text.

    Args:
        model: Transformer model for text generation.
        tokenizer: Tokenizer associated with the model.
        batch_input_text: List of input texts for processing.

    Returns:
        List of generated responses for the input text batch.
    """
    # Tokenize input texts
    input_ids = tokenizer(
        batch_input_text,
        return_tensors="pt",
        max_length=2048,
        padding=True,
        truncation=True
    ).to('cuda:0')

    # Perform model inference without gradients
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids['input_ids'],
            do_sample=True,
            use_cache=True,
            num_return_sequences=1,
            max_new_tokens=100,
            attention_mask=input_ids['attention_mask'],
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode generated outputs
    results = [tokenizer.decode(each, skip_special_tokens=True) for each in outputs]

    # Clean up memory
    del input_ids
    return results
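

# The sketch below is illustrative only and is never called by this script. It
# assumes the same `load_model_with_adapters` / `load_tokenizer` helpers from
# utils and a CUDA device, and simply shows the expected shape of a call to
# `inference` on a small batch of prompts.
def _example_inference_usage(iteration, adapters_path, model_path):
    """Hypothetical usage sketch for `inference`; not part of the evaluation flow."""
    model = load_model_with_adapters(iteration, adapters_path, model_path)
    tokenizer = load_tokenizer(model_path)
    model.eval()
    prompts = [
        "Summarize the following sentence: The weather was unusually warm today.",
        "Answer the question: What is the capital of France?",
    ]
    # Each returned string contains the decoded prompt followed by up to 100 new tokens.
    outputs = inference(model, tokenizer, prompts)
    for prompt, text in zip(prompts, outputs):
        print(text[len(prompt):].strip())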


@log_method
def eval_natural_ins():
    """
    Evaluate the model on the Natural Instructions dataset.

    Parses the arguments from the command line, loads the dataset, performs inference,
    calculates ROUGE metrics, and saves the results to a file.
    """
    # Parse command-line arguments
    arguments = json.loads(sys.argv[1])
    iteration = int(arguments['cur_iteration'])
    natural_ins_eval_result_path = arguments['natural_ins_eval_result_path']
    natural_ins_eval_path = arguments['natural_ins_eval_path']
    adapters_path = arguments['adapters_path']
    model_path = arguments['model_path']
    inference_batch_size = int(arguments['inference_batch_size'])

    # Initialize ROUGE score calculator
    rouge = ROUGEScore()

    # Clear GPU memory before starting the evaluation
    with ClearCache():
        # Load model and tokenizer
        model = load_model_with_adapters(iteration, adapters_path, model_path)
        tokenizer = load_tokenizer(model_path)
        model.eval()

        # Load evaluation dataset
        with open(natural_ins_eval_path, 'r') as obj:
            natural_ins_data = json.load(obj)

        # Extract labels from dataset
        labels = [item['label'] for item in natural_ins_data]

        # Prepare for batch inference
        batches = split_into_batches(natural_ins_data, inference_batch_size)
        predictions = []
        for each_batch in tqdm.tqdm(batches, desc="natural_ins_eval"):
            # Generate prompts for each input
            full_prompt_list = [item['input'] for item in each_batch]
            # Perform inference
            results = inference(model, tokenizer, full_prompt_list)
            for idx, each_output in enumerate(results):
                # Extract the model's output
                output_text = each_output[len(full_prompt_list[idx]):].strip()
                predictions.append(output_text)

        # Calculate ROUGE metrics
        metrics = rouge(predictions, labels)
        metrics = {k: v.item() for k, v in metrics.items()}
        logging.info("Natural Instructions metrics: %s", metrics)

        # Save the metrics to a file
        with open(natural_ins_eval_result_path, 'w') as obj:
            json.dump(metrics, obj)

        # Clean up memory
        del labels, predictions, natural_ins_data


# Execute the evaluation
if __name__ == "__main__":
    eval_natural_ins()
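
# Example invocation (a sketch; the keys mirror the arguments parsed in
# eval_natural_ins above, while the paths and batch size below are placeholders,
# not values taken from the repository):
#
#   python eval_natural_ins.py '{
#       "cur_iteration": 1,
#       "natural_ins_eval_result_path": "results/natural_ins_metrics.json",
#       "natural_ins_eval_path": "data/natural_ins_eval.json",
#       "adapters_path": "checkpoints/adapters",
#       "model_path": "checkpoints/base_model",
#       "inference_batch_size": 4
#   }'
#
# natural_ins_eval_path is expected to point at a JSON list of records with
# "input" and "label" fields, as read in eval_natural_ins above.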