The script for training the VM, PRM/train_VM_mistral.py, seems to work only on a single GPU. I tried to use the transformers Trainer together with DeepSpeed, but with no luck: training becomes much slower and easily triggers OOM errors (on H800 80GB cards).
My script is as follows:
import json

import torch
import torch.nn as nn
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,  # only needed for the commented-out alternative below
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
class Mistral_VM(nn.Module):
    """Value model: a causal LM with a scalar regression head on the last-token logits."""

    def __init__(self, base, vocab_size=32000):
        super().__init__()
        self.base_model = base
        # Linear head mapping the vocab-sized logits of the final position to a single score
        self.LN = nn.Linear(vocab_size, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        transformer_outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Predict the value from the logits at the last sequence position
        value_outputs = self.LN(transformer_outputs.logits[:, -1, :])
        value_outputs = value_outputs.squeeze(dim=1)
        if labels is not None:
            loss = torch.nn.functional.mse_loss(value_outputs, labels, reduction="mean")
        else:
            loss = None
        return {"loss": loss, "score": value_outputs}
max_length = 900

# Load the pre-trained MetaMath-Mistral-7B model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("/mnt/data102_d2/huggingface/models/MetaMath-Mistral-7B", trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained("/mnt/data102_d2/huggingface/models/MetaMath-Mistral-7B", trust_remote_code=True, torch_dtype=torch.bfloat16)
vocab_size = base_model.config.vocab_size
VM = Mistral_VM(base_model, vocab_size)
tokenizer.pad_token_id = tokenizer.eos_token_id
# VM = AutoModelForSequenceClassification.from_pretrained("/mnt/data102_d2/huggingface/models/MetaMath-Mistral-7B", trust_remote_code=True, torch_dtype=torch.bfloat16)
# VM.config.pad_token_id = VM.config.eos_token_id
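# (Untested suggestion, not part of the run described above: gradient
# checkpointing trades extra compute for much lower activation memory, which
# may help with the OOM. Both calls are standard Hugging Face model APIs.)
# base_model.gradient_checkpointing_enable()
# base_model.config.use_cache = False  # the KV cache is unused in training and conflicts with checkpointing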
# Load training set, validation set, and test set data
train_js = 'data/train_en.json'
test_js = 'data/test_en.json'
val_js = 'data/valid_en.json'
def read_json(source, debug=False):
    json_list = []
    with open(source, 'r', encoding='utf-8') as f:
        for line in f:
            json_list.append(json.loads(line))
    dataset = Dataset.from_list(json_list[:1000] if debug else json_list)

    def process(data):
        prompt_answer = data['prompt_answer']
        label = data['label']
        encoded_pair = tokenizer.encode_plus(
            prompt_answer,
            padding='max_length',
            max_length=max_length,  # Set the max length
            truncation=True,
            return_tensors='pt',  # Return PyTorch Tensor format
        )
        return {
            'input_ids': encoded_pair['input_ids'].squeeze(),
            'attention_mask': encoded_pair['attention_mask'].squeeze(),
            'labels': label
        }

    dataset = dataset.map(process, num_proc=12, remove_columns=["prompt_answer", "label"])
    return dataset
train_dataset = read_json(train_js)  # each file is JSON Lines: one {"prompt_answer": ..., "label": ...} object per line
val_dataset = read_json(val_js)
test_dataset = read_json(test_js)
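# (Untested suggestion: padding every example to max_length makes each batch
# 900 tokens wide regardless of the real input lengths. Tokenizing without
# padding='max_length' and letting a collator pad per batch would shrink most
# batches considerably. DataCollatorWithPadding is the stock transformers
# collator for this.)
# from transformers import DataCollatorWithPadding
# data_collator = DataCollatorWithPadding(tokenizer)  # pass as data_collator=... to the Trainer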
# Set training parameters
training_arguments = TrainingArguments(
    output_dir="test",
    num_train_epochs=2,
    per_device_train_batch_size=3,
    save_strategy="no",
    evaluation_strategy="epoch",  # without this the eval_dataset is never used during training
    logging_strategy="epoch",
    learning_rate=1e-6,
    report_to="none",
    logging_steps=10,
    optim="adamw_torch",  # "adamw_hf" is deprecated
    do_train=True,
    do_eval=True,
    save_only_model=True,
    # load_best_model_at_end=True,  # removed: incompatible with save_strategy="no" and save_only_model=True
    push_to_hub=False,
    bf16=True,
    tf32=True,
    log_level="info",
    ddp_find_unused_parameters=False,
)
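# (Hedged sketch of one way to wire DeepSpeed in: transformers accepts a ZeRO
# config dict via TrainingArguments(deepspeed=...). ZeRO stage 2 with
# optimizer-state offload is a common starting point for a 7B model; the exact
# values here are assumptions, not settings verified to fix the slowdown/OOM.)
# ds_config = {
#     "bf16": {"enabled": True},
#     "zero_optimization": {
#         "stage": 2,
#         "offload_optimizer": {"device": "cpu"},
#     },
#     "train_micro_batch_size_per_gpu": "auto",
#     "gradient_accumulation_steps": "auto",
# }
# # then: TrainingArguments(..., deepspeed=ds_config)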
# The Trainer creates the optimizer itself; the MSE loss comes from Mistral_VM.forward
trainer = Trainer(
    VM,
    training_arguments,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()
trainer.save_model()
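For completeness, the test split loaded above is never used; a minimal sketch for scoring it after training (trainer.evaluate accepts any dataset and reports the MSE loss returned by the model's forward):

test_metrics = trainer.evaluate(test_dataset)
print(test_metrics)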