distributed training of VM #8

Open · skepsun opened this issue Oct 18, 2024 · 0 comments

skepsun commented Oct 18, 2024

The training script for the VM, PRM/train_VM_mistral.py, seems to work only on a single GPU. I tried to use the transformers Trainer together with DeepSpeed, but with no luck: training becomes much slower and easily triggers OOM errors (on H800-80G cards).
My script is as follows:

import json

import torch
import torch.nn as nn
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

class Mistral_VM(nn.Module):
    """LM backbone with a scalar value head on top of the last-position logits."""

    def __init__(self, base, vocab_size=32000):
        super().__init__()
        self.base_model = base
        # Keep the value head in the same dtype as the base model to avoid bf16/fp32 mismatches.
        self.LN = nn.Linear(vocab_size, 1, dtype=base.dtype)

    def forward(self, input_ids, attention_mask, labels=None):
        transformer_outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Map the logits at the last position to a single scalar value per sequence.
        value_outputs = self.LN(transformer_outputs.logits[:, -1, :])
        value_outputs = value_outputs.squeeze(dim=1)
        if labels is not None:
            # Compute the MSE loss in fp32 to avoid dtype mismatches with the collated labels.
            loss = torch.nn.functional.mse_loss(value_outputs.float(), labels.float(), reduction="mean")
        else:
            loss = None
        return {"loss": loss, "score": value_outputs}

    
max_length = 900

# Load the pre-trained Mistral-7b model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("/mnt/data102_d2/huggingface/models/MetaMath-Mistral-7B", trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained("/mnt/data102_d2/huggingface/models/MetaMath-Mistral-7B", trust_remote_code=True, torch_dtype=torch.bfloat16)
vocab_size = base_model.config.vocab_size
# print(vocab_size)
VM = Mistral_VM(base_model, vocab_size)
tokenizer.pad_token_id = tokenizer.eos_token_id
# VM = AutoModelForSequenceClassification.from_pretrained("/mnt/data102_d2/huggingface/models/MetaMath-Mistral-7B", trust_remote_code=True, torch_dtype=torch.bfloat16)
# VM.config.pad_token_id = VM.config.eos_token_id
# Load training set, validation set, and test set data
train_js = 'data/train_en.json'
test_js = 'data/test_en.json'
val_js = 'data/valid_en.json'

def read_json(source, debug=False):
    json_list = []
    with open(source, 'r', encoding='utf-8') as f:
        for line in f:
            json_list.append(json.loads(line))
    dataset = Dataset.from_list(json_list[:1000] if debug else json_list)
    def process(data):
        prompt_answer = data['prompt_answer']
        label = data['label']

        encoded_pair = tokenizer.encode_plus(
            prompt_answer,
            padding='max_length',
            max_length=max_length,  # Set the max length
            truncation=True,
            return_tensors='pt',  # Return PyTorch Tensor format
        )

        return {
            'input_ids': encoded_pair['input_ids'].squeeze(),
            'attention_mask': encoded_pair['attention_mask'].squeeze(),
            'labels': label
        }
    dataset = dataset.map(process, num_proc=12, remove_columns=["prompt_answer", "label"])
    return dataset

train_dataset = read_json(train_js)  # Load the JSONL splits defined above
val_dataset = read_json(val_js)
test_dataset = read_json(test_js)


# Set training parameters
training_arguments = TrainingArguments(
    output_dir="test",
    num_train_epochs=2,
    per_device_train_batch_size=3,
    evaluation_strategy="epoch",   # evaluate each epoch so load_best_model_at_end has something to pick from
    save_strategy="epoch",         # must match the evaluation strategy when load_best_model_at_end is set
    logging_strategy="epoch",
    learning_rate=1e-6,
    report_to="none",
    optim="adamw_hf",
    do_train=True,
    do_eval=True,
    save_only_model=True,
    load_best_model_at_end=True,
    push_to_hub=False,
    bf16=True,
    tf32=True,
    log_level="info",
    ddp_find_unused_parameters=False,
)

# The Trainer builds the optimizer internally and uses the "loss" returned by the model.
trainer = Trainer(
    model=VM,
    args=training_arguments,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()
trainer.save_model()
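
A minimal sketch of one possible multi-GPU DeepSpeed ZeRO-3 setup for this script; the ZeRO stage, batch sizes, and launch command are assumptions on my part, not taken from the repository:

# Sketch only: illustrative values, replacing the TrainingArguments above.
# A ZeRO-3 config passed directly to TrainingArguments; "auto" entries are filled in
# from the TrainingArguments by the Hugging Face DeepSpeed integration.
ds_config = {
    "bf16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,  # stage 2 may already be enough for a 7B model on 80G cards
        "overlap_comm": True,
        "stage3_gather_16bit_weights_on_model_save": True,
    },
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
}

# Enable gradient checkpointing on the backbone itself, since the custom nn.Module
# wrapper does not expose gradient_checkpointing_enable() to the Trainer.
base_model.gradient_checkpointing_enable()
base_model.config.use_cache = False

training_arguments = TrainingArguments(
    output_dir="test",
    num_train_epochs=2,
    per_device_train_batch_size=1,   # smaller per-device batch, scaled out over GPUs
    gradient_accumulation_steps=4,
    learning_rate=1e-6,
    bf16=True,
    deepspeed=ds_config,             # accepts a dict or a path to a JSON config file
    save_strategy="no",
    logging_strategy="epoch",
    report_to="none",
)

# Launched with, e.g.:
#   deepspeed --num_gpus 8 PRM/train_VM_mistral.py
# or
#   torchrun --nproc_per_node 8 PRM/train_VM_mistral.py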
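
A possible alternative, following the commented-out lines in the script above: load the backbone as a one-label regression model via AutoModelForSequenceClassification, so the value model is a regular PreTrainedModel and Trainer, DeepSpeed, and gradient checkpointing apply to it directly. This is only a sketch of that idea, not the repository's training recipe:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = "/mnt/data102_d2/huggingface/models/MetaMath-Mistral-7B"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

VM = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=1,                  # single scalar value per sequence
    problem_type="regression",     # MSE loss is applied automatically for float labels
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
VM.config.pad_token_id = tokenizer.pad_token_id   # needed so the last non-pad token is pooled
VM.gradient_checkpointing_enable()
VM.config.use_cache = False

# The labels then need to be floats, e.g. 'labels': float(label) in the map() function.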