improve HPU usage (#1643)
* refine example
* add hpu in auto accelerator and fix bug
---------

Signed-off-by: xin3he <xin3.he@intel.com>
xin3he authored Mar 4, 2024
1 parent d4bcdd4 commit 0a3d4bd
Showing 5 changed files with 179 additions and 116 deletions.
Changed file 1 of 5:
@@ -1,5 +1,7 @@
transformers
datasets
accelerate
SentencePiece
intel_extension_for_transformers
lm_eval
lm_eval==0.3.0
openpyxl
einops
Changed file 2 of 5:
@@ -17,21 +17,15 @@
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import habana_frameworks.torch.core as htcore
import numpy as np
import lm_eval
import lm_eval.tasks
import lm_eval.evaluator
from accelerate import init_empty_weights
from utils import itrex_bootstrap_stderr, show_msg, save_to_excel
from utils import show_msg, eval_func


torch.set_grad_enabled(False)
htcore.hpu_set_env()
torch.device('hpu')


# to avoid out-of-memory caused by Popen for large language models.
lm_eval.metrics.bootstrap_stderr = itrex_bootstrap_stderr


parser = argparse.ArgumentParser()
@@ -52,6 +46,7 @@
parser.add_argument("--precision", type=str, default='fp8_e4m3',
help="Select from ['fp8_e4m3', 'fp8_e5m2', 'bf16', 'fp16'], \
['bf16', 'fp16'] only work with cast approach")
parser.add_argument("--autotune", action="store_true")
parser.add_argument("--accuracy", action="store_true")
parser.add_argument("--performance", action="store_true")
parser.add_argument("--generate", action="store_true")
@@ -182,8 +177,9 @@
### dynamic & static quantization ###
if args.approach in ["dynamic", "static"] and not args.load:
print("device:", next(user_model.parameters()).device)
from neural_compressor.torch.quantization.config import FP8Config, get_default_fp8_config
from neural_compressor.torch.quantization import quantize
from neural_compressor.torch.quantization import (
quantize, autotune, FP8Config, get_default_fp8_config, TuningConfig, get_default_fp8_config_set
)
dtype = args.precision
if args.approach == "dynamic":
from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic
@@ -300,106 +296,7 @@ def replace_torch_mm_bmm():


if args.accuracy:

class HabanaModelAdapter(lm_eval.base.BaseLM):
def __init__(self, tokenizer, model, args, options):
super().__init__()
self.tokenizer = tokenizer
self.model = model.eval()
self._batch_size = args.batch_size
self.buckets = list(sorted(args.buckets))
self.options = options
self._device = "hpu"
torch.set_grad_enabled(False)

@property
def eot_token_id(self):
return self.model.config.eos_token_id

@property
def max_length(self):
return self.buckets[-1]

@property
def max_gen_toks(self):
raise NotImplementedError()

@property
def batch_size(self):
return self._batch_size

@property
def device(self):
# We need to do padding ourselves, otherwise we'll end up with recompilations
# Returning 'cpu' to keep tensors on CPU in lm_eval code
return 'cpu' # 'hpu'

def tok_encode(self, string):
if re.search("chatglm3", args.model.lower()) or re.search("llama", args.model.lower()) :
string = string.lstrip()
return self.tokenizer.encode(string, add_special_tokens=False)

def tok_decode(self, tokens):
return self.tokenizer.decode(tokens, skip_special_tokens=True)

def _model_generate(self, context, max_length, eos_token_id):
raise NotImplementedError()

def find_bucket(self, length):
return [b for b in self.buckets if b >= length][0]

def _model_call(self, inps):
seq_length = inps.shape[-1]
padding_length = 0
bucket_length = self.find_bucket(seq_length)
padding_length = bucket_length - seq_length
inps = F.pad(inps, (0, padding_length), value=self.model.config.pad_token_id)
logits = self.model(inps.to(self._device))["logits"].cpu()

if padding_length > 0:
logits = logits[:, :-padding_length, :]
logits = logits.to(torch.float32)
return logits

lm_tasks = lm_eval.tasks.get_task_dict(args.tasks)
options = None
lm = HabanaModelAdapter(tokenizer, user_model, args, options)

eval_start = time.perf_counter()
if args.approach == "cast":
from neural_compressor.torch.amp import autocast
if args.precision == "fp8_e4m3":
dtype = torch.float8_e4m3fn
elif args.precision == "fp8_e5m2":
dtype = torch.float8_e5m2
elif args.precision == "fp16":
dtype = torch.float16
elif args.precision == "bf16":
dtype = torch.bfloat16
with autocast('hpu', dtype=dtype):
results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit)
else:
results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit)
print(lm_eval.evaluator.make_table(results))
eval_end = time.perf_counter()
print("Duration:", eval_end - eval_start)
results['args'] = vars(args)
results['duration'] = eval_end - eval_start


dumped = json.dumps(results, indent=2)
accu_dict = {}
case_name = args.approach + "-" + args.precision
for task_name in args.tasks:
if task_name == "wikitext":
print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity"]), flush=True)
accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["word_perplexity"]]
else:
print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc"]), flush=True)
accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["acc"]]
if args.dump_to_excel and local_rank in [-1, 0]:
save_to_excel(accu_dict)

eval_func(user_model, tokenizer=tokenizer, args=args)

# dump final message of HPU
show_msg()
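
For context, a minimal sketch (not part of this commit) of how the new --autotune flag could use the imports added above. The helper eval_wrapper and the keyword names tune_config and eval_fn are assumptions and may differ across Neural Compressor versions; autotune, TuningConfig, and get_default_fp8_config_set come from the diff itself.

from neural_compressor.torch.quantization import autotune, TuningConfig, get_default_fp8_config_set

def eval_wrapper(model):
    # reuse the example's eval_func from utils.py; it returns the accuracy of
    # the last evaluated task, which the tuner is assumed to maximize
    return eval_func(model, tokenizer=tokenizer, args=args)

if args.autotune:
    tune_config = TuningConfig(config_set=get_default_fp8_config_set())
    user_model = autotune(user_model, tune_config=tune_config, eval_fn=eval_wrapper)
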
Changed file 3 of 5:
@@ -33,3 +33,125 @@ def save_to_excel(dict):
df_existing = pd.DataFrame()
df_combined = pd.concat([df_existing, df_new], axis=0, ignore_index=True)
df_combined.to_excel('output.xlsx', index=False, engine='openpyxl', header=True)


def eval_func(user_model, tokenizer, args):
import os
import re
import time
import json
import torch
import habana_frameworks.torch.hpex
import torch.nn.functional as F
import lm_eval
import lm_eval.tasks
import lm_eval.evaluator

# to avoid out-of-memory caused by Popen for large language models.
lm_eval.metrics.bootstrap_stderr = itrex_bootstrap_stderr

class HabanaModelAdapter(lm_eval.base.BaseLM):
def __init__(self, tokenizer, model, args, options):
super().__init__()
self.tokenizer = tokenizer
self.model = model.eval()
self._batch_size = args.batch_size
self.buckets = list(sorted(args.buckets))
self.options = options
self._device = "hpu"
torch.set_grad_enabled(False)

@property
def eot_token_id(self):
return self.model.config.eos_token_id

@property
def max_length(self):
return self.buckets[-1]

@property
def max_gen_toks(self):
raise NotImplementedError()

@property
def batch_size(self):
return self._batch_size

@property
def device(self):
# We need to do padding ourselves, otherwise we'll end up with recompilations
# Returning 'cpu' to keep tensors on CPU in lm_eval code
return 'cpu' # 'hpu'

def tok_encode(self, string):
if (
re.search("chatglm3", args.model.lower()) or
re.search("llama", args.model.lower()) or
re.search("mistral", args.model.lower())
):
string = string.lstrip()
return self.tokenizer.encode(string, add_special_tokens=False)

def tok_decode(self, tokens):
return self.tokenizer.decode(tokens, skip_special_tokens=True)

def _model_generate(self, context, max_length, eos_token_id):
raise NotImplementedError()

def find_bucket(self, length):
return [b for b in self.buckets if b >= length][0]

def _model_call(self, inputs):
seq_length = inputs.shape[-1]
padding_length = 0
bucket_length = self.find_bucket(seq_length)
padding_length = bucket_length - seq_length
inputs = F.pad(inputs, (0, padding_length), value=self.model.config.pad_token_id)
logits = self.model(inputs.to(self._device))["logits"].cpu()

if padding_length > 0:
logits = logits[:, :-padding_length, :]
logits = logits.to(torch.float32)
return logits

lm_tasks = lm_eval.tasks.get_task_dict(args.tasks)
options = None
lm = HabanaModelAdapter(tokenizer, user_model, args, options)

eval_start = time.perf_counter()
if args.approach == "cast":
from neural_compressor.torch.amp import autocast
if args.precision == "fp8_e4m3":
dtype = torch.float8_e4m3fn
elif args.precision == "fp8_e5m2":
dtype = torch.float8_e5m2
elif args.precision == "fp16":
dtype = torch.float16
elif args.precision == "bf16":
dtype = torch.bfloat16
with autocast('hpu', dtype=dtype):
results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit)
else:
results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit)
print(lm_eval.evaluator.make_table(results))
eval_end = time.perf_counter()
print("Duration:", eval_end - eval_start)
results['args'] = vars(args)
results['duration'] = eval_end - eval_start

# make sure that result is dumped only once during multi-cards evaluation
local_rank = int(os.getenv('LOCAL_RANK', '-1'))
if local_rank in [-1, 0]:
dumped = json.dumps(results, indent=2)
accu_dict = {}
case_name = args.approach + "-" + args.precision
for task_name in args.tasks:
if task_name == "wikitext":
print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity"]), flush=True)
accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["word_perplexity"]]
else:
print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc"]), flush=True)
accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["acc"]]
if args.dump_to_excel:
save_to_excel(accu_dict)
return results["results"][task_name]["acc"]
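
As a side note, a self-contained illustration (not part of this commit) of the bucket padding performed in _model_call above: padding every batch up to the next fixed bucket length keeps input shapes static on HPU and avoids graph recompilation. The bucket sizes and pad token id below are made up for the example.

import torch
import torch.nn.functional as F

buckets = sorted([16, 32, 64, 128])           # hypothetical bucket lengths

def find_bucket(length):
    # smallest bucket that still fits the sequence
    return [b for b in buckets if b >= length][0]

inputs = torch.randint(0, 1000, (1, 27))      # batch of 1, seq_length 27
padding_length = find_bucket(inputs.shape[-1]) - inputs.shape[-1]
padded = F.pad(inputs, (0, padding_length), value=0)   # assume pad_token_id == 0
print(padded.shape)                           # torch.Size([1, 32]): one shape per bucket
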
3 changes: 1 addition & 2 deletions neural_compressor/torch/algorithms/weight_only/gptq.py
@@ -257,8 +257,7 @@ def __init__(

# device
self.device = get_device(kwargs.pop("device", "auto"))
if str(self.model.device).startswith("cuda"):
self.device = self.model.device
self.model.to(self.device)
self.is_ready = False

self.export_compressed_model = export_compressed_model
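
The gptq.py change above stops special-casing CUDA models and instead resolves the target device once and moves the model onto it. A hypothetical sketch of that pattern (resolve_device below is illustrative only, not the library's get_device):

import torch

def resolve_device(device="auto"):
    # illustrative "auto" resolution, mirroring the hpu > cuda > cpu
    # priority introduced for the accelerator registry in this commit
    if device != "auto":
        return device
    if hasattr(torch, "hpu") and torch.hpu.is_available():
        return "hpu"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

# device = resolve_device("auto")
# model = model.to(device)   # move once, up front, as the new GPTQ code does
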
49 changes: 46 additions & 3 deletions neural_compressor/torch/utils/auto_accelerator.py
@@ -31,7 +31,8 @@

from neural_compressor.torch.utils import logger

PRIORITY_CUDA = 100
PRIORITY_HPU = 100
PRIORITY_CUDA = 95
PRIORITY_CPU = 90


@@ -53,8 +54,9 @@ class CPU_Accelerator:
"""

def decorator(accelerator_cls):
cls.registered_accelerators.setdefault(name, {})
cls.registered_accelerators[name] = (accelerator_cls, priority)
if accelerator_cls.is_available():
cls.registered_accelerators.setdefault(name, {})
cls.registered_accelerators[name] = (accelerator_cls, priority)
return accelerator_cls

return decorator
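
A short sketch (not part of this commit) of what the added is_available() gate changes: an accelerator whose backend is missing is now skipped at registration time instead of being registered and failing later. MyAccelerator is a made-up class for illustration; register_accelerator and Auto_Accelerator are the names from this file.

@register_accelerator(name="my_accel", priority=50)
class MyAccelerator(Auto_Accelerator):
    @classmethod
    def is_available(cls) -> bool:
        return False   # backend not installed, so the decorator never registers it

# "my_accel" is absent from the registry, and auto detection falls back to the
# highest-priority available accelerator (hpu > cuda > cpu).
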
@@ -202,6 +204,47 @@ def empty_cache(self):
return torch.cuda.empty_cache()


@register_accelerator(name="hpu", priority=PRIORITY_HPU)
class HPU_Accelerator(Auto_Accelerator):
def __init__(self) -> None:
self._name = "hpu"

def name(self) -> str:
return self._name

@classmethod
def is_available(cls) -> bool:
from .environ import is_hpex_available

if is_hpex_available():
return torch.hpu.is_available()
else:
return False

def device_name(self, device_indx) -> str:
if device_indx is None:
return "hpu"
return f"hpu:{device_indx}"

def synchronize(self):
return torch.hpu.synchronize()

def set_device(self, device_index):
return torch.hpu.set_device(device_index)

def current_device(self):
return torch.hpu.current_device()

def current_device_name(self):
return "hpu:{}".format(torch.hpu.current_device())

def device(self, device_index=None):
return torch.hpu.device(device_index)

def empty_cache(self):
return torch.hpu.empty_cache()


def auto_detect_accelerator(device_name="auto") -> Auto_Accelerator:
# Force use the cpu on node has both cpu and gpu: `FORCE_DEVICE=cpu` python main.py ...
# The `FORCE_DEVICE` is case insensitive.
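
A usage sketch (not part of this commit), using only names visible in this diff; on a Gaudi machine the new priorities resolve to the HPU accelerator unless FORCE_DEVICE overrides the choice:

import os
from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator

# os.environ["FORCE_DEVICE"] = "cpu"    # optional, case-insensitive override
accelerator = auto_detect_accelerator()                # highest-priority available device
print(accelerator.name(), accelerator.current_device_name())   # e.g. "hpu hpu:0"
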
