From 1b78ef04ba4a895bd0e4390765cc77b27c447452 Mon Sep 17 00:00:00 2001
From: Mayank Mishra
Date: Thu, 1 Sep 2022 23:19:46 +0530
Subject: [PATCH 1/7] fix grpc

---
 .../ds_inference/grpc_server.py | 55 +++++++++----------
 1 file changed, 25 insertions(+), 30 deletions(-)

diff --git a/scripts/bloom-inference-server/ds_inference/grpc_server.py b/scripts/bloom-inference-server/ds_inference/grpc_server.py
index 47ab08c73..d9d3e6016 100644
--- a/scripts/bloom-inference-server/ds_inference/grpc_server.py
+++ b/scripts/bloom-inference-server/ds_inference/grpc_server.py
@@ -6,59 +6,57 @@
 from transformers import AutoTokenizer

 import mii

-from utils import GenerateRequest, GenerateResponse, Model, get_filter_dict, get_str_dtype, print_rank_n
+from utils import (
+    GenerateRequest,
+    GenerateResponse,
+    Model,
+    get_downloaded_model_path,
+    get_filter_dict,
+    get_str_dtype,
+    print_rank_n
+)


 class DSInferenceGRPCServer(Model):
     def __init__(self, args: argparse.Namespace) -> None:
         self.deployment_name = "ds_inference_grpc_server"

-        files = os.listdir(args.save_mp_checkpoint_path)
-        for file in files:
-            if (file.endswith(".json")):
-                checkpoints_json = json.load(
-                    open(os.path.join(args.save_mp_checkpoint_path, file), "r"))
-                break
+        downloaded_model_path = get_downloaded_model_path(args.model_name)

-        if ("base_dir" in checkpoints_json):
-            del checkpoints_json["base_dir"]
+        self.tokenizer = AutoTokenizer.from_pretrained(downloaded_model_path)
+        self.pad = self.tokenizer.pad_token_id
+
+        if (args.dtype in [torch.float16, torch.int8]):
+            checkpoints_json = os.path.join(
+                downloaded_model_path, "BLOOM_ds-inference_config.json")

-        if (args.dtype == torch.float16):
             mii.deploy(
                 task="text-generation",
                 model=args.model_name,
                 deployment_name=self.deployment_name,
+                model_path=downloaded_model_path,
                 mii_config={
                     "dtype": get_str_dtype(args.dtype),
                     "tensor_parallel": 8,
                     "port_number": 50950,
-                    "checkpoint_dict": checkpoints_json
-                },
-                model_path=args.save_mp_checkpoint_path
+                    "checkpoint_dict": json.load(open(checkpoints_json, "r"))
+                }
             )
-        else:
-            raise NotImplementedError("This is not yet supported")
+        elif (args.dtype == torch.bfloat16):
+            raise NotImplementedError("bfloat16 is not yet supported")

-        self.tokenizer = AutoTokenizer.from_pretrained(args.model_name)
-        self.pad = self.tokenizer.pad_token_id
         self.model = mii.mii_query_handle(self.deployment_name)

     def generate(self, request: GenerateRequest) -> GenerateResponse:
-        text = request.text
-
-        return_type = type(text)
-        if (return_type == str):
-            text = [text]
-
         output_text = self.model.query(
-            {"query": text},
+            {"query": request.text},
             **get_filter_dict(request)
         ).response

         output_text = [_ for _ in output_text]

         # Remove input from output
-        input_tokens = self.tokenizer(text).input_ids
+        input_tokens = self.tokenizer(request.text).input_ids
         output_tokens = self.tokenizer(output_text).input_ids

         input_token_lengths = [len(x) for x in input_tokens]
@@ -72,10 +70,6 @@ def generate(self, request: GenerateRequest) -> GenerateResponse:
         output_text = self.tokenizer.batch_decode(
             output_tokens, skip_special_tokens=True)

-        if (return_type == str):
-            output_text = output_text[0]
-            num_generated_tokens = num_generated_tokens[0]
-
         return GenerateResponse(
             text=output_text,
             num_generated_tokens=num_generated_tokens
@@ -87,4 +81,5 @@ def shutdown(self) -> None:
         try:
             mii.terminate(self.deployment_name)
         except Exception:
-            exit()
+            pass
+        exit()
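For orientation, the DeepSpeed-MII lifecycle that grpc_server.py wraps is: deploy once, query as often as needed, terminate on shutdown. A minimal sketch of that flow outside the class (the deployment name, port, tensor-parallel degree and the mii calls mirror the patch above; the prompt and the fp16 dtype string are assumptions for illustration):

```python
import mii

deployment_name = "ds_inference_grpc_server"

# 1. start a persistent gRPC text-generation deployment (what __init__ does)
mii.deploy(
    task="text-generation",
    model="bigscience/bloom",
    deployment_name=deployment_name,
    mii_config={"dtype": "fp16", "tensor_parallel": 8, "port_number": 50950}
)

# 2. query it as often as needed (what generate() does)
generator = mii.mii_query_handle(deployment_name)
result = generator.query({"query": ["DeepSpeed is"]})
print(result.response)

# 3. tear the deployment down (what shutdown() does)
mii.terminate(deployment_name)
```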
From 76e78bab4696a7037f1a8772e52dfae64ec7ddd3 Mon Sep 17 00:00:00 2001
From: Mayank Mishra
Date: Fri, 2 Sep 2022 08:21:56 +0530
Subject: [PATCH 2/7] use functools.partial

---
 scripts/bloom-inference-server/benchmark.py   | 13 +++++------
 .../ds_inference/model.py                     | 12 +++-------
 scripts/bloom-inference-server/server.py      |  5 +++--
 scripts/bloom-inference-server/utils/model.py | 22 +++++++++----------
 scripts/bloom-inference-server/utils/utils.py | 17 +++++++-------
 5 files changed, 30 insertions(+), 39 deletions(-)

diff --git a/scripts/bloom-inference-server/benchmark.py b/scripts/bloom-inference-server/benchmark.py
index 23c519a09..e61f50ccd 100644
--- a/scripts/bloom-inference-server/benchmark.py
+++ b/scripts/bloom-inference-server/benchmark.py
@@ -1,6 +1,7 @@
 import argparse
 import gc
 import os
+from functools import partial

 import deepspeed
 import torch
@@ -57,7 +58,7 @@ def benchmark_end_to_end(args: argparse.Namespace,
                          model_class: Model,
                          zero_activated: bool = False) -> None:
     model, initialization_time = run_and_log_time(
-        (model_class, {"args": args})
+        partial(model_class, args=args)
     )

     request = parse_generate_kwargs(
@@ -87,13 +88,11 @@

     # benchmark
     total_new_tokens_generated, benchmark_time = run_and_log_time(
-        (
+        partial(
             benchmark_generation,
-            {
-                "model": model,
-                "request": request,
-                "cycles": args.benchmark_cycles
-            }
+            model=model,
+            request=request,
+            cycles=args.benchmark_cycles
         )
     )
diff --git a/scripts/bloom-inference-server/ds_inference/model.py b/scripts/bloom-inference-server/ds_inference/model.py
index c42aa5250..ebdfa1213 100644
--- a/scripts/bloom-inference-server/ds_inference/model.py
+++ b/scripts/bloom-inference-server/ds_inference/model.py
@@ -3,6 +3,7 @@
 import json
 import os
 from argparse import Namespace
+from functools import partial

 import deepspeed
 import torch
@@ -77,17 +78,10 @@ def write_checkpoints_json(self, model_path: str) -> None:

     def __enter__(self):
         run_rank_n(
-            os.makedirs,
-            {
-                "name": self.tmp_directory,
-                "exist_ok": True
-            }
+            partial(os.makedirs, name=self.tmp_directory, exist_ok=True)
         )
         run_rank_n(
-            self.write_checkpoints_json,
-            {
-                "model_path": self.model_path
-            },
+            partial(self.write_checkpoints_json, model_path=self.model_path),
             barrier=True
         )
         return self.tmp_file
diff --git a/scripts/bloom-inference-server/server.py b/scripts/bloom-inference-server/server.py
index 4b155c503..4139f1134 100644
--- a/scripts/bloom-inference-server/server.py
+++ b/scripts/bloom-inference-server/server.py
@@ -1,6 +1,7 @@
 import argparse
 import sys
 import traceback
+from functools import partial

 import utils
 from ds_inference import DSInferenceGRPCServer
@@ -113,7 +114,7 @@ def generate(self, request: GenerateRequest) -> GenerateResponse:
             request.max_new_tokens, self.allowed_max_new_tokens)

         response, total_time_taken = run_and_log_time(
-            (self.model.generate, {"request": request})
+            partial(self.model.generate, request=request)
         )

         response.query_id = self.query_ids.generate_query_id
@@ -130,7 +131,7 @@ def generate(self, request: GenerateRequest) -> GenerateResponse:
     def tokenize(self, request: TokenizeRequest) -> TokenizeResponse:
         try:
             response, total_time_taken = run_and_log_time(
-                (self.model.tokenize, {"request": request})
+                partial(self.model.tokenize, request=request)
             )

             response.query_id = self.query_ids.tokenize_query_id
diff --git a/scripts/bloom-inference-server/utils/model.py b/scripts/bloom-inference-server/utils/model.py
index 267256a9b..d5f03c559 100644
--- a/scripts/bloom-inference-server/utils/model.py
+++ b/scripts/bloom-inference-server/utils/model.py
@@ -1,5 +1,6 @@
 import argparse
 import os
+from functools import partial

 import torch
 from transformers.utils import is_offline_mode
@@ -91,19 +92,16 @@ def shutdown(self) -> None:


 def get_downloaded_model_path(model_name: str):
-    kwargs = {
-        "repo_id": model_name,
-        "allow_patterns": ["*"],
-        "local_files_only": is_offline_mode(),
-        "cache_dir": os.getenv("TRANSFORMERS_CACHE", None)
-    }
-    # download only on 1 process
-    run_rank_n(
+    f = partial(
         snapshot_download,
-        kwargs,
-        barrier=True
+        repo_id=model_name,
+        allow_patterns=["*"],
+        local_files_only=is_offline_mode(),
+        cache_dir=os.getenv("TRANSFORMERS_CACHE", None)
     )
+    # download only on 1 process
+    run_rank_n(f)
     # now since the snapshot is downloaded, pass the
     # model_path to all processes
-    model_path = snapshot_download(**kwargs)
-    return model_path
+    return f()
+
diff --git a/scripts/bloom-inference-server/utils/utils.py b/scripts/bloom-inference-server/utils/utils.py
index 722df86e4..f15781f9a 100644
--- a/scripts/bloom-inference-server/utils/utils.py
+++ b/scripts/bloom-inference-server/utils/utils.py
@@ -3,6 +3,7 @@
 import json
 import math
 import time
+from functools import partial
 from typing import Any, List, Tuple, Union

 import torch
@@ -85,14 +86,13 @@ def get_args(parser: argparse.ArgumentParser, script: str) -> argparse.Namespace
     return args


-def run_rank_n(func: callable,
-               kwargs: dict,
+def run_rank_n(func: partial,
                barrier: bool = False,
                rank: int = 0,
                other_rank_output: Any = None) -> Any:
     if (dist.is_initialized()):
         if (dist.get_rank() == rank):
-            output = func(**kwargs)
+            output = func()
             if (barrier):
                 dist.barrier()
             return output
@@ -101,7 +101,7 @@ def run_rank_n(func: callable,
             dist.barrier()
             return other_rank_output
     else:
-        return func(**kwargs)
+        return func()


 def print_rank_n(*values, rank: int = 0) -> None:
@@ -158,16 +158,15 @@ def get_num_tokens_to_generate(max_new_tokens: int,
     return min(max_new_tokens, allowed_max_new_tokens)


-def run_and_log_time(execs: Union[List[Tuple[callable, dict]],
-                                  Tuple[callable, dict]]) -> Tuple[Union[List[Any], Any], float]:
+def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[Any], Any], float]:
     start_time = time.time()

     if (type(execs) == list):
         results = []
-        for f, k in execs:
-            results.append(f(**k))
+        for f in execs:
+            results.append(f())
     else:
-        results = execs[0](**execs[1])
+        results = execs()

     time_elapsed = time.time() - start_time
     return results, time_elapsed
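The convention this patch settles on is: bind the arguments into a zero-argument callable with functools.partial up front, then let helpers such as run_rank_n and run_and_log_time simply call it. A minimal, self-contained sketch of that calling style (the timing helper imitates the patched utils/utils.py; the generate function and its arguments are made up for illustration):

```python
import time
from functools import partial
from typing import Any, List, Tuple, Union


def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Any, float]:
    # same idea as utils/utils.py after this patch: every unit of work is a
    # zero-argument callable, so the timer never needs to know its arguments
    start_time = time.time()
    if isinstance(execs, list):
        results = [f() for f in execs]
    else:
        results = execs()
    return results, time.time() - start_time


def generate(prompt: str, max_new_tokens: int = 10) -> str:
    return f"{prompt} ... ({max_new_tokens} tokens)"


# old style (pre-patch): pass (callable, kwargs) tuples and unpack inside the helper
# new style (this patch): bind the arguments up front with functools.partial
result, elapsed = run_and_log_time(partial(generate, "Hello", max_new_tokens=5))
print(result, f"{elapsed:.6f}s")
```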
From 557342819bab8431ce754cf3c4edf8f679b6684c Mon Sep 17 00:00:00 2001
From: Mayank Mishra
Date: Sat, 3 Sep 2022 16:50:40 +0530
Subject: [PATCH 3/7] fix ds-inference server

---
 scripts/bloom-inference-server/ds_inference/grpc_server.py | 6 ++++--
 scripts/bloom-inference-server/ds_inference/model.py       | 2 ++
 scripts/bloom-inference-server/ds_zero/model.py            | 2 ++
 scripts/bloom-inference-server/utils/model.py              | 3 +--
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/scripts/bloom-inference-server/ds_inference/grpc_server.py b/scripts/bloom-inference-server/ds_inference/grpc_server.py
index d9d3e6016..4379587f4 100644
--- a/scripts/bloom-inference-server/ds_inference/grpc_server.py
+++ b/scripts/bloom-inference-server/ds_inference/grpc_server.py
@@ -28,11 +28,13 @@ def __init__(self, args: argparse.Namespace) -> None:

         if (args.dtype in [torch.float16, torch.int8]):
             checkpoints_json = os.path.join(
-                downloaded_model_path, "BLOOM_ds-inference_config.json")
+                downloaded_model_path, "ds_inference_config.json")

             mii.deploy(
                 task="text-generation",
-                model=args.model_name,
+                # should pass args.model_name but can't since the new
+                # weights are not supported yet. So, this is a hack
+                model="bigscience/bloom",
                 deployment_name=self.deployment_name,
                 model_path=downloaded_model_path,
                 mii_config={
diff --git a/scripts/bloom-inference-server/ds_inference/model.py b/scripts/bloom-inference-server/ds_inference/model.py
index ebdfa1213..0d8323f6e 100644
--- a/scripts/bloom-inference-server/ds_inference/model.py
+++ b/scripts/bloom-inference-server/ds_inference/model.py
@@ -7,6 +7,7 @@

 import deepspeed
 import torch
+import torch.distributed as dist
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

 from utils import Model, get_downloaded_model_path, print_rank_n, run_rank_n
@@ -59,6 +60,7 @@ def __init__(self, args: Namespace) -> None:
         self.input_device = torch.cuda.current_device()

         print_rank_n("Model loaded")
+        dist.barrier()


 class TemporaryCheckpointsJSON:
diff --git a/scripts/bloom-inference-server/ds_zero/model.py b/scripts/bloom-inference-server/ds_zero/model.py
index 8bb0d7cfc..6729f5fc4 100644
--- a/scripts/bloom-inference-server/ds_zero/model.py
+++ b/scripts/bloom-inference-server/ds_zero/model.py
@@ -3,6 +3,7 @@

 import deepspeed
 import torch
+import torch.distributed as dist
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 from transformers.deepspeed import HfDeepSpeedConfig

@@ -64,3 +65,4 @@ def __init__(self, args: Namespace) -> None:
         self.input_device = torch.cuda.current_device()

         print_rank_n("Model loaded")
+        dist.barrier()
diff --git a/scripts/bloom-inference-server/utils/model.py b/scripts/bloom-inference-server/utils/model.py
index d5f03c559..f681f3b17 100644
--- a/scripts/bloom-inference-server/utils/model.py
+++ b/scripts/bloom-inference-server/utils/model.py
@@ -100,8 +100,7 @@ def get_downloaded_model_path(model_name: str):
         cache_dir=os.getenv("TRANSFORMERS_CACHE", None)
     )
     # download only on 1 process
-    run_rank_n(f)
+    run_rank_n(f, barrier=True)
     # now since the snapshot is downloaded, pass the
     # model_path to all processes
     return f()
-
From e7e90bd6f707e8e99ec07cbf2cafd9726860bf86 Mon Sep 17 00:00:00 2001
From: Mayank Mishra
Date: Sat, 3 Sep 2022 18:54:10 +0530
Subject: [PATCH 4/7] add support for int8 in the future

---
 .../hf_accelerate/model.py                    | 45 +++++++++++++------
 .../bloom-inference-server/utils/constants.py |  9 ++--
 2 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/scripts/bloom-inference-server/hf_accelerate/model.py b/scripts/bloom-inference-server/hf_accelerate/model.py
index 77d27a4af..49f2c054d 100644
--- a/scripts/bloom-inference-server/hf_accelerate/model.py
+++ b/scripts/bloom-inference-server/hf_accelerate/model.py
@@ -15,13 +15,20 @@ def __init__(self, args: Namespace) -> None:
         self.tokenizer = AutoTokenizer.from_pretrained(downloaded_model_path)
         self.pad = self.tokenizer.pad_token_id

-        self.model = AutoModelForCausalLM.from_pretrained(
-            downloaded_model_path,
-            device_map="auto",
-            max_memory=get_max_memory_per_gpu_dict(
-                args.dtype, args.model_name),
-            torch_dtype=args.dtype
-        )
+        kwargs = {
+            "pretrained_model_name_or_path": downloaded_model_path,
+            "device_map": "auto",
+            "max_memory": get_max_memory_per_gpu_dict(
+                args.dtype,
+                args.model_name
+            )
+        }
+        if (args.dtype == torch.int8):
+            kwargs["load_in_8bit"] = True
+        else:
+            kwargs["torch_dtype"] = args.dtype
+
+        self.model = AutoModelForCausalLM.from_pretrained(**kwargs)

         self.model.requires_grad_(False)
         self.model.eval()
@@ -39,14 +46,20 @@ def get_max_memory_per_gpu_dict(dtype, model_name):
     if model_name == "bigscience/bloom" and n_gpus == 8 and torch.cuda.get_device_properties(0).total_memory > 79*2**30:
         # hand crafted optimized memory map for 8x80 setup over BLOOM
         # this works with bs=40
-        return {0: '0GIB', 1: '51GIB', 2: '51GIB', 3: '51GIB', 4: '51GIB', 5: '51GIB', 6: '51GIB', 7: '51GIB'}
-
+        if (dtype in [torch.bfloat16, torch.float16]):
+            max_memory_per_gpu = {0: '0GIB', 1: '51GIB', 2: '51GIB', 3: '51GIB',
+                                  4: '51GIB', 5: '51GIB', 6: '51GIB', 7: '51GIB'}
+        elif (dtype == torch.int8):
+            max_memory_per_gpu = {0: '0GIB', 1: '26GIB', 2: '26GIB', 3: '26GIB',
+                                  4: '26GIB', 5: '26GIB', 6: '26GIB', 7: '26GIB'}
+        print_rank_n("Max memory per gpu:", max_memory_per_gpu)
+        return max_memory_per_gpu

     try:
         # model_params calculation, as we don't have a model yet to do:
         #model_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
         config = AutoConfig.from_pretrained(model_name)
-        h = config.n_embed
+        h = config.hidden_size
         l = config.n_layer
         v = config.vocab_size
         # from https://github.com/bigscience-workshop/bigscience/tree/6917a3b5fefcf439d3485ca184b4d9f6ab605150/math#model-sizing
     except (OSError, AttributeError):
         print_rank_n(
             f"The model {model_name} has a broken config file. Please notify the owner")
         raise

-    bytes = torch.finfo(dtype).bits / 8
+    if (dtype == torch.int8):
+        bytes = 1
+    else:
+        bytes = torch.finfo(dtype).bits / 8
     param_memory_total_in_bytes = model_params * bytes
     # add 5% since weight sizes aren't the same and some GPU may need more memory
     param_memory_per_gpu_in_bytes = int(
-        param_memory_total_in_bytes / n_gpus * 1.05)
+        param_memory_total_in_bytes / n_gpus * 1.10)
     print_rank_n(
         f"Estimating {param_memory_per_gpu_in_bytes/2**30:0.2f}GB per gpu for weights")
         raise ValueError(
             f"Unable to generate the memory map automatically as the needed estimated memory per gpu ({param_memory_per_gpu_in_bytes/2**30:0.2f}GB) is bigger than the available per gpu memory ({max_memory_per_gpu_in_bytes/2**30:0.2f}GB)")

-    return {i: param_memory_per_gpu_in_bytes for i in range(torch.cuda.device_count())}
+    max_memory_per_gpu = {
+        i: param_memory_per_gpu_in_bytes for i in range(torch.cuda.device_count())}
+    print("Max memory per gpu:", max_memory_per_gpu)
+    return max_memory_per_gpu
diff --git a/scripts/bloom-inference-server/utils/constants.py b/scripts/bloom-inference-server/utils/constants.py
index 50f583f21..056d21920 100644
--- a/scripts/bloom-inference-server/utils/constants.py
+++ b/scripts/bloom-inference-server/utils/constants.py
@@ -20,7 +20,8 @@
     HF_ACCELERATE: {
         BIGSCIENCE_BLOOM: {
             BF16,
-            FP16
+            FP16,
+            # INT8
         }
     },
     DS_INFERENCE: {
@@ -45,7 +46,8 @@
     HF_ACCELERATE: {
         BIGSCIENCE_BLOOM: {
             BF16,
-            FP16
+            FP16,
+            # INT8
         }
     },
     DS_INFERENCE: {
@@ -61,7 +63,8 @@
     HF_ACCELERATE: {
         BIGSCIENCE_BLOOM: {
             BF16,
-            FP16
+            FP16,
+            # INT8
         }
     },
     DS_INFERENCE: {
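The memory-map estimate above reduces to bytes-per-parameter times parameter count, divided across the visible GPUs with some headroom. A rough sketch of that arithmetic (the 176e9 parameter count is assumed here rather than derived from the model config; the 1.10 headroom factor and the int8 special case mirror the patch):

```python
import torch


def bytes_per_param(dtype: torch.dtype) -> float:
    # mirrors the patched logic: int8 weights take 1 byte, otherwise use the dtype width
    return 1 if dtype == torch.int8 else torch.finfo(dtype).bits / 8


model_params = 176e9   # BLOOM-176B, assumed for illustration
n_gpus = 8
for dtype in (torch.float16, torch.int8):
    total_bytes = model_params * bytes_per_param(dtype)
    per_gpu = total_bytes / n_gpus * 1.10   # +10% headroom, as in the patch
    print(dtype, f"{per_gpu / 2**30:.1f} GiB per GPU")
```

For 8 GPUs this lands around 45 GiB per GPU in fp16 and roughly half that in int8, which is consistent with the hand-crafted 51GIB/26GIB maps above.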
From 701aaca00cc41a29a2c4fbfb1b100e33d35143c7 Mon Sep 17 00:00:00 2001
From: Mayank Mishra
Date: Sun, 11 Sep 2022 21:21:53 +0530
Subject: [PATCH 5/7] bug fix

---
 scripts/bloom-inference-server/benchmark.py      | 2 ++
 scripts/bloom-inference-server/cli.py            | 3 +++
 scripts/bloom-inference-server/server.py         | 2 ++
 scripts/bloom-inference-server/utils/model.py    | 2 +-
 scripts/bloom-inference-server/utils/requests.py | 8 ++++++--
 5 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/scripts/bloom-inference-server/benchmark.py b/scripts/bloom-inference-server/benchmark.py
index e61f50ccd..455af9e44 100644
--- a/scripts/bloom-inference-server/benchmark.py
+++ b/scripts/bloom-inference-server/benchmark.py
@@ -66,6 +66,8 @@ def benchmark_end_to_end(args: argparse.Namespace,
         args.generate_kwargs
     )

+    request.preprocess()
+
     print_rank_n(f"generate_kwargs = {args.generate_kwargs}")
     print_rank_n(f"batch_size = {args.batch_size}")
diff --git a/scripts/bloom-inference-server/cli.py b/scripts/bloom-inference-server/cli.py
index 477a0ad2e..1b91b64c2 100644
--- a/scripts/bloom-inference-server/cli.py
+++ b/scripts/bloom-inference-server/cli.py
@@ -55,6 +55,9 @@ def main() -> None:
             continue

         request = parse_generate_kwargs([input_text], generate_kwargs)
+
+        request.preprocess()
+
         response = model.generate(request)

         print_rank_n("Output text:", response.text[0])
diff --git a/scripts/bloom-inference-server/server.py b/scripts/bloom-inference-server/server.py
index 4139f1134..fa91fda6e 100644
--- a/scripts/bloom-inference-server/server.py
+++ b/scripts/bloom-inference-server/server.py
@@ -110,6 +110,8 @@ def get_exception_response(self, query_id: int, method: str):

     def generate(self, request: GenerateRequest) -> GenerateResponse:
         try:
+            request.preprocess()
+
             request.max_new_tokens = get_num_tokens_to_generate(
                 request.max_new_tokens, self.allowed_max_new_tokens)
diff --git a/scripts/bloom-inference-server/utils/model.py b/scripts/bloom-inference-server/utils/model.py
index f681f3b17..5e36c9e72 100644
--- a/scripts/bloom-inference-server/utils/model.py
+++ b/scripts/bloom-inference-server/utils/model.py
@@ -38,7 +38,7 @@ def generate(self, request: GenerateRequest) -> GenerateResponse:
             top_k=request.top_k,
             top_p=request.top_p,
             typical_p=request.typical_p,
-            repitition_penalty=request.repitition_penalty,
+            repetition_penalty=request.repetition_penalty,
             bos_token_id=request.bos_token_id,
             pad_token_id=request.pad_token_id,
             eos_token_id=request.eos_token_id,
diff --git a/scripts/bloom-inference-server/utils/requests.py b/scripts/bloom-inference-server/utils/requests.py
index 5832d6ae2..0bae3eebe 100644
--- a/scripts/bloom-inference-server/utils/requests.py
+++ b/scripts/bloom-inference-server/utils/requests.py
@@ -18,7 +18,7 @@ class GenerateRequest(BaseModel):
     top_k: int = None
     top_p: float = None
     typical_p: float = None
-    repitition_penalty: float = None
+    repetition_penalty: float = None
     bos_token_id: int = None
     pad_token_id: int = None
     eos_token_id: int = None
@@ -37,6 +37,10 @@ class GenerateRequest(BaseModel):
     remove_input_from_output: bool = False
     method: str = "generate"

+    def preprocess(self) -> None:
+        if (self.temperature == 0):
+            self.do_sample = False
+

 class GenerateResponse(BaseResponse):
     text: List[str] = None
@@ -91,7 +95,7 @@ def parse_generate_kwargs(text: List[str], kwargs: dict) -> GenerateRequest:
         top_k=parse_field(kwargs, "top_k", int),
         top_p=parse_field(kwargs, "top_p", float),
         typical_p=parse_field(kwargs, "typical_p", float),
-        repitition_penalty=parse_field(kwargs, "repitition_penalty", float),
+        repetition_penalty=parse_field(kwargs, "repetition_penalty", float),
         bos_token_id=parse_field(kwargs, "bos_token_id", int),
         pad_token_id=parse_field(kwargs, "pad_token_id", int),
         eos_token_id=parse_field(kwargs, "eos_token_id", int),
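The core of this bug fix is GenerateRequest.preprocess(): a temperature of 0 means greedy decoding, so sampling has to be switched off before the request reaches the model. A trimmed-down sketch of that behaviour (assumes pydantic is installed; the field set is reduced for illustration):

```python
from typing import List

from pydantic import BaseModel


class GenerateRequest(BaseModel):
    # cut-down version of utils/requests.py, for illustration only
    text: List[str] = None
    temperature: float = None
    do_sample: bool = None

    def preprocess(self) -> None:
        # temperature == 0 means greedy decoding, so sampling must be disabled
        if self.temperature == 0:
            self.do_sample = False


request = GenerateRequest(text=["Hello"], temperature=0, do_sample=True)
request.preprocess()
print(request.do_sample)  # False
```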
From 93e81e76e470fa89d1584eb2674d73a720b9893f Mon Sep 17 00:00:00 2001
From: Mayank Mishra
Date: Sun, 11 Sep 2022 22:17:57 +0530
Subject: [PATCH 6/7] update README

---
 scripts/bloom-inference-server/README.md | 53 +++++++-----------------
 1 file changed, 14 insertions(+), 39 deletions(-)

diff --git a/scripts/bloom-inference-server/README.md b/scripts/bloom-inference-server/README.md
index 6d140b36f..0c412a9c2 100644
--- a/scripts/bloom-inference-server/README.md
+++ b/scripts/bloom-inference-server/README.md
@@ -4,13 +4,7 @@ We support HuggingFace accelerate and DeepSpeed Inference for generation.

 Install required packages:
 ```shell
-pip install fastapi uvicorn accelerate huggingface_hub>=0.9.0
-```
-To install [DeepSpeed](https://github.com/microsoft/DeepSpeed):
-```shell
-git clone https://github.com/microsoft/DeepSpeed
-cd DeepSpeed
-CFLAGS="-I$CONDA_PREFIX/include/" LDFLAGS="-L$CONDA_PREFIX/lib/" TORCH_CUDA_ARCH_LIST="7.0" DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+pip install fastapi uvicorn accelerate huggingface_hub>=0.9.0 deepspeed>=0.7.3
 ```
 To install [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII):
 ```shell
@@ -19,14 +13,9 @@ cd DeepSpeed-MII
 pip install .
 ```

-All the provided scripts are tested on 8 A100 80GB GPUs for BLOOM 176B. These scripts might not work for other models or a different number of GPUs.
-DS inference only supports fp16 for cli and server application. However, for benchmarking, it supports both fp16 and bf16. bf16 support will be added once DeepSpeed adds suitable CUDA kernels for these.
+All the provided scripts are tested on 8 A100 80GB GPUs for BLOOM 176B (fp16/bf16) and 4 A100 80GB GPUs for BLOOM 176B (int8). These scripts might not work for other models or a different number of GPUs.

-DS inference is deployed using the DeepSpeed MII library which requires the resharded checkpoints for 8 x Tensor Parallel. The HuggingFace checkpoints can be resharded and cached using the following command:
-```shell
-deepspeed --num_gpus 8 scripts/bloom-inference-server/cache_ds_checkpoints.py --model_name bigscience/bloom --dtype fp16 --save_mp_checkpoint_path
-```
-Note: Running the above script will consume ~350 GB of disk space and will take some time (~30 minutes), depending on both the speed of your GPUs and storage.
+DS inference is deployed using the DeepSpeed MII library which requires the resharded checkpoints for 8 x Tensor Parallel.

 Note: sometimes GPU memory is not freed when DS inference deployment is shutdown. You can free this memory by running:
 ```python
@@ -35,6 +24,10 @@ mii.terminate("ds_inference_grpc_server")
 ```
 or alternatively, just doing a `killall python` in terminal.

+For using BLOOM quantized, use dtype = int8. Also, change the model_name to microsoft/bloom-deepspeed-inference-int8 for DeepSpeed-Inference. For HF accelerate, no change is needed for model_name.
+
+HF accelerate uses [LLM.int8()](https://arxiv.org/abs/2208.07339) and DS-inference uses [ZeroQuant](https://arxiv.org/abs/2206.01861) for post-training quantization.
+
 #### BLOOM inference via command-line

 This asks for generate_kwargs everytime. Example: generate_kwargs =
 ```json
 {"min_length": 100, "max_new_tokens": 100, "do_sample": false}
 ```

 1. using HF accelerate
 ```shell
 python scripts/bloom-inference-server/cli.py --model_name bigscience/bloom --dtype bf16 --deployment_framework hf_accelerate --generate_kwargs '{"min_length": 100, "max_new_tokens": 100, "do_sample": false}'
 ```

 2. using DS inference
 ```shell
-python scripts/bloom-inference-server/cli.py --model_name bigscience/bloom --dtype fp16 --deployment_framework ds_inference --save_mp_checkpoint_path --generate_kwargs '{"min_length": 100, "max_new_tokens": 100, "do_sample": false}'
+python scripts/bloom-inference-server/cli.py --model_name microsoft/bloom-deepspeed-inference-fp16 --dtype fp16 --deployment_framework ds_inference --generate_kwargs '{"min_length": 100, "max_new_tokens": 100, "do_sample": false}'
 ```

 #### BLOOM server deployment

 1. using HF accelerate
 ```shell
 python scripts/bloom-inference-server/server.py --model_name bigscience/bloom --dtype bf16 --deployment_framework hf_accelerate --host --port --allowed_max_new_tokens 100
 ```

 2. using DS inference
 ```shell
-python scripts/bloom-inference-server/server.py --model_name bigscience/bloom --dtype fp16 --deployment_framework ds_inference --save_mp_checkpoint_path --host --port --allowed_max_new_tokens 100
+python scripts/bloom-inference-server/server.py --model_name microsoft/bloom-deepspeed-inference-fp16 --dtype fp16 --deployment_framework ds_inference --host --port --allowed_max_new_tokens 100
 ```

 We provide an example [script](examples/server_request.py) to query the BLOOM server is provided. To run this script:
 ```shell
 python scripts/bloom-inference-server/examples/server_request.py --host --port
 ```

 #### Benchmark system for BLOOM inference

 1. using HF accelerate
 ```shell
 python scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype bf16 --deployment_framework hf_accelerate --benchmark_cycles 5
 ```

 2. using DS inference
 ```shell
-deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype fp16 --deployment_framework ds_inference --save_mp_checkpoint_path --benchmark_cycles 5
+deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype fp16 --deployment_framework ds_inference --benchmark_cycles 5
 ```
-
-3. using DS ZeRO
+alternatively, to load model faster:
 ```shell
-deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype bf16 --deployment_framework ds_zero --benchmark_cycles 5
+deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name microsoft/bloom-deepspeed-inference-fp16 --dtype fp16 --deployment_framework ds_inference --benchmark_cycles 5
 ```
-Alternatively, the following shell script will benchmark different batch sizes for the model.
-```shell
-mkdir -p logs
-
-for bs in {1,2,4,8,16,32,64,128}
-do
-python scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype bf16 --deployment_framework hf_accelerate --benchmark_cycles 5 --batch_size $bs 2>&1 | tee logs/hf-$bs.log
-
-deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype fp16 --deployment_framework ds_inference --save_mp_checkpoint_path --benchmark_cycles 5 --batch_size $bs 2>&1 | tee logs/ds-$bs.log
-
-deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype bf16 --deployment_framework ds_zero --benchmark_cycles 5 --batch_size $bs 2>&1 | tee logs/ds-zero-$bs.log
-done
-```
-
-The following will benchmark sequence length for batch size = 1 on DS inference.
+3. using DS ZeRO
 ```shell
-for sq in {1,10,50,100,200,300,400,500,600,700,800,900,1000,1500,2000,2500,3000,3500,4000,4500,5000}
-do
-deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype fp16 --batch_size 1 --benchmark_cycles 5 --deployment_framework ds_inference --generate_kwargs '{"do_sample": false, "min_length": '$sq', "max_new_tokens": '$sq'}' 2>&1 | tee logs/ds_$sq.log
-done
+deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype bf16 --deployment_framework ds_zero --benchmark_cycles 5
 ```

From 4b5205bdb6eae5cc8f39ee04f3c545d0b656a8f8 Mon Sep 17 00:00:00 2001
From: Mayank Mishra
Date: Sun, 11 Sep 2022 22:20:20 +0530
Subject: [PATCH 7/7] fix

---
 scripts/bloom-inference-server/utils/constants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/bloom-inference-server/utils/constants.py b/scripts/bloom-inference-server/utils/constants.py
index 056d21920..660f10cd7 100644
--- a/scripts/bloom-inference-server/utils/constants.py
+++ b/scripts/bloom-inference-server/utils/constants.py
@@ -21,7 +21,7 @@
     BIGSCIENCE_BLOOM: {
         BF16,
         FP16,
-        # INT8
+        INT8
     }
 },
 DS_INFERENCE: {
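The table that this last patch touches in utils/constants.py maps deployment framework to model to the set of supported dtypes. A hypothetical lookup against a mapping of that shape (the constant values and the check_support helper are illustrative, not code from the repository; the DS_INFERENCE entry is assumed since its contents are not shown in the diff):

```python
HF_ACCELERATE = "hf_accelerate"
DS_INFERENCE = "ds_inference"
BIGSCIENCE_BLOOM = "bigscience/bloom"
BF16, FP16, INT8 = "bf16", "fp16", "int8"

# shaped like the tables in utils/constants.py after PATCH 7/7
SUPPORTED_DTYPES = {
    HF_ACCELERATE: {BIGSCIENCE_BLOOM: {BF16, FP16, INT8}},
    DS_INFERENCE: {BIGSCIENCE_BLOOM: {FP16}},  # assumed; not shown in the diff
}


def check_support(framework: str, model: str, dtype: str) -> bool:
    # a request is valid only if the dtype is listed for that framework/model pair
    return dtype in SUPPORTED_DTYPES.get(framework, {}).get(model, set())


print(check_support(HF_ACCELERATE, BIGSCIENCE_BLOOM, INT8))  # True after this patch
```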