This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

[Infra] use python logging (#752)
CeciliaWwq authored Dec 9, 2023
1 parent 5ba7977 commit 60942ea
Showing 24 changed files with 391 additions and 136 deletions.
8 changes: 6 additions & 2 deletions intel_extension_for_transformers/neural_chat/chatbot.py
@@ -23,6 +23,10 @@
from .config import DeviceOptions
from .plugins import plugins

from .config_logging import configure_logging
logger = configure_logging()


def build_chatbot(config: PipelineConfig=None):
"""Build the chatbot with a given configuration.
@@ -106,8 +110,8 @@ def build_chatbot(config: PipelineConfig=None):
plugins[plugin_name]['class'] = SadTalker
else: # pragma: no cover
raise ValueError("NeuralChat Error: Unsupported plugin")
print(f"create {plugin_name} plugin instance...")
print(f"plugin parameters: ", plugin_value['args'])
logger.info("create %s plugin instance...", plugin_name)
logger.info("plugin parameters: %s", plugin_value['args'])
plugins[plugin_name]["instance"] = plugins[plugin_name]['class'](**plugin_value['args'])
adapter.register_plugin_instance(plugin_name, plugins[plugin_name]["instance"])

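
The chatbot.py hunk above swaps print(f"...") calls for logger.info("... %s ...", value). A minimal, self-contained sketch of the practical difference; the logger name "my_app" matches the one configure_logging uses, and plugin_name is just a placeholder value:

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("my_app")

plugin_name = "tts"

# Eager: the f-string is rendered even though INFO is below the WARNING threshold.
logger.info(f"create {plugin_name} plugin instance...")

# Lazy: interpolation happens only if the record is actually emitted,
# which is why the new code passes plugin_name as a separate argument.
logger.info("create %s plugin instance...", plugin_name)
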
12 changes: 7 additions & 5 deletions intel_extension_for_transformers/neural_chat/cli/cli_commands.py
@@ -26,6 +26,8 @@
from ..plugins import plugins
from transformers import TrainingArguments
from ..chatbot import build_chatbot, finetune_model
from ..config_logging import configure_logging
logger = configure_logging()

__all__ = ['BaseCommand', 'HelpCommand', 'TextVoiceChatExecutor', 'FinetuingExecutor']

@@ -163,7 +165,7 @@ def execute(self, argv: List[str]) -> bool:
msg = 'Package Version:\n'
msg += ' {}\n\n'.format(version)

print(msg)
logger.info(msg)
return True


@@ -225,10 +227,10 @@ def execute(self, argv: List[str]) -> bool:
self.chatbot = build_chatbot(self.config)
try:
res = self(prompt)
print(res)
logger.info(res)
return True
except Exception as e:
print("TextVoiceChatExecutor Exception: ", e)
logger.info("TextVoiceChatExecutor Exception: {}".format(e))
return False

def __call__(
@@ -266,10 +268,10 @@ def execute(self, argv: List[str]) -> bool:
self.finetuneCfg = TextGenerationFinetuningConfig(model_args, data_args, training_args, finetune_args)
try:
res = self()
print(res)
logger.info(res)
return True
except Exception as e:
print("FinetuingExecutor Exception: ", e)
logger.info("FinetuingExecutor Exception: {}".format(e))
return False

def __call__(self):
53 changes: 53 additions & 0 deletions intel_extension_for_transformers/neural_chat/config_logging.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Neural Chat Python logging."""

import logging

def configure_logging(log_file="app.log", log_level=logging.INFO):
"""
Configure logging for the application.
Parameters:
- log_file: str, optional, default: "app.log"
The name of the log file.
- log_level: int, optional, default: logging.INFO
The logging level.
Returns:
- logger: logging.Logger
The configured logger instance with specified handlers and formatters.
"""
logger = logging.getLogger("my_app")
logger.setLevel(log_level)

file_handler = logging.FileHandler(log_file, delay=True)
file_handler.setLevel(log_level)

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)

formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

file_handler.setFormatter(formatter)
console_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)

return logger
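
For orientation, a small usage sketch of the helper above. One caveat inferred from standard logging semantics rather than stated in the commit: logging.getLogger("my_app") always returns the same object, so every call to configure_logging() appends another pair of handlers, and repeated calls from different modules can duplicate log lines. The guarded variant below is hypothetical, not part of this change.

import logging
from intel_extension_for_transformers.neural_chat.config_logging import configure_logging

logger = configure_logging(log_file="neural_chat.log", log_level=logging.DEBUG)
logger.debug("written to neural_chat.log only; the console handler stays at INFO")
logger.info("written to both the file and the console")

# Hypothetical idempotent variant:
def configure_logging_once(log_file="app.log", log_level=logging.INFO):
    logger = logging.getLogger("my_app")
    if logger.handlers:  # already configured by an earlier import
        return logger
    logger.setLevel(log_level)
    handler = logging.FileHandler(log_file, delay=True)
    handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
    logger.addHandler(handler)
    return logger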

@@ -18,6 +18,13 @@

from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.asr import AudioSpeechRecognition
import argparse
import logging
logging.basicConfig(
format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
datefmt="%d-%M-%Y %H:%M:%S",
level=logging.INFO
)

parser = argparse.ArgumentParser(
prog='asr',
description='Audio Speech Recognition')
@@ -27,4 +34,4 @@
args = parser.parse_args()
asr = AudioSpeechRecognition(model_name_or_path=args.model_name_or_path, device=args.device)
text = asr.audio2text(args.input_audio)
print(text)
logging.info(text)
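
The example scripts in this commit configure the root logger with logging.basicConfig and then call logging.info(...) at module level. A short sketch of how that pattern behaves, using standard-library semantics only; note in passing that the committed datefmt "%d-%M-%Y %H:%M:%S" uses %M, the minutes field, where the month field %m was presumably intended (the sketch uses %m).

import logging

logging.basicConfig(
    format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
    datefmt="%d-%m-%Y %H:%M:%S",
    level=logging.INFO,
)

# Module-level logging.info(...) goes through the root logger.
logging.info("transcription finished: %s", "hello world")

# basicConfig is a no-op once the root logger has handlers, so a second call
# with a different format is silently ignored (pass force=True to override).
logging.basicConfig(format="%(message)s")
logging.warning("still rendered with the first format")
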
@@ -26,6 +26,12 @@
import soundfile as sf
from datetime import datetime
from num2words import num2words
import logging
logging.basicConfig(
format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
datefmt="%d-%M-%Y %H:%M:%S",
level=logging.INFO
)

workdir = os.getcwd()

@@ -102,7 +108,7 @@ def correct_number(text):
try:
word = num2words(word)
except Exception as e:
print(f"num2words fail with word: {word} and exception: {e}")
logging.info("num2words fail with word: %s and exception: %s", word, e)
else:
try:
val = int(word)
@@ -130,5 +136,6 @@ def correct_number(text):
time_stamp = now.strftime("%d_%m_%Y_%H_%M_%S")
sf.write(f"output_{time_stamp}.wav", speech.cpu().numpy(), samplerate=16000)
except Exception as e:
print(f"Catch exception: {e}")
print("Restarting\n")
logging.info("Catch exception: %s", e)
logging.info("Restarting\n")

@@ -18,6 +18,12 @@
from intel_extension_for_transformers.neural_chat.config import PipelineConfig
from intel_extension_for_transformers.neural_chat.chatbot import build_chatbot
from intel_extension_for_transformers.neural_chat.plugins import plugins
import logging
logging.basicConfig(
format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
datefmt="%d-%M-%Y %H:%M:%S",
level=logging.INFO
)

def main():
plugins.retrieval.enable = True
@@ -27,7 +33,7 @@ def main():
chatbot = build_chatbot(pipeline_args)

response = chatbot.predict(query="What is IDM 2.0?")
print(response)
logging.info(response)

if __name__ == "__main__":
main()
@@ -24,6 +24,12 @@
from ..utils.common import is_audio_file
from .model_utils import load_model, predict, predict_stream, MODELS
from ..prompts import PromptTemplate
import logging
logging.basicConfig(
format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
datefmt="%d-%M-%Y %H:%M:%S",
level=logging.INFO
)


def construct_parameters(query, model_name, device, assistant_model, config):
@@ -167,7 +173,7 @@ def predict_stream(self, query, origin_query="", config=None):
if plugin_name == "cache":
response = plugin_instance.pre_llm_inference_actions(query)
if response:
print(f"Get response: {response} from cache")
logging.info("Get response: %s from cache", response)
return response['choices'][0]['text'], link
if plugin_name == "asr" and not is_audio_file(query):
continue
@@ -249,7 +255,7 @@ def predict(self, query, origin_query="", config=None):
if plugin_name == "cache":
response = plugin_instance.pre_llm_inference_actions(query)
if response:
print(f"Get response: {response} from cache")
logging.info("Get response: %s from cache", response)
return response['choices'][0]['text']
if plugin_name == "asr" and not is_audio_file(query):
continue
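
Several library modules in this change call logging.basicConfig at import time, which configures the process-wide root logger as a side effect of importing the package. A common alternative, sketched below purely for contrast (the function body is a placeholder, not repository code), is a per-module logger that leaves handler setup to the application:

import logging

# Per-module logger; it emits through whatever handlers the application installs.
logger = logging.getLogger(__name__)

def predict(query: str) -> str:
    logger.info("Get response for query: %s", query)
    return query.upper()  # placeholder body

if __name__ == "__main__":
    # Only the entry point touches global configuration.
    logging.basicConfig(level=logging.INFO)
    predict("what is idm 2.0?")
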
54 changes: 22 additions & 32 deletions intel_extension_for_transformers/neural_chat/models/model_utils.py
@@ -19,6 +19,7 @@
from pathlib import Path
import copy, time
from datetime import datetime
import sys
import torch
import transformers
import warnings
@@ -27,6 +28,13 @@
from threading import Thread
import contextlib
from huggingface_hub import snapshot_download
import logging
logging.basicConfig(
format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
datefmt="%d-%M-%Y %H:%M:%S",
level=logging.INFO,
stream=sys.stdout
)
from typing import List
from transformers import (
GenerationConfig,
@@ -47,7 +55,6 @@
WeightOnlyQuantConfig,
BitsAndBytesConfig
)

if is_deepspeed_available():
import deepspeed # pylint: disable=E0401

@@ -82,7 +89,7 @@ def get_repo_root(model_name_or_path, local_rank=-1, token=None):
# Checks if online or not
if is_offline_mode():
if local_rank == 0:
print("Offline mode: forcing local_files_only=True")
logging.info("Offline mode: forcing local_files_only=True")

# Only download PyTorch weights by default
allow_patterns = ["*.bin"]
@@ -207,7 +214,7 @@ def max_input_len(input_text_length):
elif input_text_length <= 2048:
return 2048
else:
print("Max support length is 4096")
logging.info("Max support length is 4096")
return 4096


@@ -230,7 +237,7 @@ def import_deepspeed():
)
# Initialize process(es) for DeepSpeed
deepspeed.init_distributed(dist_backend="hccl")
print("DeepSpeed is enabled.")
logging.info("DeepSpeed is enabled.")


def init_deepspeed_inference(model, model_name_or_path, use_hpu_graphs, is_meta, token=None):
Expand Down Expand Up @@ -310,7 +317,7 @@ def load_model(
if device == "cuda" and is_bitsandbytes_available() and torch.cuda.is_available():
bitsandbytes_quant_config = optimization_config
else:
print(
logging.warning(
"CUDA device or bitsandbytes is not available, please make sure CUDA device and bitsandbytes" \
+ " library is available, ignoring bitsandbytes config now."
)
@@ -322,7 +329,7 @@ def load_model(
elif dtype == "float32":
torch_dtype = torch.float32
else:
print(f"Unsupported dtype {dtype}, using float32 now.")
logging.warning(f"Unsupported dtype {dtype}, using float32 now.")
torch_dtype = torch.float32

MODELS[model_name] = {}
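
The hunks above route recoverable fallbacks (missing CUDA or bitsandbytes, unsupported dtype) to logging.warning while routine progress stays at logging.info. A tiny sketch of why the level matters once the threshold is raised, say in production (values are illustrative):

import logging

logging.basicConfig(level=logging.WARNING)  # a quieter, production-style threshold

dtype = "float16x"  # illustrative unsupported value
logging.info("loading model with dtype %s", dtype)  # filtered out at WARNING
if dtype not in ("bfloat16", "float16", "float32"):
    logging.warning("Unsupported dtype %s, using float32 now.", dtype)  # still shown
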
@@ -356,7 +363,8 @@ def load_model(
config = AutoConfig.from_pretrained(model_name, use_auth_token=hf_access_token, trust_remote_code=True \
if re.search("chatglm", model_name, re.IGNORECASE) else False)
load_to_meta = model_on_meta(config)
if isinstance(optimization_config, WeightOnlyQuantConfig) and not re.search("llama", model_name, re.IGNORECASE):

if isinstance(optimization_config, WeightOnlyQuantConfig):
from intel_extension_for_transformers.neural_chat.chatbot import optimize_model
model = optimize_model(model_name, optimization_config, use_llm_runtime)
if not model.config.is_encoder_decoder:
Expand All @@ -365,13 +373,12 @@ def load_model(
tokenizer.pad_token = tokenizer.eos_token
MODELS[model_name]["model"] = model
MODELS[model_name]["tokenizer"] = tokenizer
print("Optimized Model loaded.")
logging.info("Optimized Model loaded.")
return

if peft_path and device == "hpu" and use_deepspeed and load_to_meta:
print("PEFT could not work in deepspeed sharded checkpt loading mode, set load_to_meta to False")
logging.warning("PEFT could not work in deepspeed sharded checkpt loading mode, set load_to_meta to False")
load_to_meta = False

if device == "hpu" and use_deepspeed and load_to_meta:
with deepspeed.OnDevice(dtype=torch.bfloat16, device="meta"):
model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)
@@ -492,14 +499,6 @@ def load_model(
if model.generation_config.eos_token_id is None:
model.generation_config.eos_token_id = tokenizer.eos_token_id

if isinstance(optimization_config, WeightOnlyQuantConfig) and not re.search("llama", model_name, re.IGNORECASE):
from intel_extension_for_transformers.neural_chat.chatbot import optimize_model
model = optimize_model(model, optimization_config, use_llm_runtime)

MODELS[model_name]["model"] = model
MODELS[model_name]["tokenizer"] = tokenizer
print("Optimized Model loaded.")
return
if device == "hpu":
if peft_path:
from peft import PeftModel
@@ -529,18 +528,9 @@ def load_model(
model = model.to(dtype=torch_dtype)

if device == "cpu":
import intel_extension_for_pytorch as intel_ipex
if re.search("llama", model_name, re.IGNORECASE):
qconfig = None if ipex_int8 == False else intel_ipex.quantization.get_weight_only_quant_qconfig_mapping(
weight_dtype=torch.quint4x2, lowp_mode=intel_ipex.quantization.WoqLowpMode.BF16
)
model = intel_ipex.optimize_transformers(model.eval(),
dtype=torch_dtype,
inplace=True,
quantization_config=qconfig,
deployment_mode=cpu_jit
)
elif torch_dtype == torch.bfloat16 and not ipex_int8:
if torch_dtype == torch.bfloat16 and not ipex_int8:
import intel_extension_for_pytorch as intel_ipex

model = intel_ipex.optimize(
model.eval(),
dtype=torch_dtype,
@@ -807,7 +797,7 @@ def generate_output():
generation_config=generation_config,
return_dict_in_generate=True,
)
output_token_len = len(output_token[0]) if is_llm_runtime_model(model) else \
output_token_len= len(output_token[0]) if is_llm_runtime_model(model) else \
output_token.sequences[0].shape[-1]
return output_token
except Exception as e:
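
The last hunk computes the generated length either from a plain token list (LLM-runtime models) or from the .sequences field that transformers returns when return_dict_in_generate=True is set. A minimal sketch of that second path with public transformers APIs; the model name and prompt are illustrative, not taken from the repository:

import logging
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

logging.basicConfig(level=logging.INFO)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Hello, neural chat", return_tensors="pt")
output_token = model.generate(
    **inputs,
    generation_config=GenerationConfig(max_new_tokens=8),
    return_dict_in_generate=True,
)
# A ModelOutput whose .sequences holds prompt plus generated token ids.
output_token_len = output_token.sequences[0].shape[-1]
logging.info("sequence length including prompt: %d", output_token_len)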