diff --git a/examples/huggingface/pytorch/code-generation/quantization/README.md b/examples/huggingface/pytorch/code-generation/quantization/README.md
index de6e15284c1..b7310135d9a 100644
--- a/examples/huggingface/pytorch/code-generation/quantization/README.md
+++ b/examples/huggingface/pytorch/code-generation/quantization/README.md
@@ -26,7 +26,7 @@ Required libraries.
 pip install -r requirements.txt
 ```

-We use the local gpt_bigcode defination script `modeling_gpt_bigcode.py` in `run_generation.py`. Here is a little change to success trace.
+We use the gpt_bigcode model definition script [modeling_gpt_bigcode.py](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/transformers/modeling/gpt_bigcode/modeling_gpt_bigcode.py) in `run_generation.py`. Here is a small change required for a successful trace.
 ```diff
 # Line 227 in modeling_gpt_bigcode.py on transformers 4.28.1
 - query, key_value = self.c_attn(hidden_states).split((self.embed_dim, 2 * self.kv_dim), dim=2)
diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py b/examples/huggingface/pytorch/code-generation/quantization/run_generation.py
index 2ce21aa549b..f73795c759c 100644
--- a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py
+++ b/examples/huggingface/pytorch/code-generation/quantization/run_generation.py
@@ -10,16 +10,13 @@
 from datasets import load_dataset
 from torch.nn.functional import pad
 from torch.utils.data import DataLoader
-from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig
+from transformers import AutoTokenizer, PretrainedConfig
 import transformers
 from optimum.utils import NormalizedConfigManager
 import numpy as np
 from itertools import chain

-from modeling_gpt_bigcode import GPTBigCodeForCausalLM
-transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTJForCausalLM = GPTBigCodeForCausalLM
-
 parser = argparse.ArgumentParser()

 # Main config
@@ -87,9 +84,23 @@
 parser.add_argument("--top_p", default=0.95, type=float)
 parser.add_argument("--top_k", default=0, type=int)
 parser.add_argument("--do_sample", action="store_true")
+parser.add_argument("--check_references", action="store_true")
+parser.add_argument("--max_memory_per_gpu", type=str, default=None)
+parser.add_argument(
+    "--modeltype",
+    default="causal",
+    help="AutoModel to use, it can be causal or seq2seq",
+)
+parser.add_argument(
+    "--limit_start",
+    type=int,
+    default=0,
+    help="Optional offset to start from when limiting the number of samples",
+)
 args = parser.parse_args()

+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
 user_model = AutoModelForCausalLM.from_pretrained(
     args.model,
     torchscript=True
diff --git a/examples/huggingface/pytorch/text-generation/quantization/README.md b/examples/huggingface/pytorch/text-generation/quantization/README.md
index 562eb48059b..4e792a43df5 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/README.md
+++ b/examples/huggingface/pytorch/text-generation/quantization/README.md
@@ -20,7 +20,7 @@
 cd intel-extension-for-pytorch
 git submodule sync && git submodule update --init --recursive
 python setup.py install
 ```
-We use the local GPTJ defination script `modeling_gptj.py` in `run_generation.py`. Here is a little change to success trace.
+We use the GPTJ model definition script [modeling_gptj.py](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/transformers/modeling/gptj/modeling_gptj.py) in `run_generation.py`. Here is a small change required for a successful trace.
 ```diff
 # Line 602 in modeling_gptj.py on transformers 4.28.1
diff --git a/intel_extension_for_transformers/transformers/modeling/gpt_bigcode/__init__.py b/intel_extension_for_transformers/transformers/modeling/gpt_bigcode/__init__.py
new file mode 100644
index 00000000000..96a9dc31f6b
--- /dev/null
+++ b/intel_extension_for_transformers/transformers/modeling/gpt_bigcode/__init__.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/examples/huggingface/pytorch/code-generation/quantization/modeling_gpt_bigcode.py b/intel_extension_for_transformers/transformers/modeling/gpt_bigcode/modeling_gpt_bigcode.py
similarity index 96%
rename from examples/huggingface/pytorch/code-generation/quantization/modeling_gpt_bigcode.py
rename to intel_extension_for_transformers/transformers/modeling/gpt_bigcode/modeling_gpt_bigcode.py
index c758a5d6839..fd712d4b2de 100644
--- a/examples/huggingface/pytorch/code-generation/quantization/modeling_gpt_bigcode.py
+++ b/intel_extension_for_transformers/transformers/modeling/gpt_bigcode/modeling_gpt_bigcode.py
@@ -1,5 +1,23 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # coding=utf-8
-# Copyright 2023 The Bigcode team and HuggingFace Inc. team.
+# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved.
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -383,8 +401,9 @@ def _init_weights(self, module):
         """Initialize the weights."""
         if isinstance(module, (GPTBigCodeMLP, GPTBigCodeAttention)):
             # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
-            # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
-            # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+            # > A modified initialization which accounts for the accumulation on the residual path with model depth.
+            # > Scale the weights of residual layers at initialization by a factor of 1/√N where N is
+            # > the # of residual layers.
             # > -- GPT-2 :: https://openai.com/blog/better-language-models/
             #
             # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
@@ -406,7 +425,8 @@ def _init_weights(self, module):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)

-    # Copied from transformers.models.gpt2.modeling_gpt2.GPT2PreTrainedModel._set_gradient_checkpointing with GPT2->GPTBigCode
+    # Copied from transformers.models.gpt2.modeling_gpt2.GPT2PreTrainedModel._set_gradient_checkpointing with
+    # GPT2->GPTBigCode
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, GPTBigCodeModel):
             module.gradient_checkpointing = value
@@ -477,15 +497,15 @@ def _set_gradient_checkpointing(self, module, value=False):
             - 0 indicates the head is **masked**.

         inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.

             If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
             `past_key_values`).
         use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+            (see `past_key_values`).
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -717,8 +737,8 @@ def custom_forward(*inputs):

 @add_start_docstrings(
     """
-    The GPT_BIGCODE Model transformer with a language modeling head on top (linear layer with weights tied to the input
-    embeddings).
+    The GPT_BIGCODE Model transformer with a language modeling head on top
+    (linear layer with weights tied to the input embeddings).
     """,
     GPT_BIGCODE_START_DOCSTRING,
 )
@@ -868,10 +888,10 @@ def _reorder_cache(
     models (e.g. GPT-1) do.

     Since it does classification on the last token, it requires to know the position of the last token. If a
-    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
-    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
-    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
-    each row of the batch).
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row.
+    If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess
+    the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value
+    in each row of the batch).
""", GPT_BIGCODE_START_DOCSTRING, ) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_causal.py b/intel_extension_for_transformers/transformers/modeling/modeling_causal.py index dfcc25aa366..da3f8cef44e 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_causal.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_causal.py @@ -46,6 +46,9 @@ from .bloom.modeling_bloom import BloomForCausalLM from .gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM from .opt.modeling_opt import OPTForCausalLM +from .gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeForCausalLM +# to use modeling modification base transformers 4.30.2: +transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTJForCausalLM = GPTBigCodeForCausalLM # to use modeling modification base transformers 4.28.1: transformers.models.gptj.modeling_gptj.GPTJForCausalLM = GPTJForCausalLM transformers.models.llama.modeling_llama.LlamaForCausalLM = LlamaForCausalLM @@ -118,4 +121,4 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): ) class AutoModelForCausalLM(_BaseAutoModelClass): - _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING \ No newline at end of file + _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING