Commit

Merge branch 'huggingface:main' into fix_fsdp_with_fp8_in_trainer

eljandoubi authored Oct 21, 2024
2 parents d7a0194 + 32590b5 commit 58d18f6
Showing 48 changed files with 5,883 additions and 1,176 deletions.
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
@@ -414,6 +414,8 @@
title: Gemma
- local: model_doc/gemma2
title: Gemma2
- local: model_doc/glm
title: GLM
- local: model_doc/openai-gpt
title: GPT
- local: model_doc/gpt_neo
6 changes: 3 additions & 3 deletions docs/source/en/agents.md
@@ -332,7 +332,7 @@ This code can quickly be converted into a tool, just by wrapping it in a function:
from transformers import tool

@tool
-def model_download_counter(task: str) -> str:
+def model_download_tool(task: str) -> str:
"""
This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub.
It returns the name of the checkpoint.
@@ -345,7 +345,7 @@ def model_download_counter(task: str) -> str:
```

The function needs:
- A clear name. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's put `model_download_counter`.
- A clear name. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name it `model_download_tool`.
- Type hints on both inputs and output
- A description that includes an 'Args:' part where each argument is described (without a type indication this time; it will be pulled from the type hint).
All these will be automatically baked into the agent's system prompt upon initialization, so strive to make them as clear as possible!
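Putting these requirements together, the full tool (elided above by the fold) might look like the sketch below; the `list_models` query in the body is an assumption for illustration, not the file's verbatim contents:

```python
from huggingface_hub import list_models
from transformers import tool

@tool
def model_download_tool(task: str) -> str:
    """
    This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub.
    It returns the name of the checkpoint.

    Args:
        task: The task for which to get the download count.
    """
    # Ask the Hub for models matching the task, sorted by downloads, and keep the top hit
    most_downloaded_model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
    return most_downloaded_model.id
```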
@@ -367,7 +367,7 @@ You get the following:
======== New task ========
Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?
==== Agent is executing the code below:
-most_downloaded_model = model_download_counter(task="text-to-video")
+most_downloaded_model = model_download_tool(task="text-to-video")
print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.")
====
```
29 changes: 29 additions & 0 deletions docs/source/en/chat_templating.md
@@ -943,6 +943,35 @@ all implementations of Jinja:
- Directly rendering a dict or list may give different results in other implementations (for example, string entries
might change from single-quoted to double-quoted). Adding the `tojson` filter can help to ensure consistency here.
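  For example, instead of rendering a list of messages directly, a template can serialize it explicitly (a one-line illustrative sketch):

```text
{{- messages | tojson }}
```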

### Writing generation prompts

We mentioned above that `add_generation_prompt` is a special variable that will be accessible inside your template,
and is controlled by the user setting the `add_generation_prompt` flag. If your model expects a header for
assistant messages, then your template must support adding the header when `add_generation_prompt` is set.

Here is an example of a template that formats messages ChatML-style, with generation prompt support:

```text
{{- bos_token }}
{%- for message in messages %}
{{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- endif %}
```

The exact content of the assistant header will depend on your specific model, but it should always be **the string
that represents the start of an assistant message**, so that if the user applies your template with
`add_generation_prompt=True` and then generates text, the model will write an assistant response. Also note that some
models do not need a generation prompt, because assistant messages always begin immediately after user messages.
This is particularly common for LLaMA and Mistral models, where assistant messages begin immediately after the `[/INST]`
token that ends user messages. In these cases, the template can ignore the `add_generation_prompt` flag.

Generation prompts are important! If your model requires a generation prompt but it is not set in the template, then
model generations will likely be severely degraded, or the model may display unusual behaviour like continuing
the final user message!
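To check this behaviour for a given template, you can render a conversation with the flag unset and then set, and compare how the two renderings end (a minimal sketch; the checkpoint is an illustrative placeholder that ships a chat template):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # illustrative checkpoint
messages = [{"role": "user", "content": "Hi there!"}]

# Without the flag, the rendered chat stops at the end of the user turn
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False))

# With the flag, the assistant header is appended, so the model replies as the assistant
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
```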

### Writing and debugging larger templates

When this feature was introduced, most templates were quite small, the Jinja equivalent of a "one-liner" script.
1 change: 1 addition & 0 deletions docs/source/en/index.md
@@ -150,6 +150,7 @@ Flax), PyTorch, and/or TensorFlow.
| [Gemma](model_doc/gemma) | ✅ | ❌ | ✅ |
| [Gemma2](model_doc/gemma2) | ✅ | ❌ | ❌ |
| [GIT](model_doc/git) | ✅ | ❌ | ❌ |
| [GLM](model_doc/glm) | ✅ | ❌ | ❌ |
| [GLPN](model_doc/glpn) | ✅ | ❌ | ❌ |
| [GPT Neo](model_doc/gpt_neo) | ✅ | ❌ | ✅ |
| [GPT NeoX](model_doc/gpt_neox) | ✅ | ❌ | ❌ |
9 changes: 9 additions & 0 deletions docs/source/en/model_doc/detr.md
@@ -181,6 +181,15 @@ If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it!
- post_process_instance_segmentation
- post_process_panoptic_segmentation

## DetrImageProcessorFast

[[autodoc]] DetrImageProcessorFast
- preprocess
- post_process_object_detection
- post_process_semantic_segmentation
- post_process_instance_segmentation
- post_process_panoptic_segmentation
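
As a quick orientation for the new class, loading and running it could look like the following (a minimal sketch assuming the standard `from_pretrained` API, the `facebook/detr-resnet-50` checkpoint, and an installed torchvision):

```python
from PIL import Image
from transformers import DetrImageProcessorFast

processor = DetrImageProcessorFast.from_pretrained("facebook/detr-resnet-50")

image = Image.new("RGB", (640, 480))  # placeholder image for illustration
inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # e.g. torch.Size([1, 3, 800, 1066])
```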

## DetrFeatureExtractor

[[autodoc]] DetrFeatureExtractor
99 changes: 99 additions & 0 deletions docs/source/en/model_doc/glm.md
@@ -0,0 +1,99 @@
<!--Copyright 2024 The GLM & ZhipuAI team and The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# GLM

## Overview

The GLM Model was proposed
in [ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools](https://arxiv.org/html/2406.12793v1)
by GLM Team, THUDM & ZhipuAI.

The abstract from the paper is the following:

*We introduce ChatGLM, an evolving family of large language models that we have been developing over time. This report
primarily focuses on the GLM-4 language series, which includes GLM-4, GLM-4-Air, and GLM-4-9B. They represent our most
capable models that are trained with all the insights and lessons gained from the preceding three generations of
ChatGLM. To date, the GLM-4 models are pre-trained on ten trillions of tokens mostly in Chinese and English, along with
a small set of corpus from 24 languages, and aligned primarily for Chinese and English usage. The high-quality alignment
is achieved via a multi-stage post-training process, which involves supervised fine-tuning and learning from human
feedback. Evaluations show that GLM-4 1) closely rivals or outperforms GPT-4 in terms of general metrics such as MMLU,
GSM8K, MATH, BBH, GPQA, and HumanEval, 2) gets close to GPT-4-Turbo in instruction following as measured by IFEval, 3)
matches GPT-4 Turbo (128K) and Claude 3 for long context tasks, and 4) outperforms GPT-4 in Chinese alignments as
measured by AlignBench. The GLM-4 All Tools model is further aligned to understand user intent and autonomously decide
when and which tool(s) to use—including web browser, Python interpreter, text-to-image model, and user-defined
functions—to effectively complete complex tasks. In practical applications, it matches and even surpasses GPT-4 All
Tools in tasks like accessing online information via web browsing and solving math problems using Python interpreter.
Over the course, we have open-sourced a series of models, including ChatGLM-6B (three generations), GLM-4-9B (128K, 1M),
GLM-4V-9B, WebGLM, and CodeGeeX, attracting over 10 million downloads on Hugging face in the year 2023 alone.*

Tips:

- This model was contributed by [THUDM](https://huggingface.co/THUDM). The most recent code can be
found [here](https://github.com/thudm/GLM-4).


## Usage tips

`GLM-4` can be found on the [Hugging Face Hub](https://huggingface.co/collections/THUDM/glm-4-665fcf188c414b03c2f7e3b7).

In the following, we demonstrate how to use `glm-4-9b-chat` for inference. Note that the model uses the ChatML format for dialogue; this demo shows how to apply it with `apply_chat_template`.

```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> device = "cuda" # the device to load the model onto

>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat")

>>> prompt = "Give me a short introduction to large language model."

>>> messages = [{"role": "user", "content": prompt}]

>>> text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

>>> model_inputs = tokenizer([text], return_tensors="pt").to(device)

>>> generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True)

>>> generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]

>>> response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
```

## GlmConfig

[[autodoc]] GlmConfig

## GlmModel

[[autodoc]] GlmModel
- forward

## GlmForCausalLM

[[autodoc]] GlmForCausalLM
- forward

## GlmForSequenceClassification

[[autodoc]] GlmForSequenceClassification
- forward

## GlmForTokenClassification

[[autodoc]] GlmForTokenClassification
- forward
2 changes: 2 additions & 0 deletions docs/source/en/perf_infer_gpu_one.md
@@ -42,6 +42,7 @@ FlashAttention-2 is currently supported for the following architectures:
* [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon)
* [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel)
* [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel)
* [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GlmModel)
* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
* [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel)
* [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel)
@@ -216,6 +217,7 @@ For now, Transformers supports SDPA inference and training for the following architectures:
* [CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.CamembertModel)
* [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon)
* [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel)
* [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GlmModel)
* [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel)
* [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel)
* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
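Opting into one of these backends for a supported model, e.g. the newly added GLM, could look like this (a minimal sketch assuming a CUDA machine with flash-attn installed):

```python
import torch
from transformers import AutoModelForCausalLM

# Request FlashAttention-2 at load time; pass "sdpa" instead to use scaled dot-product attention
model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4-9b-chat",
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
```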
9 changes: 9 additions & 0 deletions docs/source/ja/model_doc/detr.md
@@ -184,6 +184,15 @@ Official Hugging Face and community resources to help you get started with DETR.
- post_process_instance_segmentation
- post_process_panoptic_segmentation

## DetrImageProcessorFast

[[autodoc]] DetrImageProcessorFast
- preprocess
- post_process_object_detection
- post_process_semantic_segmentation
- post_process_instance_segmentation
- post_process_panoptic_segmentation

## DetrFeatureExtractor

[[autodoc]] DetrFeatureExtractor
22 changes: 20 additions & 2 deletions src/transformers/__init__.py
@@ -454,6 +454,7 @@
"GitProcessor",
"GitVisionConfig",
],
"models.glm": ["GlmConfig"],
"models.glpn": ["GLPNConfig"],
"models.gpt2": [
"GPT2Config",
@@ -1190,7 +1191,7 @@
_import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor")
_import_structure["models.deprecated.tvlt"].append("TvltImageProcessor")
_import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"])
_import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor"])
_import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor", "DetrImageProcessorFast"])
_import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"])
_import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"])
_import_structure["models.efficientnet"].append("EfficientNetImageProcessor")
@@ -2294,6 +2295,15 @@
"GitVisionModel",
]
)
_import_structure["models.glm"].extend(
[
"GlmForCausalLM",
"GlmForSequenceClassification",
"GlmForTokenClassification",
"GlmModel",
"GlmPreTrainedModel",
]
)
_import_structure["models.glpn"].extend(
[
"GLPNForDepthEstimation",
@@ -5304,6 +5314,7 @@
GitProcessor,
GitVisionConfig,
)
from .models.glm import GlmConfig
from .models.glpn import GLPNConfig
from .models.gpt2 import (
GPT2Config,
@@ -6079,7 +6090,7 @@
from .models.deprecated.efficientformer import EfficientFormerImageProcessor
from .models.deprecated.tvlt import TvltImageProcessor
from .models.deprecated.vit_hybrid import ViTHybridImageProcessor
-from .models.detr import DetrFeatureExtractor, DetrImageProcessor
+from .models.detr import DetrFeatureExtractor, DetrImageProcessor, DetrImageProcessorFast
from .models.donut import DonutFeatureExtractor, DonutImageProcessor
from .models.dpt import DPTFeatureExtractor, DPTImageProcessor
from .models.efficientnet import EfficientNetImageProcessor
@@ -7024,6 +7035,13 @@
GitPreTrainedModel,
GitVisionModel,
)
from .models.glm import (
GlmForCausalLM,
GlmForSequenceClassification,
GlmForTokenClassification,
GlmModel,
GlmPreTrainedModel,
)
from .models.glpn import (
GLPNForDepthEstimation,
GLPNModel,
9 changes: 6 additions & 3 deletions src/transformers/image_transforms.py
@@ -32,6 +32,7 @@
is_tf_available,
is_torch_available,
is_torchvision_available,
is_torchvision_v2_available,
is_vision_available,
requires_backends,
)
@@ -51,7 +52,9 @@
if is_flax_available():
import jax.numpy as jnp

-if is_torchvision_available():
+if is_torchvision_v2_available():
+    from torchvision.transforms.v2 import functional as F
+elif is_torchvision_available():
from torchvision.transforms import functional as F


@@ -123,11 +126,11 @@ def rescale(
if not isinstance(image, np.ndarray):
raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}")

-rescaled_image = image * scale
+rescaled_image = image.astype(np.float64) * scale  # Numpy type promotion has changed, so always upcast first
if data_format is not None:
rescaled_image = to_channel_dimension_format(rescaled_image, data_format, input_data_format)

-rescaled_image = rescaled_image.astype(dtype)
+rescaled_image = rescaled_image.astype(dtype)  # Finally downcast to the desired dtype at the end

return rescaled_image
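
The new comments refer to NumPy's revised promotion rules (NEP 50, the default in NumPy 2.x): mixing an integer image with a float scalar can now produce a different dtype than under the old value-based casting, so the code pins the intermediate to `float64` and only downcasts once at the end. A standalone illustration of the drift this guards against (an illustrative sketch, not part of the diff):

```python
import numpy as np

image = np.arange(6, dtype=np.uint8).reshape(2, 3)
scale = np.float32(1 / 255)

naive = image * scale                        # result dtype depends on the NumPy version
explicit = image.astype(np.float64) * scale  # always float64, as in the patched rescale()

# May print e.g. "float16 float64" under NumPy 1.x value-based casting,
# but "float32 float64" under NumPy 2.x (NEP 50)
print(naive.dtype, explicit.dtype)
```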

1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
@@ -97,6 +97,7 @@
gemma,
gemma2,
git,
glm,
glpn,
gpt2,
gpt_bigcode,
2 changes: 2 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -114,6 +114,7 @@
("gemma", "GemmaConfig"),
("gemma2", "Gemma2Config"),
("git", "GitConfig"),
("glm", "GlmConfig"),
("glpn", "GLPNConfig"),
("gpt-sw3", "GPT2Config"),
("gpt2", "GPT2Config"),
@@ -416,6 +417,7 @@
("gemma", "Gemma"),
("gemma2", "Gemma2"),
("git", "GIT"),
("glm", "GLM"),
("glpn", "GLPN"),
("gpt-sw3", "GPT-Sw3"),
("gpt2", "OpenAI GPT-2"),
2 changes: 1 addition & 1 deletion src/transformers/models/auto/image_processing_auto.py
@@ -72,7 +72,7 @@
("deit", ("DeiTImageProcessor",)),
("depth_anything", ("DPTImageProcessor",)),
("deta", ("DetaImageProcessor",)),
("detr", ("DetrImageProcessor",)),
("detr", ("DetrImageProcessor", "DetrImageProcessorFast")),
("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")),
("dinov2", ("BitImageProcessor",)),
("donut-swin", ("DonutImageProcessor",)),
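Because the mapping now registers a fast variant for `detr`, callers can request it through the auto class (a minimal sketch assuming the `use_fast` flag of `AutoImageProcessor.from_pretrained` and an installed torchvision):

```python
from transformers import AutoImageProcessor

# Ask the auto class for the fast, torchvision-backed processor registered above
processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True)
print(type(processor).__name__)  # expected: DetrImageProcessorFast
```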
4 changes: 4 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -111,6 +111,7 @@
("gemma", "GemmaModel"),
("gemma2", "Gemma2Model"),
("git", "GitModel"),
("glm", "GlmModel"),
("glpn", "GLPNModel"),
("gpt-sw3", "GPT2Model"),
("gpt2", "GPT2Model"),
@@ -486,6 +487,7 @@
("gemma", "GemmaForCausalLM"),
("gemma2", "Gemma2ForCausalLM"),
("git", "GitForCausalLM"),
("glm", "GlmForCausalLM"),
("gpt-sw3", "GPT2LMHeadModel"),
("gpt2", "GPT2LMHeadModel"),
("gpt_bigcode", "GPTBigCodeForCausalLM"),
@@ -941,6 +943,7 @@
("funnel", "FunnelForSequenceClassification"),
("gemma", "GemmaForSequenceClassification"),
("gemma2", "Gemma2ForSequenceClassification"),
("glm", "GlmForSequenceClassification"),
("gpt-sw3", "GPT2ForSequenceClassification"),
("gpt2", "GPT2ForSequenceClassification"),
("gpt_bigcode", "GPTBigCodeForSequenceClassification"),
@@ -1131,6 +1134,7 @@
("funnel", "FunnelForTokenClassification"),
("gemma", "GemmaForTokenClassification"),
("gemma2", "Gemma2ForTokenClassification"),
("glm", "GlmForTokenClassification"),
("gpt-sw3", "GPT2ForTokenClassification"),
("gpt2", "GPT2ForTokenClassification"),
("gpt_bigcode", "GPTBigCodeForTokenClassification"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/tokenization_auto.py
@@ -204,6 +204,7 @@
),
),
("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),