Skip to content

Commit

Permalink
feat: Alpaca pydantic class for easy conversion, validation, and structured output generation (#1202)
Browse files Browse the repository at this point in the history

Signed-off-by: Caelum Forder <caelum119@gmail.com>
Co-authored-by: Wendong-Fan <133094783+Wendong-Fan@users.noreply.github.com>
Co-authored-by: Wendong <w3ndong.fan@gmail.com>
  • Loading branch information
3 people authored Dec 2, 2024
1 parent bce437a commit 3dd959a
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 2 deletions.
2 changes: 1 addition & 1 deletion camel/messages/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
HermesFunctionFormatter,
ShareGPTMessage,
)
from .conversion.models import (
from .conversion.conversation_models import (
ShareGPTConversation,
)
from .conversion.sharegpt.function_call_formatter import (
Expand Down
4 changes: 3 additions & 1 deletion camel/messages/conversion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

from .models import (
from .alpaca import AlpacaItem
from .conversation_models import (
ShareGPTConversation,
ShareGPTMessage,
ToolCall,
Expand All @@ -24,6 +25,7 @@
'ShareGPTMessage',
'ShareGPTConversation',
'HermesFunctionFormatter',
'AlpacaItem',
'ToolCall',
'ToolResponse',
]
122 changes: 122 additions & 0 deletions camel/messages/conversion/alpaca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import re

from pydantic import BaseModel, Field, field_validator


class AlpacaItem(BaseModel):
    r"""Represents an instruction-response item in the Alpaca format.

    Appropriate for both cases where the input field is empty, or populated.
    Provides parsing from string format using the class method
    ``from_string()`` and serialization back to text via ``to_string()``.

    Args:
        instruction (str): The instruction/question/prompt
        input (str): Input context or examples (put empty string if none)
        output (str): The response/answer to the instruction
    """

    instruction: str = Field(description="The instruction/question/prompt")
    input: str = Field(
        description="Optional context or input for the task."
        " For example, when the instruction is \"Summarize the "
        "following article\", the input is the article."
    )
    output: str = Field(description="The response/answer to the instruction")

    # NOTE(review): ``input`` is deliberately left out of this validator to
    # keep the existing contract; an ``input`` value that itself contains a
    # section marker will not survive a to_string()/from_string() round trip.
    @field_validator('instruction', 'output')
    @classmethod
    def no_section_markers(cls, value: str) -> str:
        r"""Ensures fields don't contain section markers like '###
        Response:'.

        Args:
            value (str): Raw value of the ``instruction`` or ``output`` field.

        Returns:
            str: The value with leading/trailing whitespace removed.

        Raises:
            ValueError: If the value contains '### Response',
                '### Instruction' or '### Input'.
        """
        if (
            '### Response' in value
            or '### Instruction' in value
            or '### Input' in value
        ):
            raise ValueError("Field cannot contain section markers")
        return value.strip()

    @classmethod
    def from_string(cls, text: str) -> "AlpacaItem":
        r"""Creates an AlpacaItem from a formatted string.

        Args:
            text: String in either of these formats:
                With input:
                    ### Instruction:
                    {instruction}
                    ### Input:
                    {input}
                    ### Response:
                    {response}
                Without input:
                    ### Instruction:
                    {instruction}
                    ### Response:
                    {response}
                An ``### Input:`` section with an empty body (as produced
                by ``to_string()`` when ``input`` is empty) is parsed as an
                empty input.

        Returns:
            AlpacaItem: Parsed instance

        Raises:
            ValueError: text doesn't match expected format, or the
                instruction/response sections are missing or empty
        """
        # Strip and standardize newlines
        text = text.strip().replace('\r\n', '\n')

        def _section(name):
            # Returns the stripped body of the "### <name>:" section, or
            # None when the header is absent. After the colon only
            # same-line whitespace is consumed before the newline, and the
            # body may be empty; otherwise an empty section would swallow
            # the header of the section that follows it. A body ends at the
            # next line starting with "###" or at the end of the text.
            found = re.search(
                rf'###\s*{name}:[^\S\n]*\n(.*?)(?=\n###|\Z)',
                text,
                re.DOTALL,
            )
            return None if found is None else found.group(1).strip()

        instruction = _section("Instruction")
        input_text = _section("Input")
        response = _section("Response")

        if instruction is None or response is None:
            raise ValueError(
                "Text must contain '### Instruction:'"
                " and '### Response:' sections"
            )
        # Only the input section is optional; an empty instruction or
        # response is treated as malformed text.
        if not instruction or not response:
            raise ValueError(
                "'### Instruction:' and '### Response:' sections"
                " must not be empty"
            )

        return cls(
            instruction=instruction,
            input=input_text or "",
            output=response,
        )

    def to_string(self) -> str:
        r"""Converts the AlpacaItem to its string representation.

        The ``### Input:`` section is always emitted, even when ``input``
        is an empty string.

        Returns:
            str: Formatted string representation with sections markers
        """
        return "\n".join(
            [
                "### Instruction:",
                self.instruction,
                "",
                "### Input:",
                self.input,
                "",
                "### Response:",
                self.output,
            ]
        )
File renamed without changes.

0 comments on commit 3dd959a

Please sign in to comment.