Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Alpaca pydantic class for easy conversion, validation, and structured output generation #1202

Merged
merged 10 commits into from
Dec 2, 2024
2 changes: 1 addition & 1 deletion camel/messages/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
HermesFunctionFormatter,
ShareGPTMessage,
)
from .conversion.models import (
from .conversion.conversation_models import (
ShareGPTConversation,
)
from .conversion.sharegpt.function_call_formatter import (
Expand Down
4 changes: 3 additions & 1 deletion camel/messages/conversion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

from .models import (
from .alpaca import AlpacaItem
from .conversation_models import (
ShareGPTConversation,
ShareGPTMessage,
ToolCall,
Expand All @@ -24,6 +25,7 @@
'ShareGPTMessage',
'ShareGPTConversation',
'HermesFunctionFormatter',
'AlpacaItem',
'ToolCall',
'ToolResponse',
]
115 changes: 115 additions & 0 deletions camel/messages/conversion/alpaca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
Wendong-Fan marked this conversation as resolved.
Show resolved Hide resolved
import re

from pydantic import BaseModel, Field, field_validator


class AlpacaItem(BaseModel):
    r"""Represents an instruction-response item in the Alpaca format.

    Appropriate for both cases where the input field is empty or populated.
    Provides parsing from string format using the class method
    :meth:`from_string` and serialization using :meth:`to_string`.

    Args:
        instruction (str): The instruction/question/prompt
        input (str): Input context or examples (put empty string if none)
        output (str): The response/answer to the instruction
    """

    instruction: str = Field(description="The instruction/question/prompt")
    input: str = Field(
        description="Optional context or input for the task."
        " For example, when the instruction is \"Summarize the "
        "following article\", the input is the article."
    )
    output: str = Field(description="The response/answer to the instruction")

    # NOTE(review): `input` is intentionally left out of this validator to
    # keep accepting the same data as before; an `input` value that contains
    # a section header will not survive a to_string()/from_string() round
    # trip.
    @field_validator('instruction', 'output')
    @classmethod
    def no_section_markers(cls, value: str) -> str:
        r"""Ensures fields don't contain section markers like
        '### Response:'.

        Args:
            value (str): Raw value of the ``instruction`` or ``output``
                field.

        Returns:
            str: The value with leading/trailing whitespace removed.

        Raises:
            ValueError: If the value contains '### Response',
                '### Instruction' or '### Input'.
        """
        markers = ('### Response', '### Instruction', '### Input')
        if any(marker in value for marker in markers):
            raise ValueError("Field cannot contain section markers")
        return value.strip()

    @classmethod
    def from_string(cls, text: str) -> "AlpacaItem":
        r"""Creates an AlpacaItem from a formatted string.

        Args:
            text: String in either of these formats:
                With input:
                ### Instruction:
                {instruction}
                ### Input:
                {input}
                ### Response:
                {response}

                Without input:
                ### Instruction:
                {instruction}
                ### Response:
                {response}

                An ``### Input:`` section with an empty body (as produced
                by ``to_string()`` when ``input`` is empty) is accepted and
                parsed as an empty input.

        Returns:
            AlpacaItem: Parsed instance

        Raises:
            ValueError: text doesn't match expected format, or the
                instruction/response sections are missing or empty
        """
        # Strip and standardize newlines
        text = text.strip().replace('\r\n', '\n')

        # A section body ends at the next *known* section header (or at the
        # end of the text). Other markdown headings such as "### Notes"
        # therefore stay inside the body instead of truncating it.
        next_header = r'(?=\n###\s*(?:Instruction|Input|Response):|\Z)'

        sections = {}
        for name in ('Instruction', 'Input', 'Response'):
            # Only horizontal whitespace may follow the colon, and the body
            # may be empty: a greedy `\s*` here would swallow the blank
            # lines of an empty section and capture the following header as
            # if it were this section's content.
            found = re.search(
                rf'###\s*{name}:[ \t]*(?=\n|\Z)(.*?)' + next_header,
                text,
                re.DOTALL,
            )
            sections[name] = found.group(1).strip() if found else None

        instruction = sections['Instruction']
        response = sections['Response']

        if instruction is None or response is None:
            raise ValueError(
                "Text must contain '### Instruction:'"
                " and '### Response:' sections"
            )
        if not instruction or not response:
            raise ValueError(
                "'### Instruction:' and '### Response:' sections"
                " must not be empty"
            )

        return cls(
            instruction=instruction,
            # A missing Input section and an empty one both mean "no input".
            input=sections['Input'] or "",
            output=response,
        )

    def to_string(self) -> str:
        r"""Converts the AlpacaItem to its string representation.

        The ``### Input:`` section is always emitted, with an empty body
        when ``input`` is an empty string.

        Returns:
            str: Formatted string representation with section markers
        """
        return "\n".join(
            [
                "### Instruction:",
                self.instruction,
                "",
                "### Input:",
                self.input,
                "",
                "### Response:",
                self.output,
            ]
        )
Loading