Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: structured loader #1395

Merged
merged 28 commits into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
a309bc7
support workforce
liuxukun2000 Dec 10, 2024
ea738a1
pass pre-commit
liuxukun2000 Dec 10, 2024
e01e108
update workforce example
liuxukun2000 Dec 27, 2024
4fcf24a
add PandaReader
liuxukun2000 Dec 27, 2024
937e1b2
add test, rewrite code
liuxukun2000 Jan 3, 2025
3d80abb
Merge branch 'master' into feat/structured_loader
Wendong-Fan Jan 3, 2025
9b13192
clean up commit history
liuxukun2000 Jan 6, 2025
9fa3cc5
remove unused vars
liuxukun2000 Jan 6, 2025
9235a3e
Merge branch 'master' into feat/structured_loader
liuxukun2000 Jan 6, 2025
a26864c
update dependency
liuxukun2000 Jan 7, 2025
b7c8152
Merge branch 'feat/structured_loader' of https://github.com/camel-ai/…
liuxukun2000 Jan 7, 2025
00aa054
Merge branch 'master' into feat/structured_loader
liuxukun2000 Jan 7, 2025
89fe187
update lock
liuxukun2000 Jan 7, 2025
6dff13b
Merge branch 'master' into feat/structured_loader
Wendong-Fan Jan 8, 2025
a25cfe5
remove dup code
liuxukun2000 Jan 10, 2025
13f3513
add decorator
liuxukun2000 Jan 10, 2025
97ecf28
minor format update
Wendong-Fan Jan 10, 2025
ca31672
update lock file
Wendong-Fan Jan 10, 2025
6cc1699
Merge branch 'master' into feat/structured_loader
liuxukun2000 Jan 15, 2025
5dc65d8
update lock
liuxukun2000 Jan 15, 2025
0339306
Merge branch 'feat/structured_loader' of https://github.com/camel-ai/…
liuxukun2000 Jan 17, 2025
5d66360
remove logger filter
liuxukun2000 Jan 17, 2025
578493a
update lock
liuxukun2000 Jan 17, 2025
e9a3b6f
Merge branch 'master' into feat/structured_loader
liuxukun2000 Jan 17, 2025
baa1ca2
update lock file
Wendong-Fan Jan 17, 2025
ab0759a
Merge branch 'master' into feat/structured_loader
Wendong-Fan Jan 17, 2025
322835e
type fix
Wendong-Fan Jan 17, 2025
31fad63
update doc
Wendong-Fan Jan 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions camel/benchmarks/apibank.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ def __init__(self, samples: List[APIBankSample]):
# Place holder for import as the import
# only works after the files have been downloaded
try:
from api_bank.tool_manager import ( # type: ignore[import-not-found]
from api_bank.tool_manager import ( # type: ignore[import-untyped]
liuxukun2000 marked this conversation as resolved.
Show resolved Hide resolved
ToolManager,
)
except Exception as e:
Expand Down Expand Up @@ -496,7 +496,7 @@ def get_model_input(self, sample_id: int):

def evaluate(self, sample_id, model_output):
try:
from api_bank.api_call_extraction import ( # type: ignore[import-not-found]
from api_bank.api_call_extraction import ( # type: ignore[import-untyped]
parse_api_call,
)
except Exception as e:
Expand Down
4 changes: 2 additions & 2 deletions camel/benchmarks/gaia.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,11 +280,11 @@ def _prepare_task(self, task: Dict[str, Any]) -> bool:
f"Skipping task because file not found: {file_path}"
)
return False
if file_path.suffix in ['.pdf', '.docx', '.doc', '.txt']:
if file_path.suffix in [".pdf", ".docx", ".doc", ".txt"]:
if not self.retriever.reset(task_id=task["task_id"]):
return False
retrieved_info = self.retriever.retrieve(
query=task["Question"], contents=[task['file_name']]
query=task["Question"], contents=[task["file_name"]]
)
retrieved_content = [
item["text"]
Expand Down
2 changes: 2 additions & 0 deletions camel/loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .chunkr_reader import ChunkrReader
from .firecrawl_reader import Firecrawl
from .jina_url_reader import JinaURLReader
from .panda_reader import PandaReader
from .unstructured_io import UnstructuredIO

__all__ = [
Expand All @@ -28,4 +29,5 @@
'Firecrawl',
'Apify',
'ChunkrReader',
'PandaReader',
]
325 changes: 325 additions & 0 deletions camel/loaders/panda_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,325 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import os
from functools import wraps
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union

import pandas as pd # type: ignore[import-untyped]
from pandas import DataFrame
from pandasai import SmartDataframe # type: ignore[import-untyped]
from pandasai.llm import OpenAI # type: ignore[import-untyped]


def check_suffix(valid_suffixs: List[str]) -> Callable:
    r"""A decorator factory that validates the suffix of a file path.

    The decorated method must take the file path as its first argument
    after ``self``. The suffix comparison is case-insensitive, so
    ``data.CSV`` is accepted when ``".csv"`` is listed.

    Args:
        valid_suffixs (List[str]): The accepted file suffixes, each
            including the leading dot (e.g. ``[".xlsx", ".xls"]``).

    Returns:
        Callable: The decorator function. The method it wraps raises
            :obj:`ValueError` when called with a file path whose suffix
            is not one of :obj:`valid_suffixs`.
    """
    # Normalise once at decoration time instead of on every call.
    accepted = {suffix.lower() for suffix in valid_suffixs}

    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(
            self, file_path: str, *args: Any, **kwargs: Any
        ) -> DataFrame:
            suffix = Path(file_path).suffix.lower()
            if suffix not in accepted:
                raise ValueError(
                    f"Only {', '.join(valid_suffixs)} files are supported"
                )
            return func(self, file_path, *args, **kwargs)

        return wrapper

    return decorator


class PandaReader:
def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
    r"""Initializes the PandaReader class.

    Args:
        config (Optional[Dict[str, Any]], optional): The configuration
            dictionary passed to every :obj:`SmartDataframe` this reader
            creates. When it has no ``"llm"`` entry, a pandasai
            :obj:`OpenAI` object is created with the value of the
            ``OPENAI_API_KEY`` environment variable as its API token.
            The dictionary is copied, so the caller's object is never
            modified. (default: :obj:`None`)
    """
    # Shallow-copy so that injecting the default "llm" below does not
    # leak into the dictionary owned by the caller.
    self.config = dict(config) if config else {}
    if "llm" not in self.config:
        self.config["llm"] = OpenAI(
            api_token=os.getenv("OPENAI_API_KEY"),
        )

    # Dispatch table used by `load`: file suffix -> reader method.
    self.__LOADER = {
        ".csv": self.read_csv,
        ".xlsx": self.read_excel,
        ".xls": self.read_excel,
        ".json": self.read_json,
        ".parquet": self.read_parquet,
        ".sql": self.read_sql,
        ".html": self.read_html,
        ".feather": self.read_feather,
        ".dta": self.read_stata,
        ".sas": self.read_sas,
        ".pkl": self.read_pickle,
        ".h5": self.read_hdf,
        ".orc": self.read_orc,
    }

def load(
    self, data: Union[DataFrame, str], *args: Any, **kwargs: Any
) -> SmartDataframe:
    r"""Loads a file or DataFrame and returns a SmartDataframe object.

    A DataFrame is wrapped directly. Anything else is converted to a
    string and treated as a file path or an HTTP(S) URL; the reader
    method is selected from the file suffix.

    Args:
        data (Union[DataFrame, str]): The data to load.
        *args (Any): Additional positional arguments forwarded to the
            selected reader method.
        **kwargs (Any): Additional keyword arguments forwarded to the
            selected reader method.

    Returns:
        SmartDataframe: The SmartDataframe object.

    Raises:
        FileNotFoundError: If :obj:`data` is not an HTTP(S) URL and the
            path does not exist.
        ValueError: If no reader is registered for the file suffix.
    """
    if isinstance(data, DataFrame):
        return SmartDataframe(data, config=self.config)

    file_path = str(data)
    path = Path(file_path)
    # Only local paths can be checked for existence. Matching the full
    # scheme keeps local names such as "http_dump.csv" from being
    # mistaken for URLs.
    is_url = file_path.startswith(("http://", "https://"))
    if not is_url and not path.exists():
        raise FileNotFoundError(f"File {file_path} not found")

    loader = self.__LOADER.get(path.suffix)
    if loader is None:
        raise ValueError(f"Unsupported file format: {path.suffix}")
    return SmartDataframe(
        loader(file_path, *args, **kwargs),  # type: ignore[operator]
        config=self.config,
    )
Comment on lines +108 to +120
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the loader is meant to be used by agents; we do not expect to use an LLM to ask questions here. If we do want a chat method, should we at least add a wrapper chat method around pandasai's SmartDataframe, or should we move this into a toolkit instead?


@check_suffix([".csv"])
def read_csv(
    self, file_path: str, *args: Any, **kwargs: Any
) -> DataFrame:
    r"""Reads a CSV file and returns a DataFrame.

    Args:
        file_path (str): The path to the CSV file. Must end in ``.csv``,
            otherwise the :obj:`check_suffix` decorator raises
            :obj:`ValueError`.
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_csv`.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_csv`.

    Returns:
        DataFrame: The DataFrame object.
    """
    return pd.read_csv(file_path, *args, **kwargs)

@check_suffix([".xlsx", ".xls"])
def read_excel(
    self, file_path: str, *args: Any, **kwargs: Any
) -> DataFrame:
    r"""Reads an Excel file and returns a DataFrame.

    Args:
        file_path (str): The path to the Excel file. Must end in
            ``.xlsx`` or ``.xls``, otherwise the :obj:`check_suffix`
            decorator raises :obj:`ValueError`.
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_excel`.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_excel`.

    Returns:
        DataFrame: The DataFrame object.
    """
    return pd.read_excel(file_path, *args, **kwargs)

@check_suffix([".json"])
def read_json(
    self, file_path: str, *args: Any, **kwargs: Any
) -> DataFrame:
    r"""Reads a JSON file and returns a DataFrame.

    Args:
        file_path (str): The path to the JSON file. Must end in
            ``.json``, otherwise the :obj:`check_suffix` decorator raises
            :obj:`ValueError`.
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_json`.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_json`.

    Returns:
        DataFrame: The DataFrame object.
    """
    return pd.read_json(file_path, *args, **kwargs)

@check_suffix([".parquet"])
def read_parquet(
    self, file_path: str, *args: Any, **kwargs: Any
) -> DataFrame:
    r"""Reads a Parquet file and returns a DataFrame.

    Args:
        file_path (str): The path to the Parquet file. Must end in
            ``.parquet``, otherwise the :obj:`check_suffix` decorator
            raises :obj:`ValueError`.
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_parquet`.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_parquet`.

    Returns:
        DataFrame: The DataFrame object.
    """
    return pd.read_parquet(file_path, *args, **kwargs)

def read_sql(self, *args: Any, **kwargs: Any) -> DataFrame:
    r"""Reads a SQL file or query and returns a DataFrame.

    All arguments are forwarded to :obj:`pandas.read_sql`, so a database
    connection still has to be supplied by the caller. When the first
    positional argument is the path of an existing ``.sql`` file, the
    text of that file (decoded as UTF-8) is used as the query instead of
    the path string itself; any other first argument is passed through
    unchanged.

    Args:
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_sql`. The first one may be a SQL query, a
            table name, or the path to a ``.sql`` file.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_sql`.

    Returns:
        DataFrame: The DataFrame object.
    """
    if args and isinstance(args[0], (str, Path)):
        candidate = Path(args[0])
        try:
            is_sql_file = (
                candidate.suffix.lower() == ".sql" and candidate.is_file()
            )
        except OSError:
            # A query string that merely ends in ".sql" may not be a
            # usable file name (e.g. too long); treat it as a query.
            is_sql_file = False
        if is_sql_file:
            query = candidate.read_text(encoding="utf-8")
            args = (query, *args[1:])
    return pd.read_sql(*args, **kwargs)

def read_table(
    self, file_path: str, *args: Any, **kwargs: Any
) -> DataFrame:
    r"""Reads a general delimited file and returns a DataFrame.

    Unlike most other readers of this class, no file-suffix check is
    applied.

    Args:
        file_path (str): The path to the table.
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_table`.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_table`.

    Returns:
        DataFrame: The DataFrame object.
    """
    return pd.read_table(file_path, *args, **kwargs)

def read_clipboard(self, *args: Any, **kwargs: Any) -> DataFrame:
    r"""Reads the system clipboard and returns a DataFrame.

    Args:
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_clipboard`.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_clipboard`.

    Returns:
        DataFrame: The DataFrame object.
    """
    return pd.read_clipboard(*args, **kwargs)

@check_suffix([".html"])
def read_html(
    self, file_path: str, *args: Any, **kwargs: Any
) -> List[DataFrame]:
    r"""Reads the tables of an HTML file and returns them as DataFrames.

    Args:
        file_path (str): The path to the HTML file. Must end in
            ``.html``, otherwise the :obj:`check_suffix` decorator raises
            :obj:`ValueError`.
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_html`.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_html`.

    Returns:
        List[DataFrame]: One DataFrame per table found in the document;
            :obj:`pandas.read_html` returns a list, not a single
            DataFrame.
    """
    # NOTE(review): `load` hands this list straight to SmartDataframe;
    # confirm whether a single table should be selected there instead.
    return pd.read_html(file_path, *args, **kwargs)

@check_suffix([".feather"])
def read_feather(
    self, file_path: str, *args: Any, **kwargs: Any
) -> DataFrame:
    r"""Reads a Feather file and returns a DataFrame.

    Args:
        file_path (str): The path to the Feather file. Must end in
            ``.feather``, otherwise the :obj:`check_suffix` decorator
            raises :obj:`ValueError`.
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_feather`.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_feather`.

    Returns:
        DataFrame: The DataFrame object.
    """
    return pd.read_feather(file_path, *args, **kwargs)

@check_suffix([".dta"])
def read_stata(
    self, file_path: str, *args: Any, **kwargs: Any
) -> DataFrame:
    r"""Reads a Stata file and returns a DataFrame.

    Args:
        file_path (str): The path to the Stata file. Must end in
            ``.dta``, otherwise the :obj:`check_suffix` decorator raises
            :obj:`ValueError`.
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_stata`.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_stata`.

    Returns:
        DataFrame: The DataFrame object.
    """
    return pd.read_stata(file_path, *args, **kwargs)

@check_suffix([".sas", ".sas7bdat", ".xpt"])
def read_sas(
    self, file_path: str, *args: Any, **kwargs: Any
) -> DataFrame:
    r"""Reads a SAS file and returns a DataFrame.

    ``.sas7bdat`` and ``.xpt`` are accepted in addition to ``.sas``
    because those are the extensions from which :obj:`pandas.read_sas`
    infers the file format. For any other extension the ``format``
    keyword has to be passed explicitly.

    Args:
        file_path (str): The path to the SAS file. Must end in ``.sas``,
            ``.sas7bdat`` or ``.xpt``, otherwise the :obj:`check_suffix`
            decorator raises :obj:`ValueError`.
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_sas`.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_sas`.

    Returns:
        DataFrame: The DataFrame object.
    """
    return pd.read_sas(file_path, *args, **kwargs)

@check_suffix([".pkl"])
def read_pickle(
    self, file_path: str, *args: Any, **kwargs: Any
) -> DataFrame:
    r"""Reads a Pickle file and returns a DataFrame.

    Warning:
        Unpickling can execute arbitrary code. Only call this method on
        files that come from a trusted source.

    Args:
        file_path (str): The path to the Pickle file. Must end in
            ``.pkl``, otherwise the :obj:`check_suffix` decorator raises
            :obj:`ValueError`.
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_pickle`.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_pickle`.

    Returns:
        DataFrame: The DataFrame object.
    """
    # SECURITY: pickle is not safe for untrusted input; the caller is
    # responsible for vetting `file_path`.
    return pd.read_pickle(file_path, *args, **kwargs)

@check_suffix([".h5"])
def read_hdf(
    self, file_path: str, *args: Any, **kwargs: Any
) -> DataFrame:
    r"""Reads an HDF file and returns a DataFrame.

    Args:
        file_path (str): The path to the HDF file. Must end in ``.h5``,
            otherwise the :obj:`check_suffix` decorator raises
            :obj:`ValueError`.
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_hdf`.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_hdf`.

    Returns:
        DataFrame: The DataFrame object.
    """
    return pd.read_hdf(file_path, *args, **kwargs)

@check_suffix([".orc"])
def read_orc(
    self, file_path: str, *args: Any, **kwargs: Any
) -> DataFrame:
    r"""Reads an ORC file and returns a DataFrame.

    Args:
        file_path (str): The path to the ORC file. Must end in ``.orc``,
            otherwise the :obj:`check_suffix` decorator raises
            :obj:`ValueError`.
        *args (Any): Additional positional arguments forwarded to
            :obj:`pandas.read_orc`.
        **kwargs (Any): Additional keyword arguments forwarded to
            :obj:`pandas.read_orc`.

    Returns:
        DataFrame: The DataFrame object.
    """
    return pd.read_orc(file_path, *args, **kwargs)
Loading