Commit
Co-authored-by: liuxukun2000 <xukunliu2025@u.northwestern.edu>
Co-authored-by: Xukun Liu <57486241+liuxukun2000@users.noreply.github.com>
Showing 8 changed files with 1,529 additions and 821 deletions.
@@ -0,0 +1,337 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import os
from functools import wraps
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union

import pandas as pd

if TYPE_CHECKING:
    from pandas import DataFrame
    from pandasai import SmartDataframe


def check_suffix(valid_suffixs: List[str]) -> Callable:
    r"""A decorator to check the file suffix of a given file path.
    Args:
        valid_suffixs (List[str]): The list of valid file suffixes.
    Returns:
        Callable: The decorator function.
    """

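    # The wrapper validates the file suffix before the wrapped reader runs.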
    def decorator(func: Callable):
        @wraps(func)
        def wrapper(
            self, file_path: str, *args: Any, **kwargs: Dict[str, Any]
        ) -> "DataFrame":
            suffix = Path(file_path).suffix
            if suffix not in valid_suffixs:
                raise ValueError(
                    f"Only {', '.join(valid_suffixs)} files are supported"
                )
            return func(self, file_path, *args, **kwargs)

        return wrapper

    return decorator


class PandaReader:
    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        r"""Initializes the PandaReader class.
        Args:
            config (Optional[Dict[str, Any]], optional): The configuration
                dictionary that can include LLM API settings for LLM-based
                processing. If not provided, it will use OpenAI with the API
                key from the OPENAI_API_KEY environment variable. You can
                customize the LLM configuration by providing an 'llm' key in
                the config dictionary. (default: :obj:`None`)
        """
        from pandasai.llm import OpenAI  # type: ignore[import-untyped]

        self.config = config or {}
        if "llm" not in self.config:
            self.config["llm"] = OpenAI(
                api_token=os.getenv("OPENAI_API_KEY"),
            )

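        # Map each supported file suffix to its reader method; load()
        # dispatches on the suffix of the given path.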
        self.__LOADER = {
            ".csv": self.read_csv,
            ".xlsx": self.read_excel,
            ".xls": self.read_excel,
            ".json": self.read_json,
            ".parquet": self.read_parquet,
            ".sql": self.read_sql,
            ".html": self.read_html,
            ".feather": self.read_feather,
            ".dta": self.read_stata,
            ".sas": self.read_sas,
            ".pkl": self.read_pickle,
            ".h5": self.read_hdf,
            ".orc": self.read_orc,
        }

    def load(
        self,
        data: Union["DataFrame", str],
        *args: Any,
        **kwargs: Dict[str, Any],
    ) -> "SmartDataframe":
        r"""Loads a file or DataFrame and returns a SmartDataframe object.
        Args:
            data (Union[DataFrame, str]): The data to load.
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            SmartDataframe: The SmartDataframe object.
        """
        from pandas import DataFrame
        from pandasai import SmartDataframe

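        # A DataFrame is wrapped directly; a string is treated as a local
        # file path or a URL.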
        if isinstance(data, DataFrame):
            return SmartDataframe(data, config=self.config)
        file_path = str(data)
        path = Path(file_path)
        if not file_path.startswith("http") and not path.exists():
            raise FileNotFoundError(f"File {file_path} not found")
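        # Dispatch to the reader registered for this suffix.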
        if path.suffix in self.__LOADER:
            return SmartDataframe(
                self.__LOADER[path.suffix](file_path, *args, **kwargs),  # type: ignore[operator]
                config=self.config,
            )
        else:
            raise ValueError(f"Unsupported file format: {path.suffix}")

    @check_suffix([".csv"])
    def read_csv(
        self, file_path: str, *args: Any, **kwargs: Dict[str, Any]
    ) -> "DataFrame":
        r"""Reads a CSV file and returns a DataFrame.
        Args:
            file_path (str): The path to the CSV file.
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_csv(file_path, *args, **kwargs)

    @check_suffix([".xlsx", ".xls"])
    def read_excel(
        self, file_path: str, *args: Any, **kwargs: Dict[str, Any]
    ) -> "DataFrame":
        r"""Reads an Excel file and returns a DataFrame.
        Args:
            file_path (str): The path to the Excel file.
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_excel(file_path, *args, **kwargs)

    @check_suffix([".json"])
    def read_json(
        self, file_path: str, *args: Any, **kwargs: Dict[str, Any]
    ) -> "DataFrame":
        r"""Reads a JSON file and returns a DataFrame.
        Args:
            file_path (str): The path to the JSON file.
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_json(file_path, *args, **kwargs)

    @check_suffix([".parquet"])
    def read_parquet(
        self, file_path: str, *args: Any, **kwargs: Dict[str, Any]
    ) -> "DataFrame":
        r"""Reads a Parquet file and returns a DataFrame.
        Args:
            file_path (str): The path to the Parquet file.
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_parquet(file_path, *args, **kwargs)

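    # Note: pd.read_sql takes a SQL query or table name plus a connection
    # rather than a file path, so no suffix check is applied here.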
    def read_sql(self, *args: Any, **kwargs: Dict[str, Any]) -> "DataFrame":
        r"""Reads a SQL query or database table and returns a DataFrame.
        Args:
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_sql(*args, **kwargs)

    def read_table(
        self, file_path: str, *args: Any, **kwargs: Dict[str, Any]
    ) -> "DataFrame":
        r"""Reads a general delimited text file and returns a DataFrame.
        Args:
            file_path (str): The path to the table file.
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_table(file_path, *args, **kwargs)

    def read_clipboard(
        self, *args: Any, **kwargs: Dict[str, Any]
    ) -> "DataFrame":
        r"""Reads text from the clipboard and returns a DataFrame.
        Args:
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_clipboard(*args, **kwargs)

    @check_suffix([".html"])
    def read_html(
        self, file_path: str, *args: Any, **kwargs: Dict[str, Any]
    ) -> "DataFrame":
        r"""Reads an HTML file and returns a DataFrame.
        Args:
            file_path (str): The path to the HTML file.
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_html(file_path, *args, **kwargs)

    @check_suffix([".feather"])
    def read_feather(
        self, file_path: str, *args: Any, **kwargs: Dict[str, Any]
    ) -> "DataFrame":
        r"""Reads a Feather file and returns a DataFrame.
        Args:
            file_path (str): The path to the Feather file.
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_feather(file_path, *args, **kwargs)

    @check_suffix([".dta"])
    def read_stata(
        self, file_path: str, *args: Any, **kwargs: Dict[str, Any]
    ) -> "DataFrame":
        r"""Reads a Stata file and returns a DataFrame.
        Args:
            file_path (str): The path to the Stata file.
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_stata(file_path, *args, **kwargs)

    @check_suffix([".sas"])
    def read_sas(
        self, file_path: str, *args: Any, **kwargs: Dict[str, Any]
    ) -> "DataFrame":
        r"""Reads a SAS file and returns a DataFrame.
        Args:
            file_path (str): The path to the SAS file.
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_sas(file_path, *args, **kwargs)

    @check_suffix([".pkl"])
    def read_pickle(
        self, file_path: str, *args: Any, **kwargs: Dict[str, Any]
    ) -> "DataFrame":
        r"""Reads a Pickle file and returns a DataFrame.
        Args:
            file_path (str): The path to the Pickle file.
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_pickle(file_path, *args, **kwargs)

    @check_suffix([".h5"])
    def read_hdf(
        self, file_path: str, *args: Any, **kwargs: Dict[str, Any]
    ) -> "DataFrame":
        r"""Reads an HDF file and returns a DataFrame.
        Args:
            file_path (str): The path to the HDF file.
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_hdf(file_path, *args, **kwargs)

    @check_suffix([".orc"])
    def read_orc(
        self, file_path: str, *args: Any, **kwargs: Dict[str, Any]
    ) -> "DataFrame":
        r"""Reads an ORC file and returns a DataFrame.
        Args:
            file_path (str): The path to the ORC file.
            *args (Any): Additional positional arguments.
            **kwargs (Dict[str, Any]): Additional keyword arguments.
        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_orc(file_path, *args, **kwargs)
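
For reference, a minimal usage sketch of the PandaReader added in this commit. It assumes pandasai is installed, OPENAI_API_KEY is set for the default OpenAI LLM, and that PandaReader is importable from camel.loaders; the sales.csv path and the query string are illustrative only, and SmartDataframe.chat is pandasai's natural-language query interface (pandasai 1.x).

import pandas as pd

from camel.loaders import PandaReader  # import path assumed from the CAMEL package layout

reader = PandaReader()  # falls back to OpenAI with OPENAI_API_KEY when no config is given

# An in-memory DataFrame is wrapped directly into a SmartDataframe.
df = reader.load(pd.DataFrame({"product": ["a", "b"], "sales": [10, 20]}))

# A path is dispatched to the matching pd.read_* wrapper by its suffix.
sdf = reader.load("sales.csv")  # hypothetical local file

# Natural-language querying via pandasai's SmartDataframe.chat().
print(sdf.chat("Which product has the highest sales?"))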