Skip to content

Commit

Permalink
Merge pull request #2049 from PierreMesure/no_ascii
Browse files Browse the repository at this point in the history
Add force_ascii=False when generating JSONL
  • Loading branch information
henchaves authored Oct 25, 2024
2 parents 750db5c + 25b4e61 commit acad411
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 6 deletions.
16 changes: 11 additions & 5 deletions giskard/rag/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,18 +156,24 @@ def save(self, folder_path: str):

report_details = {"recommendation": self._recommendation}
with open(path / "report_details.json", "w", encoding="utf-8") as f:
json.dump(report_details, f)
json.dump(report_details, f, ensure_ascii=False)

self._knowledge_base._knowledge_base_df.to_json(path / "knowledge_base.jsonl", orient="records", lines=True)
self._knowledge_base._knowledge_base_df.to_json(
path / "knowledge_base.jsonl", orient="records", lines=True, force_ascii=False
)
with open(path / "knowledge_base_meta.json", "w", encoding="utf-8") as f:
json.dump(self._knowledge_base.get_savable_data(), f)
json.dump(self._knowledge_base.get_savable_data(), f, ensure_ascii=False)

with open(path / "agent_answer.json", "w", encoding="utf-8") as f:
json.dump([{"message": output.message, "documents": output.documents} for output in self._model_outputs], f)
json.dump(
[{"message": output.message, "documents": output.documents} for output in self._model_outputs],
f,
ensure_ascii=False,
)

if self._metrics_results is not None:
with open(path / "metrics_results.json", "w", encoding="utf-8") as f:
json.dump(self._metrics_results, f)
json.dump(self._metrics_results, f, ensure_ascii=False)

@classmethod
def load(
Expand Down
2 changes: 1 addition & 1 deletion giskard/rag/testset.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def save(self, path):
path : str
The path to the output JSONL file.
"""
self._dataframe.reset_index().to_json(path, orient="records", lines=True)
self._dataframe.reset_index().to_json(path, orient="records", lines=True, force_ascii=False)

@classmethod
def load(cls, path):
Expand Down
45 changes: 45 additions & 0 deletions tests/rag/test_qa_testset.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,37 @@ def make_testset_samples():
]


def make_swedish_testset_samples():
    """Build two Swedish-language question samples for serialization tests.

    Every text field (question, answer, context and some metadata values)
    contains non-ASCII characters such as "ö", "ä" and "å".

    Returns
    -------
    list
        Two ``QuestionSample`` objects, with ids "1" and "2".
    """
    camembert_sample = QuestionSample(
        id="1",
        question="Vilken mjölk används för att göra Camembert?",
        reference_answer="Komjölk används för att göra Camembert.",
        reference_context="Camembert är en fuktig, mjuk, krämig, ytmognad ost av komjölk.",
        conversation_history=[],
        metadata={
            "question_type": "enkel",
            "color": "blå",
            "topic": "Ost_1",
            "seed_document_id": "1",
        },
    )
    scamorza_sample = QuestionSample(
        id="2",
        question="Varifrån kommer Scamorza?",
        reference_answer="Scamorza kommer från södra Italien.",
        reference_context="Scamorza är en ost av komjölk från södra Italien.",
        conversation_history=[],
        metadata={
            "question_type": "enkel",
            "color": "röd",
            "topic": "Ost_1",
            "seed_document_id": "2",
        },
    )
    return [camembert_sample, scamorza_sample]


def test_qa_testset_creation():
question_samples = make_testset_samples()
testset = QATestset(question_samples)
Expand Down Expand Up @@ -146,6 +177,20 @@ def test_qa_testset_saving_loading(tmp_path):
)


def test_qa_testset_saving_loading_swedish(tmp_path):
    """Check that a Swedish testset is saved as readable UTF-8 and reloads intact.

    Parameters
    ----------
    tmp_path : pathlib.Path
        Temporary directory supplied by pytest.
    """
    testset = QATestset(make_swedish_testset_samples())
    path = tmp_path / "testset.jsonl"
    testset.save(path)

    # A JSON round trip decodes "\uXXXX" escapes transparently, so comparing
    # loaded values alone cannot tell whether the file was written with
    # force_ascii=False. Inspect the raw file contents as well.
    raw_content = path.read_text(encoding="utf-8")
    assert "mjölk" in raw_content
    assert "\\u00f6" not in raw_content

    loaded_testset = QATestset.load(path)

    # zip() stops at the shorter input, so check the lengths explicitly;
    # otherwise an empty loaded testset would pass the comparison below.
    assert len(loaded_testset._dataframe) == len(testset._dataframe)
    assert all(
        original == loaded
        for original, loaded in zip(testset._dataframe["metadata"], loaded_testset._dataframe["metadata"])
    )


def test_metadata_value_retrieval():
testset = QATestset(make_testset_samples())

Expand Down

0 comments on commit acad411

Please sign in to comment.