From e2f7cf7ba7dfdb4121c3407b2faaa18663fd46f7 Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Thu, 4 Jul 2024 16:53:50 +0800 Subject: [PATCH 1/6] API: start parsing --- api/apps/dataset_api.py | 57 ++++++++++++++++++++++++++-- sdk/python/ragflow/ragflow.py | 5 +++ sdk/python/test/test_document.py | 64 ++++++++++++++++++++++++++++++++ 3 files changed, 123 insertions(+), 3 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 3b290630bdd..5f2f60ba01b 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -18,26 +18,30 @@ import warnings from io import BytesIO +from elasticsearch_dsl import Q from flask import request, send_file from flask_login import login_required, current_user from httpx import HTTPError from minio import S3Error from api.contants import NAME_LENGTH_LIMIT -from api.db import FileType, ParserType, FileSource +from api.db import FileType, ParserType, FileSource, TaskStatus from api.db import StatusEnum -from api.db.db_models import File +from api.db.db_models import File, Task from api.db.services import duplicate_name from api.db.services.document_service import DocumentService from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.db.services.knowledgebase_service import KnowledgebaseService +from api.db.services.task_service import TaskService, queue_tasks from api.db.services.user_service import TenantService from api.settings import RetCode from api.utils import get_uuid from api.utils.api_utils import construct_json_result, construct_error_response from api.utils.api_utils import construct_result, validate_request from api.utils.file_utils import filename_type, thumbnail +from rag.nlp import search +from rag.utils.es_conn import ELASTICSEARCH from rag.utils.minio_conn import MINIO MAXIMUM_OF_UPLOADING_FILES = 256 @@ -341,7 +345,7 @@ def upload_documents(dataset_id): blob = file.read() # the content is empty, raising a warning if blob == b'': - warnings.warn(f"[WARNING]: The file {filename} is empty.") + warnings.warn(f"[WARNING]: The content of the file {filename} is empty.") MINIO.put(dataset_id, location, blob) @@ -592,7 +596,54 @@ def download_document(dataset_id, document_id): return construct_error_response(e) # ----------------------------start parsing----------------------------------------------------- +@manager.route("//documents//status", methods=["POST"]) +@login_required +@validate_request("status") +def run(dataset_id, document_id): + run_value = request.json["status"] + try: + exist, _ = KnowledgebaseService.get_by_id(dataset_id) + if not exist: + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"This dataset '{dataset_id}' cannot be found!") + # valid document? + exist, _ = DocumentService.get_by_id(document_id) + if not exist: + return construct_json_result(code=RetCode.DATA_ERROR, message=f"This '{document_id}' is not a valid document.") + + info = {"run": run_value, "progress": 0} # initial progress of 0 / reset the progress + + if run_value != TaskStatus.RUNNING.value: + return construct_json_result( + code=RetCode.ARGUMENT_ERROR, + message=f"The status value should be '1', not {run_value}." 
+ ) + info["progress_msg"] = "" + info["chunk_num"] = 0 + info["token_num"] = 0 + DocumentService.update_by_id(document_id, info) # update information regarding it + + # in case that it's null + tenant_id = DocumentService.get_tenant_id(document_id) + if not tenant_id: + return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) + + # delete it from es + ELASTICSEARCH.deleteByQuery(Q("match", doc_id=document_id), idxnm=search.index_name(tenant_id)) + + # delete the tasks from the cache + TaskService.filter_delete([Task.doc_id == document_id]) + _, doc = DocumentService.get_by_id(document_id) + doc = doc.to_dict() + # renew + doc["tenant_id"] = tenant_id + bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"]) # address + queue_tasks(doc, bucket, name) # as queue + + return construct_json_result(data=True, code=RetCode.SUCCESS) + except Exception as e: + return construct_error_response(e) # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py index 6275f921c3c..f8af491596b 100644 --- a/sdk/python/ragflow/ragflow.py +++ b/sdk/python/ragflow/ragflow.py @@ -142,8 +142,13 @@ def download_file(self, dataset_id, document_id): with open(file_path, "wb") as file: file.write(content) return {"code": RetCode.SUCCESS, "data": content} + # ----------------------------start parsing----------------------------------------------------- + def start_parsing(self, dataset_id, document_id): + endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}/status" + res = requests.post(endpoint, headers=self.authorization_header, json={"status": "1"}) + return res.json() # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index f7f87a148b8..9698ca0a3ef 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -695,6 +695,70 @@ def test_download_an_empty_document(self): assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This file is empty." # ----------------------------start parsing----------------------------------------------------- + def test_start_parsing_document_with_success(self): + """ + Test the parsing of a document with success. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_start_parsing_document_with_success") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"][0] + doc_id = data["id"] + # parse file + res = ragflow.start_parsing(created_res_id, doc_id) + assert res["code"] == RetCode.SUCCESS and res["data"] is True + + def test_start_parsing_nonexistent_document(self): + """ + Test the parsing a document which does not exist. 
+ """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_start_parsing_nonexistent_document") + created_res_id = created_res["data"]["dataset_id"] + res = ragflow.start_parsing(created_res_id, "imagination") + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This 'imagination' is not a valid document." + + def test_start_parsing_document_in_nonexistent_dataset(self): + """ + Test the parsing a document whose dataset is nonexistent. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_download_nonexistent_document") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"][0] + doc_id = data["id"] + # parse + res = ragflow.start_parsing("imagination", doc_id) + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!" + + def test_start_parsing_an_empty_document(self): + """ + Test the parsing of an empty document. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_download_nonexistent_document") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/empty.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"][0] + doc_id = data["id"] + + res = ragflow.start_parsing(created_res_id, doc_id) + assert res["code"] == RetCode.SUCCESS and res["data"] is True # ----------------------------stop parsing----------------------------------------------------- From 65a9573f2b8ff9d063c7f7e6ade719f9a8e566cc Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Mon, 8 Jul 2024 20:25:05 +0800 Subject: [PATCH 2/6] fixed bugs --- api/apps/dataset_api.py | 132 ++++++++++++++++++++++++++----- sdk/python/ragflow/ragflow.py | 8 +- sdk/python/test/test_document.py | 20 +++-- 3 files changed, 135 insertions(+), 25 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 5f2f60ba01b..15b24eb46e8 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -22,7 +22,6 @@ from flask import request, send_file from flask_login import login_required, current_user from httpx import HTTPError -from minio import S3Error from api.contants import NAME_LENGTH_LIMIT from api.db import FileType, ParserType, FileSource, TaskStatus @@ -33,19 +32,21 @@ from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.db.services.knowledgebase_service import KnowledgebaseService -from api.db.services.task_service import TaskService, queue_tasks +from api.db.services.task_service import TaskService from api.db.services.user_service import TenantService from api.settings import RetCode from api.utils import get_uuid from api.utils.api_utils import construct_json_result, construct_error_response from api.utils.api_utils import construct_result, validate_request from api.utils.file_utils import filename_type, thumbnail +from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture from rag.nlp import search from rag.utils.es_conn import ELASTICSEARCH from rag.utils.minio_conn import MINIO MAXIMUM_OF_UPLOADING_FILES = 256 + # ------------------------------ create a dataset 
--------------------------------------- @manager.route("/", methods=["POST"]) @@ -120,6 +121,7 @@ def create_dataset(): except Exception as e: return construct_error_response(e) + # -----------------------------list datasets------------------------------------------------------- @manager.route("/", methods=["GET"]) @@ -139,6 +141,7 @@ def list_datasets(): except HTTPError as http_err: return construct_json_result(http_err) + # ---------------------------------delete a dataset ---------------------------- @manager.route("/", methods=["DELETE"]) @@ -166,13 +169,15 @@ def remove_dataset(dataset_id): # delete the dataset if not KnowledgebaseService.delete_by_id(dataset_id): - return construct_json_result(code=RetCode.DATA_ERROR, message="There was an error during the dataset removal process. " - "Please check the status of the RAGFlow server and try the removal again.") + return construct_json_result(code=RetCode.DATA_ERROR, + message="There was an error during the dataset removal process. " + "Please check the status of the RAGFlow server and try the removal again.") # success return construct_json_result(code=RetCode.SUCCESS, message=f"Remove dataset: {dataset_id} successfully") except Exception as e: return construct_error_response(e) + # ------------------------------ get details of a dataset ---------------------------------------- @manager.route("/", methods=["GET"]) @@ -186,6 +191,7 @@ def get_dataset(dataset_id): except Exception as e: return construct_json_result(e) + # ------------------------------ update a dataset -------------------------------------------- @manager.route("/", methods=["PUT"]) @@ -213,8 +219,9 @@ def update_dataset(dataset_id): if name.lower() != dataset.name.lower() \ and len(KnowledgebaseService.query(name=name, tenant_id=current_user.id, status=StatusEnum.VALID.value)) > 1: - return construct_json_result(code=RetCode.DATA_ERROR, message=f"The name: {name.lower()} is already used by other " - f"datasets. Please choose a different name.") + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"The name: {name.lower()} is already used by other " + f"datasets. 
Please choose a different name.") dataset_updating_data = {} chunk_num = req.get("chunk_num") @@ -269,12 +276,14 @@ def update_dataset(dataset_id): except Exception as e: return construct_error_response(e) + # --------------------------------content management ---------------------------------------------- # ----------------------------upload files----------------------------------------------------- @manager.route("//documents/", methods=["POST"]) @login_required def upload_documents(dataset_id): + print("WWWWWWWWWWWWWWWWWWWWWWWWwwwww", flush=True) # no files if not request.files: return construct_json_result( @@ -343,6 +352,7 @@ def upload_documents(dataset_id): location += "_" blob = file.read() + # the content is empty, raising a warning if blob == b'': warnings.warn(f"[WARNING]: The content of the file {filename} is empty.") @@ -457,6 +467,7 @@ def list_documents(dataset_id): except Exception as e: return construct_error_response(e) + # ----------------------------update: enable rename----------------------------------------------------- @manager.route("//documents/", methods=["PUT"]) @login_required @@ -559,6 +570,7 @@ def update_document(dataset_id, document_id): def is_illegal_value_for_enum(value, enum_class): return value not in enum_class.__members__.values() + # ----------------------------download a file----------------------------------------------------- @manager.route("//documents/", methods=["GET"]) @login_required @@ -567,7 +579,8 @@ def download_document(dataset_id, document_id): # Check whether there is this dataset exist, _ = KnowledgebaseService.get_by_id(dataset_id) if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset '{dataset_id}' cannot be found!") + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"This dataset '{dataset_id}' cannot be found!") # Check whether there is this document exist, document = DocumentService.get_by_id(document_id) @@ -595,13 +608,46 @@ def download_document(dataset_id, document_id): except Exception as e: return construct_error_response(e) -# ----------------------------start parsing----------------------------------------------------- +# ----------------------------start parsing a document----------------------------------------------------- +def dummy(prog=None, msg=""): + pass + +def doc_chunk(binary, name, doc, tenant_id): + parser_id = doc["parser_id"] + if parser_id == "book": + book.chunk(name, binary=binary, callback=dummy) + elif parser_id == "laws": + laws.chunk(name, binary=binary, callback=dummy) + elif parser_id == "manual": + manual.chunk(name, binary=binary, callback=dummy) + elif parser_id == "naive": + naive.chunk(name, binary=binary, callback=dummy) + elif parser_id == "one": + one.chunk(name, binary=binary, callback=dummy) + elif parser_id == "paper": + paper.chunk(name, binary=binary, callback=dummy) + elif parser_id == "picture": + picture.chunk(name, binary=binary, tenant_id=tenant_id, lang="Chinese", callback=dummy) + elif parser_id == "presentation": + presentation.chunk(name, binary=binary, callback=dummy) + elif parser_id == "qa": + qa.chunk(name, binary=binary, callback=dummy) + elif parser_id == "resume": + resume.chunk(name, binary=binary, callback=dummy) + elif parser_id == "table": + table.chunk(name, binary=binary, callback=dummy) + else: + return False + + return True + @manager.route("//documents//status", methods=["POST"]) @login_required @validate_request("status") -def run(dataset_id, document_id): +def parse_document(dataset_id, document_id): 
run_value = request.json["status"] try: + # valid dataset exist, _ = KnowledgebaseService.get_by_id(dataset_id) if not exist: return construct_json_result(code=RetCode.DATA_ERROR, @@ -611,6 +657,11 @@ def run(dataset_id, document_id): if not exist: return construct_json_result(code=RetCode.DATA_ERROR, message=f"This '{document_id}' is not a valid document.") + # in case that it's null + tenant_id = DocumentService.get_tenant_id(document_id) + if not tenant_id: + return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) + info = {"run": run_value, "progress": 0} # initial progress of 0 / reset the progress if run_value != TaskStatus.RUNNING.value: @@ -624,11 +675,6 @@ def run(dataset_id, document_id): DocumentService.update_by_id(document_id, info) # update information regarding it - # in case that it's null - tenant_id = DocumentService.get_tenant_id(document_id) - if not tenant_id: - return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) - # delete it from es ELASTICSEARCH.deleteByQuery(Q("match", doc_id=document_id), idxnm=search.index_name(tenant_id)) @@ -636,14 +682,65 @@ def run(dataset_id, document_id): TaskService.filter_delete([Task.doc_id == document_id]) _, doc = DocumentService.get_by_id(document_id) doc = doc.to_dict() + # renew doc["tenant_id"] = tenant_id - bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"]) # address - queue_tasks(doc, bucket, name) # as queue + bucket, name = File2DocumentService.get_minio_address(doc_id=document_id) # address + binary = MINIO.get(bucket, name) + if binary: + if doc_chunk(binary, name, doc, tenant_id) is True: + return construct_json_result(data=True, code=RetCode.SUCCESS) + return construct_json_result(code=RetCode.DATA_ERROR, message="wrong parser id") + return construct_json_result(code=RetCode.DATA_ERROR, message=f"Empty data in the document: {document_id}") + except Exception as e: + return construct_error_response(e) + +# ----------------------------start parsing documents----------------------------------------------------- +@manager.route("//documents/status", methods=["POST"]) +@login_required +@validate_request("status") +def parse_documents(dataset_id): + run_value = request.json["status"] + try: + exist, _ = KnowledgebaseService.get_by_id(dataset_id) + if not exist: + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"This dataset '{dataset_id}' cannot be found!") + # documents inside the dataset + docs, total = DocumentService.list_documents_in_dataset(dataset_id, 0, -1, "create_time", + True, "") + # for loop + for doc in docs: + id = doc["id"] + + tenant_id = DocumentService.get_tenant_id(id) + if not tenant_id: + return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) + + info = {"run": run_value, "progress": 0} + if run_value == TaskStatus.RUNNING.value: + info["progress_msg"] = "" + info["chunk_num"] = 0 + info["token_num"] = 0 + DocumentService.update_by_id(id, info) + + ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) + + TaskService.filter_delete([Task.doc_id == id]) + _, doc = DocumentService.get_by_id(id) + doc = doc.to_dict() + + doc["tenant_id"] = tenant_id + bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"]) + binary = MINIO.get(bucket, name) + if binary: + if doc_chunk(binary, name, doc, tenant_id) is False: + return construct_json_result(code=RetCode.DATA_ERROR, message="wrong parser id") return 
construct_json_result(data=True, code=RetCode.SUCCESS) except Exception as e: return construct_error_response(e) + # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- @@ -661,6 +758,3 @@ def run(dataset_id, document_id): # ----------------------------get a specific chunk----------------------------------------------------- # ----------------------------retrieval test----------------------------------------------------- - - - diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py index f8af491596b..2e03750ee89 100644 --- a/sdk/python/ragflow/ragflow.py +++ b/sdk/python/ragflow/ragflow.py @@ -144,10 +144,16 @@ def download_file(self, dataset_id, document_id): return {"code": RetCode.SUCCESS, "data": content} # ----------------------------start parsing----------------------------------------------------- - def start_parsing(self, dataset_id, document_id): + def start_parsing_document(self, dataset_id, document_id): endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}/status" res = requests.post(endpoint, headers=self.authorization_header, json={"status": "1"}) + return res.json() + + def start_parsing_documents(self, dataset_id): + endpoint = f"{self.dataset_url}/{dataset_id}/documents/status" + res = requests.post(endpoint, headers=self.authorization_header, json={"status": "1"}) + return res.json() # ----------------------------stop parsing----------------------------------------------------- diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index 9698ca0a3ef..73170fba488 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -704,13 +704,13 @@ def test_start_parsing_document_with_success(self): created_res = ragflow.create_dataset("test_start_parsing_document_with_success") created_res_id = created_res["data"]["dataset_id"] # upload files - file_paths = ["test_data/test.txt"] + file_paths = ["test_data/lol.txt"] uploading_res = ragflow.upload_local_file(created_res_id, file_paths) # get the doc_id data = uploading_res["data"][0] doc_id = data["id"] # parse file - res = ragflow.start_parsing(created_res_id, doc_id) + res = ragflow.start_parsing_document(created_res_id, doc_id) assert res["code"] == RetCode.SUCCESS and res["data"] is True def test_start_parsing_nonexistent_document(self): @@ -721,7 +721,7 @@ def test_start_parsing_nonexistent_document(self): ragflow = RAGFlow(API_KEY, HOST_ADDRESS) created_res = ragflow.create_dataset("test_start_parsing_nonexistent_document") created_res_id = created_res["data"]["dataset_id"] - res = ragflow.start_parsing(created_res_id, "imagination") + res = ragflow.start_parsing_document(created_res_id, "imagination") assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This 'imagination' is not a valid document." def test_start_parsing_document_in_nonexistent_dataset(self): @@ -739,7 +739,7 @@ def test_start_parsing_document_in_nonexistent_dataset(self): data = uploading_res["data"][0] doc_id = data["id"] # parse - res = ragflow.start_parsing("imagination", doc_id) + res = ragflow.start_parsing_document("imagination", doc_id) assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!" 
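Every parsing test in this series repeats the same set-up: build a RAGFlow client, create a dataset, upload a file, then read the doc_id back out of the upload response. A small helper along the lines below could factor that out. This is only a sketch against the SDK calls the tests already use (RAGFlow, create_dataset, upload_local_file) and the module-level API_KEY / HOST_ADDRESS constants in test_document.py; the helper name is hypothetical and it is not part of the patch.

def create_dataset_with_files(dataset_name, file_paths):
    # Shared set-up used by the parsing tests: create a client and a dataset,
    # upload the given files, and return the ids the assertions need.
    ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
    created_res = ragflow.create_dataset(dataset_name)
    dataset_id = created_res["data"]["dataset_id"]
    uploading_res = ragflow.upload_local_file(dataset_id, file_paths)
    doc_ids = [doc["id"] for doc in uploading_res["data"]]
    return ragflow, dataset_id, doc_ids

With such a helper, a test like test_start_parsing_document_with_success reduces to one call followed by the start_parsing_document assertion.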
def test_start_parsing_an_empty_document(self): @@ -756,8 +756,18 @@ def test_start_parsing_an_empty_document(self): # get the doc_id data = uploading_res["data"][0] doc_id = data["id"] + res = ragflow.start_parsing_document(created_res_id, doc_id) + assert res["code"] == RetCode.DATA_ERROR and res["message"] == f"Empty data in the document: {doc_id}" - res = ragflow.start_parsing(created_res_id, doc_id) + def test_start_parsing_multiple_documents(self): + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + res = ragflow.start_parsing_documents(created_res_id) assert res["code"] == RetCode.SUCCESS and res["data"] is True # ----------------------------stop parsing----------------------------------------------------- From c5f1105257889bb575d4003677920d783af81b8b Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Tue, 9 Jul 2024 10:41:20 +0800 Subject: [PATCH 3/6] added tests --- api/apps/dataset_api.py | 41 +++++++++++++++++--------------- sdk/python/test/test_document.py | 15 ++++++++++-- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 15b24eb46e8..ae2d65cf519 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -283,7 +283,6 @@ def update_dataset(dataset_id): @manager.route("//documents/", methods=["POST"]) @login_required def upload_documents(dataset_id): - print("WWWWWWWWWWWWWWWWWWWWWWWWwwwww", flush=True) # no files if not request.files: return construct_json_result( @@ -609,11 +608,12 @@ def download_document(dataset_id, document_id): return construct_error_response(e) # ----------------------------start parsing a document----------------------------------------------------- +# helper method for parsing def dummy(prog=None, msg=""): pass -def doc_chunk(binary, name, doc, tenant_id): - parser_id = doc["parser_id"] + +def doc_chunk(binary, name, parser_id, tenant_id): if parser_id == "book": book.chunk(name, binary=binary, callback=dummy) elif parser_id == "laws": @@ -665,33 +665,33 @@ def parse_document(dataset_id, document_id): info = {"run": run_value, "progress": 0} # initial progress of 0 / reset the progress if run_value != TaskStatus.RUNNING.value: - return construct_json_result( - code=RetCode.ARGUMENT_ERROR, - message=f"The status value should be '1', not {run_value}." 
- ) + return construct_json_result(code=RetCode.ARGUMENT_ERROR, + message=f"The status value should be '1', not {run_value}.") + info["progress_msg"] = "" info["chunk_num"] = 0 info["token_num"] = 0 - DocumentService.update_by_id(document_id, info) # update information regarding it + DocumentService.update_by_id(document_id, info) # update information # delete it from es ELASTICSEARCH.deleteByQuery(Q("match", doc_id=document_id), idxnm=search.index_name(tenant_id)) # delete the tasks from the cache - TaskService.filter_delete([Task.doc_id == document_id]) - _, doc = DocumentService.get_by_id(document_id) + _, doc = DocumentService.get_by_id(document_id) # get doc object doc = doc.to_dict() # renew - doc["tenant_id"] = tenant_id bucket, name = File2DocumentService.get_minio_address(doc_id=document_id) # address - binary = MINIO.get(bucket, name) + binary = MINIO.get(bucket, name) # content + parser_id = doc["parser_id"] if binary: - if doc_chunk(binary, name, doc, tenant_id) is True: + if doc_chunk(binary, name, parser_id, tenant_id) is True: return construct_json_result(data=True, code=RetCode.SUCCESS) - return construct_json_result(code=RetCode.DATA_ERROR, message="wrong parser id") - return construct_json_result(code=RetCode.DATA_ERROR, message=f"Empty data in the document: {document_id}") + + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"Parser id: {parser_id} is not supported") + return construct_json_result(code=RetCode.DATA_ERROR, message=f"Empty data in the document: {name}") except Exception as e: return construct_error_response(e) @@ -722,20 +722,23 @@ def parse_documents(dataset_id): info["progress_msg"] = "" info["chunk_num"] = 0 info["token_num"] = 0 + DocumentService.update_by_id(id, info) ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) - TaskService.filter_delete([Task.doc_id == id]) _, doc = DocumentService.get_by_id(id) doc = doc.to_dict() - doc["tenant_id"] = tenant_id bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"]) binary = MINIO.get(bucket, name) + parser_id = doc["parser_id"] if binary: - if doc_chunk(binary, name, doc, tenant_id) is False: - return construct_json_result(code=RetCode.DATA_ERROR, message="wrong parser id") + if doc_chunk(binary, name, parser_id, tenant_id) is False: + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"Parser id: {parser_id} is not supported") + else: + return construct_json_result(code=RetCode.DATA_ERROR, message=f"Empty data in the document: {name}") return construct_json_result(data=True, code=RetCode.SUCCESS) except Exception as e: diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index 73170fba488..e0e2d227dbe 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -757,7 +757,7 @@ def test_start_parsing_an_empty_document(self): data = uploading_res["data"][0] doc_id = data["id"] res = ragflow.start_parsing_document(created_res_id, doc_id) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == f"Empty data in the document: {doc_id}" + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Empty data in the document: empty.txt" def test_start_parsing_multiple_documents(self): # create a dataset @@ -766,10 +766,21 @@ def test_start_parsing_multiple_documents(self): created_res_id = created_res["data"]["dataset_id"] # upload files file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) 
+ ragflow.upload_local_file(created_res_id, file_paths) res = ragflow.start_parsing_documents(created_res_id) assert res["code"] == RetCode.SUCCESS and res["data"] is True + def test_start_parsing_multiple_documents_with_one_empty_file(self): + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt", "test_data/empty.txt"] + ragflow.upload_local_file(created_res_id, file_paths) + res = ragflow.start_parsing_documents(created_res_id) + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Empty data in the document: empty.txt" + # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- From fb38a9c4e265f744bb457b535cb901411575a289 Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Thu, 11 Jul 2024 11:10:58 +0800 Subject: [PATCH 4/6] json parameters changed --- api/apps/dataset_api.py | 150 +++++++++++++++++-------------- sdk/python/ragflow/ragflow.py | 7 +- sdk/python/test/test_document.py | 38 +++++++- 3 files changed, 122 insertions(+), 73 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index ae2d65cf519..581d2e31f31 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -607,45 +607,46 @@ def download_document(dataset_id, document_id): except Exception as e: return construct_error_response(e) + # ----------------------------start parsing a document----------------------------------------------------- # helper method for parsing def dummy(prog=None, msg=""): pass -def doc_chunk(binary, name, parser_id, tenant_id): - if parser_id == "book": - book.chunk(name, binary=binary, callback=dummy) - elif parser_id == "laws": - laws.chunk(name, binary=binary, callback=dummy) - elif parser_id == "manual": - manual.chunk(name, binary=binary, callback=dummy) - elif parser_id == "naive": - naive.chunk(name, binary=binary, callback=dummy) - elif parser_id == "one": - one.chunk(name, binary=binary, callback=dummy) - elif parser_id == "paper": - paper.chunk(name, binary=binary, callback=dummy) - elif parser_id == "picture": - picture.chunk(name, binary=binary, tenant_id=tenant_id, lang="Chinese", callback=dummy) - elif parser_id == "presentation": - presentation.chunk(name, binary=binary, callback=dummy) - elif parser_id == "qa": - qa.chunk(name, binary=binary, callback=dummy) - elif parser_id == "resume": - resume.chunk(name, binary=binary, callback=dummy) - elif parser_id == "table": - table.chunk(name, binary=binary, callback=dummy) - else: - return False +def doc_parse(binary, name, parser_id, tenant_id): + match parser_id: + case "book": + book.chunk(name, binary=binary, callback=dummy) + case "laws": + laws.chunk(name, binary=binary, callback=dummy) + case "manual": + manual.chunk(name, binary=binary, callback=dummy) + case "naive": + naive.chunk(name, binary=binary, callback=dummy) + case "one": + one.chunk(name, binary=binary, callback=dummy) + case "paper": + paper.chunk(name, binary=binary, callback=dummy) + case "picture": + picture.chunk(name, binary=binary, tenant_id=tenant_id, lang="Chinese", callback=dummy) + case "presentation": + presentation.chunk(name, binary=binary, callback=dummy) + case "qa": + qa.chunk(name, binary=binary, callback=dummy) + case "resume": + 
resume.chunk(name, binary=binary, callback=dummy) + case "table": + table.chunk(name, binary=binary, callback=dummy) + case _: + return False return True + @manager.route("//documents//status", methods=["POST"]) @login_required -@validate_request("status") def parse_document(dataset_id, document_id): - run_value = request.json["status"] try: # valid dataset exist, _ = KnowledgebaseService.get_by_id(dataset_id) @@ -655,18 +656,15 @@ def parse_document(dataset_id, document_id): # valid document? exist, _ = DocumentService.get_by_id(document_id) if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, message=f"This '{document_id}' is not a valid document.") + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"This '{document_id}' is not a valid document.") # in case that it's null tenant_id = DocumentService.get_tenant_id(document_id) if not tenant_id: return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) - info = {"run": run_value, "progress": 0} # initial progress of 0 / reset the progress - - if run_value != TaskStatus.RUNNING.value: - return construct_json_result(code=RetCode.ARGUMENT_ERROR, - message=f"The status value should be '1', not {run_value}.") + info = {"run": "1", "progress": 0} # initial progress of 0 / reset the progress info["progress_msg"] = "" info["chunk_num"] = 0 @@ -686,64 +684,82 @@ def parse_document(dataset_id, document_id): binary = MINIO.get(bucket, name) # content parser_id = doc["parser_id"] if binary: - if doc_chunk(binary, name, parser_id, tenant_id) is True: + if doc_parse(binary, name, parser_id, tenant_id) is True: return construct_json_result(data=True, code=RetCode.SUCCESS) return construct_json_result(code=RetCode.DATA_ERROR, message=f"Parser id: {parser_id} is not supported") - return construct_json_result(code=RetCode.DATA_ERROR, message=f"Empty data in the document: {name}") + return construct_json_result(code=RetCode.SUCCESS, message=f"Empty data in the document: {name}") except Exception as e: return construct_error_response(e) + # ----------------------------start parsing documents----------------------------------------------------- @manager.route("//documents/status", methods=["POST"]) @login_required -@validate_request("status") def parse_documents(dataset_id): - run_value = request.json["status"] + doc_ids = request.json["doc_ids"] try: exist, _ = KnowledgebaseService.get_by_id(dataset_id) if not exist: return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset '{dataset_id}' cannot be found!") - # documents inside the dataset - docs, total = DocumentService.list_documents_in_dataset(dataset_id, 0, -1, "create_time", - True, "") - # for loop - for doc in docs: - id = doc["id"] - - tenant_id = DocumentService.get_tenant_id(id) - if not tenant_id: - return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) - - info = {"run": run_value, "progress": 0} - if run_value == TaskStatus.RUNNING.value: + + def process(doc_ids): + message = "" + # for loop + for id in doc_ids: + # Check whether there is this document + exist, document = DocumentService.get_by_id(id) + if not exist: + return construct_json_result(message=f"This document '{id}' cannot be found!", + code=RetCode.ARGUMENT_ERROR) + + tenant_id = DocumentService.get_tenant_id(id) + if not tenant_id: + return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) + + info = {"run": "1", "progress": 0} info["progress_msg"] = "" info["chunk_num"] = 0 
info["token_num"] = 0 - DocumentService.update_by_id(id, info) + DocumentService.update_by_id(id, info) + + ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) + + _, doc = DocumentService.get_by_id(id) + doc = doc.to_dict() + doc_id = doc["id"] + + bucket, name = File2DocumentService.get_minio_address(doc_id=doc_id) + binary = MINIO.get(bucket, name) + parser_id = doc["parser_id"] + if binary: + res = doc_parse(binary, name, parser_id, tenant_id) + if res is False: + message += f"The parser id: {parser_id} of the document {doc_id} is not supported; " + else: + message += f"Empty data in the document: {name}; " + # failed in parsing + if doc["status"] == TaskStatus.FAIL.value: + message += f"Failed in parsing the document: {doc_id}; " + return construct_json_result(data=True, code=RetCode.SUCCESS, message=message) + + # two conditions + if doc_ids: + return process(doc_ids) + else: + # documents inside the dataset + docs, total = DocumentService.list_documents_in_dataset(dataset_id, 0, -1, "create_time", + True, "") + doc_ids = [doc["id"] for doc in docs] + return process(doc_ids) - ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) - - _, doc = DocumentService.get_by_id(id) - doc = doc.to_dict() - - bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"]) - binary = MINIO.get(bucket, name) - parser_id = doc["parser_id"] - if binary: - if doc_chunk(binary, name, parser_id, tenant_id) is False: - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"Parser id: {parser_id} is not supported") - else: - return construct_json_result(code=RetCode.DATA_ERROR, message=f"Empty data in the document: {name}") - - return construct_json_result(data=True, code=RetCode.SUCCESS) except Exception as e: return construct_error_response(e) + # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py index 2e03750ee89..9eccc4cb3d4 100644 --- a/sdk/python/ragflow/ragflow.py +++ b/sdk/python/ragflow/ragflow.py @@ -146,15 +146,16 @@ def download_file(self, dataset_id, document_id): # ----------------------------start parsing----------------------------------------------------- def start_parsing_document(self, dataset_id, document_id): endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}/status" - res = requests.post(endpoint, headers=self.authorization_header, json={"status": "1"}) + res = requests.post(endpoint, headers=self.authorization_header) return res.json() - def start_parsing_documents(self, dataset_id): + def start_parsing_documents(self, dataset_id, doc_ids=None): endpoint = f"{self.dataset_url}/{dataset_id}/documents/status" - res = requests.post(endpoint, headers=self.authorization_header, json={"status": "1"}) + res = requests.post(endpoint, headers=self.authorization_header, json={"doc_ids": doc_ids}) return res.json() + # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index e0e2d227dbe..922afb6acdb 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -757,7 +757,23 @@ def 
test_start_parsing_an_empty_document(self): data = uploading_res["data"][0] doc_id = data["id"] res = ragflow.start_parsing_document(created_res_id, doc_id) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Empty data in the document: empty.txt" + assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt" + + # ------------------------parsing multiple documents---------------------------- + def test_start_parsing_documents_in_nonexistent_dataset(self): + """ + Test the parsing documents whose dataset is nonexistent. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_download_nonexistent_document") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # parse + res = ragflow.start_parsing_documents("imagination") + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!" def test_start_parsing_multiple_documents(self): # create a dataset @@ -768,7 +784,7 @@ def test_start_parsing_multiple_documents(self): file_paths = ["test_data/test.txt", "test_data/test1.txt"] ragflow.upload_local_file(created_res_id, file_paths) res = ragflow.start_parsing_documents(created_res_id) - assert res["code"] == RetCode.SUCCESS and res["data"] is True + assert res["code"] == RetCode.SUCCESS and res["data"] is True and res["message"] == "" def test_start_parsing_multiple_documents_with_one_empty_file(self): # create a dataset @@ -779,7 +795,23 @@ def test_start_parsing_multiple_documents_with_one_empty_file(self): file_paths = ["test_data/test.txt", "test_data/test1.txt", "test_data/empty.txt"] ragflow.upload_local_file(created_res_id, file_paths) res = ragflow.start_parsing_documents(created_res_id) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Empty data in the document: empty.txt" + assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt; " + + def test_start_parsing_multiple_specific_documents(self): + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"] + doc_ids = [] + for d in data: + doc_ids.append(d["id"]) + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" # ----------------------------stop parsing----------------------------------------------------- From 77ad427221aebf9eb1c2491a0e026d75a2d7a8c6 Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Thu, 11 Jul 2024 15:12:17 +0800 Subject: [PATCH 5/6] avoid duplicate code and added more tests for reparsing --- api/apps/dataset_api.py | 137 +++++++++++++--------------- sdk/python/test/test_data/lol.txt | 3 + sdk/python/test/test_document.py | 143 +++++++++++++++++++++++++++++- 3 files changed, 205 insertions(+), 78 deletions(-) create mode 100644 sdk/python/test/test_data/lol.txt diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 581d2e31f31..87ba1766d9a 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -233,17 +233,21 @@ def 
update_dataset(dataset_id): if chunk_num == 0: dataset_updating_data["embd_id"] = req["embedding_model_id"] else: - construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document in this " + return construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document in this " "dataset, so you cannot change the embedding " "model.") # only if chunk_num is 0, the user can update the chunk_method - if req.get("chunk_method"): - if chunk_num == 0: - dataset_updating_data['parser_id'] = req["chunk_method"] - else: + if "chunk_method" in req: + type_value = req["chunk_method"] + if is_illegal_value_for_enum(type_value, ParserType): + return construct_json_result(message=f"Illegal value {type_value} for 'chunk_method' field.", + code=RetCode.DATA_ERROR) + if chunk_num != 0: construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document " "in this dataset, so you cannot " "change the chunk method.") + dataset_updating_data["parser_id"] = req["template_type"] + # convert the photo parameter to avatar if req.get("photo"): dataset_updating_data["avatar"] = req["photo"] @@ -623,6 +627,7 @@ def doc_parse(binary, name, parser_id, tenant_id): case "manual": manual.chunk(name, binary=binary, callback=dummy) case "naive": + # It's the mode by default, which is general in the front-end naive.chunk(name, binary=binary, callback=dummy) case "one": one.chunk(name, binary=binary, callback=dummy) @@ -653,43 +658,14 @@ def parse_document(dataset_id, document_id): if not exist: return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset '{dataset_id}' cannot be found!") - # valid document? - exist, _ = DocumentService.get_by_id(document_id) - if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"This '{document_id}' is not a valid document.") - - # in case that it's null - tenant_id = DocumentService.get_tenant_id(document_id) - if not tenant_id: - return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) - - info = {"run": "1", "progress": 0} # initial progress of 0 / reset the progress - - info["progress_msg"] = "" - info["chunk_num"] = 0 - info["token_num"] = 0 - - DocumentService.update_by_id(document_id, info) # update information - - # delete it from es - ELASTICSEARCH.deleteByQuery(Q("match", doc_id=document_id), idxnm=search.index_name(tenant_id)) - - # delete the tasks from the cache - _, doc = DocumentService.get_by_id(document_id) # get doc object - doc = doc.to_dict() - - # renew - bucket, name = File2DocumentService.get_minio_address(doc_id=document_id) # address - binary = MINIO.get(bucket, name) # content - parser_id = doc["parser_id"] - if binary: - if doc_parse(binary, name, parser_id, tenant_id) is True: - return construct_json_result(data=True, code=RetCode.SUCCESS) + message = "" + res = get_message_during_parsing_document(document_id, message) + if isinstance(res, str): + message += res + return construct_json_result(code=RetCode.SUCCESS, message=message) + else: + return res - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"Parser id: {parser_id} is not supported") - return construct_json_result(code=RetCode.SUCCESS, message=f"Empty data in the document: {name}") except Exception as e: return construct_error_response(e) @@ -709,41 +685,11 @@ def process(doc_ids): message = "" # for loop for id in doc_ids: - # Check whether there is this document - exist, document = DocumentService.get_by_id(id) - if not exist: - 
return construct_json_result(message=f"This document '{id}' cannot be found!", - code=RetCode.ARGUMENT_ERROR) - - tenant_id = DocumentService.get_tenant_id(id) - if not tenant_id: - return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) - - info = {"run": "1", "progress": 0} - info["progress_msg"] = "" - info["chunk_num"] = 0 - info["token_num"] = 0 - - DocumentService.update_by_id(id, info) - - ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) - - _, doc = DocumentService.get_by_id(id) - doc = doc.to_dict() - doc_id = doc["id"] - - bucket, name = File2DocumentService.get_minio_address(doc_id=doc_id) - binary = MINIO.get(bucket, name) - parser_id = doc["parser_id"] - if binary: - res = doc_parse(binary, name, parser_id, tenant_id) - if res is False: - message += f"The parser id: {parser_id} of the document {doc_id} is not supported; " + res = get_message_during_parsing_document(id, message) + if isinstance(res, str): + message += res else: - message += f"Empty data in the document: {name}; " - # failed in parsing - if doc["status"] == TaskStatus.FAIL.value: - message += f"Failed in parsing the document: {doc_id}; " + return res return construct_json_result(data=True, code=RetCode.SUCCESS, message=message) # two conditions @@ -760,6 +706,47 @@ def process(doc_ids): return construct_error_response(e) +# helper method for getting message or response when parsing the document +def get_message_during_parsing_document(id, message): + try: + # Check whether there is this document + exist, document = DocumentService.get_by_id(id) + if not exist: + return construct_json_result(message=f"This document '{id}' cannot be found!", + code=RetCode.ARGUMENT_ERROR) + + tenant_id = DocumentService.get_tenant_id(id) + if not tenant_id: + return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) + + info = {"run": "1", "progress": 0} + info["progress_msg"] = "" + info["chunk_num"] = 0 + info["token_num"] = 0 + + DocumentService.update_by_id(id, info) + + ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) + + _, doc = DocumentService.get_by_id(id) + doc = doc.to_dict() + doc_id = doc["id"] + + bucket, name = File2DocumentService.get_minio_address(doc_id=doc_id) + binary = MINIO.get(bucket, name) + parser_id = doc["parser_id"] + if binary: + res = doc_parse(binary, name, parser_id, tenant_id) + if res is False: + message += f"The parser id: {parser_id} of the document {doc_id} is not supported; " + else: + message += f"Empty data in the document: {name}; " + # failed in parsing + if doc["status"] == TaskStatus.FAIL.value: + message += f"Failed in parsing the document: {doc_id}; " + return message + except Exception as e: + return construct_error_response(e) # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- diff --git a/sdk/python/test/test_data/lol.txt b/sdk/python/test/test_data/lol.txt new file mode 100644 index 00000000000..34883d23ace --- /dev/null +++ b/sdk/python/test/test_data/lol.txt @@ -0,0 +1,3 @@ +llll +ooooo +llll \ No newline at end of file diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index 922afb6acdb..38839d73a81 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -711,7 +711,7 @@ def 
test_start_parsing_document_with_success(self): doc_id = data["id"] # parse file res = ragflow.start_parsing_document(created_res_id, doc_id) - assert res["code"] == RetCode.SUCCESS and res["data"] is True + assert res["code"] == RetCode.SUCCESS and res["message"] == "" def test_start_parsing_nonexistent_document(self): """ @@ -722,7 +722,7 @@ def test_start_parsing_nonexistent_document(self): created_res = ragflow.create_dataset("test_start_parsing_nonexistent_document") created_res_id = created_res["data"]["dataset_id"] res = ragflow.start_parsing_document(created_res_id, "imagination") - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This 'imagination' is not a valid document." + assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == "This document 'imagination' cannot be found!" def test_start_parsing_document_in_nonexistent_dataset(self): """ @@ -757,7 +757,7 @@ def test_start_parsing_an_empty_document(self): data = uploading_res["data"][0] doc_id = data["id"] res = ragflow.start_parsing_document(created_res_id, doc_id) - assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt" + assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt; " # ------------------------parsing multiple documents---------------------------- def test_start_parsing_documents_in_nonexistent_dataset(self): @@ -776,6 +776,9 @@ def test_start_parsing_documents_in_nonexistent_dataset(self): assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!" def test_start_parsing_multiple_documents(self): + """ + Test the parsing documents with a success. + """ # create a dataset ragflow = RAGFlow(API_KEY, HOST_ADDRESS) created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") @@ -787,6 +790,9 @@ def test_start_parsing_multiple_documents(self): assert res["code"] == RetCode.SUCCESS and res["data"] is True and res["message"] == "" def test_start_parsing_multiple_documents_with_one_empty_file(self): + """ + Test the parsing documents, one of which is empty. + """ # create a dataset ragflow = RAGFlow(API_KEY, HOST_ADDRESS) created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") @@ -798,6 +804,50 @@ def test_start_parsing_multiple_documents_with_one_empty_file(self): assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt; " def test_start_parsing_multiple_specific_documents(self): + """ + Test the parsing documents whose document ids are specified. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"] + doc_ids = [] + for d in data: + doc_ids.append(d["id"]) + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + + def test_start_re_parsing_multiple_specific_documents(self): + """ + Test the re-parsing documents. 
+ """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"] + doc_ids = [] + for d in data: + doc_ids.append(d["id"]) + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + # re-parse + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + + def test_start_re_parsing_multiple_specific_documents_with_changing_parser_id(self): + """ + Test the re-parsing documents after changing the parser id. + """ # create a dataset ragflow = RAGFlow(API_KEY, HOST_ADDRESS) created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") @@ -812,7 +862,94 @@ def test_start_parsing_multiple_specific_documents(self): doc_ids.append(d["id"]) res = ragflow.start_parsing_documents(created_res_id, doc_ids) assert res["code"] == RetCode.SUCCESS and res["message"] == "" + # general -> laws + params = { + "template_type": "laws" + } + ragflow.update_file(created_res_id, doc_ids[0], **params) + # re-parse + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + def test_start_re_parsing_multiple_specific_documents_with_changing_illegal_parser_id(self): + """ + Test the re-parsing documents after changing an illegal parser id. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"] + doc_ids = [] + for d in data: + doc_ids.append(d["id"]) + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + # general -> illegal + params = { + "template_type": "illegal" + } + res = ragflow.update_file(created_res_id, doc_ids[0], **params) + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'template_type' field." + # re-parse + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + + def test_start_parsing_multiple_specific_documents_with_changing_illegal_parser_id(self): + """ + Test the parsing documents after changing an illegal parser id. 
+ """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"] + doc_ids = [] + for d in data: + doc_ids.append(d["id"]) + # general -> illegal + params = { + "template_type": "illegal" + } + res = ragflow.update_file(created_res_id, doc_ids[0], **params) + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'template_type' field." + # re-parse + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + + def test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal(self): + """ + Test the parsing documents whose dataset's parser id is illegal. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal") + created_res_id = created_res["data"]["dataset_id"] + # update the parser id + params = { + "chunk_method": "illegal" + } + res = ragflow.update_dataset("test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal", **params) + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'chunk_method' field." + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"] + doc_ids = [] + for d in data: + doc_ids.append(d["id"]) + # parse + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- From c5fde04eacc5207974be0dae8fe97bd7b445594f Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Thu, 11 Jul 2024 17:35:37 +0800 Subject: [PATCH 6/6] rename some variables --- api/apps/dataset_api.py | 46 ++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 87ba1766d9a..67201c46eb1 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -618,31 +618,31 @@ def dummy(prog=None, msg=""): pass -def doc_parse(binary, name, parser_id, tenant_id): - match parser_id: +def doc_parse(binary, doc_name, parser_name, tenant_id): + match parser_name: case "book": - book.chunk(name, binary=binary, callback=dummy) + book.chunk(doc_name, binary=binary, callback=dummy) case "laws": - laws.chunk(name, binary=binary, callback=dummy) + laws.chunk(doc_name, binary=binary, callback=dummy) case "manual": - manual.chunk(name, binary=binary, callback=dummy) + manual.chunk(doc_name, binary=binary, callback=dummy) case "naive": # It's the mode by default, which is general in the front-end - naive.chunk(name, binary=binary, callback=dummy) + naive.chunk(doc_name, binary=binary, callback=dummy) case "one": - one.chunk(name, binary=binary, callback=dummy) + one.chunk(doc_name, binary=binary, callback=dummy) case "paper": - paper.chunk(name, binary=binary, 
callback=dummy) + paper.chunk(doc_name, binary=binary, callback=dummy) case "picture": - picture.chunk(name, binary=binary, tenant_id=tenant_id, lang="Chinese", callback=dummy) + picture.chunk(doc_name, binary=binary, tenant_id=tenant_id, lang="Chinese", callback=dummy) case "presentation": - presentation.chunk(name, binary=binary, callback=dummy) + presentation.chunk(doc_name, binary=binary, callback=dummy) case "qa": - qa.chunk(name, binary=binary, callback=dummy) + qa.chunk(doc_name, binary=binary, callback=dummy) case "resume": - resume.chunk(name, binary=binary, callback=dummy) + resume.chunk(doc_name, binary=binary, callback=dummy) case "table": - table.chunk(name, binary=binary, callback=dummy) + table.chunk(doc_name, binary=binary, callback=dummy) case _: return False @@ -728,21 +728,21 @@ def get_message_during_parsing_document(id, message): ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) - _, doc = DocumentService.get_by_id(id) - doc = doc.to_dict() - doc_id = doc["id"] + _, doc_attributes = DocumentService.get_by_id(id) + doc_attributes = doc_attributes.to_dict() + doc_id = doc_attributes["id"] - bucket, name = File2DocumentService.get_minio_address(doc_id=doc_id) - binary = MINIO.get(bucket, name) - parser_id = doc["parser_id"] + bucket, doc_name = File2DocumentService.get_minio_address(doc_id=doc_id) + binary = MINIO.get(bucket, doc_name) + parser_name = doc_attributes["parser_id"] if binary: - res = doc_parse(binary, name, parser_id, tenant_id) + res = doc_parse(binary, doc_name, parser_name, tenant_id) if res is False: - message += f"The parser id: {parser_id} of the document {doc_id} is not supported; " + message += f"The parser id: {parser_name} of the document {doc_id} is not supported; " else: - message += f"Empty data in the document: {name}; " + message += f"Empty data in the document: {doc_name}; " # failed in parsing - if doc["status"] == TaskStatus.FAIL.value: + if doc_attributes["status"] == TaskStatus.FAIL.value: message += f"Failed in parsing the document: {doc_id}; " return message except Exception as e:
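The match statement in doc_parse repeats the same chunk(doc_name, binary=binary, callback=dummy) call for every parser, so each newly supported parser means another case branch. A module-level dispatch table is an equivalent, more compact shape. The sketch below is a hypothetical rewrite, not part of the patch; it assumes only the rag.app modules this file already imports and the dummy callback defined above, and keeps picture as the one special case because it also needs tenant_id and a language hint.

# Hypothetical dispatch-table variant of doc_parse; behaviour mirrors the
# match statement in the patch.
PARSER_MODULES = {
    "book": book, "laws": laws, "manual": manual, "naive": naive,
    "one": one, "paper": paper, "presentation": presentation,
    "qa": qa, "resume": resume, "table": table,
}

def doc_parse(binary, doc_name, parser_name, tenant_id):
    if parser_name == "picture":
        # The picture parser is the only one that needs the tenant id and a language.
        picture.chunk(doc_name, binary=binary, tenant_id=tenant_id,
                      lang="Chinese", callback=dummy)
        return True
    module = PARSER_MODULES.get(parser_name)
    if module is None:
        return False  # unsupported parser id, same as the final case branch
    module.chunk(doc_name, binary=binary, callback=dummy)
    return True

Supporting an additional parser then becomes a one-line addition to the table instead of a new case branch.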