From e2f7cf7ba7dfdb4121c3407b2faaa18663fd46f7 Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Thu, 4 Jul 2024 16:53:50 +0800 Subject: [PATCH 1/6] API: start parsing --- api/apps/dataset_api.py | 57 ++++++++++++++++++++++++++-- sdk/python/ragflow/ragflow.py | 5 +++ sdk/python/test/test_document.py | 64 ++++++++++++++++++++++++++++++++ 3 files changed, 123 insertions(+), 3 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 3b290630bdd..5f2f60ba01b 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -18,26 +18,30 @@ import warnings from io import BytesIO +from elasticsearch_dsl import Q from flask import request, send_file from flask_login import login_required, current_user from httpx import HTTPError from minio import S3Error from api.contants import NAME_LENGTH_LIMIT -from api.db import FileType, ParserType, FileSource +from api.db import FileType, ParserType, FileSource, TaskStatus from api.db import StatusEnum -from api.db.db_models import File +from api.db.db_models import File, Task from api.db.services import duplicate_name from api.db.services.document_service import DocumentService from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.db.services.knowledgebase_service import KnowledgebaseService +from api.db.services.task_service import TaskService, queue_tasks from api.db.services.user_service import TenantService from api.settings import RetCode from api.utils import get_uuid from api.utils.api_utils import construct_json_result, construct_error_response from api.utils.api_utils import construct_result, validate_request from api.utils.file_utils import filename_type, thumbnail +from rag.nlp import search +from rag.utils.es_conn import ELASTICSEARCH from rag.utils.minio_conn import MINIO MAXIMUM_OF_UPLOADING_FILES = 256 @@ -341,7 +345,7 @@ def upload_documents(dataset_id): blob = file.read() # the content is empty, raising a warning if blob == b'': - warnings.warn(f"[WARNING]: The file {filename} is empty.") + warnings.warn(f"[WARNING]: The content of the file {filename} is empty.") MINIO.put(dataset_id, location, blob) @@ -592,7 +596,54 @@ def download_document(dataset_id, document_id): return construct_error_response(e) # ----------------------------start parsing----------------------------------------------------- +@manager.route("//documents//status", methods=["POST"]) +@login_required +@validate_request("status") +def run(dataset_id, document_id): + run_value = request.json["status"] + try: + exist, _ = KnowledgebaseService.get_by_id(dataset_id) + if not exist: + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"This dataset '{dataset_id}' cannot be found!") + # valid document? + exist, _ = DocumentService.get_by_id(document_id) + if not exist: + return construct_json_result(code=RetCode.DATA_ERROR, message=f"This '{document_id}' is not a valid document.") + + info = {"run": run_value, "progress": 0} # initial progress of 0 / reset the progress + + if run_value != TaskStatus.RUNNING.value: + return construct_json_result( + code=RetCode.ARGUMENT_ERROR, + message=f"The status value should be '1', not {run_value}." 
+ ) + info["progress_msg"] = "" + info["chunk_num"] = 0 + info["token_num"] = 0 + DocumentService.update_by_id(document_id, info) # update information regarding it + + # in case that it's null + tenant_id = DocumentService.get_tenant_id(document_id) + if not tenant_id: + return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) + + # delete it from es + ELASTICSEARCH.deleteByQuery(Q("match", doc_id=document_id), idxnm=search.index_name(tenant_id)) + + # delete the tasks from the cache + TaskService.filter_delete([Task.doc_id == document_id]) + _, doc = DocumentService.get_by_id(document_id) + doc = doc.to_dict() + # renew + doc["tenant_id"] = tenant_id + bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"]) # address + queue_tasks(doc, bucket, name) # as queue + + return construct_json_result(data=True, code=RetCode.SUCCESS) + except Exception as e: + return construct_error_response(e) # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py index 6275f921c3c..f8af491596b 100644 --- a/sdk/python/ragflow/ragflow.py +++ b/sdk/python/ragflow/ragflow.py @@ -142,8 +142,13 @@ def download_file(self, dataset_id, document_id): with open(file_path, "wb") as file: file.write(content) return {"code": RetCode.SUCCESS, "data": content} + # ----------------------------start parsing----------------------------------------------------- + def start_parsing(self, dataset_id, document_id): + endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}/status" + res = requests.post(endpoint, headers=self.authorization_header, json={"status": "1"}) + return res.json() # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index f7f87a148b8..9698ca0a3ef 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -695,6 +695,70 @@ def test_download_an_empty_document(self): assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This file is empty." # ----------------------------start parsing----------------------------------------------------- + def test_start_parsing_document_with_success(self): + """ + Test the parsing of a document with success. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_start_parsing_document_with_success") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"][0] + doc_id = data["id"] + # parse file + res = ragflow.start_parsing(created_res_id, doc_id) + assert res["code"] == RetCode.SUCCESS and res["data"] is True + + def test_start_parsing_nonexistent_document(self): + """ + Test the parsing a document which does not exist. 
+ """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_start_parsing_nonexistent_document") + created_res_id = created_res["data"]["dataset_id"] + res = ragflow.start_parsing(created_res_id, "imagination") + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This 'imagination' is not a valid document." + + def test_start_parsing_document_in_nonexistent_dataset(self): + """ + Test the parsing a document whose dataset is nonexistent. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_download_nonexistent_document") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"][0] + doc_id = data["id"] + # parse + res = ragflow.start_parsing("imagination", doc_id) + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!" + + def test_start_parsing_an_empty_document(self): + """ + Test the parsing of an empty document. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_download_nonexistent_document") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/empty.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"][0] + doc_id = data["id"] + + res = ragflow.start_parsing(created_res_id, doc_id) + assert res["code"] == RetCode.SUCCESS and res["data"] is True # ----------------------------stop parsing----------------------------------------------------- From 65a9573f2b8ff9d063c7f7e6ade719f9a8e566cc Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Mon, 8 Jul 2024 20:25:05 +0800 Subject: [PATCH 2/6] fixed bugs --- api/apps/dataset_api.py | 132 ++++++++++++++++++++++++++----- sdk/python/ragflow/ragflow.py | 8 +- sdk/python/test/test_document.py | 20 +++-- 3 files changed, 135 insertions(+), 25 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 5f2f60ba01b..15b24eb46e8 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -22,7 +22,6 @@ from flask import request, send_file from flask_login import login_required, current_user from httpx import HTTPError -from minio import S3Error from api.contants import NAME_LENGTH_LIMIT from api.db import FileType, ParserType, FileSource, TaskStatus @@ -33,19 +32,21 @@ from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.db.services.knowledgebase_service import KnowledgebaseService -from api.db.services.task_service import TaskService, queue_tasks +from api.db.services.task_service import TaskService from api.db.services.user_service import TenantService from api.settings import RetCode from api.utils import get_uuid from api.utils.api_utils import construct_json_result, construct_error_response from api.utils.api_utils import construct_result, validate_request from api.utils.file_utils import filename_type, thumbnail +from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture from rag.nlp import search from rag.utils.es_conn import ELASTICSEARCH from rag.utils.minio_conn import MINIO MAXIMUM_OF_UPLOADING_FILES = 256 + # ------------------------------ create a dataset 
--------------------------------------- @manager.route("/", methods=["POST"]) @@ -120,6 +121,7 @@ def create_dataset(): except Exception as e: return construct_error_response(e) + # -----------------------------list datasets------------------------------------------------------- @manager.route("/", methods=["GET"]) @@ -139,6 +141,7 @@ def list_datasets(): except HTTPError as http_err: return construct_json_result(http_err) + # ---------------------------------delete a dataset ---------------------------- @manager.route("/", methods=["DELETE"]) @@ -166,13 +169,15 @@ def remove_dataset(dataset_id): # delete the dataset if not KnowledgebaseService.delete_by_id(dataset_id): - return construct_json_result(code=RetCode.DATA_ERROR, message="There was an error during the dataset removal process. " - "Please check the status of the RAGFlow server and try the removal again.") + return construct_json_result(code=RetCode.DATA_ERROR, + message="There was an error during the dataset removal process. " + "Please check the status of the RAGFlow server and try the removal again.") # success return construct_json_result(code=RetCode.SUCCESS, message=f"Remove dataset: {dataset_id} successfully") except Exception as e: return construct_error_response(e) + # ------------------------------ get details of a dataset ---------------------------------------- @manager.route("/", methods=["GET"]) @@ -186,6 +191,7 @@ def get_dataset(dataset_id): except Exception as e: return construct_json_result(e) + # ------------------------------ update a dataset -------------------------------------------- @manager.route("/", methods=["PUT"]) @@ -213,8 +219,9 @@ def update_dataset(dataset_id): if name.lower() != dataset.name.lower() \ and len(KnowledgebaseService.query(name=name, tenant_id=current_user.id, status=StatusEnum.VALID.value)) > 1: - return construct_json_result(code=RetCode.DATA_ERROR, message=f"The name: {name.lower()} is already used by other " - f"datasets. Please choose a different name.") + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"The name: {name.lower()} is already used by other " + f"datasets. 
Please choose a different name.") dataset_updating_data = {} chunk_num = req.get("chunk_num") @@ -269,12 +276,14 @@ def update_dataset(dataset_id): except Exception as e: return construct_error_response(e) + # --------------------------------content management ---------------------------------------------- # ----------------------------upload files----------------------------------------------------- @manager.route("//documents/", methods=["POST"]) @login_required def upload_documents(dataset_id): + print("WWWWWWWWWWWWWWWWWWWWWWWWwwwww", flush=True) # no files if not request.files: return construct_json_result( @@ -343,6 +352,7 @@ def upload_documents(dataset_id): location += "_" blob = file.read() + # the content is empty, raising a warning if blob == b'': warnings.warn(f"[WARNING]: The content of the file {filename} is empty.") @@ -457,6 +467,7 @@ def list_documents(dataset_id): except Exception as e: return construct_error_response(e) + # ----------------------------update: enable rename----------------------------------------------------- @manager.route("//documents/", methods=["PUT"]) @login_required @@ -559,6 +570,7 @@ def update_document(dataset_id, document_id): def is_illegal_value_for_enum(value, enum_class): return value not in enum_class.__members__.values() + # ----------------------------download a file----------------------------------------------------- @manager.route("//documents/", methods=["GET"]) @login_required @@ -567,7 +579,8 @@ def download_document(dataset_id, document_id): # Check whether there is this dataset exist, _ = KnowledgebaseService.get_by_id(dataset_id) if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset '{dataset_id}' cannot be found!") + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"This dataset '{dataset_id}' cannot be found!") # Check whether there is this document exist, document = DocumentService.get_by_id(document_id) @@ -595,13 +608,46 @@ def download_document(dataset_id, document_id): except Exception as e: return construct_error_response(e) -# ----------------------------start parsing----------------------------------------------------- +# ----------------------------start parsing a document----------------------------------------------------- +def dummy(prog=None, msg=""): + pass + +def doc_chunk(binary, name, doc, tenant_id): + parser_id = doc["parser_id"] + if parser_id == "book": + book.chunk(name, binary=binary, callback=dummy) + elif parser_id == "laws": + laws.chunk(name, binary=binary, callback=dummy) + elif parser_id == "manual": + manual.chunk(name, binary=binary, callback=dummy) + elif parser_id == "naive": + naive.chunk(name, binary=binary, callback=dummy) + elif parser_id == "one": + one.chunk(name, binary=binary, callback=dummy) + elif parser_id == "paper": + paper.chunk(name, binary=binary, callback=dummy) + elif parser_id == "picture": + picture.chunk(name, binary=binary, tenant_id=tenant_id, lang="Chinese", callback=dummy) + elif parser_id == "presentation": + presentation.chunk(name, binary=binary, callback=dummy) + elif parser_id == "qa": + qa.chunk(name, binary=binary, callback=dummy) + elif parser_id == "resume": + resume.chunk(name, binary=binary, callback=dummy) + elif parser_id == "table": + table.chunk(name, binary=binary, callback=dummy) + else: + return False + + return True + @manager.route("//documents//status", methods=["POST"]) @login_required @validate_request("status") -def run(dataset_id, document_id): +def parse_document(dataset_id, document_id): 
run_value = request.json["status"] try: + # valid dataset exist, _ = KnowledgebaseService.get_by_id(dataset_id) if not exist: return construct_json_result(code=RetCode.DATA_ERROR, @@ -611,6 +657,11 @@ def run(dataset_id, document_id): if not exist: return construct_json_result(code=RetCode.DATA_ERROR, message=f"This '{document_id}' is not a valid document.") + # in case that it's null + tenant_id = DocumentService.get_tenant_id(document_id) + if not tenant_id: + return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) + info = {"run": run_value, "progress": 0} # initial progress of 0 / reset the progress if run_value != TaskStatus.RUNNING.value: @@ -624,11 +675,6 @@ def run(dataset_id, document_id): DocumentService.update_by_id(document_id, info) # update information regarding it - # in case that it's null - tenant_id = DocumentService.get_tenant_id(document_id) - if not tenant_id: - return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) - # delete it from es ELASTICSEARCH.deleteByQuery(Q("match", doc_id=document_id), idxnm=search.index_name(tenant_id)) @@ -636,14 +682,65 @@ def run(dataset_id, document_id): TaskService.filter_delete([Task.doc_id == document_id]) _, doc = DocumentService.get_by_id(document_id) doc = doc.to_dict() + # renew doc["tenant_id"] = tenant_id - bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"]) # address - queue_tasks(doc, bucket, name) # as queue + bucket, name = File2DocumentService.get_minio_address(doc_id=document_id) # address + binary = MINIO.get(bucket, name) + if binary: + if doc_chunk(binary, name, doc, tenant_id) is True: + return construct_json_result(data=True, code=RetCode.SUCCESS) + return construct_json_result(code=RetCode.DATA_ERROR, message="wrong parser id") + return construct_json_result(code=RetCode.DATA_ERROR, message=f"Empty data in the document: {document_id}") + except Exception as e: + return construct_error_response(e) + +# ----------------------------start parsing documents----------------------------------------------------- +@manager.route("//documents/status", methods=["POST"]) +@login_required +@validate_request("status") +def parse_documents(dataset_id): + run_value = request.json["status"] + try: + exist, _ = KnowledgebaseService.get_by_id(dataset_id) + if not exist: + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"This dataset '{dataset_id}' cannot be found!") + # documents inside the dataset + docs, total = DocumentService.list_documents_in_dataset(dataset_id, 0, -1, "create_time", + True, "") + # for loop + for doc in docs: + id = doc["id"] + + tenant_id = DocumentService.get_tenant_id(id) + if not tenant_id: + return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) + + info = {"run": run_value, "progress": 0} + if run_value == TaskStatus.RUNNING.value: + info["progress_msg"] = "" + info["chunk_num"] = 0 + info["token_num"] = 0 + DocumentService.update_by_id(id, info) + + ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) + + TaskService.filter_delete([Task.doc_id == id]) + _, doc = DocumentService.get_by_id(id) + doc = doc.to_dict() + + doc["tenant_id"] = tenant_id + bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"]) + binary = MINIO.get(bucket, name) + if binary: + if doc_chunk(binary, name, doc, tenant_id) is False: + return construct_json_result(code=RetCode.DATA_ERROR, message="wrong parser id") return 
construct_json_result(data=True, code=RetCode.SUCCESS) except Exception as e: return construct_error_response(e) + # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- @@ -661,6 +758,3 @@ def run(dataset_id, document_id): # ----------------------------get a specific chunk----------------------------------------------------- # ----------------------------retrieval test----------------------------------------------------- - - - diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py index f8af491596b..2e03750ee89 100644 --- a/sdk/python/ragflow/ragflow.py +++ b/sdk/python/ragflow/ragflow.py @@ -144,10 +144,16 @@ def download_file(self, dataset_id, document_id): return {"code": RetCode.SUCCESS, "data": content} # ----------------------------start parsing----------------------------------------------------- - def start_parsing(self, dataset_id, document_id): + def start_parsing_document(self, dataset_id, document_id): endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}/status" res = requests.post(endpoint, headers=self.authorization_header, json={"status": "1"}) + return res.json() + + def start_parsing_documents(self, dataset_id): + endpoint = f"{self.dataset_url}/{dataset_id}/documents/status" + res = requests.post(endpoint, headers=self.authorization_header, json={"status": "1"}) + return res.json() # ----------------------------stop parsing----------------------------------------------------- diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index 9698ca0a3ef..73170fba488 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -704,13 +704,13 @@ def test_start_parsing_document_with_success(self): created_res = ragflow.create_dataset("test_start_parsing_document_with_success") created_res_id = created_res["data"]["dataset_id"] # upload files - file_paths = ["test_data/test.txt"] + file_paths = ["test_data/lol.txt"] uploading_res = ragflow.upload_local_file(created_res_id, file_paths) # get the doc_id data = uploading_res["data"][0] doc_id = data["id"] # parse file - res = ragflow.start_parsing(created_res_id, doc_id) + res = ragflow.start_parsing_document(created_res_id, doc_id) assert res["code"] == RetCode.SUCCESS and res["data"] is True def test_start_parsing_nonexistent_document(self): @@ -721,7 +721,7 @@ def test_start_parsing_nonexistent_document(self): ragflow = RAGFlow(API_KEY, HOST_ADDRESS) created_res = ragflow.create_dataset("test_start_parsing_nonexistent_document") created_res_id = created_res["data"]["dataset_id"] - res = ragflow.start_parsing(created_res_id, "imagination") + res = ragflow.start_parsing_document(created_res_id, "imagination") assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This 'imagination' is not a valid document." def test_start_parsing_document_in_nonexistent_dataset(self): @@ -739,7 +739,7 @@ def test_start_parsing_document_in_nonexistent_dataset(self): data = uploading_res["data"][0] doc_id = data["id"] # parse - res = ragflow.start_parsing("imagination", doc_id) + res = ragflow.start_parsing_document("imagination", doc_id) assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!" 
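Every parsing test in this series repeats the same set-up: build a RAGFlow client, create a dataset, upload a file, then read the doc_id back out of the upload response. A small helper along the lines below could factor that out. This is only a sketch against the SDK calls the tests already use (RAGFlow, create_dataset, upload_local_file) and the module-level API_KEY / HOST_ADDRESS constants in test_document.py; the helper name is hypothetical and it is not part of the patch.

def create_dataset_with_files(dataset_name, file_paths):
    # Shared set-up used by the parsing tests: create a client and a dataset,
    # upload the given files, and return the ids the assertions need.
    ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
    created_res = ragflow.create_dataset(dataset_name)
    dataset_id = created_res["data"]["dataset_id"]
    uploading_res = ragflow.upload_local_file(dataset_id, file_paths)
    doc_ids = [doc["id"] for doc in uploading_res["data"]]
    return ragflow, dataset_id, doc_ids

With such a helper, a test like test_start_parsing_document_with_success reduces to one call followed by the start_parsing_document assertion.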
def test_start_parsing_an_empty_document(self): @@ -756,8 +756,18 @@ def test_start_parsing_an_empty_document(self): # get the doc_id data = uploading_res["data"][0] doc_id = data["id"] + res = ragflow.start_parsing_document(created_res_id, doc_id) + assert res["code"] == RetCode.DATA_ERROR and res["message"] == f"Empty data in the document: {doc_id}" - res = ragflow.start_parsing(created_res_id, doc_id) + def test_start_parsing_multiple_documents(self): + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + res = ragflow.start_parsing_documents(created_res_id) assert res["code"] == RetCode.SUCCESS and res["data"] is True # ----------------------------stop parsing----------------------------------------------------- From c5f1105257889bb575d4003677920d783af81b8b Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Tue, 9 Jul 2024 10:41:20 +0800 Subject: [PATCH 3/6] added tests --- api/apps/dataset_api.py | 41 +++++++++++++++++--------------- sdk/python/test/test_document.py | 15 ++++++++++-- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 15b24eb46e8..ae2d65cf519 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -283,7 +283,6 @@ def update_dataset(dataset_id): @manager.route("//documents/", methods=["POST"]) @login_required def upload_documents(dataset_id): - print("WWWWWWWWWWWWWWWWWWWWWWWWwwwww", flush=True) # no files if not request.files: return construct_json_result( @@ -609,11 +608,12 @@ def download_document(dataset_id, document_id): return construct_error_response(e) # ----------------------------start parsing a document----------------------------------------------------- +# helper method for parsing def dummy(prog=None, msg=""): pass -def doc_chunk(binary, name, doc, tenant_id): - parser_id = doc["parser_id"] + +def doc_chunk(binary, name, parser_id, tenant_id): if parser_id == "book": book.chunk(name, binary=binary, callback=dummy) elif parser_id == "laws": @@ -665,33 +665,33 @@ def parse_document(dataset_id, document_id): info = {"run": run_value, "progress": 0} # initial progress of 0 / reset the progress if run_value != TaskStatus.RUNNING.value: - return construct_json_result( - code=RetCode.ARGUMENT_ERROR, - message=f"The status value should be '1', not {run_value}." 
- ) + return construct_json_result(code=RetCode.ARGUMENT_ERROR, + message=f"The status value should be '1', not {run_value}.") + info["progress_msg"] = "" info["chunk_num"] = 0 info["token_num"] = 0 - DocumentService.update_by_id(document_id, info) # update information regarding it + DocumentService.update_by_id(document_id, info) # update information # delete it from es ELASTICSEARCH.deleteByQuery(Q("match", doc_id=document_id), idxnm=search.index_name(tenant_id)) # delete the tasks from the cache - TaskService.filter_delete([Task.doc_id == document_id]) - _, doc = DocumentService.get_by_id(document_id) + _, doc = DocumentService.get_by_id(document_id) # get doc object doc = doc.to_dict() # renew - doc["tenant_id"] = tenant_id bucket, name = File2DocumentService.get_minio_address(doc_id=document_id) # address - binary = MINIO.get(bucket, name) + binary = MINIO.get(bucket, name) # content + parser_id = doc["parser_id"] if binary: - if doc_chunk(binary, name, doc, tenant_id) is True: + if doc_chunk(binary, name, parser_id, tenant_id) is True: return construct_json_result(data=True, code=RetCode.SUCCESS) - return construct_json_result(code=RetCode.DATA_ERROR, message="wrong parser id") - return construct_json_result(code=RetCode.DATA_ERROR, message=f"Empty data in the document: {document_id}") + + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"Parser id: {parser_id} is not supported") + return construct_json_result(code=RetCode.DATA_ERROR, message=f"Empty data in the document: {name}") except Exception as e: return construct_error_response(e) @@ -722,20 +722,23 @@ def parse_documents(dataset_id): info["progress_msg"] = "" info["chunk_num"] = 0 info["token_num"] = 0 + DocumentService.update_by_id(id, info) ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) - TaskService.filter_delete([Task.doc_id == id]) _, doc = DocumentService.get_by_id(id) doc = doc.to_dict() - doc["tenant_id"] = tenant_id bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"]) binary = MINIO.get(bucket, name) + parser_id = doc["parser_id"] if binary: - if doc_chunk(binary, name, doc, tenant_id) is False: - return construct_json_result(code=RetCode.DATA_ERROR, message="wrong parser id") + if doc_chunk(binary, name, parser_id, tenant_id) is False: + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"Parser id: {parser_id} is not supported") + else: + return construct_json_result(code=RetCode.DATA_ERROR, message=f"Empty data in the document: {name}") return construct_json_result(data=True, code=RetCode.SUCCESS) except Exception as e: diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index 73170fba488..e0e2d227dbe 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -757,7 +757,7 @@ def test_start_parsing_an_empty_document(self): data = uploading_res["data"][0] doc_id = data["id"] res = ragflow.start_parsing_document(created_res_id, doc_id) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == f"Empty data in the document: {doc_id}" + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Empty data in the document: empty.txt" def test_start_parsing_multiple_documents(self): # create a dataset @@ -766,10 +766,21 @@ def test_start_parsing_multiple_documents(self): created_res_id = created_res["data"]["dataset_id"] # upload files file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) 
+ ragflow.upload_local_file(created_res_id, file_paths) res = ragflow.start_parsing_documents(created_res_id) assert res["code"] == RetCode.SUCCESS and res["data"] is True + def test_start_parsing_multiple_documents_with_one_empty_file(self): + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt", "test_data/empty.txt"] + ragflow.upload_local_file(created_res_id, file_paths) + res = ragflow.start_parsing_documents(created_res_id) + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Empty data in the document: empty.txt" + # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- From fb38a9c4e265f744bb457b535cb901411575a289 Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Thu, 11 Jul 2024 11:10:58 +0800 Subject: [PATCH 4/6] json parameters changed --- api/apps/dataset_api.py | 150 +++++++++++++++++-------------- sdk/python/ragflow/ragflow.py | 7 +- sdk/python/test/test_document.py | 38 +++++++- 3 files changed, 122 insertions(+), 73 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index ae2d65cf519..581d2e31f31 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -607,45 +607,46 @@ def download_document(dataset_id, document_id): except Exception as e: return construct_error_response(e) + # ----------------------------start parsing a document----------------------------------------------------- # helper method for parsing def dummy(prog=None, msg=""): pass -def doc_chunk(binary, name, parser_id, tenant_id): - if parser_id == "book": - book.chunk(name, binary=binary, callback=dummy) - elif parser_id == "laws": - laws.chunk(name, binary=binary, callback=dummy) - elif parser_id == "manual": - manual.chunk(name, binary=binary, callback=dummy) - elif parser_id == "naive": - naive.chunk(name, binary=binary, callback=dummy) - elif parser_id == "one": - one.chunk(name, binary=binary, callback=dummy) - elif parser_id == "paper": - paper.chunk(name, binary=binary, callback=dummy) - elif parser_id == "picture": - picture.chunk(name, binary=binary, tenant_id=tenant_id, lang="Chinese", callback=dummy) - elif parser_id == "presentation": - presentation.chunk(name, binary=binary, callback=dummy) - elif parser_id == "qa": - qa.chunk(name, binary=binary, callback=dummy) - elif parser_id == "resume": - resume.chunk(name, binary=binary, callback=dummy) - elif parser_id == "table": - table.chunk(name, binary=binary, callback=dummy) - else: - return False +def doc_parse(binary, name, parser_id, tenant_id): + match parser_id: + case "book": + book.chunk(name, binary=binary, callback=dummy) + case "laws": + laws.chunk(name, binary=binary, callback=dummy) + case "manual": + manual.chunk(name, binary=binary, callback=dummy) + case "naive": + naive.chunk(name, binary=binary, callback=dummy) + case "one": + one.chunk(name, binary=binary, callback=dummy) + case "paper": + paper.chunk(name, binary=binary, callback=dummy) + case "picture": + picture.chunk(name, binary=binary, tenant_id=tenant_id, lang="Chinese", callback=dummy) + case "presentation": + presentation.chunk(name, binary=binary, callback=dummy) + case "qa": + qa.chunk(name, binary=binary, callback=dummy) + case "resume": + 
resume.chunk(name, binary=binary, callback=dummy) + case "table": + table.chunk(name, binary=binary, callback=dummy) + case _: + return False return True + @manager.route("//documents//status", methods=["POST"]) @login_required -@validate_request("status") def parse_document(dataset_id, document_id): - run_value = request.json["status"] try: # valid dataset exist, _ = KnowledgebaseService.get_by_id(dataset_id) @@ -655,18 +656,15 @@ def parse_document(dataset_id, document_id): # valid document? exist, _ = DocumentService.get_by_id(document_id) if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, message=f"This '{document_id}' is not a valid document.") + return construct_json_result(code=RetCode.DATA_ERROR, + message=f"This '{document_id}' is not a valid document.") # in case that it's null tenant_id = DocumentService.get_tenant_id(document_id) if not tenant_id: return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) - info = {"run": run_value, "progress": 0} # initial progress of 0 / reset the progress - - if run_value != TaskStatus.RUNNING.value: - return construct_json_result(code=RetCode.ARGUMENT_ERROR, - message=f"The status value should be '1', not {run_value}.") + info = {"run": "1", "progress": 0} # initial progress of 0 / reset the progress info["progress_msg"] = "" info["chunk_num"] = 0 @@ -686,64 +684,82 @@ def parse_document(dataset_id, document_id): binary = MINIO.get(bucket, name) # content parser_id = doc["parser_id"] if binary: - if doc_chunk(binary, name, parser_id, tenant_id) is True: + if doc_parse(binary, name, parser_id, tenant_id) is True: return construct_json_result(data=True, code=RetCode.SUCCESS) return construct_json_result(code=RetCode.DATA_ERROR, message=f"Parser id: {parser_id} is not supported") - return construct_json_result(code=RetCode.DATA_ERROR, message=f"Empty data in the document: {name}") + return construct_json_result(code=RetCode.SUCCESS, message=f"Empty data in the document: {name}") except Exception as e: return construct_error_response(e) + # ----------------------------start parsing documents----------------------------------------------------- @manager.route("//documents/status", methods=["POST"]) @login_required -@validate_request("status") def parse_documents(dataset_id): - run_value = request.json["status"] + doc_ids = request.json["doc_ids"] try: exist, _ = KnowledgebaseService.get_by_id(dataset_id) if not exist: return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset '{dataset_id}' cannot be found!") - # documents inside the dataset - docs, total = DocumentService.list_documents_in_dataset(dataset_id, 0, -1, "create_time", - True, "") - # for loop - for doc in docs: - id = doc["id"] - - tenant_id = DocumentService.get_tenant_id(id) - if not tenant_id: - return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) - - info = {"run": run_value, "progress": 0} - if run_value == TaskStatus.RUNNING.value: + + def process(doc_ids): + message = "" + # for loop + for id in doc_ids: + # Check whether there is this document + exist, document = DocumentService.get_by_id(id) + if not exist: + return construct_json_result(message=f"This document '{id}' cannot be found!", + code=RetCode.ARGUMENT_ERROR) + + tenant_id = DocumentService.get_tenant_id(id) + if not tenant_id: + return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) + + info = {"run": "1", "progress": 0} info["progress_msg"] = "" info["chunk_num"] = 0 
info["token_num"] = 0 - DocumentService.update_by_id(id, info) + DocumentService.update_by_id(id, info) + + ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) + + _, doc = DocumentService.get_by_id(id) + doc = doc.to_dict() + doc_id = doc["id"] + + bucket, name = File2DocumentService.get_minio_address(doc_id=doc_id) + binary = MINIO.get(bucket, name) + parser_id = doc["parser_id"] + if binary: + res = doc_parse(binary, name, parser_id, tenant_id) + if res is False: + message += f"The parser id: {parser_id} of the document {doc_id} is not supported; " + else: + message += f"Empty data in the document: {name}; " + # failed in parsing + if doc["status"] == TaskStatus.FAIL.value: + message += f"Failed in parsing the document: {doc_id}; " + return construct_json_result(data=True, code=RetCode.SUCCESS, message=message) + + # two conditions + if doc_ids: + return process(doc_ids) + else: + # documents inside the dataset + docs, total = DocumentService.list_documents_in_dataset(dataset_id, 0, -1, "create_time", + True, "") + doc_ids = [doc["id"] for doc in docs] + return process(doc_ids) - ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) - - _, doc = DocumentService.get_by_id(id) - doc = doc.to_dict() - - bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"]) - binary = MINIO.get(bucket, name) - parser_id = doc["parser_id"] - if binary: - if doc_chunk(binary, name, parser_id, tenant_id) is False: - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"Parser id: {parser_id} is not supported") - else: - return construct_json_result(code=RetCode.DATA_ERROR, message=f"Empty data in the document: {name}") - - return construct_json_result(data=True, code=RetCode.SUCCESS) except Exception as e: return construct_error_response(e) + # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py index 2e03750ee89..9eccc4cb3d4 100644 --- a/sdk/python/ragflow/ragflow.py +++ b/sdk/python/ragflow/ragflow.py @@ -146,15 +146,16 @@ def download_file(self, dataset_id, document_id): # ----------------------------start parsing----------------------------------------------------- def start_parsing_document(self, dataset_id, document_id): endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}/status" - res = requests.post(endpoint, headers=self.authorization_header, json={"status": "1"}) + res = requests.post(endpoint, headers=self.authorization_header) return res.json() - def start_parsing_documents(self, dataset_id): + def start_parsing_documents(self, dataset_id, doc_ids=None): endpoint = f"{self.dataset_url}/{dataset_id}/documents/status" - res = requests.post(endpoint, headers=self.authorization_header, json={"status": "1"}) + res = requests.post(endpoint, headers=self.authorization_header, json={"doc_ids": doc_ids}) return res.json() + # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index e0e2d227dbe..922afb6acdb 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -757,7 +757,23 @@ def 
test_start_parsing_an_empty_document(self): data = uploading_res["data"][0] doc_id = data["id"] res = ragflow.start_parsing_document(created_res_id, doc_id) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Empty data in the document: empty.txt" + assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt" + + # ------------------------parsing multiple documents---------------------------- + def test_start_parsing_documents_in_nonexistent_dataset(self): + """ + Test the parsing documents whose dataset is nonexistent. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_download_nonexistent_document") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # parse + res = ragflow.start_parsing_documents("imagination") + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!" def test_start_parsing_multiple_documents(self): # create a dataset @@ -768,7 +784,7 @@ def test_start_parsing_multiple_documents(self): file_paths = ["test_data/test.txt", "test_data/test1.txt"] ragflow.upload_local_file(created_res_id, file_paths) res = ragflow.start_parsing_documents(created_res_id) - assert res["code"] == RetCode.SUCCESS and res["data"] is True + assert res["code"] == RetCode.SUCCESS and res["data"] is True and res["message"] == "" def test_start_parsing_multiple_documents_with_one_empty_file(self): # create a dataset @@ -779,7 +795,23 @@ def test_start_parsing_multiple_documents_with_one_empty_file(self): file_paths = ["test_data/test.txt", "test_data/test1.txt", "test_data/empty.txt"] ragflow.upload_local_file(created_res_id, file_paths) res = ragflow.start_parsing_documents(created_res_id) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Empty data in the document: empty.txt" + assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt; " + + def test_start_parsing_multiple_specific_documents(self): + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"] + doc_ids = [] + for d in data: + doc_ids.append(d["id"]) + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" # ----------------------------stop parsing----------------------------------------------------- From 77ad427221aebf9eb1c2491a0e026d75a2d7a8c6 Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Thu, 11 Jul 2024 15:12:17 +0800 Subject: [PATCH 5/6] avoid duplicate code and added more tests for reparsing --- api/apps/dataset_api.py | 137 +++++++++++++--------------- sdk/python/test/test_data/lol.txt | 3 + sdk/python/test/test_document.py | 143 +++++++++++++++++++++++++++++- 3 files changed, 205 insertions(+), 78 deletions(-) create mode 100644 sdk/python/test/test_data/lol.txt diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 581d2e31f31..87ba1766d9a 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -233,17 +233,21 @@ def 
update_dataset(dataset_id): if chunk_num == 0: dataset_updating_data["embd_id"] = req["embedding_model_id"] else: - construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document in this " + return construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document in this " "dataset, so you cannot change the embedding " "model.") # only if chunk_num is 0, the user can update the chunk_method - if req.get("chunk_method"): - if chunk_num == 0: - dataset_updating_data['parser_id'] = req["chunk_method"] - else: + if "chunk_method" in req: + type_value = req["chunk_method"] + if is_illegal_value_for_enum(type_value, ParserType): + return construct_json_result(message=f"Illegal value {type_value} for 'chunk_method' field.", + code=RetCode.DATA_ERROR) + if chunk_num != 0: construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document " "in this dataset, so you cannot " "change the chunk method.") + dataset_updating_data["parser_id"] = req["template_type"] + # convert the photo parameter to avatar if req.get("photo"): dataset_updating_data["avatar"] = req["photo"] @@ -623,6 +627,7 @@ def doc_parse(binary, name, parser_id, tenant_id): case "manual": manual.chunk(name, binary=binary, callback=dummy) case "naive": + # It's the mode by default, which is general in the front-end naive.chunk(name, binary=binary, callback=dummy) case "one": one.chunk(name, binary=binary, callback=dummy) @@ -653,43 +658,14 @@ def parse_document(dataset_id, document_id): if not exist: return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset '{dataset_id}' cannot be found!") - # valid document? - exist, _ = DocumentService.get_by_id(document_id) - if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"This '{document_id}' is not a valid document.") - - # in case that it's null - tenant_id = DocumentService.get_tenant_id(document_id) - if not tenant_id: - return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) - - info = {"run": "1", "progress": 0} # initial progress of 0 / reset the progress - - info["progress_msg"] = "" - info["chunk_num"] = 0 - info["token_num"] = 0 - - DocumentService.update_by_id(document_id, info) # update information - - # delete it from es - ELASTICSEARCH.deleteByQuery(Q("match", doc_id=document_id), idxnm=search.index_name(tenant_id)) - - # delete the tasks from the cache - _, doc = DocumentService.get_by_id(document_id) # get doc object - doc = doc.to_dict() - - # renew - bucket, name = File2DocumentService.get_minio_address(doc_id=document_id) # address - binary = MINIO.get(bucket, name) # content - parser_id = doc["parser_id"] - if binary: - if doc_parse(binary, name, parser_id, tenant_id) is True: - return construct_json_result(data=True, code=RetCode.SUCCESS) + message = "" + res = get_message_during_parsing_document(document_id, message) + if isinstance(res, str): + message += res + return construct_json_result(code=RetCode.SUCCESS, message=message) + else: + return res - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"Parser id: {parser_id} is not supported") - return construct_json_result(code=RetCode.SUCCESS, message=f"Empty data in the document: {name}") except Exception as e: return construct_error_response(e) @@ -709,41 +685,11 @@ def process(doc_ids): message = "" # for loop for id in doc_ids: - # Check whether there is this document - exist, document = DocumentService.get_by_id(id) - if not exist: - 
return construct_json_result(message=f"This document '{id}' cannot be found!", - code=RetCode.ARGUMENT_ERROR) - - tenant_id = DocumentService.get_tenant_id(id) - if not tenant_id: - return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) - - info = {"run": "1", "progress": 0} - info["progress_msg"] = "" - info["chunk_num"] = 0 - info["token_num"] = 0 - - DocumentService.update_by_id(id, info) - - ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) - - _, doc = DocumentService.get_by_id(id) - doc = doc.to_dict() - doc_id = doc["id"] - - bucket, name = File2DocumentService.get_minio_address(doc_id=doc_id) - binary = MINIO.get(bucket, name) - parser_id = doc["parser_id"] - if binary: - res = doc_parse(binary, name, parser_id, tenant_id) - if res is False: - message += f"The parser id: {parser_id} of the document {doc_id} is not supported; " + res = get_message_during_parsing_document(id, message) + if isinstance(res, str): + message += res else: - message += f"Empty data in the document: {name}; " - # failed in parsing - if doc["status"] == TaskStatus.FAIL.value: - message += f"Failed in parsing the document: {doc_id}; " + return res return construct_json_result(data=True, code=RetCode.SUCCESS, message=message) # two conditions @@ -760,6 +706,47 @@ def process(doc_ids): return construct_error_response(e) +# helper method for getting message or response when parsing the document +def get_message_during_parsing_document(id, message): + try: + # Check whether there is this document + exist, document = DocumentService.get_by_id(id) + if not exist: + return construct_json_result(message=f"This document '{id}' cannot be found!", + code=RetCode.ARGUMENT_ERROR) + + tenant_id = DocumentService.get_tenant_id(id) + if not tenant_id: + return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) + + info = {"run": "1", "progress": 0} + info["progress_msg"] = "" + info["chunk_num"] = 0 + info["token_num"] = 0 + + DocumentService.update_by_id(id, info) + + ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) + + _, doc = DocumentService.get_by_id(id) + doc = doc.to_dict() + doc_id = doc["id"] + + bucket, name = File2DocumentService.get_minio_address(doc_id=doc_id) + binary = MINIO.get(bucket, name) + parser_id = doc["parser_id"] + if binary: + res = doc_parse(binary, name, parser_id, tenant_id) + if res is False: + message += f"The parser id: {parser_id} of the document {doc_id} is not supported; " + else: + message += f"Empty data in the document: {name}; " + # failed in parsing + if doc["status"] == TaskStatus.FAIL.value: + message += f"Failed in parsing the document: {doc_id}; " + return message + except Exception as e: + return construct_error_response(e) # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- diff --git a/sdk/python/test/test_data/lol.txt b/sdk/python/test/test_data/lol.txt new file mode 100644 index 00000000000..34883d23ace --- /dev/null +++ b/sdk/python/test/test_data/lol.txt @@ -0,0 +1,3 @@ +llll +ooooo +llll \ No newline at end of file diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index 922afb6acdb..38839d73a81 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -711,7 +711,7 @@ def 
test_start_parsing_document_with_success(self): doc_id = data["id"] # parse file res = ragflow.start_parsing_document(created_res_id, doc_id) - assert res["code"] == RetCode.SUCCESS and res["data"] is True + assert res["code"] == RetCode.SUCCESS and res["message"] == "" def test_start_parsing_nonexistent_document(self): """ @@ -722,7 +722,7 @@ def test_start_parsing_nonexistent_document(self): created_res = ragflow.create_dataset("test_start_parsing_nonexistent_document") created_res_id = created_res["data"]["dataset_id"] res = ragflow.start_parsing_document(created_res_id, "imagination") - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This 'imagination' is not a valid document." + assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == "This document 'imagination' cannot be found!" def test_start_parsing_document_in_nonexistent_dataset(self): """ @@ -757,7 +757,7 @@ def test_start_parsing_an_empty_document(self): data = uploading_res["data"][0] doc_id = data["id"] res = ragflow.start_parsing_document(created_res_id, doc_id) - assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt" + assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt; " # ------------------------parsing multiple documents---------------------------- def test_start_parsing_documents_in_nonexistent_dataset(self): @@ -776,6 +776,9 @@ def test_start_parsing_documents_in_nonexistent_dataset(self): assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!" def test_start_parsing_multiple_documents(self): + """ + Test the parsing documents with a success. + """ # create a dataset ragflow = RAGFlow(API_KEY, HOST_ADDRESS) created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") @@ -787,6 +790,9 @@ def test_start_parsing_multiple_documents(self): assert res["code"] == RetCode.SUCCESS and res["data"] is True and res["message"] == "" def test_start_parsing_multiple_documents_with_one_empty_file(self): + """ + Test the parsing documents, one of which is empty. + """ # create a dataset ragflow = RAGFlow(API_KEY, HOST_ADDRESS) created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") @@ -798,6 +804,50 @@ def test_start_parsing_multiple_documents_with_one_empty_file(self): assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt; " def test_start_parsing_multiple_specific_documents(self): + """ + Test the parsing documents whose document ids are specified. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"] + doc_ids = [] + for d in data: + doc_ids.append(d["id"]) + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + + def test_start_re_parsing_multiple_specific_documents(self): + """ + Test the re-parsing documents. 
+ """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"] + doc_ids = [] + for d in data: + doc_ids.append(d["id"]) + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + # re-parse + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + + def test_start_re_parsing_multiple_specific_documents_with_changing_parser_id(self): + """ + Test the re-parsing documents after changing the parser id. + """ # create a dataset ragflow = RAGFlow(API_KEY, HOST_ADDRESS) created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") @@ -812,7 +862,94 @@ def test_start_parsing_multiple_specific_documents(self): doc_ids.append(d["id"]) res = ragflow.start_parsing_documents(created_res_id, doc_ids) assert res["code"] == RetCode.SUCCESS and res["message"] == "" + # general -> laws + params = { + "template_type": "laws" + } + ragflow.update_file(created_res_id, doc_ids[0], **params) + # re-parse + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + def test_start_re_parsing_multiple_specific_documents_with_changing_illegal_parser_id(self): + """ + Test the re-parsing documents after changing an illegal parser id. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"] + doc_ids = [] + for d in data: + doc_ids.append(d["id"]) + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + # general -> illegal + params = { + "template_type": "illegal" + } + res = ragflow.update_file(created_res_id, doc_ids[0], **params) + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'template_type' field." + # re-parse + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + + def test_start_parsing_multiple_specific_documents_with_changing_illegal_parser_id(self): + """ + Test the parsing documents after changing an illegal parser id. 
+ """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") + created_res_id = created_res["data"]["dataset_id"] + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"] + doc_ids = [] + for d in data: + doc_ids.append(d["id"]) + # general -> illegal + params = { + "template_type": "illegal" + } + res = ragflow.update_file(created_res_id, doc_ids[0], **params) + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'template_type' field." + # re-parse + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" + + def test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal(self): + """ + Test the parsing documents whose dataset's parser id is illegal. + """ + # create a dataset + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal") + created_res_id = created_res["data"]["dataset_id"] + # update the parser id + params = { + "chunk_method": "illegal" + } + res = ragflow.update_dataset("test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal", **params) + assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'chunk_method' field." + # upload files + file_paths = ["test_data/test.txt", "test_data/test1.txt"] + uploading_res = ragflow.upload_local_file(created_res_id, file_paths) + # get the doc_id + data = uploading_res["data"] + doc_ids = [] + for d in data: + doc_ids.append(d["id"]) + # parse + res = ragflow.start_parsing_documents(created_res_id, doc_ids) + assert res["code"] == RetCode.SUCCESS and res["message"] == "" # ----------------------------stop parsing----------------------------------------------------- # ----------------------------show the status of the file----------------------------------------------------- From c5fde04eacc5207974be0dae8fe97bd7b445594f Mon Sep 17 00:00:00 2001 From: cecilia-uu Date: Thu, 11 Jul 2024 17:35:37 +0800 Subject: [PATCH 6/6] rename some variables --- api/apps/dataset_api.py | 46 ++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py index 87ba1766d9a..67201c46eb1 100644 --- a/api/apps/dataset_api.py +++ b/api/apps/dataset_api.py @@ -618,31 +618,31 @@ def dummy(prog=None, msg=""): pass -def doc_parse(binary, name, parser_id, tenant_id): - match parser_id: +def doc_parse(binary, doc_name, parser_name, tenant_id): + match parser_name: case "book": - book.chunk(name, binary=binary, callback=dummy) + book.chunk(doc_name, binary=binary, callback=dummy) case "laws": - laws.chunk(name, binary=binary, callback=dummy) + laws.chunk(doc_name, binary=binary, callback=dummy) case "manual": - manual.chunk(name, binary=binary, callback=dummy) + manual.chunk(doc_name, binary=binary, callback=dummy) case "naive": # It's the mode by default, which is general in the front-end - naive.chunk(name, binary=binary, callback=dummy) + naive.chunk(doc_name, binary=binary, callback=dummy) case "one": - one.chunk(name, binary=binary, callback=dummy) + one.chunk(doc_name, binary=binary, callback=dummy) case "paper": - paper.chunk(name, binary=binary, 
callback=dummy) + paper.chunk(doc_name, binary=binary, callback=dummy) case "picture": - picture.chunk(name, binary=binary, tenant_id=tenant_id, lang="Chinese", callback=dummy) + picture.chunk(doc_name, binary=binary, tenant_id=tenant_id, lang="Chinese", callback=dummy) case "presentation": - presentation.chunk(name, binary=binary, callback=dummy) + presentation.chunk(doc_name, binary=binary, callback=dummy) case "qa": - qa.chunk(name, binary=binary, callback=dummy) + qa.chunk(doc_name, binary=binary, callback=dummy) case "resume": - resume.chunk(name, binary=binary, callback=dummy) + resume.chunk(doc_name, binary=binary, callback=dummy) case "table": - table.chunk(name, binary=binary, callback=dummy) + table.chunk(doc_name, binary=binary, callback=dummy) case _: return False @@ -728,21 +728,21 @@ def get_message_during_parsing_document(id, message): ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) - _, doc = DocumentService.get_by_id(id) - doc = doc.to_dict() - doc_id = doc["id"] + _, doc_attributes = DocumentService.get_by_id(id) + doc_attributes = doc_attributes.to_dict() + doc_id = doc_attributes["id"] - bucket, name = File2DocumentService.get_minio_address(doc_id=doc_id) - binary = MINIO.get(bucket, name) - parser_id = doc["parser_id"] + bucket, doc_name = File2DocumentService.get_minio_address(doc_id=doc_id) + binary = MINIO.get(bucket, doc_name) + parser_name = doc_attributes["parser_id"] if binary: - res = doc_parse(binary, name, parser_id, tenant_id) + res = doc_parse(binary, doc_name, parser_name, tenant_id) if res is False: - message += f"The parser id: {parser_id} of the document {doc_id} is not supported; " + message += f"The parser id: {parser_name} of the document {doc_id} is not supported; " else: - message += f"Empty data in the document: {name}; " + message += f"Empty data in the document: {doc_name}; " # failed in parsing - if doc["status"] == TaskStatus.FAIL.value: + if doc_attributes["status"] == TaskStatus.FAIL.value: message += f"Failed in parsing the document: {doc_id}; " return message except Exception as e:
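The match statement in doc_parse repeats the same chunk(doc_name, binary=binary, callback=dummy) call for every parser, so each newly supported parser means another case branch. A module-level dispatch table is an equivalent, more compact shape. The sketch below is a hypothetical rewrite, not part of the patch; it assumes only the rag.app modules this file already imports and the dummy callback defined above, and keeps picture as the one special case because it also needs tenant_id and a language hint.

# Hypothetical dispatch-table variant of doc_parse; behaviour mirrors the
# match statement in the patch.
PARSER_MODULES = {
    "book": book, "laws": laws, "manual": manual, "naive": naive,
    "one": one, "paper": paper, "presentation": presentation,
    "qa": qa, "resume": resume, "table": table,
}

def doc_parse(binary, doc_name, parser_name, tenant_id):
    if parser_name == "picture":
        # The picture parser is the only one that needs the tenant id and a language.
        picture.chunk(doc_name, binary=binary, tenant_id=tenant_id,
                      lang="Chinese", callback=dummy)
        return True
    module = PARSER_MODULES.get(parser_name)
    if module is None:
        return False  # unsupported parser id, same as the final case branch
    module.chunk(doc_name, binary=binary, callback=dummy)
    return True

Supporting an additional parser then becomes a one-line addition to the table instead of a new case branch.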