Skip to content

Commit

Permalink
add support for eml file parser (infiniflow#1768)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

add support for eml file parser
infiniflow#1363

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
  • Loading branch information
3 people authored Aug 6, 2024
1 parent 5f5a4af commit f9e1593
Show file tree
Hide file tree
Showing 12 changed files with 178 additions and 28 deletions.
4 changes: 3 additions & 1 deletion api/apps/dataset_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from api.utils.api_utils import construct_json_result, construct_error_response
from api.utils.api_utils import construct_result, validate_request
from api.utils.file_utils import filename_type, thumbnail
from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils.minio_conn import MINIO
Expand Down Expand Up @@ -652,6 +652,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
case "audio":
audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
case "email":
email.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
case _:
return False
Expand Down
1 change: 1 addition & 0 deletions api/db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ class ParserType(StrEnum):
PICTURE = "picture"
ONE = "one"
AUDIO = "audio"
EMAIL = "email"
KG = "knowledge_graph"


Expand Down
2 changes: 1 addition & 1 deletion api/db/init_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def init_llm_factory():
LLMService.filter_delete([LLMService.model.fid == "QAnything"])
TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
TenantService.filter_update([1 == 1], {
"parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph"})
"parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email"})
## insert openai two embedding models to the current openai user.
print("Start to insert 2 OpenAI embedding models...")
tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
Expand Down
2 changes: 1 addition & 1 deletion api/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@
API_KEY = LLM.get("api_key", "")
PARSERS = LLM.get(
"parsers",
"naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph")
"naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email")

# distribution
DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
Expand Down
2 changes: 1 addition & 1 deletion api/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def filename_type(filename):
return FileType.PDF.value

if re.match(
r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
return FileType.DOC.value

if re.match(
Expand Down
3 changes: 2 additions & 1 deletion deepdoc/parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@
from .ppt_parser import RAGFlowPptParser as PptParser
from .html_parser import RAGFlowHtmlParser as HtmlParser
from .json_parser import RAGFlowJsonParser as JsonParser
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
from .txt_parser import RAGFlowTxtParser as TxtParser
7 changes: 6 additions & 1 deletion deepdoc/parser/html_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,15 @@ def __call__(self, fnm, binary=None):
else:
with open(fnm, "r",encoding=get_encoding(fnm)) as f:
txt = f.read()
return self.parser_txt(txt)

@classmethod
def parser_txt(cls, txt):
if type(txt) != str:
raise TypeError("txt type should be str!")
html_doc = readability.Document(txt)
title = html_doc.title()
content = html_text.extract_text(html_doc.summary(html_partial=True))
txt = f'{title}\n{content}'
txt = f"{title}\n{content}"
sections = txt.split("\n")
return sections
42 changes: 42 additions & 0 deletions deepdoc/parser/txt_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from rag.nlp import find_codec,num_tokens_from_string

class RAGFlowTxtParser:
def __call__(self, fnm, binary=None, chunk_token_num=128):
txt = ""
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(fnm, "r") as f:
while True:
l = f.readline()
if not l:
break
txt += l
return self.parser_txt(txt, chunk_token_num)

@classmethod
def parser_txt(cls, txt, chunk_token_num=128):
if type(txt) != str:
raise TypeError("txt type should be str!")
sections = []
for sec in txt.split("\n"):
if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
sections.append((sec[: int(len(sec) / 2)], ""))
sections.append((sec[int(len(sec) / 2) :], ""))
else:
sections.append((sec, ""))
return sections
114 changes: 114 additions & 0 deletions rag/app/email.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from email import policy
from email.parser import BytesParser
from rag.app.naive import chunk as naive_chunk
import re
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
from deepdoc.parser import HtmlParser, TxtParser
from timeit import default_timer as timer
from rag.settings import cron_logger
import io


def chunk(
filename,
binary=None,
from_page=0,
to_page=100000,
lang="Chinese",
callback=None,
**kwargs,
):
"""
Only eml is supported
"""
eng = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
"parser_config",
{"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
)
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
main_res = []
attachment_res = []

if binary:
msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
else:
msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))

text_txt, html_txt = [], []
# get the email header info
for header, value in msg.items():
text_txt.append(f"{header}: {value}")

# get the email main info
def _add_content(msg, content_type):
if content_type == "text/plain":
text_txt.append(
msg.get_payload(decode=True).decode(msg.get_content_charset())
)
elif content_type == "text/html":
html_txt.append(
msg.get_payload(decode=True).decode(msg.get_content_charset())
)
elif "multipart" in content_type:
if msg.is_multipart():
for part in msg.iter_parts():
_add_content(part, part.get_content_type())

_add_content(msg, msg.get_content_type())

sections = TxtParser.parser_txt("\n".join(text_txt)) + [
(l, "") for l in HtmlParser.parser_txt("\n".join(html_txt)) if l
]

st = timer()
chunks = naive_merge(
sections,
int(parser_config.get("chunk_token_num", 128)),
parser_config.get("delimiter", "\n!?。;!?"),
)

main_res.extend(tokenize_chunks(chunks, doc, eng, None))
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
# get the attachment info
for part in msg.iter_attachments():
content_disposition = part.get("Content-Disposition")
if content_disposition:
dispositions = content_disposition.strip().split(";")
if dispositions[0].lower() == "attachment":
filename = part.get_filename()
payload = part.get_payload(decode=True)
try:
attachment_res.extend(
naive_chunk(filename, payload, callback=callback, **kwargs)
)
except Exception:
pass

return main_res + attachment_res


if __name__ == "__main__":
import sys

def dummy(prog=None, msg=""):
pass

chunk(sys.argv[1], callback=dummy)
23 changes: 3 additions & 20 deletions rag/app/naive.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import re
from deepdoc.parser.pdf_parser import PlainParser
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
from rag.settings import cron_logger
from rag.utils import num_tokens_from_string
from PIL import Image
Expand Down Expand Up @@ -170,6 +170,7 @@ def __call__(self, filename, binary=None):
return sections, tbls



def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Expand Down Expand Up @@ -222,25 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = ""
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(filename, "r") as f:
while True:
l = f.readline()
if not l:
break
txt += l
sections = []
for sec in txt.split("\n"):
if num_tokens_from_string(sec) > 10 * int(parser_config.get("chunk_token_num", 128)):
sections.append((sec[:int(len(sec)/2)], ""))
sections.append((sec[int(len(sec)/2):], ""))
else:
sections.append((sec, ""))

sections = TxtParser()(filename,binary,parser_config.get("chunk_token_num", 128))
callback(0.8, "Finish parsing.")

elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
Expand Down
3 changes: 2 additions & 1 deletion rag/svr/task_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
from io import BytesIO
import pandas as pd

from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email

from api.db import LLMType, ParserType
from api.db.services.document_service import DocumentService
Expand All @@ -69,6 +69,7 @@
ParserType.PICTURE.value: picture,
ParserType.ONE.value: one,
ParserType.AUDIO.value: audio,
ParserType.EMAIL.value: email,
ParserType.KG.value: knowledge_graph
}

Expand Down
3 changes: 2 additions & 1 deletion web/src/components/chunk-method-modal/hooks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ const ParserListMap = new Map([
'one',
'qa',
'manual',
'knowledge_graph',
'knowledge_graph'
],
],
[
Expand Down Expand Up @@ -67,6 +67,7 @@ const ParserListMap = new Map([
],
[['md'], ['naive', 'qa', 'knowledge_graph']],
[['json'], ['naive', 'knowledge_graph']],
[['eml'], ['email']]
]);

const getParserList = (
Expand Down

0 comments on commit f9e1593

Please sign in to comment.