Skip to content

Commit

Permalink
added multi-keyword chunker
Browse files Browse the repository at this point in the history
  • Loading branch information
emcf committed Sep 9, 2024
1 parent ccfe50a commit 586675b
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 6 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def read_git_requirements(file):

setup(
name='thepipe_api',
version='1.3.4',
version='1.3.7',
author='Emmett McFarlane',
author_email='emmett@thepi.pe',
description='AI-native extractor, powered by multimodal LLMs.',
Expand Down
23 changes: 23 additions & 0 deletions thepipe/chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,4 +99,27 @@ def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers
seen_images.append(image)
new_chunks.append(Chunk(path=group_path, texts=group_texts, images=group_images))

return new_chunks

# starts a new chunk whenever a line contains any of the given keywords (case-insensitive)
def chunk_by_keywords(chunks: List[Chunk], keywords: List[str] = ['section']) -> List[Chunk]:
    """Regroup chunk text so a new chunk starts at every line containing a keyword.

    The texts of each input chunk are joined with newlines and scanned line by
    line. Whenever a line contains any keyword (case-insensitive substring
    match), the text accumulated so far is emitted as one chunk and a new group
    begins with that line.

    Args:
        chunks: Input chunks, processed in order. The input chunks and their
            ``images`` lists are never mutated.
        keywords: Substrings that mark the start of a new chunk. The default
            list is only read, never modified.

    Returns:
        The regrouped chunks. Each carries a single text entry (its lines, each
        terminated by a newline), the path of the input chunk in which the
        group started, and the images of every input chunk the group touched
        from its starting chunk onwards. Returns an empty list for empty input.
    """
    new_chunks = []
    if not chunks:
        # Nothing to regroup; also avoids indexing chunks[0] below.
        return new_chunks

    # Lowercase the keywords once instead of once per scanned line.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    current_lines = []
    current_images = []
    current_path = chunks[0].path

    for chunk in chunks:
        # Tolerate chunks whose texts/images are None rather than lists.
        chunk_images = chunk.images or []
        # True once the current group was (re)started inside this chunk and
        # therefore already holds a copy of this chunk's images.
        started_in_this_chunk = False

        for line in '\n'.join(chunk.texts or []).split('\n'):
            lowered_line = line.lower()
            if any(keyword in lowered_line for keyword in lowered_keywords):
                if current_lines:
                    # Flush under the path where the group began, not the path
                    # of the chunk in which the next keyword happened to occur.
                    new_chunks.append(Chunk(
                        path=current_path,
                        texts=['\n'.join(current_lines) + '\n'],
                        images=current_images,
                    ))
                    current_lines = []
                    # Copy so later extends never alias (and so never mutate or
                    # duplicate) the input chunk's own image list.
                    current_images = list(chunk_images)
                    current_path = chunk.path
                    started_in_this_chunk = True
            current_lines.append(line)

        if not started_in_this_chunk:
            # The group began in an earlier chunk (or at the very start), so it
            # has not yet received this chunk's images.
            current_images.extend(chunk_images)

    if current_lines:
        new_chunks.append(Chunk(
            path=current_path,
            texts=['\n'.join(current_lines) + '\n'],
            images=current_images,
        ))
    return new_chunks
8 changes: 4 additions & 4 deletions thepipe/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
import json
import re
from typing import List, Dict, Union, Optional, Tuple, Callable
from thepipe.core import HOST_URL, THEPIPE_API_KEY, Chunk, calculate_tokens
from thepipe.scraper import scrape_url, scrape_file
from thepipe.chunker import chunk_by_page
from .core import HOST_URL, THEPIPE_API_KEY, Chunk, calculate_tokens
from .scraper import scrape_url, scrape_file
from .chunker import chunk_by_page, chunk_by_document, chunk_by_section, chunk_semantic, chunk_by_keywords
import requests
import os
from openai import OpenAI
Expand Down Expand Up @@ -98,7 +98,7 @@ def extract_from_chunk(chunk: Chunk, chunk_index: int, schema: str, ai_model: st
if isinstance(llm_response_dict, dict):
response_dict.update(llm_response_dict)
elif isinstance(llm_response_dict, list):
response_dict["error"] = f"Expected a single JSON object but received a list: {llm_response_dict}"
response_dict["error"] = f"Expected a single JSON object but received a list: {llm_response_dict}. Try enabling multiple extractions."
else:
response_dict["error"] = f"Invalid JSON structure in LLM response: {llm_response_dict}"
else:
Expand Down
2 changes: 1 addition & 1 deletion thepipe/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import requests
import json
from .core import HOST_URL, THEPIPE_API_KEY, HOST_IMAGES, Chunk, make_image_url
from .chunker import chunk_by_page
from .chunker import chunk_by_page, chunk_by_document, chunk_by_section, chunk_semantic, chunk_by_keywords
import tempfile
import mimetypes
import dotenv
Expand Down

0 comments on commit 586675b

Please sign in to comment.