Skip to content

Commit

Permalink
Openai (#5)
Browse files Browse the repository at this point in the history
* create specific content subset for each repo document and calculate length of document content in tokens

* Migrate to openai embeddings

* Fix bug for repos written in languages that were often polyglot and for truncated links

* Fix up todos and tests

* Remove GPT4All dependency
  • Loading branch information
DaveParr authored Mar 4, 2024
1 parent 8ef7322 commit 1720974
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 68 deletions.
22 changes: 1 addition & 21 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ starpilot = "starpilot.main:app"
python = "^3.10"
typer = { extras = ["all"], version = "^0.9.0" }
langchain = "^0.1"
gpt4all = "^2.0.2"
chromadb = "0.4.15"
unstructured = "^0.10.29"
markdown = "^3.5.1"
Expand All @@ -32,6 +31,7 @@ gql = { extras = ["all"], version = "^3.5.0" }
langchain-community = "^0.0.24"
langchain-openai = "^0.0.7"
langchain-core = "^0.1.27"
tiktoken = "^0.6.0"

[tool.poetry.group.dev.dependencies]
black = "^23.10.1"
Expand Down
55 changes: 36 additions & 19 deletions starpilot/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.retrievers.self_query.chroma import ChromaTranslator
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from rich import print
from typing_extensions import Optional

Expand All @@ -30,9 +30,7 @@
except Exception: # Graceful fallback if IceCream isn't installed.
ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa

structlog.configure(
wrapper_class=structlog.make_filtering_bound_logger(logging.WARNING)
)
structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(logging.INFO))
logger = structlog.get_logger()


Expand Down Expand Up @@ -103,13 +101,14 @@ def setup(
@app.command()
def read(
user: str,
k: Optional[int] = typer.Option(500, help="Number of repositories to load"),
k: Optional[int] = typer.Option(900, help="Number of repositories to load"),
) -> None:
"""
Read stars from GitHub
"""

GITHUB_API_KEY = os.environ["GITHUB_API_KEY"]
embedding_function = OpenAIEmbeddings(model="text-embedding-3-large")

repos = utils.get_user_starred_repos(
username=user,
Expand All @@ -120,7 +119,12 @@ def read(
for repo in repos:
formatted_repos.append(utils.format_repo(repo))

utils.save_repo_contents_to_disk(repo_contents=formatted_repos)
# order by stars and pick top k
top_k_formatted_repos = sorted(
formatted_repos, key=lambda x: x["stargazerCount"], reverse=True
)[:k]

utils.save_repo_contents_to_disk(repo_contents=top_k_formatted_repos)

vectorstore_path = "./vectorstore-chroma"

Expand All @@ -135,7 +139,7 @@ def read(

Chroma.from_documents(
documents=repo_documents,
embedding=GPT4AllEmbeddings(client=None),
embedding=embedding_function,
persist_directory=vectorstore_path,
)

Expand Down Expand Up @@ -212,6 +216,11 @@ def astrologer(
description="the url of a repository on GitHub",
type="string",
),
AttributeInfo(
name="stargazerCount",
description="the number of stars a repository has on GitHub",
type="number",
),
]

document_content_description = "content describing a repository on GitHub"
Expand All @@ -233,26 +242,33 @@ def astrologer(
"Python machine learning repos",
{
"query": "machine learning",
"filter": 'eq("languages", "python")',
"filter": 'eq("primaryLanguage", "Python")',
},
),
(
"Rust Dataframe crates",
{"query": "data frame", "filter": 'eq("languages", "rust")'},
{"query": "data frame", "filter": 'eq("primaryLanguage", "Rust")'},
),
(
"What R packages do time series analysis",
{"query": "time series", "filter": 'eq("languages", "R")'},
{"query": "time series", "filter": 'eq("primaryLanguage", "R")'},
),
(
"data frame packages with 100 stars or more",
{
"query": "data frame",
"filter": 'gte("stargazerCount", 100)',
},
),
],
allowed_comparators=[
Comparator.EQ,
Comparator.NE,
Comparator.GT,
Comparator.GTE,
Comparator.LT,
Comparator.LTE,
],
# allowed_comparators=[
# Comparator.EQ,
# Comparator.NE,
# Comparator.GT,
# Comparator.GTE,
# Comparator.LT,
# Comparator.LTE,
# ],
)

output_parser = StructuredQueryOutputParser.from_components()
Expand All @@ -261,14 +277,15 @@ def astrologer(

vectorstore = Chroma(
persist_directory=VECTORSTORE_PATH,
embedding_function=GPT4AllEmbeddings(client=None),
embedding_function=OpenAIEmbeddings(model="text-embedding-3-large"),
)

retriever = SelfQueryRetriever(
query_constructor=query_constructor, # type: ignore because it's documented as a pattern https://python.langchain.com/docs/modules/data_connection/retrievers/self_query#constructing-from-scratch-with-lcel:~:text=The%20next%20key,Integrations%20section.
vectorstore=vectorstore,
structured_query_translator=ChromaTranslator(),
search_kwargs={"k": k},
verbose=True,
)

results = retriever.invoke(query)
Expand Down
76 changes: 63 additions & 13 deletions starpilot/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
from langchain.schema.document import Document
from langchain.vectorstores.utils import filter_complex_metadata
from langchain_community.document_loaders import JSONLoader
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings
from rich.progress import track
from rich.table import Table

Expand Down Expand Up @@ -174,8 +174,6 @@ def _get_next_cursor(edges: List[Dict]) -> Optional[str]:
user=username, github_api_key=github_api_key, after_cursor=after_cursor
)

ic(result)

all_results.append(result)

if after_cursor is None:
Expand Down Expand Up @@ -209,6 +207,25 @@ def format_repo(repo: Dict) -> Dict:
"topics": [
topic["topic"]["name"] for topic in repo["repositoryTopics"]["nodes"]
],
# join name, description, topics if they are not none
"content": " ".join(
filter(
None,
[
repo["name"],
repo["description"],
" ".join(
[
topic["topic"]["name"]
for topic in repo["repositoryTopics"]["nodes"]
]
),
repo["primaryLanguage"]["name"]
if repo["primaryLanguage"]
else None,
],
)
),
}

# remove keys with None, empty values, or empty strings
Expand Down Expand Up @@ -248,13 +265,18 @@ def prepare_documents(
"""
Prepare the documents for ingestion into the vectorstore
"""
import tiktoken

file_paths = []
for file in os.listdir(repo_contents_dir):
file_paths.append(os.path.join(repo_contents_dir, file))

def _metadata_func(record: dict, metadata: dict) -> dict:
metadata["url"] = record.get("url")
metadata["name"] = record.get("name")
metadata["stargazerCount"] = record["stargazerCount"]
if (primary_language := record.get("primaryLanguage")) is not None:
metadata["primaryLanguage"] = primary_language
if (description := record.get("description")) is not None:
metadata["description"] = description
if (topics := record.get("topics")) is not None:
Expand All @@ -275,21 +297,41 @@ def _metadata_func(record: dict, metadata: dict) -> dict:

return metadata

# /home/dave/.cache/pypoetry/virtualenvs/starpilot-OKleAcjU-py3.10/lib/python3.10/site-packages/langchain/vectorstores/chroma.py:309 in add_texts
# ValueError: Expected metadata value to be a str, int, float or bool, got None which is a <class 'NoneType'>

# Try filtering complex metadata from the document using langchain.vectorstores.utils.filter_complex_metadata.
documents = []
for file_path in track(file_paths, description="Loading documents..."):
logger.debug("Loading document", file=file_path)
loader = JSONLoader(
file_path,
jq_schema=".",
content_key="content",
metadata_func=_metadata_func,
text_content=False,
)
if (loaded := loader.load())[0].page_content != "":
documents.extend(loaded)
if (loaded_document := loader.load())[0].page_content != "":
documents.extend(loaded_document)

def _num_tokens_from_string(string: str, encoding_name: str) -> int:
"""Returns the number of tokens in a text string."""
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
return num_tokens

# calculate the sum total tokens for the content of each document

token_lengths = []
for document in documents:
token_lengths.append(
_num_tokens_from_string(document.page_content, "cl100k_base")
)

price_per_million_tokens = 0.13

logger.info(
"Token lengths",
total_tokens=sum(token_lengths),
mean_tokens=sum(token_lengths) / len(token_lengths),
total_cost=sum(token_lengths) * price_per_million_tokens / 1e6,
)

documents = filter_complex_metadata(documents)

Expand All @@ -316,7 +358,7 @@ def create_retriever(
"""
return Chroma(
persist_directory=vectorstore_path,
embedding_function=GPT4AllEmbeddings(), # type:ignore # Tried to find a way to suppress the model card from being printed, failed: https://github.com/langchain-ai/langchain/discussions/13663 # type: ignore
embedding_function=OpenAIEmbeddings(model="text-embedding-3-large"), # type:ignore # Tried to find a way to suppress the model card from being printed, failed: https://github.com/langchain-ai/langchain/discussions/13663 # type: ignore
).as_retriever(
search_type=method,
search_kwargs={
Expand All @@ -333,17 +375,25 @@ def create_results_table(response: List[Document]) -> Table:

table.add_column("Repo")
table.add_column("Description")
table.add_column("URL")
table.add_column(
"URL", no_wrap=True
) # `no_wrap` is so the link is always clickable, truncated text in this column truncates the link
table.add_column("Topic")
table.add_column("Language")
table.add_column("Primary Language")
table.add_column("Languages")
table.add_column("Star Count")

for source_document in response:
table.add_row(
source_document.metadata.get("name"),
source_document.metadata.get(
"name"
), # TODO: make this text a link to the repo
source_document.metadata.get("description"),
source_document.metadata.get("url"),
source_document.metadata.get("topics"),
source_document.metadata.get("primaryLanguage"),
source_document.metadata.get("languages"),
str(source_document.metadata.get("stargazerCount")),
)

return table
4 changes: 2 additions & 2 deletions tests/test_data/The-Open-Book.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"name": "The-Open-Book",
"nameWithOwner": "joeycastillo/The-Open-Book",
"url": "https://github.com/joeycastillo/The-Open-Book",
"stargazerCount": 7265,
"stargazerCount": 7312,
"owner": "joeycastillo",
"readme": "# The Open Book: Project Reboot\n\n**A note from the project creator:** The Open Book was my first real electronics project; the earliest designs date to late 2019. I have learned a lot since those early days, and as such, here three years later, I am hitting reset. At this time the repository contains a version of the Open Book designed around the Raspberry Pi Pico. It's optimized for low part count and easy hand assembly, but it runs on AAA batteries and may not be as svelte as some folks might prefer. At a later date, I hope to design a followup with built-in LiPo charging and a slimmer profile, but at this time, this version of the book is the quickest path to getting hardware in people's hands so that we can start hacking on firmware together.\n\nFor details on how to build, test and use Open Book, I've [made documentation available here](https://www.oddlyspecificobjects.com/projects/openbook/).\n\nThe original Open Book repository has been archived in a branch called [original](https://github.com/joeycastillo/The-Open-Book/tree/original).\n\n[The Open Book firmware, called libros, is under development here](https://github.com/joeycastillo/libros). It's a goddamn mess in some ways, and in dire need of some documentation, but for the moment it does do the job of presenting a list of books stored on an SD card, and letting you read them. The canonical format for books is plain text with the book title on the first line, OR plain text plus some front matter and some ASCII control codes for chapter breaks and formatting, [as documented here](https://www.oddlyspecificobjects.com/projects/openbook/#advanced-text-formatting).\n\n## Original Introduction\n\nAs a society, we need an open source device for reading. 
Books are among the most important documents of our culture, yet the most popular and widespread devices we have for reading \u2014 the Kobo, the Nook, the Kindle and even the iPad \u2014 are closed devices, operating as small moving parts in a set of giant closed platforms whose owners' interests are not always aligned with readers'.\n\nThe Open Book aims to be a simple device that anyone with a soldering iron can build for themselves. The Open Book should be comprehensible: the reader should be able to look at it and understand, at least in broad strokes, how it works. It should be extensible, so that a reader with different needs can write code and add accessories that make the book work for them. It should be global, supporting readers of books in all the languages of the world. Most of all, it should be open, so that anyone can take this design as a starting point and use it to build a better book.\n\n## State of the Book\n\nAt this time, the Pi Pico book is in decent shape if you want to try your hand at building it yourself. You will need to have two custom things fabricated: the **Open Book Main Board** (which you can get as a bare PCB) and the **Castellated E-Paper Driver** module (which you'll want to have done as a PCBA job). All the files you will need to send out for this can be found in the **Fabrication Files** folder in the project root:\n\n* Upload `OSO-BOOK-C1-04-rounded.zip` to your PCB fabrication house of choice. It is designed to be a two-layer, 1 mm thick PCB, and you can use either an ENIG or lead-free HASL finish.\n* If you plan to use JLCPCB's economic PCBA service, upload all three files in `OSO-BOOK-C2-01` to JLCPCB. Opt for a 1 mm thick lead-free HASL finish. Note that the board is slightly wider than it needs to be, just to meet the minimum size requirements for this service.\n* If you plan to use PCBWay's PCBA service, upload all three files in `OSO-BOOK-C2-02` to PCBWay. Once again, opt for a 1 mm thick lead-free HASL finish. 
\n\nOther Parts: \n\n* Two of these [side mount buttons](https://www.digikey.com/en/products/detail/w\u00fcrth-elektronik/434351045816/5209090)\n* One of these [side-mount switches](https://www.digikey.com/en/products/detail/c-k/JS102011SAQN/1640095)\n* One [MEM2075 MicroSD card slot](https://www.digikey.com/en/products/detail/gct/MEM2075-00-140-01-A/9859614)\n* One [GD25Q16C Flash chip](https://www.digikey.com/en/products/detail/gigadevice-semiconductor-hk-limited/GD25Q16CTIGR/9484675) with SOIC / SOP8 footprint.\n* One [Keystone 1022 dual AAA battery holder](https://www.digikey.com/en/products/detail/keystone-electronics/1022/2137859) (you can get clones on Aliexpress for cheap)\n* Two P-channel MOSFETS with SOT23 footprint (I use the DMG3415)\n* Two 10k\u03a9 resistors with 1206 footprint.\n* Two 10\u00b5F capacitors with 1206 footprint (rated voltage >=6.3V).\n* One 1\u00b5F capacitor with 0805 footprint (rated voltage >=6.3V).\n* Seven through-hole slim tactile buttons (3mm by 6mm; [TL1107 type](https://www.digikey.com/en/products/detail/e-switch/TL1107AF130WQ/378976))\n* One [GDEW042T2 grayscale e-paper display](https://buy-lcd.com/products/42inch-e-inkanel-spi-interface-buy-eaper-display). (Don't get the tri-color version; it'll end in heartbreak)\n* And finally, one [Raspberry Pi Pico](https://www.digikey.com/en/products/detail/raspberry-pi/SC0915/13624793) board\n\nThe Open Book is open source hardware: you should feel free to build one yourself, order parts for ten and do a workshop at your local maker space, or even buy parts for fifty and sell them as kits. \n\nPlease steal this book.\n\nI plan to add more documentation in the new year, but until then, [this half-hour video walks through building one Open Book board in real-time](https://twitter.com/i/broadcasts/1OyKAVPjrvaGb).\n\n### Forking and tweaking the boards\n\n* Design files for the Open Book main board can be found in the `OSO-BOOK-C1` folder. 
It's a KiCad project.\n* Design files for the castellated e-paper driver module can be found in the `OSO-BOOK-C2` folder. Alas, they are Eagle projects that predate my move to KiCad. There are two versions: an older version that was successfully fabricated with JLCPCB's economic PCBA service (`OSO-BOOK-C2-01`), and a newer version successfully fabricated using PCBWay's PCBA service (`OSO-BOOK-C2-02`). Both work great.\n\n## License\n\n Different components of the project are licensed differently, see [LICENSE.md](https://github.com/joeycastillo/The-Open-Book/blob/main/LICENSE.md).\n"
"content": "The-Open-Book"
}
Loading

0 comments on commit 1720974

Please sign in to comment.