Skip to content

Commit

Permalink
Openai (#5)
Browse files Browse the repository at this point in the history
* create specific content subset for each repo document and calculate length of document content in tokens

* Migrate to openai embeddings

* Fix bug for repos written in languages that were often polyglot and for truncated links

* Fix up todos and tests

* Remove GPT4All dependency
  • Loading branch information
DaveParr authored Mar 4, 2024
1 parent 8ef7322 commit 1720974
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 68 deletions.
22 changes: 1 addition & 21 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ starpilot = "starpilot.main:app"
python = "^3.10"
typer = { extras = ["all"], version = "^0.9.0" }
langchain = "^0.1"
gpt4all = "^2.0.2"
chromadb = "0.4.15"
unstructured = "^0.10.29"
markdown = "^3.5.1"
Expand All @@ -32,6 +31,7 @@ gql = { extras = ["all"], version = "^3.5.0" }
langchain-community = "^0.0.24"
langchain-openai = "^0.0.7"
langchain-core = "^0.1.27"
tiktoken = "^0.6.0"

[tool.poetry.group.dev.dependencies]
black = "^23.10.1"
Expand Down
55 changes: 36 additions & 19 deletions starpilot/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.retrievers.self_query.chroma import ChromaTranslator
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from rich import print
from typing_extensions import Optional

Expand All @@ -30,9 +30,7 @@
except Exception: # Graceful fallback if IceCream isn't installed.
ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa

structlog.configure(
wrapper_class=structlog.make_filtering_bound_logger(logging.WARNING)
)
structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(logging.INFO))
logger = structlog.get_logger()


Expand Down Expand Up @@ -103,13 +101,14 @@ def setup(
@app.command()
def read(
user: str,
k: Optional[int] = typer.Option(500, help="Number of repositories to load"),
k: Optional[int] = typer.Option(900, help="Number of repositories to load"),
) -> None:
"""
Read stars from GitHub
"""

GITHUB_API_KEY = os.environ["GITHUB_API_KEY"]
embedding_function = OpenAIEmbeddings(model="text-embedding-3-large")

repos = utils.get_user_starred_repos(
username=user,
Expand All @@ -120,7 +119,12 @@ def read(
for repo in repos:
formatted_repos.append(utils.format_repo(repo))

utils.save_repo_contents_to_disk(repo_contents=formatted_repos)
# order by stars and pick top k
top_k_formatted_repos = sorted(
formatted_repos, key=lambda x: x["stargazerCount"], reverse=True
)[:k]

utils.save_repo_contents_to_disk(repo_contents=top_k_formatted_repos)

vectorstore_path = "./vectorstore-chroma"

Expand All @@ -135,7 +139,7 @@ def read(

Chroma.from_documents(
documents=repo_documents,
embedding=GPT4AllEmbeddings(client=None),
embedding=embedding_function,
persist_directory=vectorstore_path,
)

Expand Down Expand Up @@ -212,6 +216,11 @@ def astrologer(
description="the url of a repository on GitHub",
type="string",
),
AttributeInfo(
name="stargazerCount",
description="the number of stars a repository has on GitHub",
type="number",
),
]

document_content_description = "content describing a repository on GitHub"
Expand All @@ -233,26 +242,33 @@ def astrologer(
"Python machine learning repos",
{
"query": "machine learning",
"filter": 'eq("languages", "python")',
"filter": 'eq("primaryLanguage", "Python")',
},
),
(
"Rust Dataframe crates",
{"query": "data frame", "filter": 'eq("languages", "rust")'},
{"query": "data frame", "filter": 'eq("primaryLanguage", "Rust")'},
),
(
"What R packages do time series analysis",
{"query": "time series", "filter": 'eq("languages", "R")'},
{"query": "time series", "filter": 'eq("primaryLanguage", "R")'},
),
(
"data frame packages with 100 stars or more",
{
"query": "data frame",
"filter": 'gte("stargazerCount", 100)',
},
),
],
allowed_comparators=[
Comparator.EQ,
Comparator.NE,
Comparator.GT,
Comparator.GTE,
Comparator.LT,
Comparator.LTE,
],
# allowed_comparators=[
# Comparator.EQ,
# Comparator.NE,
# Comparator.GT,
# Comparator.GTE,
# Comparator.LT,
# Comparator.LTE,
# ],
)

output_parser = StructuredQueryOutputParser.from_components()
Expand All @@ -261,14 +277,15 @@ def astrologer(

vectorstore = Chroma(
persist_directory=VECTORSTORE_PATH,
embedding_function=GPT4AllEmbeddings(client=None),
embedding_function=OpenAIEmbeddings(model="text-embedding-3-large"),
)

retriever = SelfQueryRetriever(
query_constructor=query_constructor, # type: ignore because it's documented as a pattern https://python.langchain.com/docs/modules/data_connection/retrievers/self_query#constructing-from-scratch-with-lcel:~:text=The%20next%20key,Integrations%20section.
vectorstore=vectorstore,
structured_query_translator=ChromaTranslator(),
search_kwargs={"k": k},
verbose=True,
)

results = retriever.invoke(query)
Expand Down
76 changes: 63 additions & 13 deletions starpilot/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
from langchain.schema.document import Document
from langchain.vectorstores.utils import filter_complex_metadata
from langchain_community.document_loaders import JSONLoader
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings
from rich.progress import track
from rich.table import Table

Expand Down Expand Up @@ -174,8 +174,6 @@ def _get_next_cursor(edges: List[Dict]) -> Optional[str]:
user=username, github_api_key=github_api_key, after_cursor=after_cursor
)

ic(result)

all_results.append(result)

if after_cursor is None:
Expand Down Expand Up @@ -209,6 +207,25 @@ def format_repo(repo: Dict) -> Dict:
"topics": [
topic["topic"]["name"] for topic in repo["repositoryTopics"]["nodes"]
],
# join name, description, topics if they are not none
"content": " ".join(
filter(
None,
[
repo["name"],
repo["description"],
" ".join(
[
topic["topic"]["name"]
for topic in repo["repositoryTopics"]["nodes"]
]
),
repo["primaryLanguage"]["name"]
if repo["primaryLanguage"]
else None,
],
)
),
}

# remove keys with None, empty values, or empty strings
Expand Down Expand Up @@ -248,13 +265,18 @@ def prepare_documents(
"""
Prepare the documents for ingestion into the vectorstore
"""
import tiktoken

file_paths = []
for file in os.listdir(repo_contents_dir):
file_paths.append(os.path.join(repo_contents_dir, file))

def _metadata_func(record: dict, metadata: dict) -> dict:
metadata["url"] = record.get("url")
metadata["name"] = record.get("name")
metadata["stargazerCount"] = record["stargazerCount"]
if (primary_language := record.get("primaryLanguage")) is not None:
metadata["primaryLanguage"] = primary_language
if (description := record.get("description")) is not None:
metadata["description"] = description
if (topics := record.get("topics")) is not None:
Expand All @@ -275,21 +297,41 @@ def _metadata_func(record: dict, metadata: dict) -> dict:

return metadata

# /home/dave/.cache/pypoetry/virtualenvs/starpilot-OKleAcjU-py3.10/lib/python3.10/site-packages/langchain/vectorstores/chroma.py:309 in add_texts
# ValueError: Expected metadata value to be a str, int, float or bool, got None which is a <class 'NoneType'>

# Try filtering complex metadata from the document using langchain.vectorstores.utils.filter_complex_metadata.
documents = []
for file_path in track(file_paths, description="Loading documents..."):
logger.debug("Loading document", file=file_path)
loader = JSONLoader(
file_path,
jq_schema=".",
content_key="content",
metadata_func=_metadata_func,
text_content=False,
)
if (loaded := loader.load())[0].page_content != "":
documents.extend(loaded)
if (loaded_document := loader.load())[0].page_content != "":
documents.extend(loaded_document)

def _num_tokens_from_string(string: str, encoding_name: str) -> int:
"""Returns the number of tokens in a text string."""
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
return num_tokens

# calculate the sum total tokens for the content of each document

token_lengths = []
for document in documents:
token_lengths.append(
_num_tokens_from_string(document.page_content, "cl100k_base")
)

price_per_million_tokens = 0.13

logger.info(
"Token lengths",
total_tokens=sum(token_lengths),
mean_tokens=sum(token_lengths) / len(token_lengths),
total_cost=sum(token_lengths) * price_per_million_tokens / 1e6,
)

documents = filter_complex_metadata(documents)

Expand All @@ -316,7 +358,7 @@ def create_retriever(
"""
return Chroma(
persist_directory=vectorstore_path,
embedding_function=GPT4AllEmbeddings(), # type:ignore # Tried to find a way to suppress the model card from being printed, failed: https://github.com/langchain-ai/langchain/discussions/13663 # type: ignore
embedding_function=OpenAIEmbeddings(model="text-embedding-3-large"), # type:ignore # Tried to find a way to suppress the model card from being printed, failed: https://github.com/langchain-ai/langchain/discussions/13663 # type: ignore
).as_retriever(
search_type=method,
search_kwargs={
Expand All @@ -333,17 +375,25 @@ def create_results_table(response: List[Document]) -> Table:

table.add_column("Repo")
table.add_column("Description")
table.add_column("URL")
table.add_column(
"URL", no_wrap=True
) # `no_wrap` is so the link is always clickable, truncated text in this column truncates the link
table.add_column("Topic")
table.add_column("Language")
table.add_column("Primary Language")
table.add_column("Languages")
table.add_column("Star Count")

for source_document in response:
table.add_row(
source_document.metadata.get("name"),
source_document.metadata.get(
"name"
), # TODO: make this text a link to the repo
source_document.metadata.get("description"),
source_document.metadata.get("url"),
source_document.metadata.get("topics"),
source_document.metadata.get("primaryLanguage"),
source_document.metadata.get("languages"),
str(source_document.metadata.get("stargazerCount")),
)

return table
4 changes: 2 additions & 2 deletions tests/test_data/The-Open-Book.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"name": "The-Open-Book",
"nameWithOwner": "joeycastillo/The-Open-Book",
"url": "https://github.com/joeycastillo/The-Open-Book",
"stargazerCount": 7265,
"stargazerCount": 7312,
"owner": "joeycastillo",
"readme": "# The Open Book: Project Reboot\n\n**A note from the project creator:** The Open Book was my first real electronics project; the earliest designs date to late 2019. I have learned a lot since those early days, and as such, here three years later, I am hitting reset. At this time the repository contains a version of the Open Book designed around the Raspberry Pi Pico. It's optimized for low part count and easy hand assembly, but it runs on AAA batteries and may not be as svelte as some folks might prefer. At a later date, I hope to design a followup with built-in LiPo charging and a slimmer profile, but at this time, this version of the book is the quickest path to getting hardware in people's hands so that we can start hacking on firmware together.\n\nFor details on how to build, test and use Open Book, I've [made documentation available here](https://www.oddlyspecificobjects.com/projects/openbook/).\n\nThe original Open Book repository has been archived in a branch called [original](https://github.com/joeycastillo/The-Open-Book/tree/original).\n\n[The Open Book firmware, called libros, is under development here](https://github.com/joeycastillo/libros). It's a goddamn mess in some ways, and in dire need of some documentation, but for the moment it does do the job of presenting a list of books stored on an SD card, and letting you read them. The canonical format for books is plain text with the book title on the first line, OR plain text plus some front matter and some ASCII control codes for chapter breaks and formatting, [as documented here](https://www.oddlyspecificobjects.com/projects/openbook/#advanced-text-formatting).\n\n## Original Introduction\n\nAs a society, we need an open source device for reading. 
Books are among the most important documents of our culture, yet the most popular and widespread devices we have for reading \u2014 the Kobo, the Nook, the Kindle and even the iPad \u2014 are closed devices, operating as small moving parts in a set of giant closed platforms whose owners' interests are not always aligned with readers'.\n\nThe Open Book aims to be a simple device that anyone with a soldering iron can build for themselves. The Open Book should be comprehensible: the reader should be able to look at it and understand, at least in broad strokes, how it works. It should be extensible, so that a reader with different needs can write code and add accessories that make the book work for them. It should be global, supporting readers of books in all the languages of the world. Most of all, it should be open, so that anyone can take this design as a starting point and use it to build a better book.\n\n## State of the Book\n\nAt this time, the Pi Pico book is in decent shape if you want to try your hand at building it yourself. You will need to have two custom things fabricated: the **Open Book Main Board** (which you can get as a bare PCB) and the **Castellated E-Paper Driver** module (which you'll want to have done as a PCBA job). All the files you will need to send out for this can be found in the **Fabrication Files** folder in the project root:\n\n* Upload `OSO-BOOK-C1-04-rounded.zip` to your PCB fabrication house of choice. It is designed to be a two-layer, 1 mm thick PCB, and you can use either an ENIG or lead-free HASL finish.\n* If you plan to use JLCPCB's economic PCBA service, upload all three files in `OSO-BOOK-C2-01` to JLCPCB. Opt for a 1 mm thick lead-free HASL finish. Note that the board is slightly wider than it needs to be, just to meet the minimum size requirements for this service.\n* If you plan to use PCBWay's PCBA service, upload all three files in `OSO-BOOK-C2-02` to PCBWay. Once again, opt for a 1 mm thick lead-free HASL finish. 
\n\nOther Parts: \n\n* Two of these [side mount buttons](https://www.digikey.com/en/products/detail/w\u00fcrth-elektronik/434351045816/5209090)\n* One of these [side-mount switches](https://www.digikey.com/en/products/detail/c-k/JS102011SAQN/1640095)\n* One [MEM2075 MicroSD card slot](https://www.digikey.com/en/products/detail/gct/MEM2075-00-140-01-A/9859614)\n* One [GD25Q16C Flash chip](https://www.digikey.com/en/products/detail/gigadevice-semiconductor-hk-limited/GD25Q16CTIGR/9484675) with SOIC / SOP8 footprint.\n* One [Keystone 1022 dual AAA battery holder](https://www.digikey.com/en/products/detail/keystone-electronics/1022/2137859) (you can get clones on Aliexpress for cheap)\n* Two P-channel MOSFETS with SOT23 footprint (I use the DMG3415)\n* Two 10k\u03a9 resistors with 1206 footprint.\n* Two 10\u00b5F capacitors with 1206 footprint (rated voltage >=6.3V).\n* One 1\u00b5F capacitor with 0805 footprint (rated voltage >=6.3V).\n* Seven through-hole slim tactile buttons (3mm by 6mm; [TL1107 type](https://www.digikey.com/en/products/detail/e-switch/TL1107AF130WQ/378976))\n* One [GDEW042T2 grayscale e-paper display](https://buy-lcd.com/products/42inch-e-inkanel-spi-interface-buy-eaper-display). (Don't get the tri-color version; it'll end in heartbreak)\n* And finally, one [Raspberry Pi Pico](https://www.digikey.com/en/products/detail/raspberry-pi/SC0915/13624793) board\n\nThe Open Book is open source hardware: you should feel free to build one yourself, order parts for ten and do a workshop at your local maker space, or even buy parts for fifty and sell them as kits. \n\nPlease steal this book.\n\nI plan to add more documentation in the new year, but until then, [this half-hour video walks through building one Open Book board in real-time](https://twitter.com/i/broadcasts/1OyKAVPjrvaGb).\n\n### Forking and tweaking the boards\n\n* Design files for the Open Book main board can be found in the `OSO-BOOK-C1` folder. 
It's a KiCad project.\n* Design files for the castellated e-paper driver module can be found in the `OSO-BOOK-C2` folder. Alas, they are Eagle projects that predate my move to KiCad. There are two versions: an older version that was successfully fabricated with JLCPCB's economic PCBA service (`OSO-BOOK-C2-01`), and a newer version successfully fabricated using PCBWay's PCBA service (`OSO-BOOK-C2-02`). Both work great.\n\n## License\n\n Different components of the project are licensed differently, see [LICENSE.md](https://github.com/joeycastillo/The-Open-Book/blob/main/LICENSE.md).\n"
"content": "The-Open-Book"
}
Loading

0 comments on commit 1720974

Please sign in to comment.