From 56d73c1a3f511831ba40cbb7940591b67de05d59 Mon Sep 17 00:00:00 2001 From: Jerry Liu Date: Fri, 17 May 2024 09:22:16 -0700 Subject: [PATCH] llamaparse example over caltrain schedule (#185) --- examples/caltrain/caltrain_text_mode.ipynb | 529 +++++++++++++++++++++ 1 file changed, 529 insertions(+) create mode 100644 examples/caltrain/caltrain_text_mode.ipynb diff --git a/examples/caltrain/caltrain_text_mode.ipynb b/examples/caltrain/caltrain_text_mode.ipynb new file mode 100644 index 0000000..b860f95 --- /dev/null +++ b/examples/caltrain/caltrain_text_mode.ipynb @@ -0,0 +1,529 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c148b65e-e8a6-476e-86ba-bf6a73d479c7", + "metadata": {}, + "source": [ + "# RAG over the Caltrain Weekend Schedule \n", + "\n", + "\"Open\n", + "\n", + "This example shows off LlamaParse parsing capabilities to build a functioning query pipeline over the Caltrain weekend schedule, a big timetable containing all trains northbound and southbound and their stops in various cities.\n", + "\n", + "Naive parsing solutions mess up in representing this tabular representation, leading to LLM hallucinations. In contrast, LlamaParse text-mode spatially lays out the table in a neat format, enabling more sophisticated LLMs like gpt-4-turbo to understand the spacing and reason over all the numbers.\n", + "\n", + "**NOTE**: LlamaParse markdown mode doesn't quite work yet - it's in development!" + ] + }, + { + "cell_type": "markdown", + "id": "ef115dbe-b834-4639-828e-e2c11aef710b", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Download the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6ae2e38-30c9-4865-aa13-47780bc3848f", + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "335ce1d0-757a-4f09-846e-21c409768871", + "metadata": {}, + "outputs": [], + "source": [ + "!wget \"https://www.caltrain.com/media/31602/download?inline?inline\" -O caltrain_schedule_weekend.pdf" + ] + }, + { + "cell_type": "markdown", + "id": "45fa9120-65bb-4772-9db7-53e7cecf9adc", + "metadata": {}, + "source": [ + "## Initialize LlamaParse\n", + "\n", + "Initialize LlamaParse in `text` mode which will represent complex documents incl. text, tables, and figures as nicely formatted text." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54aa9579-84d4-49bc-ab54-5474e69c1188", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jerryliu/Programming/llama_parse/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Started parsing the file under job_id 5f73353a-1f4b-480d-9eea-58d1d22b75f6\n" + ] + } + ], + "source": [ + "from llama_parse import LlamaParse\n", + "\n", + "docs = LlamaParse(result_type=\"text\").load_data(\"./caltrain_schedule_weekend.pdf\")" + ] + }, + { + "cell_type": "markdown", + "id": "602756b2-9ea1-4519-a8e3-c773ec624205", + "metadata": {}, + "source": [ + "Take a look at the below text (and zoom out from the browser to really get the effect!). You'll see that the entire table is nicely laid out." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4928281a-591a-4653-b451-b2b8112a7101", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ZONE 2ZONE 3ZONE 4ZONE 4 ZONE 3ZONE 2ZONE 1ZONE 1\n", + " Printer-Friendly Caltrain Schedule\n", + " Northbound – WEEKEND SERVICE to SAN FRANCISCO 2XX Local\n", + "\n", + "\n", + " Train No. 221 225 229 233 237 241 245 249 253 257 261 265 269 273 *277 *281\n", + " Service Types L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2\n", + " Tamien 7:12a 9:05a 10:05a 11:05a 1:05p 3:05p 5:05p 7:05p 9:05p 11:05p\n", + " San Jose Diridon 7:19a 9:12a 10:12a 11:12a 12:12p 1:12p 2:12p 3:12p 4:12p 5:12p 6:12p 7:12p 8:12p 9:12p 10:19p 11:12p\n", + " Santa Clara 7:25a 9:18a 10:18a 11:18a 12:18p 1:18p 2:18p 3:18p 4:18p 5:18p 6:18p 7:18p 8:18p 9:18p 10:25p 11:18p\n", + " Lawrence 7:31a 9:24a 10:24a 11:24a 12:24p 1:24p 2:24p 3:24p 4:24p 5:24p 6:24p 7:24p 8:24p 9:24p 10:31p 11:24p\n", + " Sunnyvale 7:35a 9:28a 10:28a 11:28a 12:28p 1:28p 2:28p 3:28p 4:28p 5:28p 6:28p 7:28p 8:28p 9:28p 10:35p 11:28p\n", + " Mountain View 7:40a 9:34a 10:34a 11:34a 12:34p 1:34p 2:34p 3:34p 4:34p 5:34p 6:34p 7:34p 8:34p 9:34p 10:40p 11:34p\n", + " San Antonio 7:43a 9:37a 10:37a 11:37a 12:37p 1:37p 2:37p 3:37p 4:37p 5:37p 6:37p 7:37p 8:37p 9:37p 10:44p 11:37p\n", + " California Ave 7:48a 9:42a 10:42a 11:42a 12:42p 1:42p 2:42p 3:42p 4:42p 5:42p 6:42p 7:42p 8:42p 9:42p 10:48p 11:42p\n", + " Palo Alto 7:52a 9:46a 10:46a 11:46a 12:46p 1:46p 2:46p 3:46p 4:46p 5:46p 6:46p 7:46p 8:46p 9:46p 10:53p 11:46p\n", + " Menlo Park 7:55a 9:50a 10:50a 11:50a 12:50p 1:50p 2:50p 3:50p 4:50p 5:50p 6:50p 7:50p 8:50p 9:50p 10:56p 11:50p\n", + " Redwood City 8:01a 9:56a 10:56a 11:56a 12:56p 1:56p 2:56p 3:56p 4:56p 5:56p 6:56p 7:56p 8:56p 9:56p 11:02p 11:56p\n", + " San Carlos 8:05a 10:01a 11:01a 12:01p 1:01p 2:01p 3:01p 4:01p 5:01p 6:01p 7:01p 8:01p 9:01p 10:01p 11:07p 12:01a\n", + " Belmont 8:09a 10:04a 11:04a 12:04p 1:04p 2:04p 3:04p 4:04p 5:04p 6:04p 7:04p 8:04p 9:04p 10:04p 11:10p 12:04a\n", + " Hillsdale 8:12a 10:08a 11:08a 12:08p 1:08p 2:08p 3:08p 4:08p 5:08p 6:08p 7:08p 8:08p 9:08p 10:08p 11:14p 12:08a\n", + " Hayward Park 8:15a 10:11a 11:11a 12:11p 1:11p 2:11p 3:11p 4:11p 5:11p 6:11p 7:11p 8:11p 9:11p 10:11p 11:17p 12:11a\n", + " San Mateo 8:19a 10:15a 11:15a 12:15p 1:15p 2:15p 3:15p 4:15p 5:15p 6:15p 7:15p 8:15p 9:15p 10:15p 11:21p 12:15a\n", + " Burlingame 8:22a 10:19a 11:19a 12:19p 1:19p 2:19p 3:19p 4:19p 5:19p 6:19p 7:19p 8:19p 9:19p 10:19p 11:25p 12:19a\n", + " Broadway 8:25a 10:22a 11:22a 12:22p 1:22p 2:22p 3:22p 4:22p 5:22p 6:22p 7:22p 8:22p 9:22p 10:22p 11:28p 12:22a\n", + " Millbrae 8:29a 10:26a 11:26a 12:26p 1:26p 2:26p 3:26p 4:26p 5:26p 6:26p 7:26p 8:26p 9:26p 10:26p 11:32p 12:26a\n", + " San Bruno 8:34a 10:30a 11:30a 12:30p 1:30p 2:30p 3:30p 4:30p 5:30p 6:30p 7:30p 8:30p 9:30p 10:30p 11:37p 12:30a\n", + " S. San Francisco 8:38a 10:34a 11:34a 12:34p 1:34p 2:34p 3:34p 4:34p 5:34p 6:34p 7:34p 8:34p 9:34p 10:34p 11:41p 12:34a\n", + " Bayshore 8:44a 10:41a 11:41a 12:41p 1:41p 2:41p 3:41p 4:41p 5:41p 6:41p 7:41p 8:41p 9:41p 10:41p 11:47p 12:41a\n", + " 22 ndStreet 8:50a 10:46a 11:46a 12:46p 1:46p 2:46p 3:46p 4:46p 5:46p 6:46p 7:46p 8:46p 9:46p 10:46p 11:53p 12:46a\n", + " San Francisco 8:56a 10:52a 11:53a 12:53p 1:52p 2:52p 3:52p 4:52p 5:52p 6:52p 7:52p 8:52p 9:52p 10:52p 11:59p 12:52a\n", + " *On SAP Center event days, Train 277 or Train 281departure from San Jose Diridon station may be delayed and will depart no later than 10:30p or 11:30p respectively.\n", + "\n", + "\n", + " Southbound – WEEKEND SERVICE to SAN JOSE 2XX Local\n", + " Train No. 224 228 232 236 240 244 248 252 256 260 264 268 272 276 280 284\n", + " Service Types L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2\n", + " San Francisco 8:28a 9:58a 10:58a 11:58a 12:58p 1:58p 2:58p 3:58p 4:58p 5:58p 6:58p 7:58p 8:58p 9:58p 10:58p 12:05a\n", + " 22 ndStreet 8:33a 10:03a 11:03a 12:03p 1:03p 2:03p 3:03p 4:03p 5:03p 6:03p 7:03p 8:03p 9:03p 10:03p 11:03p 12:10a\n", + " Bayshore 8:38a 10:08a 11:08a 12:08p 1:08p 2:08p 3:08p 4:08p 5:08p 6:08p 7:08p 8:08p 9:08p 10:08p 11:08p 12:15a\n", + " S. San Francisco 8:45a 10:15a 11:15a 12:15p 1:15p 2:15p 3:15p 4:15p 5:15p 6:15p 7:15p 8:15p 9:15p 10:15p 11:15p 12:22a\n", + " San Bruno 8:49a 10:19a 11:19a 12:19p 1:19p 2:19p 3:19p 4:19p 5:19p 6:19p 7:19p 8:19p 9:19p 10:19p 11:19p 12:26a\n", + " Millbrae 8:53a 10:24a 11:24a 12:24p 1:24p 2:24p 3:24p 4:24p 5:24p 6:24p 7:24p 8:24p 9:24p 10:24p 11:24p 12:31a\n", + " Broadway 8:57a 10:27a 11:27a 12:27p 1:27p 2:27p 3:27p 4:27p 5:27p 6:27p 7:27p 8:27p 9:27p 10:27p 11:27p 12:35a\n", + " Burlingame 9:00a 10:31a 11:31a 12:31p 1:31p 2:31p 3:31p 4:31p 5:31p 6:31p 7:31p 8:31p 9:31p 10:31p 11:31p 12:38a\n", + " San Mateo 9:04a 10:34a 11:34a 12:34p 1:34p 2:34p 3:34p 4:34p 5:34p 6:34p 7:34p 8:34p 9:34p 10:34p 11:34p 12:41a\n", + " Hayward Park 9:07a 10:37a 11:37a 12:37p 1:37p 2:37p 3:37p 4:37p 5:37p 6:37p 7:37p 8:37p 9:37p 10:37p 11:37p 12:45a\n", + " Hillsdale 9:10a 10:41a 11:41a 12:41p 1:41p 2:41p 3:41p 4:41p 5:41p 6:41p 7:41p 8:41p 9:41p 10:41p 11:41p 12:48a\n", + " Belmont 9:14a 10:44a 11:44a 12:44p 1:44p 2:44p 3:44p 4:44p 5:44p 6:44p 7:44p 8:44p 9:44p 10:44p 11:44p 12:52a\n", + " San Carlos 9:17a 10:48a 11:48a 12:48p 1:48p 2:48p 3:48p 4:48p 5:48p 6:48p 7:48p 8:48p 9:48p 10:48p 11:48p 12:55a\n", + " Redwood City 9:21a 10:52a 11:52a 12:52p 1:52p 2:52p 3:52p 4:52p 5:52p 6:52p 7:52p 8:52p 9:52p 10:52p 11:52p 12:59a\n", + " Menlo Park 9:28a 10:58a 11:58a 12:58p 1:58p 2:58p 3:58p 4:58p 5:58p 6:58p 7:58p 8:58p 9:58p 10:58p 11:58p 1:05a\n", + " Palo Alto 9:32a 11:02a 12:02p 1:02p 2:02p 3:02p 4:02p 5:02p 6:02p 7:02p 8:02p 9:02p 10:02p 11:02p 12:02a 1:09a\n", + " California Avenue 9:36a 11:06a 12:06p 1:06p 2:06p 3:06p 4:06p 5:06p 6:06p 7:06p 8:06p 9:06p 10:06p 11:06p 12:06a 1:12a\n", + " San Antonio 9:41a 11:11a 12:11p 1:11p 2:11p 3:11p 4:11p 5:11p 6:11p 7:11p 8:11p 9:11p 10:11p 11:11p 12:10a 1:17a\n", + " Mountain View 9:45a 11:16a 12:16p 1:16p 2:16p 3:16p 4:16p 5:16p 6:16p 7:16p 8:16p 9:16p 10:16p 11:16p 12:15a 1:21a\n", + " Sunnyvale 9:51a 11:21a 12:21p 1:21p 2:21p 3:21p 4:21p 5:21p 6:21p 7:21p 8:21p 9:21p 10:21p 11:21p 12:20a 1:26a\n", + " Lawrence 9:55a 11:26a 12:26p 1:26p 2:26p 3:26p 4:26p 5:26p 6:26p 7:26p 8:26p 9:26p 10:26p 11:26p 12:25a 1:31a\n", + " Santa Clara 10:01a 11:32a 12:32p 1:32p 2:32p 3:32p 4:32p 5:32p 6:32p 7:32p 8:32p 9:32p 10:32p 11:32p 12:31a 1:37a\n", + " San Jose Diridon 10:10a 11:40a 12:40p 1:38p 2:40p 3:38p 4:40p 5:38p 6:40p 7:38p 8:40p 9:38p 10:40p 11:38p 12:39a 1:44a\n", + " Tamien 10:15a 11:45a 12:45p 2:45p 4:45p 6:45p 8:45p 10:45p 12:44a 1:49a\n", + " EFFECTIVE September 12, 2022 Timetable subject to change without notice.\n" + ] + } + ], + "source": [ + "print(docs[0].get_content())" + ] + }, + { + "cell_type": "markdown", + "id": "8f5064d4-3e33-4f67-9b2e-46787161538f", + "metadata": {}, + "source": [ + "## Initialize Query Engine\n", + "\n", + "We now initialize a query engine over this data. Here we use a baseline summary index, which doesn't do vector indexing/chunking and instead dumps the entire text into the prompt.\n", + "\n", + "We see that the LLM (gpt-4-turbo) is able to provide all the stops for train no 225 northbound." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3e985b6-9d38-449f-9cf9-aae166824eed", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core import SummaryIndex\n", + "from llama_index.llms.openai import OpenAI\n", + "\n", + "llm = OpenAI(model=\"gpt-4o\")\n", + "index = SummaryIndex.from_documents(docs)\n", + "query_engine = index.as_query_engine(llm=llm)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66eb0976-2cd6-4b14-9083-124baae9ed5d", + "metadata": {}, + "outputs": [], + "source": [ + "response = query_engine.query(\n", + " \"What are the stops (and times) for train no 237 northbound?\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7dc6f275-07f4-429e-9335-f50982fe974c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The stops and times for train no. 237 northbound are as follows:\n", + "\n", + "- San Jose Diridon: 12:12 PM\n", + "- Santa Clara: 12:18 PM\n", + "- Lawrence: 12:24 PM\n", + "- Sunnyvale: 12:28 PM\n", + "- Mountain View: 12:34 PM\n", + "- San Antonio: 12:37 PM\n", + "- California Ave: 12:42 PM\n", + "- Palo Alto: 12:46 PM\n", + "- Menlo Park: 12:50 PM\n", + "- Redwood City: 12:56 PM\n", + "- San Carlos: 1:01 PM\n", + "- Belmont: 1:04 PM\n", + "- Hillsdale: 1:08 PM\n", + "- Hayward Park: 1:11 PM\n", + "- San Mateo: 1:15 PM\n", + "- Burlingame: 1:19 PM\n", + "- Broadway: 1:22 PM\n", + "- Millbrae: 1:26 PM\n", + "- San Bruno: 1:30 PM\n", + "- S. San Francisco: 1:34 PM\n", + "- Bayshore: 1:41 PM\n", + "- 22nd Street: 1:46 PM\n", + "- San Francisco: 1:52 PM\n" + ] + } + ], + "source": [ + "print(str(response))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "229c4cb0-cf94-4a9f-bc7c-590388f50c1f", + "metadata": {}, + "outputs": [], + "source": [ + "response = query_engine.query(\n", + " \"What are all the trains (and times) that end at Tamien going Southbound?\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6cf9fce0-5067-48f6-a7ef-62aa9e2edc3d", + "metadata": {}, + "source": [ + "It gets most of the answers correct (to be fair it misses two trains)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51cf03ff-7728-4815-ab72-3bf54fc4a2c0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The trains that end at Tamien going Southbound are:\n", + "\n", + "- Train 224 at 10:15a\n", + "- Train 228 at 11:45a\n", + "- Train 240 at 2:45p\n", + "- Train 248 at 4:45p\n", + "- Train 256 at 6:45p\n", + "- Train 264 at 8:45p\n", + "- Train 272 at 10:45p\n", + "- Train 284 at 1:49a\n" + ] + } + ], + "source": [ + "print(str(response))" + ] + }, + { + "cell_type": "markdown", + "id": "e51e7feb-b74f-4101-8963-933ac7ec9763", + "metadata": {}, + "source": [ + "## Try Baseline\n", + "\n", + "In contrast, we try a baseline approach with the default PDF reader (PyPDF) in `SimpleDirectoryReader`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "364e5155-cc75-4302-a754-9444ae28e6b1", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core import SimpleDirectoryReader\n", + "from llama_index.core import SummaryIndex\n", + "from llama_index.llms.openai import OpenAI\n", + "\n", + "llm = OpenAI(model=\"gpt-4o\")\n", + "input_file = \"caltrain_schedule_weekend.pdf\"\n", + "reader = SimpleDirectoryReader(input_files=[input_file])\n", + "base_docs = reader.load_data()\n", + "index = SummaryIndex.from_documents(base_docs)\n", + "base_query_engine = index.as_query_engine(llm=llm)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4011389-2d27-4a1a-bf8d-7309da28ab15", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Southbound – WEEKEND SERVICE to SAN JOSE\n", + "Train No. 224 228 232 236 240 244 248 252 256 260 264 268 272 276 280 284\n", + "Service Types L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2\n", + "San Francisco 8:28a 9:58a 10:58a 11:58a 12:58p 1:58p 2:58p 3:58p 4:58p 5:58p 6:58p 7:58p 8:58p 9:58p 10:58p 12:05a\n", + "22nd Street 8:33a 10:03a 11:03a 12:03p 1:03p 2:03p 3:03p 4:03p 5:03p 6:03p 7:03p 8:03p 9:03p 10:03p 11:03p 12:10a\n", + "Bayshore 8:38a 10:08a 11:08a 12:08p 1:08p 2:08p 3:08p 4:08p 5:08p 6:08p 7:08p 8:08p 9:08p 10:08p 11:08p 12:15a\n", + "S. San Francisco 8:45a 10:15a 11:15a 12:15p 1:15p 2:15p 3:15p 4:15p 5:15p 6:15p 7:15p 8:15p 9:15p 10:15p 11:15p 12:22a\n", + "San Bruno 8:49a 10:19a 11:19a 12:19p 1:19p 2:19p 3:19p 4:19p 5:19p 6:19p 7:19p 8:19p 9:19p 10:19p 11:19p 12:26a\n", + "Millbrae 8:53a 10:24a 11:24a 12:24p 1:24p 2:24p 3:24p 4:24p 5:24p 6:24p 7:24p 8:24p 9:24p 10:24p 11:24p 12:31a\n", + "Broadway 8:57a 10:27a 11:27a 12:27p 1:27p 2:27p 3:27p 4:27p 5:27p 6:27p 7:27p 8:27p 9:27p 10:27p 11:27p 12:35a\n", + "Burlingame 9:00a 10:31a 11:31a 12:31p 1:31p 2:31p 3:31p 4:31p 5:31p 6:31p 7:31p 8:31p 9:31p 10:31p 11:31p 12:38a\n", + "San Mateo 9:04a 10:34a 11:34a 12:34p 1:34p 2:34p 3:34p 4:34p 5:34p 6:34p 7:34p 8:34p 9:34p 10:34p 11:34p 12:41a\n", + "Hayward Park 9:07a 10:37a 11:37a 12:37p 1:37p 2:37p 3:37p 4:37p 5:37p 6:37p 7:37p 8:37p 9:37p 10:37p 11:37p 12:45a\n", + "Hillsdale 9:10a 10:41a 11:41a 12:41p 1:41p 2:41p 3:41p 4:41p 5:41p 6:41p 7:41p 8:41p 9:41p 10:41p 11:41p 12:48a\n", + "Belmont 9:14a 10:44a 11:44a 12:44p 1:44p 2:44p 3:44p 4:44p 5:44p 6:44p 7:44p 8:44p 9:44p 10:44p 11:44p 12:52a\n", + "San Carlos 9:17a 10:48a 11:48a 12:48p 1:48p 2:48p 3:48p 4:48p 5:48p 6:48p 7:48p 8:48p 9:48p 10:48p 11:48p 12:55a\n", + "Redwood City 9:21a 10:52a 11:52a 12:52p 1:52p 2:52p 3:52p 4:52p 5:52p 6:52p 7:52p 8:52p 9:52p 10:52p 11:52p 12:59a\n", + "Menlo Park 9:28a 10:58a 11:58a 12:58p 1:58p 2:58p 3:58p 4:58p 5:58p 6:58p 7:58p 8:58p 9:58p 10:58p 11:58p 1:05a\n", + "Palo Alto 9:32a 11:02a 12:02p 1:02p 2:02p 3:02p 4:02p 5:02p 6:02p 7:02p 8:02p 9:02p 10:02p 11:02p 12:02a 1:09a\n", + "California Avenue 9:36a 11:06a 12:06p 1:06p 2:06p 3:06p 4:06p 5:06p 6:06p 7:06p 8:06p 9:06p 10:06p 11:06p 12:06a 1:12a\n", + "San Antonio 9:41a 11:11a 12:11p 1:11p 2:11p 3:11p 4:11p 5:11p 6:11p 7:11p 8:11p 9:11p 10:11p 11:11p 12:10a 1:17a\n", + "Mountain View 9:45a 11:16a 12:16p 1:16p 2:16p 3:16p 4:16p 5:16p 6:16p 7:16p 8:16p 9:16p 10:16p 11:16p 12:15a 1:21a\n", + "Sunnyvale 9:51a 11:21a 12:21p 1:21p 2:21p 3:21p 4:21p 5:21p 6:21p 7:21p 8:21p 9:21p 10:21p 11:21p 12:20a 1:26a\n", + "Lawrence 9:55a 11:26a 12:26p 1:26p 2:26p 3:26p 4:26p 5:26p 6:26p 7:26p 8:26p 9:26p 10:26p 11:26p 12:25a 1:31a\n", + "Santa Clara 10:01a 11:32a 12:32p 1:32p 2:32p 3:32p 4:32p 5:32p 6:32p 7:32p 8:32p 9:32p 10:32p 11:32p 12:31a 1:37a\n", + "San Jose Diridon 10:10a 11:40a 12:40p 1:38p 2:40p 3:38p 4:40p 5:38p 6:40p 7:38p 8:40p 9:38p 10:40p 11:38p 12:39a 1:44a\n", + "Tamien 10:15a 11:45a 12:45p 2:45p 4:45p 6:45p 8:45p 10:45p 12:44a 1:49aPrinter-Friendly Caltrain Schedule\n", + "Northbound – WEEKEND SERVICE to SAN FRANCISCO\n", + "Train No. 221 225 229 233 237 241 245 249 253 257 261 265 269 273 *277 *281\n", + "Service Types L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2\n", + "Tamien 7:12a 9:05a 10:05a 11:05a 1:05p 3:05p 5:05p 7:05p 9:05p 11:05p\n", + "San Jose Diridon 7:19a 9:12a 10:12a 11:12a 12:12p 1:12p 2:12p 3:12p 4:12p 5:12p 6:12p 7:12p 8:12p 9:12p 10:19p 11:12p\n", + "Santa Clara 7:25a 9:18a 10:18a 11:18a 12:18p 1:18p 2:18p 3:18p 4:18p 5:18p 6:18p 7:18p 8:18p 9:18p 10:25p 11:18p\n", + "Lawrence 7:31a 9:24a 10:24a 11:24a 12:24p 1:24p 2:24p 3:24p 4:24p 5:24p 6:24p 7:24p 8:24p 9:24p 10:31p 11:24p\n", + "Sunnyvale 7:35a 9:28a 10:28a 11:28a 12:28p 1:28p 2:28p 3:28p 4:28p 5:28p 6:28p 7:28p 8:28p 9:28p 10:35p 11:28p\n", + "Mountain View 7:40a 9:34a 10:34a 11:34a 12:34p 1:34p 2:34p 3:34p 4:34p 5:34p 6:34p 7:34p 8:34p 9:34p 10:40p 11:34p\n", + "San Antonio 7:43a 9:37a 10:37a 11:37a 12:37p 1:37p 2:37p 3:37p 4:37p 5:37p 6:37p 7:37p 8:37p 9:37p 10:44p 11:37p\n", + "California Ave 7:48a 9:42a 10:42a 11:42a 12:42p 1:42p 2:42p 3:42p 4:42p 5:42p 6:42p 7:42p 8:42p 9:42p 10:48p 11:42p\n", + "Palo Alto 7:52a 9:46a 10:46a 11:46a 12:46p 1:46p 2:46p 3:46p 4:46p 5:46p 6:46p 7:46p 8:46p 9:46p 10:53p 11:46p\n", + "Menlo Park 7:55a 9:50a 10:50a 11:50a 12:50p 1:50p 2:50p 3:50p 4:50p 5:50p 6:50p 7:50p 8:50p 9:50p 10:56p 11:50p\n", + "Redwood City 8:01a 9:56a 10:56a 11:56a 12:56p 1:56p 2:56p 3:56p 4:56p 5:56p 6:56p 7:56p 8:56p 9:56p 11:02p 11:56p\n", + "San Carlos 8:05a 10:01a 11:01a 12:01p 1:01p 2:01p 3:01p 4:01p 5:01p 6:01p 7:01p 8:01p 9:01p 10:01p 11:07p 12:01a\n", + "Belmont 8:09a 10:04a 11:04a 12:04p 1:04p 2:04p 3:04p 4:04p 5:04p 6:04p 7:04p 8:04p 9:04p 10:04p 11:10p 12:04a\n", + "Hillsdale 8:12a 10:08a 11:08a 12:08p 1:08p 2:08p 3:08p 4:08p 5:08p 6:08p 7:08p 8:08p 9:08p 10:08p 11:14p 12:08a\n", + "Hayward Park 8:15a 10:11a 11:11a 12:11p 1:11p 2:11p 3:11p 4:11p 5:11p 6:11p 7:11p 8:11p 9:11p 10:11p 11:17p 12:11a\n", + "San Mateo 8:19a 10:15a 11:15a 12:15p 1:15p 2:15p 3:15p 4:15p 5:15p 6:15p 7:15p 8:15p 9:15p 10:15p 11:21p 12:15a\n", + "Burlingame 8:22a 10:19a 11:19a 12:19p 1:19p 2:19p 3:19p 4:19p 5:19p 6:19p 7:19p 8:19p 9:19p 10:19p 11:25p 12:19a\n", + "Broadway 8:25a 10:22a 11:22a 12:22p 1:22p 2:22p 3:22p 4:22p 5:22p 6:22p 7:22p 8:22p 9:22p 10:22p 11:28p 12:22a\n", + "Millbrae 8:29a 10:26a 11:26a 12:26p 1:26p 2:26p 3:26p 4:26p 5:26p 6:26p 7:26p 8:26p 9:26p 10:26p 11:32p 12:26a\n", + "San Bruno 8:34a 10:30a 11:30a 12:30p 1:30p 2:30p 3:30p 4:30p 5:30p 6:30p 7:30p 8:30p 9:30p 10:30p 11:37p 12:30a\n", + "S. San Francisco 8:38a 10:34a 11:34a 12:34p 1:34p 2:34p 3:34p 4:34p 5:34p 6:34p 7:34p 8:34p 9:34p 10:34p 11:41p 12:34a\n", + "Bayshore 8:44a 10:41a 11:41a 12:41p 1:41p 2:41p 3:41p 4:41p 5:41p 6:41p 7:41p 8:41p 9:41p 10:41p 11:47p 12:41a\n", + "22nd Street 8:50a 10:46a 11:46a 12:46p 1:46p 2:46p 3:46p 4:46p 5:46p 6:46p 7:46p 8:46p 9:46p 10:46p 11:53p 12:46a\n", + "San Francisco 8:56a 10:52a 11:53a 12:53p 1:52p 2:52p 3:52p 4:52p 5:52p 6:52p 7:52p 8:52p 9:52p 10:52p 11:59p 12:52aZONE 2 ZONE 3 ZONE 4 ZONE 4 ZONE 3 ZONE 2 ZONE 1 ZONE 12XX Local\n", + "2XX Local\n", + "EFFECTIVE September 12, 2022 Timetable subject to change without notice. *On SAP Center event days, Train 277 or Train 281departure from San Jose Diridon station may be delayed and will depart no later than 10:30p or 11:30p respectively.\n" + ] + } + ], + "source": [ + "print(base_docs[0].get_content())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42203c70-7ca7-4200-bf47-6282eefca3bf", + "metadata": {}, + "outputs": [], + "source": [ + "base_response = base_query_engine.query(\n", + " \"What are the stops (and times) for train no 237 northbound?\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06aa47b6-0f31-4b2d-90f0-bf6c74befd38", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train No. 237 northbound stops at the following stations and times:\n", + "\n", + "- Tamien: 1:05p\n", + "- San Jose Diridon: 1:12p\n", + "- Santa Clara: 1:18p\n", + "- Lawrence: 1:24p\n", + "- Sunnyvale: 1:28p\n", + "- Mountain View: 1:34p\n", + "- San Antonio: 1:37p\n", + "- California Ave: 1:42p\n", + "- Palo Alto: 1:46p\n", + "- Menlo Park: 1:50p\n", + "- Redwood City: 1:56p\n", + "- San Carlos: 2:01p\n", + "- Belmont: 2:04p\n", + "- Hillsdale: 2:08p\n", + "- Hayward Park: 2:11p\n", + "- San Mateo: 2:15p\n", + "- Burlingame: 2:19p\n", + "- Broadway: 2:22p\n", + "- Millbrae: 2:26p\n", + "- San Bruno: 2:30p\n", + "- S. San Francisco: 2:34p\n", + "- Bayshore: 2:41p\n", + "- 22nd Street: 2:46p\n", + "- San Francisco: 2:52p\n" + ] + } + ], + "source": [ + "print(str(base_response))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f3c1de7-3351-4cd8-991c-34a777952194", + "metadata": {}, + "outputs": [], + "source": [ + "base_response = base_query_engine.query(\n", + " \"What are all the trains (and times) that end at Tamien going Southbound?\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "513b1007-7508-4fb1-836c-de9353433a67", + "metadata": {}, + "source": [ + "Note that the trains don't line up with the times!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "108edb92-76af-406b-a139-8b9e7c6528f2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The trains that end at Tamien going Southbound are:\n", + "\n", + "- Train 224 at 10:15a\n", + "- Train 228 at 11:45a\n", + "- Train 240 at 2:45p\n", + "- Train 252 at 4:45p\n", + "- Train 264 at 6:45p\n", + "- Train 276 at 8:45p\n", + "- Train 284 at 10:45p\n", + "- Train 284 at 12:44a\n" + ] + } + ], + "source": [ + "print(str(base_response))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llama_parse", + "language": "python", + "name": "llama_parse" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}