Module 7 (#25)
truskovskiyk authored Sep 28, 2024
1 parent f0c1890 commit b17f8a5
Showing 5 changed files with 195 additions and 26 deletions.
6 changes: 3 additions & 3 deletions module-2/PRACTICE.md
@@ -30,7 +30,7 @@
- PR2: Develop a CRUD Python client for MinIO and accompany it with comprehensive tests.
- PR3: Write code to benchmark various Pandas formats in terms of data saving/loading, focusing on load time and save time.
- PR4: Create code to benchmark inference performance using single and multiple processes, and report the differences in time.
- PR5: Develop code for converting your dataset into the StreamingDataset format.
- PR5 (optional): Develop code for converting your dataset into the StreamingDataset format.
- PR6: Write code for transforming your dataset into a vector format, and utilize VectorDB for ingestion and querying.
- Google Doc: Update your proposal by adding a section on data storage and processing.

@@ -58,8 +58,8 @@
- Google doc containing dataset labeling section: Estimate costs and time based on your experience labeling ~50 samples, provide instructions for future data labeling, and add a flow for data enrichment in production.
- PR1: Commit your data with DVC into the GitHub repo.
- PR2: Write code to deploy a labeling tool (e.g., Label Studio, Argilla), including README instructions.
- PR3: Write code to generate a synthetic dataset with ChatGPT.
- PR4: Write code to test your data after labeling (can use Cleanlab or Deepchecks).
- PR3 (optional): Write code to generate a synthetic dataset with ChatGPT.
- PR4 (optional): Write code to test your data after labeling (can use Cleanlab or Deepchecks).

## Criteria:

74 changes: 51 additions & 23 deletions module-7/README.md
@@ -19,7 +19,7 @@
Create kind cluster

```
kind create cluster --name ml-in-production-course-week-7
kind create cluster --name ml-in-production
```

Run k9s
@@ -28,6 +28,56 @@ Run k9s

```
k9s -A
```

# LLM Observability

https://docs.google.com/presentation/d/13ePQfvgSPioMmDN0OOQklvX7PQQ4RcEVCWm9ljum4aU/edit#slide=id.g2f7f3a46425_0_422


## Apps

Setup

```
export PYTHONPATH=llm-apps/AI-Scientist/
export TRACELOOP_BASE_URL="http://localhost:4318"
export OPENAI_API_KEY=sk-proj-****
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_API_KEY=lsv2_-****
```
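These variables map to the tracing backends used below: `OPENAI_API_KEY` is read by the OpenAI client, `LANGCHAIN_TRACING_V2` and `LANGCHAIN_API_KEY` by LangSmith, and `TRACELOOP_BASE_URL` points the Traceloop (OpenLLMetry) exporter at the OTLP collector (SigNoz below); `PYTHONPATH` makes the `ai_scientist` package from the cloned AI-Scientist repo importable. As a minimal sketch of the LangSmith path (model and prompt here are illustrative), wrapping the client is all that is needed once the variables are exported:

```
import openai
from langsmith.wrappers import wrap_openai

# LangSmith reads LANGCHAIN_TRACING_V2 and LANGCHAIN_API_KEY from the environment;
# wrapping the client is enough to record every chat.completions call as a trace.
client = wrap_openai(openai.OpenAI())

resp = client.chat.completions.create(
    model="gpt-4o-mini-2024-07-18",
    messages=[{"role": "user", "content": "ping"}],
)
print(resp.choices[0].message.content)
```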

Run SQL

```
python llm-apps/sql_app.py
```

Run AI-Scientist

```
python llm-apps/reviewer.py
```
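Both scripts follow the same OpenLLMetry pattern: call `Traceloop.init` once, then wrap the entry point with the `workflow` decorator so each run shows up as a single trace in SigNoz. A stripped-down sketch of that pattern (the app and workflow names here are placeholders):

```
import openai
from traceloop.sdk import Traceloop
from traceloop.sdk.decorators import workflow

# Exports OTLP traces to whatever TRACELOOP_BASE_URL points at (SigNoz below).
Traceloop.init(app_name="llm-apps-demo")

@workflow(name="demo-workflow")
def ask(question: str) -> str:
    client = openai.OpenAI()
    resp = client.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",
        messages=[{"role": "user", "content": question}],
    )
    return resp.choices[0].message.content

print(ask("Write a haiku about observability."))
```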

## SigNoz


Install

```
DEFAULT_STORAGE_CLASS=$(kubectl get storageclass -o=jsonpath='{.items[?(@.metadata.annotations.storageclass\.kubernetes\.io/is-default-class=="true")].metadata.name}')
kubectl patch storageclass "$DEFAULT_STORAGE_CLASS" -p '{"allowVolumeExpansion": true}'
helm repo add signoz https://charts.signoz.io
helm repo list
kubectl create ns platform
helm --namespace platform install my-release signoz/signoz
```

Access:

```
kubectl --namespace platform port-forward svc/my-release-signoz-frontend 3301:3301
kubectl --namespace platform port-forward svc/my-release-signoz-otel-collector 4318:4318
```
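With the collector port-forwarded, a quick way to confirm traces reach SigNoz is to push a single span through the OpenTelemetry SDK. A smoke-test sketch (service and span names are arbitrary; assumes `opentelemetry-sdk` and `opentelemetry-exporter-otlp-proto-http` are installed):

```
from opentelemetry import trace
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter

# Export to the port-forwarded SigNoz OTLP HTTP endpoint.
provider = TracerProvider(resource=Resource.create({"service.name": "smoke-test"}))
provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4318/v1/traces"))
)
trace.set_tracer_provider(provider)

with trace.get_tracer("smoke-test").start_as_current_span("hello-signoz"):
    pass

provider.shutdown()  # flush the span before the script exits
```

If the pipeline is healthy, the span appears under a "smoke-test" service in the SigNoz UI at localhost:3301.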

# Grafana

@@ -128,25 +178,3 @@ seldon pipeline list

- https://docs.seldon.io/projects/seldon-core/en/latest/analytics/outlier_detection.html
- https://docs.seldon.io/projects/seldon-core/en/latest/analytics/drift_detection.html


## LLM Observability

- https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/llmops/
- https://www.gantry.io/
- https://arize.com/
- https://github.com/Arize-ai/phoenix
- https://www.rungalileo.io/
- https://www.vellum.ai/
- https://smith.langchain.com
- https://www.arthur.ai/
- https://docs.wandb.ai/guides/weave/prod-mon
- https://www.comet.com/site/
- https://mlflow.org/
- https://getnomos.com/
- https://www.modo.ai/
- https://www.nebuly.com/
- https://github.com/traceloop/openllmetry
- https://github.com/evidentlyai/ml_observability_course


2 changes: 2 additions & 0 deletions module-7/llm-apps/.gitignore
@@ -0,0 +1,2 @@
AI-Scientist
2408.06292v2_no_appendix.pdf
57 changes: 57 additions & 0 deletions module-7/llm-apps/reviewer.py
@@ -0,0 +1,57 @@

from ai_scientist.perform_review import load_paper, perform_review
import openai
import typer
import agentops
from rich.console import Console
from langsmith import traceable
from langsmith.wrappers import wrap_openai
from traceloop.sdk import Traceloop
from traceloop.sdk.decorators import workflow



def review_paper(paper_pdf_path: str, client: openai.OpenAI) -> str:
    model = "gpt-4o-mini-2024-07-18"
    # Load paper from pdf file (raw text)
    paper_txt = load_paper(paper_pdf_path)
    review = perform_review(
        paper_txt,
        model,
        client,
        num_reflections=5,
        num_fs_examples=1,
        num_reviews_ensemble=5,
        temperature=0.1,
    )

    res = f'{review["Overall"]}\n{review["Decision"]}\n{review["Weaknesses"]}'
    return res


console = Console()


def run_pipeline():
    paper_pdf_path = "llm-apps/2408.06292v2_no_appendix.pdf"

    # console.print("1. Agentops", style="bold green")
    # agentops.init()
    # client_agentops = openai.Client()
    # result = review_paper(paper_pdf_path=paper_pdf_path, client=client_agentops)
    # agentops.end_session()

    # console.print("2. LangSmith", style="bold green")
    # client_lang_smith = wrap_openai(openai.Client())
    # result = review_paper(paper_pdf_path=paper_pdf_path, client=client_lang_smith)

    console.print("3. OpenLLMetry", style="bold green")
    Traceloop.init(app_name="ai-scientist-v2")
    client_traceloop = openai.Client()
    review_paper_traceloop = workflow(name="paper-review")(review_paper)
    result = review_paper_traceloop(paper_pdf_path=paper_pdf_path, client=client_traceloop)

    print(f"result = {result}")


if __name__ == "__main__":
    typer.run(run_pipeline)
82 changes: 82 additions & 0 deletions module-7/llm-apps/sql_app.py
@@ -0,0 +1,82 @@
import json

import evaluate
from datasets import Dataset
from openai import OpenAI
import openai
from tqdm import tqdm
import typer
from typing import Tuple
from datasets import load_dataset
import random
import functools
import agentops
from rich.console import Console
from langsmith import traceable
from langsmith.wrappers import wrap_openai
from traceloop.sdk import Traceloop
from traceloop.sdk.decorators import workflow

console = Console()


def get_random_datapoint() -> Tuple[str, str]:
    dataset = load_dataset("gretelai/synthetic_text_to_sql", split='train')
    dataset_size = len(dataset)

    index = random.randint(0, dataset_size - 1)
    sample = dataset[index]

    sql_context = sample['sql_context']
    sql_prompt = sample['sql_prompt']
    return sql_context, sql_prompt

def get_sql(query: str, context: str, client: OpenAI) -> str:
    prompt = f"""
    Write the corresponding SQL query based on user requests and database context:
    User requests: {query}
    Database context: {context}
    Please return in JSON format: {{"sql": ""}}
    """

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-4o-mini-2024-07-18",
        response_format={"type": "json_object"},
    )

    return json.loads(chat_completion.choices[0].message.content)["sql"]


def run_pipeline():
    sql_context, sql_prompt = get_random_datapoint()

    # console.print("1. Agentops", style="bold green")
    # agentops.init()
    # client_agentops = openai.Client()
    # result = get_sql(query=sql_prompt, context=sql_context, client=client_agentops)
    # agentops.end_all_sessions()

    # console.print("2. LangSmith", style="bold green")
    # client_lang_smith = wrap_openai(openai.Client())
    # result = get_sql(query=sql_prompt, context=sql_context, client=client_lang_smith)

    console.print("3. OpenLLMetry", style="bold green")
    Traceloop.init(app_name="text2sql-v2")

    client_traceloop = openai.Client()
    get_sql_traceloop = workflow(name="get_sql")(get_sql)
    result = get_sql_traceloop(query=sql_prompt, context=sql_context, client=client_traceloop)

    print(f"result = {result}")


if __name__ == "__main__":
    typer.run(run_pipeline)
