Module 7 (#25)
truskovskiyk authored Sep 28, 2024
1 parent f0c1890 commit b17f8a5
Showing 5 changed files with 195 additions and 26 deletions.
6 changes: 3 additions & 3 deletions module-2/PRACTICE.md
@@ -30,7 +30,7 @@
- PR2: Develop a CRUD Python client for MinIO and accompany it with comprehensive tests.
- PR3: Write code to benchmark various Pandas formats in terms of data saving/loading, focusing on load time and save time.
- PR4: Create code to benchmark inference performance using single and multiple processes, and report the differences in time.
- PR5: Develop code for converting your dataset into the StreamingDataset format.
- PR5 (optional): Develop code for converting your dataset into the StreamingDataset format.
- PR6: Write code for transforming your dataset into a vector format, and utilize VectorDB for ingestion and querying.
- Google Doc: Update your proposal by adding a section on data storage and processing.

@@ -58,8 +58,8 @@
- Google doc containing dataset labeling section: Estimate costs and time based on your experience labeling ~50 samples, provide instructions for future data labeling, and add a flow for data enrichment in production.
- PR1: Commit your data with DVC into the GitHub repo.
- PR2: Write code to deploy a labeling tool (e.g., Label Studio, Argilla), including README instructions.
- PR3: Write code to generate a synthetic dataset with ChatGPT.
- PR4: Write code to test your data after labeling (can use Cleanlab or Deepchecks).
- PR3 (optional): Write code to generate a synthetic dataset with ChatGPT.
- PR4 (optional): Write code to test your data after labeling (can use Cleanlab or Deepchecks).

## Criteria:

74 changes: 51 additions & 23 deletions module-7/README.md
@@ -19,7 +19,7 @@
Create kind cluster

```
kind create cluster --name ml-in-production-course-week-7
kind create cluster --name ml-in-production
```

Run k9s
@@ -28,6 +28,56 @@ Run k9s

```
k9s -A
```

# LLM Observability

https://docs.google.com/presentation/d/13ePQfvgSPioMmDN0OOQklvX7PQQ4RcEVCWm9ljum4aU/edit#slide=id.g2f7f3a46425_0_422


## Apps

Setup

```
export PYTHONPATH=llm-apps/AI-Scientist/
export TRACELOOP_BASE_URL="http://localhost:4318"
export OPENAI_API_KEY=sk-proj-****
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_API_KEY=lsv2_-****
```
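These variables map to the tracing backends used below: `OPENAI_API_KEY` is read by the OpenAI client, `LANGCHAIN_TRACING_V2` and `LANGCHAIN_API_KEY` by LangSmith, and `TRACELOOP_BASE_URL` points the Traceloop (OpenLLMetry) exporter at the OTLP collector (SigNoz below); `PYTHONPATH` makes the `ai_scientist` package from the cloned AI-Scientist repo importable. As a minimal sketch of the LangSmith path (model and prompt here are illustrative), wrapping the client is all that is needed once the variables are exported:

```
import openai
from langsmith.wrappers import wrap_openai

# LangSmith reads LANGCHAIN_TRACING_V2 and LANGCHAIN_API_KEY from the environment;
# wrapping the client is enough to record every chat.completions call as a trace.
client = wrap_openai(openai.OpenAI())

resp = client.chat.completions.create(
    model="gpt-4o-mini-2024-07-18",
    messages=[{"role": "user", "content": "ping"}],
)
print(resp.choices[0].message.content)
```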

Run SQL

```
python llm-apps/sql_app.py
```

Run AI-Scientist

```
python llm-apps/reviewer.py
```
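Both scripts follow the same OpenLLMetry pattern: call `Traceloop.init` once, then wrap the entry point with the `workflow` decorator so each run shows up as a single trace in SigNoz. A stripped-down sketch of that pattern (the app and workflow names here are placeholders):

```
import openai
from traceloop.sdk import Traceloop
from traceloop.sdk.decorators import workflow

# Exports OTLP traces to whatever TRACELOOP_BASE_URL points at (SigNoz below).
Traceloop.init(app_name="llm-apps-demo")

@workflow(name="demo-workflow")
def ask(question: str) -> str:
    client = openai.OpenAI()
    resp = client.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",
        messages=[{"role": "user", "content": question}],
    )
    return resp.choices[0].message.content

print(ask("Write a haiku about observability."))
```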

## SigNoz


Install

```
DEFAULT_STORAGE_CLASS=$(kubectl get storageclass -o=jsonpath='{.items[?(@.metadata.annotations.storageclass\.kubernetes\.io/is-default-class=="true")].metadata.name}')
kubectl patch storageclass "$DEFAULT_STORAGE_CLASS" -p '{"allowVolumeExpansion": true}'
helm repo add signoz https://charts.signoz.io
helm repo list
kubectl create ns platform
helm --namespace platform install my-release signoz/signoz
```

Access:

```
kubectl --namespace platform port-forward svc/my-release-signoz-frontend 3301:3301
kubectl --namespace platform port-forward svc/my-release-signoz-otel-collector 4318:4318
```
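With the collector port-forwarded, a quick way to confirm traces reach SigNoz is to push a single span through the OpenTelemetry SDK. A smoke-test sketch (service and span names are arbitrary; assumes `opentelemetry-sdk` and `opentelemetry-exporter-otlp-proto-http` are installed):

```
from opentelemetry import trace
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter

# Export to the port-forwarded SigNoz OTLP HTTP endpoint.
provider = TracerProvider(resource=Resource.create({"service.name": "smoke-test"}))
provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4318/v1/traces"))
)
trace.set_tracer_provider(provider)

with trace.get_tracer("smoke-test").start_as_current_span("hello-signoz"):
    pass

provider.shutdown()  # flush the span before the script exits
```

If the pipeline is healthy, the span appears under a "smoke-test" service in the SigNoz UI at localhost:3301.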

# Grafana

@@ -128,25 +178,3 @@ seldon pipeline list

- https://docs.seldon.io/projects/seldon-core/en/latest/analytics/outlier_detection.html
- https://docs.seldon.io/projects/seldon-core/en/latest/analytics/drift_detection.html


## LLM Observability

- https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/llmops/
- https://www.gantry.io/
- https://arize.com/
- https://github.com/Arize-ai/phoenix
- https://www.rungalileo.io/
- https://www.vellum.ai/
- https://smith.langchain.com
- https://www.arthur.ai/
- https://docs.wandb.ai/guides/weave/prod-mon
- https://www.comet.com/site/
- https://mlflow.org/
- https://getnomos.com/
- https://www.modo.ai/
- https://www.nebuly.com/
- https://github.com/traceloop/openllmetry
- https://github.com/evidentlyai/ml_observability_course


2 changes: 2 additions & 0 deletions module-7/llm-apps/.gitignore
@@ -0,0 +1,2 @@
AI-Scientist
2408.06292v2_no_appendix.pdf
57 changes: 57 additions & 0 deletions module-7/llm-apps/reviewer.py
@@ -0,0 +1,57 @@

from ai_scientist.perform_review import load_paper, perform_review
import openai
import typer
import agentops
from rich.console import Console
from langsmith import traceable
from langsmith.wrappers import wrap_openai
from traceloop.sdk import Traceloop
from traceloop.sdk.decorators import workflow



def review_paper(paper_pdf_path: str, client: openai.OpenAI) -> str:
    model = "gpt-4o-mini-2024-07-18"
    # Load paper from pdf file (raw text)
    paper_txt = load_paper(paper_pdf_path)
    review = perform_review(
        paper_txt,
        model,
        client,
        num_reflections=5,
        num_fs_examples=1,
        num_reviews_ensemble=5,
        temperature=0.1,
    )

    res = f'{review["Overall"]}\n{review["Decision"]}\n{review["Weaknesses"]}'
    return res


console = Console()


def run_pipeline():
    paper_pdf_path = "llm-apps/2408.06292v2_no_appendix.pdf"

    # console.print("1. Agentops", style="bold green")
    # agentops.init()
    # client_agentops = openai.Client()
    # result = review_paper(paper_pdf_path=paper_pdf_path, client=client_agentops)
    # agentops.end_session()

    # console.print("2. LangSmith", style="bold green")
    # client_lang_smith = wrap_openai(openai.Client())
    # result = review_paper(paper_pdf_path=paper_pdf_path, client=client_lang_smith)

    console.print("3. OpenLLMetry", style="bold green")
    Traceloop.init(app_name="ai-scientist-v2")
    client_traceloop = openai.Client()
    review_paper_traceloop = workflow(name="paper-review")(review_paper)
    result = review_paper_traceloop(paper_pdf_path=paper_pdf_path, client=client_traceloop)

    print(f"result = {result}")


if __name__ == "__main__":
    typer.run(run_pipeline)
82 changes: 82 additions & 0 deletions module-7/llm-apps/sql_app.py
@@ -0,0 +1,82 @@
import json

import evaluate
from datasets import Dataset
from openai import OpenAI
import openai
from tqdm import tqdm
import typer
from typing import Tuple
from datasets import load_dataset
import random
import functools
import agentops
from rich.console import Console
from langsmith import traceable
from langsmith.wrappers import wrap_openai
from traceloop.sdk import Traceloop
from traceloop.sdk.decorators import workflow

console = Console()


def get_random_datapoint() -> Tuple[str, str]:
    dataset = load_dataset("gretelai/synthetic_text_to_sql", split='train')
    dataset_size = len(dataset)

    index = random.randint(0, dataset_size - 1)
    sample = dataset[index]

    sql_context = sample['sql_context']
    sql_prompt = sample['sql_prompt']
    return sql_context, sql_prompt

def get_sql(query: str, context: str, client: OpenAI) -> str:
    prompt = f"""
    Write the corresponding SQL query based on user requests and database context:
    User requests: {query}
    Database context: {context}
    Please return in JSON format: {{"sql": ""}}
    """

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-4o-mini-2024-07-18",
        response_format={"type": "json_object"},
    )

    return json.loads(chat_completion.choices[0].message.content)["sql"]


def run_pipeline():
    sql_context, sql_prompt = get_random_datapoint()

    # console.print("1. Agentops", style="bold green")
    # agentops.init()
    # client_agentops = openai.Client()
    # result = get_sql(query=sql_prompt, context=sql_context, client=client_agentops)
    # agentops.end_all_sessions()

    # console.print("2. LangSmith", style="bold green")
    # client_lang_smith = wrap_openai(openai.Client())
    # result = get_sql(query=sql_prompt, context=sql_context, client=client_lang_smith)

    console.print("3. OpenLLMetry", style="bold green")
    Traceloop.init(app_name="text2sql-v2")

    client_traceloop = openai.Client()
    get_sql_traceloop = workflow(name="get_sql")(get_sql)
    result = get_sql_traceloop(query=sql_prompt, context=sql_context, client=client_traceloop)

    print(f"result = {result}")


if __name__ == "__main__":
    typer.run(run_pipeline)
