fix: Hugging face async coverage #44

Merged · 2 commits · Jan 2, 2025
63 changes: 32 additions & 31 deletions scope3ai/tracers/huggingface/chat.py

@@ -3,6 +3,7 @@
 from dataclasses import asdict, dataclass
 from typing import Any, Callable, Optional, Union
 
+import tiktoken
 from huggingface_hub import AsyncInferenceClient, InferenceClient  # type: ignore[import-untyped]
 from huggingface_hub import ChatCompletionOutput as _ChatCompletionOutput
 from huggingface_hub import ChatCompletionStreamOutput as _ChatCompletionStreamOutput
@@ -39,12 +40,12 @@ def huggingface_chat_wrapper_non_stream(
     with hf_raise_for_status_capture() as capture_response:
         response = wrapped(*args, **kwargs)
     http_response = capture_response.get()
-    model_requested = instance.model
-    model_used = response.model
+    model = (
+        instance.model or kwargs.get("model") or instance.get_recommended_model("chat")
+    )
     compute_time = http_response.headers.get("x-compute-time")
     scope3_row = ImpactRow(
-        model=Model(id=model_requested),
-        model_used=Model(id=model_used),
+        model=Model(id=model),
         input_tokens=response.usage.prompt_tokens,
         output_tokens=response.usage.completion_tokens,
         request_duration_ms=float(compute_time) * 1000,
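
Note on the hunk above: the old model_requested/model_used pair is collapsed into a single fallback chain. A minimal standalone sketch of the resolution order (the helper name is ours, not the tracer's):

# Hypothetical helper showing the resolution order used in the diff: the
# client's configured model wins, then the per-call "model" kwarg, then the
# Hub's recommended model for the "chat" task.
from huggingface_hub import InferenceClient

def resolve_chat_model(instance: InferenceClient, kwargs: dict) -> str:
    return (
        instance.model
        or kwargs.get("model")
        or instance.get_recommended_model("chat")
    )
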
@@ -60,27 +61,25 @@ def huggingface_chat_wrapper_stream(
     wrapped: Callable, instance: InferenceClient, args: Any, kwargs: Any
 ) -> Iterable[ChatCompletionStreamOutput]:
     timer_start = time.perf_counter()
-    if "stream_options" not in kwargs:
-        kwargs["stream_options"] = {}
-    if "include_usage" not in kwargs["stream_options"]:
-        kwargs["stream_options"]["include_usage"] = True
-    elif not kwargs["stream_options"]["include_usage"]:
-        raise ValueError("stream_options include_usage must be True")
     stream = wrapped(*args, **kwargs)
     token_count = 0
-    model = kwargs.get("model") or instance.get_recommended_model("chat")
+    model = (
+        instance.model or kwargs.get("model") or instance.get_recommended_model("chat")
+    )
     for chunk in stream:
         token_count += 1
         request_latency = time.perf_counter() - timer_start
         scope3_row = ImpactRow(
             model=Model(id=model),
-            input_tokens=chunk.usage.prompt_tokens,
-            output_tokens=chunk.usage.completion_tokens,
-            request_duration_ms=request_latency,
+            output_tokens=token_count,
+            request_duration_ms=request_latency * 1000,
             managed_service_id=PROVIDER,
         )
+        chunk_data = ChatCompletionStreamOutput(**asdict(chunk))
         scope3_ctx = Scope3AI.get_instance().submit_impact(scope3_row)
-        yield ChatCompletionStreamOutput(**asdict(chunk), scope3ai=scope3_ctx)
+        if scope3_ctx is not None:
+            chunk_data.scope3ai = scope3_ctx
+        yield chunk_data
 
 
 async def huggingface_async_chat_wrapper(
@@ -97,18 +96,20 @@ async def huggingface_async_chat_wrapper(
 async def huggingface_async_chat_wrapper_non_stream(
     wrapped: Callable, instance: AsyncInferenceClient, args: Any, kwargs: Any
 ) -> ChatCompletionOutput:
-    with hf_raise_for_status_capture() as capture_response:
-        response = await wrapped(*args, **kwargs)
-    http_response = capture_response.get()
-    compute_time = http_response.headers.get("x-compute-time")
-    model = kwargs.get("model") or instance.get_recommended_model("chat")
+    timer_start = time.perf_counter()
+
+    response = await wrapped(*args, **kwargs)
+    request_latency = time.perf_counter() - timer_start
+    model = (
+        instance.model or kwargs.get("model") or instance.get_recommended_model("chat")
+    )
+    encoder = tiktoken.get_encoding("cl100k_base")
+    output_tokens = len(encoder.encode(response.choices[0].message.content))
     scope3_row = ImpactRow(
         model=Model(id=model),
         input_tokens=response.usage.prompt_tokens,
-        output_tokens=response.usage.completion_tokens,
-        request_duration_ms=compute_time
-        * 1000
-        * 1000,  # TODO: can we get the header that has the processing time
+        output_tokens=output_tokens,
+        request_duration_ms=request_latency * 1000,
         managed_service_id=PROVIDER,
     )
 
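The async non-stream path now times the call locally with time.perf_counter() and estimates output tokens with tiktoken instead of deriving the duration from the x-compute-time header. A small self-contained sketch of that estimation; the sample text is illustrative, and cl100k_base is only an approximation for Llama-family tokenizers, so counts can drift from the server-side completion_tokens:

# Count tokens the way the wrapper above does: encode the generated text
# with tiktoken's cl100k_base encoding and take the length.
import tiktoken

encoder = tiktoken.get_encoding("cl100k_base")
text = "Hello World!"  # stands in for response.choices[0].message.content
print(len(encoder.encode(text)))  # a small count, e.g. 3 for this string
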
@@ -125,19 +126,19 @@ async def huggingface_async_chat_wrapper_stream(
     timer_start = time.perf_counter()
     stream = await wrapped(*args, **kwargs)
     token_count = 0
-    model_request = kwargs["model"]
-    model_used = instance.model
+    model_used = instance.model or kwargs["model"]
     async for chunk in stream:
         token_count += 1
         request_latency = time.perf_counter() - timer_start
         scope3_row = ImpactRow(
-            model=Model(id=model_request),
-            model_used=Model(id=model_used),
-            input_tokens=chunk.usage.prompt_tokens,
-            output_tokens=chunk.usage.completion_tokens,
+            model=Model(id=model_used),
+            output_tokens=token_count,
             request_duration_ms=request_latency
             * 1000,  # TODO: can we get the header that has the processing time
             managed_service_id=PROVIDER,
         )
         scope3_ctx = Scope3AI.get_instance().submit_impact(scope3_row)
-        yield ChatCompletionStreamOutput(**asdict(chunk), scope3ai=scope3_ctx)
+        chunk_data = ChatCompletionStreamOutput(**asdict(chunk))
+        if scope3_ctx is not None:
+            chunk_data.scope3ai = scope3_ctx
+        yield chunk_data
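
Both stream wrappers now rebuild each chunk and attach the impact context only when one was returned, so scope3ai is an optional attribute from the caller's side. A hedged usage sketch of the patched sync client (it assumes the scope3ai tracer has already been initialized, which is outside this diff):

# Illustrative consumer: iterate the traced stream and read the optional
# scope3ai context off each rebuilt chunk.
from huggingface_hub import InferenceClient

client = InferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct")
for chunk in client.chat_completion(
    messages=[{"role": "user", "content": "Hello World!"}],
    max_tokens=15,
    stream=True,
):
    delta = chunk.choices[0].delta.content or ""
    ctx = getattr(chunk, "scope3ai", None)  # impact context, if attached
    print(delta, end="", flush=True)
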
39 changes: 39 additions & 0 deletions tests/cassettes/test_huggingface_hub_async_chat.yaml
@@ -0,0 +1,39 @@
interactions:
- request:
    body: null
    headers:
      authorization:
      - DUMMY
      user-agent:
      - unknown/None; hf_hub/0.26.5; python/3.12.8
    method: POST
    uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions
  response:
    body:
      string: '{"object":"chat.completion","id":"","created":1735838922,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"message":{"role":"assistant","content":"Hello
        World!"},"logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":13,"completion_tokens":4,"total_tokens":17}}'
    headers:
      Access-Control-Allow-Credentials:
      - 'true'
      Connection:
      - keep-alive
      Content-Length:
      - '337'
      Content-Type:
      - application/json
      Date:
      - Thu, 02 Jan 2025 17:32:52 GMT
      Vary:
      - Origin, Access-Control-Request-Method, Access-Control-Request-Headers
      x-compute-time:
      - '0.088139193'
      x-compute-type:
      - cache
      x-request-id:
      - 4_n2Eny5ye789kjsN65eX
      x-sha:
      - 5f0b02c75b57c5855da9ae460ce51323ea669d8a
    status:
      code: 200
      message: OK
version: 1
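
The cassette above backs the new async non-stream test. A sketch of what such a test could look like under pytest-asyncio with VCR-style replay; the decorators and fixture wiring are assumptions, not necessarily the repository's actual setup:

# Hypothetical test shape; assumes a VCR fixture replays
# tests/cassettes/test_huggingface_hub_async_chat.yaml by test name.
import pytest
from huggingface_hub import AsyncInferenceClient

@pytest.mark.vcr
@pytest.mark.asyncio
async def test_huggingface_hub_async_chat():
    client = AsyncInferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct")
    response = await client.chat_completion(
        messages=[{"role": "user", "content": "Hello World!"}], max_tokens=15
    )
    # Values replayed from the recorded response above.
    assert response.usage.prompt_tokens == 13
    assert response.usage.completion_tokens == 4
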
75 changes: 75 additions & 0 deletions tests/cassettes/test_huggingface_hub_async_stream_chat.yaml
@@ -0,0 +1,75 @@
interactions:
- request:
    body: null
    headers:
      authorization:
      - DUMMY
      user-agent:
      - unknown/None; hf_hub/0.26.5; python/3.12.8
    method: POST
    uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions
  response:
    body:
      string: 'data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        World"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"!"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        It"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"''s"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        great"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        to"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        meet"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        you"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"!"},"logprobs":null,"finish_reason":"length"}],"usage":null}


        data: [DONE]


        '
    headers:
      Access-Control-Allow-Credentials:
      - 'true'
      Connection:
      - keep-alive
      Content-Length:
      - '2844'
      Content-Type:
      - text/event-stream
      Date:
      - Thu, 02 Jan 2025 18:53:01 GMT
      Vary:
      - Origin, Access-Control-Request-Method, Access-Control-Request-Headers
      x-compute-type:
      - cache
      x-request-id:
      - qlnGH7f5zOaSRhrO2iqi3
      x-sha:
      - 5f0b02c75b57c5855da9ae460ce51323ea669d8a
    status:
      code: 200
      message: OK
version: 1
108 changes: 108 additions & 0 deletions tests/cassettes/test_huggingface_hub_stream_chat.yaml
@@ -0,0 +1,108 @@
interactions:
- request:
    body: '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role":
      "user", "content": "Hello World!"}], "max_tokens": 15, "stream": true}'
    headers:
      Accept:
      - '*/*'
      Accept-Encoding:
      - gzip, deflate
      Connection:
      - keep-alive
      Content-Length:
      - '141'
      Content-Type:
      - application/json
      X-Amzn-Trace-Id:
      - ddf52e0d-a99b-495d-927c-cc446d13fbb0
      authorization:
      - DUMMY
      user-agent:
      - unknown/None; hf_hub/0.26.5; python/3.12.8
    method: POST
    uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions
  response:
    body:
      string: 'data: {"object":"chat.completion.chunk","id":"","created":1735842062,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842062,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        World"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842062,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"!"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842062,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        It"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"''s"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        great"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        to"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        meet"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        you"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"!"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        Is"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        there"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        something"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        I"},"logprobs":null,"finish_reason":null}],"usage":null}


        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        can"},"logprobs":null,"finish_reason":"length"}],"usage":null}


        data: [DONE]


        '
    headers:
      Connection:
      - keep-alive
      Content-Length:
      - '4264'
      Content-Type:
      - text/event-stream
      Date:
      - Thu, 02 Jan 2025 18:33:35 GMT
      access-control-allow-credentials:
      - 'true'
      vary:
      - Origin, Access-Control-Request-Method, Access-Control-Request-Headers
      x-compute-type:
      - cache
      x-request-id:
      - nR_dxHmltXw5bzzh9HLRi
      x-sha:
      - 5f0b02c75b57c5855da9ae460ce51323ea669d8a
    status:
      code: 200
      message: OK
version: 1
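
The stream cassettes replay text/event-stream bodies that the client parses back into one chunk per data: event. A matching sketch for the sync stream cassette, with the same caveats as above (names and assertions are illustrative, not necessarily the repository's actual test):

# Hypothetical test shape for tests/cassettes/test_huggingface_hub_stream_chat.yaml.
import pytest
from huggingface_hub import InferenceClient

@pytest.mark.vcr
def test_huggingface_hub_stream_chat():
    client = InferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct")
    chunks = list(
        client.chat_completion(
            messages=[{"role": "user", "content": "Hello World!"}],
            max_tokens=15,
            stream=True,
        )
    )
    # The recorded body carries 15 content events before "data: [DONE]".
    assert len(chunks) == 15
    assert chunks[0].choices[0].delta.content == "Hello"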