fix: Hugging Face async coverage (#44)
* fix: Hugging Face async coverage

* fix: delete env var
kevdevg authored Jan 2, 2025
1 parent 11d0204 commit 4b41096
Showing 5 changed files with 290 additions and 41 deletions.
63 changes: 32 additions & 31 deletions scope3ai/tracers/huggingface/chat.py
@@ -3,6 +3,7 @@
 from dataclasses import asdict, dataclass
 from typing import Any, Callable, Optional, Union
 
+import tiktoken
 from huggingface_hub import AsyncInferenceClient, InferenceClient  # type: ignore[import-untyped]
 from huggingface_hub import ChatCompletionOutput as _ChatCompletionOutput
 from huggingface_hub import ChatCompletionStreamOutput as _ChatCompletionStreamOutput
@@ -39,12 +40,12 @@ def huggingface_chat_wrapper_non_stream(
     with hf_raise_for_status_capture() as capture_response:
         response = wrapped(*args, **kwargs)
     http_response = capture_response.get()
-    model_requested = instance.model
-    model_used = response.model
+    model = (
+        instance.model or kwargs.get("model") or instance.get_recommended_model("chat")
+    )
     compute_time = http_response.headers.get("x-compute-time")
     scope3_row = ImpactRow(
-        model=Model(id=model_requested),
-        model_used=Model(id=model_used),
+        model=Model(id=model),
         input_tokens=response.usage.prompt_tokens,
         output_tokens=response.usage.completion_tokens,
         request_duration_ms=float(compute_time) * 1000,
@@ -60,27 +61,25 @@ def huggingface_chat_wrapper_stream(
     wrapped: Callable, instance: InferenceClient, args: Any, kwargs: Any
 ) -> Iterable[ChatCompletionStreamOutput]:
     timer_start = time.perf_counter()
-    if "stream_options" not in kwargs:
-        kwargs["stream_options"] = {}
-    if "include_usage" not in kwargs["stream_options"]:
-        kwargs["stream_options"]["include_usage"] = True
-    elif not kwargs["stream_options"]["include_usage"]:
-        raise ValueError("stream_options include_usage must be True")
     stream = wrapped(*args, **kwargs)
     token_count = 0
-    model = kwargs.get("model") or instance.get_recommended_model("chat")
+    model = (
+        instance.model or kwargs.get("model") or instance.get_recommended_model("chat")
+    )
     for chunk in stream:
         token_count += 1
         request_latency = time.perf_counter() - timer_start
         scope3_row = ImpactRow(
             model=Model(id=model),
-            input_tokens=chunk.usage.prompt_tokens,
-            output_tokens=chunk.usage.completion_tokens,
-            request_duration_ms=request_latency,
+            output_tokens=token_count,
+            request_duration_ms=request_latency * 1000,
             managed_service_id=PROVIDER,
         )
+        chunk_data = ChatCompletionStreamOutput(**asdict(chunk))
         scope3_ctx = Scope3AI.get_instance().submit_impact(scope3_row)
-        yield ChatCompletionStreamOutput(**asdict(chunk), scope3ai=scope3_ctx)
+        if scope3_ctx is not None:
+            chunk_data.scope3ai = scope3_ctx
+        yield chunk_data
 
 
 async def huggingface_async_chat_wrapper(
@@ -97,18 +96,20 @@ async def huggingface_async_chat_wrapper(
 async def huggingface_async_chat_wrapper_non_stream(
     wrapped: Callable, instance: AsyncInferenceClient, args: Any, kwargs: Any
 ) -> ChatCompletionOutput:
-    with hf_raise_for_status_capture() as capture_response:
-        response = await wrapped(*args, **kwargs)
-    http_response = capture_response.get()
-    compute_time = http_response.headers.get("x-compute-time")
-    model = kwargs.get("model") or instance.get_recommended_model("chat")
+    timer_start = time.perf_counter()
+
+    response = await wrapped(*args, **kwargs)
+    request_latency = time.perf_counter() - timer_start
+    model = (
+        instance.model or kwargs.get("model") or instance.get_recommended_model("chat")
+    )
+    encoder = tiktoken.get_encoding("cl100k_base")
+    output_tokens = len(encoder.encode(response.choices[0].message.content))
     scope3_row = ImpactRow(
         model=Model(id=model),
         input_tokens=response.usage.prompt_tokens,
-        output_tokens=response.usage.completion_tokens,
-        request_duration_ms=compute_time
-        * 1000
-        * 1000,  # TODO: can we get the header that has the processing time
+        output_tokens=output_tokens,
+        request_duration_ms=request_latency * 1000,
         managed_service_id=PROVIDER,
     )
 
@@ -125,19 +126,19 @@ async def huggingface_async_chat_wrapper_stream(
     timer_start = time.perf_counter()
     stream = await wrapped(*args, **kwargs)
     token_count = 0
-    model_request = kwargs["model"]
-    model_used = instance.model
+    model_used = instance.model or kwargs["model"]
     async for chunk in stream:
         token_count += 1
         request_latency = time.perf_counter() - timer_start
         scope3_row = ImpactRow(
-            model=Model(id=model_request),
-            model_used=Model(id=model_used),
-            input_tokens=chunk.usage.prompt_tokens,
-            output_tokens=chunk.usage.completion_tokens,
+            model=Model(id=model_used),
+            output_tokens=token_count,
             request_duration_ms=request_latency
             * 1000,  # TODO: can we get the header that has the processing time
             managed_service_id=PROVIDER,
         )
         scope3_ctx = Scope3AI.get_instance().submit_impact(scope3_row)
-        yield ChatCompletionStreamOutput(**asdict(chunk), scope3ai=scope3_ctx)
+        chunk_data = ChatCompletionStreamOutput(**asdict(chunk))
+        if scope3_ctx is not None:
+            chunk_data.scope3ai = scope3_ctx
+        yield chunk_data
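
Note on the change above: the async non-stream wrapper no longer reads output tokens from a usage payload; it estimates them by encoding the completion text with tiktoken. A minimal, self-contained sketch of that estimation step (the helper name is illustrative, and cl100k_base is only an approximation for Llama-family tokenizers):

import tiktoken

def estimate_output_tokens(completion_text: str) -> int:
    # Encode the completion text and count tokens; cl100k_base is an
    # approximation here, since Llama models ship their own tokenizer.
    encoder = tiktoken.get_encoding("cl100k_base")
    return len(encoder.encode(completion_text))

print(estimate_output_tokens("Hello World!"))  # 3 tokens under cl100k_base
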
39 changes: 39 additions & 0 deletions tests/cassettes/test_huggingface_hub_async_chat.yaml
@@ -0,0 +1,39 @@
interactions:
- request:
    body: null
    headers:
      authorization:
      - DUMMY
      user-agent:
      - unknown/None; hf_hub/0.26.5; python/3.12.8
    method: POST
    uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions
  response:
    body:
      string: '{"object":"chat.completion","id":"","created":1735838922,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"message":{"role":"assistant","content":"Hello
        World!"},"logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":13,"completion_tokens":4,"total_tokens":17}}'
    headers:
      Access-Control-Allow-Credentials:
      - 'true'
      Connection:
      - keep-alive
      Content-Length:
      - '337'
      Content-Type:
      - application/json
      Date:
      - Thu, 02 Jan 2025 17:32:52 GMT
      Vary:
      - Origin, Access-Control-Request-Method, Access-Control-Request-Headers
      x-compute-time:
      - '0.088139193'
      x-compute-type:
      - cache
      x-request-id:
      - 4_n2Eny5ye789kjsN65eX
      x-sha:
      - 5f0b02c75b57c5855da9ae460ce51323ea669d8a
    status:
      code: 200
      message: OK
version: 1
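
This cassette records a single non-streaming chat completion against Meta-Llama-3-8B-Instruct. A sketch of the kind of async test it would back, assuming pytest-asyncio plus a VCR-style replay marker (the marker and the scope3ai attribute follow the tracer code above; they are assumptions about this repo's test setup, not shown in the diff):

import pytest
from huggingface_hub import AsyncInferenceClient

@pytest.mark.vcr  # assumed VCR-style marker that replays the cassette above
@pytest.mark.asyncio
async def test_huggingface_hub_async_chat():
    client = AsyncInferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct")
    response = await client.chat_completion(
        messages=[{"role": "user", "content": "Hello World!"}], max_tokens=15
    )
    # These values match the recorded response payload.
    assert response.usage.prompt_tokens == 13
    assert response.choices[0].message.content == "Hello World!"
    # The traced client attaches an impact context to the returned object.
    assert response.scope3ai is not None
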
75 changes: 75 additions & 0 deletions tests/cassettes/test_huggingface_hub_async_stream_chat.yaml
@@ -0,0 +1,75 @@
interactions:
- request:
    body: null
    headers:
      authorization:
      - DUMMY
      user-agent:
      - unknown/None; hf_hub/0.26.5; python/3.12.8
    method: POST
    uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions
  response:
    body:
      string: 'data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        World"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"!"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        It"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"''s"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        great"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        to"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        meet"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        you"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735843719,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"!"},"logprobs":null,"finish_reason":"length"}],"usage":null}

        data: [DONE]

        '
    headers:
      Access-Control-Allow-Credentials:
      - 'true'
      Connection:
      - keep-alive
      Content-Length:
      - '2844'
      Content-Type:
      - text/event-stream
      Date:
      - Thu, 02 Jan 2025 18:53:01 GMT
      Vary:
      - Origin, Access-Control-Request-Method, Access-Control-Request-Headers
      x-compute-type:
      - cache
      x-request-id:
      - qlnGH7f5zOaSRhrO2iqi3
      x-sha:
      - 5f0b02c75b57c5855da9ae460ce51323ea669d8a
    status:
      code: 200
      message: OK
version: 1
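
The stream above carries ten content chunks before data: [DONE]. Under the same assumed test setup, an async streaming test can check the wrapper's per-chunk accounting, which counts one output token per yielded chunk:

import pytest
from huggingface_hub import AsyncInferenceClient

@pytest.mark.vcr  # assumed VCR-style marker that replays the stream cassette
@pytest.mark.asyncio
async def test_huggingface_hub_async_stream_chat():
    client = AsyncInferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct")
    stream = await client.chat_completion(
        messages=[{"role": "user", "content": "Hello World!"}],
        max_tokens=15,
        stream=True,
    )
    chunks = [chunk async for chunk in stream]
    # Ten data chunks are recorded; the wrapper counts one token per chunk.
    assert len(chunks) == 10
    assert chunks[-1].scope3ai is not None  # attached by the tracer
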
108 changes: 108 additions & 0 deletions tests/cassettes/test_huggingface_hub_stream_chat.yaml
@@ -0,0 +1,108 @@
interactions:
- request:
    body: '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role":
      "user", "content": "Hello World!"}], "max_tokens": 15, "stream": true}'
    headers:
      Accept:
      - '*/*'
      Accept-Encoding:
      - gzip, deflate
      Connection:
      - keep-alive
      Content-Length:
      - '141'
      Content-Type:
      - application/json
      X-Amzn-Trace-Id:
      - ddf52e0d-a99b-495d-927c-cc446d13fbb0
      authorization:
      - DUMMY
      user-agent:
      - unknown/None; hf_hub/0.26.5; python/3.12.8
    method: POST
    uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions
  response:
    body:
      string: 'data: {"object":"chat.completion.chunk","id":"","created":1735842062,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842062,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        World"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842062,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"!"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842062,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        It"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"''s"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        great"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        to"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        meet"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        you"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"!"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        Is"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        there"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        something"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        I"},"logprobs":null,"finish_reason":null}],"usage":null}

        data: {"object":"chat.completion.chunk","id":"","created":1735842063,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.3.1-dev0-sha-169178b","choices":[{"index":0,"delta":{"role":"assistant","content":"
        can"},"logprobs":null,"finish_reason":"length"}],"usage":null}

        data: [DONE]

        '
    headers:
      Connection:
      - keep-alive
      Content-Length:
      - '4264'
      Content-Type:
      - text/event-stream
      Date:
      - Thu, 02 Jan 2025 18:33:35 GMT
      access-control-allow-credentials:
      - 'true'
      vary:
      - Origin, Access-Control-Request-Method, Access-Control-Request-Headers
      x-compute-type:
      - cache
      x-request-id:
      - nR_dxHmltXw5bzzh9HLRi
      x-sha:
      - 5f0b02c75b57c5855da9ae460ce51323ea669d8a
    status:
      code: 200
      message: OK
version: 1
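
The synchronous counterpart above records fifteen content chunks for the same prompt. For completeness, a sketch of plain traced-client usage in sync code, assuming the Scope3AI tracer has already been initialized in the process (initialization is outside this diff):

from huggingface_hub import InferenceClient

client = InferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct")
for chunk in client.chat_completion(
    messages=[{"role": "user", "content": "Hello World!"}],
    max_tokens=15,
    stream=True,
):
    print(chunk.choices[0].delta.content or "", end="")
    # Each yielded chunk may carry the submitted impact context.
    ctx = getattr(chunk, "scope3ai", None)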