Skip to content

Commit 722e375

Browse files
committed
feat: add --use-server-token-count CLI option to use server-reported token counts
1 parent 42c6829 commit 722e375

24 files changed

+693
-233
lines changed

docs/cli_options.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ The transport to use for the endpoint. If not provided, it will be auto-detected
6060

6161
Use the legacy 'max_tokens' field instead of 'max_completion_tokens' in request payloads. The OpenAI API now prefers 'max_completion_tokens', but some older APIs or implementations may require 'max_tokens'.
6262

63+
#### `--use-server-token-count`
64+
65+
Use server-reported token counts from API usage fields instead of client-side tokenization. When enabled, tokenizers are still loaded (needed for dataset generation) but tokenizer.encode() is not called for computing metrics. Token count fields will be None if the server does not provide usage information. For chat streaming endpoints, you may need to add `--extra-inputs '{"stream_options": {"include_usage": true}}'` to the CLI command.
66+
6367
## Input Options
6468

6569
#### `--extra-inputs` `<list>`

src/aiperf/common/config/config_defaults.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ class EndpointDefaults:
3939
TIMEOUT = 600.0
4040
API_KEY = None
4141
USE_LEGACY_MAX_TOKENS = False
42+
USE_SERVER_TOKEN_COUNT = False
4243

4344

4445
@dataclass(frozen=True)

src/aiperf/common/config/endpoint_config.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,3 +178,22 @@ def validate_streaming(self) -> Self:
178178
group=_CLI_GROUP,
179179
),
180180
] = EndpointDefaults.USE_LEGACY_MAX_TOKENS
181+
182+
use_server_token_count: Annotated[
183+
bool,
184+
Field(
185+
description=(
186+
"Use server-reported token counts from API usage fields instead of "
187+
"client-side tokenization. When enabled, tokenizers are still loaded "
188+
"(needed for dataset generation) but tokenizer.encode() is not called "
189+
"for computing metrics. Token count fields will be None if the server "
190+
"does not provide usage information. For chat streaming endpoints, you may "
191+
'need to add `--extra-inputs \'{"stream_options": {"include_usage": true}}\'` '
192+
"to the CLI command."
193+
),
194+
),
195+
CLIParameter(
196+
name=("--use-server-token-count",),
197+
group=_CLI_GROUP,
198+
),
199+
] = EndpointDefaults.USE_SERVER_TOKEN_COUNT

src/aiperf/common/enums/metric_enums.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,9 @@ class MetricFlags(Flag):
678678
SUPPORTS_VIDEO_ONLY = 1 << 13
679679
"""Metrics that are only applicable to video-based endpoints."""
680680

681+
USAGE_DIFF_ONLY = 1 << 14
682+
"""Metrics that are only applicable when client-side tokenization is enabled and the usage field is used."""
683+
681684
def has_flags(self, flags: "MetricFlags") -> bool:
682685
"""Return True if the metric has ALL of the given flag(s) (regardless of other flags)."""
683686
# Bitwise AND will return the input flags only if all of the given flags are present.

src/aiperf/common/models/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@
9393
SSEMessage,
9494
TextResponse,
9595
TextResponseData,
96+
TokenCounts,
9697
)
9798
from aiperf.common.models.sequence_distribution import (
9899
DistributionParser,
@@ -197,6 +198,7 @@
197198
"TextResponseData",
198199
"TimesliceCollectionExportData",
199200
"TimesliceData",
201+
"TokenCounts",
200202
"TransportMetadata",
201203
"Turn",
202204
"Usage",

src/aiperf/common/models/record_models.py

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from typing_extensions import Self
1717

1818
from aiperf.common.aiperf_logger import AIPerfLogger
19-
from aiperf.common.constants import NANOS_PER_SECOND, STAT_KEYS
19+
from aiperf.common.constants import STAT_KEYS
2020
from aiperf.common.enums import CreditPhase, SSEFieldType
2121
from aiperf.common.enums.metric_enums import MetricValueTypeT
2222
from aiperf.common.exceptions import InvalidInferenceResultError
@@ -707,24 +707,33 @@ class ParsedResponse(AIPerfBaseModel):
707707
)
708708

709709

710-
class ParsedResponseRecord(AIPerfBaseModel):
711-
"""Record of a request and its associated responses, already parsed and ready for metrics."""
710+
class TokenCounts(AIPerfBaseModel):
711+
"""Token counts for a record."""
712712

713-
request: RequestRecord = Field(description="The original request record")
714-
responses: list[ParsedResponse] = Field(description="The parsed responses.")
715-
input_token_count: int | None = Field(
713+
input: int | None = Field(
716714
default=None,
717715
description="The number of tokens in the input (client-side tokenization). If None, the number of tokens could not be calculated.",
718716
)
719-
output_token_count: int | None = Field(
717+
output: int | None = Field(
720718
default=None,
721719
description="The number of output tokens across all responses (client-side tokenization). If None, the number of tokens could not be calculated.",
722720
)
723-
reasoning_token_count: int | None = Field(
721+
reasoning: int | None = Field(
724722
default=None,
725723
description="The number of reasoning tokens across all responses (client-side tokenization). If None, the number of tokens could not be calculated, or the model does not support reasoning.",
726724
)
727725

726+
727+
class ParsedResponseRecord(AIPerfBaseModel):
728+
"""Record of a request and its associated responses, already parsed and ready for metrics."""
729+
730+
request: RequestRecord = Field(..., description="The original request record")
731+
responses: list[ParsedResponse] = Field(..., description="The parsed responses.")
732+
token_counts: TokenCounts | None = Field(
733+
default=None,
734+
description="The token counts for the response. If None, the token counts could not be calculated.",
735+
)
736+
728737
@cached_property
729738
def start_perf_ns(self) -> int:
730739
"""Get the start time of the request in nanoseconds (perf_counter_ns)."""
@@ -760,18 +769,6 @@ def content_responses(self) -> list[ParsedResponse]:
760769
"""
761770
return [response for response in self.responses if response.data]
762771

763-
@cached_property
764-
def request_duration_ns(self) -> int:
765-
"""Get the duration of the request in nanoseconds."""
766-
return self.end_perf_ns - self.start_perf_ns
767-
768-
@cached_property
769-
def tokens_per_second(self) -> float | None:
770-
"""Get the number of tokens per second of the request."""
771-
if self.output_token_count is None or self.request_duration_ns == 0:
772-
return None
773-
return self.output_token_count / (self.request_duration_ns / NANOS_PER_SECOND)
774-
775772
@property
776773
def has_error(self) -> bool:
777774
"""Check if the response record has an error."""

src/aiperf/metrics/base_record_metric.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def _parse_record(
2929
record: ParsedResponseRecord,
3030
record_metrics: MetricRecordDict,
3131
) -> int:
32-
return record.input_token_count
32+
return record.token_counts.input
3333
```
3434
"""
3535

src/aiperf/metrics/types/input_sequence_length_metric.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,10 @@ def _parse_record(
3636
Raises:
3737
NoMetricValue: If the record does not have an input token count.
3838
"""
39-
if record.input_token_count is None:
39+
if record.token_counts is None or record.token_counts.input is None:
4040
raise NoMetricValue("Input Token Count is not available for the record.")
4141

42-
return record.input_token_count
42+
return record.token_counts.input
4343

4444

4545
class TotalInputSequenceLengthMetric(DerivedSumMetric[int, InputSequenceLengthMetric]):

src/aiperf/metrics/types/output_sequence_length_metric.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,14 @@ def _parse_record(
4040
Raises:
4141
NoMetricValue: If the record does not have an output or reasoning token count.
4242
"""
43-
if record.output_token_count is None and record.reasoning_token_count is None:
43+
if record.token_counts is None or (
44+
record.token_counts.output is None and record.token_counts.reasoning is None
45+
):
4446
raise NoMetricValue(
4547
"Output and reasoning token counts are missing in the record."
4648
)
4749

48-
return (record.output_token_count or 0) + (record.reasoning_token_count or 0)
50+
return (record.token_counts.output or 0) + (record.token_counts.reasoning or 0)
4951

5052

5153
class TotalOutputSequenceLengthMetric(

src/aiperf/metrics/types/output_token_count.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,10 @@ def _parse_record(
4343
Raises:
4444
NoMetricValue: If the record does not have an output token count.
4545
"""
46-
if not record.output_token_count:
46+
if record.token_counts is None or not record.token_counts.output:
4747
raise NoMetricValue("Output token count is missing in the record.")
4848

49-
return record.output_token_count
49+
return record.token_counts.output
5050

5151

5252
class TotalOutputTokensMetric(DerivedSumMetric[int, OutputTokenCountMetric]):

0 commit comments

Comments
 (0)