Skip to content

Commit 722e375

Browse files
committed
feat: add --use-server-token-count CLI option to use server-reported token counts
1 parent 42c6829 commit 722e375

24 files changed

+693
-233
lines changed

docs/cli_options.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ The transport to use for the endpoint. If not provided, it will be auto-detected
6060

6161
Use the legacy 'max_tokens' field instead of 'max_completion_tokens' in request payloads. The OpenAI API now prefers 'max_completion_tokens', but some older APIs or implementations may require 'max_tokens'.
6262

63+
#### `--use-server-token-count`
64+
65+
Use server-reported token counts from API usage fields instead of client-side tokenization. When enabled, tokenizers are still loaded (needed for dataset generation) but tokenizer.encode() is not called for computing metrics. Token count fields will be None if the server does not provide usage information. For chat streaming endpoints, you may need to add `--extra-inputs '{"stream_options": {"include_usage": true}}'` to the CLI command.
66+
6367
## Input Options
6468

6569
#### `--extra-inputs` `<list>`

src/aiperf/common/config/config_defaults.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ class EndpointDefaults:
3939
TIMEOUT = 600.0
4040
API_KEY = None
4141
USE_LEGACY_MAX_TOKENS = False
42+
USE_SERVER_TOKEN_COUNT = False
4243

4344

4445
@dataclass(frozen=True)

src/aiperf/common/config/endpoint_config.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,3 +178,22 @@ def validate_streaming(self) -> Self:
178178
group=_CLI_GROUP,
179179
),
180180
] = EndpointDefaults.USE_LEGACY_MAX_TOKENS
181+
182+
use_server_token_count: Annotated[
183+
bool,
184+
Field(
185+
description=(
186+
"Use server-reported token counts from API usage fields instead of "
187+
"client-side tokenization. When enabled, tokenizers are still loaded "
188+
"(needed for dataset generation) but tokenizer.encode() is not called "
189+
"for computing metrics. Token count fields will be None if the server "
190+
"does not provide usage information. For chat streaming endpoints, you may "
191+
'need to add `--extra-inputs \'{"stream_options": {"include_usage": true}}\'` '
192+
"to the CLI command."
193+
),
194+
),
195+
CLIParameter(
196+
name=("--use-server-token-count",),
197+
group=_CLI_GROUP,
198+
),
199+
] = EndpointDefaults.USE_SERVER_TOKEN_COUNT

src/aiperf/common/enums/metric_enums.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,9 @@ class MetricFlags(Flag):
678678
SUPPORTS_VIDEO_ONLY = 1 << 13
679679
"""Metrics that are only applicable to video-based endpoints."""
680680

681+
USAGE_DIFF_ONLY = 1 << 14
682+
"""Metrics that are only applicable when client-side tokenization is enabled and the usage field is used."""
683+
681684
def has_flags(self, flags: "MetricFlags") -> bool:
682685
"""Return True if the metric has ALL of the given flag(s) (regardless of other flags)."""
683686
# Bitwise AND will return the input flags only if all of the given flags are present.

src/aiperf/common/models/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@
9393
SSEMessage,
9494
TextResponse,
9595
TextResponseData,
96+
TokenCounts,
9697
)
9798
from aiperf.common.models.sequence_distribution import (
9899
DistributionParser,
@@ -197,6 +198,7 @@
197198
"TextResponseData",
198199
"TimesliceCollectionExportData",
199200
"TimesliceData",
201+
"TokenCounts",
200202
"TransportMetadata",
201203
"Turn",
202204
"Usage",

src/aiperf/common/models/record_models.py

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from typing_extensions import Self
1717

1818
from aiperf.common.aiperf_logger import AIPerfLogger
19-
from aiperf.common.constants import NANOS_PER_SECOND, STAT_KEYS
19+
from aiperf.common.constants import STAT_KEYS
2020
from aiperf.common.enums import CreditPhase, SSEFieldType
2121
from aiperf.common.enums.metric_enums import MetricValueTypeT
2222
from aiperf.common.exceptions import InvalidInferenceResultError
@@ -707,24 +707,33 @@ class ParsedResponse(AIPerfBaseModel):
707707
)
708708

709709

710-
class ParsedResponseRecord(AIPerfBaseModel):
711-
"""Record of a request and its associated responses, already parsed and ready for metrics."""
710+
class TokenCounts(AIPerfBaseModel):
711+
"""Token counts for a record."""
712712

713-
request: RequestRecord = Field(description="The original request record")
714-
responses: list[ParsedResponse] = Field(description="The parsed responses.")
715-
input_token_count: int | None = Field(
713+
input: int | None = Field(
716714
default=None,
717715
description="The number of tokens in the input (client-side tokenization). If None, the number of tokens could not be calculated.",
718716
)
719-
output_token_count: int | None = Field(
717+
output: int | None = Field(
720718
default=None,
721719
description="The number of output tokens across all responses (client-side tokenization). If None, the number of tokens could not be calculated.",
722720
)
723-
reasoning_token_count: int | None = Field(
721+
reasoning: int | None = Field(
724722
default=None,
725723
description="The number of reasoning tokens across all responses (client-side tokenization). If None, the number of tokens could not be calculated, or the model does not support reasoning.",
726724
)
727725

726+
727+
class ParsedResponseRecord(AIPerfBaseModel):
728+
"""Record of a request and its associated responses, already parsed and ready for metrics."""
729+
730+
request: RequestRecord = Field(..., description="The original request record")
731+
responses: list[ParsedResponse] = Field(..., description="The parsed responses.")
732+
token_counts: TokenCounts | None = Field(
733+
default=None,
734+
description="The token counts for the response. If None, the token counts could not be calculated.",
735+
)
736+
728737
@cached_property
729738
def start_perf_ns(self) -> int:
730739
"""Get the start time of the request in nanoseconds (perf_counter_ns)."""
@@ -760,18 +769,6 @@ def content_responses(self) -> list[ParsedResponse]:
760769
"""
761770
return [response for response in self.responses if response.data]
762771

763-
@cached_property
764-
def request_duration_ns(self) -> int:
765-
"""Get the duration of the request in nanoseconds."""
766-
return self.end_perf_ns - self.start_perf_ns
767-
768-
@cached_property
769-
def tokens_per_second(self) -> float | None:
770-
"""Get the number of tokens per second of the request."""
771-
if self.output_token_count is None or self.request_duration_ns == 0:
772-
return None
773-
return self.output_token_count / (self.request_duration_ns / NANOS_PER_SECOND)
774-
775772
@property
776773
def has_error(self) -> bool:
777774
"""Check if the response record has an error."""

src/aiperf/metrics/base_record_metric.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def _parse_record(
2929
record: ParsedResponseRecord,
3030
record_metrics: MetricRecordDict,
3131
) -> int:
32-
return record.input_token_count
32+
return record.token_counts.input
3333
```
3434
"""
3535

src/aiperf/metrics/types/input_sequence_length_metric.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,10 @@ def _parse_record(
3636
Raises:
3737
NoMetricValue: If the record does not have an input token count.
3838
"""
39-
if record.input_token_count is None:
39+
if record.token_counts is None or record.token_counts.input is None:
4040
raise NoMetricValue("Input Token Count is not available for the record.")
4141

42-
return record.input_token_count
42+
return record.token_counts.input
4343

4444

4545
class TotalInputSequenceLengthMetric(DerivedSumMetric[int, InputSequenceLengthMetric]):

src/aiperf/metrics/types/output_sequence_length_metric.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,14 @@ def _parse_record(
4040
Raises:
4141
NoMetricValue: If the record does not have an output or reasoning token count.
4242
"""
43-
if record.output_token_count is None and record.reasoning_token_count is None:
43+
if record.token_counts is None or (
44+
record.token_counts.output is None and record.token_counts.reasoning is None
45+
):
4446
raise NoMetricValue(
4547
"Output and reasoning token counts are missing in the record."
4648
)
4749

48-
return (record.output_token_count or 0) + (record.reasoning_token_count or 0)
50+
return (record.token_counts.output or 0) + (record.token_counts.reasoning or 0)
4951

5052

5153
class TotalOutputSequenceLengthMetric(

src/aiperf/metrics/types/output_token_count.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,10 @@ def _parse_record(
4343
Raises:
4444
NoMetricValue: If the record does not have an output token count.
4545
"""
46-
if not record.output_token_count:
46+
if record.token_counts is None or not record.token_counts.output:
4747
raise NoMetricValue("Output token count is missing in the record.")
4848

49-
return record.output_token_count
49+
return record.token_counts.output
5050

5151

5252
class TotalOutputTokensMetric(DerivedSumMetric[int, OutputTokenCountMetric]):

0 commit comments

Comments
 (0)