Skip to content

Commit 7c41b50

Browse files
committed
Remove has_timing_data from DatasetMetadata
1 parent 20f7901 commit 7c41b50

File tree

4 files changed

+2
-32
lines changed

4 files changed

+2
-32
lines changed

src/aiperf/dataset/dataset_manager.py

Lines changed: 2 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
ServiceType,
1717
)
1818
from aiperf.common.environment import Environment
19-
from aiperf.common.factories import ComposerFactory, ServiceFactory
19+
from aiperf.common.factories import ComposerFactory, EndpointFactory, ServiceFactory
2020
from aiperf.common.hooks import on_command, on_request
2121
from aiperf.common.messages import (
2222
ConversationRequestMessage,
@@ -35,7 +35,7 @@
3535
RequestInfo,
3636
SessionPayloads,
3737
)
38-
from aiperf.common.protocols import ServiceProtocol
38+
from aiperf.common.protocols import EndpointProtocol, ServiceProtocol
3939
from aiperf.common.tokenizer import Tokenizer
4040
from aiperf.dataset.loader import ShareGPTLoader
4141

@@ -67,7 +67,6 @@ def __init__(
6767
self.dataset: dict[str, Conversation] = {} # session ID -> Conversation mapping
6868
self.dataset_metadata: DatasetMetadata | None = None
6969
self._session_ids_cache: list[str] = []
70-
self._has_timing_data: bool = False
7170
self.dataset_configured = asyncio.Event()
7271

7372
@on_command(CommandType.PROFILE_CONFIGURE)
@@ -109,8 +108,6 @@ def _generate_input_payloads(
109108
) -> InputsFile:
110109
"""Generate input payloads from the dataset for use in the inputs.json file."""
111110
inputs = InputsFile()
112-
from aiperf.common.factories import EndpointFactory
113-
from aiperf.common.protocols import EndpointProtocol
114111

115112
endpoint: EndpointProtocol = EndpointFactory.create_instance(
116113
model_endpoint.endpoint.type,
@@ -240,22 +237,13 @@ async def _configure_dataset(self) -> None:
240237
self._session_ids_cache = [
241238
conversation.session_id for conversation in conversations
242239
]
243-
# Check if all conversations have timing data (first turn must have a timestamp)
244-
# Empty conversations list should be treated as having no timing data
245-
# TODO: This is a temporary solution to check if the dataset has timing data (to be used with fixed schedule strategy)
246-
self._has_timing_data = len(conversations) > 0 and all(
247-
len(conversation.turns) > 0 and conversation.turns[0].timestamp is not None
248-
for conversation in conversations
249-
)
250240

251241
self.dataset_metadata = DatasetMetadata(
252242
conversations=[conversation.metadata() for conversation in conversations],
253243
sampling_strategy=self.user_config.input.dataset_sampling_strategy,
254-
has_timing_data=self._has_timing_data,
255244
)
256245
metadata = self.dataset_metadata
257246
self.info(
258-
f"Dataset metadata: has timing data: {metadata.has_timing_data}, "
259247
f"sampling strategy: {metadata.sampling_strategy}, "
260248
f"unique conversations: {len(metadata.conversations)}, "
261249
f"unique turn count: {sum(len(conversation.turns) for conversation in metadata.conversations)}"

tests/unit/dataset/test_dataset_manager.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,6 @@ async def test_dataset_configured_notification_for_multi_turn_conversations(
4444
- Include one ConversationMetadata per conversation (not one per turn)
4545
- Include the first_turn_timestamp and turn_delays for each conversation
4646
- Have the correct turn count for each conversation
47-
- Mark has_timing_data as True
4847
"""
4948
# Mock the tokenizer to avoid HTTP requests
5049
mock_tokenizer_from_pretrained.return_value = (
@@ -104,7 +103,6 @@ async def mock_publish(msg):
104103

105104
# Verify dataset metadata structure
106105
assert len(metadata.conversations) == 2 # 2 conversations, not 5 turns
107-
assert metadata.has_timing_data is True
108106

109107
# Extract conversation metadata for easier testing
110108
conv_dict = {conv.conversation_id: conv for conv in metadata.conversations}

tests/unit/timing/conftest.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -299,7 +299,6 @@ def create_mock_dataset_metadata(
299299
return DatasetMetadata(
300300
conversations=conversations,
301301
sampling_strategy=sampling_strategy,
302-
has_timing_data=has_timing_data,
303302
)
304303

305304

@@ -345,5 +344,4 @@ def create_mock_dataset_metadata_with_schedule(
345344
return DatasetMetadata(
346345
conversations=conversations,
347346
sampling_strategy=sampling_strategy,
348-
has_timing_data=True,
349347
)

tests/unit/timing/test_dataset_metadata_flow.py

Lines changed: 0 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,6 @@ def test_create_basic_metadata_without_timing_data(self):
2121
)
2222

2323
assert len(metadata.conversations) == 3
24-
assert metadata.has_timing_data is False
2524
assert metadata.sampling_strategy == DatasetSamplingStrategy.SEQUENTIAL
2625

2726
# Verify all conversations are present
@@ -45,7 +44,6 @@ def test_create_metadata_with_timing_data(self):
4544
)
4645

4746
assert len(metadata.conversations) == 2
48-
assert metadata.has_timing_data is True
4947

5048
# Find conversations by ID
5149
conv_dict = {conv.conversation_id: conv for conv in metadata.conversations}
@@ -81,7 +79,6 @@ def test_create_metadata_from_schedule(self):
8179
metadata = create_mock_dataset_metadata_with_schedule(schedule)
8280

8381
assert len(metadata.conversations) == 3
84-
assert metadata.has_timing_data is True
8582

8683
# Find conversations by ID
8784
conv_dict = {conv.conversation_id: conv for conv in metadata.conversations}
@@ -111,7 +108,6 @@ def test_create_metadata_with_empty_conversation_list(self):
111108
)
112109

113110
assert len(metadata.conversations) == 0
114-
assert metadata.has_timing_data is False
115111

116112

117113
class TestConversationMetadataValidation:
@@ -186,12 +182,10 @@ def test_dataset_metadata_with_all_fields(self):
186182
metadata = DatasetMetadata(
187183
conversations=conversations,
188184
sampling_strategy=DatasetSamplingStrategy.RANDOM,
189-
has_timing_data=True,
190185
)
191186

192187
assert len(metadata.conversations) == 2
193188
assert metadata.sampling_strategy == DatasetSamplingStrategy.RANDOM
194-
assert metadata.has_timing_data is True
195189

196190
def test_dataset_metadata_default_values(self):
197191
"""Test dataset metadata default values."""
@@ -200,14 +194,12 @@ def test_dataset_metadata_default_values(self):
200194
)
201195

202196
assert len(metadata.conversations) == 0
203-
assert metadata.has_timing_data is False
204197

205198
def test_dataset_metadata_empty_conversations(self):
206199
"""Test dataset metadata with empty conversations list."""
207200
metadata = DatasetMetadata(
208201
conversations=[],
209202
sampling_strategy=DatasetSamplingStrategy.SEQUENTIAL,
210-
has_timing_data=False,
211203
)
212204

213205
assert len(metadata.conversations) == 0
@@ -223,7 +215,6 @@ def test_create_mock_dataset_metadata_default_parameters(self):
223215
)
224216

225217
assert len(metadata.conversations) == 2
226-
assert metadata.has_timing_data is False
227218
assert metadata.sampling_strategy == DatasetSamplingStrategy.SEQUENTIAL
228219

229220
for conv in metadata.conversations:
@@ -254,8 +245,6 @@ def test_create_mock_dataset_metadata_with_timing_data_complete(self):
254245
turn_counts=[3, 3],
255246
)
256247

257-
assert metadata.has_timing_data is True
258-
259248
conv_dict = {conv.conversation_id: conv for conv in metadata.conversations}
260249

261250
assert conv_dict["conv1"].turns[0].timestamp_ms == 0
@@ -270,7 +259,6 @@ def test_create_mock_dataset_metadata_with_schedule_simple(self):
270259
metadata = create_mock_dataset_metadata_with_schedule(schedule)
271260

272261
assert len(metadata.conversations) == 3
273-
assert metadata.has_timing_data is True
274262

275263
conv_dict = {conv.conversation_id: conv for conv in metadata.conversations}
276264

@@ -297,7 +285,6 @@ def test_create_mock_dataset_metadata_with_schedule_multi_turn(self):
297285
metadata = create_mock_dataset_metadata_with_schedule(schedule)
298286

299287
assert len(metadata.conversations) == 2
300-
assert metadata.has_timing_data is True
301288

302289
conv_dict = {conv.conversation_id: conv for conv in metadata.conversations}
303290

@@ -333,7 +320,6 @@ def test_create_mock_dataset_metadata_with_schedule_empty(self):
333320
metadata = create_mock_dataset_metadata_with_schedule([])
334321

335322
assert len(metadata.conversations) == 0
336-
assert metadata.has_timing_data is True
337323

338324

339325
class TestDatasetMetadataIntegration:

0 commit comments

Comments
 (0)