diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py
index 3868b14f29a..37ff1126a76 100644
--- a/ddtrace/llmobs/_constants.py
+++ b/ddtrace/llmobs/_constants.py
@@ -104,6 +104,7 @@ PROXY_REQUEST = "llmobs.proxy_request"
+# experiment span baggage keys to be propagated across boundaries
 EXPERIMENT_ID_KEY = "_ml_obs.experiment_id"
 EXPERIMENT_RUN_ID_KEY = "_ml_obs.experiment_run_id"
 EXPERIMENT_RUN_ITERATION_KEY = "_ml_obs.experiment_run_iteration"
@@ -111,6 +112,9 @@ EXPERIMENT_PROJECT_ID_KEY = "_ml_obs.experiment_project_id"
 EXPERIMENT_DATASET_NAME_KEY = "_ml_obs.experiment_dataset_name"
 EXPERIMENT_NAME_KEY = "_ml_obs.experiment_name"
+
+# experiment context keys
+EXPERIMENT_RECORD_METADATA = "_ml_obs.meta.metadata"
 EXPERIMENT_EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output"
 EXPERIMENTS_INPUT = "_ml_obs.meta.input"
 EXPERIMENTS_OUTPUT = "_ml_obs.meta.output"
diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index 4a079b2b626..ebf1239b182 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -24,6 +24,7 @@ from ddtrace.internal.logger import get_logger
 from ddtrace.llmobs._constants import DD_SITES_NEEDING_APP_SUBDOMAIN
 from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT
+from ddtrace.llmobs._constants import EXPERIMENT_RECORD_METADATA
 from ddtrace.llmobs._utils import convert_tags_dict_to_list
 from ddtrace.llmobs._utils import safe_json
 from ddtrace.version import __version__
@@ -488,7 +489,11 @@ def _process_record(self, idx_record: Tuple[int, DatasetRecord], run: _Experimen
             except Exception:
                 span.set_exc_info(*sys.exc_info())
             self._llmobs_instance.annotate(span, input_data=input_data, output_data=output_data, tags=tags)
-            span._set_ctx_item(EXPERIMENT_EXPECTED_OUTPUT, safe_json(record["expected_output"]))
+
+            span._set_ctx_item(EXPERIMENT_EXPECTED_OUTPUT, record["expected_output"])
+            if "metadata" in record:
+                span._set_ctx_item(EXPERIMENT_RECORD_METADATA, record["metadata"])
+
             return {
                 "idx": idx,
                 "span_id": span_id,
diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index 8336ee40991..44fd7ef8528 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -1738,9 +1738,9 @@ def _tag_freeform_io(cls, span, input_value=None, output_value=None):
         arbitrary structured or non structured IO values in its spans
         """
         if input_value is not None:
-            span._set_ctx_item(EXPERIMENTS_INPUT, safe_json(input_value))
+            span._set_ctx_item(EXPERIMENTS_INPUT, input_value)
         if output_value is not None:
-            span._set_ctx_item(EXPERIMENTS_OUTPUT, safe_json(output_value))
+            span._set_ctx_item(EXPERIMENTS_OUTPUT, output_value)
 
     @staticmethod
     def _set_dict_attribute(span: Span, key, value: Dict[str, Any]) -> None:
diff --git a/releasenotes/notes/llmobs-dne-experiments-fields-json-metadata-on-spans-53d679fd8e6ab202.yaml b/releasenotes/notes/llmobs-dne-experiments-fields-json-metadata-on-spans-53d679fd8e6ab202.yaml
new file mode 100644
index 00000000000..f59bc460387
--- /dev/null
+++ b/releasenotes/notes/llmobs-dne-experiments-fields-json-metadata-on-spans-53d679fd8e6ab202.yaml
@@ -0,0 +1,7 @@
+---
+upgrade:
+  - |
+    LLM Observability: Experiment spans now include metadata from the corresponding dataset record.
+  - |
+    LLM Observability: Experiment spans' input, output, and expected_output fields are now emitted as-is rather than
+    as JSON-serialized strings, so object values in any of these columns remain searchable in Datadog.
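For context on how these changes surface to SDK users, here is a minimal sketch mirroring the fixtures and assertions added in tests/llmobs/test_experiments.py below. The DatasetRecord import path, the LLMObs.enable() setup, and the task signature are assumptions for illustration and are not pinned down by this diff; the experiment and evaluator call shapes follow the tests.

```python
# Sketch only: based on the usage exercised in tests/llmobs/test_experiments.py.
from ddtrace.llmobs import LLMObs
from ddtrace.llmobs._experiment import DatasetRecord  # assumed import path

LLMObs.enable(ml_app="capitals-demo")  # assumed setup; API/app keys and site come from the environment

dataset = LLMObs.create_dataset(
    dataset_name="capitals-demo",
    description="A demo dataset",
    records=[
        DatasetRecord(
            input_data={"prompt": "What is the capital of France?"},
            expected_output={"answer": "Paris"},
            # With this change, the record's metadata is copied onto the experiment span
            # as a structured meta.metadata object (EXPERIMENT_RECORD_METADATA).
            metadata={"difficulty": "easy"},
        )
    ],
)


def task(input_data, config=None):  # assumed task signature
    return {"answer": "Paris"}


def exact_match(input_data, output_data, expected_output):
    return output_data == expected_output


experiment = LLMObs.experiment("capitals-experiment", task, dataset, [exact_match])
results = experiment.run()

# The emitted experiment span now carries dicts instead of JSON strings, e.g.:
#   meta.input           == {"prompt": "What is the capital of France?"}
#   meta.expected_output == {"answer": "Paris"}
#   meta.metadata        == {"difficulty": "easy"}
```

The commented expectations at the end correspond to the updated meta assertions in test_experiment_span_written_to_experiment_scope.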
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0969efc9-f104-45cc-b955-25b329e91293_batch_update_post_8be41af9.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0969efc9-f104-45cc-b955-25b329e91293_batch_update_post_8be41af9.yaml
new file mode 100644
index 00000000000..f6019ed1ab0
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0969efc9-f104-45cc-b955-25b329e91293_batch_update_post_8be41af9.yaml
@@ -0,0 +1,47 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "id": "0969efc9-f104-45cc-b955-25b329e91293",
+      "attributes": {"insert_records": [{"input": {"prompt": "What is the capital
+      of France?"}, "expected_output": {"answer": "Paris"}, "metadata": {"difficulty":
+      "easy"}}], "update_records": [], "delete_records": []}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+        - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '289'
+      ? !!python/object/apply:multidict._multidict.istr
+        - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/0969efc9-f104-45cc-b955-25b329e91293/batch_update
+  response:
+    body:
+      string: '{"data":[]}'
+    headers:
+      content-length:
+      - '11'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Tue, 09 Dec 2025 03:28:32 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py
index 0707820a551..67544c9c95f 100644
--- a/tests/llmobs/test_experiments.py
+++ b/tests/llmobs/test_experiments.py
@@ -92,7 +92,9 @@ def test_dataset_name(request) -> str:
 @pytest.fixture
 def test_dataset(llmobs, test_dataset_records, test_dataset_name) -> Generator[Dataset, None, None]:
     ds = llmobs.create_dataset(
-        dataset_name=test_dataset_name, description="A test dataset", records=test_dataset_records
+        dataset_name=test_dataset_name,
+        description="A test dataset",
+        records=test_dataset_records,
     )
 
     # When recording the requests, we need to wait for the dataset to be queryable.
@@ -106,7 +108,27 @@ def test_dataset(llmobs, test_dataset_records, test_dataset_name) -> Generator[D
 @pytest.fixture
 def test_dataset_one_record(llmobs):
     records = [
-        DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})
+        DatasetRecord(
+            input_data={"prompt": "What is the capital of France?"},
+            expected_output={"answer": "Paris"},
+        )
+    ]
+    ds = llmobs.create_dataset(dataset_name="test-dataset-123", description="A test dataset", records=records)
+    wait_for_backend()
+
+    yield ds
+
+    llmobs._delete_dataset(dataset_id=ds._id)
+
+
+@pytest.fixture
+def test_dataset_one_record_w_metadata(llmobs):
+    records = [
+        DatasetRecord(
+            input_data={"prompt": "What is the capital of France?"},
+            expected_output={"answer": "Paris"},
+            metadata={"difficulty": "easy"},
+        )
     ]
     ds = llmobs.create_dataset(dataset_name="test-dataset-123", description="A test dataset", records=records)
     wait_for_backend()
@@ -120,11 +142,15 @@ def test_dataset_one_record(llmobs):
 def test_dataset_one_record_separate_project(llmobs):
     records = [
         DatasetRecord(
-            input_data={"prompt": "What is the capital of Massachusetts?"}, expected_output={"answer": "Boston"}
+            input_data={"prompt": "What is the capital of Massachusetts?"},
+            expected_output={"answer": "Boston"},
         )
     ]
     ds = llmobs.create_dataset(
-        dataset_name="test-dataset-857", project_name="boston-project", description="A boston dataset", records=records
+        dataset_name="test-dataset-857",
+        project_name="boston-project",
+        description="A boston dataset",
+        records=records,
     )
     wait_for_backend()
@@ -182,7 +208,9 @@ def test_dataset_create_delete(llmobs):
 def test_dataset_create_delete_project_override(llmobs):
     dataset = llmobs.create_dataset(
-        dataset_name="test-dataset-2", project_name="second project", description="A second test dataset"
+        dataset_name="test-dataset-2",
+        project_name="second project",
+        description="A second test dataset",
     )
     assert dataset._id is not None
     assert dataset.url == f"https://app.datadoghq.com/llm/datasets/{dataset._id}"
@@ -216,7 +244,10 @@ def test_csv_dataset_as_dataframe(llmobs, tmp_csv_file_for_upload):
     csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
     dataset_id = None
-    with mock.patch("ddtrace.llmobs._writer.tempfile.NamedTemporaryFile", return_value=tmp_csv_file_for_upload):
+    with mock.patch(
+        "ddtrace.llmobs._writer.tempfile.NamedTemporaryFile",
+        return_value=tmp_csv_file_for_upload,
+    ):
         try:
             dataset = llmobs.create_dataset_from_csv(
                 csv_path=csv_path,
@@ -247,7 +278,10 @@ def test_csv_dataset_as_dataframe(llmobs, tmp_csv_file_for_upload):
 def test_dataset_csv_missing_input_col(llmobs):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
-    with pytest.raises(ValueError, match=re.escape("Input columns not found in CSV header: ['in998', 'in999']")):
+    with pytest.raises(
+        ValueError,
+        match=re.escape("Input columns not found in CSV header: ['in998', 'in999']"),
+    ):
         llmobs.create_dataset_from_csv(
             csv_path=csv_path,
             dataset_name="test-dataset-good-csv",
@@ -260,7 +294,10 @@ def test_dataset_csv_missing_output_col(llmobs):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
-    with pytest.raises(ValueError, match=re.escape("Expected output columns not found in CSV header: ['out999']")):
+    with pytest.raises(
+        ValueError,
+        match=re.escape("Expected output columns not found in CSV header: ['out999']"),
+    ):
         llmobs.create_dataset_from_csv(
             csv_path=csv_path,
             dataset_name="test-dataset-good-csv",
@@ -273,7 +310,10 @@ def test_dataset_csv_empty_csv(llmobs):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/empty.csv")
-    with pytest.raises(ValueError, match=re.escape("CSV file appears to be empty or header is missing.")):
+    with pytest.raises(
+        ValueError,
+        match=re.escape("CSV file appears to be empty or header is missing."),
+    ):
         llmobs.create_dataset_from_csv(
             csv_path=csv_path,
             dataset_name="test-dataset-empty-csv",
@@ -287,7 +327,10 @@ def test_dataset_csv_no_expected_output(llmobs, tmp_csv_file_for_upload):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
     dataset_id = None
-    with mock.patch("ddtrace.llmobs._writer.tempfile.NamedTemporaryFile", return_value=tmp_csv_file_for_upload):
+    with mock.patch(
+        "ddtrace.llmobs._writer.tempfile.NamedTemporaryFile",
+        return_value=tmp_csv_file_for_upload,
+    ):
         try:
             dataset = llmobs.create_dataset_from_csv(
                 csv_path=csv_path,
@@ -328,7 +371,10 @@ def test_dataset_csv(llmobs, tmp_csv_file_for_upload):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
     dataset_id = None
-    with mock.patch("ddtrace.llmobs._writer.tempfile.NamedTemporaryFile", return_value=tmp_csv_file_for_upload):
+    with mock.patch(
+        "ddtrace.llmobs._writer.tempfile.NamedTemporaryFile",
+        return_value=tmp_csv_file_for_upload,
+    ):
         try:
             dataset = llmobs.create_dataset_from_csv(
                 csv_path=csv_path,
@@ -376,7 +422,10 @@ def test_dataset_csv_pipe_separated(llmobs, tmp_csv_file_for_upload):
     test_path = os.path.dirname(__file__)
     csv_path = os.path.join(test_path, "static_files/good_dataset_pipe_separated.csv")
     dataset_id = None
-    with mock.patch("ddtrace.llmobs._writer.tempfile.NamedTemporaryFile", return_value=tmp_csv_file_for_upload):
+    with mock.patch(
+        "ddtrace.llmobs._writer.tempfile.NamedTemporaryFile",
+        return_value=tmp_csv_file_for_upload,
+    ):
         try:
             dataset = llmobs.create_dataset_from_csv(
                 csv_path=csv_path,
@@ -477,7 +526,14 @@ def test_dataset_pull_exists_with_record(llmobs, test_dataset_one_record):
 @pytest.mark.parametrize(
     "test_dataset_records",
-    [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]],
+    [
+        [
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of France?"},
+                expected_output={"answer": "Paris"},
+            )
+        ]
+    ],
 )
 def test_dataset_pull_w_versions(llmobs, test_dataset, test_dataset_records):
     assert len(test_dataset) == 1
@@ -487,7 +543,10 @@ def test_dataset_pull_w_versions(llmobs, test_dataset, test_dataset_records):
     assert test_dataset.version == 1
     test_dataset.append(
-        {"input_data": {"prompt": "What is the capital of China?"}, "expected_output": {"answer": "Beijing"}}
+        {
+            "input_data": {"prompt": "What is the capital of China?"},
+            "expected_output": {"answer": "Beijing"},
+        }
     )
     test_dataset.push()
     wait_for_backend(4)
@@ -515,18 +574,27 @@ def test_dataset_pull_w_versions(llmobs, test_dataset, test_dataset_records):
 @pytest.mark.parametrize(
     "test_dataset_records",
-    [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]],
+    [
+        [
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of France?"},
+                expected_output={"answer": "Paris"},
+            )
+        ]
+    ],
 )
 def test_dataset_pull_w_invalid_version(llmobs, test_dataset, test_dataset_records):
     with pytest.raises(
-        ValueError, match="Failed to pull dataset records for.*version is greater than the current version or negative"
+        ValueError,
+        match="Failed to pull dataset records for.*version is greater than the current version or negative",
     ):
         llmobs.pull_dataset(dataset_name=test_dataset.name, version=420)
 
 
 def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_project):
     dataset = llmobs.pull_dataset(
-        dataset_name=test_dataset_one_record_separate_project.name, project_name="boston-project"
+        dataset_name=test_dataset_one_record_separate_project.name,
+        project_name="boston-project",
     )
     assert dataset.project.get("name") == "boston-project"
     assert dataset.project.get("_id")
@@ -543,9 +611,13 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj
     "test_dataset_records",
     [
         [
-            DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"}),
             DatasetRecord(
-                input_data={"prompt": "What is the capital of China?"}, expected_output={"answer": "Beijing"}
+                input_data={"prompt": "What is the capital of France?"},
+                expected_output={"answer": "Paris"},
+            ),
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of China?"},
+                expected_output={"answer": "Beijing"},
             ),
         ]
     ],
 )
@@ -571,7 +643,11 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase
     test_dataset.update(0, {"expected_output": {"answer": "Berlin"}})
     test_dataset.update(
-        1, {"input_data": {"prompt": "What is the capital of Mexico?"}, "metadata": {"difficulty": "easy"}}
+        1,
+        {
+            "input_data": {"prompt": "What is the capital of Mexico?"},
+            "metadata": {"difficulty": "easy"},
+        },
     )
     assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"}
     assert test_dataset[0]["expected_output"] == {"answer": "Berlin"}
@@ -628,7 +704,14 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase
 @pytest.mark.parametrize(
     "test_dataset_records",
-    [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]],
+    [
+        [
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of France?"},
+                expected_output={"answer": "Paris"},
+            )
+        ]
+    ],
 )
 def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records):
     assert test_dataset.latest_version == 1
@@ -636,7 +719,10 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records
     test_dataset.update(
         0,
-        DatasetRecord(input_data={"prompt": "What is the capital of Germany?"}, expected_output={"answer": "Berlin"}),
+        DatasetRecord(
+            input_data={"prompt": "What is the capital of Germany?"},
+            expected_output={"answer": "Berlin"},
+        ),
     )
     assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"}
     assert test_dataset[0]["expected_output"] == {"answer": "Berlin"}
@@ -668,7 +754,14 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records
 @pytest.mark.parametrize(
     "test_dataset_records",
-    [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]],
+    [
+        [
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of France?"},
+                expected_output={"answer": "Paris"},
+            )
+        ]
+    ],
 )
 def test_dataset_modify_single_record_empty_record(llmobs, test_dataset, test_dataset_records):
     assert test_dataset.latest_version == 1
@@ -684,14 +777,24 @@ def test_dataset_modify_single_record_empty_record(llmobs, test_dataset, test_da
 def test_dataset_estimate_size(llmobs, test_dataset):
     test_dataset.append(
-        {"input_data": {"prompt": "What is the capital of France?"}, "expected_output": {"answer": "Paris"}}
+        {
+            "input_data": {"prompt": "What is the capital of France?"},
+            "expected_output": {"answer": "Paris"},
+        }
     )
     assert 170 <= test_dataset._estimate_delta_size() <= 200
 
 
 @pytest.mark.parametrize(
     "test_dataset_records",
-    [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]],
+    [
+        [
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of France?"},
+                expected_output={"answer": "Paris"},
+            )
+        ]
+    ],
 )
 def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_records):
     assert test_dataset.latest_version == 1
@@ -774,11 +877,21 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor
 @pytest.mark.parametrize(
     "test_dataset_records",
-    [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]],
+    [
+        [
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of France?"},
+                expected_output={"answer": "Paris"},
+            )
+        ]
+    ],
 )
 def test_dataset_append(llmobs, test_dataset):
     test_dataset.append(
-        DatasetRecord(input_data={"prompt": "What is the capital of Italy?"}, expected_output={"answer": "Rome"})
+        DatasetRecord(
+            input_data={"prompt": "What is the capital of Italy?"},
+            expected_output={"answer": "Rome"},
+        )
     )
     assert len(test_dataset) == 2
     assert test_dataset.latest_version == 1
@@ -811,14 +924,25 @@ def test_dataset_append(llmobs, test_dataset):
 @pytest.mark.parametrize(
     "test_dataset_records",
-    [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]],
+    [
+        [
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of France?"},
+                expected_output={"answer": "Paris"},
+            )
+        ]
+    ],
 )
 def test_dataset_extend(llmobs, test_dataset):
     test_dataset.extend(
         [
-            DatasetRecord(input_data={"prompt": "What is the capital of Italy?"}, expected_output={"answer": "Rome"}),
             DatasetRecord(
-                input_data={"prompt": "What is the capital of Sweden?"}, expected_output={"answer": "Stockholm"}
+                input_data={"prompt": "What is the capital of Italy?"},
+                expected_output={"answer": "Rome"},
+            ),
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of Sweden?"},
+                expected_output={"answer": "Stockholm"},
             ),
         ]
     )
@@ -856,7 +980,14 @@ def test_dataset_extend(llmobs, test_dataset):
 @pytest.mark.parametrize(
     "test_dataset_records",
-    [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]],
+    [
+        [
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of France?"},
+                expected_output={"answer": "Paris"},
+            )
+        ]
+    ],
 )
 def test_dataset_append_no_expected_output(llmobs, test_dataset):
     test_dataset.append(DatasetRecord(input_data={"prompt": "What is the capital of Sealand?"}))
@@ -895,8 +1026,14 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset):
     "test_dataset_records",
     [
         [
-            DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"}),
-            DatasetRecord(input_data={"prompt": "What is the capital of Italy?"}, expected_output={"answer": "Rome"}),
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of France?"},
+                expected_output={"answer": "Paris"},
+            ),
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of Italy?"},
+                expected_output={"answer": "Rome"},
+            ),
         ],
     ],
 )
@@ -965,8 +1102,14 @@ def test_dataset_delete_no_expected_output(llmobs, test_dataset):
     "test_dataset_records",
     [
         [
-            DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"}),
-            DatasetRecord(input_data={"prompt": "What is the capital of Italy?"}, expected_output={"answer": "Rome"}),
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of France?"},
+                expected_output={"answer": "Paris"},
+            ),
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of Italy?"},
+                expected_output={"answer": "Rome"},
+            ),
         ],
     ],
 )
@@ -1006,8 +1149,14 @@ def test_dataset_delete_after_update(llmobs, test_dataset):
     "test_dataset_records",
     [
         [
-            DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"}),
-            DatasetRecord(input_data={"prompt": "What is the capital of Italy?"}, expected_output={"answer": "Rome"}),
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of France?"},
+                expected_output={"answer": "Paris"},
+            ),
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of Italy?"},
+                expected_output={"answer": "Rome"},
+            ),
         ],
     ],
 )
@@ -1104,20 +1253,33 @@ def my_evaluator_missing_expected_output(input_data, output_data):
             pass
 
         llmobs.experiment(
-            "test_experiment", dummy_task, test_dataset_one_record, [my_evaluator_missing_expected_output]
+            "test_experiment",
+            dummy_task,
+            test_dataset_one_record,
+            [my_evaluator_missing_expected_output],
         )
 
     with pytest.raises(TypeError, match=re.escape(expected_err)):
         def my_evaluator_missing_input(output_data, expected_output):
             pass
 
-        llmobs.experiment("test_experiment", dummy_task, test_dataset_one_record, [my_evaluator_missing_input])
+        llmobs.experiment(
+            "test_experiment",
+            dummy_task,
+            test_dataset_one_record,
+            [my_evaluator_missing_input],
+        )
 
     with pytest.raises(TypeError, match=re.escape(expected_err)):
         def my_evaluator_missing_output(input_data, expected_output):
             pass
 
-        llmobs.experiment("test_experiment", dummy_task, test_dataset_one_record, [my_evaluator_missing_output])
+        llmobs.experiment(
+            "test_experiment",
+            dummy_task,
+            test_dataset_one_record,
+            [my_evaluator_missing_output],
+        )
 
 
 def test_project_name_set(run_python_code_in_subprocess):
@@ -1167,7 +1329,13 @@ def test_project_name_not_set_env(ddtrace_run_python_code_in_subprocess):
     pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))]
     if "PYTHONPATH" in env:
         pypath.append(env["PYTHONPATH"])
-    env.update({"PYTHONPATH": ":".join(pypath), "DD_TRACE_ENABLED": "0", "DD_LLMOBS_ENABLED": "1"})
+    env.update(
+        {
+            "PYTHONPATH": ":".join(pypath),
+            "DD_TRACE_ENABLED": "0",
+            "DD_LLMOBS_ENABLED": "1",
+        }
+    )
     out, err, status, pid = ddtrace_run_python_code_in_subprocess(
         """
 from ddtrace.llmobs import LLMObs
@@ -1222,9 +1390,13 @@ def test_experiment_create(llmobs, test_dataset_one_record):
     "test_dataset_records",
     [
         [
-            DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"}),
             DatasetRecord(
-                input_data={"prompt": "What is the capital of Canada?"}, expected_output={"answer": "Ottawa"}
+                input_data={"prompt": "What is the capital of France?"},
+                expected_output={"answer": "Paris"},
+            ),
+            DatasetRecord(
+                input_data={"prompt": "What is the capital of Canada?"},
+                expected_output={"answer": "Ottawa"},
             ),
         ]
     ],
 )
@@ -1295,7 +1467,10 @@ def test_experiment_run_task_error_raises(llmobs, test_dataset_one_record):
     exp = llmobs.experiment("test_experiment", faulty_task, test_dataset_one_record, [dummy_evaluator])
     with pytest.raises(
         RuntimeError,
-        match=re.compile("Error on record 0: This is a test error\n.*ValueError.*in faulty_task.*", flags=re.DOTALL),
+        match=re.compile(
+            "Error on record 0: This is a test error\n.*ValueError.*in faulty_task.*",
+            flags=re.DOTALL,
+        ),
     ):
         exp._run_task(1, run=run_info_with_stable_id(0), raise_errors=True)
@@ -1306,7 +1481,10 @@ def test_experiment_run_evaluators(llmobs, test_dataset_one_record):
     assert len(task_results) == 1
     eval_results = exp._run_evaluators(task_results, raise_errors=False)
     assert len(eval_results) == 1
-    assert eval_results[0] == {"idx": 0, "evaluations": {"dummy_evaluator": {"value": False, "error": None}}}
+    assert eval_results[0] == {
+        "idx": 0,
+        "evaluations": {"dummy_evaluator": {"value": False, "error": None}},
+    }
 
 
 def test_experiment_run_summary_evaluators(llmobs, test_dataset_one_record):
@@ -1321,7 +1499,10 @@ def test_experiment_run_summary_evaluators(llmobs, test_dataset_one_record):
     assert len(task_results) == 1
     eval_results = exp._run_evaluators(task_results, raise_errors=False)
    assert len(eval_results) == 1
-    assert eval_results[0] == {"idx": 0, "evaluations": {"dummy_evaluator": {"value": False, "error": None}}}
+    assert eval_results[0] == {
+        "idx": 0,
+        "evaluations": {"dummy_evaluator": {"value": False, "error": None}},
+    }
     summary_eval_results = exp._run_summary_evaluators(task_results, eval_results, raise_errors=False)
     assert len(summary_eval_results) == 1
     assert summary_eval_results[0] == {
@@ -1336,7 +1517,10 @@ def test_experiment_run_evaluators_error(llmobs, test_dataset_one_record):
     assert len(task_results) == 1
     eval_results = exp._run_evaluators(task_results, raise_errors=False)
     assert len(eval_results) == 1
-    assert eval_results[0] == {"idx": 0, "evaluations": {"faulty_evaluator": {"value": None, "error": mock.ANY}}}
+    assert eval_results[0] == {
+        "idx": 0,
+        "evaluations": {"faulty_evaluator": {"value": None, "error": mock.ANY}},
+    }
     err = eval_results[0]["evaluations"]["faulty_evaluator"]["error"]
     assert err["message"] == "This is a test error in evaluator"
     assert err["type"] == "ValueError"
@@ -1355,7 +1539,10 @@ def test_experiment_run_summary_evaluators_error(llmobs, test_dataset_one_record
     assert len(task_results) == 1
     eval_results = exp._run_evaluators(task_results, raise_errors=False)
     assert len(eval_results) == 1
-    assert eval_results[0] == {"idx": 0, "evaluations": {"dummy_evaluator": {"value": False, "error": None}}}
+    assert eval_results[0] == {
+        "idx": 0,
+        "evaluations": {"dummy_evaluator": {"value": False, "error": None}},
+    }
     summary_eval_results = exp._run_summary_evaluators(task_results, eval_results, raise_errors=False)
     assert summary_eval_results[0] == {
         "idx": 0,
@@ -1379,11 +1566,19 @@ def test_experiment_summary_evaluators_missing_eval_error(llmobs, test_dataset_o
     assert len(task_results) == 1
     eval_results = exp._run_evaluators(task_results, raise_errors=False)
     assert len(eval_results) == 1
-    assert eval_results[0] == {"idx": 0, "evaluations": {"dummy_evaluator": {"value": False, "error": None}}}
+    assert eval_results[0] == {
+        "idx": 0,
+        "evaluations": {"dummy_evaluator": {"value": False, "error": None}},
+    }
     summary_eval_results = exp._run_summary_evaluators(task_results, eval_results, raise_errors=False)
     assert summary_eval_results[0] == {
         "idx": 0,
-        "evaluations": {"dummy_summary_evaluator_using_missing_eval_results": {"value": None, "error": mock.ANY}},
+        "evaluations": {
+            "dummy_summary_evaluator_using_missing_eval_results": {
+                "value": None,
+                "error": mock.ANY,
+            }
+        },
     }
     err = summary_eval_results[0]["evaluations"]["dummy_summary_evaluator_using_missing_eval_results"]["error"]
     assert err["message"] == "'non_existent_evaluator'"
@@ -1426,7 +1621,8 @@ def test_experiment_summary_eval_missing_results_raises(llmobs, test_dataset_one
     assert len(task_results) == 1
     eval_results = exp._run_evaluators(task_results, raise_errors=False)
     with pytest.raises(
-        RuntimeError, match="Summary evaluator dummy_summary_evaluator_using_missing_eval_results failed"
+        RuntimeError,
+        match="Summary evaluator dummy_summary_evaluator_using_missing_eval_results failed",
     ):
         exp._run_summary_evaluators(task_results, eval_results, raise_errors=True)
@@ -1513,7 +1709,12 @@ def test_experiment_run(llmobs, test_dataset_one_record):
     with mock.patch("ddtrace.llmobs._experiment._ExperimentRunInfo") as mock_experiment_run_info:
         # this is to ensure that the UUID for the run is always the same
         mock_experiment_run_info.return_value = run_info_with_stable_id(0)
-        exp = llmobs.experiment("test_experiment", dummy_task, test_dataset_one_record, [dummy_evaluator])
+        exp = llmobs.experiment(
+            "test_experiment",
+            dummy_task,
+            test_dataset_one_record,
+            [dummy_evaluator],
+        )
         exp._tags = {"ddtrace.version": "1.2.3"}  # FIXME: this is a hack to set the tags for the experiment
         exp_results = exp.run()
@@ -1623,9 +1824,14 @@ def test_experiment_run_w_summary(llmobs, test_dataset_one_record):
     assert exp.url == f"https://app.datadoghq.com/llm/experiments/{exp._id}"
 
 
-def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test_dataset_one_record):
+def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test_dataset_one_record_w_metadata):
     """Assert that the experiment span includes expected output field and includes the experiment scope."""
-    exp = llmobs.experiment("test_experiment", dummy_task, test_dataset_one_record, [dummy_evaluator])
+    exp = llmobs.experiment(
+        "test_experiment",
+        dummy_task,
+        test_dataset_one_record_w_metadata,
+        [dummy_evaluator],
+    )
     exp._id = "1234567890"
     exp._run_task(1, run=run_info_with_stable_id(0), raise_errors=False)
     assert len(llmobs_events) == 1
@@ -1634,14 +1840,15 @@ def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test
     for key in ("span_id", "trace_id", "parent_id", "start_ns", "duration", "metrics"):
         assert event[key] == mock.ANY
     assert event["status"] == "ok"
-    assert event["meta"]["input"] == '{"prompt": "What is the capital of France?"}'
-    assert event["meta"]["output"] == '{"prompt": "What is the capital of France?"}'
-    assert event["meta"]["expected_output"] == '{"answer": "Paris"}'
-    assert "dataset_name:{}".format(test_dataset_one_record.name) in event["tags"]
+    assert event["meta"]["input"] == {"prompt": "What is the capital of France?"}
+    assert event["meta"]["output"] == {"prompt": "What is the capital of France?"}
+    assert event["meta"]["expected_output"] == {"answer": "Paris"}
+    assert event["meta"]["metadata"] == {"difficulty": "easy"}
+    assert "dataset_name:{}".format(test_dataset_one_record_w_metadata.name) in event["tags"]
     assert "project_name:test-project" in event["tags"]
     assert "experiment_name:test_experiment" in event["tags"]
-    assert "dataset_id:{}".format(test_dataset_one_record._id) in event["tags"]
-    assert "dataset_record_id:{}".format(test_dataset_one_record._records[0]["record_id"]) in event["tags"]
+    assert "dataset_id:{}".format(test_dataset_one_record_w_metadata._id) in event["tags"]
+    assert "dataset_record_id:{}".format(test_dataset_one_record_w_metadata._records[0]["record_id"]) in event["tags"]
     assert "experiment_id:1234567890" in event["tags"]
     assert f"run_id:{DUMMY_EXPERIMENT_FIRST_RUN_ID}" in event["tags"]
     assert "run_iteration:1" in event["tags"]
@@ -1649,25 +1856,40 @@ def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test
     assert event["_dd"]["scope"] == "experiments"
 
 
-def test_experiment_span_multi_run_tags(llmobs, llmobs_events, test_dataset_one_record):
-    exp = llmobs.experiment("test_experiment", dummy_task, test_dataset_one_record, [dummy_evaluator])
+def test_experiment_span_multi_run_tags(llmobs, llmobs_events, test_dataset_one_record_w_metadata):
+    exp = llmobs.experiment(
+        "test_experiment",
+        dummy_task,
+        test_dataset_one_record_w_metadata,
+        [dummy_evaluator],
+    )
     exp._id = "1234567890"
     for i in range(2):
         exp._run_task(1, run=run_info_with_stable_id(i), raise_errors=False)
         assert len(llmobs_events) == i + 1
         event = llmobs_events[i]
         assert event["name"] == "dummy_task"
-        for key in ("span_id", "trace_id", "parent_id", "start_ns", "duration", "metrics"):
+        for key in (
+            "span_id",
+            "trace_id",
+            "parent_id",
+            "start_ns",
+            "duration",
+            "metrics",
+        ):
             assert event[key] == mock.ANY
         assert event["status"] == "ok"
-        assert event["meta"]["input"] == '{"prompt": "What is the capital of France?"}'
-        assert event["meta"]["output"] == '{"prompt": "What is the capital of France?"}'
-        assert event["meta"]["expected_output"] == '{"answer": "Paris"}'
-        assert "dataset_name:{}".format(test_dataset_one_record.name) in event["tags"]
+        assert event["meta"]["input"] == {"prompt": "What is the capital of France?"}
+        assert event["meta"]["output"] == {"prompt": "What is the capital of France?"}
+        assert event["meta"]["expected_output"] == {"answer": "Paris"}
+        assert event["meta"]["metadata"] == {"difficulty": "easy"}
+        assert "dataset_name:{}".format(test_dataset_one_record_w_metadata.name) in event["tags"]
        assert "project_name:test-project" in event["tags"]
         assert "experiment_name:test_experiment" in event["tags"]
-        assert "dataset_id:{}".format(test_dataset_one_record._id) in event["tags"]
-        assert "dataset_record_id:{}".format(test_dataset_one_record._records[0]["record_id"]) in event["tags"]
+        assert "dataset_id:{}".format(test_dataset_one_record_w_metadata._id) in event["tags"]
+        assert (
+            "dataset_record_id:{}".format(test_dataset_one_record_w_metadata._records[0]["record_id"]) in event["tags"]
+        )
         assert "experiment_id:1234567890" in event["tags"]
         assert f"run_id:{DUMMY_EXPERIMENT_FIRST_RUN_ID}" in event["tags"]
         assert f"run_iteration:{i + 1}" in event["tags"]