
Commit b85504d

multi run support for experiments
1 parent 5fa0681 commit b85504d

File tree

5 files changed: +53 additions, -26 deletions

ddtrace/llmobs/_constants.py
ddtrace/llmobs/_experiment.py
ddtrace/llmobs/_llmobs.py
ddtrace/llmobs/_writer.py
setup.cfg

5 files changed

+53
-26
lines changed
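In user-facing terms, this commit adds a runs argument to LLMObs.experiment(): the task is executed over every dataset record the requested number of times, and each pass is tagged with a generated run id and its iteration index. A minimal, hedged usage sketch follows; runs and the run(jobs=...) call come from this diff, while the task, evaluator, dataset object, and the other keyword arguments are illustrative assumptions.

from ddtrace.llmobs import LLMObs

# Hypothetical task and evaluator; their names and signatures are assumptions.
def summarize(input_data, config):
    return str(input_data)[:100]

def exact_match(inputs, outputs, expected_outputs):
    return outputs == expected_outputs

# `dataset` is assumed to be an already-created LLM Observability dataset object.
experiment = LLMObs.experiment(
    name="summarization-baseline",   # assumed keyword
    task=summarize,
    dataset=dataset,                 # assumed keyword
    evaluators=[exact_match],        # assumed keyword
    runs=3,                          # new in this commit: repeat the task over the dataset 3 times
)
results = experiment.run(jobs=4)     # run()'s jobs parameter appears in the _experiment.py diff below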

ddtrace/llmobs/_constants.py

Lines changed: 2 additions & 0 deletions

@@ -110,6 +110,8 @@
 PROXY_REQUEST = "llmobs.proxy_request"

 EXPERIMENT_ID_KEY = "_ml_obs.experiment_id"
+EXPERIMENT_RUN_ID_KEY = "_ml_obs.experiment_run_id"
+EXPERIMENT_RUN_ITERATION_KEY = "_ml_obs.experiment_run_iteration"
 EXPERIMENT_EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output"
 EXPERIMENTS_INPUT = "_ml_obs.meta.input"
 EXPERIMENTS_OUTPUT = "_ml_obs.meta.output"

ddtrace/llmobs/_experiment.py

Lines changed: 32 additions & 12 deletions

@@ -1,5 +1,6 @@
 from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
+import itertools
 import sys
 import traceback
 from typing import TYPE_CHECKING

@@ -297,6 +298,12 @@ def as_dataframe(self) -> None:
         return pd.DataFrame(data=records_list, columns=pd.MultiIndex.from_tuples(column_tuples))


+class _ExperimentRunInfo:
+    def __init__(self, run_interation: int):
+        self._id = uuid.uuid4()
+        self._run_iteration = run_interation
+
+
 class Experiment:
     def __init__(
         self,

@@ -316,6 +323,7 @@ def __init__(
                ]
            ]
        ] = None,
+        runs: Optional[int] = None,
    ) -> None:
        self.name = name
        self._task = task

@@ -326,6 +334,7 @@ def __init__(
        self._tags: Dict[str, str] = tags or {}
        self._tags["ddtrace.version"] = str(ddtrace.__version__)
        self._config: Dict[str, JSONType] = config or {}
+        self._runs: int = runs or 1
        self._llmobs_instance = _llmobs_instance

        if not project_name:

@@ -358,18 +367,23 @@ def run(self, jobs: int = 1, raise_errors: bool = False, sample_size: Optional[i
            self._config,
            convert_tags_dict_to_list(self._tags),
            self._description,
+            self._runs,
        )
        self._id = experiment_id
        self._tags["experiment_id"] = str(experiment_id)
        self._run_name = experiment_run_name
-        task_results = self._run_task(jobs, raise_errors, sample_size)
-        evaluations = self._run_evaluators(task_results, raise_errors=raise_errors)
-        summary_evals = self._run_summary_evaluators(task_results, evaluations, raise_errors)
-        experiment_results = self._merge_results(task_results, evaluations, summary_evals)
-        experiment_evals = self._generate_metrics_from_exp_results(experiment_results)
-        self._llmobs_instance._dne_client.experiment_eval_post(
-            self._id, experiment_evals, convert_tags_dict_to_list(self._tags)
-        )
+        for run_iteration in range(self._runs):
+            run = _ExperimentRunInfo(run_iteration)
+            self._tags["run_id"] = str(run._id)
+            self._tags["run_iteration"] = str(run._run_iteration)
+            task_results = self._run_task(jobs, run, raise_errors, sample_size)
+            evaluations = self._run_evaluators(task_results, raise_errors=raise_errors)
+            summary_evals = self._run_summary_evaluators(task_results, evaluations, raise_errors)
+            experiment_results = self._merge_results(task_results, evaluations, summary_evals)
+            experiment_evals = self._generate_metrics_from_exp_results(experiment_results)
+            self._llmobs_instance._dne_client.experiment_eval_post(
+                self._id, experiment_evals, convert_tags_dict_to_list(self._tags)
+            )

        return experiment_results

@@ -378,11 +392,13 @@ def url(self) -> str:
        # FIXME: will not work for subdomain orgs
        return f"{_get_base_url()}/llm/experiments/{self._id}"

-    def _process_record(self, idx_record: Tuple[int, DatasetRecord]) -> Optional[TaskResult]:
+    def _process_record(self, idx_record: Tuple[int, DatasetRecord], run: _ExperimentRunInfo) -> Optional[TaskResult]:
        if not self._llmobs_instance or not self._llmobs_instance.enabled:
            return None
        idx, record = idx_record
-        with self._llmobs_instance._experiment(name=self._task.__name__, experiment_id=self._id) as span:
+        with self._llmobs_instance._experiment(
+            name=self._task.__name__, experiment_id=self._id, run_id=str(run._id), run_iteration=run._run_iteration
+        ) as span:
            span_context = self._llmobs_instance.export_span(span=span)
            if span_context:
                span_id = span_context.get("span_id", "")

@@ -422,7 +438,9 @@ def _process_record(self, idx_record: Tuple[int, DatasetRecord]) -> Optional[Tas
            },
        }

-    def _run_task(self, jobs: int, raise_errors: bool = False, sample_size: Optional[int] = None) -> List[TaskResult]:
+    def _run_task(
+        self, jobs: int, run: _ExperimentRunInfo, raise_errors: bool = False, sample_size: Optional[int] = None
+    ) -> List[TaskResult]:
        if not self._llmobs_instance or not self._llmobs_instance.enabled:
            return []
        if sample_size is not None and sample_size < len(self._dataset):

@@ -441,7 +459,9 @@ def _run_task(self, jobs: int, raise_errors: bool = False, sample_size: Optional
            subset_dataset = self._dataset
        task_results = []
        with ThreadPoolExecutor(max_workers=jobs) as executor:
-            for result in executor.map(self._process_record, enumerate(subset_dataset)):
+            for result in executor.map(
+                self._process_record, enumerate(subset_dataset), itertools.repeat(run, len(subset_dataset))
+            ):
                if not result:
                    continue
                task_results.append(result)
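The threading change above relies on Executor.map() accepting multiple iterables and zipping them, so itertools.repeat() can pair every dataset record with the same run object without building an intermediate list of tuples. A minimal, self-contained sketch of that pattern (the names below are stand-ins, not ddtrace code):

from concurrent.futures import ThreadPoolExecutor
import itertools

records = ["alpha", "beta", "gamma"]                  # stand-ins for dataset records
shared_run = {"run_id": "run-0", "run_iteration": 0}  # stand-in for _ExperimentRunInfo

def process(idx_record, run):
    # Mirrors _process_record's (idx, record) unpacking plus the shared run argument.
    idx, record = idx_record
    return f"record {idx}={record} handled in iteration {run['run_iteration']}"

with ThreadPoolExecutor(max_workers=2) as executor:
    results = list(
        executor.map(process, enumerate(records), itertools.repeat(shared_run, len(records)))
    )

# executor.map() preserves input order, so results[0] corresponds to "alpha".
print(results)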

ddtrace/llmobs/_llmobs.py

Lines changed: 14 additions & 0 deletions

@@ -57,6 +57,8 @@
 from ddtrace.llmobs._constants import EXPERIMENT_CSV_FIELD_MAX_SIZE
 from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT
 from ddtrace.llmobs._constants import EXPERIMENT_ID_KEY
+from ddtrace.llmobs._constants import EXPERIMENT_RUN_ID_KEY
+from ddtrace.llmobs._constants import EXPERIMENT_RUN_ITERATION_KEY
 from ddtrace.llmobs._constants import EXPERIMENTS_INPUT
 from ddtrace.llmobs._constants import EXPERIMENTS_OUTPUT
 from ddtrace.llmobs._constants import INPUT_DOCUMENTS

@@ -781,6 +783,7 @@ def experiment(
                ]
            ]
        ] = None,
+        runs: Optional[int] = 1,
    ) -> Experiment:
        """Initializes an Experiment to run a task on a Dataset and evaluators.

@@ -797,6 +800,8 @@ def experiment(
                                   to produce a single value.
                                   Must accept parameters ``inputs``, ``outputs``, ``expected_outputs``,
                                   ``evaluators_results``.
+        :param runs: The number of times to run the experiment, or, run the task for every dataset record the defined
+                     number of times.
        """
        if not callable(task):
            raise TypeError("task must be a callable function.")

@@ -837,6 +842,7 @@ def experiment(
            config=config,
            _llmobs_instance=cls._instance,
            summary_evaluators=summary_evaluators,
+            runs=runs,
        )

    @classmethod

@@ -1306,6 +1312,8 @@ def _experiment(
        session_id: Optional[str] = None,
        ml_app: Optional[str] = None,
        experiment_id: Optional[str] = None,
+        run_id: Optional[str] = None,
+        run_iteration: Optional[int] = None,
    ) -> Span:
        """
        Trace an LLM experiment, only used internally by the experiments SDK.

@@ -1324,6 +1332,12 @@ def _experiment(
        if experiment_id:
            span.context.set_baggage_item(EXPERIMENT_ID_KEY, experiment_id)

+        if run_id:
+            span.context.set_baggage_item(EXPERIMENT_RUN_ID_KEY, run_id)
+
+        if run_iteration:
+            span.context.set_baggage_item(EXPERIMENT_RUN_ITERATION_KEY, run_iteration)
+
        return span

    @classmethod
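The run metadata is stored as context baggage rather than as plain span tags; under the usual tracing semantics, baggage travels with the trace context, so spans created inside the experiment task can be associated with the run that produced them. A toy sketch of that propagation idea (not ddtrace internals; the class below is an assumption for illustration only):

class ToyContext:
    """Minimal stand-in for a trace context that carries baggage."""

    def __init__(self, parent=None):
        # Child contexts start with a copy of the parent's baggage.
        self._baggage = dict(parent._baggage) if parent is not None else {}

    def set_baggage_item(self, key, value):
        self._baggage[key] = value

    def get_baggage_item(self, key):
        return self._baggage.get(key)


experiment_ctx = ToyContext()
experiment_ctx.set_baggage_item("_ml_obs.experiment_run_id", "example-run-uuid")
experiment_ctx.set_baggage_item("_ml_obs.experiment_run_iteration", 1)

task_span_ctx = ToyContext(parent=experiment_ctx)  # a span started inside the task
assert task_span_ctx.get_baggage_item("_ml_obs.experiment_run_iteration") == 1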

ddtrace/llmobs/_writer.py

Lines changed: 2 additions & 0 deletions

@@ -611,6 +611,7 @@ def experiment_create(
        exp_config: Optional[Dict[str, JSONType]] = None,
        tags: Optional[List[str]] = None,
        description: Optional[str] = None,
+        runs: Optional[int] = 1,
    ) -> Tuple[str, str]:
        path = "/api/unstable/llm-obs/v1/experiments"
        resp = self.request(

@@ -628,6 +629,7 @@
                        "config": exp_config or {},
                        "metadata": {"tags": cast(JSONType, tags or [])},
                        "ensure_unique": True,
+                        "run_count": runs,
                    },
                }
            },
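For context, a hedged sketch of how the fields touched by this hunk would look inside the experiment-create request body; the enclosing envelope and the remaining fields are elided exactly as in the diff, and the values are illustrative only.

# Illustrative fragment, not the full payload built by experiment_create().
experiment_attributes_fragment = {
    "config": {},              # exp_config or {}
    "metadata": {"tags": []},  # converted tag list
    "ensure_unique": True,
    "run_count": 3,            # new: forwarded from the `runs` argument
}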

setup.cfg

Lines changed: 3 additions & 14 deletions

@@ -1,21 +1,10 @@
+[bdist_wheel]
+universal=1
+
 [codespell]
 skip = *.json,*.h,*.cpp,*.c,.riot,.tox,.mypy_cache,.git,*ddtrace/vendor,tests/contrib/openai/cassettes/*,tests/contrib/langchain/cassettes/*,ddtrace/appsec/_iast/_taint_tracking/_vendor/*
 exclude-file = .codespellignorelines
 ignore-words-list = asend,dne,fo,medias,ment,nin,ot,setttings,statics,ba,spawnve,doas

-# DEV: We use `conftest.py` as a local pytest plugin to configure hooks for collection
-[tool:pytest]
-# --cov-report is intentionally empty else pytest-cov will default to generating a report
-addopts =
-    --cov=ddtrace/
-    --cov=tests/
-    --cov-append
-    --cov-report=
-    --durations=10
-    --junitxml=test-results/junit.xml
-# DEV: The default is `test_*\.py` which will miss `test.py` files
-python_files = test*\.py
-asyncio_mode = auto
-
 [flake8]
 max-line-length = 120
