Commit 603ccfe

fix(gepa): collect and save trajectory feedback in optimization results (#819)
Fixes Issue ErikBjare/bob#128

Problem:
- TrajectoryAnalyzer and trajectory_feedback_metric existed
- GEPA received and used the trajectory metric internally
- But _evaluate_prompt didn't collect trajectory feedback
- Results were saved without a trajectory_feedback field

Root cause:
- DSPy GEPA uses the trajectory metric during optimization but doesn't expose the feedback
- _evaluate_prompt only calculated scores; it collected no trajectory feedback

Solution:
- Store trajectory_metric in PromptOptimizer when creating the GEPA optimizer
- In _evaluate_prompt, collect trajectory feedback for each validation example
- Add trajectory_feedback to the results dict with score + feedback per example

This enables:
- Phase 3.2 Tests 2-4 validation (previously blocked)
- Phase 3.3-4 performance validation
- Trajectory-based prompt optimization

Testing:
- All 17 dspy tests pass
- 1 test fails due to a missing API key (expected, not a code issue)
1 parent 1bdd9de commit 603ccfe
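
For context on what the commit adds: after this change, the evaluation results dict is expected to carry a trajectory_feedback list alongside the existing score fields. A rough illustration of the shape (keys are taken from the diff below; the values and prompt text are invented, and fields not visible in the diff are omitted):

    # Illustrative shape of the saved results after this change (values made up).
    results = {
        "average_score": 0.72,
        "task_success_rate": 0.80,
        "tool_usage_score": 0.65,
        # ... other fields from the results dict are omitted here ...
        "optimized_prompt": "You are gptme, an agent that ...",
        "trajectory_feedback": [
            {"score": 0.9, "feedback": "Used the shell tool correctly to run tests."},
            {"score": 0.4, "feedback": "Did not verify the patch before finishing."},
        ],
    }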

File tree: 1 file changed (+21, -1 lines changed)

gptme/eval/dspy/prompt_optimizer.py

Lines changed: 21 additions & 1 deletion
@@ -268,6 +268,7 @@ def _create_optimizer(self, eval_specs: list[EvalSpec]):
             )
         elif self.optimizer_type.lower() == "gepa":
             trajectory_metric = create_trajectory_feedback_metric(eval_specs=eval_specs)
+            self._trajectory_metric = trajectory_metric  # Store for evaluation
             reflection_model = ModelNameMapper.get_reflection_model(self.model)
             reflection_lm = dspy.LM(reflection_model)

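Aside (not part of the diff): create_trajectory_feedback_metric is defined elsewhere in the repo and is not shown here. For orientation, a minimal sketch of a metric with the calling convention the evaluation code further down relies on -- five positional arguments, returning something that exposes score and feedback, e.g. a dspy.Prediction -- could look like this. The function name and scoring logic are hypothetical:

    # Hypothetical sketch only -- not gptme's create_trajectory_feedback_metric.
    import dspy

    def make_feedback_metric(eval_specs):
        """Build a GEPA-style metric: (gold, pred, trace, pred_name, pred_trace)."""

        def metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
            # Placeholder scoring; a real metric would inspect the trajectory
            # against eval_specs and write targeted textual feedback.
            score = 1.0 if getattr(pred, "answer", None) else 0.0
            feedback = "Answer produced." if score else "No answer produced."
            return dspy.Prediction(score=score, feedback=feedback)

        return metric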
@@ -314,6 +315,7 @@ def _evaluate_prompt(self, prompt: str, val_data: PromptDataset) -> dict[str, Any]:
         task_scores = []
         tool_scores = []
         judge_scores = []
+        trajectory_feedbacks = []
         module = GptmeModule(prompt, self.model)

         for example in val_data:
@@ -328,6 +330,18 @@ def _evaluate_prompt(self, prompt: str, val_data: PromptDataset) -> dict[str, Any]:
             tool_scores.append(tool_metric(example, pred, None))
             judge_scores.append(judge_metric(example, pred, None))

+            # If trajectory metric exists (GEPA), collect feedback
+            if hasattr(self, "_trajectory_metric"):
+                trajectory_result = self._trajectory_metric(
+                    example, pred, None, None, None
+                )
+                trajectory_feedbacks.append(
+                    {
+                        "score": trajectory_result.score,
+                        "feedback": trajectory_result.feedback,
+                    }
+                )
+
         # Calculate averages
         avg_task = sum(task_scores) / len(task_scores) if task_scores else 0.0
         avg_tool = sum(tool_scores) / len(tool_scores) if tool_scores else 0.0
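
Aside (not part of the diff): the hasattr() check works because _trajectory_metric is only ever set in the GEPA branch shown above; other optimizer types never define the attribute, so feedback collection is simply skipped. A more explicit equivalent would initialize the attribute to None up front, as in this sketch (illustrative only; this class is not the repo's PromptOptimizer):

    # Sketch of an equivalent, more explicit guard using a None check.
    from typing import Any, Callable, Optional

    class OptimizerSketch:
        def __init__(self) -> None:
            # Only the GEPA branch would assign a real metric here.
            self._trajectory_metric: Optional[Callable[..., Any]] = None

        def collect_feedback(self, example: Any, pred: Any) -> Optional[dict[str, Any]]:
            if self._trajectory_metric is None:
                return None  # non-GEPA optimizers: nothing to collect
            result = self._trajectory_metric(example, pred, None, None, None)
            return {"score": result.score, "feedback": result.feedback}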
@@ -340,7 +354,7 @@ def _evaluate_prompt(self, prompt: str, val_data: PromptDataset) -> dict[str, Any]:
             for t, tool, j in zip(task_scores, tool_scores, judge_scores)
         ]

-        return {
+        results = {
             "average_score": avg_composite,
             "task_success_rate": avg_task,
             "tool_usage_score": avg_tool,
@@ -353,6 +367,12 @@ def _evaluate_prompt(self, prompt: str, val_data: PromptDataset) -> dict[str, Any]:
             "optimized_prompt": prompt,
         }

+        # Add trajectory feedback if available (GEPA only)
+        if trajectory_feedbacks:
+            results["trajectory_feedback"] = trajectory_feedbacks
+
+        return results
+
     def compare_prompts(
         self,
         prompts: dict[str, str],
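
Aside (not part of the diff): downstream code can read the new field straight off the returned results. A small hypothetical consumer (the function name and threshold are made up) that surfaces the worst-scoring trajectories so their feedback can guide the next prompt revision:

    # Hypothetical consumer of the new trajectory_feedback field.
    from typing import Any

    def print_low_scoring_feedback(results: dict[str, Any], threshold: float = 0.5) -> None:
        for entry in results.get("trajectory_feedback", []):
            if entry["score"] < threshold:
                print(f"score={entry['score']:.2f}: {entry['feedback']}")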
