ServiceNow · recursix · Jan 17, 2025 · Jan 21, 2025 · Jan 21, 2025 · Jan 21, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -57,3 +57,4 @@ exclude = '''
 [project.scripts]
 agentlab-assistant = "agentlab.ui_assistant:main"
 agentlab-xray = "agentlab.analyze.agent_xray:main"
+agentlab-analyze = "agentlab.analyze.error_analysis.pipeline:main"
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-black[jupyter]>=24.2.0
+black[jupyter]>=24.2.0,<25
 blacken-docs
 pre-commit
 pytest==7.3.2

diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py
@@ -257,7 +257,7 @@
 )
 
 AGENT_4o_MINI = GenericAgentArgs(
-    chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"],
+    chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"],
     flags=FLAGS_GPT_4o,
 )
 AGENT_CLAUDE_SONNET_35 = GenericAgentArgs(

diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
@@ -1,4 +1,5 @@
 import base64
+import json
 import os
 import traceback
 from copy import deepcopy
@@ -30,6 +31,32 @@
 TASK_SEED_KEY = "env.task_seed"
 
 
+def dict_to_markdown(data, level=1):
+    """
+    Convert a nested dictionary to a Markdown string with hierarchical headers.
+
+    Every key becomes a header. A nested dictionary is rendered one header
+    level deeper; any other value is rendered as a 4-space indented block
+    under the header of its key.
+
+    Parameters:
+        data (dict): The dictionary to convert.
+        level (int): The current header level (default is 1).
+
+    Returns:
+        str: The formatted Markdown string.
+    """
+    # Markdown defines headers only up to level 6; deeper nesting reuses level 6
+    # so that the key is still a header rather than a literal run of '#'.
+    header_prefix = "#" * min(level, 6)
+    parts = []
+
+    for key, value in data.items():
+        parts.append(f"{header_prefix} {key}\n")
+        if isinstance(value, dict):
+            # Recursively process the nested dictionary one level deeper
+            parts.append(dict_to_markdown(value, level + 1))
+        else:
+            # Indent every line of the value so that a multi-line value stays
+            # inside one indented block; an empty value still emits one line.
+            lines = str(value).splitlines() or [""]
+            parts.extend(f"    {line}\n" for line in lines)
+
+    return "".join(parts)
+
+
 def display_table(df: pd.DataFrame):
     df = df.copy()
     df.columns = clean_column_names(df.columns)
@@ -358,6 +385,9 @@ def run_gradio(results_dir: Path):
             with gr.Tab("Task Error") as tab_error:
                 task_error = gr.Markdown()
 
+            with gr.Tab("Error Analysis") as tab_error_analysis:
+                error_analysis = gr.Markdown()
+
             with gr.Tab("Logs") as tab_logs:
                 logs = gr.Code(language=None, **code_args)
 
@@ -485,6 +515,7 @@ def run_gradio(results_dir: Path):
         tab_axtree.select(fn=update_axtree, outputs=axtree_code)
         tab_chat.select(fn=update_chat_messages, outputs=chat_messages)
         tab_error.select(fn=update_task_error, outputs=task_error)
+        tab_error_analysis.select(fn=update_error_analysis, outputs=error_analysis)
         tab_logs.select(fn=update_logs, outputs=logs)
         tab_stats.select(fn=update_stats, outputs=stats)
         tab_agent_info_html.select(fn=update_agent_info_html, outputs=agent_info_html)
@@ -612,6 +643,20 @@ def update_task_error():
         return "No Task Error"
 
 
+def update_error_analysis():
+    """Render the error analysis of the current `info.exp_result` as Markdown.
+
+    Reads `error_analysis.json` from `info.exp_result.exp_dir` and converts
+    the loaded JSON with `dict_to_markdown`.
+
+    Returns:
+        str: The Markdown text, or a short message when the file is missing
+            or does not contain valid JSON.
+    """
+    global info
+    try:
+        error_analysis = info.exp_result.exp_dir / "error_analysis.json"
+        if not error_analysis.exists():
+            return "No Error Analysis Found"
+        with error_analysis.open("r") as f:
+            json_data = json.load(f)
+    except FileNotFoundError:
+        return "No Error Analysis"
+    except json.JSONDecodeError as e:
+        # An empty, truncated or hand-edited file must not break the tab
+        return f"Error Analysis could not be parsed: {e}"
+    return dict_to_markdown(json_data)
+
+
 def update_logs():
     global info
     try:
@@ -1200,3 +1245,4 @@ def main():
 
 if __name__ == "__main__":
+    # Run the entry point exactly once.
     main()
diff --git a/src/agentlab/analyze/error_analysis/__init__.py b/src/agentlab/analyze/error_analysis/__init__.py
diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -0,0 +1,110 @@
+import json
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Generator
+
+from bgym import ExpResult
+
+from agentlab.analyze.error_analysis.summarizer import (
+    ChangeSummarizer,
+    EpisodeErrorSummarizer,
+    EpisodeSummarizer,
+)
+from agentlab.analyze.inspect_results import yield_all_exp_results
+
+
+@dataclass
+class Analyzer:
+    """Placeholder analyzer: calling an instance returns a fixed string."""
+
+    prompt: str  # prompt text; not read by the placeholder `__call__`
+    # Annotated so that it is a real dataclass field that the constructor
+    # accepts; an un-annotated `llm = None` is only a class attribute.
+    llm: object = None
+
+    def __call__(self, *args, **kwds):
+        """Return the placeholder analysis string, ignoring all arguments."""
+        return "analysis"
+
+
+def analyze(exp_result, episode_summarizer, save_analysis_func):
+    """Summarize one experiment result and hand the outcome to the saver.
+
+    Args:
+        exp_result: The experiment result to analyze.
+        episode_summarizer: Callable taking `exp_result` and returning its analysis.
+        save_analysis_func: Callable taking `(exp_result, analysis)`.
+    """
+    save_analysis_func(exp_result, episode_summarizer(exp_result))
+
+
+@dataclass
+class ErrorAnalysisPipeline:
+    """Run an episode summarizer over the experiment results found under a directory.
+
+    For every selected result, the analysis is written to `error_analysis.json`
+    inside that result's experiment directory.
+    """
+
+    exp_dir: Path  # directory passed to `yield_all_exp_results`
+    filter: str = None  # optional substring that a result's exp_dir path must contain
+    episode_summarizer: EpisodeSummarizer = None  # callable applied to each ExpResult
+
+    def filter_exp_results(self) -> Generator[ExpResult, None, None]:
+        """Yield the experiment results whose directory path contains `self.filter`.
+
+        When `self.filter` is None, every result is yielded.
+        """
+        # TODO:(thibault) improve filtering
+        for exp_result in yield_all_exp_results(self.exp_dir):
+            if self.filter is None or self.filter in str(exp_result.exp_dir):
+                yield exp_result
+
+    def run_analysis(self, parallel=False, jobs=-1):
+        """Analyze every selected experiment result and save each analysis.
+
+        Args:
+            parallel: If True, dispatch the analyses with joblib's threading backend.
+            jobs: Forwarded as joblib's `n_jobs`; only used when `parallel` is True.
+
+        Raises:
+            ValueError: If no `episode_summarizer` was configured.
+        """
+        if self.episode_summarizer is None:
+            # Fail early with an explicit message instead of a
+            # "'NoneType' object is not callable" error on the first result.
+            raise ValueError("episode_summarizer must be set before calling run_analysis")
+
+        filtered_results = self.filter_exp_results()
+
+        if parallel:
+            # joblib is imported only when the parallel path is taken
+            import joblib
+
+            joblib.Parallel(n_jobs=jobs, backend="threading")(
+                joblib.delayed(analyze)(exp_result, self.episode_summarizer, self.save_analysis)
+                for exp_result in filtered_results
+            )
+
+        else:
+            # Same per-result routine as the parallel path
+            for exp_result in filtered_results:
+                analyze(exp_result, self.episode_summarizer, self.save_analysis)
+
+    def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True):
+        """Save the analysis as JSON to `error_analysis.json` in the result's exp_dir.
+
+        Args:
+            exp_result: The experiment result the analysis belongs to.
+            error_analysis: JSON-serializable analysis to write.
+            exists_ok: If False, refuse to overwrite an existing file.
+
+        Raises:
+            FileExistsError: If the file already exists and `exists_ok` is False.
+        """
+        analysis_path = exp_result.exp_dir / "error_analysis.json"
+        if not exists_ok and analysis_path.exists():
+            raise FileExistsError(f"{analysis_path} already exists")
+        with analysis_path.open("w") as f:
+            json.dump(error_analysis, f, indent=4)
+
+
+# Observation formatters: each takes a mapping and returns the text stored
+# under one key, or a placeholder message when that key is absent.
+AXTREE_FORMATTER = lambda x: x.get("axtree_txt", "No AXTREE available")
+HTML_FORMATTER = lambda x: x.get("pruned_html", "No HTML available")
+
+
+def main():
+    """Command-line entry point for the error-analysis pipeline.
+
+    Parses the command-line options, builds an `ErrorAnalysisPipeline` around an
+    `EpisodeErrorSummarizer` that uses the "azure/gpt-4o-2024-08-06" chat model,
+    and runs the analysis on the given experiment directory.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    # `required=True` makes argparse print its usage message and exit when the
+    # option is missing; an `assert` would be skipped when Python runs with -O.
+    parser.add_argument(
+        "-e",
+        "--exp_dir",
+        type=str,
+        required=True,
+        help="experiment directory to analyze, e.g., -e /path/to/exp_dir",
+    )
+    parser.add_argument("-f", "--filter", type=str, default=None)
+    parser.add_argument("-p", "--parallel", action="store_true")
+    parser.add_argument("-j", "--jobs", type=int, default=-1)
+    parser.add_argument("-g", "--guess_success", action="store_true")
+
+    args = parser.parse_args()
+
+    from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
+
+    # The same model instance serves the per-step and the per-episode summarizer
+    llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-2024-08-06"].make_model()
+
+    pipeline = ErrorAnalysisPipeline(
+        exp_dir=Path(args.exp_dir),
+        filter=args.filter,
+        episode_summarizer=EpisodeErrorSummarizer(
+            ChangeSummarizer(llm, AXTREE_FORMATTER), llm, guess_success=args.guess_success
+        ),
+    )
+
+    pipeline.run_analysis(parallel=args.parallel, jobs=args.jobs)
+
+
+# Script entry point
+if __name__ == "__main__":
+    main()
diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -0,0 +1,178 @@
+from dataclasses import dataclass
+
+from bgym import ExpResult, StepInfo
+
+from agentlab.analyze.error_analysis.summarizer_prompts import (
+    CHANGE_SUMMARIZER_PROMPT,
+    ERROR_CLASSIFICATION_PROMPT,
+    ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT,
+)
+from agentlab.llm.llm_utils import json_parser, parse_html_tags
+from agentlab.llm.tracking import set_tracker
+
+
+def _diff(past_obs, current_obs):
+    """Placeholder for observation diffing (TODO: implement).
+
+    Intended to return a diff of current_obs compared to past_obs, unless
+    there are too many changes.
+
+    Args:
+        past_obs: The past observation.
+        current_obs: The current observation.
+
+    Raises:
+        ValueError: Always, because the function is not implemented yet.
+    """
+    message = "Not implemented yet."
+    raise ValueError(message)
+
+
+@dataclass
+class ChangeSummarizer:
+    """Summarize the effect of a single action by prompting a language model."""
+
+    llm: callable  # language model; called with the prompt, its result is indexed by "content"
+    # Turns an observation mapping into the text inserted in the prompt.
+    # NOTE(review): the default reads "dom_txt" but its fallback message mentions
+    # AXTREE - confirm which one is intended.
+    obs_formatter: callable = lambda x: x.get("dom_txt", "No AXTREE available")
+    use_diff: bool = False  # when True, the next observation text goes through `_diff`
+
+    def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]) -> str:
+        """Produce a summary of the effect of the action taken at `obs`.
+
+        Args:
+            obs: Step holding the action and the observation before it.
+            next_obs: Step holding the observation after the action.
+            past_summaries: Summaries of the previous steps, inserted in the prompt.
+
+        Returns:
+            The result of `parse` applied to the "content" of the LLM answer.
+        """
+        before_txt = self.obs_formatter(obs.obs)
+        after_txt = self.obs_formatter(next_obs.obs)
+        action = obs.action
+
+        goal = obs.obs["goal"]  # Use goal object from agentlab
+        # TODO(thibault): switch to 'goal_object'
+        # Outsource everything to formatter
+
+        if self.use_diff:
+            after_txt = _diff(before_txt, after_txt)
+
+        prompt = self.make_prompt(
+            before_txt,
+            action,
+            after_txt,
+            past_summaries,
+            goal,
+            obs.obs.get("plan", "No plan available"),
+        )
+        llm_answer = self.llm(prompt)
+        return self.parse(llm_answer["content"])
+
+    def make_prompt(
+        self, past_obs_message, action, current_obs_message, past_summaries, goal, plan
+    ):
+        """Fill `CHANGE_SUMMARIZER_PROMPT` with the given values and return the text."""
+        prompt_fields = {
+            "goal": goal,
+            "plan": plan,
+            "past_observation": past_obs_message,
+            "current_observation": current_obs_message,
+            "past_summaries": past_summaries,
+            "action": action,
+        }
+        return CHANGE_SUMMARIZER_PROMPT.format(**prompt_fields)
+
+    def parse(self, raw_output: str) -> dict:
+        """Return the first element of `parse_html_tags` for the summary tags."""
+        expected_tags = ["changeSummary", "actionAssessment", "explanation", "suggestion"]
+        return parse_html_tags(raw_output, keys=expected_tags)[0]
+
+
+@dataclass
+class EpisodeAnalysis:
+    """Container for the analysis of one episode."""
+
+    analysis: str  # complete analysis of the episode
+    summary: str  # short summary of the analysis
+    categories: dict[str, float]  # score for each category e.g. type of error or difficulty levels
+
+
+@dataclass
+class EpisodeSummarizer:
+    """Analyze a whole episode from per-step change summaries.
+
+    Subclasses provide `make_prompt`; the base implementation is an empty stub.
+    """
+
+    change_summarizer: ChangeSummarizer = None  # summarizes each pair of consecutive steps
+    llm: callable = None  # called with the episode prompt; its result is indexed by "content"
+    parser: callable = lambda x: json_parser(x)[0]  # not used by the methods of this class
+    guess_success: bool = False  # when False, episodes with a final reward of 1 are not analyzed
+
+    def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...
+
+    def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
+        """Summarize every step of the episode, then ask the LLM for an analysis.
+
+        Note that a plain dict is returned: it holds "analysis", "summaries"
+        (keyed by step index) and the `stats` of both trackers. When
+        `guess_success` is False and the last step has a reward of 1, the
+        fixed "Success" result is returned without calling the LLM.
+        """
+        skip_successes = not self.guess_success
+        if skip_successes and exp_results.steps_info[-1].reward == 1:
+            return {"analysis": "Success", "summaries": {}}
+
+        with set_tracker("summary") as summaries_tracker:
+            summaries = self.make_change_summaries(exp_results)
+        prompt = self.make_prompt(exp_results, summaries)
+
+        with set_tracker("analysis") as analysis_tracker:
+            raw_analysis = self.llm(prompt)["content"]
+
+        res = {
+            "analysis": self.parse(raw_analysis),
+            "summaries": dict(enumerate(summaries)),
+        }
+        # Merge the tracker stats, analysis first and summaries second
+        for tracker in (analysis_tracker, summaries_tracker):
+            res.update(tracker.stats)
+        return res
+
+    def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
+        """Return one change summary per pair of consecutive steps."""
+        # this assumes that there is always an extra step at the end of the episode
+        # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info
+        # TODO:(thibault) make some checks or w/e
+        collected = []  # type: list[str]
+        step_pairs = zip(exp_result.steps_info[:-1], exp_result.steps_info[1:])
+        for current_step, following_step in step_pairs:
+            # The growing list is passed along so each summary sees the previous ones
+            summary = self.change_summarizer.summarize(current_step, following_step, collected)
+            collected.append(summary)
+        return collected
+
+    def parse(self, raw_output: str) -> dict:
+        """Return the first element of `parse_html_tags` for the analysis tags."""
+        expected_tags = ["explanation", "errorCategory"]
+        return parse_html_tags(raw_output, keys=expected_tags)[0]
+
+
+@dataclass
+class EpisodeErrorSummarizer(EpisodeSummarizer):
+    """Episode summarizer whose prompt is one of the error-classification prompts."""
+
+    change_summarizer: ChangeSummarizer = None
+
+    def make_prompt(self, exp_results: ExpResult, summaries: list[str]):
+        """Build the error-classification prompt for one episode.
+
+        Args:
+            exp_results: The episode; its steps provide the goal, the actions,
+                the action errors and the final `task_info`.
+            summaries: Per-step summaries; each one is iterated with `.items()`,
+                so mappings are expected despite the `list[str]` annotation.
+
+        Returns:
+            The formatted prompt: `ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT`
+            when `guess_success` is set, `ERROR_CLASSIFICATION_PROMPT` otherwise.
+        """
+        goal = exp_results.steps_info[0].obs["goal"]
+
+        def format_summary(summary):
+            return "".join(f"{key}: {value}\n" for key, value in summary.items())
+
+        txt_summaries = "\n".join(format_summary(summary) for summary in summaries)
+
+        actions = [step.action for step in exp_results.steps_info[:-1]]
+        # Keep the errors as a list: zipping the actions with a joined string
+        # would pair each action with a single character of that string.
+        # The error of an action is reported in the observation of the next step.
+        action_errors = [step.obs["last_action_error"] for step in exp_results.steps_info[1:]]
+
+        txt_actions = "\n".join(
+            f"Action: {action}\nAction Error: {action_error}"
+            for action, action_error in zip(actions, action_errors)
+        )
+
+        extra_info = exp_results.steps_info[-1].task_info
+
+        prompt = (
+            ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT
+            if self.guess_success
+            else ERROR_CLASSIFICATION_PROMPT
+        )
+
+        return prompt.format(
+            goal=goal,
+            historical_summaries=txt_summaries,
+            action_history=txt_actions,
+            extra_info=extra_info,
+        )