Changes from all commits (27 commits)
048a622 Add initial implementation of ChangeSummarizer and EpisodeAnalysis cl… (recursix, Jan 17, 2025)
fd8fd95 Added chain summarizer prompt (Megh-Thakkar, Jan 21, 2025)
b8c85b1 Added error classification prompt (Megh-Thakkar, Jan 21, 2025)
5cb6cc2 Fix typo (jardinetsouffleton, Jan 21, 2025)
9f531cc Update error_analysis.py (Megh-Thakkar, Jan 21, 2025)
31e5bf5 added pipeline and tests (TLSDC, Jan 22, 2025)
000893d quick parsing to run from cligit push (TLSDC, Jan 22, 2025)
4727a9e even more parsing and making imports absolute (TLSDC, Jan 22, 2025)
42f0362 . (TLSDC, Jan 24, 2025)
394999b chat_models can take str as input (TLSDC, Jan 24, 2025)
e0e786c typing (TLSDC, Jan 24, 2025)
46d2c8c keep this here bc it's going to pop back up (TLSDC, Jan 28, 2025)
8a882ad pipeline mvp (TLSDC, Jan 28, 2025)
3fab5b4 added a specific tab and viz for it in xray (TLSDC, Jan 28, 2025)
2be23e5 added formatting options (TLSDC, Jan 28, 2025)
41f8f69 Update summarizer_prompts.py (Megh-Thakkar, Jan 29, 2025)
6163b47 xml parsing (TLSDC, Jan 29, 2025)
a1f3416 fix (TLSDC, Jan 29, 2025)
a455d0d add error analysis prediction validation script (jardinetsouffleton, Feb 4, 2025)
82dbaba black version update (TLSDC, Feb 6, 2025)
5fbbe57 phony command, joblib stuff, took think out of prompt (TLSDC, Feb 20, 2025)
3a3d602 task_info (TLSDC, Feb 20, 2025)
5bf1bac added flag to oracle success or no (TLSDC, Feb 20, 2025)
c7b1c5a Merge branch 'main' into error-analysis (TLSDC, Feb 20, 2025)
0972130 darglint (TLSDC, Feb 20, 2025)
ec1395b Merge branch 'error-analysis' of github.com:ServiceNow/AgentLab into … (TLSDC, Feb 20, 2025)
46d1075 tests (TLSDC, Feb 20, 2025)
1 change: 1 addition & 0 deletions pyproject.toml
@@ -57,3 +57,4 @@ exclude = '''
[project.scripts]
agentlab-assistant = "agentlab.ui_assistant:main"
agentlab-xray = "agentlab.analyze.agent_xray:main"
agentlab-analyze = "agentlab.analyze.error_analysis.pipeline:main"
Collaborator comment: nicee
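Note: with this console-script entry installed, the error-analysis pipeline added below can be launched from the command line as `agentlab-analyze -e /path/to/exp_dir`, with the optional flags `-f/--filter`, `-p/--parallel`, `-j/--jobs`, and `-g/--guess_success` defined in `pipeline.main` further down.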
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
black[jupyter]>=24.2.0
black[jupyter]>=24.2.0,<25
blacken-docs
pre-commit
pytest==7.3.2
2 changes: 1 addition & 1 deletion src/agentlab/agents/generic_agent/agent_configs.py
@@ -257,7 +257,7 @@
)

AGENT_4o_MINI = GenericAgentArgs(
chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"],
chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"],
flags=FLAGS_GPT_4o,
)
AGENT_CLAUDE_SONNET_35 = GenericAgentArgs(
46 changes: 46 additions & 0 deletions src/agentlab/analyze/agent_xray.py
@@ -1,4 +1,5 @@
import base64
import json
import os
import traceback
from copy import deepcopy
@@ -30,6 +31,32 @@
TASK_SEED_KEY = "env.task_seed"


def dict_to_markdown(data, level=1):
"""
Convert a nested dictionary to a Markdown string with hierarchical headers.

Parameters:
data (dict): The dictionary to convert.
level (int): The current header level (default is 1).

Returns:
str: The formatted Markdown string.
"""
markdown = ""

for key, value in data.items():
if isinstance(value, dict):
# Add a header for the key and recursively process the dictionary
markdown += f"{'#' * level} {key}\n"
markdown += dict_to_markdown(value, level + 1)
else:
# Add the key-value pair with indentation
markdown += f"{'#' * level} {key}\n"
markdown += f" {value}\n"

return markdown
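A quick illustration of what `dict_to_markdown` produces for the kind of nested `error_analysis.json` payload it is fed below (a sketch; the input values are invented):

```python
# Hypothetical input, shaped like a saved error analysis (illustrative only).
doc = {"analysis": {"errorCategory": "navigation", "explanation": "Wrong page."}}
print(dict_to_markdown(doc))
# -> "# analysis" as a level-1 header, then "## errorCategory" and "## explanation"
#    as level-2 headers, each followed by an indented value line.
```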


def display_table(df: pd.DataFrame):
df = df.copy()
df.columns = clean_column_names(df.columns)
@@ -358,6 +385,9 @@ def run_gradio(results_dir: Path):
with gr.Tab("Task Error") as tab_error:
task_error = gr.Markdown()

with gr.Tab("Error Analysis") as tab_error_analysis:
error_analysis = gr.Markdown()

with gr.Tab("Logs") as tab_logs:
logs = gr.Code(language=None, **code_args)

@@ -485,6 +515,7 @@ def run_gradio(results_dir: Path):
tab_axtree.select(fn=update_axtree, outputs=axtree_code)
tab_chat.select(fn=update_chat_messages, outputs=chat_messages)
tab_error.select(fn=update_task_error, outputs=task_error)
tab_error_analysis.select(fn=update_error_analysis, outputs=error_analysis)
tab_logs.select(fn=update_logs, outputs=logs)
tab_stats.select(fn=update_stats, outputs=stats)
tab_agent_info_html.select(fn=update_agent_info_html, outputs=agent_info_html)
@@ -612,6 +643,20 @@ def update_task_error():
return "No Task Error"


def update_error_analysis():
global info
try:
error_analysis = info.exp_result.exp_dir / "error_analysis.json"
if not error_analysis.exists():
return "No Error Analysis Found"
with error_analysis.open("r") as f:
json_data = json.load(f)
res = dict_to_markdown(json_data)
return res
except FileNotFoundError:
return "No Error Analysis"


def update_logs():
global info
try:
@@ -1200,3 +1245,4 @@ def main():

if __name__ == "__main__":
main()
Empty file.
110 changes: 110 additions & 0 deletions src/agentlab/analyze/error_analysis/pipeline.py
@@ -0,0 +1,110 @@
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Generator

from bgym import ExpResult

from agentlab.analyze.error_analysis.summarizer import (
ChangeSummarizer,
EpisodeErrorSummarizer,
EpisodeSummarizer,
)
from agentlab.analyze.inspect_results import yield_all_exp_results


@dataclass
class Analyzer:
prompt: str
llm: callable = None  # annotated so the dataclass treats it as a field

def __call__(self, *args, **kwds):
return "analysis"


def analyze(exp_result, episode_summarizer, save_analysis_func):
error_analysis = episode_summarizer(exp_result)
save_analysis_func(exp_result, error_analysis)


@dataclass
class ErrorAnalysisPipeline:
exp_dir: Path
filter: str = None
episode_summarizer: EpisodeSummarizer = None

def filter_exp_results(self) -> Generator[ExpResult, None, None]:
# TODO:(thibault) improve filtering
exp_results = yield_all_exp_results(self.exp_dir)
for exp_result in exp_results:
if self.filter is None or self.filter in str(exp_result.exp_dir):
yield exp_result

def run_analysis(self, parallel=False, jobs=-1):
filtered_results = self.filter_exp_results()

if parallel:
import joblib

joblib.Parallel(n_jobs=jobs, backend="threading")(
joblib.delayed(analyze)(exp_result, self.episode_summarizer, self.save_analysis)
for exp_result in filtered_results
)

else:
for exp_result in filtered_results:
error_analysis = self.episode_summarizer(exp_result)
self.save_analysis(exp_result, error_analysis)

def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True):
"""Save the analysis to json"""
analysis_path = exp_result.exp_dir / "error_analysis.json"
if not exists_ok and analysis_path.exists():
raise FileExistsError(f"{analysis_path} already exists")
with analysis_path.open("w") as f:
json.dump(error_analysis, f, indent=4)
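For orientation, this is roughly the document that ends up in `error_analysis.json` (a sketch: the keys inside `analysis` and each summary come from the parsers in `summarizer.py` below, the category label is hypothetical, and the token/cost fields merged in from the trackers are omitted; a successful episode with `guess_success` off simply gets `{"analysis": "Success", "summaries": {}}`):

```python
# Illustrative shape only; all values are made up.
example_error_analysis = {
    "analysis": {
        "explanation": "The agent kept retrying a failing form submission.",
        "errorCategory": "agent_error",  # hypothetical label
    },
    "summaries": {  # int keys become strings once serialized to JSON
        "0": {
            "changeSummary": "Typed the query into the search box.",
            "actionAssessment": "Good",
            "explanation": "Consistent with the goal.",
            "suggestion": "None",
        },
    },
}
```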


AXTREE_FORMATTER = lambda x: x.get("axtree_txt", "No AXTREE available")
HTML_FORMATTER = lambda x: x.get("pruned_html", "No HTML available")


def main():
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-e", "--exp_dir", type=str)
parser.add_argument("-f", "--filter", type=str, default=None)
parser.add_argument("-p", "--parallel", action="store_true")
parser.add_argument("-j", "--jobs", type=int, default=-1)
parser.add_argument("-g", "--guess_success", action="store_true")

args = parser.parse_args()

assert args.exp_dir is not None, "Please provide an exp_dir, e.g., -e /path/to/exp_dir"
Collaborator comment: required=True?

exp_dir = Path(args.exp_dir)
filter = args.filter
parallel = args.parallel
jobs = args.jobs
guess_success = args.guess_success

from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT

llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-2024-08-06"].make_model()

pipeline = ErrorAnalysisPipeline(
exp_dir=exp_dir,
filter=filter,
episode_summarizer=EpisodeErrorSummarizer(
ChangeSummarizer(llm, AXTREE_FORMATTER), llm, guess_success=guess_success
),
)

pipeline.run_analysis(parallel=parallel, jobs=jobs)


if __name__ == "__main__":

main()
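The same pipeline can also be driven programmatically; a minimal sketch mirroring `main()` (the experiment directory and parallel settings are placeholders, the model key is the one used in `main()`):

```python
from pathlib import Path

from agentlab.analyze.error_analysis.pipeline import AXTREE_FORMATTER, ErrorAnalysisPipeline
from agentlab.analyze.error_analysis.summarizer import ChangeSummarizer, EpisodeErrorSummarizer
from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT

llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-2024-08-06"].make_model()
pipeline = ErrorAnalysisPipeline(
    exp_dir=Path("/path/to/exp_dir"),  # placeholder
    filter=None,
    episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, AXTREE_FORMATTER), llm),
)
pipeline.run_analysis(parallel=True, jobs=8)
```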
178 changes: 178 additions & 0 deletions src/agentlab/analyze/error_analysis/summarizer.py
@@ -0,0 +1,178 @@
from dataclasses import dataclass

from bgym import ExpResult, StepInfo

from agentlab.analyze.error_analysis.summarizer_prompts import (
CHANGE_SUMMARIZER_PROMPT,
ERROR_CLASSIFICATION_PROMPT,
ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT,
)
from agentlab.llm.llm_utils import json_parser, parse_html_tags
from agentlab.llm.tracking import set_tracker


def _diff(past_obs, current_obs):
"""TODO: Implement the diff function.

Returns a diff of current_obs compared to past_obs, unless there are too many changes.

Args:
past_obs: The past observation.
current_obs: The current observation.

Raises:
ValueError: Not implemented yet.
"""
raise ValueError("Not implemented yet.")


@dataclass
class ChangeSummarizer:

llm: callable # language model
obs_formatter: callable = lambda x: x.get("dom_txt", "No AXTREE available")
use_diff: bool = False

def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]) -> str:
"""Produces, a summary of the effect of an action."""
obs_message = self.obs_formatter(obs.obs)
next_obs_message = self.obs_formatter(next_obs.obs)

action = obs.action

goal = obs.obs["goal"] # Use goal object from agentlab
# TODO(thibault): switch to 'goal_object'
# Outsource everything to formatter

if self.use_diff:
next_obs_message = _diff(obs_message, next_obs_message)

return self.parse(
self.llm(
self.make_prompt(
obs_message,
action,
next_obs_message,
past_summaries,
goal,
obs.obs.get("plan", "No plan available"),
)
)["content"]
)

def make_prompt(
self, past_obs_message, action, current_obs_message, past_summaries, goal, plan
):
"""TODO: Implement the prompt."""
return CHANGE_SUMMARIZER_PROMPT.format(
goal=goal,
plan=plan,
past_observation=past_obs_message,
current_observation=current_obs_message,
past_summaries=past_summaries,
action=action,
)

def parse(self, raw_output: str) -> dict:
parsed_result = parse_html_tags(
raw_output, keys=["changeSummary", "actionAssessment", "explanation", "suggestion"]
)[0]
return parsed_result
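For reference, a sketch of a raw LLM response this `parse` method can handle (the tag names match the `keys` above; the content is invented):

```python
raw_output = """
<changeSummary>The search results page is now displayed.</changeSummary>
<actionAssessment>Good</actionAssessment>
<explanation>The click moved the episode closer to the goal.</explanation>
<suggestion>None</suggestion>
"""
# llm is not used by parse(); `print` is just a placeholder to build the instance.
summarizer = ChangeSummarizer(llm=print)
print(summarizer.parse(raw_output))
# -> {"changeSummary": "...", "actionAssessment": "Good", "explanation": "...", "suggestion": "None"}
```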


@dataclass
class EpisodeAnalysis:
analysis: str # complete analysis of the episode
summary: str # short summary of the analysis
categories: dict[str, float] # score for each category e.g. type of error or difficulty levels


@dataclass
class EpisodeSummarizer:

change_summarizer: ChangeSummarizer = None
llm: callable = None
parser: callable = lambda x: json_parser(x)[0]
guess_success: bool = False

def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...

def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
"""Run Change Summarizer for every step in the episode or extract a pre-computed one."""

if not self.guess_success:
if exp_results.steps_info[-1].reward == 1:
return {"analysis": "Success", "summaries": {}}

with set_tracker("summary") as summaries_tracker:
summaries = self.make_change_summaries(exp_results)
prompt = self.make_prompt(exp_results, summaries)

with set_tracker("analysis") as analysis_tracker:
raw_analysis = self.llm(prompt)["content"]
analysis = self.parse(raw_analysis)
res = {
"analysis": analysis,
"summaries": {i: a for i, a in enumerate(summaries)},
}
res.update(analysis_tracker.stats)
res.update(summaries_tracker.stats)
return res

def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
summaries = [] # type: list[str]
# this assumes that there is always an extra step at the end of the episode
# it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info
# TODO:(thibault) make some checks or w/e
for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
summaries.append(self.change_summarizer.summarize(step, next_step, summaries))
return summaries

def parse(self, raw_output: str) -> dict:
parsed_result = parse_html_tags(raw_output, keys=["explanation", "errorCategory"])[0]
return parsed_result


@dataclass
class EpisodeErrorSummarizer(EpisodeSummarizer):

change_summarizer: ChangeSummarizer = None

def make_prompt(self, exp_results: ExpResult, summaries: list[str]):
"""TODO: Implement the prompt."""
goal = exp_results.steps_info[0].obs["goal"]

def format_summary(summary):
res = ""
for key, value in summary.items():
res += f"{key}: {value}\n"
return res

txt_summaries = "\n".join([format_summary(summary) for summary in summaries])

actions = [step.action for step in exp_results.steps_info[:-1]]
# keep this as a list so it zips element-wise with `actions` below
action_errors = [step.obs["last_action_error"] for step in exp_results.steps_info[1:]]

txt_actions = "\n".join(
[
f"Action: {action}\nAction Error: {action_error}"
for action, action_error in zip(actions, action_errors)
]
)

extra_info = exp_results.steps_info[-1].task_info

prompt = (
ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT
if self.guess_success
else ERROR_CLASSIFICATION_PROMPT
)

return prompt.format(
goal=goal,
historical_summaries=txt_summaries,
action_history=txt_actions,
extra_info=extra_info,
)