Delegation fixes #6165

Merged — 28 commits, Jan 15, 2025
Changes from 21 commits
125 changes: 101 additions & 24 deletions .github/workflows/integration-runner.yml
@@ -51,69 +51,138 @@ jobs:
- name: Install Python dependencies using Poetry
run: poetry install --without evaluation,llama-index

- name: Configure config.toml for testing with Haiku
# Commenting out CodeActAgent Haiku tests
# - name: Configure config.toml for testing with Haiku
# env:
# LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
# LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
# LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
# MAX_ITERATIONS: 10
# run: |
# echo "[llm.eval]" > config.toml
# echo "model = \"$LLM_MODEL\"" >> config.toml
# echo "api_key = \"$LLM_API_KEY\"" >> config.toml
# echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
# echo "temperature = 0.0" >> config.toml

#- name: Build environment
# run: make build
# - name: Run integration test evaluation for Haiku
# env:
# SANDBOX_FORCE_REBUILD_RUNTIME: True
# run: |
# poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run'

# # get integration tests report
# REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
# echo "REPORT_FILE: $REPORT_FILE_HAIKU"
# echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> $GITHUB_ENV
# cat $REPORT_FILE_HAIKU >> $GITHUB_ENV
# echo >> $GITHUB_ENV
# echo "EOF" >> $GITHUB_ENV

# - name: Wait a little bit
# run: sleep 10

# Commenting out CodeActAgent DeepSeek tests
# - name: Configure config.toml for testing with DeepSeek
# env:
# LLM_MODEL: "litellm_proxy/deepseek-chat"
# LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
# LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
# MAX_ITERATIONS: 10
# run: |
# echo "[llm.eval]" > config.toml
# echo "model = \"$LLM_MODEL\"" >> config.toml
# echo "api_key = \"$LLM_API_KEY\"" >> config.toml
# echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
# echo "temperature = 0.0" >> config.toml

# - name: Run integration test evaluation for DeepSeek
# env:
# SANDBOX_FORCE_REBUILD_RUNTIME: True
# run: |
# poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run'

# # get integration tests report
# REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
# echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
# echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> $GITHUB_ENV
# cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV
# echo >> $GITHUB_ENV
# echo "EOF" >> $GITHUB_ENV

# -------------------------------------------------------------
# Run DelegatorAgent tests for Haiku, limited to t01 and t02
- name: Wait a little bit (again)
run: sleep 5

- name: Configure config.toml for testing DelegatorAgent (Haiku)
env:
LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
MAX_ITERATIONS: 30
run: |
echo "[llm.eval]" > config.toml
echo "model = \"$LLM_MODEL\"" >> config.toml
echo "api_key = \"$LLM_API_KEY\"" >> config.toml
echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
echo "temperature = 0.0" >> config.toml

- name: Build environment
run: make build

- name: Run integration test evaluation for Haiku
- name: Run integration test evaluation for DelegatorAgent (Haiku)
env:
SANDBOX_FORCE_REBUILD_RUNTIME: True
run: |
poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run'
poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_haiku_run'

# get integration tests report
REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
echo "REPORT_FILE: $REPORT_FILE_HAIKU"
echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> $GITHUB_ENV
cat $REPORT_FILE_HAIKU >> $GITHUB_ENV
# Find and export the delegator test results
REPORT_FILE_DELEGATOR_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/*haiku*_maxiter_30_N* -name "report.md" -type f | head -n 1)
echo "REPORT_FILE_DELEGATOR_HAIKU: $REPORT_FILE_DELEGATOR_HAIKU"
echo "INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU<<EOF" >> $GITHUB_ENV
cat $REPORT_FILE_DELEGATOR_HAIKU >> $GITHUB_ENV
echo >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV

- name: Wait a little bit
run: sleep 10
# -------------------------------------------------------------
# Run DelegatorAgent tests for DeepSeek, limited to t01 and t02
- name: Wait a little bit (again)
run: sleep 5

- name: Configure config.toml for testing with DeepSeek
- name: Configure config.toml for testing DelegatorAgent (DeepSeek)
env:
LLM_MODEL: "litellm_proxy/deepseek-chat"
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
MAX_ITERATIONS: 30
run: |
echo "[llm.eval]" > config.toml
echo "model = \"$LLM_MODEL\"" >> config.toml
echo "api_key = \"$LLM_API_KEY\"" >> config.toml
echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
echo "temperature = 0.0" >> config.toml

- name: Run integration test evaluation for DeepSeek
- name: Run integration test evaluation for DelegatorAgent (DeepSeek)
env:
SANDBOX_FORCE_REBUILD_RUNTIME: True
run: |
poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run'
poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_deepseek_run'

# get integration tests report
REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> $GITHUB_ENV
cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV
# Find and export the delegator test results
REPORT_FILE_DELEGATOR_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/deepseek*_maxiter_30_N* -name "report.md" -type f | head -n 1)
echo "REPORT_FILE_DELEGATOR_DEEPSEEK: $REPORT_FILE_DELEGATOR_DEEPSEEK"
echo "INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK<<EOF" >> $GITHUB_ENV
cat $REPORT_FILE_DELEGATOR_DEEPSEEK >> $GITHUB_ENV
echo >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV

- name: Create archive of evaluation outputs
run: |
TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
cd evaluation/evaluation_outputs/outputs # Change to the outputs directory
tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* # Only include the actual result directories
# uncomment me
#tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/DelegatorAgent/* # Only include the actual result directories
tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/DelegatorAgent/*

- name: Upload evaluation results as artifact
uses: actions/upload-artifact@v4
@@ -149,10 +218,18 @@ jobs:
Commit: ${{ github.sha }}
**Integration Tests Report (Haiku)**
Haiku LLM Test Results:
${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
# uncomment me
#${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
---
**Integration Tests Report (DeepSeek)**
DeepSeek LLM Test Results:
${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
# uncomment me
#${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
---
**Integration Tests Report Delegator (Haiku)**
${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU }}
---
**Integration Tests Report Delegator (DeepSeek)**
${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK }}
---
Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
27 changes: 21 additions & 6 deletions evaluation/integration_tests/run_infer.py
@@ -8,13 +8,15 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
codeact_user_response,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
update_llm_config_for_completions_logging,
)
from evaluation.utils.shared import (
codeact_user_response as fake_user_response,
)
from openhands.controller.state.state import State
from openhands.core.config import (
AgentConfig,
@@ -31,7 +33,8 @@
from openhands.utils.async_utils import call_async_from_sync

FAKE_RESPONSES = {
'CodeActAgent': codeact_user_response,
'CodeActAgent': fake_user_response,
'DelegatorAgent': fake_user_response,
}


@@ -219,7 +222,7 @@ def load_integration_tests() -> pd.DataFrame:

df = pd.read_json(output_file, lines=True, orient='records')

# record success and reason for failure for the final report
# record success and reason
df['success'] = df['test_result'].apply(lambda x: x['success'])
df['reason'] = df['test_result'].apply(lambda x: x['reason'])
logger.info('-' * 100)
@@ -234,15 +237,27 @@ def load_integration_tests() -> pd.DataFrame:
logger.info('-' * 100)

# record cost for each instance, with 3 decimal places
df['cost'] = df['metrics'].apply(lambda x: round(x['accumulated_cost'], 3))
# we sum up all the "costs" from the metrics array
df['cost'] = df['metrics'].apply(
lambda m: round(sum(c['cost'] for c in m['costs']), 3)
if m and 'costs' in m
else 0.0
)

# capture the top-level error if present, per instance
df['error_message'] = df.get('error', None)

logger.info(f'Total cost: USD {df["cost"].sum():.2f}')

report_file = os.path.join(metadata.eval_output_dir, 'report.md')
with open(report_file, 'w') as f:
f.write(
f'Success rate: {df["success"].mean():.2%} ({df["success"].sum()}/{len(df)})\n'
f'Success rate: {df["success"].mean():.2%}'
f' ({df["success"].sum()}/{len(df)})\n'
)
f.write(f'\nTotal cost: USD {df["cost"].sum():.2f}\n')
f.write(
df[['instance_id', 'success', 'reason', 'cost']].to_markdown(index=False)
df[
['instance_id', 'success', 'reason', 'cost', 'error_message']
].to_markdown(index=False)
)
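For context, a minimal sketch of the per-instance metrics shape that the new cost aggregation above assumes. The diff only shows that each metrics value is expected to contain a `costs` list of entries with a `cost` field; the sample values and the helper name here are illustrative, not taken from the PR.

# Sketch only: assumed metrics shape for the new cost aggregation.
# Each instance's metrics is expected to carry a list of per-call cost
# entries instead of a single pre-accumulated value.
metrics = {'costs': [{'cost': 0.012}, {'cost': 0.034}]}

def instance_cost(m: dict | None) -> float:
    """Sum the per-call costs, rounded to 3 decimals; missing metrics count as 0.0."""
    if m and 'costs' in m:
        return round(sum(c['cost'] for c in m['costs']), 3)
    return 0.0

print(instance_cost(metrics))  # 0.046
print(instance_cost(None))     # 0.0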
7 changes: 4 additions & 3 deletions evaluation/integration_tests/scripts/run_infer.sh
@@ -7,8 +7,9 @@ MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
EVAL_LIMIT=$4
NUM_WORKERS=$5
EVAL_IDS=$6
MAX_ITERATIONS=$5
NUM_WORKERS=$6
EVAL_IDS=$7

if [ -z "$NUM_WORKERS" ]; then
NUM_WORKERS=1
@@ -43,7 +44,7 @@ fi
COMMAND="poetry run python evaluation/integration_tests/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--max-iterations ${MAX_ITERATIONS:-10} \
--eval-num-workers $NUM_WORKERS \
--eval-note $EVAL_NOTE"

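With this change, MAX_ITERATIONS becomes the fifth positional argument, shifting NUM_WORKERS and EVAL_IDS back by one. A sketch of an invocation under the new argument order, mirroring the workflow calls above; the worker count and the trailing note value here are placeholders, and how the note argument is consumed is not shown in this hunk.

# Argument order after this change:
#   $1 MODEL_CONFIG  $2 COMMIT_HASH  $3 AGENT  $4 EVAL_LIMIT
#   $5 MAX_ITERATIONS  $6 NUM_WORKERS  $7 EVAL_IDS  (followed by the eval note)
./evaluation/integration_tests/scripts/run_infer.sh \
  llm.eval HEAD DelegatorAgent '' 30 2 \
  "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_local_run'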
7 changes: 7 additions & 0 deletions openhands/agenthub/micro/agent.py
@@ -50,6 +50,12 @@ def history_to_json(self, history: list[Event], max_events: int = 20, **kwargs):
# history is in reverse order, let's fix it
processed_history.reverse()

# everything starts with a message
# the first message is already in the prompt as the task
# so we don't need to include it in the history
if event_count < max_events:
processed_history.pop(0)

return json.dumps(processed_history, **kwargs)

def __init__(self, llm: LLM, config: AgentConfig):
@@ -62,6 +68,7 @@ def __init__(self, llm: LLM, config: AgentConfig):

def step(self, state: State) -> Action:
last_user_message, last_image_urls = state.get_current_user_intent()
print(f'MICROAGENT:step: {last_user_message}')
prompt = self.prompt_template.render(
state=state,
instructions=instructions,