update: data construction

OS-Copilot · Mar 6, 2025 · 060d19a · 060d19a
1 parent 2324c1d
commit 060d19a
Show file tree

Hide file tree

Showing 4 changed files with 325 additions and 0 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/collection/README.md b/collection/README.md
@@ -0,0 +1,18 @@
+# README
+
+In addition to the collected-synthesized data, we here provide scripts collected from the environment for Reverse Task Synthesis, helping you extend OS-Genesis to more scenarios or synthesize more data as desired.
+
+# Mobile
+
+## Reverse Task Synthesis
+
+Work in progress.
+
+## Trajectory Construction
+
+1. Install the AndroidWorld Environment as described in: https://github.com/google-research/android_world
+2. Move the scripts to the AndroidWorld directory: ``android_env/android_world``
+3. Run the following command to collect the data:
+```bash
+python mobile_runner.py
+```
diff --git a/collection/mobile_runner.py b/collection/mobile_runner.py
@@ -0,0 +1,297 @@
+# Copyright 2024 The android_world Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Build trajectories of OS-Genesis.
+
+"""
+
+from collections.abc import Sequence
+import os
+import random
+from typing import Type
+import json
+import time
+import uuid
+from PIL import Image
+import numpy as np
+import re
+import pickle
+
+from absl import app
+from absl import flags
+from absl import logging
+from android_world import registry
+from android_world.agents import infer
+from android_world.agents import t3a, m3a
+# from android_world.agents import m3a_origin
+from android_world.agents import m3a_utils
+from android_world.agents.t3a import _generate_ui_elements_description_list_full
+from android_world.env import env_launcher, json_action
+from android_world.task_evals import task_eval
+
+logging.set_verbosity(logging.WARNING)
+
+os.environ['GRPC_VERBOSITY'] = 'ERROR'  # Only show errors
+os.environ['GRPC_TRACE'] = 'none'  # Disable tracing
+
+
+def _find_adb_directory() -> str:
+  """Returns the directory where adb is located."""
+  potential_paths = [
+      os.path.expanduser('~/Library/Android/sdk/platform-tools/adb'),
+      os.path.expanduser('~/Android/Sdk/platform-tools/adb'),
+  ]
+  for path in potential_paths:
+    if os.path.isfile(path):
+      return path
+  raise EnvironmentError(
+      'adb not found in the common Android SDK paths. Please install Android'
+      " SDK and ensure adb is in one of the expected directories. If it's"
+      ' already installed, point to the installed location.'
+  )
+
+
+_ADB_PATH = flags.DEFINE_string(
+    'adb_path',
+    _find_adb_directory(),
+    'Path to adb. Set if not installed through SDK.',
+)
+_EMULATOR_SETUP = flags.DEFINE_boolean(
+    'perform_emulator_setup',
+    False,
+    'Whether to perform emulator setup. This must be done once and only once'
+    ' before running Android World. After an emulator is setup, this flag'
+    ' should always be False.',
+)
+_DEVICE_CONSOLE_PORT = flags.DEFINE_integer(
+    'console_port',
+    5554,
+    'The console port of the running Android device. This can usually be'
+    ' retrieved by looking at the output of `adb devices`. In general, the'
+    ' first connected device is port 5554, the second is 5556, and'
+    ' so on.',
+)
+
+_TASK = flags.DEFINE_string(
+    'task',
+    None,
+    'A specific task to run.',
+)
+
+
+def save_image(image, directory):
+    """Same image to a file and return the file name."""
+    unique_id = str(uuid.uuid4())
+    image_name = f"{unique_id}.png"
+    image_path = os.path.join(directory, image_name)
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(np.uint8(image))
+    image.save(image_path)
+    return image_name
+
+
+def get_state(env_state, logical_screen_size, ui_elements):
+    element_list_text = _generate_ui_elements_description_list_full(
+        ui_elements,
+        logical_screen_size,
+    )
+    screen = env_state.pixels.copy()
+    screen = Image.fromarray(screen.astype('uint8'))
+    return screen, element_list_text
+
+
+def element_to_identifier(element):
+    """Converts an element to a JSON-serializable identifier."""
+    bbox = getattr(element, 'bbox_pixels', None)
+    bbox_dict = {'x_min': bbox.x_min, 'x_max': bbox.x_max, 'y_min': bbox.y_min, 'y_max': bbox.y_max} if bbox else None
+    identifier = {
+        'resource_id': getattr(element, 'resource_id', None),
+        'text': getattr(element, 'text', None),
+        'content_description': getattr(element, 'content_description', None),
+        'class_name': getattr(element, 'class_name', None),
+        'bbox_pixels': bbox_dict,
+        'hint_text': getattr(element, 'hint_text', None),
+        'is_checkable': getattr(element, 'is_checkable', None),
+        'is_enabled': getattr(element, 'is_enabled', None),
+        'is_visible': getattr(element, 'is_visible', None),
+        'is_clickable': getattr(element, 'is_clickable', None),
+        'is_editable': getattr(element, 'is_editable', None),
+        'is_focused': getattr(element, 'is_focused', None),
+        'is_focusable': getattr(element, 'is_focusable', None),
+        'is_long_clickable': getattr(element, 'is_long_clickable', None),
+        'is_scrollable': getattr(element, 'is_scrollable', None),
+        'is_selected': getattr(element, 'is_selected', None),
+        'package_name': getattr(element, 'package_name', None),
+        'resource_name': getattr(element, 'resource_name', None),
+    }
+    return identifier
+
+
+def _main() -> None:
+
+  instruction_path = './aw_instructions.json'
+  aw_instrcutions = json.load(open(instruction_path, 'r'))
+
+  SCREEN_GPT_DIR = './screenshots_gpt_v2'
+  if not os.path.exists(SCREEN_GPT_DIR):
+      os.mkdir(SCREEN_GPT_DIR)
+
+  """Initialize Env."""
+  env = env_launcher.load_and_setup_env(
+      console_port=_DEVICE_CONSOLE_PORT.value,
+      emulator_setup=_EMULATOR_SETUP.value,
+      adb_path=_ADB_PATH.value,
+  )
+  env_launcher.verify_api_level(env)
+
+  for task_item in aw_instrcutions:
+      if "task_fail" in task_item:
+          del task_item["task_fail"]
+
+  for task_item in aw_instrcutions:
+
+      total_tasks = len(aw_instrcutions)
+      annotated_tasks = len([item for item in aw_instrcutions if "gpt_traj" in item])
+      print(f"Total task: {total_tasks} --- Annotated task: {annotated_tasks}")
+      failed_tasks = len([item for item in aw_instrcutions if "task_fail" in item])
+      print(f"Total task: {total_tasks} --- Failed task: {failed_tasks}")
+      if "gpt_traj" in task_item or "task_fail" in task_item:
+          continue
+
+      try:
+          env.reset(go_home=True)
+          task_registry = registry.TaskRegistry()
+          aw_registry = task_registry.get_registry(task_registry.ANDROID_WORLD_FAMILY)
+
+          # Initialize based on the task sampled and open the corresponding app.
+          app_name = task_item["app_name"]
+          task_name = task_item["task_name"] if "task_name" in task_item else task_item["task_task"]
+          instrcution = task_item["refine_task"]
+
+          if task_name and task_name != "default":
+            if task_name not in aw_registry:
+              raise ValueError('Task {} not found in registry.'.format(_TASK.value))
+            task_type: Type[task_eval.TaskEval] = aw_registry[task_name]
+          else:
+            task_type: Type[task_eval.TaskEval] = random.choice(
+                list(aw_registry.values())
+            )
+            print("unknown task name")
+            input()
+          print(task_type)
+
+          # load params
+          task_id = task_item["task_id"]
+          params_dir = './params_new'
+          params_path = os.path.join(params_dir, task_id + "_params.pkl")
+          with open(params_path, 'rb') as f:
+              params = pickle.load(f)
+          print(params)
+          #params = task_type.generate_random_params()
+
+          task = task_type(params)
+
+          task.initialize_task(env)
+          # agent = t3a.T3A(env, infer.Gpt4Wrapper('gpt-4-turbo-2024-04-09'))
+        #   agent = m3a_origin.M3A(env, infer.Gpt4Wrapper('gpt-4o-2024-08-06'))
+          agent = m3a.M3A(env, infer.Gpt4Wrapper('gpt-4o-2024-08-06'))
+
+
+          # Open the corresponding app after initializing the task.
+          open_app = True
+          if open_app:
+              open_app_action = {"action_type": "open_app", "app_name": app_name}
+              converted_action = json_action.JSONAction(**open_app_action)
+              agent.env.execute_action(converted_action)
+              time.sleep(3.0)
+
+          print('Goal: ' + str(instrcution))
+          is_done = False
+          gpt_traj = []
+          for i, _ in enumerate(range(15)):
+
+            # Obtain the state of the environment before execution to synchronize with our training setup.
+            env_state = agent.get_post_transition_state()
+            logical_screen_size = agent.env.logical_screen_size
+            ui_elements = env_state.ui_elements
+            screen, element_list_text = get_state(env_state, logical_screen_size, ui_elements)
+            screen_before = save_image(screen, SCREEN_GPT_DIR)
+            # Note: Here, following the implementation of M3A, a state interface representation consistent with the agent’s observation is saved, ensuring that the actions generated by the model can locate the corresponding elements.
+            ui_elements_before_identifiers = [element_to_identifier(elem) for elem in ui_elements if m3a_utils.validate_ui_element(elem, logical_screen_size)]
+
+            # take one step
+            response = agent.step(instrcution)
+
+            # Extract the screen, prompt, and generated action from the response.
+            screen_before_som = save_image(response.data["before_screenshot_with_som"], SCREEN_GPT_DIR)
+            action_prompt = response.data["action_prompt"]
+            action_output = response.data["action_output"]
+            action_reason = response.data["action_reason"]
+            summary_prompt = response.data["summary_prompt"]
+            summary = response.data["summary"]
+
+            match = re.search(r'Action:\s*(\{.*\})', action_output)
+            action_json = match.group(1) if match else "action_not_match"
+            # Exit if the same action is performed three times consecutively.
+            if app_name != "Simple Calendar Pro":
+                if i >= 2 and (action_json == gpt_traj[i-1]["action_json"] == gpt_traj[i-2]["action_json"]):
+                    break
+
+            step_data = {
+                "screen_before": screen_before,
+                "screen_before_som": screen_before_som,
+                "ui_elements_before_text": element_list_text,
+                "ui_elements_before": ui_elements_before_identifiers,
+                "action_prompt": action_prompt,
+                "action_output": action_output,
+                "action_json": action_json,
+                "action_reason": action_reason,
+                "summary_prompt": summary_prompt,
+                "summary": summary
+            }
+            gpt_traj.append(step_data)
+
+            if response.done:
+              is_done = True
+              break
+
+          """
+          agent_successful = is_done and task.is_successful(env) == 1
+          print(
+              f'{"Task Successful ✅" if agent_successful else "Task Failed ❌"};'
+              f' {task.goal}'
+          )
+          """
+
+          # env.close()
+
+          # Update the annotations to the original aw_instructions file at the end of each trajectory.
+          task_item["gpt_traj"] = gpt_traj
+          json.dump(aw_instrcutions, open(instruction_path, 'w'))
+
+      except Exception as e:
+          print(f"An error occurred: {e}")
+          task_item["task_fail"] = "fail"
+          json.dump(aw_instrcutions, open(instruction_path, 'w'))
+          time.sleep(10)
+          break     # Exit and restart after an error occurs.
+
+
+def main(argv: Sequence[str]) -> None:
+  del argv
+  _main()
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/collection/run_mobile_runner.py b/collection/run_mobile_runner.py
@@ -0,0 +1,10 @@
+import os
+import time
+
+
+def run_gpt_task():
+    os.system("mobile_runner.py")
+
+while True:
+    run_gpt_task()
+    time.sleep(20)