Skip to content

Commit

Permalink
update: data construction
Browse files Browse the repository at this point in the history
  • Loading branch information
QiushiSun committed Mar 6, 2025
1 parent 2324c1d commit 060d19a
Show file tree
Hide file tree
Showing 4 changed files with 325 additions and 0 deletions.
Binary file modified .DS_Store
Binary file not shown.
18 changes: 18 additions & 0 deletions collection/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# README

In addition to the synthesized data we release, we provide the scripts used to collect data from the environment for Reverse Task Synthesis, so that you can extend OS-Genesis to more scenarios or synthesize more data as needed.

# Mobile

## Reverse Task Synthesis

Work in progress.

## Trajectory Construction

1. Install the AndroidWorld Environment as described in: https://github.com/google-research/android_world
2. Move the scripts to the AndroidWorld directory: ``android_env/android_world``
3. Run the following command to collect the data:
```bash
python mobile_runner.py
```
297 changes: 297 additions & 0 deletions collection/mobile_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
# Copyright 2024 The android_world Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Build trajectories of OS-Genesis.
"""

from collections.abc import Sequence
import os
import random
from typing import Type
import json
import time
import uuid
from PIL import Image
import numpy as np
import re
import pickle

from absl import app
from absl import flags
from absl import logging
from android_world import registry
from android_world.agents import infer
from android_world.agents import t3a, m3a
# from android_world.agents import m3a_origin
from android_world.agents import m3a_utils
from android_world.agents.t3a import _generate_ui_elements_description_list_full
from android_world.env import env_launcher, json_action
from android_world.task_evals import task_eval

# Limit absl logging output to WARNING and above.
logging.set_verbosity(logging.WARNING)

# Quieten gRPC through its environment variables.
# NOTE(review): these are assigned after the android_world imports above;
# confirm gRPC still honours them when they are set at this point.
os.environ['GRPC_VERBOSITY'] = 'ERROR' # Only show errors
os.environ['GRPC_TRACE'] = 'none' # Disable tracing


def _find_adb_directory() -> str:
    """Locate the adb executable in the usual Android SDK install locations.

    Despite the name, the value returned is the full path of the `adb` file
    itself, not the directory that contains it.

    Returns:
        The first candidate path that exists as a regular file.

    Raises:
        EnvironmentError: If none of the candidate paths exists.
    """
    sdk_locations = (
        '~/Library/Android/sdk/platform-tools/adb',
        '~/Android/Sdk/platform-tools/adb',
    )
    for location in sdk_locations:
        candidate = os.path.expanduser(location)
        if os.path.isfile(candidate):
            return candidate

    raise EnvironmentError(
        'adb not found in the common Android SDK paths. Please install Android'
        " SDK and ensure adb is in one of the expected directories. If it's"
        ' already installed, point to the installed location.'
    )


# Command-line flags. Each DEFINE_* call registers the flag with absl and
# returns a holder whose `.value` is read later in `_main`.

# Location of the adb binary; the default is looked up when this module is
# imported, so the lookup (and its possible EnvironmentError) happens before
# any flag is parsed.
_ADB_PATH = flags.DEFINE_string(
    name='adb_path',
    default=_find_adb_directory(),
    help='Path to adb. Set if not installed through SDK.',
)

# One-off emulator provisioning switch.
_EMULATOR_SETUP = flags.DEFINE_boolean(
    name='perform_emulator_setup',
    default=False,
    help=(
        'Whether to perform emulator setup. This must be done once and only once'
        ' before running Android World. After an emulator is setup, this flag'
        ' should always be False.'
    ),
)

# Console port of the target device.
_DEVICE_CONSOLE_PORT = flags.DEFINE_integer(
    name='console_port',
    default=5554,
    help=(
        'The console port of the running Android device. This can usually be'
        ' retrieved by looking at the output of `adb devices`. In general, the'
        ' first connected device is port 5554, the second is 5556, and'
        ' so on.'
    ),
)

# Optional task selector.
_TASK = flags.DEFINE_string(
    name='task',
    default=None,
    help='A specific task to run.',
)


def save_image(image, directory):
    """Save an image under a fresh UUID-based name and return that name.

    Args:
        image: Either a NumPy array (converted to uint8 and wrapped in a PIL
            image first) or any object exposing a `save(path)` method.
        directory: Directory the PNG file is written into.

    Returns:
        The generated file name (`<uuid4>.png`), without the directory part.
    """
    if isinstance(image, np.ndarray):
        image = Image.fromarray(np.uint8(image))

    file_name = f"{uuid.uuid4()}.png"
    image.save(os.path.join(directory, file_name))
    return file_name


def get_state(env_state, logical_screen_size, ui_elements):
    """Return the current screenshot and a textual description of the UI.

    Args:
        env_state: State object whose `pixels` attribute holds the screen
            array.
        logical_screen_size: Passed through to the description helper.
        ui_elements: UI elements to describe.

    Returns:
        A `(screen, element_list_text)` tuple: a PIL image built from a uint8
        copy of `env_state.pixels`, and the value returned by
        `_generate_ui_elements_description_list_full`.
    """
    description = _generate_ui_elements_description_list_full(
        ui_elements, logical_screen_size
    )
    # Work on a copy so the array held by env_state is left untouched.
    pixels = env_state.pixels.copy()
    screenshot = Image.fromarray(pixels.astype('uint8'))
    return screenshot, description


def element_to_identifier(element):
    """Flatten a UI element into a plain dict of its identifying attributes.

    Every attribute is read with a `None` fallback, so objects that lack some
    (or all) of the fields are accepted. `bbox_pixels` is expanded into a dict
    of its four edges when present and truthy, otherwise stored as `None`.

    Args:
        element: Object carrying UI-element attributes.

    Returns:
        A dict with one entry per known attribute, in a fixed key order.
    """
    def read(name):
        return getattr(element, name, None)

    box = read('bbox_pixels')
    if box:
        box_dict = {
            'x_min': box.x_min,
            'x_max': box.x_max,
            'y_min': box.y_min,
            'y_max': box.y_max,
        }
    else:
        box_dict = None

    # Attribute names are split around 'bbox_pixels' to keep the key order.
    before_bbox = ('resource_id', 'text', 'content_description', 'class_name')
    after_bbox = (
        'hint_text',
        'is_checkable',
        'is_enabled',
        'is_visible',
        'is_clickable',
        'is_editable',
        'is_focused',
        'is_focusable',
        'is_long_clickable',
        'is_scrollable',
        'is_selected',
        'package_name',
        'resource_name',
    )

    identifier = {name: read(name) for name in before_bbox}
    identifier['bbox_pixels'] = box_dict
    for name in after_bbox:
        identifier[name] = read(name)
    return identifier


def _write_instructions(path, instructions) -> None:
    """Write the instruction list to `path` as JSON and close the file."""
    # Closing the handle explicitly guarantees the annotations are flushed
    # before the process exits or is restarted.
    with open(path, 'w') as f:
        json.dump(instructions, f)


def _main() -> None:
    """Run the M3A agent on each pending instruction and record its trajectory.

    Reads `./aw_instructions.json`, and for every entry that has neither a
    `gpt_traj` nor a `task_fail` key: resets the environment, initialises the
    task from pickled parameters in `./params_new`, opens the target app and
    lets the agent take up to 15 steps. Screenshots go to
    `./screenshots_gpt_v2`; the per-step records are stored under `gpt_traj`
    and the whole instruction file is rewritten after each task.

    If anything raises while a task is being processed, the entry is marked
    with `task_fail`, the file is saved, and the function returns after a
    short pause so that an outer script can restart it. Errors during
    environment setup are not caught.
    """
    instruction_path = './aw_instructions.json'
    with open(instruction_path, 'r') as f:
        aw_instructions = json.load(f)

    SCREEN_GPT_DIR = './screenshots_gpt_v2'
    os.makedirs(SCREEN_GPT_DIR, exist_ok=True)

    # Initialize Env.
    env = env_launcher.load_and_setup_env(
        console_port=_DEVICE_CONSOLE_PORT.value,
        emulator_setup=_EMULATOR_SETUP.value,
        adb_path=_ADB_PATH.value,
    )
    env_launcher.verify_api_level(env)

    # Drop failure markers from earlier runs so those tasks are tried again.
    # NOTE(review): a task that fails on every attempt will therefore be
    # retried on every restart - confirm this is intended.
    for task_item in aw_instructions:
        task_item.pop("task_fail", None)

    max_steps = 15

    for task_item in aw_instructions:

        total_tasks = len(aw_instructions)
        annotated_tasks = sum("gpt_traj" in item for item in aw_instructions)
        print(f"Total task: {total_tasks} --- Annotated task: {annotated_tasks}")
        failed_tasks = sum("task_fail" in item for item in aw_instructions)
        print(f"Total task: {total_tasks} --- Failed task: {failed_tasks}")
        if "gpt_traj" in task_item or "task_fail" in task_item:
            continue

        try:
            env.reset(go_home=True)
            task_registry = registry.TaskRegistry()
            aw_registry = task_registry.get_registry(task_registry.ANDROID_WORLD_FAMILY)

            # Initialize based on the task sampled and open the corresponding app.
            app_name = task_item["app_name"]
            task_name = task_item["task_name"] if "task_name" in task_item else task_item["task_task"]
            instruction = task_item["refine_task"]

            task_type: Type[task_eval.TaskEval]
            if task_name and task_name != "default":
                if task_name not in aw_registry:
                    # Report the name that was actually looked up; the --task
                    # flag is not involved in this lookup.
                    raise ValueError(f'Task {task_name} not found in registry.')
                task_type = aw_registry[task_name]
            else:
                task_type = random.choice(list(aw_registry.values()))
                print("unknown task name")
                # Blocks until the operator presses Enter.
                input()
            print(task_type)

            # Load the task parameters saved for this task id.
            task_id = task_item["task_id"]
            params_dir = './params_new'
            params_path = os.path.join(params_dir, task_id + "_params.pkl")
            # SECURITY: unpickling can execute arbitrary code; only load
            # parameter files that you generated yourself.
            with open(params_path, 'rb') as f:
                params = pickle.load(f)
            print(params)

            task = task_type(params)
            task.initialize_task(env)
            agent = m3a.M3A(env, infer.Gpt4Wrapper('gpt-4o-2024-08-06'))

            # Open the corresponding app after initializing the task.
            open_app_action = {"action_type": "open_app", "app_name": app_name}
            converted_action = json_action.JSONAction(**open_app_action)
            agent.env.execute_action(converted_action)
            time.sleep(3.0)

            print('Goal: ' + str(instruction))
            gpt_traj = []
            for _ in range(max_steps):

                # Obtain the state of the environment before execution to
                # synchronize with our training setup.
                env_state = agent.get_post_transition_state()
                logical_screen_size = agent.env.logical_screen_size
                ui_elements = env_state.ui_elements
                screen, element_list_text = get_state(env_state, logical_screen_size, ui_elements)
                screen_before = save_image(screen, SCREEN_GPT_DIR)
                # Note: following the implementation of M3A, only the elements
                # that pass the same validation the agent applies are saved,
                # so the actions generated by the model can locate the
                # corresponding elements.
                ui_elements_before_identifiers = [
                    element_to_identifier(elem)
                    for elem in ui_elements
                    if m3a_utils.validate_ui_element(elem, logical_screen_size)
                ]

                # Take one step.
                response = agent.step(instruction)

                # Extract the screen, prompt, and generated action from the response.
                screen_before_som = save_image(response.data["before_screenshot_with_som"], SCREEN_GPT_DIR)
                action_prompt = response.data["action_prompt"]
                action_output = response.data["action_output"]
                action_reason = response.data["action_reason"]
                summary_prompt = response.data["summary_prompt"]
                summary = response.data["summary"]

                match = re.search(r'Action:\s*(\{.*\})', action_output)
                action_json = match.group(1) if match else "action_not_match"

                # Exit if the same action is performed three times
                # consecutively. One record is appended per completed step, so
                # the last two entries are the two preceding steps. The
                # repeated third step is not recorded.
                if app_name != "Simple Calendar Pro":
                    if (
                        len(gpt_traj) >= 2
                        and action_json == gpt_traj[-1]["action_json"] == gpt_traj[-2]["action_json"]
                    ):
                        break

                gpt_traj.append({
                    "screen_before": screen_before,
                    "screen_before_som": screen_before_som,
                    "ui_elements_before_text": element_list_text,
                    "ui_elements_before": ui_elements_before_identifiers,
                    "action_prompt": action_prompt,
                    "action_output": action_output,
                    "action_json": action_json,
                    "action_reason": action_reason,
                    "summary_prompt": summary_prompt,
                    "summary": summary,
                })

                if response.done:
                    break

            # Update the annotations to the original aw_instructions file at
            # the end of each trajectory.
            task_item["gpt_traj"] = gpt_traj
            _write_instructions(instruction_path, aw_instructions)

        except Exception as e:
            print(f"An error occurred: {e}")
            task_item["task_fail"] = "fail"
            _write_instructions(instruction_path, aw_instructions)
            time.sleep(10)
            break  # Exit and restart after an error occurs.


def main(argv: Sequence[str]) -> None:
    """absl entry point: discard the positional arguments and run `_main`."""
    del argv  # Positional command-line arguments are not used.
    _main()


# Let absl parse the flags and then invoke `main` when run as a script.
if __name__ == '__main__':
    app.run(main)
10 changes: 10 additions & 0 deletions collection/run_mobile_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import os
import time


def run_gpt_task(script: str = "mobile_runner.py") -> int:
    """Run the collection script once in a child Python process.

    The script is launched with the interpreter that is running this file.
    Handing the bare file name to the shell, as was done before, makes the
    shell look for a program called `mobile_runner.py` and never invokes
    Python on it.

    Args:
        script: Path of the Python script to run.

    Returns:
        The exit code of the child process.
    """
    import subprocess
    import sys

    # check=False: a non-zero exit is expected and must not stop the relaunch loop.
    return subprocess.run([sys.executable, script], check=False).returncode


if __name__ == "__main__":
    # mobile_runner.py returns after the first failing task (or once every
    # task has been visited), so keep relaunching it with a pause in between.
    while True:
        run_gpt_task()
        time.sleep(20)

0 comments on commit 060d19a

Please sign in to comment.