|
1 |
| - |
2 |
| -from distributaur.distributaur import create_from_config |
| 1 | +from distributask.distributask import create_from_config |
3 | 2 | from .filter import read_json_in_batches
|
4 | 3 | from .worker import run_job
|
5 | 4 | from tqdm import tqdm
|
6 | 5 | import time
|
7 | 6 |
|
8 |
| - |
if __name__ == "__main__":
    # Input captions file and the number of JSON entries per worker batch.
    input_filename = "datasets/cap3d_captions.json"
    batch_size = 10000

    # Build the distributask client from environment / config settings.
    distributask = create_from_config()

    # Node-rental parameters: price ceiling per node, maximum node count,
    # the worker Docker image, and the module each rented node executes.
    max_price = 0.25
    max_nodes = 25
    docker_image = "antbaez/filter-worker:latest"
    module_name = "filtered.worker"

    redis_client = distributask.get_redis_connection()

    rented_nodes = distributask.rent_nodes(
        max_price, max_nodes, docker_image, module_name
    )
    print("Total nodes rented: ", len(rented_nodes))

    # Expose run_job so workers can invoke it by name.
    distributask.register_function(run_job)

    # Block until the operator confirms the rented workers are up.
    while input("press r when workers are ready: ") != "r":
        pass

    total_batches = 0

    print("Sending tasks")
    tasks = []
|
38 | 37 |
|
|
43 | 42 | for i in range(num_batches):
|
44 | 43 |
|
45 | 44 | batch = json_batches[i]
|
46 |
| - |
47 | 45 | total_batches += 1
|
48 |
| - task = distributaur.execute_function("run_job", { |
49 |
| - "batch_index" : total_batches, |
50 |
| - "batch" : batch |
51 |
| - }) |
| 46 | + |
| 47 | + print(total_batches) |
| 48 | + task = distributask.execute_function( |
| 49 | + "run_job", {"batch_index": total_batches, "batch": batch} |
| 50 | + ) |
52 | 51 |
|
53 | 52 | tasks.append(task)
|
54 | 53 |
|
55 | 54 | first_task_done = False
|
56 | 55 | print("Tasks sent. Starting monitoring")
|
| 56 | + |
| 57 | + inactivity_log = {node["instance_id"]: 0 for node in rented_nodes} |
| 58 | + |
| 59 | + start_time = time.time() |
57 | 60 | with tqdm(total=len(tasks), unit="task") as pbar:
|
58 | 61 | while not all(task.ready() for task in tasks):
|
| 62 | + |
59 | 63 | current_tasks = sum([task.ready() for task in tasks])
|
60 | 64 | pbar.update(current_tasks - pbar.n)
|
61 |
| - if current_tasks > 0: |
62 |
| - if not first_task_done: |
63 |
| - first_task_done = True |
64 |
| - first_task_start_time = time.time() |
65 |
| - |
66 |
| - end_time = time.time() |
67 |
| - elapsed_time = end_time - first_task_start_time |
68 |
| - time_per_tasks = elapsed_time / current_tasks |
69 |
| - time_left = time_per_tasks * (len(tasks) - current_tasks) |
70 |
| - |
71 |
| - pbar.set_postfix( |
72 |
| - elapsed=f"{elapsed_time:.2f}s", time_left=f"{time_left:.2f}" |
73 |
| - ) |
74 |
| - time.sleep(2) |
| 65 | + |
| 66 | + time.sleep(1) |
| 67 | + |
| 68 | + current_time = time.time() |
| 69 | + if current_time - start_time > 60: |
| 70 | + start_time = time.time() |
| 71 | + |
| 72 | + for node in rented_nodes: |
| 73 | + log_response = distributask.get_node_log(node) |
| 74 | + if log_response.status_code == 200: |
| 75 | + try: |
| 76 | + last_msg = log_response.text.splitlines()[-1] |
| 77 | + if ("Task complete" in last_msg and inactivity_log[node["instance_id"]] == 0): |
| 78 | + inactivity_log[node["instance_id"]] = 1 |
| 79 | + elif ("Task complete" in last_msg and inactivity_log[node["instance_id"]] == 1): |
| 80 | + distributask.terminate_nodes([node]) |
| 81 | + print("node terminated") |
| 82 | + else: |
| 83 | + inactivity_log[node["instance_id"]] == 0 |
| 84 | + except: |
| 85 | + pass |
0 commit comments