Skip to content

Commit 7e13a79

Browse files
ahalbhearsum
authored andcommitted
feat: generate kinds in parallel across multiple processes
This is a cleaned up and slightly improved version of @ahal's original patch. Most notably, it uses `wait` to resubmit new kinds as soon as they become available (instead of waiting for all kinds in each round to be completed). This means that if a slow kind gets submitted before all other (non-downstream) kinds have been submitted, that it won't block them. In the case of Gecko, the effect of this is that the `test` kind begins to process very quickly, and all other kinds are finished processing before that has completed. Locally, this took `./mach taskgraph tasks` from 1m26s to 1m9s (measured from command start to the final "Generated xxx tasks" message. On try the results were a bit more mixed. The minimum time I observed without this patch was 140s, while the maximum was 291s (which seems to have been caused by bugbug slowness...which I'm willing to throw out). Outside of that outlier, the maximum was 146s and the mean was 143s. The minimum time I observed with this patch was 130s, while the maximum was 144s and the mean was 138s. I presume the difference in results locally vs. Try is that locally I'm on a 64-core SSD machine, and the decision tasks run on lowered powered machines on Try, so there ends up being some resource contention (I/O, I suspect, because the ProcessPoolExecutor will only run one process per CPU core) when we process kinds in parallel there. Despite this disappointing result on Try, this may still be worth taking, as `./mach taskgraph` runs twice in the critical path of many try pushes (once on a developer's machine, and again in the decision task). raw data: Over 5 runs on try I got, without this patch: 291s, 146s, 146s, 140s, 140s In each of those, there were 241s, 92s, 94s, 90s, 90s between "Loading tasks for kind test" and "Generated xxxxxx tasks for kind test" Which means we spent the following amount of time doing non-test kind things in the critical path: 50s, 54s, 52s, 50s, 50s With this patch: 130s, 141s, and 144s, 140s, 135s In each of those, there were 105s, 114s, 115s, 114s, 109s between "Loading tasks for kind test" and "Generated xxxxxx tasks for kind test" Which means we spent the following amount of time doing non-test kind things, but it was almost entirely out of the critical path: 25s, 27s, 29s, 26s, 26s
1 parent 8cc5577 commit 7e13a79

File tree

2 files changed

+139
-35
lines changed

2 files changed

+139
-35
lines changed

src/taskgraph/generator.py

Lines changed: 124 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@
55
import copy
66
import logging
77
import os
8+
import platform
9+
from concurrent.futures import (
10+
FIRST_COMPLETED,
11+
ProcessPoolExecutor,
12+
wait,
13+
)
814
from dataclasses import dataclass
915
from typing import Callable, Dict, Optional, Union
1016

@@ -46,16 +52,20 @@ def _get_loader(self) -> Callable:
4652
assert callable(loader)
4753
return loader
4854

49-
def load_tasks(self, parameters, loaded_tasks, write_artifacts):
55+
def load_tasks(self, parameters, kind_dependencies_tasks, write_artifacts):
56+
logger.debug(f"Loading tasks for kind {self.name}")
57+
58+
parameters = Parameters(**parameters)
5059
loader = self._get_loader()
5160
config = copy.deepcopy(self.config)
5261

53-
kind_dependencies = config.get("kind-dependencies", [])
54-
kind_dependencies_tasks = {
55-
task.label: task for task in loaded_tasks if task.kind in kind_dependencies
56-
}
57-
58-
inputs = loader(self.name, self.path, config, parameters, loaded_tasks)
62+
inputs = loader(
63+
self.name,
64+
self.path,
65+
config,
66+
parameters,
67+
list(kind_dependencies_tasks.values()),
68+
)
5969

6070
transforms = TransformSequence()
6171
for xform_path in config["transforms"]:
@@ -89,6 +99,7 @@ def load_tasks(self, parameters, loaded_tasks, write_artifacts):
8999
)
90100
for task_dict in transforms(trans_config, inputs)
91101
]
102+
logger.info(f"Generated {len(tasks)} tasks for kind {self.name}")
92103
return tasks
93104

94105
@classmethod
@@ -253,6 +264,101 @@ def _load_kinds(self, graph_config, target_kinds=None):
253264
except KindNotFound:
254265
continue
255266

267+
def _load_tasks_serial(self, kinds, kind_graph, parameters):
268+
all_tasks = {}
269+
for kind_name in kind_graph.visit_postorder():
270+
logger.debug(f"Loading tasks for kind {kind_name}")
271+
272+
kind = kinds.get(kind_name)
273+
if not kind:
274+
message = f'Could not find the kind "{kind_name}"\nAvailable kinds:\n'
275+
for k in sorted(kinds):
276+
message += f' - "{k}"\n'
277+
raise Exception(message)
278+
279+
try:
280+
new_tasks = kind.load_tasks(
281+
parameters,
282+
{
283+
k: t
284+
for k, t in all_tasks.items()
285+
if t.kind in kind.config.get("kind-dependencies", [])
286+
},
287+
self._write_artifacts,
288+
)
289+
except Exception:
290+
logger.exception(f"Error loading tasks for kind {kind_name}:")
291+
raise
292+
for task in new_tasks:
293+
if task.label in all_tasks:
294+
raise Exception("duplicate tasks with label " + task.label)
295+
all_tasks[task.label] = task
296+
297+
return all_tasks
298+
299+
def _load_tasks_parallel(self, kinds, kind_graph, parameters):
300+
all_tasks = {}
301+
futures_to_kind = {}
302+
futures = set()
303+
edges = set(kind_graph.edges)
304+
305+
with ProcessPoolExecutor() as executor:
306+
307+
def submit_ready_kinds():
308+
"""Create the next batch of tasks for kinds without dependencies."""
309+
nonlocal kinds, edges, futures
310+
loaded_tasks = all_tasks.copy()
311+
kinds_with_deps = {edge[0] for edge in edges}
312+
ready_kinds = (
313+
set(kinds) - kinds_with_deps - set(futures_to_kind.values())
314+
)
315+
for name in ready_kinds:
316+
kind = kinds.get(name)
317+
if not kind:
318+
message = (
319+
f'Could not find the kind "{name}"\nAvailable kinds:\n'
320+
)
321+
for k in sorted(kinds):
322+
message += f' - "{k}"\n'
323+
raise Exception(message)
324+
325+
future = executor.submit(
326+
kind.load_tasks,
327+
dict(parameters),
328+
{
329+
k: t
330+
for k, t in loaded_tasks.items()
331+
if t.kind in kind.config.get("kind-dependencies", [])
332+
},
333+
self._write_artifacts,
334+
)
335+
futures.add(future)
336+
futures_to_kind[future] = name
337+
338+
submit_ready_kinds()
339+
while futures:
340+
done, _ = wait(futures, return_when=FIRST_COMPLETED)
341+
for future in done:
342+
if exc := future.exception():
343+
executor.shutdown(wait=False, cancel_futures=True)
344+
raise exc
345+
kind = futures_to_kind.pop(future)
346+
futures.remove(future)
347+
348+
for task in future.result():
349+
if task.label in all_tasks:
350+
raise Exception("duplicate tasks with label " + task.label)
351+
all_tasks[task.label] = task
352+
353+
# Update state for next batch of futures.
354+
del kinds[kind]
355+
edges = {e for e in edges if e[1] != kind}
356+
357+
# Submit any newly unblocked kinds
358+
submit_ready_kinds()
359+
360+
return all_tasks
361+
256362
def _run(self):
257363
logger.info("Loading graph configuration.")
258364
graph_config = load_graph_config(self.root_dir)
@@ -307,31 +413,18 @@ def _run(self):
307413
)
308414

309415
logger.info("Generating full task set")
310-
all_tasks = {}
311-
for kind_name in kind_graph.visit_postorder():
312-
logger.debug(f"Loading tasks for kind {kind_name}")
313-
314-
kind = kinds.get(kind_name)
315-
if not kind:
316-
message = f'Could not find the kind "{kind_name}"\nAvailable kinds:\n'
317-
for k in sorted(kinds):
318-
message += f' - "{k}"\n'
319-
raise Exception(message)
416+
# Current parallel generation relies on multiprocessing, and forking.
417+
# This causes problems on Windows and macOS due to how new processes
418+
# are created there, and how doing so reinitializes global variables
419+
# that are modified earlier in graph generation, that doesn't get
420+
# redone in the new processes. Ideally this would be fixed, or we
421+
# would take another approach to parallel kind generation. In the
422+
# meantime, it's not supported outside of Linux.
423+
if platform.system() != "Linux":
424+
all_tasks = self._load_tasks_serial(kinds, kind_graph, parameters)
425+
else:
426+
all_tasks = self._load_tasks_parallel(kinds, kind_graph, parameters)
320427

321-
try:
322-
new_tasks = kind.load_tasks(
323-
parameters,
324-
list(all_tasks.values()),
325-
self._write_artifacts,
326-
)
327-
except Exception:
328-
logger.exception(f"Error loading tasks for kind {kind_name}:")
329-
raise
330-
for task in new_tasks:
331-
if task.label in all_tasks:
332-
raise Exception("duplicate tasks with label " + task.label)
333-
all_tasks[task.label] = task
334-
logger.info(f"Generated {len(new_tasks)} tasks for kind {kind_name}")
335428
full_task_set = TaskGraph(all_tasks, Graph(frozenset(all_tasks), frozenset()))
336429
yield self.verify("full_task_set", full_task_set, graph_config, parameters)
337430

test/test_generator.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,27 @@
33
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
44

55

6+
from concurrent.futures import ProcessPoolExecutor
7+
68
import pytest
7-
from pytest_taskgraph import FakeKind, WithFakeKind, fake_load_graph_config
9+
from pytest_taskgraph import WithFakeKind, fake_load_graph_config
810

911
from taskgraph import generator, graph
1012
from taskgraph.generator import Kind, load_tasks_for_kind, load_tasks_for_kinds
1113
from taskgraph.loader.default import loader as default_loader
1214

1315

14-
def test_kind_ordering(maketgg):
16+
class FakePPE(ProcessPoolExecutor):
17+
loaded_kinds = []
18+
19+
def submit(self, kind_load_tasks, *args):
20+
self.loaded_kinds.append(kind_load_tasks.__self__.name)
21+
return super().submit(kind_load_tasks, *args)
22+
23+
24+
def test_kind_ordering(mocker, maketgg):
1525
"When task kinds depend on each other, they are loaded in postorder"
26+
mocked_ppe = mocker.patch.object(generator, "ProcessPoolExecutor", new=FakePPE)
1627
tgg = maketgg(
1728
kinds=[
1829
("_fake3", {"kind-dependencies": ["_fake2", "_fake1"]}),
@@ -21,7 +32,7 @@ def test_kind_ordering(maketgg):
2132
]
2233
)
2334
tgg._run_until("full_task_set")
24-
assert FakeKind.loaded_kinds == ["_fake1", "_fake2", "_fake3"]
35+
assert mocked_ppe.loaded_kinds == ["_fake1", "_fake2", "_fake3"]
2536

2637

2738
def test_full_task_set(maketgg):
@@ -293,5 +304,5 @@ def test_kind_load_tasks(monkeypatch, graph_config, parameters, datadir, kind_co
293304
kind = Kind(
294305
name="fake", path="foo/bar", config=kind_config, graph_config=graph_config
295306
)
296-
tasks = kind.load_tasks(parameters, [], False)
307+
tasks = kind.load_tasks(parameters, {}, False)
297308
assert tasks

0 commit comments

Comments
 (0)