Skip to content

Commit 51a06f2

Browse files
andywag and facebook-github-bot
authored and committed
Remove crash from AIPM in torchX at end (#1042)
Summary: AIPM is crashing when closing the application. This change resolves the crash by no longer killing the apps (looked up by app_id) when the local scheduler's `close()` is called. Differential Revision: D72580570
1 parent bec9317 commit 51a06f2

File tree

2 files changed

+0
-31
lines changed

2 files changed

+0
-31
lines changed

torchx/schedulers/local_scheduler.py

-4
Original file line numberDiff line numberDiff line change
@@ -1109,10 +1109,6 @@ def _cancel_existing(self, app_id: str) -> None:
11091109
local_app.state = AppState.CANCELLED
11101110

11111111
def close(self) -> None:
1112-
# terminate all apps
1113-
for app_id, app in self._apps.items():
1114-
log.debug(f"Terminating app: {app_id}")
1115-
app.kill()
11161112
# delete logdir if torchx created a log dir
11171113
if self._base_log_dir and self._created_tmp_log_dir:
11181114
shutil.rmtree(self._base_log_dir, ignore_errors=True)

torchx/schedulers/test/local_scheduler_test.py

-27
Original file line numberDiff line numberDiff line change
@@ -1112,33 +1112,6 @@ def test_get_cuda_devices_not_set(self, _: MagicMock) -> None:
11121112
self.assertFalse(ENV_CUDA_VISIBLE_DEVICES in role_params[2].env)
11131113
self.assertFalse(ENV_CUDA_VISIBLE_DEVICES in role_params[3].env)
11141114

1115-
def test_no_orphan_process_function(self) -> None:
1116-
self._test_orphan_workflow()
1117-
1118-
def _test_orphan_workflow(self) -> None:
1119-
mp_queue = mp.Queue()
1120-
child_nproc = 2
1121-
1122-
proc = mp.Process(
1123-
target=start_sleep_processes, args=(self.test_dir, mp_queue, child_nproc)
1124-
)
1125-
proc.start()
1126-
total_processes = child_nproc + 1
1127-
pids = []
1128-
for _ in range(total_processes):
1129-
pids.append(mp_queue.get(timeout=5))
1130-
parent_pid = pids[0]
1131-
child_pids = pids[1:]
1132-
1133-
os.kill(parent_pid, signal.SIGTERM)
1134-
# Wait to give time for signal handlers to finish work
1135-
time.sleep(5)
1136-
for child_pid in child_pids:
1137-
# Killing parent should kill all children, we expect that each call to
1138-
# os.kill would raise OSError
1139-
with self.assertRaises(OSError):
1140-
os.kill(child_pid, 0)
1141-
11421115

11431116
class JoinPATHTest(unittest.TestCase):
11441117
def test_join_PATH(self) -> None:

0 commit comments

Comments
 (0)