Skip to content

Retry instances not running add new Execution list endpoint #793

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
May 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 1 addition & 31 deletions src/aleph/vm/controllers/firecracker/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,7 @@
from aleph.vm.hypervisors.firecracker.microvm import setfacl
from aleph.vm.network.interfaces import TapInterface
from aleph.vm.storage import create_devmapper, create_volume_file
from aleph.vm.utils import (
HostNotFoundError,
NotEnoughDiskSpaceError,
check_disk_space,
ping,
run_in_subprocess,
)
from aleph.vm.utils import NotEnoughDiskSpaceError, check_disk_space, run_in_subprocess

from .executable import (
AlephFirecrackerExecutable,
Expand Down Expand Up @@ -120,30 +114,6 @@ async def setup(self):
),
)

async def wait_for_init(self) -> None:
    """Wait for the init process of the instance to be ready.

    Polls the VM's IP with single-packet pings until one succeeds,
    re-raising HostNotFoundError after the final failed attempt.
    """
    assert self.enable_networking and self.tap_interface, f"Network not enabled for VM {self.vm_id}"

    host_ip = self.get_ip()
    if not host_ip:
        msg = "Host IP not available"
        raise ValueError(msg)

    # Strip a possible CIDR suffix (e.g. "10.0.0.2/24" -> "10.0.0.2").
    address = host_ip.split("/", 1)[0]

    max_attempts = 30
    ping_timeout = 2

    last_index = max_attempts - 1
    for attempt_index in range(max_attempts):
        try:
            await ping(address, packets=1, timeout=ping_timeout)
        except HostNotFoundError:
            if attempt_index == last_index:
                raise
        else:
            return

async def create_snapshot(self) -> CompressedDiskVolumeSnapshot:
"""Create a VM snapshot"""
volume_path = await create_volume_file(self.resources.message_content.rootfs, self.resources.namespace)
Expand Down
31 changes: 1 addition & 30 deletions src/aleph/vm/controllers/qemu/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import json
import logging
import shutil
from asyncio import Task
from asyncio.subprocess import Process
from pathlib import Path
from typing import Generic, TypeVar
Expand Down Expand Up @@ -32,7 +31,7 @@
from aleph.vm.network.interfaces import TapInterface
from aleph.vm.resources import HostGPU
from aleph.vm.storage import get_rootfs_base_path
from aleph.vm.utils import HostNotFoundError, ping, run_in_subprocess
from aleph.vm.utils import run_in_subprocess

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -232,41 +231,13 @@ async def start(self):
# Start via systemd not here
raise NotImplementedError()

async def wait_for_init(self) -> None:
    """Wait for the init process of the instance to be ready.

    Repeatedly pings the guest IP; gives up (re-raising the last
    HostNotFoundError) once all attempts are exhausted.
    """
    assert self.enable_networking and self.tap_interface, f"Network not enabled for VM {self.vm_id}"

    raw_ip = self.get_ip()
    if not raw_ip:
        msg = "Host IP not available"
        raise ValueError(msg)
    # Drop any "/prefix" CIDR part before pinging.
    guest_ip = raw_ip.split("/", 1)[0]

    total_attempts = 30
    per_ping_timeout = 2

    remaining = total_attempts
    while remaining:
        remaining -= 1
        try:
            await ping(guest_ip, packets=1, timeout=per_ping_timeout)
            return
        except HostNotFoundError:
            if not remaining:
                raise

async def start_guest_api(self):
    # No-op: QEMU instances do not start a guest API process here;
    # presumably kept to satisfy the executable interface — confirm.
    pass

async def stop_guest_api(self):
    # No-op counterpart of start_guest_api: nothing was started,
    # so there is nothing to stop.
    pass

print_task: Task | None = None

async def teardown(self):
if self.print_task:
self.print_task.cancel()

if self.enable_networking:
teardown_nftables_for_vm(self.vm_id)
if self.tap_interface:
Expand Down
2 changes: 1 addition & 1 deletion src/aleph/vm/hypervisors/firecracker/microvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@
except asyncio.TimeoutError:
# In Python < 3.11 wait_closed() was broken and returned immediatly
# It is supposedly fixed in Python 3.12.1, but it hangs indefinitely during tests.
logger.info("f{self} unix socket closing timeout")
logger.info("%s unix socket closing timeout", self)

Check warning on line 511 in src/aleph/vm/hypervisors/firecracker/microvm.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/hypervisors/firecracker/microvm.py#L511

Added line #L511 was not covered by tests

logger.debug("Removing files")
if self.config_file_path:
Expand Down
79 changes: 72 additions & 7 deletions src/aleph/vm/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
from aleph.vm.orchestrator.vm import AlephFirecrackerInstance
from aleph.vm.resources import GpuDevice, HostGPU
from aleph.vm.systemd import SystemDManager
from aleph.vm.utils import create_task_log_exceptions, dumps_for_json
from aleph.vm.utils import create_task_log_exceptions, dumps_for_json, is_pinging

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -86,12 +86,25 @@
stop_event: asyncio.Event
expire_task: asyncio.Task | None = None
update_task: asyncio.Task | None = None
init_task: asyncio.Task | None

snapshot_manager: SnapshotManager | None
systemd_manager: SystemDManager | None

persistent: bool = False

@property
def is_starting(self) -> bool:
    """True while the VM is booting: a start was initiated but neither
    a successful start nor a stop has been recorded yet."""
    times = self.times
    boot_initiated = times.starting_at
    boot_finished_or_stopping = times.started_at or times.stopping_at
    return bool(boot_initiated and not boot_finished_or_stopping)

Check warning on line 98 in src/aleph/vm/models.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/models.py#L98

Added line #L98 was not covered by tests

@property
def is_controller_running(self):
    """Whether the systemd controller service is active.

    Returns None when the execution is not persistent or no systemd
    manager is attached, i.e. when the question does not apply.
    """
    if not (self.persistent and self.systemd_manager):
        return None
    return self.systemd_manager.is_service_active(self.controller_service)

@property
def is_running(self) -> bool:
return (
Expand Down Expand Up @@ -162,6 +175,7 @@
systemd_manager: SystemDManager | None,
persistent: bool,
):
self.init_task = None
self.uuid = uuid.uuid1() # uuid1() includes the hardware address and timestamp
self.vm_hash = vm_hash
self.message = message
Expand Down Expand Up @@ -322,25 +336,71 @@
await self.vm.start_guest_api()

# Start VM and snapshots automatically
# If the execution is confidential, don't start it because we need to wait for the session certificate
# files, use the endpoint /control/machine/{ref}/confidential/initialize to get session files and start the VM
# If the execution is a confidential instance, it is start later in the process when the session certificate
# files are received from the client via the endpoint /control/machine/{ref}/confidential/initialize endpoint
if self.persistent and not self.is_confidential and self.systemd_manager:
self.systemd_manager.enable_and_start(self.controller_service)
await self.wait_for_init()
if self.is_program and self.vm:
await self.systemd_manager.enable_and_start(self.controller_service)

if self.is_program:
await self.wait_for_init()

Check warning on line 345 in src/aleph/vm/models.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/models.py#L345

Added line #L345 was not covered by tests
await self.vm.load_configuration()
self.times.started_at = datetime.now(tz=timezone.utc)

Check warning on line 347 in src/aleph/vm/models.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/models.py#L347

Added line #L347 was not covered by tests
else:
self.init_task = asyncio.create_task(self.non_blocking_wait_for_boot())

if self.vm and self.vm.support_snapshot and self.snapshot_manager:
await self.snapshot_manager.start_for(vm=self.vm)

self.times.started_at = datetime.now(tz=timezone.utc)
self.ready_event.set()
await self.save()
except Exception:
logger.exception("%s error during start, tearing down", self)

Check warning on line 357 in src/aleph/vm/models.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/models.py#L357

Added line #L357 was not covered by tests
await self.vm.teardown()
await self.vm.stop_guest_api()
raise

async def wait_for_persistent_boot(self):
    """Wait until the VM has booted.

    Boot is detected by the guest answering a ping; before each ping
    attempt we also check that the controller service is still running,
    so a crashed VM fails fast instead of waiting out all attempts.

    Raises:
        ValueError: if the host IP is not available.
        RuntimeError: if the controller process stops while waiting.
        TimeoutError: if the guest never answers after all attempts.
    """
    assert self.vm
    assert self.vm.enable_networking and self.vm.tap_interface, f"Network not enabled for VM {self.vm.vm_id}"
    ip = self.vm.get_ip()
    if not ip:
        msg = "Host IP not available"
        raise ValueError(msg)

    # Strip a possible CIDR suffix (e.g. "10.0.0.2/24" -> "10.0.0.2").
    ip = ip.split("/", 1)[0]
    max_attempts = 30
    timeout_seconds = 2
    for _attempt in range(max_attempts):
        if not self.is_controller_running:
            # Fix: use the module logger, not the root `logging` module.
            logger.error("%s process stopped running while waiting for boot", self)
            raise RuntimeError("Process is not running")

        if await is_pinging(ip, packets=1, timeout=timeout_seconds):
            return

    logger.error("%s has not responded to ping after %s attempts", self, max_attempts)
    # TimeoutError is a subclass of Exception, so existing callers that
    # catch Exception (e.g. non_blocking_wait_for_boot) still work.
    raise TimeoutError(f"VM did not answer ping after {max_attempts} attempts")

async def non_blocking_wait_for_boot(self) -> bool:
    """Wait until the VM responds to ping and mark it as started, or
    stop it (releasing its resources) if it fails to boot.

    Runs as a background task for instances (programs wait inline).

    Returns:
        True if the VM booted and was marked as started, False otherwise.
    """
    assert self.vm
    try:
        await self.wait_for_persistent_boot()
    except Exception as e:
        # Broad catch on purpose: this runs as a detached task, and any
        # boot failure must result in a clean stop, not a lost exception.
        logger.warning("%s failed to respond to ping or is not running, stopping it: %s", self, e)
        await self.stop()
        return False
    logger.info("%s responded to ping. Marking it as started.", self)
    self.times.started_at = datetime.now(tz=timezone.utc)
    # TODO(review): the original left `await self.save()` commented out
    # (and unreachable after `return`) — confirm whether the new state
    # should be persisted here.
    return True

Check warning on line 402 in src/aleph/vm/models.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/models.py#L398-L402

Added lines #L398 - L402 were not covered by tests

async def wait_for_init(self):
assert self.vm, "The VM attribute has to be set before calling wait_for_init()"
await self.vm.wait_for_init()
Expand Down Expand Up @@ -383,12 +443,15 @@
async def stop(self) -> None:
"""Stop the VM and release resources"""
assert self.vm, "The VM attribute has to be set before calling stop()"
logger.info("%s stopping", self)

# Prevent concurrent calls to stop() using a Lock
async with self.stop_pending_lock:
if self.times.stopped_at is not None:
logger.debug(f"VM={self.vm.vm_id} already stopped")
return
if self.persistent and self.systemd_manager:
self.systemd_manager.stop_and_disable(self.controller_service)
self.times.stopping_at = datetime.now(tz=timezone.utc)
await self.all_runs_complete()
await self.record_usage()
Expand All @@ -400,6 +463,7 @@
if self.vm.support_snapshot and self.snapshot_manager:
await self.snapshot_manager.stop_for(self.vm_hash)
self.stop_event.set()
logger.info("%s stopped", self)

def start_watching_for_updates(self, pubsub: PubSub):
if not self.update_task:
Expand Down Expand Up @@ -430,6 +494,7 @@
await self.runs_done_event.wait()

async def save(self):
"""Save to DB"""
assert self.vm, "The VM attribute has to be set before calling save()"

pid_info = self.vm.to_dict() if self.vm else None
Expand Down
2 changes: 1 addition & 1 deletion src/aleph/vm/network/firewall.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def execute_json_nft_commands(commands: list[dict]) -> int:
logger.debug("Inserting nftables rules")
return_code, output, error = nft.json_cmd(commands_dict)
if return_code != 0:
logger.error(f"Failed to add nftables rules: {error}")
logger.error("Failed to add nftables rules: %s -- %s", error, json.dumps(commands, indent=4))

return return_code

Expand Down
4 changes: 2 additions & 2 deletions src/aleph/vm/orchestrator/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@
bench: list[float] = []

loop = asyncio.get_event_loop()
pool = VmPool(loop)
pool = VmPool()

Check warning on line 191 in src/aleph/vm/orchestrator/cli.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/cli.py#L191

Added line #L191 was not covered by tests
await pool.setup()

# Does not make sense in benchmarks
Expand Down Expand Up @@ -246,7 +246,7 @@
"""Run instances from a list of message identifiers."""
logger.info(f"Instances to run: {instances}")
loop = asyncio.get_event_loop()
pool = VmPool(loop)
pool = VmPool()

Check warning on line 249 in src/aleph/vm/orchestrator/cli.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/cli.py#L249

Added line #L249 was not covered by tests
# The main program uses a singleton pubsub instance in order to watch for updates.
# We create another instance here since that singleton is not initialized yet.
# Watching for updates on this instance will therefore not work.
Expand Down
27 changes: 14 additions & 13 deletions src/aleph/vm/orchestrator/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,13 +258,24 @@


async def start_persistent_vm(vm_hash: ItemHash, pubsub: PubSub | None, pool: VmPool) -> VmExecution:
execution: VmExecution | None = pool.get_running_vm(vm_hash=vm_hash)
execution: VmExecution | None = pool.executions.get(vm_hash)

Check warning on line 261 in src/aleph/vm/orchestrator/run.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/run.py#L261

Added line #L261 was not covered by tests
if execution:
if execution.is_running:
logger.info(f"{vm_hash} is already running")

Check warning on line 264 in src/aleph/vm/orchestrator/run.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/run.py#L264

Added line #L264 was not covered by tests
elif execution.is_starting:
logger.info(f"{vm_hash} is already starting")

Check warning on line 266 in src/aleph/vm/orchestrator/run.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/run.py#L266

Added line #L266 was not covered by tests
elif execution.is_stopping:
logger.info(f"{vm_hash} is stopping, waiting for complete stop before restarting")
await execution.stop_event.wait()
execution = None

Check warning on line 270 in src/aleph/vm/orchestrator/run.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/run.py#L268-L270

Added lines #L268 - L270 were not covered by tests
else:
logger.info(f"{vm_hash} unknown execution state, stopping the vm")
await execution.stop()
execution = None

Check warning on line 274 in src/aleph/vm/orchestrator/run.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/run.py#L272-L274

Added lines #L272 - L274 were not covered by tests

if not execution:
logger.info(f"Starting persistent virtual machine with id: {vm_hash}")
execution = await create_vm_execution(vm_hash=vm_hash, pool=pool, persistent=True)
else:
logger.info(f"{vm_hash} is already running")

await execution.becomes_ready()

Expand All @@ -276,13 +287,3 @@
execution.start_watching_for_updates(pubsub=pubsub)

return execution


async def stop_persistent_vm(vm_hash: ItemHash, pool: VmPool) -> VmExecution | None:
logger.info(f"Stopping persistent VM {vm_hash}")
execution = pool.get_running_vm(vm_hash)

if execution:
await execution.stop()

return execution
18 changes: 13 additions & 5 deletions src/aleph/vm/orchestrator/supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
about_executions,
about_login,
list_executions,
list_executions_v2,
notify_allocation,
operate_reserve_resources,
run_code_from_hostname,
Expand Down Expand Up @@ -94,14 +95,19 @@
response.headers["Server"] = f"aleph-vm/{__version__}"


async def http_not_found(request: web.Request):  # noqa: ARG001
    """Catch-all handler: return a 404 response for any unknown URL."""
    response = web.HTTPNotFound()
    return response


def setup_webapp():
def setup_webapp(pool: VmPool | None):
"""Create the webapp and set the VmPool

Only case where VmPool is None is in some tests that won't use it.
"""
app = web.Application(middlewares=[error_middleware])
app.on_response_prepare.append(on_prepare_server_version)
app["vm_pool"] = pool
cors = setup(
app,
defaults={
Expand All @@ -118,6 +124,7 @@
# /about APIs return information about the VM Orchestrator
web.get("/about/login", about_login),
web.get("/about/executions/list", list_executions),
web.get("/v2/about/executions/list", list_executions_v2),
web.get("/about/executions/details", about_executions),
web.get("/about/executions/records", about_execution_records),
web.get("/about/usage/system", about_system_usage),
Expand Down Expand Up @@ -175,18 +182,19 @@
settings.check()

loop = asyncio.new_event_loop()
pool = VmPool(loop)
pool = VmPool()

Check warning on line 185 in src/aleph/vm/orchestrator/supervisor.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/supervisor.py#L185

Added line #L185 was not covered by tests
asyncio.run(pool.setup())

hostname = settings.DOMAIN_NAME
protocol = "http" if hostname == "localhost" else "https"

# Require a random token to access /about APIs
secret_token = token_urlsafe(nbytes=32)
app = setup_webapp()
(settings.EXECUTION_ROOT / "login_token").write_text(secret_token)
(settings.EXECUTION_ROOT / "login_token").chmod(0o400)
app = setup_webapp(pool=pool)

Check warning on line 195 in src/aleph/vm/orchestrator/supervisor.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/orchestrator/supervisor.py#L193-L195

Added lines #L193 - L195 were not covered by tests
# Store app singletons. Note that app["pubsub"] will also be created.
app["secret_token"] = secret_token
app["vm_pool"] = pool

# Store sevctl app singleton only if confidential feature is enabled
if settings.ENABLE_CONFIDENTIAL_COMPUTING:
Expand Down
Loading