aleph-im · nesitor · May 26, 2025 · Feb 16, 2024 · Apr 5, 2024 · Apr 5, 2024
diff --git a/.github/workflows/test-using-pytest.yml b/.github/workflows/test-using-pytest.yml
@@ -25,7 +25,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get -y upgrade
-          sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-cpuinfo python3-nftables python3-jsonschema nftables  libsystemd-dev cmake libdbus-1-dev libglib2.0-dev
+          sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-cpuinfo python3-nftables python3-jsonschema nftables  libsystemd-dev cmake libdbus-1-dev libglib2.0-dev lshw python3-jwcrypto
           pip install --upgrade typing-extensions types-PyYAML
 
       - name: Install required Python packages

diff --git a/docker/vm_supervisor-dev.dockerfile b/docker/vm_supervisor-dev.dockerfile
@@ -5,7 +5,7 @@ FROM debian:bookworm
 RUN apt-get update && apt-get -y upgrade && apt-get install -y \
     sudo acl curl squashfs-tools git \
     python3 python3-aiohttp python3-alembic python3-msgpack python3-pip python3-aiodns python3-aioredis\
-    python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging python3-cpuinfo ndppd nftables \
+    python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging ndppd nftables \
     && rm -rf /var/lib/apt/lists/*
 
 RUN useradd jailman

diff --git a/packaging/aleph-vm/DEBIAN/control b/packaging/aleph-vm/DEBIAN/control
@@ -3,6 +3,6 @@ Version: 0.1.8
 Architecture: all
 Maintainer: Aleph.im
 Description: Aleph.im VM execution engine
-Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables,python3-jwcrypto
+Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables,lshw,python3-jwcrypto
 Section: aleph-im
 Priority: Extra
diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py
@@ -0,0 +1,76 @@
+import asyncio
+import json
+import re
+import shutil
+
+import psutil
+
+from aleph.vm.utils import run_in_subprocess
+
+
+async def get_hardware_info():
+    lshw_path = shutil.which("lshw")
+    assert lshw_path, "lshw not found in PATH. apt install lshw."
+    lshw_output = await run_in_subprocess([lshw_path, "-sanitize", "-json"])
+    data = json.loads(lshw_output)
+
+    hw_info = {"cpu": None, "memory": None}
+
+    for hw in data["children"][0]["children"]:
+        if hw["id"] == "cpu":
+            hw_info["cpu"] = hw
+        elif hw["class"] == "memory" and hw["id"] == "memory":
+            hw_info["memory"] = hw
+
+    return hw_info
+
+
+def get_cpu_info(hw):
+    cpu_info = hw["cpu"]
+
+    if "x86_64" in cpu_info["capabilities"] or "x86-64" in cpu_info["capabilities"]:
+        architecture = "x86_64"
+    elif "arm64" in cpu_info["capabilities"] or "arm-64" in cpu_info["capabilities"]:
+        architecture = "arm64"
+    else:
+        architecture = None
+
+    vendor = cpu_info["vendor"]
+    # lshw vendor implementation => https://github.com/lyonel/lshw/blob/15e4ca64647ad119b69be63274e5de2696d3934f/src/core/cpuinfo.cc#L308
+
+    if "Intel Corp" in vendor:
+        vendor = "GenuineIntel"
+    elif "Advanced Micro Devices [AMD]" in vendor:
+        vendor = "AuthenticAMD"
+
+    return {
+        "architecture": architecture,
+        "vendor": vendor,
+        "model": cpu_info["product"],
+        "frequency": cpu_info["capacity"],
+        "count": psutil.cpu_count(),
+    }
+
+
+def get_memory_info(hw):
+    mem_info = hw["memory"]
+
+    memory_type = ""
+    memory_clock = ""
+    for bank in mem_info["children"]:
+        memory_clock = bank.get("clock")
+        if "description" in bank:
+            matched = re.search("(DDR[2-6])", bank["description"])
+            if matched:
+                memory_type = matched.group(0)
+                break
+            else:
+                pass
+
+    return {
+        "size": mem_info["size"],
+        "units": mem_info["units"],
+        "type": memory_type,
+        "clock": memory_clock,
+        "clock_units": "Hz" if memory_clock is not None else "",
+    }
diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py
@@ -1,19 +1,24 @@
 import math
 from datetime import datetime, timezone
-from functools import lru_cache
+from typing import Optional
 
-import cpuinfo
 import psutil
 from aiohttp import web
 from aleph_message.models import ItemHash
 from aleph_message.models.execution.environment import CpuProperties
 from pydantic import BaseModel, Field
 
 from aleph.vm.conf import settings
+from aleph.vm.orchestrator.machine import (
+    get_cpu_info,
+    get_hardware_info,
+    get_memory_info,
+)
 from aleph.vm.pool import VmPool
 from aleph.vm.resources import GpuDevice
 from aleph.vm.sevclient import SevClient
 from aleph.vm.utils import (
+    async_cache,
     check_amd_sev_es_supported,
     check_amd_sev_snp_supported,
     check_amd_sev_supported,
@@ -90,6 +95,29 @@ class MachineUsage(BaseModel):
     active: bool = True
 
 
+class ExtendedCpuProperties(CpuProperties):
+    """CPU properties."""
+
+    model: str | None = Field(default=None, description="CPU model")
+    frequency: int | None = Field(default=None, description="CPU frequency")
+    count: int | None = Field(default=None, description="CPU count")
+
+
+class MemoryProperties(BaseModel):
+    """MEMORY properties."""
+
+    size: int | None = Field(default=None, description="Memory size")
+    units: str | None = Field(default=None, description="Memory size units")
+    type: str | None = Field(default=None, description="Memory type")
+    clock: int | None = Field(default=None, description="Memory clock")
+    clock_units: str | None = Field(default=None, description="Memory clock units")
+
+
+class MachineCapability(BaseModel):
+    cpu: ExtendedCpuProperties
+    memory: MemoryProperties
+
+
 def get_machine_gpus(request: web.Request) -> GpuProperties:
     pool: VmPool = request.app["vm_pool"]
     gpus = pool.gpus
@@ -101,19 +129,22 @@ def get_machine_gpus(request: web.Request) -> GpuProperties:
     )
 
 
-@lru_cache
-def get_machine_properties() -> MachineProperties:
+machine_properties_cached = None
+
+
+@async_cache
+async def get_machine_properties() -> MachineProperties:
     """Fetch machine properties such as architecture, CPU vendor, ...
     These should not change while the supervisor is running.
 
     In the future, some properties may have to be fetched from within a VM.
     """
-    cpu_info = cpuinfo.get_cpu_info()  # Slow
-
+    hw = await get_hardware_info()
+    cpu_info = get_cpu_info(hw)
     return MachineProperties(
         cpu=CpuProperties(
-            architecture=cpu_info.get("raw_arch_string", cpu_info.get("arch_string_raw")),
-            vendor=cpu_info.get("vendor_id", cpu_info.get("vendor_id_raw")),
+            architecture=cpu_info["architecture"],
+            vendor=cpu_info["vendor"],
             features=list(
                 filter(
                     None,
@@ -128,13 +159,47 @@ def get_machine_properties() -> MachineProperties:
     )
 
 
+@async_cache
+async def get_machine_capability() -> MachineCapability:
+    hw = await get_hardware_info()
+    cpu_info = get_cpu_info(hw)
+    mem_info = get_memory_info(hw)
+
+    return MachineCapability(
+        cpu=ExtendedCpuProperties(
+            architecture=cpu_info["architecture"],
+            vendor=cpu_info["vendor"],
+            model=cpu_info["model"],
+            frequency=(cpu_info["frequency"]),
+            count=(cpu_info["count"]),
+            features=list(
+                filter(
+                    None,
+                    (
+                        "sev" if check_amd_sev_supported() else None,
+                        "sev_es" if check_amd_sev_es_supported() else None,
+                        "sev_snp" if check_amd_sev_snp_supported() else None,
+                    ),
+                )
+            ),
+        ),
+        memory=MemoryProperties(
+            size=mem_info["size"],
+            units=mem_info["units"],
+            type=mem_info["type"],
+            clock=mem_info["clock"],
+        ),
+    )
+
+
 @cors_allow_all
 async def about_system_usage(request: web.Request):
     """Public endpoint to expose information about the system usage."""
     period_start = datetime.now(timezone.utc).replace(second=0, microsecond=0)
     machine_properties = get_machine_properties()
     pool = request.app["vm_pool"]
 
+    machine_properties = await get_machine_properties()
     usage: MachineUsage = MachineUsage(
         cpu=CpuUsage(
             count=psutil.cpu_count(),
@@ -173,6 +238,13 @@ async def about_certificates(request: web.Request):
     return web.FileResponse(await sev_client.get_certificates())
 
 
+async def about_capability(_: web.Request):
+    """Public endpoint to expose information about the CRN capability."""
+
+    capability: MachineCapability = await get_machine_capability()
+    return web.json_response(text=capability.json(exclude_none=False))
+
+
 class Allocation(BaseModel):
     """An allocation is the set of resources that are currently allocated on this orchestrator.
     It contains the item_hashes of all persistent VMs, instances, on-demand VMs and jobs.

diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py
@@ -20,7 +20,7 @@
 from aleph.vm.sevclient import SevClient
 from aleph.vm.version import __version__
 
-from .resources import about_certificates, about_system_usage
+from .resources import about_capability, about_certificates, about_system_usage
 from .tasks import (
     start_payment_monitoring_task,
     start_watch_for_messages_task,
@@ -129,6 +129,7 @@ def setup_webapp(pool: VmPool | None):
         web.get("/about/executions/records", about_execution_records),
         web.get("/about/usage/system", about_system_usage),
         web.get("/about/certificates", about_certificates),
+        web.get("/about/capability", about_capability),
         web.get("/about/config", about_config),
         # /control APIs are used to control the VMs and access their logs
         web.post("/control/allocation/notify", notify_allocation),

diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py
@@ -405,7 +405,7 @@ async def reserve_resources(self, message: ExecutableContent, user):
         return expiration_date
 
     def find_resources_available_for_user(self, message: ExecutableContent, user) -> set[GpuDevice]:
-        """Find required resource to run ExecutableContent from reserved resources by user or free resources.
+        """Find the required resource to run ExecutableContent from reserved resources by user or free resources.
 
         Only implement GPU for now"""
         # Calling function should use the creation_lock to avoid resource being stollem

diff --git a/src/aleph/vm/resources.py b/src/aleph/vm/resources.py
@@ -28,7 +28,7 @@ class GpuDevice(HashableModel):
     """GPU properties."""
 
     vendor: str = Field(description="GPU vendor name")
-    model: Optional[str] = Field(description="GPU model name on Aleph Network", default=None)
+    model: str | None = Field(description="GPU model name on Aleph Network", default=None)
     device_name: str = Field(description="GPU vendor card name")
     device_class: GpuDeviceClass = Field(
         description="GPU device class. Look at https://admin.pci-ids.ucw.cz/read/PD/03"

diff --git a/src/aleph/vm/utils/__init__.py b/src/aleph/vm/utils/__init__.py
@@ -1,5 +1,6 @@
 import asyncio
 import dataclasses
+import functools
 import hashlib
 import json
 import logging
@@ -252,3 +253,17 @@ def file_hashes_differ(source: Path, destination: Path, checksum: Callable[[Path
         return True
 
     return checksum(source) != checksum(destination)
+
+
+def async_cache(fn):
+    """Simple async function cache decorator."""
+    cache = {}
+
+    @functools.wraps(fn)
+    async def wrapper(*args, **kwargs):
+        key = (args, frozenset(kwargs.items()))
+        if key not in cache:
+            cache[key] = await fn(*args, **kwargs)
+        return cache[key]
+
+    return wrapper