Update CPU and memory details by switching to lshw method instead of cpuinfo

aliel · aliel · commit 54ff2608f565 · 2024-02-16T11:58:33.000+01:00
Add CPU information (model, vendor, frequency) and memory details (clock, size, type) to API
diff --git a/.github/workflows/code-quality.yml b/.github/workflows/code-quality.yml
@@ -16,7 +16,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get -y upgrade
-          sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-cpuinfo python3-nftables python3-jsonschema
+          sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-nftables python3-jsonschema lshw
           pip install --upgrade typing-extensions types-PyYAML
 
       - name: Install required Python packages
diff --git a/docker/vm_supervisor-dev.dockerfile b/docker/vm_supervisor-dev.dockerfile
@@ -5,7 +5,7 @@ FROM debian:bullseye
 RUN apt-get update && apt-get -y upgrade && apt-get install -y \
     sudo acl curl squashfs-tools git \
     python3 python3-aiohttp python3-alembic python3-msgpack python3-pip python3-aiodns python3-aioredis\
-    python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging python3-cpuinfo ndppd \
+    python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging ndppd lshw \
     && rm -rf /var/lib/apt/lists/*
 
 RUN useradd jailman
diff --git a/packaging/aleph-vm/DEBIAN/control b/packaging/aleph-vm/DEBIAN/control
@@ -3,6 +3,6 @@ Version: 0.1.8
 Architecture: all
 Maintainer: Aleph.im
 Description: Aleph.im VM execution engine
-Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs
+Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,lshw
 Section: aleph-im
 Priority: Extra
diff --git a/packaging/requirements-debian-11.txt b/packaging/requirements-debian-11.txt
@@ -17,7 +17,6 @@ multidict==5.1.0
 git+https://salsa.debian.org/pkg-netfilter-team/pkg-nftables#egg=nftables&subdirectory=py
 packaging==20.9
 psutil==5.8.0
-py-cpuinfo==5.0.0
 pycares==3.1.1
 pyparsing==2.4.7
 pyrsistent==0.15.5
diff --git a/packaging/requirements-ubuntu-20.04.txt b/packaging/requirements-ubuntu-20.04.txt
@@ -18,7 +18,6 @@ multidict==4.7.3
 git+https://salsa.debian.org/pkg-netfilter-team/pkg-nftables#egg=nftables&subdirectory=py
 packaging==20.3
 psutil==5.5.1
-py-cpuinfo==5.0.0
 pycares==3.1.1
 PyGObject==3.36.0
 pyparsing==2.4.6
diff --git a/packaging/requirements-ubuntu-22.04.txt b/packaging/requirements-ubuntu-22.04.txt
@@ -21,7 +21,6 @@ multidict==5.1.0
 git+https://salsa.debian.org/pkg-netfilter-team/pkg-nftables#egg=nftables&subdirectory=py
 packaging==21.3
 psutil==5.9.0
-py-cpuinfo==5.0.0
 pycares==4.1.2
 PyGObject==3.42.1
 pyparsing==2.4.7
diff --git a/pyproject.toml b/pyproject.toml
@@ -35,7 +35,6 @@ dependencies = [
   "sentry-sdk==1.31.0",
   "aioredis==1.3.1",
   "psutil==5.9.5",
-  "py-cpuinfo==9.0.0",
   "schedule==1.2.1",
   "nftables @ git+https://salsa.debian.org/pkg-netfilter-team/pkg-nftables#egg=nftables&subdirectory=py",
   "msgpack==1.0.7",
diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py
@@ -0,0 +1,77 @@
+import json
+import re
+import subprocess
+from functools import lru_cache
+
+
+@lru_cache
+def get_hardware_info():
+    """Run ``lshw`` and return a dict holding its "cpu" and "memory" hardware nodes.
+
+    The result is memoized by lru_cache, so lshw is not re-run on later calls.
+    """
+    lshw = subprocess.run(["lshw", "-sanitize", "-json"], stdout=subprocess.PIPE, shell=False)
+    data = json.loads(lshw.stdout)
+
+    hw_info = {}
+    # The nodes of interest are read from the first child of the root node.
+    for hw in data["children"][0]["children"]:
+        if hw["id"] == "cpu":
+            hw_info["cpu"] = hw
+        elif hw["class"] == "memory" and hw["id"] == "memory":
+            hw_info["memory"] = hw
+
+    return hw_info
+
+
+@lru_cache
+def get_cpu_info():
+    """Return CPU architecture, vendor, model and frequency taken from the lshw "cpu" node."""
+    cpu_info = get_hardware_info()["cpu"]
+    capabilities = cpu_info.get("capabilities", {})
+
+    # Default to the lshw "width" value when no known 64-bit capability is listed.
+    architecture = cpu_info["width"]
+    if "x86_64" in capabilities or "x86-64" in capabilities:
+        architecture = "x86_64"
+    elif "arm64" in capabilities or "arm-64" in capabilities:
+        architecture = "arm64"
+
+    # lshw vendor implementation => https://github.com/lyonel/lshw/blob/15e4ca64647ad119b69be63274e5de2696d3934f/src/core/cpuinfo.cc#L308
+    vendor = cpu_info["vendor"]
+    if "Intel Corp" in vendor:
+        vendor = "GenuineIntel"
+    elif "Advanced Micro Devices [AMD]" in vendor:
+        vendor = "AuthenticAMD"
+
+    return {
+        "architecture": architecture,
+        "vendor": vendor,
+        "model": cpu_info.get("product"),  # None when the node has no "product"
+        "frequency": cpu_info.get("capacity"),  # None when the node has no "capacity"
+    }
+
+
+@lru_cache
+def get_memory_info():
+    """Return memory size/units and the DDR type and clock read from the lshw memory banks."""
+    mem_info = get_hardware_info()["memory"]
+
+    memory_type = ""
+    memory_clock = ""
+
+    # Tolerate banks without "clock"/"description" (or no "children" at all) instead of raising.
+    for bank in mem_info.get("children", []):
+        memory_clock = bank.get("clock", memory_clock)
+        match = re.search(r"(DDR[2-6])", bank.get("description", ""))
+        if match:
+            memory_type = match.group(0)
+            break
+
+    return {
+        "size": mem_info["size"],
+        "units": mem_info["units"],
+        "type": memory_type,
+        "clock": memory_clock,
+        "clock_units": "Hz",
+    }
diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py
@@ -3,15 +3,14 @@
 from functools import lru_cache
 from typing import Optional
 
-import cpuinfo
 import psutil
 from aiohttp import web
+from aleph.vm.conf import settings
+from aleph.vm.orchestrator.machine import get_cpu_info, get_memory_info
 from aleph_message.models import ItemHash
 from aleph_message.models.execution.environment import CpuProperties
 from pydantic import BaseModel, Field
 
-from aleph.vm.conf import settings
-
 
 class Period(BaseModel):
     datetime: datetime
@@ -76,18 +75,63 @@ class MachineUsage(BaseModel):
     active: bool = True
 
 
+class ExtendedCpuProperties(CpuProperties):
+    """CpuProperties extended with the optional CPU model and frequency."""
+
+    model: Optional[str] = Field(default=None, description="CPU model")
+    frequency: Optional[str] = Field(default=None, description="CPU frequency")
+
+
+class MemoryProperties(BaseModel):
+    """Memory size, type and clock of the machine; every field is optional and defaults to None."""
+
+    size: Optional[str] = Field(default=None, description="Memory size")
+    units: Optional[str] = Field(default=None, description="Memory size units")
+    type: Optional[str] = Field(default=None, description="Memory type")
+    clock: Optional[str] = Field(default=None, description="Memory clock")
+    clock_units: Optional[str] = Field(default=None, description="Memory clock units")
+
+
+class MachineCapability(BaseModel):
+    cpu: ExtendedCpuProperties  # architecture and vendor plus model and frequency
+    memory: MemoryProperties  # size, units, type, clock and clock units
+
+
 @lru_cache
 def get_machine_properties() -> MachineProperties:
     """Fetch machine properties such as architecture, CPU vendor, ...
     These should not change while the supervisor is running.
 
     In the future, some properties may have to be fetched from within a VM.
     """
-    cpu_info = cpuinfo.get_cpu_info()  # Slow
+
+    cpu_info = get_cpu_info()
     return MachineProperties(
         cpu=CpuProperties(
-            architecture=cpu_info["raw_arch_string"],
-            vendor=cpu_info["vendor_id"],
+            architecture=cpu_info["architecture"],
+            vendor=cpu_info["vendor"],
+        ),
+    )
+
+
+@lru_cache  # later calls return the cached MachineCapability
+def get_machine_capability() -> MachineCapability:
+    cpu_info = get_cpu_info()  # keys read below: architecture, vendor, model, frequency
+    mem_info = get_memory_info()  # keys read below: size, units, type, clock, clock_units
+
+    return MachineCapability(
+        cpu=ExtendedCpuProperties(
+            architecture=cpu_info["architecture"],
+            vendor=cpu_info["vendor"],
+            model=cpu_info["model"],
+            frequency=cpu_info["frequency"],
+        ),
+        memory=MemoryProperties(
+            size=mem_info["size"],
+            units=mem_info["units"],
+            type=mem_info["type"],
+            clock=mem_info["clock"],
+            clock_units=mem_info["clock_units"],
         ),
     )
 
@@ -119,6 +163,13 @@ async def about_system_usage(_: web.Request):
     return web.json_response(text=usage.json(exclude_none=True), headers={"Access-Control-Allow-Origin:": "*"})
 
 
+async def about_capability(_: web.Request):
+    """Public endpoint returning the CRN capability (CPU and memory) as JSON, with CORS open to any origin."""
+
+    capability: MachineCapability = get_machine_capability()
+    return web.json_response(text=capability.json(exclude_none=False), headers={"Access-Control-Allow-Origin": "*"})
+
+
 class Allocation(BaseModel):
     """An allocation is the set of resources that are currently allocated on this orchestrator.
     It contains the item_hashes of all persistent VMs, instances, on-demand VMs and jobs.
diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py
@@ -14,13 +14,12 @@
 from typing import Callable
 
 from aiohttp import web
-
 from aleph.vm.conf import settings
 from aleph.vm.pool import VmPool
 from aleph.vm.version import __version__
 
 from .metrics import create_tables, setup_engine
-from .resources import about_system_usage
+from .resources import about_capability, about_system_usage
 from .tasks import (
     start_payment_monitoring_task,
     start_watch_for_messages_task,
@@ -93,6 +92,7 @@ async def allow_cors_on_endpoint(request: web.Request):
         web.get("/about/executions/records", about_execution_records),
         web.get("/about/usage/system", about_system_usage),
         web.get("/about/config", about_config),
+        web.get("/about/capability", about_capability),
         # /control APIs are used to control the VMs and access their logs
         web.post("/control/allocations", update_allocations),
         web.post("/control/allocation/notify", notify_allocation),