Skip to content

New capability endpoint to add CPU information (model, vendor, frequency) and memory details (clock, size, type) to API #801

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
2 changes: 1 addition & 1 deletion .github/workflows/test-using-pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
run: |
sudo apt-get update
sudo apt-get -y upgrade
sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-cpuinfo python3-nftables python3-jsonschema nftables libsystemd-dev cmake libdbus-1-dev libglib2.0-dev
sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-cpuinfo python3-nftables python3-jsonschema nftables libsystemd-dev cmake libdbus-1-dev libglib2.0-dev lshw python3-jwcrypto
pip install --upgrade typing-extensions types-PyYAML

- name: Install required Python packages
Expand Down
2 changes: 1 addition & 1 deletion docker/vm_supervisor-dev.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ FROM debian:bookworm
RUN apt-get update && apt-get -y upgrade && apt-get install -y \
sudo acl curl squashfs-tools git \
python3 python3-aiohttp python3-alembic python3-msgpack python3-pip python3-aiodns python3-aioredis\
python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging python3-cpuinfo ndppd nftables \
python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging ndppd nftables \
&& rm -rf /var/lib/apt/lists/*

RUN useradd jailman
Expand Down
2 changes: 1 addition & 1 deletion packaging/aleph-vm/DEBIAN/control
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ Version: 0.1.8
Architecture: all
Maintainer: Aleph.im
Description: Aleph.im VM execution engine
Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables,python3-jwcrypto
Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables,lshw,python3-jwcrypto
Section: aleph-im
Priority: Extra
76 changes: 76 additions & 0 deletions src/aleph/vm/orchestrator/machine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import asyncio
import json
import re
import shutil

import psutil

from aleph.vm.utils import run_in_subprocess


async def get_hardware_info():
lshw_path = shutil.which("lshw")
assert lshw_path, "lshw not found in PATH. apt install lshw."
lshw_output = await run_in_subprocess([lshw_path, "-sanitize", "-json"])
data = json.loads(lshw_output)

hw_info = {"cpu": None, "memory": None}

for hw in data["children"][0]["children"]:
if hw["id"] == "cpu":
hw_info["cpu"] = hw
elif hw["class"] == "memory" and hw["id"] == "memory":
hw_info["memory"] = hw

return hw_info


def get_cpu_info(hw):
cpu_info = hw["cpu"]

if "x86_64" in cpu_info["capabilities"] or "x86-64" in cpu_info["capabilities"]:
architecture = "x86_64"
elif "arm64" in cpu_info["capabilities"] or "arm-64" in cpu_info["capabilities"]:
architecture = "arm64"
else:
architecture = None

vendor = cpu_info["vendor"]
# lshw vendor implementation => https://github.com/lyonel/lshw/blob/15e4ca64647ad119b69be63274e5de2696d3934f/src/core/cpuinfo.cc#L308

if "Intel Corp" in vendor:
vendor = "GenuineIntel"
elif "Advanced Micro Devices [AMD]" in vendor:
vendor = "AuthenticAMD"

return {
"architecture": architecture,
"vendor": vendor,
"model": cpu_info["product"],
"frequency": cpu_info["capacity"],
"count": psutil.cpu_count(),
}


def get_memory_info(hw):
mem_info = hw["memory"]

memory_type = ""
memory_clock = ""
for bank in mem_info["children"]:
memory_clock = bank.get("clock")
if "description" in bank:
matched = re.search("(DDR[2-6])", bank["description"])
if matched:
memory_type = matched.group(0)
break
else:
pass

return {
"size": mem_info["size"],
"units": mem_info["units"],
"type": memory_type,
"clock": memory_clock,
"clock_units": "Hz" if memory_clock is not None else "",
}
88 changes: 80 additions & 8 deletions src/aleph/vm/orchestrator/resources.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
import math
from datetime import datetime, timezone
from functools import lru_cache
from typing import Optional

import cpuinfo
import psutil
from aiohttp import web
from aleph_message.models import ItemHash
from aleph_message.models.execution.environment import CpuProperties
from pydantic import BaseModel, Field

from aleph.vm.conf import settings
from aleph.vm.orchestrator.machine import (
get_cpu_info,
get_hardware_info,
get_memory_info,
)
from aleph.vm.pool import VmPool
from aleph.vm.resources import GpuDevice
from aleph.vm.sevclient import SevClient
from aleph.vm.utils import (
async_cache,
check_amd_sev_es_supported,
check_amd_sev_snp_supported,
check_amd_sev_supported,
Expand Down Expand Up @@ -90,6 +95,29 @@ class MachineUsage(BaseModel):
active: bool = True


class ExtendedCpuProperties(CpuProperties):
"""CPU properties."""

model: Optional[str] = Field(default=None, description="CPU model")
frequency: Optional[int] = Field(default=None, description="CPU frequency")
count: Optional[int] = Field(default=None, description="CPU count")


class MemoryProperties(BaseModel):
"""MEMORY properties."""

size: Optional[int] = Field(default=None, description="Memory size")
units: Optional[str] = Field(default=None, description="Memory size units")
type: Optional[str] = Field(default=None, description="Memory type")
clock: Optional[int] = Field(default=None, description="Memory clock")
clock_units: Optional[str] = Field(default=None, description="Memory clock units")


class MachineCapability(BaseModel):
cpu: ExtendedCpuProperties
memory: MemoryProperties


def get_machine_gpus(request: web.Request) -> GpuProperties:
pool: VmPool = request.app["vm_pool"]
gpus = pool.gpus
Expand All @@ -101,19 +129,22 @@ def get_machine_gpus(request: web.Request) -> GpuProperties:
)


@lru_cache
def get_machine_properties() -> MachineProperties:
machine_properties_cached = None


@async_cache
async def get_machine_properties() -> MachineProperties:
"""Fetch machine properties such as architecture, CPU vendor, ...
These should not change while the supervisor is running.

In the future, some properties may have to be fetched from within a VM.
"""
cpu_info = cpuinfo.get_cpu_info() # Slow

hw = await get_hardware_info()
cpu_info = get_cpu_info(hw)
return MachineProperties(
cpu=CpuProperties(
architecture=cpu_info.get("raw_arch_string", cpu_info.get("arch_string_raw")),
vendor=cpu_info.get("vendor_id", cpu_info.get("vendor_id_raw")),
architecture=cpu_info["architecture"],
vendor=cpu_info["vendor"],
features=list(
filter(
None,
Expand All @@ -128,13 +159,47 @@ def get_machine_properties() -> MachineProperties:
)


@async_cache
async def get_machine_capability() -> MachineCapability:
hw = await get_hardware_info()
cpu_info = get_cpu_info(hw)
mem_info = get_memory_info(hw)

return MachineCapability(
cpu=ExtendedCpuProperties(
architecture=cpu_info["architecture"],
vendor=cpu_info["vendor"],
model=cpu_info["model"],
frequency=(cpu_info["frequency"]),
count=(cpu_info["count"]),
features=list(
filter(
None,
(
"sev" if check_amd_sev_supported() else None,
"sev_es" if check_amd_sev_es_supported() else None,
"sev_snp" if check_amd_sev_snp_supported() else None,
),
)
),
),
memory=MemoryProperties(
size=mem_info["size"],
units=mem_info["units"],
type=mem_info["type"],
clock=mem_info["clock"],
),
)


@cors_allow_all
async def about_system_usage(request: web.Request):
"""Public endpoint to expose information about the system usage."""
period_start = datetime.now(timezone.utc).replace(second=0, microsecond=0)
machine_properties = get_machine_properties()
pool = request.app["vm_pool"]

machine_properties = await get_machine_properties()
usage: MachineUsage = MachineUsage(
cpu=CpuUsage(
count=psutil.cpu_count(),
Expand Down Expand Up @@ -173,6 +238,13 @@ async def about_certificates(request: web.Request):
return web.FileResponse(await sev_client.get_certificates())


async def about_capability(_: web.Request):
"""Public endpoint to expose information about the CRN capability."""

capability: MachineCapability = await get_machine_capability()
return web.json_response(text=capability.json(exclude_none=False))


class Allocation(BaseModel):
"""An allocation is the set of resources that are currently allocated on this orchestrator.
It contains the item_hashes of all persistent VMs, instances, on-demand VMs and jobs.
Expand Down
3 changes: 2 additions & 1 deletion src/aleph/vm/orchestrator/supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from aleph.vm.sevclient import SevClient
from aleph.vm.version import __version__

from .resources import about_certificates, about_system_usage
from .resources import about_capability, about_certificates, about_system_usage
from .tasks import (
start_payment_monitoring_task,
start_watch_for_messages_task,
Expand Down Expand Up @@ -100,6 +100,7 @@ def setup_webapp():
web.get("/about/executions/records", about_execution_records),
web.get("/about/usage/system", about_system_usage),
web.get("/about/certificates", about_certificates),
web.get("/about/capability", about_capability),
web.get("/about/config", about_config),
# /control APIs are used to control the VMs and access their logs
web.post("/control/allocation/notify", notify_allocation),
Expand Down
2 changes: 1 addition & 1 deletion src/aleph/vm/pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ async def reserve_resources(self, message: ExecutableContent, user):
return expiration_date

def find_resources_available_for_user(self, message: ExecutableContent, user) -> set[GpuDevice]:
"""Find required resource to run ExecutableContent from reserved resources by user or free resources.
"""Find the required resource to run ExecutableContent from reserved resources by user or free resources.

Only implement GPU for now"""
# Calling function should use the creation_lock to avoid resource being stollem
Expand Down
15 changes: 15 additions & 0 deletions src/aleph/vm/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import dataclasses
import functools
import hashlib
import json
import logging
Expand Down Expand Up @@ -251,3 +252,17 @@ def file_hashes_differ(source: Path, destination: Path, checksum: Callable[[Path
return True

return checksum(source) != checksum(destination)


def async_cache(fn):
"""Simple async function cache decorator."""
cache = {}

@functools.wraps(fn)
async def wrapper(*args, **kwargs):
key = (args, frozenset(kwargs.items()))
if key not in cache:
cache[key] = await fn(*args, **kwargs)
return cache[key]

return wrapper
Loading
Loading