Skip to content

Commit

Permalink
Merge pull request #183 from microsoft/danmihai1/hotplug7
Browse files Browse the repository at this point in the history
 runtime: agent: use PCI segments 1+ for blk devices
  • Loading branch information
Redent0r authored Apr 30, 2024
2 parents 7ea417b + cbb60ff commit 99f1e83
Show file tree
Hide file tree
Showing 10 changed files with 191 additions and 60 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
---
# This pod fails to start if Kata debug features are enabled.
# It starts successfully without Kata debug output.
#
# The policy generated by the genpolicy tool is also too large
# and therefore gets rejected by "kubectl apply".
apiVersion: v1
kind: Pod
metadata:
name: debug-failing-many-layers
spec:
terminationGracePeriodSeconds: 0
restartPolicy: Never
runtimeClassName: kata-cc
containers:
# 29 layers
- name: azure-vote-front
image: "mcr.microsoft.com/azuredocs/azure-vote-front:v1"
command:
- sh
- "-c"
- while true; do echo hello; sleep 10; done
# 11 layers
- name: footloose
image: "quay.io/footloose/ubuntu18.04:latest"
command:
- sh
- "-c"
- while true; do echo go; sleep 25; done
# 9 layers
- name: bootloose
image: "quay.io/k0sproject/bootloose-ubuntu22.04:latest"
command:
- sh
- "-c"
- while true; do echo go; sleep 25; done
# 6 layers
- name: nginx
image: "mcr.microsoft.com/cbl-mariner/base/nginx:1.22.1-9-cm2.0.20230904-amd64"
command:
- sh
- "-c"
- while true; do echo nginx; sleep 10; done
# 5 layers
- name: redis
image: "mcr.microsoft.com/oss/bitnami/redis:6.0.8"
command:
- sh
- "-c"
- while true; do echo redis; sleep 20; done
# 4 layers
- name: go
image: "mcr.microsoft.com/appsvc/go:1.19-bullseye_20230324.1"
command:
- sh
- "-c"
- while true; do echo go; sleep 25; done
# 4 layers
- name: sysbench-kata
image: "quay.io/kata-containers/sysbench-kata:latest"
command:
- sh
- "-c"
- while true; do echo go; sleep 25; done
36 changes: 36 additions & 0 deletions src/agent/samples/policy/yaml/pod/pod-many-layers.yaml

Large diffs are not rendered by default.

28 changes: 12 additions & 16 deletions src/agent/src/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,7 @@ where
// provided.
#[instrument]
pub fn pcipath_to_sysfs(root_bus_sysfs: &str, pcipath: &pci::Path) -> Result<String> {
// Support up to 10 PCI segments.
let mut bus = "000[0-9]:00".to_string();

let mut bus = "0000:00".to_string();
let mut relpath = String::new();

for i in 0..pcipath.len() {
Expand Down Expand Up @@ -227,9 +225,8 @@ struct VirtioBlkPciMatcher {
}

impl VirtioBlkPciMatcher {
fn new(relpath: &str) -> VirtioBlkPciMatcher {
let root_bus = create_pci_root_bus_pattern();
let re = format!(r"^{}{}/virtio[0-9]+/block/", root_bus, relpath);
fn new(sysfspath: &str) -> VirtioBlkPciMatcher {
let re = format!(r"^{sysfspath}/virtio[0-9]+/block/");

VirtioBlkPciMatcher {
rex: Regex::new(&re).expect("BUG: failed to compile VirtioBlkPciMatcher regex"),
Expand All @@ -246,11 +243,10 @@ impl UeventMatcher for VirtioBlkPciMatcher {
#[instrument]
pub async fn get_virtio_blk_pci_device_name(
sandbox: &Arc<Mutex<Sandbox>>,
pcipath: &pci::Path,
pciaddr: &pci::Address,
) -> Result<String> {
let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_pattern());
let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?;
let matcher = VirtioBlkPciMatcher::new(&sysfs_rel_path);
let sysfs_path = pciaddr.get_sysfs_path();
let matcher = VirtioBlkPciMatcher::new(&sysfs_path);

let uev = wait_for_uevent(sandbox, matcher).await?;
Ok(format!("{}/{}", SYSTEM_DEV_PATH, &uev.devname))
Expand Down Expand Up @@ -350,7 +346,7 @@ struct PciMatcher {

impl PciMatcher {
fn new(relpath: &str) -> Result<PciMatcher> {
let root_bus = create_pci_root_bus_pattern();
let root_bus = create_pci_root_bus_path();
Ok(PciMatcher {
devpath: format!("{}{}", root_bus, relpath),
})
Expand All @@ -367,7 +363,7 @@ pub async fn wait_for_pci_device(
sandbox: &Arc<Mutex<Sandbox>>,
pcipath: &pci::Path,
) -> Result<pci::Address> {
let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_pattern());
let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_path());
let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?;
let matcher = PciMatcher::new(&sysfs_rel_path)?;

Expand Down Expand Up @@ -770,7 +766,7 @@ async fn virtio_blk_device_handler(
device: &Device,
sandbox: &Arc<Mutex<Sandbox>>,
) -> Result<SpecUpdate> {
let pcipath = pci::Path::from_str(&device.id)?;
let pcipath = pci::Address::from_str(&device.id)?;
let vm_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?;

Ok(DeviceInfo::new(&vm_path, true)
Expand Down Expand Up @@ -1490,7 +1486,7 @@ mod tests {
#[tokio::test]
async fn test_get_device_name() {
let devname = "vda";
let root_bus = create_pci_root_bus_pattern();
let root_bus = create_pci_root_bus_path();
let relpath = "/0000:00:0a.0/0000:03:0b.0";
let devpath = format!("{}{}/virtio4/block/{}", root_bus, relpath, devname);

Expand Down Expand Up @@ -1525,7 +1521,7 @@ mod tests {
#[tokio::test]
#[allow(clippy::redundant_clone)]
async fn test_virtio_blk_matcher() {
let root_bus = create_pci_root_bus_pattern();
let root_bus = create_pci_root_bus_path();
let devname = "vda";

let mut uev_a = crate::uevent::Uevent::default();
Expand Down Expand Up @@ -1610,7 +1606,7 @@ mod tests {
#[tokio::test]
#[allow(clippy::redundant_clone)]
async fn test_scsi_block_matcher() {
let root_bus = create_pci_root_bus_pattern();
let root_bus = create_pci_root_bus_path();
let devname = "sda";

let mut uev_a = crate::uevent::Uevent::default();
Expand Down
7 changes: 3 additions & 4 deletions src/agent/src/linux_abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@ pub const SYSFS_DIR: &str = "/sys";
target_arch = "x86_64",
target_arch = "x86"
))]
pub fn create_pci_root_bus_pattern() -> String {
// Support up to 10 PCI segments.
String::from("/devices/pci000[0-9]:00")
pub fn create_pci_root_bus_path() -> String {
String::from("/devices/pci0000:00")
}

#[cfg(target_arch = "aarch64")]
pub fn create_pci_root_bus_pattern() -> String {
pub fn create_pci_root_bus_path() -> String {
let ret = String::from("/devices/platform/4010000000.pcie/pci0000:00");

let acpi_root_bus_path = String::from("/devices/pci0000:00");
Expand Down
8 changes: 8 additions & 0 deletions src/agent/src/pci.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ impl Address {
slotfn,
}
}

pub fn get_sysfs_path(&self) -> String {
// Example: /devices/pci0001:00/0001:00:01.0
format!(
"/devices/pci{:04x}:00/{:04x}:{:02x}:{}",
self.domain, self.domain, self.bus, self.slotfn
)
}
}

impl FromStr for Address {
Expand Down
26 changes: 4 additions & 22 deletions src/agent/src/rpc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ use tokio::sync::Mutex;

use std::ffi::{CString, OsStr};
use std::fmt::Debug;
use std::io;
use std::os::unix::ffi::OsStrExt;
use std::path::Path;
use std::sync::Arc;
Expand Down Expand Up @@ -52,13 +51,12 @@ use nix::sys::{stat, statfs};
use nix::unistd::{self, Pid};
use rustjail::process::ProcessOperations;

use crate::device::{add_devices, get_virtio_blk_pci_device_name, update_env_pci};
use crate::device::{add_devices, update_env_pci};
use crate::linux_abi::*;
use crate::metrics::get_metrics;
use crate::mount::baremount;
use crate::namespace::{NSTYPEIPC, NSTYPEPID, NSTYPEUTS};
use crate::network::setup_guest_dns;
use crate::pci;
use crate::random;
use crate::sandbox::Sandbox;
use crate::storage::{add_storages, update_ephemeral_mounts, STORAGE_HANDLERS};
Expand All @@ -77,7 +75,7 @@ use tracing_opentelemetry::OpenTelemetrySpanExt;

use tracing::instrument;

use libc::{self, c_char, c_ushort, pid_t, winsize, TIOCSWINSZ};
use libc::{self, c_ushort, pid_t, winsize, TIOCSWINSZ};
use std::fs;
use std::os::unix::prelude::PermissionsExt;
use std::process::{Command, Stdio};
Expand Down Expand Up @@ -1854,24 +1852,8 @@ fn do_copy_file(req: &CopyFileRequest) -> Result<()> {
Ok(())
}

async fn do_add_swap(sandbox: &Arc<Mutex<Sandbox>>, req: &AddSwapRequest) -> Result<()> {
let mut slots = Vec::new();
for slot in &req.PCIPath {
slots.push(pci::SlotFn::new(*slot, 0)?);
}
let pcipath = pci::Path::new(slots)?;
let dev_name = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?;

let c_str = CString::new(dev_name)?;
let ret = unsafe { libc::swapon(c_str.as_ptr() as *const c_char, 0) };
if ret != 0 {
return Err(anyhow!(
"libc::swapon get error {}",
io::Error::last_os_error()
));
}

Ok(())
async fn do_add_swap(_sandbox: &Arc<Mutex<Sandbox>>, _req: &AddSwapRequest) -> Result<()> {
Err(anyhow!(nix::Error::ENOTSUP))
}

// Setup container bundle under CONTAINER_BASE, which is cleaned up
Expand Down
4 changes: 2 additions & 2 deletions src/agent/src/storage/block_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ impl StorageHandler for VirtioBlkPciHandler {
return Err(anyhow!("Invalid device {}", &storage.source));
}
} else {
let pcipath = pci::Path::from_str(&storage.source)?;
let dev_path = get_virtio_blk_pci_device_name(ctx.sandbox, &pcipath).await?;
let pciaddr = pci::Address::from_str(&storage.source)?;
let dev_path = get_virtio_blk_pci_device_name(ctx.sandbox, &pciaddr).await?;
storage.source = dev_path;
}

Expand Down
20 changes: 12 additions & 8 deletions src/runtime/virtcontainers/clh.go
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,7 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net

clh.vmconfig.Platform = chclient.NewPlatformConfig()
platform := clh.vmconfig.Platform
platform.SetNumPciSegments(2)
platform.SetNumPciSegments(10)
if clh.config.IOMMU {
platform.SetIommuSegments([]int32{0})
}
Expand All @@ -554,11 +554,6 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net
}
}

if clh.vmconfig.Platform == nil {
clh.vmconfig.Platform = chclient.NewPlatformConfig()
}
clh.vmconfig.Platform.SetNumPciSegments(10)

// Create the VM memory config via the constructor to ensure default values are properly assigned
clh.vmconfig.Memory = chclient.NewMemoryConfig(int64((utils.MemUnit(clh.config.MemorySize) * utils.MiB).ToBytes()))
// shared memory should be enabled if using vhost-user(kata uses virtiofsd)
Expand Down Expand Up @@ -898,7 +893,7 @@ func clhPciInfoToPath(pciInfo chclient.PciDeviceInfo) (types.PciPath, error) {
return types.PciPath{}, fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf)
}

return types.PciPathFromString(tokens[0])
return types.PciPathFromString(pciInfo.Bdf)
}

func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) error {
Expand Down Expand Up @@ -944,7 +939,10 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro

// Hotplug block devices on PCI segments >= 1. PCI segment 0 is used
// for the network interface, any disks present at Guest boot time, etc.
clhDisk.SetPciSegment(int32(drive.Index)/32 + 1)
// Just bus 0 of each segment is used, and up to 31 devices can be
// plugged in to each bus.
pciSegment := int32(drive.Index)/31 + 1
clhDisk.SetPciSegment(pciSegment)

pciInfo, _, err := cl.VmAddDiskPut(ctx, clhDisk)

Expand All @@ -954,6 +952,12 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro

clh.devicesIds[driveID] = pciInfo.GetId()
drive.PCIPath, err = clhPciInfoToPath(pciInfo)
clh.Logger().
WithField("bdf", pciInfo.Bdf).
WithField("index", drive.Index).
WithField("pcipath", drive.PCIPath).
WithField("segment", pciSegment).
Debug("hotplugAddBlockDevice")

return err
}
Expand Down
55 changes: 48 additions & 7 deletions src/runtime/virtcontainers/types/pcipath.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ const (
// number), giving slots 0..31
pciSlotBits = 5
maxPciSlot = (1 << pciSlotBits) - 1

pcidomainBits = 16
maxPcidomain = (1 << pcidomainBits) - 1
)

// A PciSlot describes where a PCI device sits on a single bus
Expand All @@ -25,27 +28,65 @@ const (
//
// XXX In order to support multifunction device's we'll need to extend
// this to include the PCI 3-bit function number as well.
type PciSlot struct{ slot uint8 }
type PciSlot struct {
domain uint16
slot uint8
}

func PciSlotFromString(s string) (PciSlot, error) {
v, err := strconv.ParseUint(s, 16, pciSlotBits)
tokens := strings.Split(s, ":")
if len(tokens) == 3 {
return PciSlotFromBdfString(tokens)
} else {
device, err := PciSlotFromDeviceIndexString(s)
if err != nil {
return PciSlot{}, err
}

return PciSlot{domain: 0, slot: uint8(device)}, nil
}
}

func PciSlotFromDeviceIndexString(s string) (uint64, error) {
return strconv.ParseUint(s, 16, pciSlotBits)
}

func PciSlotFromBdfString(bdfTokens []string) (PciSlot, error) {
if bdfTokens[1] != "00" {
return PciSlot{}, fmt.Errorf("Unexpected PCI bus index %q", bdfTokens[1])
}

domain, err := strconv.ParseUint(bdfTokens[0], 16, pcidomainBits)
if err != nil {
return PciSlot{}, err
}

deviceTokens := strings.Split(bdfTokens[2], ".")
if len(deviceTokens) != 2 || deviceTokens[1] != "0" || len(deviceTokens[0]) != 2 {
return PciSlot{}, fmt.Errorf("Unexpected PCI BDF device format %q", bdfTokens[2])
}

device, err := PciSlotFromDeviceIndexString(deviceTokens[0])
if err != nil {
return PciSlot{}, err
}
// The 5 bit width passed to ParseUint ensures the value is <=
// maxPciSlot
return PciSlot{slot: uint8(v)}, nil

return PciSlot{domain: uint16(domain), slot: uint8(device)}, nil
}

func PciSlotFromInt(v int) (PciSlot, error) {
if v < 0 || v > maxPciSlot {
return PciSlot{}, fmt.Errorf("PCI slot 0x%x should be in range [0..0x%x]", v, maxPciSlot)
}
return PciSlot{slot: uint8(v)}, nil
return PciSlot{domain: 0, slot: uint8(v)}, nil
}

func (slot PciSlot) String() string {
return fmt.Sprintf("%02x", slot.slot)
if slot.domain == 0 {
return fmt.Sprintf("%02x", slot.slot)
} else {
return fmt.Sprintf("%04x:00:%02x.0", slot.domain, slot.slot)
}
}

// A PciPath describes where a PCI sits in a PCI hierarchy.
Expand Down
Loading

0 comments on commit 99f1e83

Please sign in to comment.