Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

packer-rocm/repos: +dir for overrides, bool for GA-optout #59

Merged
merged 9 commits into from
Oct 24, 2024
4 changes: 3 additions & 1 deletion packer-rocm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,11 @@ Remove `-K` if your account does _not_ require a passphrase for `sudo`. This is
| `hidden` | If the VNC window for the VM is _hidden_ during build. Adjustment brings _display_ requirements.<br/>**Default:** `True` |
| `packer_binary` | The name _or_ path for the _Packer_ binary. Installation skipped when changed.<br/>**Default:** `/usr/bin/packer` |
| `qemu_binary` | The name _or_ path for the _QEMU_ binary.<br/>**Default:** `qemu-system-x86_64` |
| `rocm_repos` | Whether the _'rocm'_ and _'amdgpu'_ repositories are created by the role. _Opt out_ when providing overrides in the 'repositories' directory.<br/>**Default:** `True` |
| `rocm_releases` | One or more versions to include _[comma-separated]_. Newest selects the `amdgpu` driver.<br/>**Default:** `6.2.2` |
| `rocm_amdgpu_pkgs` | Comma-separated string of `amdgpu` driver/firmware packages to install. May specify releases.<br/>**Default:** `amdgpu-dkms` |
| `rocm_kernel` | The kernel package with an optional release specifier.<br/>**Default:** `linux-image-generic-hwe-22.04` |
| `rocm_extras` | Packages to install _before_ `amdgpu-dkms` and _ROCm_. Comma-separated list.<br/>**Default:** _linux-headers-generic-hwe-22.04,linux-image-extra-virtual-hwe-22.04,mesa-amdgpu-va-drivers_ |
| `rocm_extras` | Packages to install _before_ `rocm_amdgpu_pkgs` and _ROCm_. Comma-separated list.<br/>**Default:** _linux-headers-generic-hwe-22.04,linux-image-extra-virtual-hwe-22.04,mesa-amdgpu-va-drivers_ |
| `rocm_filename` | The name of the output file/artifact _(tarball)_<br/>**Default:** `ubuntu-rocm.tar.gz` |
| `rocm_installed` | Whether _ROCm_ multi-release packages are installed. The `amdgpu` driver and extras are always installed.<br/>**Default:** `False` |
| `rocm_builder_cpus` | Number of virtual CPUs given to the builder VM.<br/>**Default:** _4_ |
Expand Down
2 changes: 1 addition & 1 deletion packer-rocm/playbooks/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@
ansible.builtin.command:
cmd: >
{{ packer_binary }} build
{% for _var in (packer_vars + ['hidden', 'http_directory', 'http_proxy', 'https_proxy', 'no_proxy', 'ubuntu_release']) if vars[_var] is defined %}
{% for _var in (packer_vars + ['http_directory', 'http_proxy', 'https_proxy', 'no_proxy', 'ubuntu_release']) if vars[_var] is defined %}
{{ '-var ' + _var + '=' + vars[_var] }}
{% endfor %}
-only=qemu.rocm .
Expand Down
62 changes: 58 additions & 4 deletions packer-rocm/playbooks/os_prep.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,68 @@
---
# vim: ft=yaml.ansible
- name: OS Preparation # the Packer 'file' provisioner runs this before other plays
gather_facts: false
# This play runs before any others [by Packer, as provisioners]. Provides package repository and RDMA device naming overrides
- name: OS Preparation
hosts: default
become: true
vars: # change these with '-e var=...'
os_rdma_rename: true # accepts loosely 'true' or 'false' values. ie: 0, 1, yes, no
os_rdma_rename_mode: 'NAME_KERNEL'
# one may find 'NAME_FALLBACK'/device-driven naming inconsistency with certain packages installed
# these two vars control whether devices are renamed and, if so, the mode/pattern to use. 'os_rdma_rename' loosely accepts boolean-like values, e.g. 0, 1, yes, no.
os_rdma_rename: true
os_rdma_rename_mode: 'NAME_KERNEL' # ref: https://github.com/linux-rdma/rdma-core/blob/master/Documentation/udev.md#stable-names
# *fully qualified* path on the controller where repo overrides can be found; intended for/changed by Packer
os_repos_src: /srv/repos
tasks:

# Scratch directory on the managed host; the result (including its path) is
# registered as 'mktemp' for the tasks that follow.
- name: Create temporary directory for repository overrides
  register: mktemp
  ansible.builtin.tempfile:
    suffix: repos
    state: directory

# Pushes the contents of 'os_repos_src' into the temporary directory so the
# override files can be searched for on the managed host.
- name: Mirror repository overrides (for searching on managed host w/ facts)
  ansible.posix.synchronize:
    mode: push
    # the trailing '/' on 'src' is significant: the *contents* of the path are copied
    src: "{{ os_repos_src }}/"
    dest: "{{ mktemp.path }}"
    archive: true  # retain ownership/modes/etc
    recursive: true

# Recursively finds regular files in the temporary directory whose names match
# the pattern for the managed host's OS family; results land in 'os_repo_search'.
- name: "Search for repository overrides in '{{ mktemp.path }}'"
  vars:
    # filename patterns, keyed by 'ansible_os_family'
    os_repo_patts:
      Debian: ['*.list']
      RedHat: ['*.repo']
  ansible.builtin.find:
    paths: ["{{ mktemp.path }}"]
    patterns: "{{ os_repo_patts[ansible_os_family] }}"
    file_type: file
    recurse: true
  register: os_repo_search

# Installs each file found by the search into the package-manager directory
# for the managed host's OS family, owned by root.
- name: 'Copy [relevant] repository files'
  vars:
    # destination directory, keyed by 'ansible_os_family'
    os_repo_paths:
      RedHat: /etc/yum.repos.d/
      Debian: /etc/apt/sources.list.d/
  loop: "{{ os_repo_search.files | map(attribute='path') }}"
  loop_control:
    loop_var: repofile
  become: true
  ansible.builtin.copy:
    remote_src: true  # the source paths come from the search of the temporary directory
    src: "{{ repofile }}"
    dest: "{{ os_repo_paths[ansible_os_family] }}"
    owner: root
    group: root
    mode: preserve

# Removes the scratch directory registered as 'mktemp'.
- name: Clean temporary directory
  ansible.builtin.file:
    path: "{{ mktemp.path }}"
    state: absent

- name: "Manage RDMA device rename mode ({{ os_rdma_rename_mode }})"
when: os_rdma_rename is truthy(convert_bool=True)
ansible.builtin.lineinfile:
Expand Down
23 changes: 15 additions & 8 deletions packer-rocm/playbooks/rocm.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
---
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# yamllint disable rule:line-length
- name: "Prepare amdgpu-dkms + ROCm"
- name: "Prepare amdgpu + ROCm"
hosts: default
become: true
environment: # may be superfluous for your environment; mapped through Packer HCL with 'ansible_env_vars'
Expand All @@ -13,6 +13,7 @@
rocm_default: '6.2.2'
rocm_requests: "{{ (rocm_releases | split(',')) if rocm_releases is defined else [rocm_default] }}"
rocm_gpg_url: 'https://repo.radeon.com/rocm/rocm.gpg.key'
rocm_repos: true # controls whether this play manages the repositories; when false, they are assumed to be provided elsewhere
rocm_reqs:
common:
- "autoconf"
Expand Down Expand Up @@ -49,8 +50,10 @@
state: present
update_cache: true # passed to the modules for either 'apt' or 'dnf'; supported by either

- name: RedHat
when: ansible_os_family in ['RedHat']
- name: RedHat repository tasks
when:
- ansible_os_family in ['RedHat']
- rocm_repos is truthy(convert_bool=True)
block:
- name: "Manage 'amdgpu' Yum repositories"
ansible.builtin.yum_repository:
Expand Down Expand Up @@ -90,8 +93,10 @@
epel:
RedHat: "https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm"

- name: Debian
when: ansible_os_family in ['Debian']
- name: Debian repository tasks
when:
- ansible_os_family in ['Debian']
- rocm_repos is truthy(convert_bool=True)
block:

- name: "Fetch signing key for Apt"
Expand All @@ -117,17 +122,19 @@
loop: "{{ rocm_requests }}"
loop_control: { loop_var: rocm_release }

- name: "Install any 'extra' packages before 'amdgpu-dkms' or ROCm"
- name: "Install any 'extra' packages before 'amdgpu' or ROCm"
ansible.builtin.package:
name: "{{ rocm_extras | split(',') }}"
state: present
update_cache: true
when: rocm_extras is defined

- name: "Install 'amdgpu-dkms'"
- name: "Install '{{ _amdgpu_pkgs }}'"
ansible.builtin.package:
name: amdgpu-dkms
name: "{{ _amdgpu_pkgs }}"
state: present
vars:
_amdgpu_pkgs: "{{ (rocm_amdgpu_pkgs | split(',')) if rocm_amdgpu_pkgs is defined else ['amdgpu-dkms'] }}"

- name: "Install ROCm releases (when 'rocm_installed' is truthy)"
ansible.builtin.package:
Expand Down
7 changes: 7 additions & 0 deletions packer-rocm/repositories/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# repositories

Any `.repo` or `.list` files placed here will be automatically copied into the _Packer_ builder VM.

Use `-e rocm_extras=pkg1,pkg2,...` to request any packages they -- or the distribution -- provide.

When replacing the `amdgpu` or `rocm` repositories, use `-e rocm_repos=false` to skip creating the default ones for generally-available releases.
8 changes: 6 additions & 2 deletions packer-rocm/ubuntu/ubuntu-rocm.pkr.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ build {
]
}


# remove step/message from 'install-custom-packages' RE: uninstalling existing kernels; DKMS/'cloud-init'
provisioner "shell" {
inline_shebang = "/bin/bash"
Expand All @@ -82,7 +83,8 @@ build {
ansible_env_vars = ["http_proxy=${var.http_proxy}", "https_proxy=${var.https_proxy}", "no_proxy=${var.no_proxy}"]
extra_arguments = [
"-e", "ansible_python_interpreter=/usr/bin/python3", # work around Packer/SSH proxy+client limitations
"--scp-extra-args", "'-O'"
"--scp-extra-args", "'-O'",
"-e", "os_repos_src=${path.root}/../repositories" # *Absolute* path to the 'repositories' directory with '.list' or '.repo' overrides, copied-to/processed-on builder VM
]
}

Expand All @@ -95,7 +97,9 @@ build {
"--scp-extra-args", "'-O'",
"-e", "rocm_releases=${var.rocm_releases}", # pass ROCm requests [release + packages]
"-e", "rocm_extras=${var.rocm_extras}",
"-e", "rocm_installed=${var.rocm_installed}"
"-e", "rocm_installed=${var.rocm_installed}",
"-e", "rocm_repos=${var.rocm_repos}",
"-e", "rocm_amdgpu_pkgs=${var.rocm_amdgpu_pkgs}"
]
}

Expand Down
12 changes: 12 additions & 0 deletions packer-rocm/ubuntu/ubuntu-rocm.variables.pkr.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ variable "rocm_filename" {
description = "The name of the output file/artifact (tarball)"
}

# 'amdgpu' driver/firmware packages, given as a single comma-separated string.
variable "rocm_amdgpu_pkgs" {
  description = "Comma-separated string of 'amdgpu' driver/firmware packages to install. May specify releases."
  type        = string
  default     = "amdgpu-dkms"
}

variable "rocm_kernel" {
type = string
default = "linux-image-generic-hwe-22.04"
Expand All @@ -45,6 +51,12 @@ variable "rocm_extras" {
description = "Comma-separated string of extra packages to install [before 'amdgpu-dkms' and ROCm releases]. For headers, extra-modules, and any other packages. May include release specifiers, '=1.2.3' or globbed."
}

# Toggle for repository management, declared as a string (default "true")
# rather than a bool.
variable "rocm_repos" {
  description = "If the 'rocm' and 'amdgpu' repositories are created by the 'rocm' role. Used to opt out when overrides are in the 'repositories' directory"
  type        = string
  default     = "true"
}

variable "rocm_builder_cpus" {
type = number
default = 4
Expand Down