stackhpc · jovial · Mar 22, 2024 · Feb 4, 2025 · Feb 27, 2025 · dougszumski
@@ -10,6 +10,7 @@ the various features provided.
 
    release-train
    host-images
+   ironic
    lvm
    swap
    cephadm

@@ -0,0 +1,31 @@
+======
+Ironic
+======
+
+Cleaning
+========
+
+Storage
+-------
+
+Hardware assisted secure erase, i.e. the ``erase_devices`` clean step, is
+enabled by default. This is normally dependent on the `Hardware Manager
+<https://docs.openstack.org/ironic-python-agent/latest/contributor/hardware_managers.html>`__
+in use. For example, when using the GenericHardwareManager the priority would
+be 10, whereas if using the `ProliantHardwareManager
+<https://docs.openstack.org/ironic/latest/admin/drivers/ilo.html#disk-erase-support>`__
+it would be 0. The idea is to prevent the catastrophic case where data could
+be leaked to another tenant, forcing you to explicitly relax this setting if
+this is a risk you want to take. This can be customised by editing the
+following variables:
+
+.. code-block::
+    :caption: $KAYOBE_CONFIG_PATH/kolla/config/ironic/ironic-conductor.conf
+
+    [deploy]
+    erase_devices_priority=10
+    erase_devices_metadata_priority=0
+
+See `Ironic documentation
+<https://docs.openstack.org/ironic/latest/admin/cleaning.html>`__ for more
+details.
@@ -0,0 +1,13 @@
+[DEFAULT]
+timeout = 0
+{% if "genericswitch" in kolla_neutron_ml2_mechanism_drivers %}
+# We are increasing the RPC response timeout to 360 seconds (6 minutes) due to
+# the neutron generic switch driver, which synchronously applies switch
+# configuration for each ironic port during node provisioning and tear down.
+# The specific API calls that require this long timeout are:
+# - Creation and deletion of VLAN networks.
+# - Creation or update of ports, adding binding information.
+# - Update of ports, removing binding information.
+# - Deletion of ports.
+rpc_response_timeout = 360
+{% endif %}
@@ -0,0 +1,6 @@
+[DEFAULT]
+# Avoid some timeouts of heartbeats and vif deletes
+rpc_response_timeout = 360
+
+[neutron]
+timeout = 300
@@ -0,0 +1,60 @@
+[DEFAULT]
+# Make direct deploy faster, transfer sparse qcow2 images
+force_raw_images = False
+# Avoid some rpc timeouts
+rpc_response_timeout = 360
+
+[conductor]
+automated_clean=true
+# We have busy conductors failing to heartbeat
+# Default is 10 secs
+heartbeat_interval = 30
+# Default is 60 seconds
+heartbeat_timeout = 360
+sync_local_state_interval = 360
+
+# Normally this is 100. We see eventlet threads
+# not making much progress, so for safety reduce
+# this by half; this should leave work on the rabbit queue
+workers_pool_size = 50
+# Normally this is 8, keep it same
+period_max_workers = 8
+
+# Increase power sync interval to reduce load
+sync_power_state_interval = 120
+power_failure_recovery_interval = 120
+# Check for orphan allocations less often to reduce load (0 would disable the check)
+check_allocations_interval = 120
+
+# Wait much longer before provision timeout check, to reduce background load
+# The default is 60 seconds
+check_provision_state_interval = 120
+check_rescue_state_interval = 120
+
+[database]
+# Usually this is 50, reduce to stop DB connection timeouts
+# and instead just make eventlet threads wait a bit longer
+max_overflow = 5
+# By default this is 30 seconds, but as we reduce
+# the pool overflow, some people will need to wait longer
+pool_timeout = 60
+
+[deploy]
+# Force Hardware assisted secure erase by default.
+erase_devices_priority=10
+erase_devices_metadata_priority=0
+
+[pxe]
+# Increase cache size to 120GiB (value is in MiB) and TTL to 70 days (value is in minutes)
+image_cache_size = 122880
+image_cache_ttl = 100800
+
+[neutron]
+# Increase the neutron client timeout to allow for the slow management
+# switches.
+timeout = 300
+request_timeout = 300
+
+[glance]
+# Retry image download once on failure
+num_retries = 1
@@ -0,0 +1,12 @@
+[DEFAULT]
+{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %}
+# We are increasing the RPC response timeout to 360 seconds (6 minutes) due to
+# the neutron generic switch driver, which synchronously applies switch
+# configuration for each ironic port during node provisioning and tear down.
+# The specific API calls that require this long timeout are:
+# - Creation and deletion of VLAN networks.
+# - Creation or update of ports, adding binding information.
+# - Update of ports, removing binding information.
+# - Deletion of ports.
+rpc_response_timeout = 360
+{% endif %}
@@ -1,2 +1,15 @@
+[DEFAULT]
+{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %}
+# We are increasing the RPC response timeout to 360 seconds (6 minutes) due to
+# the neutron generic switch driver, which synchronously applies switch
+# configuration for each ironic port during node provisioning and tear down.
+# The specific API calls that require this long timeout are:
+# - Creation and deletion of VLAN networks.
+# - Creation or update of ports, adding binding information.
+# - Update of ports, removing binding information.
+# - Deletion of ports.
+rpc_response_timeout = 360
+{% endif %}
+
 [libvirt]
 hw_machine_type = x86_64=q35
@@ -1,4 +1,17 @@
-{% if kolla_enable_ironic|bool and kolla_nova_compute_ironic_host is not none %}
 [DEFAULT]
+{% if kolla_enable_ironic|bool and kolla_nova_compute_ironic_host is not none %}
 host = {{ kolla_nova_compute_ironic_static_host_name | mandatory('You must set a static host name to help with service failover. See the operations documentation, Ironic section.') }}
 {% endif %}
+# Raise the limit on concurrent builds for the nova ironic compute service
+# (0 would mean unlimited).
+max_concurrent_builds = 35
+
+force_config_drive = True
+
+[ironic]
+# Ramp up maximum retries to allow time for baremetal node reboot and switch configs
+api_max_retries = 720
+
+[compute]
+# Don't disable the compute service due to failed builds.
+consecutive_build_service_disable_threshold = 0
@@ -60,6 +60,21 @@ prometheus_instance_label: "{% raw %}{{ ansible_facts.hostname }}{% endraw %}"
 # openstack_nova_server_status metrics.
 prometheus_openstack_exporter_compute_api_version: "2.1"
 
+{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %}
+# NOTE: We are increasing the HAProxy timeouts to 5 minutes 30 seconds due to
+# the neutron generic switch driver, which synchronously applies switch
+# configuration for each ironic port during node provisioning and tear down.
+# The specific API calls that require this long timeout are:
+# - Creation and deletion of VLAN networks.
+# - Creation or update of ports, adding binding information.
+# - Update of ports, removing binding information.
+# - Deletion of ports.
+haproxy_client_timeout: 5m30
+haproxy_server_timeout: 5m30
+# If using Neutron backend TLS:
+neutron_tls_proxy_client_timeout: 5m30
+neutron_tls_proxy_server_timeout: 5m30
+{% endif %}
 # The Pulp URL must be templated by Kayobe rather than Kolla Ansible.
 # The rest of the Prometheus Blackbox exporter configuration can be found in
 # the Kolla inventory.
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,6 +10,7 @@ the various features provided. @@
        release-train
        host-images
+       ironic
        lvm
        swap
        cephadm
@@ Expand Down @@