Add StackHPC Ironic tunings

jovial · jovial · commit cff7553762ae · 2024-03-22T10:44:22.000Z
diff --git a/etc/kayobe/kolla/config/global.conf b/etc/kayobe/kolla/config/global.conf
@@ -0,0 +1,4 @@
+[oslo_messaging_rabbit]
+rpc_conn_pool_size = 300
+rpc_thread_pool_size = 2048
+rpc_response_timeout = 3600
diff --git a/etc/kayobe/kolla/config/ironic-inspector.conf b/etc/kayobe/kolla/config/ironic-inspector.conf
@@ -0,0 +1,12 @@
+[DEFAULT]
+timeout = 0
+# We are increasing the RPC response timeouts to 6 minutes due to the neutron
+# generic switch driver, which synchronously applies switch configuration for
+# each ironic port during node provisioning and tear down.
+# The specific API calls that require this long timeout are:
+# - Creation and deletion of VLAN networks.
+# - Creation or update of ports, adding binding information.
+# - Update of ports, removing binding information.
+# - Deletion of ports.
+rpc_response_timeout = 360
+
diff --git a/etc/kayobe/kolla/config/ironic/ironic-api.conf b/etc/kayobe/kolla/config/ironic/ironic-api.conf
@@ -0,0 +1,6 @@
+[DEFAULT]
+# Avoid some timeouts of heartbeats and vif deletes
+rpc_response_timeout = 360
+
+[neutron]
+timeout = 300
diff --git a/etc/kayobe/kolla/config/ironic/ironic-conductor.conf b/etc/kayobe/kolla/config/ironic/ironic-conductor.conf
@@ -0,0 +1,78 @@
+[DEFAULT]
+# Make direct deploy faster, transfer sparse qcow2 images
+force_raw_images = False
+# Avoid some rpc timeouts
+rpc_response_timeout = 360
+
+[agent]
+# Use iPXE server for direct deploy source
+image_download_source = http
+# Reduce load by only collecting logs on failure
+deploy_logs_collect = on_failure
+
+[console]
+socat_address = {% raw %}{{ api_interface_address }}{% endraw %}
+
+[conductor]
+automated_clean = true
+# We have busy conductors failing to heartbeat
+# Default is 10 secs
+heartbeat_interval = 30
+# Default is 60 seconds
+heartbeat_timeout = 360
+sync_local_state_interval = 360
+
+# Normally this is 100. We see eventlet threads
+# not making much progress, so for safety reduce
+# this by half, which should leave work on the rabbit queue
+workers_pool_size = 50
+# Normally this is 8, keep it same
+period_max_workers = 8
+
+# Stop power state sync
+sync_power_state_interval = 0
+power_failure_recovery_interval = 0
+# Stop checking for orphan allocations for now
+check_allocations_interval = 0
+
+# Wait much longer before provision timeout check, to reduce background load
+# The default is 60 seconds
+check_provision_state_interval = 120
+check_rescue_state_interval = 120
+
+[database]
+# Usually this is 50, reduce to stop DB connection timeouts
+# and instead just make eventlet threads wait a bit longer
+max_overflow = 5
+# Usually this is 5
+max_pool_size = 5
+# Usually this is 3600, reduce this to make it less likely
+# to get a connection out the pool that has already timed out
+connection_recycle_time = 1800
+# By default this is 30 seconds, but as we reduce
+# the pool overflow, some people will need to wait longer
+pool_timeout = 60
+
+[deploy]
+# No shred overwrite passes, and carry on if disk secure erase fails
+shred_random_overwrite_iterations = 0
+shred_final_overwrite_with_zeros = false
+continue_if_disk_secure_erase_fails = true
+
+[pxe]
+# Increase cache size to 120GB (122880 MiB)
+image_cache_size = 122880
+# NOTE(review): this was described as "TTL to 28 hours", which matches
+# 100800 seconds; Ironic documents image_cache_ttl in minutes, where 100800
+# is 70 days. Confirm which was intended.
+image_cache_ttl = 100800
+
+[neutron]
+# Increase the neutron client timeout to allow for the slow management
+# switches.
+timeout = 300
+request_timeout = 300
+
+[glance]
+# Retry image download at least once if failure
+num_retries = 1
diff --git a/etc/kayobe/kolla/config/neutron.conf b/etc/kayobe/kolla/config/neutron.conf
@@ -0,0 +1,10 @@
+[DEFAULT]
+# We are increasing the RPC response timeouts to 6 minutes due to the neutron
+# generic switch driver, which synchronously applies switch configuration for
+# each ironic port during node provisioning and tear down.
+# The specific API calls that require this long timeout are:
+# - Creation and deletion of VLAN networks.
+# - Creation or update of ports, adding binding information.
+# - Update of ports, removing binding information.
+# - Deletion of ports.
+rpc_response_timeout = 360
diff --git a/etc/kayobe/kolla/config/nova.conf b/etc/kayobe/kolla/config/nova.conf
@@ -1,2 +1,13 @@
+[DEFAULT]
+# We are increasing the RPC response timeouts to 6 minutes due to the neutron
+# generic switch driver, which synchronously applies switch configuration for
+# each ironic port during node provisioning and tear down.
+# The specific API calls that require this long timeout are:
+# - Creation and deletion of VLAN networks.
+# - Creation or update of ports, adding binding information.
+# - Update of ports, removing binding information.
+# - Deletion of ports.
+rpc_response_timeout = 360
+
 [libvirt]
 hw_machine_type = x86_64=q35
diff --git a/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf b/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf
@@ -2,3 +2,25 @@
 [DEFAULT]
 host = {{ kolla_nova_compute_ironic_static_host_name | mandatory('You must set a static host name to help with service failover. See the operations documentation, Ironic section.') }}
 {% endif %}
+# Raise the cap on concurrent builds for the nova ironic compute service
+# (the nova default is 10; 0 would mean unlimited).
+max_concurrent_builds = 35
+
+force_config_drive = True
+
+# Disable power state sync interval (default is 600 seconds)
+sync_power_state_interval = -1
+
+# usually 1000, turn this down for ironic
+sync_power_state_pool_size = 50
+
+[ironic]
+# Ramp up maximum retries to allow time for baremetal node reboot and switch configs
+api_max_retries = 720
+
+[compute]
+# Don't disable the compute service due to failed builds.
+consecutive_build_service_disable_threshold = 0
+
+# usually 300, back off a bit for ironic
+resource_provider_association_refresh = 600
diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml
@@ -53,3 +53,18 @@ prometheus_instance_label: "{% raw %}{{ ansible_facts.hostname }}{% endraw %}"
 # in Yoga. This is required to include a valid value for the flavor_id label on
 # openstack_nova_server_status metrics.
 prometheus_openstack_exporter_compute_api_version: "2.1"
+
+# NOTE: We are increasing the HAProxy timeouts to 5.5 minutes (330s; HAProxy
+# times take a single unit suffix) due to the neutron generic switch driver,
+# which synchronously applies switch configuration for each ironic port during
+# node provisioning and tear down. Due to the lack of batching of
+# configuration sets, and the long time required for configuration to be
+# committed by the Juniper virtual chassis fabric (~15 seconds), concurrent
+# deployment of multiple nodes can cause configuration transactions to back
+# up. The specific API calls that require this long timeout are:
+# - Creation and deletion of VLAN networks.
+# - Creation or update of ports, adding binding information.
+# - Update of ports, removing binding information.
+# - Deletion of ports.
+haproxy_client_timeout: 330s
+haproxy_server_timeout: 330s