Skip to content

Commit cff7553

Browse files
committed
Add StackHPC Ironic tunings
1 parent 1fd7190 commit cff7553

File tree

8 files changed

+172
-0
lines changed

8 files changed

+172
-0
lines changed

etc/kayobe/kolla/config/global.conf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
[oslo_messaging_rabbit]
2+
rpc_conn_pool_size = 300
3+
rpc_thread_pool_size = 2048
4+
rpc_response_timeout = 3600
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[DEFAULT]
2+
timeout = 0
3+
# We are increasing the RPC response timeouts to 6 minutes due to the neutron
4+
# generic switch driver, which synchronously applies switch configuration for
5+
# each ironic port during node provisioning and tear down.
6+
# The specific API calls that require this long timeout are:
7+
# - Creation and deletion of VLAN networks.
8+
# - Creation or update of ports, adding binding information.
9+
# - Update of ports, removing binding information.
10+
# - Deletion of ports.
11+
rpc_response_timeout = 360
12+
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[DEFAULT]
2+
# Avoid some timeouts of heartbeats and vif deletes
3+
rpc_response_timeout = 360
4+
5+
[neutron]
6+
timeout = 300
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
[DEFAULT]
2+
# Make direct deploy faster, transfer sparse qcow2 images
3+
force_raw_images = False
4+
# Avoid some rpc timeouts
5+
rpc_response_timeout = 360
6+
7+
[agent]
8+
# Use iPXE server for direct deploy source
9+
image_download_source = http
10+
# Reduce load by only collecting logs on failure
11+
deploy_logs_collect = on_failure
12+
13+
[console]
14+
socat_address = {% raw %}{{ api_interface_address }}{% endraw %}
15+
16+
[conductor]
17+
automated_clean=true
18+
# We have busy conductors failing to heartbeat
19+
# Default is 10 secs
20+
heartbeat_interval = 30
21+
# Default is 60 seconds
22+
heartbeat_timeout = 360
23+
sync_local_state_interval = 360
24+
25+
[agent]
26+
# Use iPXE server for direct deploy source
27+
image_download_source = http
28+
# Reduce load by only collecting logs on failure
29+
deploy_logs_collect = on_failure
30+
31+
[console]
32+
socat_address = {% raw %}{{ api_interface_address }}{% endraw %}
33+
34+
[conductor]
35+
automated_clean=true
36+
# We have busy conductors failing to heartbeat
37+
# Default is 10 secs
38+
heartbeat_interval = 30
39+
# Default is 60 seconds
40+
heartbeat_timeout = 360
41+
sync_local_state_interval = 360
42+
43+
# Normally this is 100. We see eventlet threads
44+
# not making much progress, so for safety reduce
45+
# this by half, should leave work on rabbit queue
46+
workers_pool_size = 50
47+
# Normally this is 8, keep it same
48+
period_max_workers = 8
49+
50+
# Stop power state sync
51+
sync_power_state_interval = 0
52+
power_failure_recovery_interval = 0
53+
# Stop checking for orphan allocations for now
54+
check_allocations_interval = 0
55+
56+
# Wait much longer before provision timeout check, to reduce background load
57+
# The default is 60 seconds
58+
check_provision_state_interval = 120
59+
check_rescue_state_interval = 120
60+
61+
[database]
62+
# Usually this is 50, reduce to stop DB connection timeouts
63+
# and instead just make eventlet threads wait a bit longer
64+
max_overflow = 5
65+
# Usually this is 5
66+
max_pool_size = 5
67+
# Usually this is 3600, reduce this to make it less likely
68+
# to get a connection out the pool that has already timed out
69+
connection_recycle_time = 1800
70+
# By default this is 30 seconds, but as we reduce
71+
# the pool overflow, some people will need to wait longer
72+
pool_timeout = 60
73+
74+
[deploy]
75+
shred_random_overwrite_iterations = 0
76+
shred_final_overwrite_with_zeros = false
77+
continue_if_disk_secure_erase_fails = true
78+
79+
[pxe]
80+
# Increase cache size to 120GB and TTL to 70 days (image_cache_ttl is in minutes)
81+
image_cache_size = 122880
82+
image_cache_ttl = 100800
83+
84+
[neutron]
85+
# Increase the neutron client timeout to allow for the slow management
86+
# switches.
87+
timeout = 300
88+
request_timeout = 300
89+
90+
[glance]
91+
# Retry image download at least once if failure
92+
num_retries = 1

etc/kayobe/kolla/config/neutron.conf

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
[DEFAULT]
2+
# We are increasing the RPC response timeouts to 6 minutes due to the neutron
3+
# generic switch driver, which synchronously applies switch configuration for
4+
# each ironic port during node provisioning and tear down.
5+
# The specific API calls that require this long timeout are:
6+
# - Creation and deletion of VLAN networks.
7+
# - Creation or update of ports, adding binding information.
8+
# - Update of ports, removing binding information.
9+
# - Deletion of ports.
10+
rpc_response_timeout = 360

etc/kayobe/kolla/config/nova.conf

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,13 @@
1+
[DEFAULT]
2+
# We are increasing the RPC response timeouts to 6 minutes due to the neutron
3+
# generic switch driver, which synchronously applies switch configuration for
4+
# each ironic port during node provisioning and tear down.
5+
# The specific API calls that require this long timeout are:
6+
# - Creation and deletion of VLAN networks.
7+
# - Creation or update of ports, adding binding information.
8+
# - Update of ports, removing binding information.
9+
# - Deletion of ports.
10+
rpc_response_timeout = 360
11+
112
[libvirt]
213
hw_machine_type = x86_64=q35

etc/kayobe/kolla/config/nova/nova-compute-ironic.conf

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,25 @@
22
[DEFAULT]
33
host = {{ kolla_nova_compute_ironic_static_host_name | mandatory('You must set a static host name to help with service failover. See the operations documentation, Ironic section.') }}
44
{% endif %}
5+
# Raise the limit on concurrent builds (default is 10) for the nova ironic compute
6+
# service.
7+
max_concurrent_builds = 35
8+
9+
force_config_drive = True
10+
11+
# Disable power state sync interval (default is 600 seconds)
12+
sync_power_state_interval = -1
13+
14+
# usually 1000, turn this down for ironic
15+
sync_power_state_pool_size = 50
16+
17+
# usually 300, back off a bit for ironic
18+
resource_provider_association_refresh = 600
19+
20+
[ironic]
21+
# Ramp up maximum retries to allow time for baremetal node reboot and switch configs
22+
api_max_retries = 720
23+
24+
[compute]
25+
# Don't disable the compute service due to failed builds.
26+
consecutive_build_service_disable_threshold = 0

etc/kayobe/kolla/globals.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,18 @@ prometheus_instance_label: "{% raw %}{{ ansible_facts.hostname }}{% endraw %}"
5353
# in Yoga. This is required to include a valid value for the flavor_id label on
5454
# openstack_nova_server_status metrics.
5555
prometheus_openstack_exporter_compute_api_version: "2.1"
56+
57+
# NOTE: We are increasing the HAProxy timeouts to 5 minutes due to the neutron
58+
# generic switch driver, which synchronously applies switch configuration for
59+
# each ironic port during node provisioning and tear down. Due to the lack of
60+
# batching of configuration sets, and the long time required for configuration
61+
# to be committed by the Juniper virtual chassis fabric (~15 seconds),
62+
# concurrent deployment of multiple nodes can cause configuration
63+
# transactions to back up. The specific API calls that require this long
64+
# timeout are:
65+
# - Creation and deletion of VLAN networks.
66+
# - Creation or update of ports, adding binding information.
67+
# - Update of ports, removing binding information.
68+
# - Deletion of ports.
69+
haproxy_client_timeout: 5m30
70+
haproxy_server_timeout: 5m30

0 commit comments

Comments
 (0)