DA Scale change to public repo #268

Open · wants to merge 1 commit into base da_scale_jul_1
13 changes: 10 additions & 3 deletions datasource.tf
@@ -10,11 +10,17 @@ data "ibm_is_zone" "zone" {
}
*/

#Fetching Existing VPC CIDR for Security rules:
data "ibm_is_vpc" "existing_vpc" {
count = var.vpc_name != null ? 1 : 0
name = var.vpc_name
}

data "ibm_is_vpc_address_prefixes" "existing_vpc_cidr" {
count = var.vpc_name != null ? 1 : 0
vpc = data.ibm_is_vpc.existing_vpc[0].id
}

/*
data "ibm_is_subnet" "subnet" {
count = length(local.subnets)
@@ -28,8 +34,8 @@ data "ibm_is_subnet" "subnet" {
# }

data "ibm_is_subnet" "existing_cluster_subnets" {
- count = var.vpc_name != null && var.cluster_subnet_ids != null ? 1 : 0
- identifier = var.cluster_subnet_ids
+ count = var.vpc_name != null && var.cluster_subnet_id != null ? 1 : 0
+ identifier = var.cluster_subnet_id
}

data "ibm_is_subnet" "existing_storage_subnets" {
@@ -58,6 +64,7 @@ data "ibm_is_ssh_key" "ssh_keys" {
}

data "ibm_is_subnet" "compute_subnet_crn" {
+ count = var.vpc_name != null && var.cluster_subnet_id != null ? 1 : 0
identifier = local.compute_subnet_id
}

@@ -83,7 +90,7 @@ data "ibm_is_instance_profile" "protocol_profile" {
}

data "ibm_is_subnet_reserved_ips" "protocol_subnet_reserved_ips" {
- count = local.scale_ces_enabled == true ? 1 : 0
+ count = var.enable_deployer == false && local.scale_ces_enabled == true ? 1 : 0
subnet = local.protocol_subnet_id
}

894 changes: 546 additions & 348 deletions ibm_catalog.json

Large diffs are not rendered by default.

120 changes: 81 additions & 39 deletions locals.tf

Large diffs are not rendered by default.

292 changes: 194 additions & 98 deletions main.tf

Large diffs are not rendered by default.

@@ -71,8 +71,14 @@
line: "{{ item.line }}"
create: yes
loop:
- - { regexp: "==ACCESSKEY==", line: "customerid: {{ cloud_monitoring_access_key }}" }
- - { regexp: "==COLLECTOR==", line: "collector: {{ cloud_monitoring_ingestion_url }}" }
+ - {
+ regexp: "==ACCESSKEY==",
+ line: "customerid: {{ cloud_monitoring_access_key }}",
+ }
+ - {
+ regexp: "==COLLECTOR==",
+ line: "collector: {{ cloud_monitoring_ingestion_url }}",
+ }
- { regexp: "^tags:", line: "tags: type:management,lsf:true" }
when: monitoring_enable_for_management | bool

@@ -113,14 +119,11 @@
executable: /bin/bash
when: monitoring_enable_for_management | bool

- - name: Install LSF Prometheus Exporter if not already installed
- ansible.builtin.shell: |
- echo "Exporter not found. Cloning and installing...";
- rm -rf /tmp/lsf_prometheus_exporter
- git clone --branch wheel_pkg https://[email protected]/platformcomputing/lsf_prometheus_exporter /tmp/lsf_prometheus_exporter
- {{ pip_executable }} install /tmp/lsf_prometheus_exporter/lsf_prometheus_exporter-1.0.0-py3-none-any.whl
- args:
- executable: /bin/bash
+ - name: Install LSF Prometheus Exporter using pip
+ ansible.builtin.pip:
+ name: lsf_prometheus_exporter
+ executable: /usr/local/bin/pip3.11
+ extra_args: --no-cache-dir --force-reinstall
when:
- monitoring_enable_for_management | bool
- not exporter_installed.stat.exists
@@ -163,7 +166,7 @@
exec >> /var/log/lsf_prometheus_exporter.log 2>&1
source /opt/ibm/lsfsuite/lsf/conf/profile.lsf
exec /usr/bin/python3 -m lsf_prometheus_exporter
- mode: '0755'
+ mode: "0755"
owner: lsfadmin
group: lsfadmin
when:
@@ -172,7 +175,7 @@
- name: Create systemd service for Prometheus Agent
ansible.builtin.copy:
dest: /etc/systemd/system/prometheus.service
- mode: '0644'
+ mode: "0644"
content: |
[Unit]
Description=Prometheus Agent
@@ -198,6 +201,24 @@
- monitoring_enable_for_management | bool
- not exporter_installed.stat.exists

- name: Enable LSF scheduler metrics for Prometheus
ansible.builtin.lineinfile:
path: "{{ LSF_CONF }}/lsbatch/{{ prefix }}/configdir/lsb.params"
insertbefore: "^End Parameters"
line: "SCHED_METRIC_ENABLE=Y"
state: present
backup: yes
when:
- monitoring_enable_for_management | bool

- name: Restart lsfd service to apply scheduler metric changes
ansible.builtin.systemd:
name: lsfd
state: restarted
enabled: yes
when:
- monitoring_enable_for_management | bool

- name: Reload systemd and start Prometheus Agent
ansible.builtin.systemd:
daemon_reload: yes
@@ -226,7 +247,7 @@

[Install]
WantedBy=multi-user.target
- mode: '0644'
+ mode: "0644"
when: monitoring_enable_for_management | bool

- name: Reload systemd and start Prometheus Agent
@@ -240,7 +261,7 @@
- name: Ensure start script has correct permissions
ansible.builtin.file:
path: /opt/ibm/lsfsuite/lsf/start_lsf_prometheus_exporter.sh
- mode: '0755'
+ mode: "0755"
owner: lsfadmin
group: lsfadmin
when: monitoring_enable_for_management | bool
@@ -262,10 +283,11 @@
RestartSec=10
User=lsfadmin
Group=lsfadmin
Restart=always

[Install]
WantedBy=multi-user.target
- mode: '0644'
+ mode: "0644"
when: monitoring_enable_for_management | bool

- name: Reload systemd and start LSF Prometheus Exporter
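
Note (not part of the diff): a minimal verification sketch for the two changes above, the pip-based exporter install and the new SCHED_METRIC_ENABLE setting. The pip executable path and the lsb.params path are reused from the tasks in this hunk; the task names and the checks themselves are illustrative only.

- name: Verify lsf_prometheus_exporter is installed
  ansible.builtin.command: /usr/local/bin/pip3.11 show lsf_prometheus_exporter
  changed_when: false
  when: monitoring_enable_for_management | bool

- name: Verify scheduler metrics are enabled in lsb.params
  ansible.builtin.command: grep -q '^SCHED_METRIC_ENABLE=Y' "{{ LSF_CONF }}/lsbatch/{{ prefix }}/configdir/lsb.params"
  changed_when: false
  when: monitoring_enable_for_management | bool
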
@@ -1,3 +1,4 @@
sysdig_config_file: "/opt/draios/etc/dragent.yaml"
prometheus_config_file: "/opt/prometheus/prometheus.yml"
LSF_CONF: "/opt/ibm/lsfsuite/lsf/conf"
+ PROMETHEUS_VERSION: "2.51.1"
@@ -0,0 +1,80 @@
---

- name: Check if LSF logs directory exists
stat:
path: "{{ LSF_LOGS }}"
register: logs_dir_stat

- name: Ensure LSF logs directory exists (recurse only on first creation)
file:
path: "{{ LSF_LOGS }}"
state: directory
owner: lsfadmin
group: lsfadmin
mode: '0755'
recurse: "{{ not logs_dir_stat.stat.exists }}"

- name: Ensure LSF conf and work are symlinks via shell
shell: |
[ -L "{{ LSF_TOP }}/{{ item }}" ] && echo "Symlink exists, skipping." || { \
[ -d "{{ LSF_TOP }}/{{ item }}" ] && rm -rf "{{ LSF_TOP }}/{{ item }}"; \
ln -s /mnt/lsf/lsf/{{ item }} "{{ LSF_TOP }}/{{ item }}"; }
loop:
- conf
- work

- name: Ensure correct ownership and permissions of /opt/ibm/lsfsuite
file:
path: "{{ LSF_SUITE }}"
owner: lsfadmin
group: lsfadmin
mode: '0777'
recurse: yes

- name: Set login_node_host to first host in login_node group
set_fact:
login_node_host: "{{ groups['login_node'][0] }}"

- name: Get IPv4 address of the current host
shell: "getent ahostsv4 {{ inventory_hostname }} | awk '{ print $1; exit }'"
register: ip_result
changed_when: false

- name: Ensure login node entry exists in LSF hosts file
lineinfile:
path: /mnt/lsf/lsf/conf/hosts
line: "{{ ip_result.stdout }} {{ login_node_host }}.{{ dns_domain_names }}"
state: present
insertafter: EOF
create: yes

- name: Insert hostname line after "#prune" only once
lineinfile:
path: "{{ LSF_CLUSTER_FILE }}"
insertafter: "^#prune"
line: "{{ login_node_host }}.{{ dns_domain_names }} Intel_E5 X86_64 0 ()"
state: present

- name: Ensure LSF profile is sourced in root's .bashrc
lineinfile:
path: "/root/.bashrc"
line: "source {{ LSF_CONF }}/profile.lsf"
state: present

- name: Ensure LSF profile is sourced in lsfadmin's .bashrc
lineinfile:
path: "{{ LSFADMIN_DIR }}/.bashrc"
line: "source {{ LSF_CONF }}/profile.lsf"
state: present

- name: Source current user's .bashrc (only if updated)
shell: |
grep -q "source {{ LSF_CONF }}/profile.lsf" /root/.bashrc && source /root/.bashrc || true
args:
executable: /bin/bash

- name: Source lsfadmin's .bashrc (only if updated)
shell: |
grep -q "source {{ LSF_CONF }}/profile.lsf" "{{ LSFADMIN_DIR }}/.bashrc" && source "{{ LSFADMIN_DIR }}/.bashrc" || true
args:
executable: /bin/bash
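
Note (not part of the diff): a small follow-up check that could be appended to this new task file; it only confirms the conf and work symlinks created by the shell task above. LSF_TOP comes from this role's vars (see vars/main.yml below); the task name is illustrative.

- name: Verify conf and work are symlinks into the shared filesystem
  ansible.builtin.stat:
    path: "{{ LSF_TOP }}/{{ item }}"
  register: link_stat
  failed_when: not (link_stat.stat.islnk | default(false))
  loop:
    - conf
    - work
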
4 changes: 4 additions & 0 deletions modules/ansible-roles/roles/lsf_login_config/tasks/main.yml
@@ -0,0 +1,4 @@
---

# Configure Login node
- import_tasks: login_node_configuration.yml
8 changes: 8 additions & 0 deletions modules/ansible-roles/roles/lsf_login_config/vars/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
LSF_SUITE: "/opt/ibm/lsfsuite"
LSF_TOP: "{{ LSF_SUITE }}/lsf"
LSF_CONF: "{{ LSF_TOP }}/conf"
LSF_WORK: "{{ LSF_TOP }}/work"
LSF_LOGS: "/opt/ibm/lsflogs"
LSF_HOSTS_FILE: "{{ LSF_CONF }}/hosts"
LSF_CLUSTER_FILE: "{{ LSF_CONF }}/lsf.cluster.{{ prefix }}"
LSFADMIN_DIR: "/home/lsfadmin"
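
Note (not part of the diff): a hypothetical play snippet showing how this role might be wired in. The login_node group name is taken from the tasks above; the play itself is not part of this PR.

- hosts: login_node
  become: true
  roles:
    - lsf_login_config
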
@@ -0,0 +1,73 @@
---
# AppCenter HTTPS Configuration

- name: PAC | Check if HTTPS is already enabled
ansible.builtin.command: pmcadmin https enable
register: https_check
changed_when: "'was already enabled' not in https_check.stdout"
failed_when: false
run_once: true

- name: PAC | Debug HTTPS status
ansible.builtin.debug:
msg: "HTTPS is already enabled"
when: "'was already enabled' in https_check.stdout"
run_once: true

- name: PAC | Configure HTTPS for AppCenter
block:

- name: PAC | Set GUI password for lsfadmin
ansible.builtin.command: passwd --stdin lsfadmin
args:
stdin: "{{ app_center_gui_password }}"

- name: PAC | Enable HTTPS access for AppCenter
ansible.builtin.command: >
pmcadmin https enable
--password {{ app_center_gui_password }}
--validhosts localhost

- name: PAC | Stop pmcadmin service
ansible.builtin.command: pmcadmin stop

- name: PAC | Pause before restarting pmcadmin
ansible.builtin.pause:
seconds: 5

- name: PAC | Start pmcadmin service
ansible.builtin.command: pmcadmin start

- name: PAC | Update JS_PAC_SERVER_URL in js.conf
ansible.builtin.lineinfile:
path: "{{ JS_PAC_SERVER_URL }}"
regexp: '^JS_PAC_SERVER_URL='
line: "JS_PAC_SERVER_URL=https://{{ lsf_masters[0] }}:8443"
backrefs: true

- name: PAC | Stop ACD (Application Center Daemon) service
ansible.builtin.service:
name: acd
state: stopped

- name: PAC | Pause before restarting ACD
ansible.builtin.pause:
seconds: 5

- name: PAC | Start ACD (Application Center Daemon) service
ansible.builtin.service:
name: acd
state: started

rescue:
- name: PAC | Log error if AppCenter HTTPS configuration fails
ansible.builtin.debug:
msg: "AppCenter HTTPS configuration block failed. Check previous task results."

always:
- name: PAC | Always log final status of AppCenter HTTPS configuration
ansible.builtin.debug:
msg: "AppCenter HTTPS configuration block completed (success or failure)."

when: "'was already enabled' not in https_check.stdout"
run_once: true
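
Note (not part of the diff): a hedged smoke test one could run after this block. It assumes the 8443 port written into js.conf above and the lsf_masters variable used by this playbook; the accepted status codes are a guess at what the AppCenter login page may return.

- name: PAC | Smoke-test the AppCenter HTTPS endpoint
  ansible.builtin.uri:
    url: "https://{{ lsf_masters[0] }}:8443"
    validate_certs: false
    status_code: [200, 302, 401]
  run_once: true
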
@@ -33,13 +33,3 @@
dest: "{{ LSF_RC_IC_CONF }}/user_data.sh"
mode: '0644'
run_once: true

- - name: Management Config Templates | Restart lsfd service
- service:
- name: lsfd
- state: restarted
-
- - name: Management Config Templates | Restart NetworkManager
- service:
- name: NetworkManager
- state: restarted
@@ -103,3 +103,21 @@
path: "{{ LSF_CLUSTER_FILE }}"
regexp: '^lsfservers'
state: absent
run_once: true

# Temporary: Remove after new image build includes cleanup
- name: Temporary Cleanup | Delete all 'sagar-fp-15-new1' folders
ansible.builtin.shell: |
find "{{ LSF_EXT_CONF }}" -type d -name "sagar-fp-15-new1" -exec rm -rf {} +
args:
warn: false
ignore_errors: true
when: inventory_hostname in groups['management_nodes']

# Temporary: Remove after new image build includes cleanup
- name: Temporary Cleanup | Replace 'sagar-fp-15-new1' with 'lsfservers'
ansible.builtin.shell: |
grep -rl 'sagar-fp-15-new1' "{{ LSF_EXT_CONF }}" | xargs sed -i 's/sagar-fp-15-new1/lsfservers/g' || true
args:
warn: false
when: inventory_hostname in groups['management_nodes']
@@ -31,7 +31,7 @@

- name: Management Config | Aggregate all IPs from all hosts
set_fact:
- all_ips: "{{ groups['all'] | map('extract', hostvars, 'host_ip') | list }}"
+ all_ips: "{{ groups['mgmt_compute_nodes'] | map('extract', hostvars, 'host_ip') | list }}"
run_once: true

- name: Management Config | Display all resolved IP addresses