Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix functional tests failures for STF tests #166

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
Draft
5 changes: 5 additions & 0 deletions roles/test_alerts/tasks/test_create_an_alert.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,12 @@
cmd: |
curl -k {{ prom_auth_string }} https://{{ prom_url }}/api/v1/rules
register: cmd_output
retries: 30
delay: 10
changed_when: true
# when there are no rules, there is still a response and rc == 0
# e.g. {\"status\":\"success\",\"data\":{\"groups\":[]}}
until: '"FVT_TESTING Collectd metrics receive rate is zero" in cmd_output.stdout'

always:
- name: "Delete the PrometheusRule"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
ansible.builtin.shell:
cmd: |
oc patch stf default --type merge -p '{"spec": {"alertmanagerConfigManifest": "apiVersion: v1\nkind: Secret\nmetadata:\n name: 'alertmanager-default'\n namespace: 'service-telemetry'\ntype: Opaque\nstringData:\n alertmanager.yaml: |-\n global:\n resolve_timeout: 10m\n route:\n group_by: ['job']\n group_wait: 30s\n group_interval: 5m\n repeat_interval: 12h\n receiver: 'null'\n receivers:\n - name: 'null'\n"}}'
changed_when: false
changed_when: 'cmd_output == "servicetelemetry.infra.watch/default patched"'
register: cmd_output
failed_when: cmd_output.rc != 0

Expand All @@ -27,11 +27,10 @@

# oc get secret alertmanager-default -o go-template='{{index .data "alertmanager.yaml" | base64decode }}'
# Can't use -o go-template because of the "{{" and "}}", which are mistaken for templating syntax.
# The alertmanager.yaml key needed to be surrounded by [".."] because of the period in the key name.
- name: "Get the updated secret"
ansible.builtin.shell:
cmd: |
oc get secret alertmanager-default -ojson | jq '.data | .["alertmanager.yaml"]'
oc get secret alertmanager-default -ojsonpath="{ .data.alertmanager\.yaml }"
register: cmd_output
changed_when: false

Expand Down Expand Up @@ -67,6 +66,10 @@
register: cmd_output
failed_when: cmd_output.rc != 0

# Placeholder task: it only prints a reminder. No verification that the alert
# exists is performed here yet.
- name: Add a check here to make sure the alert is created
  ansible.builtin.debug:
    msg: >-
      TODO add in a check to make sure the alert is created,
      since it might take a little while to propagate

- name: "RHELOSP-148697 Interrupt metrics flow by preventing the QDR from running"
ansible.builtin.shell:
cmd: |
Expand All @@ -78,18 +81,27 @@
cmd: >-
oc exec -it prometheus-default-0 -c prometheus -- /bin/sh -c 'curl -k -H \
"Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \
https://default-alertmanager-proxy:9095/api/v1/alerts' | grep 'active' | grep 'FVT_TESTING Collectd metrics receive rate is zero'
https://default-alertmanager-proxy:9095/api/v1/alerts' | grep 'active'
register: cmd_output
retries: 30
delay: 10
changed_when: false
failed_when: cmd_output.stdout_lines | length == 0
until: '"FVT_TESTING Collectd metrics receive rate is zero" in cmd_output.stdout'

# Queries the Prometheus alerts API and fails unless the FVT_TESTING alert is
# listed among the alerts whose state is "firing".
# The jq filter iterates over the alerts one by one (.data.alerts[]) so that
# only the names of alerts that are themselves firing are printed; selecting on
# the whole array would print every alert name as soon as any alert is firing.
- name: "RHELOSP-148699 Verify that the alert is firing in Prometheus"
  ansible.builtin.shell:
    cmd: >-
      curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/alerts
      | jq '.data.alerts[] | select(.state == "firing") | .labels.alertname'
  register: cmd_output
  changed_when: false
  # An empty stdout (curl or jq failure, or nothing firing) also fails here.
  failed_when: '"FVT_TESTING Collectd metrics receive rate is zero" not in cmd_output.stdout'

# Dumps the raw response of the Prometheus alerts API into cmd_output.
# The task is read-only (never reports "changed") and fails only when the
# request produced no output at all.
- name: "Check what alerts are firing in prometheus"
  ansible.builtin.command:
    cmd: "curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/alerts"
  changed_when: false
  register: cmd_output
  failed_when: cmd_output.stdout_lines | length == 0

always:
- name: "Delete the PrometheusRule"
Expand Down Expand Up @@ -117,7 +129,6 @@
register: output
until: output.stdout_lines | length == expected_pods.stdout_lines | length
changed_when: false


- name: "RHELOSP-176039 Remove alertmanagerConfigManifest from the ServiceTelemetry object"
ansible.builtin.shell:
Expand Down
53 changes: 53 additions & 0 deletions roles/test_sensubility/tasks/test_health_status.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@
changed_when: false
failed_when: container_nodes.stdout_lines|length != 0

# Diagnostic: lists the metric names known to Prometheus and keeps only the
# ones mentioning sensubility.
# prom_url must be expanded by Ansible ({{ ... }}); the shell form ${prom_url}
# is not set in the task's shell and would expand to an empty host name.
# NOTE(review): grep exits non-zero when nothing matches, which fails this
# task - confirm that is the intended behaviour for a diagnostic step.
- name: Check what metrics are available to prometheus that relate to sensubility
  ansible.builtin.shell:
    cmd: |
      curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/label/__name__/values | jq . | grep sensubility
  changed_when: false

# The issue might be that the value is aggregated over 10m, while we only check for 2m
- name: RHELOSP-176036 Check that health status of container changed to 0
ansible.builtin.shell:
cmd: /usr/bin/curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/query? --data-urlencode 'query=last_over_time(sensubility_container_health_status{process="logrotate_crond",host="ceph-0.redhat.local"}[10m])' | grep -oP '(?<="value":).*' | awk -F, '{ print $2 }' | grep -o '[0-9]\+' | grep 0
Expand All @@ -41,6 +48,52 @@
delay: 10
until: "output.stdout_lines | length == 1"

#- name: NEW RHELOSP-176036 but check over a smaller time
# ansible.builtin.shell:
# cmd: /usr/bin/curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/query? --data-urlencode 'query=last_over_time(sensubility_container_health_status{process="logrotate_crond",host="ceph-0.redhat.local"}[2m])' | grep -oP '(?<="value":).*' | awk -F, '{ print $2 }' | grep -o '[0-9]\+' | grep 0
# register: output
# changed_when: false
# retries: 12
# delay: 10
# until: "output.stdout_lines | length == 1"
#
#- name: NEW RHELOSP-176036 but check over a smaller time
# ansible.builtin.shell:
# cmd: /usr/bin/curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/query? --data-urlencode 'query=last_over_time(sensubility_container_health_status{process="logrotate_crond",host="ceph-0.redhat.local"}[2m])' | grep -oP '(?<="value":).*' | awk -F, '{ print $2 }' | grep -o '[0-9]\+' | grep 0
# register: output
# changed_when: false
# retries: 12
# delay: 10
# until: "output.stdout_lines | length == 1"
#
#- name: NEW RHELOSP-176036 but wait for up to 10 minutes
# ansible.builtin.shell:
# cmd: /usr/bin/curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/query? --data-urlencode 'query=last_over_time(sensubility_container_health_status{process="logrotate_crond",host="ceph-0.redhat.local"}[10m])' | grep -oP '(?<="value":).*' | awk -F, '{ print $2 }' | grep -o '[0-9]\+' | grep 0
# register: output
# changed_when: false
# retries: 60
# delay: 10
# until: "output.stdout_lines | length == 1"
#
#- name: NEW RHELOSP-176036 but don't "grep 0"
# ansible.builtin.shell:
# cmd: /usr/bin/curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/query? --data-urlencode 'query=last_over_time(sensubility_container_health_status{process="logrotate_crond",host="ceph-0.redhat.local"}[10m])' | grep -oP '(?<="value":).*' | awk -F, '{ print $2 }' | grep -o '[0-9]\+'
# register: output
# changed_when: false
#
#- name: NEW RHELOSP-176036 but skip the awk
# ansible.builtin.shell:
# cmd: /usr/bin/curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/query? --data-urlencode 'query=last_over_time(sensubility_container_health_status{process="logrotate_crond",host="ceph-0.redhat.local"}[10m])' | grep -oP '(?<="value":).*'
# register: output
# changed_when: false
#
#- name: NEW RHELOSP-176036 but check the whole output
# ansible.builtin.shell:
# cmd: /usr/bin/curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/query? --data-urlencode 'query=last_over_time(sensubility_container_health_status{process="logrotate_crond",host="ceph-0.redhat.local"}[10m])'
# register: output
# changed_when: false


- name: RHELOSP-176035 Start logrotate_crond container
ansible.builtin.shell:
cmd: |
Expand Down
25 changes: 17 additions & 8 deletions roles/test_snmp_traps/tasks/main.yml
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changes to this file are included in #209 and in #212

Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
ansible.builtin.shell:
cmd: |
oc patch stf/default --type merge -p '{"spec": {"alerting": {"alertmanager": {"receivers": {"snmpTraps": {"enabled": true, "target": "10.10.10.10" }}}}}}'
changed_when: false
changed_when: 'cmd_output == "servicetelemetry.infra.watch/default patched"'
register: cmd_output
failed_when: cmd_output.rc != 0

Expand Down Expand Up @@ -69,17 +69,18 @@
# Polls the snmp-webhook logs (12 attempts, 10 seconds apart) until a
# "Sending SNMP trap" line shows up.
# --tail=-1 requests the full log: when a label selector (-l) is used, the
# logs command otherwise returns only the most recent 10 lines, so the trap
# message could be missed if other lines were logged after it.
- name: "RHELOSP-144481 Check for snmpTraps logs"
  ansible.builtin.shell:
    cmd: |
      oc logs --tail=-1 -l "app=default-snmp-webhook"
  register: cmd_output
  changed_when: false
  failed_when: "cmd_output.stdout_lines | length == 0"
  retries: 12
  delay: 10
  until: "'Sending SNMP trap' in cmd_output.stdout"


rescue:
- name: "Get the snmp traps logs"
ansible.builtin.shell:
cmd: |
oc logs -l "app=default-snmp-webhook"
changed_when: false
- name: "Show the snmp traps logs"
ansible.builtin.debug:
var: cmd_output.stdout

always:
- name: "Delete the PrometheusRule"
Expand All @@ -98,6 +99,14 @@
until: 'not "FVT_TESTING Collectd metrics receive rate is zero" in cmd_output.stdout'
changed_when: false

# Cleanup: sets spec.alerting.alertmanager.receivers.snmpTraps.enabled back to
# false on the stf/default object.
# NOTE(review): the task name mentions alertmanagerConfigManifest, but the
# patch only touches the snmpTraps receiver - consider renaming the task.
- name: "Remove alertmanagerConfigManifest from the ServiceTelemetry object"
  ansible.builtin.shell:
    cmd: |
      oc patch stf/default --type='json' -p '[{"op": "replace", "path": "/spec/alerting/alertmanager/receivers/snmpTraps/enabled", "value": false }]'
  register: cmd_output
  # cmd_output is the whole registered result, so the message has to be
  # compared against its stdout; comparing the result itself never matches.
  changed_when: 'cmd_output.stdout == "servicetelemetry.infra.watch/default patched"'
  failed_when: cmd_output.rc != 0

- name: "Wait up to 2 minutes to make sure all default-interconnect pods are back"
ansible.builtin.command:
cmd: |
Expand Down
2 changes: 1 addition & 1 deletion roles/test_verify_email/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
- name: "RHELOSP-176043 Patch the ServiceTelemetry object for the STF deployment"
ansible.builtin.shell:
cmd: |
oc patch stf default --type merge -p '{"spec": {"alertmanagerConfigManifest": "apiVersion: v1\nkind: Secret\nmetadata:\n name: 'alertmanager-default'\n namespace: 'service-telemetry'\ntype: Opaque\nstringData:\n alertmanager.yaml: |-\n global:\n resolve_timeout: 10m\n smtp_smarthost: localhost:25\n smtp_from: [email protected]\n smtp_auth_username: alertmanager\n smtp_auth_password: password\n route:\n group_by: ['job']\n group_wait: 30s\n group_interval: 5m\n repeat_interval: 12h\n receiver: 'email'\n receivers:\n - name: 'email'\n email_configs:\n - to: [email protected]"}}'
oc patch stf default --type merge -p '{"spec": {"alertmanagerConfigManifest": "apiVersion: v1\nkind: Secret\nmetadata:\n name: 'alertmanager-default'\n namespace: 'service-telemetry'\ntype: Opaque\nstringData:\n alertmanager.yaml: |-\n global:\n resolve_timeout: 10m\n smtp_smarthost: localhost:25\n smtp_from: [email protected]\n smtp_auth_username: alertmanager\n smtp_auth_password: password\n route:\n group_by: ['job']\n group_wait: 30s\n group_interval: 5m\n repeat_interval: 12h\n receiver: 'email'\n receivers:\n - name: \"email\"\n email_configs:\n - to: [email protected]"}}'
changed_when: false

- name: "RHELOSP-176044 Interrupt metrics flow by preventing the QDR from running"
Expand Down