Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: nod-ai/ADA
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: 478801cbb481863bbb38c2bd9e4c492922e97513
Choose a base ref
...
head repository: nod-ai/ADA
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: 034e2ad2d839ff1645f4a035c6ca18a8dabf030d
Choose a head ref
  • 1 commit
  • 6 files changed
  • 1 contributor

Commits on Jan 23, 2025

  1. Update the code to use scontrol in place of slurm APIs to drain the

    node to make it slurm version independent
    mithun-pensando committed Jan 23, 2025

    Unverified

    This commit is not signed, but one or more authors requires that any commit attributed to them is signed.
    Copy the full SHA
    034e2ad View commit details
Showing with 331 additions and 57 deletions.
  1. +18 −13 redfish-exporter/.env
  2. +18 −12 redfish-exporter/config.go
  3. +22 −2 redfish-exporter/listener.go
  4. +0 −11 redfish-exporter/main.go
  5. +45 −16 redfish-exporter/slurm/queue.go
  6. +228 −3 redfish-exporter/slurm/slurm.go
31 changes: 18 additions & 13 deletions redfish-exporter/.env
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
UPDATED="2024-09-24"
UPDATED="2025-01-22"
DESCRIPTION="Redfish Event Listener/Exporter"
LISTENER_IP="0.0.0.0"
LISTENER_PORT="8080"
METRICS_PORT="2112"
LISTENER_IP="<Listener_IP>"
LISTENER_PORT="<PORT>"
METRICS_PORT="<METRICS_PORT>"
USE_SSL="false"
CERTFILE="path/to/certfile"
KEYFILE="path/to/keyfile"
SLURM_USER="slurm user here"
SLURM_TOKEN="token string here, from secret when for real"
SLURM_CONTROL_NODE="slurm control node IP:Port"
SLURM_CONTROL_NODE="<SLURM_CONTROL_NODE_IP>"
# List of '|' separated reasons for avoiding drain action if there is a match
SLURM_DRAIN_EXCLUDE_REASON_LIST="reason 1|reason 2"
SLURM_SCONTROL_PATH="/usr/bin/scontrol"

# Match received RAS events on severity and a '|' separated list of message fields, then perform the drain action with DrainReasonPrefix set as the prefix of the reason.
# Message can be left empty if it doesn't need to be matched against; in that case only severity is matched.
# Only the DrainNode action is supported for now.
TRIGGER_EVENTS="[\
{\"Severity\":\"Fatal\",\"Action\":\"DrainNode\"},\
{\"Severity\":\"Critical\",\"Action\":\"DrainNode\"}
{\"Severity\":\"Critical\",\"Message\":\"message 1|This is a critical test event\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNeeded\"},\
{\"Severity\":\"Info\",\"Message\":\"message 3\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"},\
{\"Severity\":\"Warning\",\"Message\":\"message 4|This is a test event message\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"}
]"

# Subscription (v1.5+)
@@ -28,8 +33,8 @@ TRIGGER_EVENTS="[\

# Deprecated <v1.5
SUBSCRIPTION_PAYLOAD="{\
\"Destination\":\"http://host.docker.internal:8080\",\
\"EventTypes\":[\"Alert\",\"StatusChange\"],\
\"Destination\":\"http://<Listener_IP:Port>\",\
\"EventTypes\":[\"Alert\"],\
\"Protocol\":\"Redfish\",\
\"Context\":\"YourContextData\",\
\"Oem\":{\"Supermicro\":{\"EnableSubscription\":true}}\
@@ -41,5 +46,5 @@ PROMETHEUS_CONFIG="{\
}"

REDFISH_SERVERS="[\
{\"ip\":\"http://127.0.0.1:8000\",\"username\":\"Username1\",\"password\":\"Password1\",\"loginType\":\"Session\",\"slurmNode\":\"Node1\"}
]"
{\"ip\":\"https://<BMC_IP>\",\"username\":\"<username>\",\"password\":\"<password>\",\"loginType\":\"Session\",\"slurmNode\":\"<nodename>\"}
]"
30 changes: 18 additions & 12 deletions redfish-exporter/config.go
Original file line number Diff line number Diff line change
@@ -49,21 +49,25 @@ type Config struct {
CertFile string
KeyFile string
}
SlurmToken string
SlurmControlNode string
SlurmUser string
SubscriptionPayload SubscriptionPayload
RedfishServers []RedfishServer
TriggerEvents []TriggerEvent
PrometheusConfig PrometheusConfig
context *tls.Config
eventCount int
dataBuffer []byte
SlurmToken string
SlurmControlNode string
SlurmUser string
SlurmScontrolPath string
SlurmDrainExcludeStr string
SubscriptionPayload SubscriptionPayload
RedfishServers []RedfishServer
TriggerEvents []TriggerEvent
PrometheusConfig PrometheusConfig
context *tls.Config
eventCount int
dataBuffer []byte
}

type TriggerEvent struct {
Severity string `json:"Severity"`
Action string `json:"Action"`
Severity string `json:"Severity"`
Action string `json:"Action"`
Message string `json:"Message"`
DrainReasonPrefix string `json:"DrainReasonPrefix"`
}

type PrometheusConfig struct {
@@ -119,6 +123,8 @@ func setupConfig() Config {
AppConfig.SlurmToken = os.Getenv("SLURM_TOKEN")
AppConfig.SlurmControlNode = os.Getenv("SLURM_CONTROL_NODE")
AppConfig.SlurmUser = os.Getenv("SLURM_USER")
AppConfig.SlurmDrainExcludeStr = os.Getenv("SLURM_DRAIN_EXCLUDE_REASON_LIST")
AppConfig.SlurmScontrolPath = os.Getenv("SLURM_SCONTROL_PATH")

subscriptionPayloadJSON := os.Getenv("SUBSCRIPTION_PAYLOAD")
if err := json.Unmarshal([]byte(subscriptionPayloadJSON), &AppConfig.SubscriptionPayload); err != nil {
24 changes: 22 additions & 2 deletions redfish-exporter/listener.go
Original file line number Diff line number Diff line change
@@ -27,6 +27,7 @@ import (
"log"
"net"
"net/http"
"regexp"
"strings"

"github.com/nod-ai/ADA/redfish-exporter/metrics"
@@ -219,15 +220,34 @@ func (s *Server) processRequest(AppConfig Config, conn net.Conn, req *http.Reque
log.Printf("Origin Of Condition: %s", originOfCondition)
for _, triggerEvent := range AppConfig.TriggerEvents {
if severity == triggerEvent.Severity {
log.Printf("Matched Trigger Event: %s with action %s", triggerEvent.Severity, triggerEvent.Action)
if triggerEvent.Message != "" {
re := regexp.MustCompile(triggerEvent.Message)
match := re.FindAllString(message, -1)

if len(match) == 0 {
continue
}
}
log.Printf("Matched Trigger Event: %s | message: %s | with action %s", triggerEvent.Severity, triggerEvent.Message, triggerEvent.Action)
// Sending event belongs to redfish_utils. Each server may have different slurm node associated, and redfish_servers has the info/map.
if s.slurmQueue != nil {
redfishServerInfo := getServerInfoByIP(AppConfig.RedfishServers, ip)
if len(strings.TrimSpace(redfishServerInfo.SlurmNode)) == 0 {
log.Printf("failed to get the slurm node name, cannot perform action: %v", triggerEvent.Action)
break
}
s.slurmQueue.Add(redfishServerInfo.IP, redfishServerInfo.SlurmNode, triggerEvent.Severity, triggerEvent.Action)
evt := slurm.AddEventReq{
RedfishServerIP: redfishServerInfo.IP,
SlurmNodeName: redfishServerInfo.SlurmNode,
Severity: triggerEvent.Severity,
Action: triggerEvent.Action,
DrainReasonPrefix: triggerEvent.DrainReasonPrefix,
MessageId: messageId,
Message: message,
ExcludeStr: AppConfig.SlurmDrainExcludeStr,
ScontrolPath: AppConfig.SlurmScontrolPath,
}
s.slurmQueue.Add(evt)
}
break
}
11 changes: 0 additions & 11 deletions redfish-exporter/main.go
Original file line number Diff line number Diff line change
@@ -53,17 +53,6 @@ func main() {
defer cancel()
var slurmQueue *slurm.SlurmQueue
if *enableSlurm {
if len(strings.TrimSpace(AppConfig.SlurmToken)) == 0 {
log.Fatalf("Provide slurm token to enable slurm")
}
if len(strings.TrimSpace(AppConfig.SlurmControlNode)) == 0 {
log.Fatalf("Provide slurm control node IP:Port to enable slurm")
}
_, err := slurm.NewClient(AppConfig.SlurmControlNode, AppConfig.SlurmUser, AppConfig.SlurmToken)
if err != nil {
log.Fatalf("failed to create slurm client, err: %+v", err)
}

slurmQueue = slurm.InitSlurmQueue(ctx)
go slurmQueue.ProcessEventActionQueue()
}
61 changes: 45 additions & 16 deletions redfish-exporter/slurm/queue.go
Original file line number Diff line number Diff line change
@@ -2,21 +2,40 @@ package slurm

import (
"context"
"fmt"
"log"
"strings"

"github.com/nod-ai/ADA/redfish-exporter/metrics"
)

const (
Drain = "DrainNode"
Drain = "DrainNode"
ExlcudeReasonSet = "DRAIN_EXCLUDE_REASON_SET"
)

// AddEventReq is the exported request passed to SlurmQueue.Add. Add copies
// every field into an internal eventsActionReq and sends it on the queue
// channel; the listener builds one AddEventReq per matched trigger event.
type AddEventReq struct {
// RedfishServerIP is the IP of the Redfish server entry the event was matched to.
RedfishServerIP string
// SlurmNodeName is the Slurm node associated with that Redfish server; the
// queue skips the request when this is blank.
SlurmNodeName string
// Severity is the severity of the matched trigger event; it is embedded in
// the drain reason string.
Severity string
// Action is the action configured on the trigger event. The queue compares
// it against Drain ("DrainNode").
Action string
// DrainReasonPrefix is the leading component of the generated drain reason.
DrainReasonPrefix string
// MessageId is the message ID taken from the received event; it is embedded
// in the drain reason string.
MessageId string
// Message is the message text taken from the received event; it is embedded
// in the drain reason string.
Message string
// ExcludeStr carries the SLURM_DRAIN_EXCLUDE_REASON_LIST setting, a '|'
// separated list of reasons for which the drain action is avoided on a
// match. It is forwarded unchanged to DrainNodeWithScontrol.
ExcludeStr string
// ScontrolPath carries the SLURM_SCONTROL_PATH setting and is forwarded
// unchanged to DrainNodeWithScontrol.
// NOTE(review): presumably the filesystem path of the scontrol binary — confirm in slurm.go.
ScontrolPath string
}

type eventsActionReq struct {
redfishServerIP string
slurmNodeName string
severity string
action string
redfishServerIP string
slurmNodeName string
severity string
action string
drainReasonPrefix string
messageId string
message string
excludeStr string
scontrolPath string
}

type SlurmQueue struct {
@@ -28,12 +47,17 @@ func InitSlurmQueue(ctx context.Context) *SlurmQueue {
return &SlurmQueue{ctx: ctx, queue: make(chan *eventsActionReq)}
}

func (q *SlurmQueue) Add(redfishServerIP, slurmNodeName, severity, action string) {
func (q *SlurmQueue) Add(evt AddEventReq) {
q.queue <- &eventsActionReq{
redfishServerIP: redfishServerIP,
slurmNodeName: slurmNodeName,
severity: severity,
action: action,
redfishServerIP: evt.RedfishServerIP,
slurmNodeName: evt.SlurmNodeName,
severity: evt.Severity,
action: evt.Action,
drainReasonPrefix: evt.DrainReasonPrefix,
messageId: evt.MessageId,
message: evt.Message,
excludeStr: evt.ExcludeStr,
scontrolPath: evt.ScontrolPath,
}
}

@@ -65,19 +89,24 @@ func (q *SlurmQueue) ProcessEventActionQueue() {
}
}

// getDrainReasonString builds the reason text used when draining a node.
//
// The result has the form "<prefix>:redfishlistener:<severity>:<msgId>:<msg>".
// Note that the output order (severity, msgId, msg) differs from the
// parameter order (msg, msgId, severity).
func getDrainReasonString(prefix, msg, msgId, severity string) string {
	return fmt.Sprintf("%s:redfishlistener:%s:%s:%s", prefix, severity, msgId, msg)
}

func (q *SlurmQueue) performEventAction(req *eventsActionReq) error {
if len(strings.TrimSpace(req.slurmNodeName)) == 0 {
return nil
}

slurmClient := GetClient()
if slurmClient == nil {
return nil
}

if req.action == Drain {
err := slurmClient.DrainNode(req.slurmNodeName)
reason := getDrainReasonString(req.drainReasonPrefix, req.message, req.messageId, req.severity)
err := DrainNodeWithScontrol(req.slurmNodeName, reason, req.excludeStr, req.scontrolPath)
if err != nil {
if strings.Contains(err.Error(), ExlcudeReasonSet) {
log.Printf("Node not drained: %v", err.Error())
return nil
}
log.Printf("Error draining node: %v", err)
return err
}
Loading