Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update the code to use scontrol in place of slurm APIs to drain the node #74

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 18 additions & 11 deletions redfish-exporter/.env
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
UPDATED="2024-09-24"
DESCRIPTION="Redfish Event Listener/Exporter"
LISTENER_IP="0.0.0.0"
LISTENER_PORT="8080"
LISTENER_IP="10.11.18.55"
LISTENER_PORT="9003"
METRICS_PORT="2112"
USE_SSL="false"
CERTFILE="path/to/certfile"
KEYFILE="path/to/keyfile"
SLURM_USER="slurm user here"
SLURM_TOKEN="token string here, from secret when for real"
SLURM_CONTROL_NODE="slurm control node IP:Port"
SLURM_CONTROL_NODE="10.235.34.47"
SLURM_DRAIN_EXCLUDE_REASON_LIST="AMD|Pensando|RebootNeeded"
SLURM_SCONTROL_PATH="/usr/bin/scontrol"
TLS_TIMEOUT="15"

TRIGGER_EVENTS="[\
{\"Severity\":\"Fatal\",\"Action\":\"DrainNode\"},\
{\"Severity\":\"Critical\",\"Action\":\"DrainNode\"}
{\"Severity\":\"Critical\",\"Message\":\"Image 'UBB_FPGA' is being verified at 'ERoT'|This is an e2e critical test event\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNeeded\"},\
{\"Severity\":\"Info\",\"Message\":\"Image 'UBB_FPGA' is being verified at 'ERoT'\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"},\
{\"Severity\":\"Warning\",\"Message\":\"Image 'UBB_FPGA' is being verified at 'ERoT'|This is an e2e test event message\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"}
]"

# Subscription (v1.5+)
Expand All @@ -28,11 +30,11 @@ TRIGGER_EVENTS="[\

# Deprecated <v1.5
SUBSCRIPTION_PAYLOAD="{\
\"Destination\":\"http://host.docker.internal:8080\",\
\"EventTypes\":[\"Alert\",\"StatusChange\"],\
\"Destination\":\"http://10.11.18.55:9003\",\
\"EventTypes\":[\"Alert\"],\
\"Protocol\":\"Redfish\",\
\"Context\":\"YourContextData\",\
\"Oem\":{\"Supermicro\":{\"EnableSubscription\":true}}\
\"Oem\":{\"Supermicro\": {\"EnableSubscription\": true}}\
}"

# Config for setting default labels in Prometheus counter metrics.
Expand All @@ -41,5 +43,10 @@ PROMETHEUS_CONFIG="{\
}"

REDFISH_SERVERS="[\
{\"ip\":\"http://127.0.0.1:8000\",\"username\":\"Username1\",\"password\":\"Password1\",\"loginType\":\"Session\",\"slurmNode\":\"Node1\"}
{\"ip\":\"https://10.235.37.54\",\"username\":\"ADMIN\",\"password\":\"PHHCJZUHDV\",\"loginType\":\"Session\",\"slurmNode\":\"smc300x-ccs-aus-GPUFCE9\"},
{\"ip\":\"https://10.235.37.48\",\"username\":\"ADMIN\",\"password\":\"PHHCJZUHDV\",\"loginType\":\"Session\",\"slurmNode\":\"smc300x-ccs-aus-GPUFCE9\"}

]"

REDFISH_SERVERS_COMMON_CONFIG="{\
\"hostSuffix\":\"ipmi.cluster\",\"username\":\"<username>\",\"password\":\"<password>\"}"
99 changes: 84 additions & 15 deletions redfish-exporter/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,15 @@ package main
import (
"crypto/tls"
"encoding/json"
"fmt"
"log"
"net"
"os"
"strconv"
"strings"

"github.com/joho/godotenv"
"gopkg.in/yaml.v3"
)

const (
Expand All @@ -49,28 +52,38 @@ type Config struct {
CertFile string
KeyFile string
}
SlurmToken string
SlurmControlNode string
SlurmUser string
SubscriptionPayload SubscriptionPayload
RedfishServers []RedfishServer
TriggerEvents []TriggerEvent
PrometheusConfig PrometheusConfig
context *tls.Config
eventCount int
dataBuffer []byte
SlurmToken string
SlurmControlNode string
SlurmUser string
SlurmScontrolPath string
SlurmDrainExcludeStr string
SubscriptionPayload SubscriptionPayload
RedfishServers []RedfishServer
TriggerEvents []TriggerEvent
PrometheusConfig PrometheusConfig
context *tls.Config
eventCount int
dataBuffer []byte
TlsTimeOut string
}

type TriggerEvent struct {
Severity string `json:"Severity"`
Action string `json:"Action"`
Severity string `json:"Severity"`
Action string `json:"Action"`
Message string `json:"Message"`
DrainReasonPrefix string `json:"DrainReasonPrefix"`
}

// PrometheusConfig holds Prometheus-related settings decoded from JSON.
// NOTE(review): presumably filled from the PROMETHEUS_CONFIG environment
// variable and used for default labels on counter metrics — confirm against
// the part of setupConfig that is not visible here.
type PrometheusConfig struct {
// Severity is the list of strings read from the "Severity" JSON key.
Severity []string `json:"Severity"`
}

func setupConfig() Config {
// target is one entry of the YAML target file passed to setupConfig.
// The file is decoded as a list of these entries, and every name in Targets
// is treated as a host name for which a Redfish server entry is created.
type target struct {
// Targets is the list of host names read from the "targets" YAML key.
Targets []string `yaml:"targets"`
// Labels is the string-to-string map read from the "labels" YAML key.
Labels map[string]string `yaml:"labels"`
}

func setupConfig(targetFile string) Config {
// Load .env file
err := godotenv.Load()
if err != nil {
Expand Down Expand Up @@ -119,6 +132,9 @@ func setupConfig() Config {
AppConfig.SlurmToken = os.Getenv("SLURM_TOKEN")
AppConfig.SlurmControlNode = os.Getenv("SLURM_CONTROL_NODE")
AppConfig.SlurmUser = os.Getenv("SLURM_USER")
AppConfig.SlurmDrainExcludeStr = os.Getenv("SLURM_DRAIN_EXCLUDE_REASON_LIST")
AppConfig.SlurmScontrolPath = os.Getenv("SLURM_SCONTROL_PATH")
AppConfig.TlsTimeOut = os.Getenv("TLS_TIMEOUT")

subscriptionPayloadJSON := os.Getenv("SUBSCRIPTION_PAYLOAD")
if err := json.Unmarshal([]byte(subscriptionPayloadJSON), &AppConfig.SubscriptionPayload); err != nil {
Expand Down Expand Up @@ -148,10 +164,63 @@ func setupConfig() Config {
redfishServersJSON := os.Getenv("REDFISH_SERVERS")
if redfishServersJSON == "" {
log.Println("REDFISH_SERVERS environment variable is not set or is empty")
} else {
if err := json.Unmarshal([]byte(redfishServersJSON), &AppConfig.RedfishServers); err != nil {
log.Fatalf("Failed to parse REDFISH_SERVERS: %v", err)
}
}

// Read and parse the REDFISH_SERVERS_COMMON_CONFIG environment variable
redfishServersCommonConfigJSON := os.Getenv("REDFISH_SERVERS_COMMON_CONFIG")
if redfishServersCommonConfigJSON == "" {
log.Println("redfishServersCommonConfigJSON environment variable is not set or is empty")
return AppConfig
}
if err := json.Unmarshal([]byte(redfishServersJSON), &AppConfig.RedfishServers); err != nil {
log.Fatalf("Failed to parse REDFISH_SERVERS: %v", err)
redfishServersCommonConfig := RedfishServersCommongConfig{}
if err := json.Unmarshal([]byte(redfishServersCommonConfigJSON), &redfishServersCommonConfig); err != nil {
log.Fatalf("Failed to parse REDFISH_SERVERS_COMMON_CONFIG: %v", err)
}

if targetFile == "" {
log.Println("No target file provided")
return AppConfig
}

targetYamlFile, err := os.ReadFile(targetFile)

if err != nil {
log.Fatalf("Failed to read file: %v", targetFile)
}

targets := []target{}

err = yaml.Unmarshal(targetYamlFile, &targets)

if err != nil {
log.Fatalf("Error parsing target file: %v | err: %v", targetFile, err)
}

for _, t := range targets {
log.Println("target: ", t.Targets)

for _, hostName := range t.Targets {
// add this target to Redfish servers
server := RedfishServer{}
bmcHost := fmt.Sprintf(hostName+".%v", redfishServersCommonConfig.HostSuffix)
ips, err := net.LookupIP(bmcHost)
if err != nil || len(ips) == 0 {
log.Printf("[error] Couldn't get the IP for host: %v | ips: %v | err: %v", bmcHost, ips, err)
continue
}
log.Println("IPs: ", ips)

server.IP = fmt.Sprintf("https://%v", ips[0])
server.LoginType = "Session"
server.Username = redfishServersCommonConfig.UserName
server.Password = redfishServersCommonConfig.Password
server.SlurmNode = hostName
AppConfig.RedfishServers = append(AppConfig.RedfishServers, server)
}
}

return AppConfig
Expand Down
1 change: 1 addition & 0 deletions redfish-exporter/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ require (
github.com/nod-ai/ADA/redfish-exporter v0.0.0-20241002210630-2ef2d1070d90
github.com/prometheus/client_golang v1.20.4
github.com/stmcginnis/gofish v0.19.0
gopkg.in/yaml.v3 v3.0.1
)

require (
Expand Down
3 changes: 3 additions & 0 deletions redfish-exporter/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,6 @@ golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
24 changes: 22 additions & 2 deletions redfish-exporter/listener.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"log"
"net"
"net/http"
"regexp"
"strings"

"github.com/nod-ai/ADA/redfish-exporter/metrics"
Expand Down Expand Up @@ -219,15 +220,34 @@ func (s *Server) processRequest(AppConfig Config, conn net.Conn, req *http.Reque
log.Printf("Origin Of Condition: %s", originOfCondition)
for _, triggerEvent := range AppConfig.TriggerEvents {
if severity == triggerEvent.Severity {
log.Printf("Matched Trigger Event: %s with action %s", triggerEvent.Severity, triggerEvent.Action)
if triggerEvent.Message != "" {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, the action will only be triggered if the event message matches as well as the severity?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the user doesn't want to match on the 'Message' field, it can be left empty; the trigger then matches on severity alone. I will document this in the comments in .env.

re := regexp.MustCompile(triggerEvent.Message)
match := re.FindAllString(message, -1)

if len(match) == 0 {
continue
}
}
log.Printf("Matched Trigger Event: %s | message: %s | with action %s", triggerEvent.Severity, triggerEvent.Message, triggerEvent.Action)
// Sending event belongs to redfish_utils. Each server may have different slurm node associated, and redfish_servers has the info/map.
if s.slurmQueue != nil {
redfishServerInfo := getServerInfoByIP(AppConfig.RedfishServers, ip)
if len(strings.TrimSpace(redfishServerInfo.SlurmNode)) == 0 {
log.Printf("failed to get the slurm node name, cannot perform action: %v", triggerEvent.Action)
break
}
s.slurmQueue.Add(redfishServerInfo.IP, redfishServerInfo.SlurmNode, triggerEvent.Severity, triggerEvent.Action)
evt := slurm.AddEventReq{
RedfishServerIP: redfishServerInfo.IP,
SlurmNodeName: redfishServerInfo.SlurmNode,
Severity: triggerEvent.Severity,
Action: triggerEvent.Action,
DrainReasonPrefix: triggerEvent.DrainReasonPrefix,
MessageId: messageId,
Message: message,
ExcludeStr: AppConfig.SlurmDrainExcludeStr,
ScontrolPath: AppConfig.SlurmScontrolPath,
}
s.slurmQueue.Add(evt)
}
break
}
Expand Down
31 changes: 17 additions & 14 deletions redfish-exporter/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"os/signal"
"strconv"
"strings"
"sync"
"syscall"
"time"

Expand All @@ -36,15 +37,24 @@ import (

func main() {
var (
enableSlurm = flag.Bool("enable-slurm", false, "Enable slurm")
targetFile string
enableSlurm = flag.Bool("enable-slurm", false, "Enable slurm")
subscriptionMapLock sync.Mutex // to guard access to the map
)

flag.StringVar(&targetFile, "target", "", "Path to the target file for host/slurm node names")
flag.Parse()

log.SetFlags(log.LstdFlags | log.Lshortfile)
log.Println("Starting Redfish Event Listener/Exporter")

/*
if targetFile == "" {
log.Fatalf("Target file for host/slurm node names not set: Usage: ./amd-redfish-exporter --enable-slurm -target <filename>")
}
*/
// Setup configuration
AppConfig := setupConfig()
AppConfig := setupConfig(targetFile)

// Log the initialized config
log.Printf("Initialized Config: %+v", AppConfig)
Expand All @@ -53,23 +63,14 @@ func main() {
defer cancel()
var slurmQueue *slurm.SlurmQueue
if *enableSlurm {
if len(strings.TrimSpace(AppConfig.SlurmToken)) == 0 {
log.Fatalf("Provide slurm token to enable slurm")
}
if len(strings.TrimSpace(AppConfig.SlurmControlNode)) == 0 {
log.Fatalf("Provide slurm control node IP:Port to enable slurm")
}
_, err := slurm.NewClient(AppConfig.SlurmControlNode, AppConfig.SlurmUser, AppConfig.SlurmToken)
if err != nil {
log.Fatalf("failed to create slurm client, err: %+v", err)
}

slurmQueue = slurm.InitSlurmQueue(ctx)
go slurmQueue.ProcessEventActionQueue()
}

subscriptionMap := make(map[string]string)

// Subscribe the listener to the event stream for all servers
subscriptionMap, err := CreateSubscriptionsForAllServers(AppConfig.RedfishServers, AppConfig.SubscriptionPayload)
err := CreateSubscriptionsForAllServers(AppConfig.RedfishServers, AppConfig.SubscriptionPayload, subscriptionMap, &subscriptionMapLock, AppConfig.TlsTimeOut)
if err != nil {
log.Fatal(err)
}
Expand Down Expand Up @@ -110,7 +111,9 @@ func main() {
time.Sleep(time.Second)

// Unsubscribe the listener from all servers
subscriptionMapLock.Lock()
DeleteSubscriptionsFromAllServers(AppConfig.RedfishServers, subscriptionMap)
subscriptionMapLock.Unlock()

cancel()

Expand Down
Loading