-
Notifications
You must be signed in to change notification settings - Fork 2
feat: add socket auditor for forwarding logs to coder agent #124
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| package audit | ||
|
|
||
| // MultiAuditor wraps multiple auditors and sends audit events to all of them. | ||
| type MultiAuditor struct { | ||
| auditors []Auditor | ||
| } | ||
|
|
||
| // NewMultiAuditor creates a new MultiAuditor that sends to all provided auditors. | ||
| func NewMultiAuditor(auditors ...Auditor) *MultiAuditor { | ||
| return &MultiAuditor{auditors: auditors} | ||
| } | ||
|
|
||
| // AuditRequest sends the request to all wrapped auditors. | ||
| func (m *MultiAuditor) AuditRequest(req Request) { | ||
| for _, a := range m.auditors { | ||
| a.AuditRequest(req) | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,240 @@ | ||||||
| package audit | ||||||
|
|
||||||
| import ( | ||||||
| "context" | ||||||
| "encoding/binary" | ||||||
| "fmt" | ||||||
| "log/slog" | ||||||
| "net" | ||||||
| "time" | ||||||
|
|
||||||
| "google.golang.org/protobuf/proto" | ||||||
| "google.golang.org/protobuf/types/known/timestamppb" | ||||||
|
|
||||||
| agentproto "github.com/coder/coder/v2/agent/proto" | ||||||
| ) | ||||||
|
|
||||||
| const ( | ||||||
| defaultBatchSize = 10 | ||||||
| defaultBatchTimerDuration = 5 * time.Second | ||||||
| // DefaultAuditSocketPath is the well-known path for the boundary audit socket. | ||||||
| // The expectation is the Coder agent listens on this socket to receive audit logs. | ||||||
| DefaultAuditSocketPath = "/tmp/boundary-audit.sock" | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||
| ) | ||||||
|
|
||||||
| // SocketAuditor implements the Auditor interface. It sends logs to the | ||||||
| // workspace agent's boundary log proxy socket. It queues logs and sends | ||||||
| // them in batches using a batch size and timer. The internal queue operates | ||||||
| // as a FIFO i.e., logs are sent in the order they are received and dropped | ||||||
| // if the queue is full. | ||||||
| // | ||||||
| // The proto messages sent to the agent are framed as follows: | ||||||
| // [4 bit tag][28 bit length][length bytes of encoded protobuf] | ||||||
| // | ||||||
| // The tag is currently always 1, but may be extended in the future (e.g. | ||||||
| // to support compression). | ||||||
| type SocketAuditor struct { | ||||||
| socketPath string | ||||||
| logger *slog.Logger | ||||||
| logCh chan *agentproto.BoundaryLog | ||||||
| batchSize int | ||||||
| batchTimerDuration time.Duration | ||||||
|
|
||||||
| // onFlushAttempt is called after each flush attempt (intended for testing). | ||||||
| onFlushAttempt func() | ||||||
| } | ||||||
|
|
||||||
| // NewSocketAuditor creates a new SocketAuditor that sends logs to the agent's | ||||||
| // boundary log proxy socket at DefaultAuditSocketPath after SocketAuditor.Loop | ||||||
| // is called. | ||||||
| func NewSocketAuditor(logger *slog.Logger) *SocketAuditor { | ||||||
| return &SocketAuditor{ | ||||||
| socketPath: DefaultAuditSocketPath, | ||||||
| logger: logger, | ||||||
| logCh: make(chan *agentproto.BoundaryLog, 2*defaultBatchSize), | ||||||
| batchSize: defaultBatchSize, | ||||||
| batchTimerDuration: defaultBatchTimerDuration, | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // AuditRequest implements the Auditor interface. It queues the log to be sent to the | ||||||
| // agent in a batch. | ||||||
| func (s *SocketAuditor) AuditRequest(req Request) { | ||||||
| httpReq := &agentproto.BoundaryLog_HttpRequest{ | ||||||
| Method: req.Method, | ||||||
| Url: req.URL, | ||||||
| } | ||||||
| // Only include the matched rule for allowed requests. Boundary is deny by | ||||||
| // default, so rules are what allow requests. | ||||||
| if req.Allowed { | ||||||
| httpReq.MatchedRule = req.Rule | ||||||
zedkipp marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| } | ||||||
|
|
||||||
| log := &agentproto.BoundaryLog{ | ||||||
| Allowed: req.Allowed, | ||||||
| Time: timestamppb.Now(), | ||||||
| Resource: &agentproto.BoundaryLog_HttpRequest_{HttpRequest: httpReq}, | ||||||
| } | ||||||
|
|
||||||
| select { | ||||||
| case s.logCh <- log: | ||||||
| default: | ||||||
| s.logger.Warn("audit log dropped, channel full") | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // flushErr represents an error from flush, distinguishing between | ||||||
| // permanent errors (bad data) and transient errors (network issues). | ||||||
| type flushErr struct { | ||||||
| err error | ||||||
| permanent bool | ||||||
| } | ||||||
|
|
||||||
| func (e *flushErr) Error() string { return e.err.Error() } | ||||||
|
|
||||||
| // flush sends the current batch of logs to the given connection. | ||||||
| func flush(conn net.Conn, logs []*agentproto.BoundaryLog) *flushErr { | ||||||
| if len(logs) == 0 { | ||||||
| return nil | ||||||
| } | ||||||
|
|
||||||
| req := &agentproto.ReportBoundaryLogsRequest{ | ||||||
| Logs: logs, | ||||||
| } | ||||||
|
|
||||||
| data, err := proto.Marshal(req) | ||||||
| if err != nil { | ||||||
| return &flushErr{err: err, permanent: true} | ||||||
| } | ||||||
|
|
||||||
| if len(data) > 1<<28 { | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use constants instead of magic numbers, and document how this relates to the wire protocol. |
||||||
| return &flushErr{err: fmt.Errorf("data too large: %d bytes", len(data)), permanent: true} | ||||||
| } | ||||||
|
|
||||||
| var header uint32 | ||||||
| header |= uint32(len(data)) | ||||||
| header |= 1 << 28 | ||||||
|
|
||||||
| if err := binary.Write(conn, binary.BigEndian, header); err != nil { | ||||||
| return &flushErr{err: err} | ||||||
| } | ||||||
| if _, err := conn.Write(data); err != nil { | ||||||
| return &flushErr{err: err} | ||||||
| } | ||||||
| return nil | ||||||
| } | ||||||
|
|
||||||
| // Loop handles the I/O to send audit logs to the agent. | ||||||
| func (s *SocketAuditor) Loop(ctx context.Context) { | ||||||
| var conn net.Conn | ||||||
| batch := make([]*agentproto.BoundaryLog, 0, s.batchSize) | ||||||
| t := time.NewTimer(0) | ||||||
| t.Stop() | ||||||
|
|
||||||
| // connect attempts to establish a connection to the socket. | ||||||
| connect := func() { | ||||||
| if conn != nil { | ||||||
| return | ||||||
| } | ||||||
| var err error | ||||||
| conn, err = net.Dial("unix", s.socketPath) | ||||||
| if err != nil { | ||||||
| s.logger.Warn("failed to connect to audit socket", "path", s.socketPath, "error", err) | ||||||
| conn = nil | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // closeConn closes the current connection if open. | ||||||
| closeConn := func() { | ||||||
| if conn != nil { | ||||||
| _ = conn.Close() | ||||||
| conn = nil | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // clearBatch resets the length of the batch and frees memory while preserving | ||||||
| // the batch slice backing array. | ||||||
| clearBatch := func() { | ||||||
| for i := range len(batch) { | ||||||
| batch[i] = nil | ||||||
| } | ||||||
| batch = batch[:0] | ||||||
| } | ||||||
|
|
||||||
| // doFlush flushes the batch and handles errors by reconnecting. | ||||||
| doFlush := func() { | ||||||
| t.Stop() | ||||||
| defer func() { | ||||||
| if s.onFlushAttempt != nil { | ||||||
| s.onFlushAttempt() | ||||||
| } | ||||||
| }() | ||||||
| if len(batch) == 0 { | ||||||
| return | ||||||
| } | ||||||
| connect() | ||||||
| if conn == nil { | ||||||
| // No connection: logs will be retried on next flush. | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should be logged. |
||||||
| return | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This could lead to buffered logs being stuck until the buffer fills up, which may be a very long time. |
||||||
| } | ||||||
|
|
||||||
| if err := flush(conn, batch); err != nil { | ||||||
| s.logger.Warn("failed to flush audit logs", "error", err) | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit:
Suggested change
|
||||||
| if err.permanent { | ||||||
| // Data error: discard batch to avoid infinite retries. | ||||||
| clearBatch() | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This needs to be logged. |
||||||
| } else { | ||||||
| // Network error: close connection but keep batch for a future retry. | ||||||
| closeConn() | ||||||
| } | ||||||
| return | ||||||
| } | ||||||
|
|
||||||
| clearBatch() | ||||||
| } | ||||||
|
|
||||||
| connect() | ||||||
|
|
||||||
| for { | ||||||
| select { | ||||||
| case <-ctx.Done(): | ||||||
| // Drain any pending logs before the last flush. Not concerned about | ||||||
| // growing the batch slice here since we're exiting. | ||||||
| drain: | ||||||
| for { | ||||||
| select { | ||||||
| case log := <-s.logCh: | ||||||
| batch = append(batch, log) | ||||||
| default: | ||||||
| break drain | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| doFlush() | ||||||
| closeConn() | ||||||
| return | ||||||
| case <-t.C: | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: |
||||||
| doFlush() | ||||||
| case log := <-s.logCh: | ||||||
| // If batch is at capacity, attempt flushing first and drop the log if | ||||||
| // the batch still full. | ||||||
| if len(batch) >= s.batchSize { | ||||||
| doFlush() | ||||||
| if len(batch) >= s.batchSize { | ||||||
| s.logger.Warn("audit log dropped, batch full") | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if we should report something up to the agent in this case. |
||||||
| continue | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| batch = append(batch, log) | ||||||
|
|
||||||
| if len(batch) == 1 { | ||||||
| t.Reset(s.batchTimerDuration) | ||||||
| } | ||||||
|
|
||||||
| if len(batch) >= s.batchSize { | ||||||
| doFlush() | ||||||
| } | ||||||
| } | ||||||
| } | ||||||
| } | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Magic numbers should be documented please.