Skip to content

Commit f1dd633

Browse files
committed
multi: update collectors to exit on failure, rather than log
Update all of our collectors to shut down on failure rather than silently log. This, paired with restarting the lndmon container on exit, allows easier detection of persistent issues, and a simple restart when lnd is temporarily unavailable.
1 parent 86bd96c commit f1dd633

10 files changed

+125
-70
lines changed

collectors/chain_collector.go

+12-3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package collectors
22

33
import (
44
"context"
5+
"fmt"
56

67
"github.com/lightninglabs/lndclient"
78
"github.com/prometheus/client_golang/prometheus"
@@ -14,11 +15,17 @@ type ChainCollector struct {
1415
syncedToChain *prometheus.Desc
1516

1617
lnd lndclient.LightningClient
18+
19+
// errChan is a channel that we send any errors that we encounter into.
20+
// This channel should be buffered so that it does not block sends.
21+
errChan chan<- error
1722
}
1823

1924
// NewChainCollector returns a new instance of the ChainCollector for the target
2025
// lnd client.
21-
func NewChainCollector(lnd lndclient.LightningClient) *ChainCollector {
26+
func NewChainCollector(lnd lndclient.LightningClient,
27+
errChan chan<- error) *ChainCollector {
28+
2229
return &ChainCollector{
2330
bestBlockHeight: prometheus.NewDesc(
2431
"lnd_chain_block_height", "best block height from lnd",
@@ -34,7 +41,8 @@ func NewChainCollector(lnd lndclient.LightningClient) *ChainCollector {
3441
"lnd is synced to the chain",
3542
nil, nil,
3643
),
37-
lnd: lnd,
44+
lnd: lnd,
45+
errChan: errChan,
3846
}
3947
}
4048

@@ -55,7 +63,8 @@ func (c *ChainCollector) Describe(ch chan<- *prometheus.Desc) {
5563
func (c *ChainCollector) Collect(ch chan<- prometheus.Metric) {
5664
resp, err := c.lnd.GetInfo(context.Background())
5765
if err != nil {
58-
chainLogger.Error(err)
66+
c.errChan <- fmt.Errorf("ChainCollector GetInfo failed with: "+
67+
"%v", err)
5968
return
6069
}
6170

collectors/channels_collector.go

+14-5
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,15 @@ type ChannelsCollector struct {
4444
lnd lndclient.LightningClient
4545

4646
primaryNode *route.Vertex
47+
48+
// errChan is a channel that we send any errors that we encounter into.
49+
// This channel should be buffered so that it does not block sends.
50+
errChan chan<- error
4751
}
4852

4953
// NewChannelsCollector returns a new instance of the ChannelsCollector for the
5054
// target lnd client.
51-
func NewChannelsCollector(lnd lndclient.LightningClient,
55+
func NewChannelsCollector(lnd lndclient.LightningClient, errChan chan<- error,
5256
cfg *MonitoringConfig) *ChannelsCollector {
5357

5458
// Our set of labels, status should either be active or inactive. The
@@ -155,6 +159,7 @@ func NewChannelsCollector(lnd lndclient.LightningClient,
155159

156160
lnd: lnd,
157161
primaryNode: cfg.PrimaryNode,
162+
errChan: errChan,
158163
}
159164
}
160165

@@ -201,7 +206,8 @@ func (c *ChannelsCollector) Collect(ch chan<- prometheus.Metric) {
201206
// pending channel balances.
202207
chanBalResp, err := c.lnd.ChannelBalance(context.Background())
203208
if err != nil {
204-
channelLogger.Error(err)
209+
c.errChan <- fmt.Errorf("ChannelsCollector ChannelBalance "+
210+
"failed with: %v", err)
205211
return
206212
}
207213

@@ -218,7 +224,8 @@ func (c *ChannelsCollector) Collect(ch chan<- prometheus.Metric) {
218224
// have open.
219225
getInfoResp, err := c.lnd.GetInfo(context.Background())
220226
if err != nil {
221-
channelLogger.Error(err)
227+
c.errChan <- fmt.Errorf("ChannelsCollector GetInfo failed "+
228+
"with: %v", err)
222229
return
223230
}
224231

@@ -239,7 +246,8 @@ func (c *ChannelsCollector) Collect(ch chan<- prometheus.Metric) {
239246
// as well as the number of pending HTLCs.
240247
listChannelsResp, err := c.lnd.ListChannels(context.Background())
241248
if err != nil {
242-
channelLogger.Error(err)
249+
c.errChan <- fmt.Errorf("ChannelsCollector ListChannels "+
250+
"failed with: %v", err)
243251
return
244252
}
245253

@@ -347,7 +355,8 @@ func (c *ChannelsCollector) Collect(ch chan<- prometheus.Metric) {
347355
// Get all remote policies
348356
remotePolicies, err := c.getRemotePolicies(getInfoResp.IdentityPubkey)
349357
if err != nil {
350-
channelLogger.Error(err)
358+
c.errChan <- fmt.Errorf("ChannelsCollector getRemotePolicies "+
359+
"failed with: %v", err)
351360
return
352361
}
353362

collectors/graph_collector.go

+14-4
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package collectors
22

33
import (
44
"context"
5+
"fmt"
56

67
"github.com/lightninglabs/lndclient"
78
"github.com/prometheus/client_golang/prometheus"
@@ -51,11 +52,17 @@ type GraphCollector struct {
5152
avgMaxHtlcMsatDesc *prometheus.Desc
5253

5354
lnd lndclient.LightningClient
55+
56+
// errChan is a channel that we send any errors that we encounter into.
57+
// This channel should be buffered so that it does not block sends.
58+
errChan chan<- error
5459
}
5560

5661
// NewGraphCollector returns a new instance of the GraphCollector for the target
5762
// lnd client.
58-
func NewGraphCollector(lnd lndclient.LightningClient) *GraphCollector {
63+
func NewGraphCollector(lnd lndclient.LightningClient,
64+
errChan chan<- error) *GraphCollector {
65+
5966
return &GraphCollector{
6067
numEdgesDesc: prometheus.NewDesc(
6168
"lnd_graph_edges_count",
@@ -221,7 +228,8 @@ func NewGraphCollector(lnd lndclient.LightningClient) *GraphCollector {
221228
nil, nil,
222229
),
223230

224-
lnd: lnd,
231+
lnd: lnd,
232+
errChan: errChan,
225233
}
226234
}
227235

@@ -278,7 +286,8 @@ func (g *GraphCollector) Describe(ch chan<- *prometheus.Desc) {
278286
func (g *GraphCollector) Collect(ch chan<- prometheus.Metric) {
279287
resp, err := g.lnd.DescribeGraph(context.Background(), false)
280288
if err != nil {
281-
graphLogger.Error(err)
289+
g.errChan <- fmt.Errorf("GraphCollector DescribeGraph failed "+
290+
"with: %v", err)
282291
return
283292
}
284293

@@ -295,7 +304,8 @@ func (g *GraphCollector) Collect(ch chan<- prometheus.Metric) {
295304

296305
networkInfo, err := g.lnd.NetworkInfo(context.Background())
297306
if err != nil {
298-
graphLogger.Error(err)
307+
g.errChan <- fmt.Errorf("GraphCollector NetworkInfo failed "+
308+
"with: %v", err)
299309
return
300310
}
301311

collectors/info_collector.go

+12-3
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package collectors
33
import (
44
"context"
55
"encoding/hex"
6+
"fmt"
67

78
"github.com/lightninglabs/lndclient"
89
"github.com/prometheus/client_golang/prometheus"
@@ -13,17 +14,24 @@ type InfoCollector struct {
1314
info *prometheus.Desc
1415

1516
lnd lndclient.LightningClient
17+
18+
// errChan is a channel that we send any errors that we encounter into.
19+
// This channel should be buffered so that it does not block sends.
20+
errChan chan<- error
1621
}
1722

1823
// NewInfoCollector returns a new instance of the InfoCollector for the target
1924
// lnd client.
20-
func NewInfoCollector(lnd lndclient.LightningClient) *InfoCollector {
25+
func NewInfoCollector(lnd lndclient.LightningClient,
26+
errChan chan<- error) *InfoCollector {
27+
2128
labels := []string{"version", "alias", "pubkey"}
2229
return &InfoCollector{
2330
info: prometheus.NewDesc(
2431
"lnd_info", "lnd node info", labels, nil,
2532
),
26-
lnd: lnd,
33+
lnd: lnd,
34+
errChan: errChan,
2735
}
2836
}
2937

@@ -42,7 +50,8 @@ func (c *InfoCollector) Describe(ch chan<- *prometheus.Desc) {
4250
func (c *InfoCollector) Collect(ch chan<- prometheus.Metric) {
4351
resp, err := c.lnd.GetInfo(context.Background())
4452
if err != nil {
45-
chainLogger.Error(err)
53+
c.errChan <- fmt.Errorf("InfoCollector GetInfo failed with: "+
54+
"%v", err)
4655
return
4756
}
4857

collectors/log.go

-15
Original file line numberDiff line numberDiff line change
@@ -17,21 +17,6 @@ var (
1717

1818
// Logger for lndmon's main process.
1919
Logger = backendLog.Logger("LNDMON")
20-
21-
// graphLogger for lndmon's graph collector.
22-
graphLogger = build.NewSubLogger("GRPH", backendLog.Logger)
23-
24-
// peerLogger for lndmon's peer collector.
25-
peerLogger = build.NewSubLogger("PEER", backendLog.Logger)
26-
27-
// channelLogger for lndmon's channel collector.
28-
channelLogger = build.NewSubLogger("CHAN", backendLog.Logger)
29-
30-
// chainLogger for lndmon's chain collector.
31-
chainLogger = build.NewSubLogger("BLCN", backendLog.Logger)
32-
33-
// walletLogger for lndmon's wallet collector.
34-
walletLogger = build.NewSubLogger("WALT", backendLog.Logger)
3520
)
3621

3722
// initLogRotator initializes the logging rotator to write logs to logFile and

collectors/peer_collector.go

+12-3
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package collectors
33
import (
44
"context"
55
"encoding/hex"
6+
"fmt"
67

78
"github.com/lightninglabs/lndclient"
89
"github.com/prometheus/client_golang/prometheus"
@@ -21,11 +22,17 @@ type PeerCollector struct {
2122
bytesRecvDesc *prometheus.Desc
2223

2324
lnd lndclient.LightningClient
25+
26+
// errChan is a channel that we send any errors that we encounter into.
27+
// This channel should be buffered so that it does not block sends.
28+
errChan chan<- error
2429
}
2530

2631
// NewPeerCollector returns a new instance of the PeerCollector for the target
2732
// lnd client.
28-
func NewPeerCollector(lnd lndclient.LightningClient) *PeerCollector {
33+
func NewPeerCollector(lnd lndclient.LightningClient,
34+
errChan chan<- error) *PeerCollector {
35+
2936
perPeerLabels := []string{"pubkey"}
3037
return &PeerCollector{
3138
peerCountDesc: prometheus.NewDesc(
@@ -54,7 +61,8 @@ func NewPeerCollector(lnd lndclient.LightningClient) *PeerCollector {
5461
"bytes transmitted from this peer",
5562
perPeerLabels, nil,
5663
),
57-
lnd: lnd,
64+
lnd: lnd,
65+
errChan: errChan,
5866
}
5967
}
6068

@@ -81,7 +89,8 @@ func (p *PeerCollector) Describe(ch chan<- *prometheus.Desc) {
8189
func (p *PeerCollector) Collect(ch chan<- prometheus.Metric) {
8290
listPeersResp, err := p.lnd.ListPeers(context.Background())
8391
if err != nil {
84-
peerLogger.Error(err)
92+
p.errChan <- fmt.Errorf("PeerCollector ListPeers failed with: "+
93+
"%v", err)
8594
return
8695
}
8796

collectors/prometheus.go

+30-16
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
"net/http"
77
"os"
88
"path/filepath"
9-
"sync"
109

1110
"github.com/btcsuite/btcutil"
1211
"github.com/lightninglabs/lndclient"
@@ -16,8 +15,6 @@ import (
1615
)
1716

1817
var (
19-
metricsMtx sync.Mutex
20-
2118
// log configuration defaults.
2219
defaultLogFilename = "lndmon.log"
2320
defaultLogFileSize = 10
@@ -33,6 +30,13 @@ type PrometheusExporter struct {
3330
lnd *lndclient.LndServices
3431

3532
monitoringCfg *MonitoringConfig
33+
34+
// collectors is the exporter's active set of collectors.
35+
collectors []prometheus.Collector
36+
37+
// errChan is an error channel that we receive errors from our
38+
// collectors on.
39+
errChan <-chan error
3640
}
3741

3842
// PrometheusConfig is the set of configuration data that specifies the
@@ -74,10 +78,26 @@ func DefaultConfig() *PrometheusConfig {
7478
func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices,
7579
monitoringCfg *MonitoringConfig) *PrometheusExporter {
7680

81+
// We have six collectors, so we buffer our error channel by 6 so that
82+
// we do not need to consume all errors from this channel (on the first
83+
// one, we'll start shutting down, but a few could arrive quickly).
84+
errChan := make(chan error, 6)
85+
7786
return &PrometheusExporter{
7887
cfg: cfg,
7988
lnd: lnd,
8089
monitoringCfg: monitoringCfg,
90+
collectors: []prometheus.Collector{
91+
NewChainCollector(lnd.Client, errChan),
92+
NewChannelsCollector(
93+
lnd.Client, errChan, monitoringCfg,
94+
),
95+
NewWalletCollector(lnd, errChan),
96+
NewGraphCollector(lnd.Client, errChan),
97+
NewPeerCollector(lnd.Client, errChan),
98+
NewInfoCollector(lnd.Client, errChan),
99+
},
100+
errChan: errChan,
81101
}
82102
}
83103

@@ -128,23 +148,17 @@ func (p *PrometheusExporter) Start() error {
128148
return nil
129149
}
130150

151+
// Errors returns an error channel that delivers any failures experienced by
152+
// the exporter's collectors.
153+
func (p *PrometheusExporter) Errors() <-chan error {
154+
return p.errChan
155+
}
156+
131157
// registerMetrics iterates through all the registered collectors and attempts
132158
// to register each one. If any of the collectors fail to register, then an
133159
// error will be returned.
134160
func (p *PrometheusExporter) registerMetrics() error {
135-
metricsMtx.Lock()
136-
defer metricsMtx.Unlock()
137-
138-
collectors := []prometheus.Collector{
139-
NewChainCollector(p.lnd.Client),
140-
NewChannelsCollector(p.lnd.Client, p.monitoringCfg),
141-
NewWalletCollector(p.lnd),
142-
NewGraphCollector(p.lnd.Client),
143-
NewPeerCollector(p.lnd.Client),
144-
NewInfoCollector(p.lnd.Client),
145-
}
146-
147-
for _, collector := range collectors {
161+
for _, collector := range p.collectors {
148162
err := prometheus.Register(collector)
149163
if err != nil {
150164
return err

0 commit comments

Comments
 (0)