Commit 0a73cef

feat: refactor producer, improve performance and observability (#312)
- improved performance in high-contention scenarios
1 parent 270ed16 commit 0a73cef

13 files changed (+447 / -336 lines)

consumer/shard_monitor.go

Lines changed: 3 additions & 41 deletions
@@ -1,23 +1,22 @@
 package consumerLibrary
 
 import (
-	"fmt"
-	"math"
 	"time"
 
 	"go.uber.org/atomic"
 
 	sls "github.com/aliyun/aliyun-log-go-sdk"
+	"github.com/aliyun/aliyun-log-go-sdk/internal"
 	"github.com/go-kit/kit/log"
 )
 
 type MonitorMetrics struct {
 	fetchReqFailedCount atomic.Int64
 	logRawSize atomic.Int64
-	fetchLogHistogram TimeHistogram // in us
+	fetchLogHistogram internal.TimeHistogram // in us
 
 	processFailedCount atomic.Int64
-	processHistogram TimeHistogram // in us
+	processHistogram internal.TimeHistogram // in us
 }
 
 type ShardMonitor struct {
@@ -77,40 +76,3 @@ func (m *ShardMonitor) reportByLogger(logger log.Logger) {
 		"process", metrics.processHistogram.String(),
 	)
 }
-
-type TimeHistogram struct {
-	Count atomic.Int64
-	Sum atomic.Float64
-	SumSquare atomic.Float64
-}
-
-func (h *TimeHistogram) AddSample(v float64) {
-	h.Count.Inc()
-	h.Sum.Add(v)
-	h.SumSquare.Add(v * v)
-}
-
-func (h *TimeHistogram) String() string {
-	avg := h.Avg()
-	stdDev := h.StdDev()
-	count := h.Count.Load()
-	return fmt.Sprintf("{avg: %.1fus, stdDev: %.1fus, count: %d}", avg, stdDev, count)
-}
-
-func (h *TimeHistogram) Avg() float64 {
-	count := h.Count.Load()
-	if count == 0 {
-		return 0
-	}
-	return h.Sum.Load() / float64(count)
-}
-
-func (h *TimeHistogram) StdDev() float64 {
-	count := h.Count.Load()
-	if count < 2 {
-		return 0
-	}
-	div := float64(count * (count - 1))
-	num := (float64(count) * h.SumSquare.Load()) - math.Pow(h.Sum.Load(), 2)
-	return math.Sqrt(num / div)
-}
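Only the type of the two histogram fields changes in this file; the TimeHistogram implementation itself moves to internal/histogram.go (next diff), presumably so that the producer-side monitor this PR adds can share it without it becoming part of the public SDK surface. As a purely hypothetical illustration of how these fields get fed (recordFetch is not a method in this diff), the consumer would record one fetch in microseconds along these lines, inside package consumerLibrary:

// Hypothetical helper, for illustration only; it compiles against the
// MonitorMetrics fields shown above when placed in package consumerLibrary.
func (m *MonitorMetrics) recordFetch(start time.Time, rawSize int64, err error) {
	if err != nil {
		m.fetchReqFailedCount.Inc()
		return
	}
	m.logRawSize.Add(rawSize)
	// histogram samples are recorded in microseconds (see the "// in us" comments)
	m.fetchLogHistogram.AddSample(float64(time.Since(start).Microseconds()))
}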

internal/histogram.go

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+package internal
+
+import (
+	"fmt"
+	"math"
+
+	"go.uber.org/atomic"
+)
+
+type TimeHistogram struct {
+	Count atomic.Int64
+	Sum atomic.Float64
+	SumSquare atomic.Float64
+}
+
+func (h *TimeHistogram) AddSample(us float64) {
+	h.Count.Inc()
+	h.Sum.Add(us)
+	h.SumSquare.Add(us * us)
+}
+
+func (h *TimeHistogram) String() string {
+	avg := h.Avg()
+	stdDev := h.StdDev()
+	count := h.Count.Load()
+	return fmt.Sprintf("{avg: %.1fus, stdDev: %.1fus, count: %d}", avg, stdDev, count)
+}
+
+func (h *TimeHistogram) Avg() float64 {
+	count := h.Count.Load()
+	if count == 0 {
+		return 0
+	}
+	return h.Sum.Load() / float64(count)
+}
+
+func (h *TimeHistogram) StdDev() float64 {
+	count := h.Count.Load()
+	if count < 2 {
+		return 0
+	}
+	div := float64(count * (count - 1))
+	num := (float64(count) * h.SumSquare.Load()) - math.Pow(h.Sum.Load(), 2)
+	return math.Sqrt(num / div)
+}
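This new file is a near-verbatim move of the histogram removed above (only the AddSample parameter is renamed from v to us): AddSample keeps a running count, sum, and sum of squares; Avg is sum/count; StdDev derives the sample standard deviation as sqrt((n*sumSquare - sum^2) / (n*(n-1))). A small illustrative test (not part of the commit; it would have to live inside this module, since Go does not allow importing an internal package from outside it):

package internal

import "testing"

// Illustrative test, not part of this commit: exercises the running
// count / sum / sum-of-squares statistics kept by TimeHistogram.
func TestTimeHistogramSketch(t *testing.T) {
	var h TimeHistogram // the zero value is ready to use
	for _, us := range []float64{120, 180, 150} {
		h.AddSample(us)
	}
	// mean 150us; sample std dev sqrt((3*69300 - 450*450) / (3*2)) = 30us
	if h.Avg() != 150 || h.StdDev() != 30 {
		t.Fatalf("unexpected stats: %s", h.String())
	}
	// h.String() == "{avg: 150.0us, stdDev: 30.0us, count: 3}"
}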

producer/io_thread_pool.go

Lines changed: 26 additions & 39 deletions
@@ -1,9 +1,7 @@
 package producer
 
 import (
-	"container/list"
 	"sync"
-	"time"
 
 	"github.com/go-kit/kit/log"
 	"github.com/go-kit/kit/log/level"
@@ -12,61 +10,50 @@ import (
 
 type IoThreadPool struct {
 	threadPoolShutDownFlag *atomic.Bool
-	queue *list.List
-	lock sync.RWMutex
+	taskCh chan *ProducerBatch
 	ioworker *IoWorker
 	logger log.Logger
+	stopped *atomic.Bool
 }
 
 func initIoThreadPool(ioworker *IoWorker, logger log.Logger) *IoThreadPool {
 	return &IoThreadPool{
 		threadPoolShutDownFlag: atomic.NewBool(false),
-		queue: list.New(),
+		taskCh: make(chan *ProducerBatch, 100000),
 		ioworker: ioworker,
 		logger: logger,
+		stopped: atomic.NewBool(false),
 	}
 }
 
 func (threadPool *IoThreadPool) addTask(batch *ProducerBatch) {
-	defer threadPool.lock.Unlock()
-	threadPool.lock.Lock()
-	threadPool.queue.PushBack(batch)
-}
-
-func (threadPool *IoThreadPool) popTask() *ProducerBatch {
-	defer threadPool.lock.Unlock()
-	threadPool.lock.Lock()
-	if threadPool.queue.Len() <= 0 {
-		return nil
-	}
-	ele := threadPool.queue.Front()
-	threadPool.queue.Remove(ele)
-	return ele.Value.(*ProducerBatch)
-}
-
-func (threadPool *IoThreadPool) hasTask() bool {
-	defer threadPool.lock.RUnlock()
-	threadPool.lock.RLock()
-	return threadPool.queue.Len() > 0
+	threadPool.taskCh <- batch
 }
 
 func (threadPool *IoThreadPool) start(ioWorkerWaitGroup *sync.WaitGroup, ioThreadPoolwait *sync.WaitGroup) {
 	defer ioThreadPoolwait.Done()
-	for {
-		if task := threadPool.popTask(); task != nil {
-			threadPool.ioworker.startSendTask(ioWorkerWaitGroup)
-			go func(producerBatch *ProducerBatch) {
-				defer threadPool.ioworker.closeSendTask(ioWorkerWaitGroup)
-				threadPool.ioworker.sendToServer(producerBatch)
-			}(task)
-		} else {
-			if !threadPool.threadPoolShutDownFlag.Load() {
-				time.Sleep(100 * time.Millisecond)
-			} else {
-				level.Info(threadPool.logger).Log("msg", "All cache tasks in the thread pool have been successfully sent")
-				break
-			}
+	for task := range threadPool.taskCh {
+		if task == nil {
+			level.Info(threadPool.logger).Log("msg", "All cache tasks in the thread pool have been successfully sent")
+			threadPool.stopped.Store(true)
+			return
 		}
+
+		threadPool.ioworker.startSendTask(ioWorkerWaitGroup)
+		go func(producerBatch *ProducerBatch) {
+			defer threadPool.ioworker.closeSendTask(ioWorkerWaitGroup)
+			threadPool.ioworker.sendToServer(producerBatch)
+		}(task)
 	}
+}
+
+func (threadPool *IoThreadPool) ShutDown() {
+	old := threadPool.threadPoolShutDownFlag.Swap(true)
+	if !old {
+		close(threadPool.taskCh)
+	}
+}
 
+func (threadPool *IoThreadPool) Stopped() bool {
+	return threadPool.stopped.Load()
 }
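This file is the change behind the "high-contention" line in the commit message: the RWMutex-guarded container/list queue and the 100 ms polling loop are replaced with a buffered channel, so addTask becomes a single channel send and the dispatcher blocks in a range loop instead of sleeping between polls. ShutDown closes the channel exactly once (guarded by the flag swap), which lets the loop drain any still-buffered batches before exiting, and a nil batch acts as a stop sentinel that flips stopped. One behavioural difference worth noting: the old list was unbounded, whereas addTask now blocks once the 100000-slot buffer is full. A stripped-down, self-contained sketch of the same pattern (placeholder names, not SDK types):

package main

import (
	"fmt"
	"sync"
)

// Minimal sketch of the pattern adopted here: a bounded channel as the work
// queue instead of a mutex-protected list polled every 100 ms.
type task struct{ id int }

func main() {
	tasks := make(chan *task, 8) // the SDK sizes its buffer at 100000
	var wg sync.WaitGroup
	wg.Add(1)
	go func() { // dispatcher, analogous to IoThreadPool.start
		defer wg.Done()
		for t := range tasks { // blocks until work arrives, no polling
			fmt.Println("send batch", t.id)
		}
	}()
	for i := 0; i < 3; i++ {
		tasks <- &task{id: i} // analogous to addTask: one channel send, no mutex
	}
	close(tasks) // analogous to ShutDown: buffered tasks are still drained
	wg.Wait()
}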

producer/io_worker.go

Lines changed: 47 additions & 66 deletions
@@ -1,7 +1,6 @@
 package producer
 
 import (
-	"math"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -43,7 +42,7 @@ func initIoWorker(client sls.ClientInterface, retryQueue *RetryQueue, logger log
 
 func (ioWorker *IoWorker) sendToServer(producerBatch *ProducerBatch) {
 	level.Debug(ioWorker.logger).Log("msg", "ioworker send data to server")
-	beginMs := GetTimeMs(time.Now().UnixNano())
+	sendBegin := time.Now()
 	var err error
 	if producerBatch.isUseMetricStoreUrl() {
 		// not use compress type now
@@ -57,67 +56,59 @@
 		}
 		err = ioWorker.client.PostLogStoreLogsV2(producerBatch.getProject(), producerBatch.getLogstore(), req)
 	}
+	sendEnd := time.Now()
+
+	// send ok
 	if err == nil {
-		level.Debug(ioWorker.logger).Log("msg", "sendToServer suecssed,Execute successful callback function")
-		if producerBatch.attemptCount < producerBatch.maxReservedAttempts {
-			nowMs := GetTimeMs(time.Now().UnixNano())
-			attempt := createAttempt(true, "", "", "", nowMs, nowMs-beginMs)
-			producerBatch.result.attemptList = append(producerBatch.result.attemptList, attempt)
-		}
-		producerBatch.result.successful = true
+		level.Debug(ioWorker.logger).Log("msg", "sendToServer success")
+		defer ioWorker.producer.monitor.recordSuccess(sendBegin, sendEnd)
+		producerBatch.OnSuccess(sendBegin)
 		// After successful delivery, producer removes the batch size sent out
 		atomic.AddInt64(&ioWorker.producer.producerLogGroupSize, -producerBatch.totalDataSize)
-		if len(producerBatch.callBackList) > 0 {
-			for _, callBack := range producerBatch.callBackList {
-				callBack.Success(producerBatch.result)
-			}
-		}
-	} else {
-		if ioWorker.retryQueueShutDownFlag.Load() {
-			if len(producerBatch.callBackList) > 0 {
-				for _, callBack := range producerBatch.callBackList {
-					ioWorker.addErrorMessageToBatchAttempt(producerBatch, err, false, beginMs)
-					callBack.Fail(producerBatch.result)
-				}
-			}
-			return
-		}
-		level.Info(ioWorker.logger).Log("msg", "sendToServer failed", "error", err)
-		if slsError, ok := err.(*sls.Error); ok {
-			if _, ok := ioWorker.noRetryStatusCodeMap[int(slsError.HTTPCode)]; ok {
-				ioWorker.addErrorMessageToBatchAttempt(producerBatch, err, false, beginMs)
-				ioWorker.excuteFailedCallback(producerBatch)
-				return
-			}
-		}
-		if producerBatch.attemptCount < producerBatch.maxRetryTimes {
-			ioWorker.addErrorMessageToBatchAttempt(producerBatch, err, true, beginMs)
-			retryWaitTime := producerBatch.baseRetryBackoffMs * int64(math.Pow(2, float64(producerBatch.attemptCount)-1))
-			if retryWaitTime < producerBatch.maxRetryIntervalInMs {
-				producerBatch.nextRetryMs = GetTimeMs(time.Now().UnixNano()) + retryWaitTime
-			} else {
-				producerBatch.nextRetryMs = GetTimeMs(time.Now().UnixNano()) + producerBatch.maxRetryIntervalInMs
-			}
-			level.Debug(ioWorker.logger).Log("msg", "Submit to the retry queue after meeting the retry criteria。")
-			ioWorker.retryQueue.sendToRetryQueue(producerBatch, ioWorker.logger)
-		} else {
-			ioWorker.excuteFailedCallback(producerBatch)
-		}
+		return
 	}
+
+	slsError := parseSlsError(err)
+	canRetry := ioWorker.canRetry(producerBatch, slsError)
+	level.Error(ioWorker.logger).Log("msg", "sendToServer failed",
+		"retryTimes", producerBatch.attemptCount,
+		"requestId", slsError.RequestID,
+		"errorCode", slsError.Code,
+		"errorMessage", slsError.Message,
+		"logs", len(producerBatch.logGroup.Logs),
+		"canRetry", canRetry)
+	if !canRetry {
+		defer ioWorker.producer.monitor.recordFailure(sendBegin, sendEnd)
+		producerBatch.OnFail(slsError, sendBegin)
+		atomic.AddInt64(&ioWorker.producer.producerLogGroupSize, -producerBatch.totalDataSize)
+		return
+	}
+
+	// do retry
+	ioWorker.producer.monitor.recordRetry(sendEnd.Sub(sendBegin))
+	producerBatch.addAttempt(slsError, sendBegin)
+	producerBatch.nextRetryMs = producerBatch.getRetryBackoffIntervalMs() + time.Now().UnixMilli()
+	level.Debug(ioWorker.logger).Log("msg", "Submit to the retry queue after meeting the retry criteria。")
+	ioWorker.retryQueue.sendToRetryQueue(producerBatch, ioWorker.logger)
 }
 
-func (ioWorker *IoWorker) addErrorMessageToBatchAttempt(producerBatch *ProducerBatch, err error, retryInfo bool, beginMs int64) {
-	if producerBatch.attemptCount < producerBatch.maxReservedAttempts {
-		slsError := err.(*sls.Error)
-		if retryInfo {
-			level.Info(ioWorker.logger).Log("msg", "sendToServer failed,start retrying", "retry times", producerBatch.attemptCount, "requestId", slsError.RequestID, "error code", slsError.Code, "error message", slsError.Message)
-		}
-		nowMs := GetTimeMs(time.Now().UnixNano())
-		attempt := createAttempt(false, slsError.RequestID, slsError.Code, slsError.Message, nowMs, nowMs-beginMs)
-		producerBatch.result.attemptList = append(producerBatch.result.attemptList, attempt)
+func parseSlsError(err error) *sls.Error {
+	if slsError, ok := err.(*sls.Error); ok {
+		return slsError
+	}
+	return &sls.Error{
+		Message: err.Error(),
+	}
+}
+
+func (ioWorker *IoWorker) canRetry(producerBatch *ProducerBatch, err *sls.Error) bool {
+	if ioWorker.retryQueueShutDownFlag.Load() {
+		return false
 	}
-	producerBatch.result.successful = false
-	producerBatch.attemptCount += 1
+	if _, ok := ioWorker.noRetryStatusCodeMap[int(err.HTTPCode)]; ok {
+		return false
+	}
+	return producerBatch.attemptCount < producerBatch.maxRetryTimes
 }
 
 func (ioWorker *IoWorker) closeSendTask(ioWorkerWaitGroup *sync.WaitGroup) {
@@ -131,13 +122,3 @@ func (ioWorker *IoWorker) startSendTask(ioWorkerWaitGroup *sync.WaitGroup) {
 	ioWorker.maxIoWorker <- 1
 	ioWorkerWaitGroup.Add(1)
 }
-
-func (ioWorker *IoWorker) excuteFailedCallback(producerBatch *ProducerBatch) {
-	level.Info(ioWorker.logger).Log("msg", "sendToServer failed,Execute failed callback function")
-	atomic.AddInt64(&ioWorker.producer.producerLogGroupSize, -producerBatch.totalDataSize)
-	if len(producerBatch.callBackList) > 0 {
-		for _, callBack := range producerBatch.callBackList {
-			callBack.Fail(producerBatch.result)
-		}
-	}
-}
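The send path keeps the same three outcomes (success, terminal failure, retry) but flattens the error handling: parseSlsError normalizes any error into *sls.Error, canRetry centralizes the shutdown / non-retryable status code / max-attempts checks, and the structured Error log now carries the request id, error code, log count, and retry decision for observability. The backoff itself moves into producerBatch.getRetryBackoffIntervalMs(), which is defined elsewhere in this PR; judging from the inline code it replaces, it is presumably still the capped exponential backoff sketched below (an assumption drawn from the removed lines, not the new helper's actual body):

package main

import (
	"fmt"
	"math"
)

// Assumed shape of the capped exponential backoff that the removed inline
// code computed: base * 2^(attempt-1) milliseconds, clamped to maxIntervalMs.
// The real getRetryBackoffIntervalMs added by this commit lives in another
// file and may differ in detail.
func retryBackoffMs(baseMs, maxIntervalMs, attempt int64) int64 {
	backoff := baseMs * int64(math.Pow(2, float64(attempt-1)))
	if backoff > maxIntervalMs {
		return maxIntervalMs
	}
	return backoff
}

func main() {
	// e.g. base 100 ms capped at 50 s: 100, 200, 400, ... then flat at 50000
	for attempt := int64(1); attempt <= 12; attempt++ {
		fmt.Println(attempt, retryBackoffMs(100, 50000, attempt))
	}
}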
