Skip to content

Commit 2d2c1c5

Browse files
authored
Fix RUBY-1647 Reset connection pool on non-timeout network errors (#1258)
1 parent 458d48f commit 2d2c1c5

22 files changed

+544
-96
lines changed

lib/mongo/cluster.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,9 @@ def summary
340340
end
341341

342342
# @api private
343-
attr_reader :server_selection_semaphore
343+
def server_selection_semaphore
344+
options[:server_selection_semaphore]
345+
end
344346

345347
# Finalize the cluster for garbage collection.
346348
#

lib/mongo/error.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ def add_label(label)
134134
end
135135
end
136136

137+
require 'mongo/error/sdam_error_detection'
137138
require 'mongo/error/parser'
138139
require 'mongo/error/write_retryable'
139140
require 'mongo/error/change_stream_resumable'

lib/mongo/error/operation_failure.rb

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class Error
2020
# @since 2.0.0
2121
class OperationFailure < Error
2222
extend Forwardable
23+
include SdamErrorDetection
2324

2425
# Error codes and code names that should result in a failing write
2526
# being retried.
@@ -28,7 +29,7 @@ class OperationFailure < Error
2829
# @api private
2930
WRITE_RETRY_ERRORS = [
3031
{:code_name => 'InterruptedAtShutdown', :code => 11600},
31-
{:code_name => 'InterruptedDueToReplStateChange', :code => 11602},
32+
{:code_name => 'InterruptedDueToStepDown', :code => 11602},
3233
{:code_name => 'NotMaster', :code => 10107},
3334
{:code_name => 'NotMasterNoSlaveOk', :code => 13435},
3435
{:code_name => 'NotMasterOrSecondary', :code => 13436},
@@ -175,6 +176,9 @@ def change_stream_resumable_code?
175176
#
176177
# @option options [ Integer ] :code Error code
177178
# @option options [ String ] :code_name Error code name
179+
# @option options [ Array<String> ] :labels The set of labels associated
180+
# with the error
181+
# @option options [ true | false ] :wtimeout Whether the error is a wtimeout
178182
#
179183
# @since 2.5.0, options added in 2.6.0
180184
def initialize(message = nil, result = nil, options = {})
@@ -186,9 +190,9 @@ def initialize(message = nil, result = nil, options = {})
186190
super(message)
187191
end
188192

189-
# Whether the error was a write concern timeout.
193+
# Whether the error is a write concern timeout.
190194
#
191-
# @return [ true | false ] Whether the error was a write concern timeout.
195+
# @return [ true | false ] Whether the error is a write concern timeout.
192196
#
193197
# @since 2.7.1
194198
def wtimeout?

lib/mongo/error/parser.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ class Error
4444
#
4545
# @since 2.0.0
4646
class Parser
47+
include SdamErrorDetection
4748

4849
# @return [ BSON::Document ] document The returned document.
4950
attr_reader :document
@@ -62,7 +63,7 @@ class Parser
6263
# @since 2.6.0
6364
attr_reader :code_name
6465

65-
# @return [ Array ] labels The set of labels associated with the error.
66+
# @return [ Array<String> ] labels The set of labels associated with the error.
6667
# @since 2.7.0
6768
attr_reader :labels
6869

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
module Mongo
2+
class Error
3+
# @note Although not_master? and node_recovering? methods of this module
4+
# are part of the public API, the fact that these methods are defined on
5+
# this module and not on the classes which include this module is not
6+
# part of the public API.
7+
#
8+
# @api semipublic
9+
module SdamErrorDetection
10+
11+
# @api private
12+
NOT_MASTER_CODES = [10107, 13435].freeze
13+
14+
# @api private
15+
NODE_RECOVERING_CODES = [11600, 11602, 13436, 189, 91].freeze
16+
17+
# Whether the error is a "not master" error, or one of its variants.
18+
#
19+
# See https://github.com/mongodb/specifications/blob/master/source/server-discovery-and-monitoring/server-discovery-and-monitoring.rst#not-master-and-node-is-recovering.
20+
#
21+
# @return [ true | false ] Whether the error is a not master.
22+
#
23+
# @since 2.8.0
24+
def not_master?
25+
if node_recovering?
26+
false
27+
elsif code && NOT_MASTER_CODES.include?(code)
28+
true
29+
elsif message
30+
message.include?('not master')
31+
else
32+
false
33+
end
34+
end
35+
36+
# Whether the error is a "node is recovering" error, or one of its variants.
37+
#
38+
# See https://github.com/mongodb/specifications/blob/master/source/server-discovery-and-monitoring/server-discovery-and-monitoring.rst#not-master-and-node-is-recovering.
39+
#
40+
# @return [ true | false ] Whether the error is a node is recovering.
41+
#
42+
# @since 2.8.0
43+
def node_recovering?
44+
if code && NODE_RECOVERING_CODES.include?(code)
45+
true
46+
elsif message
47+
message.include?('node is recovering') || message.include?('not master or secondary')
48+
else
49+
false
50+
end
51+
end
52+
end
53+
end
54+
end

lib/mongo/operation/result.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ def initialize(replies)
7979
# @return [ Array<Protocol::Reply> ] replies The wrapped wire protocol replies.
8080
attr_reader :replies
8181

82+
def_delegators :parser,
83+
:not_master?, :node_recovering?
84+
8285
# Is the result acknowledged?
8386
#
8487
# @note On MongoDB 2.6 and higher all writes are acknowledged since the

lib/mongo/operation/shared/executable.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ def dispatch_message(server)
4545

4646
def process_result(result, server)
4747
server.update_cluster_time(result)
48+
if result.not_master? || result.node_recovering?
49+
server.unknown!
50+
server.monitor.scan_semaphore.signal
51+
end
4852
session.process(result) if session
4953
result
5054
end

lib/mongo/retryable.rb

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,18 +42,22 @@ def read_with_retry(session = nil)
4242
attempt += 1
4343
yield
4444
rescue Error::SocketError, Error::SocketTimeoutError => e
45-
raise(e) if attempt > cluster.max_read_retries || (session && session.in_transaction?)
45+
if attempt > cluster.max_read_retries || (session && session.in_transaction?)
46+
raise
47+
end
4648
log_retry(e)
4749
cluster.scan!(false)
4850
retry
4951
rescue Error::OperationFailure => e
5052
if cluster.sharded? && e.retryable? && !(session && session.in_transaction?)
51-
raise(e) if attempt > cluster.max_read_retries
53+
if attempt > cluster.max_read_retries
54+
raise
55+
end
5256
log_retry(e)
5357
sleep(cluster.read_retry_interval)
5458
retry
5559
else
56-
raise e
60+
raise
5761
end
5862
end
5963
end
@@ -141,10 +145,14 @@ def write_with_retry(session, write_concern, ending_transaction = false, &block)
141145
txn_num = session.in_transaction? ? session.txn_num : session.next_txn_num
142146
yield(server, txn_num, false)
143147
rescue Error::SocketError, Error::SocketTimeoutError => e
144-
raise e if session.in_transaction? && !ending_transaction
148+
if session.in_transaction? && !ending_transaction
149+
raise
150+
end
145151
retry_write(e, txn_num, &block)
146152
rescue Error::OperationFailure => e
147-
raise e if (session.in_transaction? && !ending_transaction) || !e.write_retryable?
153+
if (session.in_transaction? && !ending_transaction) || !e.write_retryable?
154+
raise
155+
end
148156
retry_write(e, txn_num, &block)
149157
end
150158
end
@@ -167,17 +175,19 @@ def retry_write_allowed?(session, write_concern)
167175
end
168176

169177
def retry_write(original_error, txn_num, &block)
170-
cluster.scan!(false)
178+
# We do not request a scan of the cluster here, because error handling
179+
# for the error which triggered the retry should have updated the
180+
# server description and/or topology as necessary (specifically,
181+
# a socket error or a not master error should have marked the respective
182+
# server unknown). Here we just need to wait for server selection.
171183
server = cluster.next_primary
172184
raise original_error unless (server.retry_writes? && txn_num)
173185
log_retry(original_error)
174186
yield(server, txn_num, true)
175187
rescue Error::SocketError, Error::SocketTimeoutError => e
176-
cluster.scan!(false)
177188
raise e
178189
rescue Error::OperationFailure => e
179190
raise original_error unless e.write_retryable?
180-
cluster.scan!(false)
181191
raise e
182192
rescue
183193
raise original_error
@@ -193,13 +203,15 @@ def legacy_write_with_retry(server = nil, session = nil)
193203
yield(server || cluster.next_primary)
194204
rescue Error::OperationFailure => e
195205
server = nil
196-
raise(e) if attempt > Cluster::MAX_WRITE_RETRIES
206+
if attempt > Cluster::MAX_WRITE_RETRIES
207+
raise
208+
end
197209
if e.write_retryable? && !(session && session.in_transaction?)
198210
log_retry(e)
199211
cluster.scan!(false)
200212
retry
201213
else
202-
raise(e)
214+
raise
203215
end
204216
end
205217
end

lib/mongo/server/connection.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,17 @@ def authenticate!(pending_connection)
293293
def default_mechanism
294294
@auth_mechanism || (@server.features.scram_sha_1_enabled? ? :scram : :mongodb_cr)
295295
end
296+
297+
def deliver(message)
298+
begin
299+
super
300+
# Important: timeout errors are not handled here
301+
rescue Error::SocketError
302+
@server.unknown!
303+
@server.pool.disconnect!
304+
raise
305+
end
306+
end
296307
end
297308
end
298309
end

lib/mongo/server/connection_base.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ module Mongo
1616
class Server
1717

1818
# This class encapsulates common connection functionality.
19+
#
20+
# @note Although methods of this module are part of the public API,
21+
# the fact that these methods are defined on this module and not on
22+
# the classes which include this module is not part of the public API.
23+
#
24+
# @api semipublic
1925
class ConnectionBase
2026
extend Forwardable
2127
include Monitoring::Publishable

0 commit comments

Comments
 (0)