2626import org .apache .fluss .config .ConfigOptions ;
2727import org .apache .fluss .config .Configuration ;
2828import org .apache .fluss .exception .FlussRuntimeException ;
29+ import org .apache .fluss .exception .NetworkException ;
2930import org .apache .fluss .exception .PartitionNotExistException ;
3031import org .apache .fluss .exception .RetriableException ;
32+ import org .apache .fluss .exception .StaleMetadataException ;
3133import org .apache .fluss .metadata .PhysicalTablePath ;
3234import org .apache .fluss .metadata .TableBucket ;
3335import org .apache .fluss .metadata .TableInfo ;
3840import org .apache .fluss .rpc .gateway .AdminReadOnlyGateway ;
3941import org .apache .fluss .rpc .gateway .CoordinatorGateway ;
4042import org .apache .fluss .rpc .gateway .TabletServerGateway ;
41- import org .apache .fluss .utils .ExceptionUtils ;
4243
4344import org .slf4j .Logger ;
4445import org .slf4j .LoggerFactory ;
5657import java .util .stream .Collectors ;
5758
5859import static org .apache .fluss .client .utils .MetadataUtils .sendMetadataRequestAndRebuildCluster ;
60+ import static org .apache .fluss .utils .ExceptionUtils .stripExecutionException ;
5961
6062/** The updater to initialize and update client metadata. */
6163public class MetadataUpdater {
6264 private static final Logger LOG = LoggerFactory .getLogger (MetadataUpdater .class );
6365
64- private static final int MAX_RETRY_TIMES = 5 ;
66+ private static final int MAX_RETRY_TIMES = 3 ;
67+ private static final int RETRY_INTERVAL_MS = 100 ;
6568
6669 private final RpcClient rpcClient ;
6770 protected volatile Cluster cluster ;
@@ -270,7 +273,7 @@ public void updateMetadata(
270273 tablePartitionIds );
271274 }
272275 } catch (Exception e ) {
273- Throwable t = ExceptionUtils . stripExecutionException (e );
276+ Throwable t = stripExecutionException (e );
274277 if (t instanceof RetriableException || t instanceof TimeoutException ) {
275278 LOG .warn ("Failed to update metadata, but the exception is re-triable." , t );
276279 } else if (t instanceof PartitionNotExistException ) {
@@ -292,10 +295,33 @@ private static Cluster initializeCluster(Configuration conf, RpcClient rpcClient
292295 Cluster cluster = null ;
293296 Exception lastException = null ;
294297 for (InetSocketAddress address : inetSocketAddresses ) {
298+ ServerNode serverNode = null ;
295299 try {
296- cluster = tryToInitializeCluster (rpcClient , address );
297- break ;
300+ serverNode =
301+ new ServerNode (
302+ -1 ,
303+ address .getHostString (),
304+ address .getPort (),
305+ ServerType .COORDINATOR );
306+ ServerNode finalServerNode = serverNode ;
307+ AdminReadOnlyGateway adminReadOnlyGateway =
308+ GatewayClientProxy .createGatewayProxy (
309+ () -> finalServerNode , rpcClient , AdminReadOnlyGateway .class );
310+ if (inetSocketAddresses .size () == 1 ) {
311+ // if there is only one bootstrap server, we can retry to connect to it.
312+ cluster =
313+ tryToInitializeClusterWithRetries (
314+ rpcClient , serverNode , adminReadOnlyGateway , MAX_RETRY_TIMES );
315+ } else {
316+ cluster = tryToInitializeCluster (adminReadOnlyGateway );
317+ break ;
318+ }
298319 } catch (Exception e ) {
320+ // Disconnect from the bootstrap server (by its uid) so that the next
321+ // retry can rebuild the connection.
322+ if (serverNode != null ) {
323+ rpcClient .disconnect (serverNode .uid ());
324+ }
299325 LOG .error (
300326 "Failed to initialize fluss client connection to bootstrap server: {}" ,
301327 address ,
@@ -306,24 +332,64 @@ private static Cluster initializeCluster(Configuration conf, RpcClient rpcClient
306332
307333 if (cluster == null && lastException != null ) {
308334 String errorMsg =
309- "Failed to initialize fluss client connection to server because no "
310- + "bootstrap server is validate. bootstrap servers: "
311- + inetSocketAddresses ;
335+ "Failed to initialize fluss client connection to bootstrap servers: "
336+ + inetSocketAddresses
337+ + ". \n Reason: "
338+ + lastException .getMessage ();
312339 LOG .error (errorMsg );
313340 throw new IllegalStateException (errorMsg , lastException );
314341 }
315342
316343 return cluster ;
317344 }
318345
319- private static Cluster tryToInitializeCluster (RpcClient rpcClient , InetSocketAddress address )
346+ @ VisibleForTesting
347+ static @ Nullable Cluster tryToInitializeClusterWithRetries (
348+ RpcClient rpcClient ,
349+ ServerNode serverNode ,
350+ AdminReadOnlyGateway gateway ,
351+ int maxRetryTimes )
352+ throws Exception {
353+ int retryCount = 0 ;
354+ while (retryCount <= maxRetryTimes ) {
355+ try {
356+ return tryToInitializeCluster (gateway );
357+ } catch (Exception e ) {
358+ Throwable cause = stripExecutionException (e );
359+ // in case of bootstrap is recovering, we should retry to connect.
360+ if (!(cause instanceof StaleMetadataException || cause instanceof NetworkException )
361+ || retryCount >= maxRetryTimes ) {
362+ throw e ;
363+ }
364+
365+ // We should dis-connected with the bootstrap server id to make sure the next
366+ // retry can rebuild the connection.
367+ rpcClient .disconnect (serverNode .uid ());
368+
369+ long delayMs = (long ) (RETRY_INTERVAL_MS * Math .pow (2 , retryCount ));
370+ LOG .warn (
371+ "Failed to connect to bootstrap server: {} (retry {}/{}). Retrying in {} ms." ,
372+ serverNode ,
373+ retryCount + 1 ,
374+ maxRetryTimes ,
375+ delayMs ,
376+ e );
377+
378+ try {
379+ Thread .sleep (delayMs );
380+ } catch (InterruptedException ex ) {
381+ Thread .currentThread ().interrupt ();
382+ throw new RuntimeException ("Interrupted during retry sleep" , ex );
383+ }
384+ retryCount ++;
385+ }
386+ }
387+
388+ return null ;
389+ }
390+
/**
 * Makes a single attempt to build the initial {@link Cluster}: passes the given gateway and an
 * empty set to {@code sendMetadataRequestAndRebuildCluster} and returns its result. No retry
 * or connection handling happens here; any exception propagates to the caller.
 *
 * <p>NOTE(review): the empty set presumably means that no specific tables are requested, so
 * only cluster-level metadata is fetched. Confirm against {@code MetadataUtils}.
 *
 * @param adminReadOnlyGateway gateway the metadata request is sent through
 * @return the cluster returned by {@code sendMetadataRequestAndRebuildCluster}
 * @throws Exception if the metadata request fails
 */
391+ private static Cluster tryToInitializeCluster (AdminReadOnlyGateway adminReadOnlyGateway )
320392 throws Exception {
321- ServerNode serverNode =
322- new ServerNode (
323- -1 , address .getHostString (), address .getPort (), ServerType .COORDINATOR );
324- AdminReadOnlyGateway adminReadOnlyGateway =
325- GatewayClientProxy .createGatewayProxy (
326- () -> serverNode , rpcClient , AdminReadOnlyGateway .class );
327393 return sendMetadataRequestAndRebuildCluster (adminReadOnlyGateway , Collections .emptySet ());
328394 }
329395
0 commit comments