80
80
import org .apache .pulsar .broker .loadbalance .extensions .scheduler .SplitScheduler ;
81
81
import org .apache .pulsar .broker .loadbalance .extensions .scheduler .UnloadScheduler ;
82
82
import org .apache .pulsar .broker .loadbalance .extensions .store .LoadDataStore ;
83
- import org .apache .pulsar .broker .loadbalance .extensions .store .LoadDataStoreException ;
84
83
import org .apache .pulsar .broker .loadbalance .extensions .store .LoadDataStoreFactory ;
85
84
import org .apache .pulsar .broker .loadbalance .extensions .strategy .BrokerSelectionStrategy ;
86
85
import org .apache .pulsar .broker .loadbalance .extensions .strategy .LeastResourceUsageWithWeight ;
97
96
import org .apache .pulsar .common .naming .TopicDomain ;
98
97
import org .apache .pulsar .common .naming .TopicName ;
99
98
import org .apache .pulsar .common .stats .Metrics ;
100
- import org .apache .pulsar .common .util .Backoff ;
101
- import org .apache .pulsar .common .util .BackoffBuilder ;
102
99
import org .apache .pulsar .common .util .FutureUtil ;
103
- import org .apache .pulsar .metadata .api .MetadataStoreException ;
104
100
import org .apache .pulsar .metadata .api .coordination .LeaderElectionState ;
105
101
import org .slf4j .Logger ;
106
102
@@ -123,10 +119,6 @@ public class ExtensibleLoadManagerImpl implements ExtensibleLoadManager {
123
119
124
120
public static final long COMPACTION_THRESHOLD = 5 * 1024 * 1024 ;
125
121
126
- public static final int STARTUP_TIMEOUT_SECONDS = 30 ;
127
-
128
- public static final int MAX_RETRY = 5 ;
129
-
130
122
private static final String ELECTION_ROOT = "/loadbalance/extension/leader" ;
131
123
132
124
public static final Set <String > INTERNAL_TOPICS =
@@ -204,7 +196,7 @@ public class ExtensibleLoadManagerImpl implements ExtensibleLoadManager {
204
196
205
197
private final ConcurrentHashMap <String , CompletableFuture <Optional <BrokerLookupData >>>
206
198
lookupRequests = new ConcurrentHashMap <>();
207
- private final CompletableFuture <Void > initWaiter = new CompletableFuture <>();
199
+ private final CompletableFuture <Boolean > initWaiter = new CompletableFuture <>();
208
200
209
201
/**
210
202
* Get all the bundles that are owned by this broker.
@@ -331,7 +323,7 @@ public void start() throws PulsarServerException {
331
323
return ;
332
324
}
333
325
try {
334
- this .brokerRegistry = new BrokerRegistryImpl (pulsar );
326
+ this .brokerRegistry = createBrokerRegistry (pulsar );
335
327
this .leaderElectionService = new LeaderElectionService (
336
328
pulsar .getCoordinationService (), pulsar .getBrokerId (),
337
329
pulsar .getSafeWebServiceAddress (), ELECTION_ROOT ,
@@ -346,53 +338,14 @@ public void start() throws PulsarServerException {
346
338
});
347
339
});
348
340
});
349
- this .serviceUnitStateChannel = ServiceUnitStateChannelImpl . newInstance (pulsar );
341
+ this .serviceUnitStateChannel = createServiceUnitStateChannel (pulsar );
350
342
this .brokerRegistry .start ();
351
343
this .splitManager = new SplitManager (splitCounter );
352
344
this .unloadManager = new UnloadManager (unloadCounter );
353
345
this .serviceUnitStateChannel .listen (unloadManager );
354
346
this .serviceUnitStateChannel .listen (splitManager );
355
347
this .leaderElectionService .start ();
356
- pulsar .runWhenReadyForIncomingRequests (() -> {
357
- Backoff backoff = new BackoffBuilder ()
358
- .setInitialTime (100 , TimeUnit .MILLISECONDS )
359
- .setMax (STARTUP_TIMEOUT_SECONDS , TimeUnit .SECONDS )
360
- .create ();
361
- int retry = 0 ;
362
- while (!Thread .currentThread ().isInterrupted ()) {
363
- try {
364
- brokerRegistry .register ();
365
- this .serviceUnitStateChannel .start ();
366
- break ;
367
- } catch (Exception e ) {
368
- log .warn ("The broker:{} failed to start service unit state channel. Retrying {} th ..." ,
369
- pulsar .getBrokerId (), ++retry , e );
370
- try {
371
- Thread .sleep (backoff .next ());
372
- } catch (InterruptedException ex ) {
373
- log .warn ("Interrupted while sleeping." );
374
- // preserve thread's interrupt status
375
- Thread .currentThread ().interrupt ();
376
- try {
377
- pulsar .close ();
378
- } catch (PulsarServerException exc ) {
379
- log .error ("Failed to close pulsar service." , exc );
380
- }
381
- return ;
382
- }
383
- failStarting (e );
384
- if (retry >= MAX_RETRY ) {
385
- log .error ("Failed to start the service unit state channel after retry {} th. "
386
- + "Closing pulsar service." , retry , e );
387
- try {
388
- pulsar .close ();
389
- } catch (PulsarServerException ex ) {
390
- log .error ("Failed to close pulsar service." , ex );
391
- }
392
- }
393
- }
394
- }
395
- });
348
+
396
349
this .antiAffinityGroupPolicyHelper =
397
350
new AntiAffinityGroupPolicyHelper (pulsar , serviceUnitStateChannel );
398
351
antiAffinityGroupPolicyHelper .listenFailureDomainUpdate ();
@@ -401,15 +354,10 @@ public void start() throws PulsarServerException {
401
354
SimpleResourceAllocationPolicies policies = new SimpleResourceAllocationPolicies (pulsar );
402
355
this .isolationPoliciesHelper = new IsolationPoliciesHelper (policies );
403
356
this .brokerFilterPipeline .add (new BrokerIsolationPoliciesFilter (isolationPoliciesHelper ));
404
-
405
- try {
406
- this .brokerLoadDataStore = LoadDataStoreFactory
407
- .create (pulsar , BROKER_LOAD_DATA_STORE_TOPIC , BrokerLoadData .class );
408
- this .topBundlesLoadDataStore = LoadDataStoreFactory
409
- .create (pulsar , TOP_BUNDLES_LOAD_DATA_STORE_TOPIC , TopBundlesLoadData .class );
410
- } catch (LoadDataStoreException e ) {
411
- throw new PulsarServerException (e );
412
- }
357
+ this .brokerLoadDataStore = LoadDataStoreFactory
358
+ .create (pulsar , BROKER_LOAD_DATA_STORE_TOPIC , BrokerLoadData .class );
359
+ this .topBundlesLoadDataStore = LoadDataStoreFactory
360
+ .create (pulsar , TOP_BUNDLES_LOAD_DATA_STORE_TOPIC , TopBundlesLoadData .class );
413
361
414
362
this .context = LoadManagerContextImpl .builder ()
415
363
.configuration (conf )
@@ -433,6 +381,7 @@ public void start() throws PulsarServerException {
433
381
434
382
pulsar .runWhenReadyForIncomingRequests (() -> {
435
383
try {
384
+ this .serviceUnitStateChannel .start ();
436
385
var interval = conf .getLoadBalancerReportUpdateMinIntervalMillis ();
437
386
438
387
this .brokerLoadDataReportTask = this .pulsar .getLoadManagerExecutor ()
@@ -467,38 +416,33 @@ public void start() throws PulsarServerException {
467
416
MONITOR_INTERVAL_IN_MILLIS , TimeUnit .MILLISECONDS );
468
417
469
418
this .splitScheduler .start ();
470
- this .initWaiter .complete (null );
419
+ this .initWaiter .complete (true );
471
420
this .started = true ;
472
421
log .info ("Started load manager." );
473
- } catch (Exception ex ) {
474
- failStarting (ex );
422
+ } catch (Throwable e ) {
423
+ failStarting (e );
475
424
}
476
425
});
477
- } catch (Exception ex ) {
426
+ } catch (Throwable ex ) {
478
427
failStarting (ex );
479
428
}
480
429
}
481
430
482
- private void failStarting (Exception ex ) {
483
- log .error ("Failed to start the extensible load balance and close broker registry {}." ,
484
- this .brokerRegistry , ex );
431
+ private void failStarting (Throwable throwable ) {
485
432
if (this .brokerRegistry != null ) {
486
433
try {
487
- brokerRegistry .unregister ();
488
- } catch (MetadataStoreException e ) {
489
- // ignore
490
- }
491
- }
492
- if (this .serviceUnitStateChannel != null ) {
493
- try {
494
- serviceUnitStateChannel .close ();
495
- } catch (IOException e ) {
496
- // ignore
434
+ brokerRegistry .close ();
435
+ } catch (PulsarServerException e ) {
436
+ // If close failed, this broker might still exist in the metadata store. Then it could be found by other
437
+ // brokers as an available broker. Hence, print a warning log for it.
438
+ log .warn ("Failed to close the broker registry: {}" , e .getMessage ());
497
439
}
498
440
}
499
- initWaiter .completeExceptionally (ex );
441
+ initWaiter .complete (false ); // exit the background thread gracefully
442
+ throw PulsarServerException .toUncheckedException (PulsarServerException .from (throwable ));
500
443
}
501
444
445
+
502
446
@ Override
503
447
public void initialize (PulsarService pulsar ) {
504
448
this .pulsar = pulsar ;
@@ -843,7 +787,9 @@ synchronized void playLeader() {
843
787
boolean becameFollower = false ;
844
788
while (!Thread .currentThread ().isInterrupted ()) {
845
789
try {
846
- initWaiter .get ();
790
+ if (!initWaiter .get ()) {
791
+ return ;
792
+ }
847
793
if (!serviceUnitStateChannel .isChannelOwner ()) {
848
794
becameFollower = true ;
849
795
break ;
@@ -893,7 +839,9 @@ synchronized void playFollower() {
893
839
boolean becameLeader = false ;
894
840
while (!Thread .currentThread ().isInterrupted ()) {
895
841
try {
896
- initWaiter .get ();
842
+ if (!initWaiter .get ()) {
843
+ return ;
844
+ }
897
845
if (serviceUnitStateChannel .isChannelOwner ()) {
898
846
becameLeader = true ;
899
847
break ;
@@ -957,7 +905,9 @@ public List<Metrics> getMetrics() {
957
905
@ VisibleForTesting
958
906
protected void monitor () {
959
907
try {
960
- initWaiter .get ();
908
+ if (!initWaiter .get ()) {
909
+ return ;
910
+ }
961
911
962
912
// Monitor role
963
913
// Periodically check the role in case ZK watcher fails.
@@ -1012,4 +962,14 @@ private void closeInternalTopics() {
1012
962
log .warn ("Failed to wait for closing internal topics" , e );
1013
963
}
1014
964
}
965
+
966
+ @ VisibleForTesting
967
+ protected BrokerRegistry createBrokerRegistry (PulsarService pulsar ) {
968
+ return new BrokerRegistryImpl (pulsar );
969
+ }
970
+
971
+ @ VisibleForTesting
972
+ protected ServiceUnitStateChannel createServiceUnitStateChannel (PulsarService pulsar ) {
973
+ return new ServiceUnitStateChannelImpl (pulsar );
974
+ }
1015
975
}
0 commit comments