@@ -124,6 +124,7 @@ public class HelixClusterManager implements ClusterMap {
124
124
private final AtomicLong currentXid ;
125
125
final HelixClusterManagerMetrics helixClusterManagerMetrics ;
126
126
private HelixAggregatedViewClusterInfo helixAggregatedViewClusterInfo = null ;
127
+ private final AtomicLong dataNodeInitializationFailureCount = new AtomicLong (0 );
127
128
128
129
// The map from resource name to resource config. This is only used in FULL AUTO. This map is not going to be updated
129
130
// if the ResourceConfig is updated, but we are only using default replica capacity from the ResourceConfig. So if you
@@ -230,7 +231,7 @@ public HelixClusterManager(ClusterMapConfig clusterMapConfig, String instanceNam
230
231
initializationFailureMap .values ().stream ().filter (Objects ::nonNull ).count ());
231
232
helixClusterManagerMetrics .initializeXidMetric (currentXid );
232
233
helixClusterManagerMetrics .initializeDatacenterMetrics ();
233
- helixClusterManagerMetrics .initializeDataNodeMetrics ();
234
+ helixClusterManagerMetrics .initializeDataNodeMetrics (dataNodeInitializationFailureCount );
234
235
helixClusterManagerMetrics .initializeDiskMetrics ();
235
236
helixClusterManagerMetrics .initializePartitionMetrics ();
236
237
helixClusterManagerMetrics .initializeCapacityMetrics ();
@@ -2077,10 +2078,18 @@ private void updateReplicaStateAndOverrideIfNeeded(AmbryReplica replica, Collect
2077
2078
private List <ReplicaId > createNewInstance (DataNodeConfig dataNodeConfig , String dcName ) throws Exception {
2078
2079
String instanceName = dataNodeConfig .getInstanceName ();
2079
2080
logger .info ("Adding node {} and its disks and replicas in {}" , instanceName , dcName );
2080
- AmbryDataNode datanode =
2081
- new AmbryServerDataNode (dataNodeConfig .getDatacenterName (), clusterMapConfig , dataNodeConfig .getHostName (),
2082
- dataNodeConfig .getPort (), dataNodeConfig .getRackId (), dataNodeConfig .getSslPort (),
2083
- dataNodeConfig .getHttp2Port (), DEFAULT_XID , helixClusterManagerQueryHelper );
2081
+ AmbryDataNode datanode = null ;
2082
+ try {
2083
+ datanode =
2084
+ new AmbryServerDataNode (dataNodeConfig .getDatacenterName (), clusterMapConfig , dataNodeConfig .getHostName (),
2085
+ dataNodeConfig .getPort (), dataNodeConfig .getRackId (), dataNodeConfig .getSslPort (),
2086
+ dataNodeConfig .getHttp2Port (), DEFAULT_XID , helixClusterManagerQueryHelper );
2087
+ } catch (Exception e ) {
2088
+ logger .error ("Fail to create an AmbryServerDataNode for {} in datacenter {}, skip adding this node." ,
2089
+ dataNodeConfig .getHostName (), dataNodeConfig .getDatacenterName (), e );
2090
+ dataNodeInitializationFailureCount .incrementAndGet ();
2091
+ return Collections .emptyList ();
2092
+ }
2084
2093
// for new instance, we first set it to unavailable and rely on its participation to update its liveness
2085
2094
if (!instanceName .equals (selfInstanceName )) {
2086
2095
datanode .setState (HardwareState .UNAVAILABLE );
0 commit comments