Skip to content

Commit 4c652be

Browse files
Don't fail on the unresolved hostname (#3024)
If a data node config in the property store contains an unresolvable hostname, the helix cluster manager fails to initialize, which kills every frontend and server instance while the process is booting up. Let's not do this. Instead, we should skip the affected hosts and create an alert for them.
1 parent 0025f4b commit 4c652be

File tree

2 files changed

+19
-6
lines changed

2 files changed

+19
-6
lines changed

ambry-clustermap/src/main/java/com/github/ambry/clustermap/HelixClusterManager.java

+14-5
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ public class HelixClusterManager implements ClusterMap {
124124
private final AtomicLong currentXid;
125125
final HelixClusterManagerMetrics helixClusterManagerMetrics;
126126
private HelixAggregatedViewClusterInfo helixAggregatedViewClusterInfo = null;
127+
private final AtomicLong dataNodeInitializationFailureCount = new AtomicLong(0);
127128

128129
// The map from resource name to resource config. This is only used in FULL AUTO. This map is not going to be updated
129130
// if the ResourceConfig is updated, but we are only using default replica capacity from the ResourceConfig. So if you
@@ -230,7 +231,7 @@ public HelixClusterManager(ClusterMapConfig clusterMapConfig, String instanceNam
230231
initializationFailureMap.values().stream().filter(Objects::nonNull).count());
231232
helixClusterManagerMetrics.initializeXidMetric(currentXid);
232233
helixClusterManagerMetrics.initializeDatacenterMetrics();
233-
helixClusterManagerMetrics.initializeDataNodeMetrics();
234+
helixClusterManagerMetrics.initializeDataNodeMetrics(dataNodeInitializationFailureCount);
234235
helixClusterManagerMetrics.initializeDiskMetrics();
235236
helixClusterManagerMetrics.initializePartitionMetrics();
236237
helixClusterManagerMetrics.initializeCapacityMetrics();
@@ -2077,10 +2078,18 @@ private void updateReplicaStateAndOverrideIfNeeded(AmbryReplica replica, Collect
20772078
private List<ReplicaId> createNewInstance(DataNodeConfig dataNodeConfig, String dcName) throws Exception {
20782079
String instanceName = dataNodeConfig.getInstanceName();
20792080
logger.info("Adding node {} and its disks and replicas in {}", instanceName, dcName);
2080-
AmbryDataNode datanode =
2081-
new AmbryServerDataNode(dataNodeConfig.getDatacenterName(), clusterMapConfig, dataNodeConfig.getHostName(),
2082-
dataNodeConfig.getPort(), dataNodeConfig.getRackId(), dataNodeConfig.getSslPort(),
2083-
dataNodeConfig.getHttp2Port(), DEFAULT_XID, helixClusterManagerQueryHelper);
2081+
AmbryDataNode datanode = null;
2082+
try {
2083+
datanode =
2084+
new AmbryServerDataNode(dataNodeConfig.getDatacenterName(), clusterMapConfig, dataNodeConfig.getHostName(),
2085+
dataNodeConfig.getPort(), dataNodeConfig.getRackId(), dataNodeConfig.getSslPort(),
2086+
dataNodeConfig.getHttp2Port(), DEFAULT_XID, helixClusterManagerQueryHelper);
2087+
} catch (Exception e) {
2088+
logger.error("Fail to create an AmbryServerDataNode for {} in datacenter {}, skip adding this node.",
2089+
dataNodeConfig.getHostName(), dataNodeConfig.getDatacenterName(), e);
2090+
dataNodeInitializationFailureCount.incrementAndGet();
2091+
return Collections.emptyList();
2092+
}
20842093
// for new instance, we first set it to unavailable and rely on its participation to update its liveness
20852094
if (!instanceName.equals(selfInstanceName)) {
20862095
datanode.setState(HardwareState.UNAVAILABLE);

ambry-clustermap/src/main/java/com/github/ambry/clustermap/HelixClusterManagerMetrics.java

+5-1
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ void initializeDatacenterMetrics() {
140140
/**
141141
* Initialize datanode related metrics.
142142
*/
143-
void initializeDataNodeMetrics() {
143+
void initializeDataNodeMetrics(AtomicLong dataNodeInitializationFailureCount) {
144144
Gauge<Long> dataNodeCount = clusterMapCallback::getDatanodeCount;
145145
registry.gauge(MetricRegistry.name(HelixClusterManager.class, "dataNodeCount"), () -> dataNodeCount);
146146

@@ -152,6 +152,10 @@ void initializeDataNodeMetrics() {
152152
Gauge<Long> dataNodeState = () -> datanode.getState() == HardwareState.AVAILABLE ? 1L : 0L;
153153
registry.gauge(MetricRegistry.name(HelixClusterManager.class, metricName), () -> dataNodeState);
154154
}
155+
156+
Gauge<Long> dataNodeInitializationFailureCountGauge = dataNodeInitializationFailureCount::get;
157+
registry.gauge(MetricRegistry.name(HelixClusterManager.class, "dataNodeInitializationFailureCount"),
158+
() -> dataNodeInitializationFailureCountGauge);
155159
}
156160

157161
/**

0 commit comments

Comments
 (0)