Skip to content
This repository has been archived by the owner on Aug 2, 2022. It is now read-only.

Publish latency and failure metrics for cluster state applier thread #563

Merged
merged 6 commits into from
Mar 17, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ public enum MetricName {
THREAD_POOL,
SHARD_STATS,
MASTER_PENDING,
MOUNTED_PARTITION_METRICS
MOUNTED_PARTITION_METRICS,
CLUSTER_APPLIER_SERVICE
}

// we don't store node details as a metric on reader side database. We
Expand Down Expand Up @@ -821,6 +822,33 @@ public static class Constants {
}
}

public enum ClusterApplierServiceStatsValue implements MetricValue {
/**
* Latency of the cluster state applier thread, in milliseconds.
*/

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a copy-paste miss? The comment seems to be unrelated.

CLUSTER_APPLIER_SERVICE_LATENCY(ClusterApplierServiceStatsValue.Constants.CLUSTER_APPLIER_SERVICE_LATENCY),
/**
* Number of failures of the cluster state applier thread.
*/
CLUSTER_APPLIER_SERVICE_FAILURE(ClusterApplierServiceStatsValue.Constants.CLUSTER_APPLIER_SERVICE_FAILURE);

// String form of this metric value; this is what toString() returns.
private final String value;

// Stores the string associated with the enum constant.
ClusterApplierServiceStatsValue(String value) {
this.value = value;
}

/** Returns the string supplied to the constructor rather than the enum constant's identifier. */
@Override
public String toString() {
return value;
}

/** String constants that the enum constants above pass to the constructor. */
public static class Constants {
public static final String CLUSTER_APPLIER_SERVICE_LATENCY = "ClusterApplierService_Latency";
public static final String CLUSTER_APPLIER_SERVICE_FAILURE = "ClusterApplierService_Failure";
}
}

public enum MasterThrottlingValue implements MetricValue {
/**
* Sum of total pending tasks throttled by master node.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ public class PerformanceAnalyzerMetrics {
public static final String sShardQueryPath = "shardquery";
public static final String sMasterTaskPath = "master_task";
public static final String sFaultDetection = "fault_detection";
public static final String sClusterApplierService = "cluster_applier_service";
public static final String sHttpPath = "http";
public static final String sOSPath = "os_metrics";
public static final String sHeapPath = "heap_metrics";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,11 @@ public class MetricsModel {
new MetricAttributes(
MetricUnits.COUNT.toString(), AllMetrics.ShardStateDimension.values()));

// Cluster applier service latency: reported in milliseconds, with no dimensions.
allMetricsInitializer.put(
    AllMetrics.ClusterApplierServiceStatsValue.CLUSTER_APPLIER_SERVICE_LATENCY.toString(),
    new MetricAttributes(
        MetricUnits.MILLISECOND.toString(), EmptyDimension.values()));

// Cluster applier service failures: a plain count, with no dimensions. Registered alongside
// the latency metric so that every ClusterApplierServiceStatsValue has an entry in ALL_METRICS.
allMetricsInitializer.put(
    AllMetrics.ClusterApplierServiceStatsValue.CLUSTER_APPLIER_SERVICE_FAILURE.toString(),
    new MetricAttributes(
        MetricUnits.COUNT.toString(), EmptyDimension.values()));

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we need to include failure metric as well in here?

ALL_METRICS = Collections.unmodifiableMap(allMetricsInitializer);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we change year to 2021 in the license header?

*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics;

import com.amazon.opendistro.elasticsearch.performanceanalyzer.metrics.AllMetrics;
import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Metric;

/**
 * RCA framework metric bound to the cluster applier service latency value.
 *
 * <p>NOTE(review): the name handed to {@link Metric} is the enum constant identifier
 * ({@code name()}, i.e. {@code CLUSTER_APPLIER_SERVICE_LATENCY}), whereas MetricsModel keys
 * this metric by its {@code toString()} value ({@code ClusterApplierService_Latency}).
 * Confirm which spelling the metric lookup expects.
 */
public class ClusterApplierService_Latency extends Metric {

    /**
     * Creates the metric.
     *
     * @param evaluationIntervalSeconds passed through unchanged to the {@link Metric} constructor
     */
    public ClusterApplierService_Latency(long evaluationIntervalSeconds) {
        super(
                AllMetrics.ClusterApplierServiceStatsValue.CLUSTER_APPLIER_SERVICE_LATENCY.name(),
                evaluationIntervalSeconds);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ public enum ExceptionsAndErrors implements MeasurementSet {

MASTER_THROTTLING_COLLECTOR_ERROR("MasterThrottlingMetricsCollector"),

FAULT_DETECTION_COLLECTOR_ERROR("FaultDetectionMetricsCollector");
FAULT_DETECTION_COLLECTOR_ERROR("FaultDetectionMetricsCollector"),

CLUSTER_APPLIER_SERVICE_STATS_COLLECTOR_ERROR("ClusterApplierServiceStatsCollector");

/** What we want to appear as the metric name. */
private String name;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ public enum WriterMetrics implements MeasurementSet {
FAULT_DETECTION_COLLECTOR_EXECUTION_TIME("FaultDetectionCollectorExecutionTime", "millis", Arrays.asList(
Statistics.MAX, Statistics.MIN, Statistics.MEAN, Statistics.COUNT, Statistics.SUM)),

CLUSTER_APPLIER_SERVICE_STATS_COLLECTOR_EXECUTION_TIME("ClusterApplierServiceStatsCollectorExecutionTime",
"millis", Arrays.asList(Statistics.MAX, Statistics.MIN, Statistics.MEAN, Statistics.COUNT,
Statistics.SUM)),

STALE_METRICS("StaleMetrics", "count", Arrays.asList(Statistics.COUNT)),
;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ private MetricPropertiesConfig() {
metricPathMap.put(MetricName.MASTER_PENDING, PerformanceAnalyzerMetrics.sPendingTasksPath);
metricPathMap.put(MetricName.MOUNTED_PARTITION_METRICS,
PerformanceAnalyzerMetrics.sMountedPartitionMetricsPath);
metricPathMap.put(MetricName.CLUSTER_APPLIER_SERVICE, PerformanceAnalyzerMetrics.sClusterApplierService);

eventKeyToMetricNameMap = new HashMap<>();
eventKeyToMetricNameMap.put(PerformanceAnalyzerMetrics.sCacheConfigPath, MetricName.CACHE_CONFIG);
Expand All @@ -183,6 +184,8 @@ private MetricPropertiesConfig() {
PerformanceAnalyzerMetrics.sPendingTasksPath, MetricName.MASTER_PENDING);
eventKeyToMetricNameMap.put(PerformanceAnalyzerMetrics.sMountedPartitionMetricsPath,
MetricName.MOUNTED_PARTITION_METRICS);
eventKeyToMetricNameMap.put(PerformanceAnalyzerMetrics.sClusterApplierService,
MetricName.CLUSTER_APPLIER_SERVICE);

metricName2Property = new HashMap<>();

Expand Down Expand Up @@ -250,6 +253,13 @@ private MetricPropertiesConfig() {
DevicePartitionValue.values(),
createFileHandler(metricPathMap.get(MetricName.MOUNTED_PARTITION_METRICS))
));
metricName2Property.put(
MetricName.CLUSTER_APPLIER_SERVICE,
new MetricProperties(
MetricProperties.EMPTY_DIMENSION,
AllMetrics.ClusterApplierServiceStatsValue.values(),
createFileHandler(
metricPathMap.get(MetricName.CLUSTER_APPLIER_SERVICE))));
}

public static MetricPropertiesConfig getInstance() {
Expand Down
3 changes: 3 additions & 0 deletions src/test/resources/reader/1566413960000
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ $
^master_throttling_metrics
{"current_time":1566413936555}
{"Data_RetryingPendingTasksCount":0,"Master_ThrottledPendingTasksCount":33}
^cluster_applier_service
{"current_time":1566413936555}
{"ClusterApplierService_Latency":23,"ClusterApplierService_Failure":3}
^shard_state_metrics
{"current_time":1566413936488}
{"IndexName":"pmc"}
Expand Down
3 changes: 3 additions & 0 deletions src/test/resources/reader/1566413965000
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ $
^master_throttling_metrics
{"current_time":1566413966544}
{"Data_RetryingPendingTasksCount":0,"Master_ThrottledPendingTasksCount":33}
^cluster_applier_service
{"current_time":1566413966544}
{"ClusterApplierService_Latency":23,"ClusterApplierService_Failure":3}
^shard_state_metrics
{"current_time":1566413966493}
{"IndexName":"pmc"}
Expand Down
3 changes: 3 additions & 0 deletions src/test/resources/reader/1566413970000
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ $
^master_throttling_metrics
{"current_time":1566413996947}
{"Data_RetryingPendingTasksCount":0,"Master_ThrottledPendingTasksCount":33}
^cluster_applier_service
{"current_time":1566413996947}
{"ClusterApplierService_Latency":23,"ClusterApplierService_Failure":3}
^shard_state_metrics
{"current_time":1566413996664}
{"IndexName":"pmc"}
Expand Down