[testing-on-gke] Add script for automating runs (#2538)
* [testing-on-gke] Add script for automating runs

* fix a buggy comment

* address review comment

* fail if USER not defined

* disable local-ssd tests by default

* add a dummy workload configuration for testing

* remove output_bucket pii from script
gargnitingoogle authored Sep 30, 2024
1 parent 9ccfd34 commit 814ede2
Showing 3 changed files with 185 additions and 1 deletion.
125 changes: 125 additions & 0 deletions perfmetrics/scripts/testing_on_gke/examples/run-automated.sh
@@ -0,0 +1,125 @@
#!/bin/bash
#
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script automates running run-gke-tests.sh, e.g. periodically
# as a cron job.
#
# Add/remove/modify the configuration parameters below as needed for your
# use case.
#
# Assumptions for this script to work:
# 1. You have appropriate access to the project_id defined below.
# 2. You have a cluster named cluster_name as defined below, or enough
#    resources in project_id to create such a cluster.
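#
# As an illustrative sketch (all values below are placeholders, not taken
# from this repository), a nightly crontab entry for this script might look
# like the following. Note that a crontab entry must be a single line, and
# that USER may need to be set explicitly since some cron implementations
# do not define it:
#
#   0 2 * * * USER=jdoe project_id=my-project project_number=123456789 cluster_name=my-cluster gcsfuse_branch=master output_gsheet_id=my-gsheet-id output_gsheet_keyfile=gs://my-bucket/keyfile.json output_bucket=my-bucket bash /path/to/run-automated.sh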

# Print all shell commands.
set -x

# Fail if any command fails.
set -e

# Environment variable USER must be defined.
if test -z "${USER}"; then
echo "USER has not been set"
exit 1
fi

# Define configuration parameters.
if test -z "${project_id}"; then
echo "project_id has not been set."
exit 1
fi
if test -z "${project_number}"; then
echo "project_number has not been set."
exit 1
fi
export zone=us-west1-b
if test -z "${cluster_name}"; then
echo "cluster_name has not been set."
exit 1
fi
export node_pool=default-pool
export machine_type=n2-standard-96
export num_nodes=7
export num_ssd=16
export use_custom_csi_driver=true
export output_dir=.
if test -z "${gcsfuse_branch}"; then
echo "gcsfuse_branch has not been set."
exit 1
fi
export pod_wait_time_in_seconds=300
export pod_timeout_in_seconds=64800
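# (For reference: pod_wait_time_in_seconds=300 is 5 minutes between polls,
# and pod_timeout_in_seconds=64800 is 18 hours.)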
# Pass instance_id in from outside to continue a previous run if it was
# terminated somehow (e.g. an ssh timeout).
if test -z "${instance_id}"; then
export instance_id=$(echo ${USER} | sed 's/_google//' | sed 's/_com//')-$(date +%Y%m%d-%H%M%S)
fi
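# For illustration (derived from the command above): USER=jdoe_google_com
# with a run starting at 2024-09-30 14:05:01 yields
# instance_id=jdoe-20240930-140501.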
if test -z "${output_gsheet_id}"; then
echo "output_gsheet_id has not been set."
exit 1
fi
if test -z "${output_gsheet_keyfile}"; then
echo "output_gsheet_keyfile has not been set."
exit 1
fi
export force_update_gcsfuse_code=true
# Continue the previous run if pods have already been scheduled/completed.
test -n "${only_parse}" || export only_parse=false
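# For example, to only re-parse the outputs of a previous run (assumed
# usage based on the parameter names above):
#   only_parse=true instance_id=<previous-instance-id> ./run-automated.sh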

# Create a dedicated folder on the machine.
mkdir -pv ~/gke-testing && cd ~/gke-testing
wget https://raw.githubusercontent.com/googlecloudplatform/gcsfuse/${gcsfuse_branch}/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh -O run-gke-tests.sh
chmod +x run-gke-tests.sh

# Remove previous run's outputs.
rm -rfv log fio/output.csv dlio/output.csv

# Run the script.
start_time=$(date +%Y-%m-%dT%H:%M:%SZ)
echo "Run started at ${start_time}"
touch log
(./run-gke-tests.sh --debug |& tee -a log) || true
# Use the following if you want to run it in a tmux session instead.
# tmux new-session -d -s ${instance_id} 'bash -c "(./run-gke-tests.sh --debug |& tee -a log); sleep 604800 "'
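# (In the tmux variant, the trailing 'sleep 604800' keeps the session alive
# for 7 days after the run finishes, so that its output can still be
# inspected by attaching to the session.)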
end_time=$(date +%Y-%m-%dT%H:%M:%SZ)
echo "Run ended at ${end_time}"

# Some post-run steps to be taken for output collection.
if test -n "${workload_config}"; then
cp "${workload_config}" ./workloads.json
else
cp src/gcsfuse/perfmetrics/scripts/testing_on_gke/examples/workloads.json .
fi
git -C src/gcsfuse rev-parse HEAD > gcsfuse_commithash
git -C src/gcs-fuse-csi-driver rev-parse HEAD > gcs_fuse_csi_driver_commithash
# Fetch cloud-logs for this run. This has not been tested yet.
# (gcloud logging read --project=${project_id} "timestamp>=\"${start_time}\" AND timestamp<=\"${end_time}\" AND resource.labels.cluster_name=\"${cluster_name}\"" --order=ASC --format=csv\(timestamp\,resource.labels.pod_name,resource.labels.container_name,"text_payload"\) > cloud_logs.txt) &

# Upload outputs to GCS after the run.
if test -z "${output_bucket}"; then
echo "output_bucket has not been set."
exit 1
fi
output_path_uri=gs://${output_bucket}/outputs/${instance_id}
for file in fio/output.csv dlio/output.csv log run-gke-tests.sh workloads.json gcsfuse_commithash gcs_fuse_csi_driver_commithash; do
if test -f "${file}" ; then
gcloud storage cp --content-type=text/text "${file}" "${output_path_uri}/${file}"
fi
done
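# The uploaded outputs can afterwards be listed with, for example:
#   gcloud storage ls "${output_path_uri}/"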

# Go back to whichever working directory you were in.
cd -
2 changes: 1 addition & 1 deletion perfmetrics/scripts/testing_on_gke/examples/workloads.json
@@ -3,7 +3,7 @@
"TestConfig": {
"workloadConfig": {
"_description": "workloadConfig has an optional field runOnSSD (default true if missing), and an array of workloads.",
"runOnSSD": true,
"runOnSSD": false,
"workloads": [
{
"_description": "This is a dummy fio workload (missing the 'fioWorkload' field), purely standing as a header and does not execute any workload. For it to execute a fio workload, it must have a valid 'fioWorkload', a valid 'bucket' attribute, and a valid gcsfuseMountOption attribute.",
59 changes: 59 additions & 0 deletions perfmetrics/scripts/testing_on_gke/examples/workloads_test.json
@@ -0,0 +1,59 @@
{
"_comment": "_ in the starting of element name indicates comment.",
"TestConfig": {
"workloadConfig": {
"_description": "workloadConfig has an optional field runOnSSD (default true if missing), and an array of workloads.",
"runOnSSD": false,
"workloads": [
{
"_description": "This is a dummy fio workload (missing the 'fioWorkload' field), purely standing as a header and does not execute any workload. For it to execute a fio workload, it must have a valid 'fioWorkload', a valid 'bucket' attribute, and a valid gcsfuseMountOption attribute.",
"_fioWorkload": {
"_description": "Every fioWorkload must have fileSize, filesPerThread, numThreads, and blockSize fields. readTypes is an array of string values 'read' and 'randread'. If readTypes is missing, then it defaults to [\"read\",\"randread\"].",
"fileSize": "64K",
"filesPerThread": 20000,
"numThreads": 50,
"blockSize": "64K",
"readTypes": ["read","randread"]
},
"gcsfuseMountOptions": "GCSFuse mount-options, in a compact stringified format, to be used for the test scenario gcsfuse-generic. The individual config/cli flag values should be separated by comma. Each cli flag should be of the form <flag>[=<value>], while each config-file flag should be of form <config>[:<subconfig>[:<subsubconfig>[...]]]:<value>. For example, a legal value would be: implicit-dirs,file_mode=777,file-cache:enable-parallel-downloads:true,metadata-cache:ttl-secs:-1 .",
"bucket":"The bucket must have objects with name Workload.{i}/{j} for every i,j where i:0-{numThreads}-1, j:0-{filesPerThread}-1, and each of these objects must be of size {fileSize}. The buckets gke-* are all in us-central1, are owned by GKE team and are in their GCP project(s). For best performance, please ensure that the bucket is in the same google-cloud region and GCP project as that of the GKE cluster used for running this test configuration."
},
{
"fioWorkload": {
"fileSize": "64K",
"filesPerThread": 100,
"numThreads": 20,
"blockSize": "64K",
"readTypes": ["randread"]
},
"gcsfuseMountOptions": "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:stat-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true",
"bucket":"fio-64k-1m-us-west1",
"_bucket_alt2":"fio-64k-1m-us-central1",
"_bucket_alt3":"gke-fio-64k-1m"
},
{
"_description": "This is a dummy dlio workload (missing the 'dlioWorkload' field), purely standing as a header and does not execute any workload. For it to execute a dlio workload, it must have a valid 'dlioWorkload' object and a valid 'bucket' attribute, and a valid gcsfuseMountOption attribute.",
"_dlioWorkload": {
"_description": "Every dlioWorkload must have numFilesTrain, recordLength, and batchSizes fields. batchSizes is an array of integer values",
"numFilesTrain": 500000,
"recordLength": 102400,
"batchSizes": [800,128]
},
"gcsfuseMountOptions": "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:stat-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true",
"bucket":"The bucket must have objects with name 'train/', 'valid/', and train/img_{i}_of_{numFilesTrain}.npz for every i where i:0-{numFilesTrain}-1 and each train/img_{i}_of_{numFilesTrain}.npz must be of size {recordLength} bytes. The buckets gke-* are all in us-central1, are owned by GKE team and are in their GCP project(s). For best performance, please ensure that the bucket is in the same google-cloud region and GCP project as that of the GKE cluster used for running this test configuration."
},
{
"dlioWorkload": {
"numFilesTrain": 1000,
"recordLength": 3145728,
"batchSizes": [200]
},
"gcsfuseMountOptions": "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:stat-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true",
"bucket":"dlio-unet3d-3mb-100k-us-west1",
"_bucket_alt2":"dlio-unet3d-3mb-100k-us-central1",
"_bucket_alt3":"gke-dlio-unet3d-3mb-100k"
}
]
}
}
}
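As a usage sketch for this new test configuration (an assumed invocation with placeholder values throughout; run-automated.sh copies the file named by workload_config into its outputs, and environment variables set on the command line are inherited by the scripts it invokes):

  project_id=<your-project> project_number=<your-project-number> \
  cluster_name=<your-cluster> gcsfuse_branch=master \
  output_gsheet_id=<sheet-id> output_gsheet_keyfile=<keyfile-uri> \
  output_bucket=<your-bucket> \
  workload_config=perfmetrics/scripts/testing_on_gke/examples/workloads_test.json \
  ./run-automated.sh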
