[testing-on-gke] Add script for automating runs (#2538)
* [testing-on-gke] Add script for automating runs

* fix a buggy comment

* address review comment

* fail if USER not defined

* disable local-ssd tests by default

* add a dummy workload configuration for testing

* remove output_bucket pii from script
gargnitingoogle authored Sep 30, 2024
1 parent 9ccfd34 commit 814ede2
Showing 3 changed files with 185 additions and 1 deletion.
125 changes: 125 additions & 0 deletions perfmetrics/scripts/testing_on_gke/examples/run-automated.sh
@@ -0,0 +1,125 @@
#!/bin/bash
#
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script automates running run-gke-tests.sh, e.g. periodically
# as a cron job.
#
# Add/remove/modify the configuration parameters below as needed for your
# use case.
#
# Assumptions for this script to work:
# 1. You have appropriate access to the project_id defined below.
# 2. You have a cluster named cluster_name as defined below, or enough
#    resources in project_id to create such a cluster.
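#
# As an illustrative sketch (all values below are placeholders, not taken
# from this repository), a nightly crontab entry for this script might look
# like the following. Note that a crontab entry must be a single line, and
# that USER may need to be set explicitly since some cron implementations
# do not define it:
#
#   0 2 * * * USER=jdoe project_id=my-project project_number=123456789 cluster_name=my-cluster gcsfuse_branch=master output_gsheet_id=my-gsheet-id output_gsheet_keyfile=gs://my-bucket/keyfile.json output_bucket=my-bucket bash /path/to/run-automated.sh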

# Print all shell commands.
set -x

# Fail if any command fails.
set -e

# Environment variable USER must be defined.
if test -z "${USER}"; then
echo "USER has not been set"
exit 1
fi

# Define configuration parameters.
if test -z "${project_id}"; then
echo "project_id has not been set."
exit 1
fi
if test -z "${project_number}"; then
echo "project_number has not been set."
exit 1
fi
export zone=us-west1-b
if test -z "${cluster_name}"; then
echo "cluster_name has not been set."
exit 1
fi
export node_pool=default-pool
export machine_type=n2-standard-96
export num_nodes=7
export num_ssd=16
export use_custom_csi_driver=true
export output_dir=.
if test -z "${gcsfuse_branch}"; then
echo "gcsfuse_branch has not been set."
exit 1
fi
export pod_wait_time_in_seconds=300
export pod_timeout_in_seconds=64800
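# (For reference: pod_wait_time_in_seconds=300 is 5 minutes between polls,
# and pod_timeout_in_seconds=64800 is 18 hours.)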
# Pass instance_id in from outside to continue a previous run if it was
# terminated somehow (e.g. an ssh timeout).
if test -z "${instance_id}"; then
export instance_id=$(echo ${USER} | sed 's/_google//' | sed 's/_com//')-$(date +%Y%m%d-%H%M%S)
fi
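# For illustration (derived from the command above): USER=jdoe_google_com
# with a run starting at 2024-09-30 14:05:01 yields
# instance_id=jdoe-20240930-140501.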
if test -z "${output_gsheet_id}"; then
echo "output_gsheet_id has not been set."
exit 1
fi
if test -z "${output_gsheet_keyfile}"; then
echo "output_gsheet_keyfile has not been set."
exit 1
fi
export force_update_gcsfuse_code=true
# Continue the previous run if pods have already been scheduled/completed.
test -n "${only_parse}" || export only_parse=false
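# For example, to only re-parse the outputs of a previous run (assumed
# usage based on the parameter names above):
#   only_parse=true instance_id=<previous-instance-id> ./run-automated.sh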

# Create a dedicated folder on the machine.
mkdir -pv ~/gke-testing && cd ~/gke-testing
wget https://raw.githubusercontent.com/googlecloudplatform/gcsfuse/${gcsfuse_branch}/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh -O run-gke-tests.sh
chmod +x run-gke-tests.sh

# Remove previous run's outputs.
rm -rfv log fio/output.csv dlio/output.csv

# Run the script.
start_time=$(date +%Y-%m-%dT%H:%M:%SZ)
echo "Run started at ${start_time}"
touch log
(./run-gke-tests.sh --debug |& tee -a log) || true
# Use the following if you want to run it in a tmux session instead.
# tmux new-session -d -s ${instance_id} 'bash -c "(./run-gke-tests.sh --debug |& tee -a log); sleep 604800 "'
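# (In the tmux variant, the trailing 'sleep 604800' keeps the session alive
# for 7 days after the run finishes, so that its output can still be
# inspected by attaching to the session.)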
end_time=$(date +%Y-%m-%dT%H:%M:%SZ)
echo "Run ended at ${end_time}"

# Some post-run steps to be taken for output collection.
if test -n "${workload_config}"; then
cp "${workload_config}" ./workloads.json
else
cp src/gcsfuse/perfmetrics/scripts/testing_on_gke/examples/workloads.json .
fi
git -C src/gcsfuse rev-parse HEAD > gcsfuse_commithash
git -C src/gcs-fuse-csi-driver rev-parse HEAD > gcs_fuse_csi_driver_commithash
# Fetch cloud-logs for this run. This has not been tested yet.
# (gcloud logging read --project=${project_id} "timestamp>=\"${start_time}\" AND timestamp<=\"${end_time}\" AND resource.labels.cluster_name=\"${cluster_name}\"" --order=ASC --format=csv\(timestamp\,resource.labels.pod_name,resource.labels.container_name,"text_payload"\) > cloud_logs.txt) &

# Upload outputs to GCS after the run.
if test -z "${output_bucket}"; then
echo "output_bucket has not been set."
exit 1
fi
output_path_uri=gs://${output_bucket}/outputs/${instance_id}
for file in fio/output.csv dlio/output.csv log run-gke-tests.sh workloads.json gcsfuse_commithash gcs_fuse_csi_driver_commithash; do
if test -f "${file}" ; then
gcloud storage cp --content-type=text/text "${file}" "${output_path_uri}/${file}"
fi
done
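# The uploaded outputs can afterwards be listed with, for example:
#   gcloud storage ls "${output_path_uri}/"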

# Go back to whichever working directory you were in.
cd -
2 changes: 1 addition & 1 deletion perfmetrics/scripts/testing_on_gke/examples/workloads.json
@@ -3,7 +3,7 @@
"TestConfig": {
"workloadConfig": {
"_description": "workloadConfig has an optional field runOnSSD (default true if missing), and an array of workloads.",
"runOnSSD": true,
"runOnSSD": false,
"workloads": [
{
"_description": "This is a dummy fio workload (missing the 'fioWorkload' field), purely standing as a header and does not execute any workload. For it to execute a fio workload, it must have a valid 'fioWorkload', a valid 'bucket' attribute, and a valid gcsfuseMountOption attribute.",
59 changes: 59 additions & 0 deletions perfmetrics/scripts/testing_on_gke/examples/workloads_test.json
@@ -0,0 +1,59 @@
{
"_comment": "_ in the starting of element name indicates comment.",
"TestConfig": {
"workloadConfig": {
"_description": "workloadConfig has an optional field runOnSSD (default true if missing), and an array of workloads.",
"runOnSSD": false,
"workloads": [
{
"_description": "This is a dummy fio workload (missing the 'fioWorkload' field), purely standing as a header and does not execute any workload. For it to execute a fio workload, it must have a valid 'fioWorkload', a valid 'bucket' attribute, and a valid gcsfuseMountOption attribute.",
"_fioWorkload": {
"_description": "Every fioWorkload must have fileSize, filesPerThread, numThreads, and blockSize fields. readTypes is an array of string values 'read' and 'randread'. If readTypes is missing, then it defaults to [\"read\",\"randread\"].",
"fileSize": "64K",
"filesPerThread": 20000,
"numThreads": 50,
"blockSize": "64K",
"readTypes": ["read","randread"]
},
"gcsfuseMountOptions": "GCSFuse mount-options, in a compact stringified format, to be used for the test scenario gcsfuse-generic. The individual config/cli flag values should be separated by comma. Each cli flag should be of the form <flag>[=<value>], while each config-file flag should be of form <config>[:<subconfig>[:<subsubconfig>[...]]]:<value>. For example, a legal value would be: implicit-dirs,file_mode=777,file-cache:enable-parallel-downloads:true,metadata-cache:ttl-secs:-1 .",
"bucket":"The bucket must have objects with name Workload.{i}/{j} for every i,j where i:0-{numThreads}-1, j:0-{filesPerThread}-1, and each of these objects must be of size {fileSize}. The buckets gke-* are all in us-central1, are owned by GKE team and are in their GCP project(s). For best performance, please ensure that the bucket is in the same google-cloud region and GCP project as that of the GKE cluster used for running this test configuration."
},
{
"fioWorkload": {
"fileSize": "64K",
"filesPerThread": 100,
"numThreads": 20,
"blockSize": "64K",
"readTypes": ["randread"]
},
"gcsfuseMountOptions": "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:stat-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true",
"bucket":"fio-64k-1m-us-west1",
"_bucket_alt2":"fio-64k-1m-us-central1",
"_bucket_alt3":"gke-fio-64k-1m"
},
{
"_description": "This is a dummy dlio workload (missing the 'dlioWorkload' field), purely standing as a header and does not execute any workload. For it to execute a dlio workload, it must have a valid 'dlioWorkload' object and a valid 'bucket' attribute, and a valid gcsfuseMountOption attribute.",
"_dlioWorkload": {
"_description": "Every dlioWorkload must have numFilesTrain, recordLength, and batchSizes fields. batchSizes is an array of integer values",
"numFilesTrain": 500000,
"recordLength": 102400,
"batchSizes": [800,128]
},
"gcsfuseMountOptions": "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:stat-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true",
"bucket":"The bucket must have objects with name 'train/', 'valid/', and train/img_{i}_of_{numFilesTrain}.npz for every i where i:0-{numFilesTrain}-1 and each train/img_{i}_of_{numFilesTrain}.npz must be of size {recordLength} bytes. The buckets gke-* are all in us-central1, are owned by GKE team and are in their GCP project(s). For best performance, please ensure that the bucket is in the same google-cloud region and GCP project as that of the GKE cluster used for running this test configuration."
},
{
"dlioWorkload": {
"numFilesTrain": 1000,
"recordLength": 3145728,
"batchSizes": [200]
},
"gcsfuseMountOptions": "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:stat-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true",
"bucket":"dlio-unet3d-3mb-100k-us-west1",
"_bucket_alt2":"dlio-unet3d-3mb-100k-us-central1",
"_bucket_alt3":"gke-dlio-unet3d-3mb-100k"
}
]
}
}
}
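As a usage sketch for this new test configuration (an assumed invocation with placeholder values throughout; run-automated.sh copies the file named by workload_config into its outputs, and environment variables set on the command line are inherited by the scripts it invokes):

  project_id=<your-project> project_number=<your-project-number> \
  cluster_name=<your-cluster> gcsfuse_branch=master \
  output_gsheet_id=<sheet-id> output_gsheet_keyfile=<keyfile-uri> \
  output_bucket=<your-bucket> \
  workload_config=perfmetrics/scripts/testing_on_gke/examples/workloads_test.json \
  ./run-automated.sh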
