-
Notifications
You must be signed in to change notification settings - Fork 430
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[testing-on-gke] Add script for automating runs (#2538)
* [testing-on-gke] Add script for automating runs * fix a buggy comment * address review comment * fail if USER not defined * disable local-ssd tests by default * add a dummy workload configuration for testing * remove output_bucket pii from script
- Loading branch information
1 parent
9ccfd34
commit 814ede2
Showing
3 changed files
with
185 additions
and
1 deletion.
There are no files selected for viewing
125 changes: 125 additions & 0 deletions
125
perfmetrics/scripts/testing_on_gke/examples/run-automated.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
#!/bin/bash
#
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script is used purely for automating the run of the script
# run-gke-tests.sh, so that it can be run periodically as a cron-job.
#
# For your case, add/remove/modify the configuration parameters as you need.
#
# Assumptions for this script to work:
# 1. You have appropriate access to project_id defined below.
# 2. You have the cluster with cluster_name defined below, or enough
#    resources in project_id to create this cluster.

# Print all shell commands.
set -x

# Fail if any command fails.
set -e

# Fail a pipeline if any stage of it fails (e.g. run-gke-tests.sh | tee).
set -o pipefail

# require_env_var <name>
# Exits with an error message if the environment variable <name> is unset or
# empty. Uses bash indirect expansion (${!name}) to read the value by name.
require_env_var() {
  local name=$1
  if test -z "${!name:-}"; then
    echo "${name} has not been set." >&2
    exit 1
  fi
}

# Mandatory configuration that must come from the caller's environment.
# output_bucket is validated here, up-front, rather than after the run:
# discovering a missing bucket only after a multi-hour test run would waste
# the whole run.
require_env_var USER
require_env_var project_id
require_env_var project_number
require_env_var cluster_name
require_env_var gcsfuse_branch
require_env_var output_gsheet_id
require_env_var output_gsheet_keyfile
require_env_var output_bucket

# Fixed configuration parameters, exported for run-gke-tests.sh to pick up.
export zone=us-west1-b
export node_pool=default-pool
export machine_type=n2-standard-96
export num_nodes=7
export num_ssd=16
export use_custom_csi_driver=true
export output_dir=.
export pod_wait_time_in_seconds=300
export pod_timeout_in_seconds=64800

# Pass instance_id from outside to continue a previous run, if it got
# terminated somehow (timeout of ssh etc.). Otherwise derive a fresh one from
# the username (with _google/_com suffixes stripped) and a timestamp.
if test -z "${instance_id:-}"; then
  instance_id="$(echo "${USER}" | sed -e 's/_google//' -e 's/_com//')-$(date +%Y%m%d-%H%M%S)"
  export instance_id
fi

export force_update_gcsfuse_code=true

# Continue previous run if pods had been scheduled/completed already.
# NB: the expansion must be quoted — an unquoted 'test -n ${only_parse}'
# collapses to 'test -n' (which is always true) when only_parse is unset,
# so the default would never be applied.
test -n "${only_parse:-}" || export only_parse=false

# Create a dedicated folder on the machine and fetch the runner script there.
mkdir -pv ~/gke-testing && cd ~/gke-testing
wget "https://raw.githubusercontent.com/googlecloudplatform/gcsfuse/${gcsfuse_branch}/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh" -O run-gke-tests.sh
chmod +x run-gke-tests.sh

# Remove previous run's outputs.
rm -rfv log fio/output.csv dlio/output.csv

# Run the script. The run itself is best-effort (|| true) so that the
# output-collection steps below still execute if the run fails.
start_time=$(date +%Y-%m-%dT%H:%M:%SZ)
echo "Run started at ${start_time}"
touch log
(./run-gke-tests.sh --debug |& tee -a log) || true
# Use the following if you want to run it in a tmux session instead.
# tmux new-session -d -s ${instance_id} 'bash -c "(./run-gke-tests.sh --debug |& tee -a log); sleep 604800 "'
end_time=$(date +%Y-%m-%dT%H:%M:%SZ)
echo "Run ended at ${end_time}"

# Some post-run steps to be taken for output collection.
if test -n "${workload_config:-}"; then
  cp "${workload_config}" ./workloads.json
else
  cp src/gcsfuse/perfmetrics/scripts/testing_on_gke/examples/workloads.json .
fi
git -C src/gcsfuse rev-parse HEAD > gcsfuse_commithash
git -C src/gcs-fuse-csi-driver rev-parse HEAD > gcs_fuse_csi_driver_commithash
# Fetch cloud-logs for this run. This has not been tested yet.
# (gcloud logging read --project=${project_id} 'timestamp>="${start_time}"" AND timestamp<="${end_time}" AND resource.labels.cluster_name="${cluster_name}" ' --order=ASC --format=csv\(timestamp\,resource.labels.pod_name,resource.labels.container_name,"text_payload"\) > cloud_logs.txt) &

# Upload outputs (whichever of the expected files exist) to GCS after the run.
output_path_uri="gs://${output_bucket}/outputs/${instance_id}"
for file in fio/output.csv dlio/output.csv log run-gke-tests.sh workloads.json gcsfuse_commithash gcs_fuse_csi_driver_commithash; do
  if test -f "${file}"; then
    gcloud storage cp --content-type=text/text "${file}" "${output_path_uri}/${file}"
  fi
done

# Go back to whichever working directory you were in.
cd -
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
59 changes: 59 additions & 0 deletions
59
perfmetrics/scripts/testing_on_gke/examples/workloads_test.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
{ | ||
"_comment": "_ in the starting of element name indicates comment.", | ||
"TestConfig": { | ||
"workloadConfig": { | ||
"_description": "workloadConfig has an optional field runOnSSD (default true if missing), and an array of workloads.", | ||
"runOnSSD": false, | ||
"workloads": [ | ||
{ | ||
"_description": "This is a dummy fio workload (missing the 'fioWorkload' field), purely standing as a header and does not execute any workload. For it to execute a fio workload, it must have a valid 'fioWorkload', a valid 'bucket' attribute, and a valid gcsfuseMountOption attribute.", | ||
"_fioWorkload": { | ||
"_description": "Every fioWorkload must have fileSize, filesPerThread, numThreads, and blockSize fields. readTypes is an array of string values 'read' and 'randread'. If readTypes is missing, then it defaults to [\"read\",\"randread\"].", | ||
"fileSize": "64K", | ||
"filesPerThread": 20000, | ||
"numThreads": 50, | ||
"blockSize": "64K", | ||
"readTypes": ["read","randread"] | ||
}, | ||
"gcsfuseMountOptions": "GCSFuse mount-options, in a compact stringified format, to be used for the test scenario gcsfuse-generic. The individual config/cli flag values should be separated by comma. Each cli flag should be of the form <flag>[=<value>], while each config-file flag should be of form <config>[:<subconfig>[:<subsubconfig>[...]]]:<value>. For example, a legal value would be: implicit-dirs,file_mode=777,file-cache:enable-parallel-downloads:true,metadata-cache:ttl-secs:-1 .", | ||
"bucket":"The bucket must have objects with name Workload.{i}/{j} for every i,j where i:0-{numThreads}-1, j:0-{filesPerThread}-1, and each of these objects must be of size {fileSize}. The buckets gke-* are all in us-central1, are owned by GKE team and are in their GCP project(s). For best performance, please ensure that the bucket is in the same google-cloud region and GCP project as that of the GKE cluster used for running this test configuration." | ||
}, | ||
{ | ||
"fioWorkload": { | ||
"fileSize": "64K", | ||
"filesPerThread": 100, | ||
"numThreads": 20, | ||
"blockSize": "64K", | ||
"readTypes": ["randread"] | ||
}, | ||
"gcsfuseMountOptions": "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:stat-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true", | ||
"bucket":"fio-64k-1m-us-west1", | ||
"_bucket_alt2":"fio-64k-1m-us-central1", | ||
"_bucket_alt3":"gke-fio-64k-1m" | ||
}, | ||
{ | ||
"_description": "This is a dummy dlio workload (missing the 'dlioWorkload' field), purely standing as a header and does not execute any workload. For it to execute a dlio workload, it must have a valid 'dlioWorkload' object and a valid 'bucket' attribute, and a valid gcsfuseMountOption attribute.", | ||
"_dlioWorkload": { | ||
"_description": "Every dlioWorkload must have numFilesTrain, recordLength, and batchSizes fields. batchSizes is an array of integer values", | ||
"numFilesTrain": 500000, | ||
"recordLength": 102400, | ||
"batchSizes": [800,128] | ||
}, | ||
"gcsfuseMountOptions": "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:stat-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true", | ||
"bucket":"The bucket must have objects with name 'train/', 'valid/', and train/img_{i}_of_{numFilesTrain}.npz for every i where i:0-{numFilesTrain}-1 and each train/img_{i}_of_{numFilesTrain}.npz must be of size {recordLength} bytes. The buckets gke-* are all in us-central1, are owned by GKE team and are in their GCP project(s). For best performance, please ensure that the bucket is in the same google-cloud region and GCP project as that of the GKE cluster used for running this test configuration." | ||
}, | ||
{ | ||
"dlioWorkload": { | ||
"numFilesTrain": 1000, | ||
"recordLength": 3145728, | ||
"batchSizes": [200] | ||
}, | ||
"gcsfuseMountOptions": "implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:stat-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true", | ||
"bucket":"dlio-unet3d-3mb-100k-us-west1", | ||
"_bucket_alt2":"dlio-unet3d-3mb-100k-us-central1", | ||
"_bucket_alt3":"gke-dlio-unet3d-3mb-100k" | ||
} | ||
] | ||
} | ||
} | ||
} |