From dc187d1f22296dd2ca49c81eaf92037e11a15bef Mon Sep 17 00:00:00 2001
From: Nitin Garg <113666283+gargnitingoogle@users.noreply.github.com>
Date: Mon, 26 Aug 2024 11:35:33 +0530
Subject: [PATCH] [testing-on-gke] Support instance-id and more complex configurations (#2359)

* Support special cases

- Support instance_id (a unique ID for the current test-run, allowing
  multiple runs concurrently)
  1. Support the env variable instance_id in the run-script.
  2. Support instance_id as an argument in fio/dlio run_tests.py and
     parse_logs.py.
  3. Pass instance_id in the fio/dlio pod yaml configs.
- Support multiple blockSize/numThreads/filesPerThread combinations for
  a given fileSize in fio tests.

* address self-review comment
---
 .../examples/dlio/parse_logs.py               | 51 +++++++++------
 .../testing_on_gke/examples/dlio/run_tests.py | 16 +++--
 .../templates/dlio-tester.yaml                | 10 +--
 .../dlio/unet3d-loading-test/values.yaml      |  1 +
 .../loading-test/templates/fio-tester.yaml    | 23 +++----
 .../examples/fio/loading-test/values.yaml     |  1 +
 .../testing_on_gke/examples/fio/parse_logs.py | 64 +++++++++++++------
 .../testing_on_gke/examples/fio/run_tests.py  | 16 +++--
 .../testing_on_gke/examples/run-gke-tests.sh  | 14 ++--
 9 files changed, 129 insertions(+), 67 deletions(-)

diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py b/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py
index eddf0beef0..b6792ca856 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py
+++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py
@@ -23,7 +23,7 @@
 sys.path.append("../")
 from utils.utils import get_memory, get_cpu, standard_timestamp, is_mash_installed
 
-_LOCAL_LOGS_LOCATION = "../../bin/dlio-logs"
+_LOCAL_LOGS_LOCATION = "../../bin/dlio-logs/logs"
 
 record = {
     "pod_name": "",
@@ -44,7 +44,7 @@
 }
 
 
-def downloadDlioOutputs(dlioWorkloads):
+def downloadDlioOutputs(dlioWorkloads: set, instanceId: str):
   for dlioWorkload in dlioWorkloads:
     print(f"Downloading DLIO logs from the bucket {dlioWorkload.bucket}...")
     result = subprocess.run(
@@ -55,7 +55,7 @@ def downloadDlioOutputs(dlioWorkloads):
             "cp",
             "-r",
             "--no-user-output-enabled",  # do not print names of files being copied
-            f"gs://{dlioWorkload.bucket}/logs",
+            f"gs://{dlioWorkload.bucket}/logs/{instanceId}",
             _LOCAL_LOGS_LOCATION,
         ],
         capture_output=False,
@@ -92,6 +92,11 @@ def downloadDlioOutputs(dlioWorkloads):
       ),
       required=True,
   )
+  parser.add_argument(
+      "--instance-id",
+      help="unique string ID for current test-run",
+      required=True,
+  )
   args = parser.parse_args()
 
   try:
@@ -102,7 +107,7 @@ def downloadDlioOutputs(dlioWorkloads):
   dlioWorkloads = dlio_workload.ParseTestConfigForDlioWorkloads(
       args.workload_config
   )
-  downloadDlioOutputs(dlioWorkloads)
+  downloadDlioOutputs(dlioWorkloads, args.instance_id)
 
   """
     "{num_files_train}-{mean_file_size}-{batch_size}":
@@ -120,7 +125,7 @@ def downloadDlioOutputs(dlioWorkloads):
   if not mash_installed:
     print("Mash is not installed, will skip parsing CPU and memory usage.")
 
-  for root, _, files in os.walk(_LOCAL_LOGS_LOCATION):
+  for root, _, files in os.walk(_LOCAL_LOGS_LOCATION + "/" + args.instance_id):
     if files:
       print(f"Parsing directory {root} ...")
       per_epoch_stats_file = root + "/per_epoch_stats.json"
@@ -153,9 +158,9 @@ def downloadDlioOutputs(dlioWorkloads):
 
       if key not in output:
         output[key] = {
-            "num_files_train": part_list[2],
-            "mean_file_size": part_list[3],
-            "batch_size": part_list[4],
+            "num_files_train": part_list[-3],
+            "mean_file_size": part_list[-2],
+            "batch_size": part_list[-1],
             "records": {
                 "local-ssd": [],
                 "gcsfuse-generic": [],
@@ -167,7 +172,7 @@ def downloadDlioOutputs(dlioWorkloads):
         r = record.copy()
         r["pod_name"] = summary_data["hostname"]
         r["epoch"] = i + 1
-        r["scenario"] = "-".join(part_list[5:])
+        r["scenario"] = root.split("/")[-1]
         r["train_au_percentage"] = round(
             summary_data["metric"]["train_au_percentage"][i], 2
         )
@@ -221,7 +226,7 @@ def downloadDlioOutputs(dlioWorkloads):
     " (s),GPU Utilization (%),Throughput (sample/s),Throughput"
     " (MB/s),Throughput over Local SSD (%),GCSFuse Lowest Memory (MB),GCSFuse"
     " Highest Memory (MB),GCSFuse Lowest CPU (core),GCSFuse Highest CPU"
-    " (core),Pod,Start,End,GcsfuseMountOptions\n"
+    " (core),Pod,Start,End,GcsfuseMountOptions,InstanceID\n"
 )
 
 for key in output:
@@ -242,19 +247,25 @@ def downloadDlioOutputs(dlioWorkloads):
     ):
       for i in range(len(record_set["records"]["local-ssd"])):
         r = record_set["records"][scenario][i]
-        r["throughput_over_local_ssd"] = round(
-            r["train_throughput_mb_per_second"]
-            / record_set["records"]["local-ssd"][i][
-                "train_throughput_mb_per_second"
-            ]
-            * 100,
-            2,
-        )
+        try:
+          r["throughput_over_local_ssd"] = round(
+              r["train_throughput_mb_per_second"]
+              / record_set["records"]["local-ssd"][i][
+                  "train_throughput_mb_per_second"
+              ]
+              * 100,
+              2,
+          )
+        except ZeroDivisionError:
+          print("Got ZeroDivisionError. Ignoring it.")
+          r["throughput_over_local_ssd"] = 0
+        except:
+          raise
         output_file.write(
             f"{record_set['mean_file_size']},{record_set['num_files_train']},{total_size},{record_set['batch_size']},{scenario},"
         )
         output_file.write(
-            f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\"\n"
+            f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{args.instance_id}\n"
         )
     else:
       for i in range(len(record_set["records"][scenario])):
@@ -264,7 +275,7 @@ def downloadDlioOutputs(dlioWorkloads):
         r = record_set["records"][scenario][i]
        output_file.write(
             f"{record_set['mean_file_size']},{record_set['num_files_train']},{total_size},{record_set['batch_size']},{scenario},"
         )
         output_file.write(
-            f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\"\n"
+            f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{args.instance_id}\n"
         )
 
 output_file.close()
diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py b/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py
index 1c645fc119..55484ec3c5 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py
+++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py
@@ -34,21 +34,22 @@ def run_command(command: str):
   print(result.stderr)
 
 
-def createHelmInstallCommands(dlioWorkloads: set) -> list:
-  """Create helm install commands for the given dlioWorkload objects."""
+def createHelmInstallCommands(dlioWorkloads: set, instanceId: str):
+  """Create helm install commands for the given set of dlioWorkload objects."""
   helm_commands = []
   for dlioWorkload in dlioWorkloads:
     for batchSize in dlioWorkload.batchSizes:
       commands = [
           (
               'helm install'
-              f' {dlioWorkload.bucket}-{batchSize}-{dlioWorkload.scenario} unet3d-loading-test'
+              f' dlio-unet3d-{dlioWorkload.scenario}-{dlioWorkload.numFilesTrain}-{dlioWorkload.recordLength}-{batchSize} unet3d-loading-test'
           ),
           f'--set bucketName={dlioWorkload.bucket}',
           f'--set scenario={dlioWorkload.scenario}',
           f'--set dlio.numFilesTrain={dlioWorkload.numFilesTrain}',
           f'--set dlio.recordLength={dlioWorkload.recordLength}',
           f'--set dlio.batchSize={batchSize}',
+          f'--set instanceId={instanceId}',
       ]
 
       helm_command = ' '.join(commands)
@@ -60,7 +61,9 @@ def main(args) -> None:
   dlioWorkloads = dlio_workload.ParseTestConfigForDlioWorkloads(
       args.workload_config
   )
-  helmInstallCommands = createHelmInstallCommands(dlioWorkloads)
+  helmInstallCommands = createHelmInstallCommands(
+      dlioWorkloads, args.instance_id
+  )
   for helmInstallCommand in helmInstallCommands:
     print(f'{helmInstallCommand}')
     if not args.dry_run:
@@ -81,6 +84,11 @@ def main(args) -> None:
       help='Runs DLIO Unet3d tests using this JSON workload configuration.',
       required=True,
   )
+  parser.add_argument(
+      '--instance-id',
+      help='unique string ID for current test-run',
+      required=True,
+  )
   parser.add_argument(
       '-n',
       '--dry-run',
diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml
index 7a117da56b..74d36bd820 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml
+++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml
@@ -16,7 +16,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  name: dlio-tester-{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}-{{ .Values.scenario }}
+  name: dlio-tester-{{ .Values.scenario }}-{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}
 {{- if ne .Values.scenario "local-ssd" }}
   annotations:
     gke-gcsfuse/volumes: "true"
@@ -73,6 +73,8 @@ spec:
         sleep 300
         {{ end }}
 
+        outputDir=/logs/{{ .Values.instanceId }}/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}
+
         echo "Testing {{ .Values.scenario }}"
         mpirun -np 8 dlio_benchmark workload=unet3d_a100 \
         ++workload.train.epochs=4 \
@@ -84,14 +86,14 @@ spec:
         ++workload.reader.batch_size={{ .Values.dlio.batchSize }} \
         ++workload.dataset.record_length={{ .Values.dlio.recordLength }} \
         ++workload.reader.read_threads={{ .Values.dlio.readThreads }} \
-        ++workload.output.folder=/logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}
+        ++workload.output.folder=${outputDir}
 
         # dump the gcsfuse-mount-configuration to a file in output-directory.
         {{ if eq .Values.scenario "gcsfuse-generic"}}
-        echo "{{ .Values.gcsfuse.mountOptions }}" > /logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}/gcsfuse_mount_options
+        echo "{{ .Values.gcsfuse.mountOptions }}" > ${outputDir}/gcsfuse_mount_options
         {{ end }}
 
-        gsutil -m cp -R /logs gs://{{ .Values.bucketName }}/logs/$(date +"%Y-%m-%d-%H-%M")
+        gsutil -m cp -R /logs/{{ .Values.instanceId }} gs://{{ .Values.bucketName }}/logs/{{ .Values.instanceId }}/$(date +"%Y-%m-%d-%H-%M")
       volumeMounts:
         - name: dshm
           mountPath: /dev/shm
diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml
index cbaa9d0137..ef0b3a20ef 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml
+++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml
@@ -22,6 +22,7 @@ bucketName: gke-dlio-test-data
 # scenario controls the kind of storage that is used for the load testing. local-ssd means directly on LSSD; gcsfuse-generic means on a gcsfuse mount with gcsfuse.mountOptions sent from the caller; gcsfuse-no-file-cache and gcsfuse-file-cache mean as their name suggests.
 scenario: local-ssd
 nodeType: n2-standard-96
+instanceId: ldap-yyyymmdd-hhmmss
 
 resourceLimits:
   cpu: 0
diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml
index 368fba13af..d9c79eb8d4 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml
+++ b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml
@@ -16,7 +16,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  name: fio-tester-{{ .Values.fio.readType }}-{{ lower .Values.fio.fileSize }}-{{ lower .Values.fio.blockSize }}-{{ .Values.scenario }}
+  name: fio-tester-{{ .Values.instanceId }}-{{ .Values.scenario }}-{{ .Values.fio.readType }}-{{ lower .Values.fio.fileSize }}-{{ lower .Values.fio.blockSize }}-{{ .Values.fio.numThreads }}-{{ .Values.fio.filesPerThread }}
 {{- if ne .Values.scenario "local-ssd" }}
   annotations:
     gke-gcsfuse/volumes: "true"
@@ -45,7 +45,12 @@ spec:
         echo "Install dependencies..."
         apt-get update
         apt-get install -y libaio-dev gcc make git time wget
-
+
+        no_of_files_per_thread={{ .Values.fio.filesPerThread }}
+        block_size={{ .Values.fio.blockSize }}
+        file_size={{ .Values.fio.fileSize }}
+        num_of_threads={{ .Values.fio.numThreads }}
+
         {{ if eq .Values.scenario "local-ssd" }}
         echo "Installing gsutil..."
         apt-get update && apt-get install -y apt-transport-https ca-certificates gnupg curl
@@ -105,12 +110,8 @@ spec:
         echo "Setup default values..."
         epoch=4
-        no_of_files_per_thread={{ .Values.fio.filesPerThread }}
         read_type={{ .Values.fio.readType }}
         pause_in_seconds=20
-        block_size={{ .Values.fio.blockSize }}
-        file_size={{ .Values.fio.fileSize }}
-        num_of_threads={{ .Values.fio.numThreads }}
         workload_dir=/data
 
         # Cleaning the pagecache, dentries and inode cache before the starting the workload.
@@ -125,18 +126,18 @@ spec:
         time ls -R $workload_dir 1> /dev/null
 
         echo "Run fio tests..."
-        mkdir -p /data/fio-output/{{ .Values.scenario }}/$read_type
+        output_dir=/data/fio-output/{{ .Values.instanceId }}/${file_size}-{{ lower .Values.fio.blockSize}}-${num_of_threads}-${no_of_files_per_thread}/{{ .Values.scenario }}/$read_type
+        mkdir -p ${output_dir}
 
         # dump the gcsfuse-mount-configuration to a file in output-directory.
         {{ if eq .Values.scenario "gcsfuse-generic" }}
-        echo "{{ .Values.gcsfuse.mountOptions }}" > /data/fio-output/{{ .Values.scenario }}/$read_type/gcsfuse_mount_options
+        echo "{{ .Values.gcsfuse.mountOptions }}" > ${output_dir}/gcsfuse_mount_options
         {{ end }}
 
         for i in $(seq $epoch); do
-
           echo "[Epoch ${i}] start time:" `date +%s`
           free -mh # Memory usage before workload start.
-          NUMJOBS=$num_of_threads NRFILES=$no_of_files_per_thread FILE_SIZE=$file_size BLOCK_SIZE=$block_size READ_TYPE=$read_type DIR=$workload_dir fio ${filename} --alloc-size=1048576 --output-format=json --output="/data/fio-output/{{ .Values.scenario }}/${read_type}/epoch${i}.json"
+          NUMJOBS=$num_of_threads NRFILES=$no_of_files_per_thread FILE_SIZE=$file_size BLOCK_SIZE=$block_size READ_TYPE=$read_type DIR=$workload_dir fio ${filename} --alloc-size=1048576 --output-format=json --output="${output_dir}/epoch${i}.json"
           free -mh # Memory usage after workload completion.
           echo "[Epoch ${i}] end time:" `date +%s`
@@ -154,7 +155,7 @@ spec:
         done
 
         {{ if eq .Values.scenario "local-ssd" }}
-        gsutil -m cp -R /data/fio-output/local-ssd gs://{{ .Values.bucketName }}/fio-output
+        gsutil -m cp -R /data/fio-output/{{ .Values.instanceId }}/* gs://{{ .Values.bucketName }}/fio-output/{{ .Values.instanceId }}/
         {{ end }}
 
         echo "fio job completed!"
diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml
index 15111a740d..efd0b6f4a1 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml
+++ b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml
@@ -22,6 +22,7 @@ bucketName: gke-dlio-test-data
 # scenario controls the kind of storage that is used for the load testing. local-ssd means directly on LSSD; gcsfuse-generic means on a gcsfuse mount with gcsfuse.mountOptions sent from the caller; gcsfuse-no-file-cache and gcsfuse-file-cache mean as their name suggests.
 scenario: local-ssd
 nodeType: n2-standard-96
+instanceId: ldap-yyyymmdd-hhmmss
 
 resourceLimits:
   cpu: 0
diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py b/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py
index 1cfc6a64c9..a62063f2c3 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py
+++ b/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py
@@ -46,10 +46,13 @@
 }
 
 
-def downloadFioOutputs(fioWorkloads):
+def downloadFioOutputs(fioWorkloads: set, instanceId: str):
   for fioWorkload in fioWorkloads:
+    dstDir = (
+        _LOCAL_LOGS_LOCATION + "/" + instanceId + "/" + fioWorkload.fileSize
+    )
     try:
-      os.makedirs(_LOCAL_LOGS_LOCATION + "/" + fioWorkload.fileSize)
+      os.makedirs(dstDir)
     except FileExistsError:
       pass
 
@@ -62,8 +65,8 @@ def downloadFioOutputs(fioWorkloads):
             "cp",
             "-r",
             "--no-user-output-enabled",  # do not print names of objects being copied
-            f"gs://{fioWorkload.bucket}/fio-output",
-            _LOCAL_LOGS_LOCATION + "/" + fioWorkload.fileSize,
+            f"gs://{fioWorkload.bucket}/fio-output/{instanceId}/*",
+            dstDir,
         ],
         capture_output=False,
         text=True,
@@ -99,6 +102,11 @@ def downloadFioOutputs(fioWorkloads):
       ),
       required=True,
   )
+  parser.add_argument(
+      "--instance-id",
+      help="unique string ID for current test-run",
+      required=True,
+  )
   args = parser.parse_args()
 
   try:
@@ -109,10 +117,10 @@ def downloadFioOutputs(fioWorkloads):
   fioWorkloads = fio_workload.ParseTestConfigForFioWorkloads(
       args.workload_config
   )
-  downloadFioOutputs(fioWorkloads)
+  downloadFioOutputs(fioWorkloads, args.instance_id)
 
   """
-    "{read_type}-{mean_file_size}":
+    "{read_type}-{mean_file_size}-{bs}-{numjobs}-{nrfiles}":
         "mean_file_size": str
         "read_type": str
         "records":
@@ -126,7 +134,7 @@ def downloadFioOutputs(fioWorkloads):
   if not mash_installed:
     print("Mash is not installed, will skip parsing CPU and memory usage.")
 
-  for root, _, files in os.walk(_LOCAL_LOGS_LOCATION):
+  for root, _, files in os.walk(_LOCAL_LOGS_LOCATION + "/" + args.instance_id):
     for file in files:
       per_epoch_output = root + f"/{file}"
       if not per_epoch_output.endswith(".json"):
@@ -139,13 +147,6 @@ def downloadFioOutputs(fioWorkloads):
         with open(gcsfuse_mount_options_file) as f:
           gcsfuse_mount_options = f.read().strip()
 
-      print(f"Now parsing file {per_epoch_output} ...")
-      root_split = root.split("/")
-      mean_file_size = root_split[-4]
-      scenario = root_split[-2]
-      read_type = root_split[-1]
-      epoch = int(file.split(".")[0][-1])
-
       with open(per_epoch_output, "r") as f:
         try:
           per_epoch_output_data = json.load(f)
@@ -153,14 +154,36 @@ def downloadFioOutputs(fioWorkloads):
           print(f"failed to json-parse {per_epoch_output}, so skipping it.")
           continue
 
+      if (
+          not "jobs" in per_epoch_output_data
+          or not per_epoch_output_data["jobs"]
+          or not "job options" in per_epoch_output_data["jobs"][0]
+          or not "bs" in per_epoch_output_data["jobs"][0]["job options"]
+      ):
+        print(
+            f'Did not find "[jobs][0][job options][bs]" in {per_epoch_output},'
+            " so ignoring this file"
+        )
+        continue
+
+      print(f"Now parsing file {per_epoch_output} ...")
+      root_split = root.split("/")
+      mean_file_size = root_split[-4]
+      scenario = root_split[-2]
+      read_type = root_split[-1]
+      epoch = int(file.split(".")[0][-1])
+
       if "global options" not in per_epoch_output_data:
         print(f"field: 'global options' missing in {per_epoch_output}")
         continue
       global_options = per_epoch_output_data["global options"]
       nrfiles = int(global_options["nrfiles"])
       numjobs = int(global_options["numjobs"])
+      bs = per_epoch_output_data["jobs"][0]["job options"]["bs"]
 
key = "-".join([read_type, mean_file_size]) + key = "-".join( + [read_type, mean_file_size, bs, str(numjobs), str(nrfiles)] + ) if key not in output: output[key] = { "mean_file_size": mean_file_size, @@ -176,7 +199,7 @@ def downloadFioOutputs(fioWorkloads): r = record.copy() bs = per_epoch_output_data["jobs"][0]["job options"]["bs"] r["pod_name"] = ( - f"fio-tester-{read_type}-{mean_file_size.lower()}-{bs.lower()}-{scenario}" + f"fio-tester-{args.instance_id}-{scenario}-{read_type}-{mean_file_size.lower()}-{bs.lower()}-{numjobs}-{nrfiles}" ) r["epoch"] = epoch r["scenario"] = scenario @@ -230,13 +253,16 @@ def downloadFioOutputs(fioWorkloads): " (s),Throughput (MB/s),IOPS,Throughput over Local SSD (%),GCSFuse Lowest" " Memory (MB),GCSFuse Highest Memory (MB),GCSFuse Lowest CPU" " (core),GCSFuse Highest CPU" - " (core),Pod,Start,End,GcsfuseMoutOptions,BlockSize,FilesPerThread,NumThreads\n" + " (core),Pod,Start,End,GcsfuseMoutOptions,BlockSize,FilesPerThread,NumThreads,InstanceID\n" ) for key in output: record_set = output[key] for scenario in scenario_order: + if not record_set["records"][scenario]: + continue + for i in range(len(record_set["records"][scenario])): if ("local-ssd" in record_set["records"]) and ( len(record_set["records"]["local-ssd"]) @@ -260,7 +286,7 @@ def downloadFioOutputs(fioWorkloads): continue else: output_file.write( - f"{record_set['mean_file_size']},{record_set['read_type']},{scenario},{r['epoch']},{r['duration']},{r['throughput_mb_per_second']},{r['IOPS']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{r['blockSize']},{r['filesPerThread']},{r['numThreads']}\n" + f"{record_set['mean_file_size']},{record_set['read_type']},{scenario},{r['epoch']},{r['duration']},{r['throughput_mb_per_second']},{r['IOPS']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{r['blockSize']},{r['filesPerThread']},{r['numThreads']},{args.instance_id}\n" ) else: try: @@ -274,6 +300,6 @@ def downloadFioOutputs(fioWorkloads): continue else: output_file.write( - f"{record_set['mean_file_size']},{record_set['read_type']},{scenario},'Unknown',{r['epoch']},{r['duration']},{r['throughput_mb_per_second']},{r['IOPS']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{r['blockSize']},{r['filesPerThread']},{r['numThreads']}\n" + f"{record_set['mean_file_size']},{record_set['read_type']},{scenario},{r['epoch']},{r['duration']},{r['throughput_mb_per_second']},{r['IOPS']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{r['blockSize']},{r['filesPerThread']},{r['numThreads']},{args.instance_id}\n" ) output_file.close() diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py b/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py index 623e91f040..1c326b9f31 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py +++ b/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py @@ -33,15 +33,15 @@ def run_command(command: str): print(result.stderr) -def createHelmInstallCommands(fioWorkloads: set) -> list: - """Create 
-  """Create helm install commands for the given fioWorkload objects."""
+def createHelmInstallCommands(fioWorkloads: list, instanceId: str):
+  """Create helm install commands for the given set of fioWorkload objects."""
   helm_commands = []
   for fioWorkload in fioWorkloads:
     for readType in fioWorkload.readTypes:
       commands = [
           (
               'helm install'
-              f' fio-loading-test-{fioWorkload.fileSize.lower()}-{readType}-{fioWorkload.scenario} loading-test'
+              f' fio-load-{fioWorkload.scenario}-{readType}-{fioWorkload.fileSize.lower()}-{fioWorkload.blockSize.lower()}-{fioWorkload.numThreads}-{fioWorkload.filesPerThread} loading-test'
           ),
           f'--set bucketName={fioWorkload.bucket}',
           f'--set scenario={fioWorkload.scenario}',
@@ -50,6 +50,7 @@ def createHelmInstallCommands(fioWorkloads: set) -> list:
           f'--set fio.blockSize={fioWorkload.blockSize}',
           f'--set fio.filesPerThread={fioWorkload.filesPerThread}',
           f'--set fio.numThreads={fioWorkload.numThreads}',
+          f'--set instanceId={instanceId}',
       ]
 
       helm_command = ' '.join(commands)
@@ -61,7 +62,9 @@ def main(args) -> None:
   fioWorkloads = fio_workload.ParseTestConfigForFioWorkloads(
       args.workload_config
   )
-  helmInstallCommands = createHelmInstallCommands(fioWorkloads)
+  helmInstallCommands = createHelmInstallCommands(
+      fioWorkloads, args.instance_id
+  )
   for helmInstallCommand in helmInstallCommands:
     print(f'{helmInstallCommand}')
     if not args.dry_run:
@@ -82,6 +85,11 @@ def main(args) -> None:
       help='Runs FIO tests using this JSON workload configuration',
       required=True,
   )
+  parser.add_argument(
+      '--instance-id',
+      help='unique string ID for current test-run',
+      required=True,
+  )
   parser.add_argument(
       '-n',
       '--dry-run',
diff --git a/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh b/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh
index 1b87405345..fd98fdeec5 100755
--- a/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh
+++ b/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh
@@ -60,6 +60,7 @@ readonly gcsfuse_branch=garnitin/add-gke-load-testing/v1
 # GCSFuse configuration related
 readonly DEFAULT_GCSFUSE_MOUNT_OPTIONS="implicit-dirs"
 # Test runtime configuration
+readonly DEFAULT_INSTANCE_ID=${USER}-$(date +%Y%m%d-%H%M%S)
 readonly DEFAULT_POD_WAIT_TIME_IN_SECONDS=300
 
 function printHelp() {
@@ -86,7 +87,8 @@ function printHelp() {
   # GCSFuse configuration related
   echo "gcsfuse_mount_options=<\"comma-separated-gcsfuse-mount-options\" e.g. \""${DEFAULT_GCSFUSE_MOUNT_OPTIONS}"\">"
   # Test runtime configuration
-  echo "pod_wait_time_in_seconds=<number-of-seconds>"
+  echo "pod_wait_time_in_seconds=<number-of-seconds>"
+  echo "instance_id=<unique string ID for current test-run e.g. ldap-yyyymmdd-hhmmss>"
@@ ... @@
   echo "gcsfuse_mount_options=\"${gcsfuse_mount_options}\""
   # Test runtime configuration
   echo "pod_wait_time_in_seconds=\"${pod_wait_time_in_seconds}\""
+  echo "instance_id=\"${instance_id}\""
   echo ""
   echo ""
   echo ""
@@ -444,12 +448,12 @@ function deleteAllPods() {
 
 function deployAllFioHelmCharts() {
   echo "Deploying all fio helm charts ..."
-  cd "${gke_testing_dir}"/examples/fio && python3 ./run_tests.py --workload-config "${gke_testing_dir}"/examples/workloads.json && cd -
+  cd "${gke_testing_dir}"/examples/fio && python3 ./run_tests.py --workload-config "${gke_testing_dir}"/examples/workloads.json --instance-id ${instance_id} && cd -
 }
 
 function deployAllDlioHelmCharts() {
   echo "Deploying all dlio helm charts ..."
- cd "${gke_testing_dir}"/examples/dlio && python3 ./run_tests.py --workload-config "${gke_testing_dir}"/examples/workloads.json && cd - + cd "${gke_testing_dir}"/examples/dlio && python3 ./run_tests.py --workload-config "${gke_testing_dir}"/examples/workloads.json --instance-id ${instance_id} && cd - } function listAllHelmCharts() { @@ -519,14 +523,14 @@ function revertPodConfigsFilesAfterTestRuns() { function fetchAndParseFioOutputs() { echo "Fetching and parsing fio outputs ..." cd "${gke_testing_dir}"/examples/fio - python3 parse_logs.py --project-number=${project_number} --workload-config="${gke_testing_dir}"/examples/workloads.json + python3 parse_logs.py --project-number=${project_number} --workload-config="${gke_testing_dir}"/examples/workloads.json --instance-id ${instance_id} cd - } function fetchAndParseDlioOutputs() { echo "Fetching and parsing dlio outputs ..." cd "${gke_testing_dir}"/examples/dlio - python3 parse_logs.py --project-number=${project_number} --workload-config="${gke_testing_dir}"/examples/workloads.json + python3 parse_logs.py --project-number=${project_number} --workload-config="${gke_testing_dir}"/examples/workloads.json --instance-id ${instance_id} cd - }