From dc187d1f22296dd2ca49c81eaf92037e11a15bef Mon Sep 17 00:00:00 2001
From: Nitin Garg <113666283+gargnitingoogle@users.noreply.github.com>
Date: Mon, 26 Aug 2024 11:35:33 +0530
Subject: [PATCH] [testing-on-gke] Support instance-id and more complex configurations (#2359)

* Support special cases

- Support instance_id (a unique ID for the current test-run, allowing
  multiple runs concurrently)
  1. Support the env variable instance_id in the run-script.
  2. Support instance_id as an argument in fio/dlio run_tests.py and
     parse_logs.py.
  3. Pass instance_id in the fio/dlio pod yaml configs.
- Support multiple blockSize/numThreads/filesPerThread combinations for
  a given fileSize in fio tests.

* address self-review comment
---
 .../examples/dlio/parse_logs.py               | 51 +++++++++------
 .../testing_on_gke/examples/dlio/run_tests.py | 16 +++--
 .../templates/dlio-tester.yaml                | 10 +--
 .../dlio/unet3d-loading-test/values.yaml      |  1 +
 .../loading-test/templates/fio-tester.yaml    | 23 +++----
 .../examples/fio/loading-test/values.yaml     |  1 +
 .../testing_on_gke/examples/fio/parse_logs.py | 64 +++++++++++++------
 .../testing_on_gke/examples/fio/run_tests.py  | 16 +++--
 .../testing_on_gke/examples/run-gke-tests.sh  | 14 ++--
 9 files changed, 129 insertions(+), 67 deletions(-)

diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py b/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py
index eddf0beef0..b6792ca856 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py
+++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py
@@ -23,7 +23,7 @@
 sys.path.append("../")
 from utils.utils import get_memory, get_cpu, standard_timestamp, is_mash_installed
 
-_LOCAL_LOGS_LOCATION = "../../bin/dlio-logs"
+_LOCAL_LOGS_LOCATION = "../../bin/dlio-logs/logs"
 
 record = {
     "pod_name": "",
@@ -44,7 +44,7 @@
 }
 
 
-def downloadDlioOutputs(dlioWorkloads):
+def downloadDlioOutputs(dlioWorkloads: set, instanceId: str):
   for dlioWorkload in dlioWorkloads:
     print(f"Downloading DLIO logs from the bucket {dlioWorkload.bucket}...")
     result = subprocess.run(
@@ -55,7 +55,7 @@ def downloadDlioOutputs(dlioWorkloads):
             "cp",
             "-r",
             "--no-user-output-enabled",  # do not print names of files being copied
-            f"gs://{dlioWorkload.bucket}/logs",
+            f"gs://{dlioWorkload.bucket}/logs/{instanceId}",
             _LOCAL_LOGS_LOCATION,
         ],
         capture_output=False,
@@ -92,6 +92,11 @@ def downloadDlioOutputs(dlioWorkloads):
       ),
       required=True,
   )
+  parser.add_argument(
+      "--instance-id",
+      help="unique string ID for current test-run",
+      required=True,
+  )
   args = parser.parse_args()
 
   try:
@@ -102,7 +107,7 @@ def downloadDlioOutputs(dlioWorkloads):
   dlioWorkloads = dlio_workload.ParseTestConfigForDlioWorkloads(
       args.workload_config
   )
-  downloadDlioOutputs(dlioWorkloads)
+  downloadDlioOutputs(dlioWorkloads, args.instance_id)
 
   """
     "{num_files_train}-{mean_file_size}-{batch_size}":
@@ -120,7 +125,7 @@ def downloadDlioOutputs(dlioWorkloads):
   if not mash_installed:
     print("Mash is not installed, will skip parsing CPU and memory usage.")
 
-  for root, _, files in os.walk(_LOCAL_LOGS_LOCATION):
+  for root, _, files in os.walk(_LOCAL_LOGS_LOCATION + "/" + args.instance_id):
     if files:
       print(f"Parsing directory {root} ...")
       per_epoch_stats_file = root + "/per_epoch_stats.json"
@@ -153,9 +158,9 @@ def downloadDlioOutputs(dlioWorkloads):
 
       if key not in output:
         output[key] = {
-            "num_files_train": part_list[2],
-            "mean_file_size": part_list[3],
-            "batch_size": part_list[4],
+            "num_files_train": part_list[-3],
+            "mean_file_size": part_list[-2],
+            "batch_size": part_list[-1],
             "records": {
                 "local-ssd": [],
                 "gcsfuse-generic": [],
@@ -167,7 +172,7 @@ def downloadDlioOutputs(dlioWorkloads):
         r = record.copy()
         r["pod_name"] = summary_data["hostname"]
         r["epoch"] = i + 1
-        r["scenario"] = "-".join(part_list[5:])
+        r["scenario"] = root.split("/")[-1]
         r["train_au_percentage"] = round(
             summary_data["metric"]["train_au_percentage"][i], 2
         )
@@ -221,7 +226,7 @@ def downloadDlioOutputs(dlioWorkloads):
     " (s),GPU Utilization (%),Throughput (sample/s),Throughput"
     " (MB/s),Throughput over Local SSD (%),GCSFuse Lowest Memory (MB),GCSFuse"
     " Highest Memory (MB),GCSFuse Lowest CPU (core),GCSFuse Highest CPU"
-    " (core),Pod,Start,End,GcsfuseMountOptions\n"
+    " (core),Pod,Start,End,GcsfuseMountOptions,InstanceID\n"
 )
 
 for key in output:
@@ -242,19 +247,25 @@ def downloadDlioOutputs(dlioWorkloads):
     ):
       for i in range(len(record_set["records"]["local-ssd"])):
         r = record_set["records"][scenario][i]
-        r["throughput_over_local_ssd"] = round(
-            r["train_throughput_mb_per_second"]
-            / record_set["records"]["local-ssd"][i][
-                "train_throughput_mb_per_second"
-            ]
-            * 100,
-            2,
-        )
+        try:
+          r["throughput_over_local_ssd"] = round(
+              r["train_throughput_mb_per_second"]
+              / record_set["records"]["local-ssd"][i][
+                  "train_throughput_mb_per_second"
+              ]
+              * 100,
+              2,
+          )
+        except ZeroDivisionError:
+          print("Got ZeroDivisionError. Ignoring it.")
+          r["throughput_over_local_ssd"] = 0
+        except:
+          raise
         output_file.write(
             f"{record_set['mean_file_size']},{record_set['num_files_train']},{total_size},{record_set['batch_size']},{scenario},"
         )
         output_file.write(
-            f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\"\n"
+            f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{args.instance_id}\n"
         )
     else:
       for i in range(len(record_set["records"][scenario])):
@@ -264,7 +275,7 @@ def downloadDlioOutputs(dlioWorkloads):
         r = record_set["records"][scenario][i]
        output_file.write(
             f"{record_set['mean_file_size']},{record_set['num_files_train']},{total_size},{record_set['batch_size']},{scenario},"
         )
         output_file.write(
-            f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\"\n"
+            f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{args.instance_id}\n"
         )
 
 output_file.close()
diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py b/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py
index 1c645fc119..55484ec3c5 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py
+++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py
@@ -34,21 +34,22 @@ def run_command(command: str):
   print(result.stderr)
 
 
-def createHelmInstallCommands(dlioWorkloads: set) -> list:
-  """Create helm install commands for the given dlioWorkload objects."""
+def createHelmInstallCommands(dlioWorkloads: set, instanceId: str):
+  """Create helm install commands for the given set of dlioWorkload objects."""
   helm_commands = []
   for dlioWorkload in dlioWorkloads:
     for batchSize in dlioWorkload.batchSizes:
       commands = [
           (
               'helm install'
-              f' {dlioWorkload.bucket}-{batchSize}-{dlioWorkload.scenario} unet3d-loading-test'
+              f' dlio-unet3d-{dlioWorkload.scenario}-{dlioWorkload.numFilesTrain}-{dlioWorkload.recordLength}-{batchSize} unet3d-loading-test'
           ),
           f'--set bucketName={dlioWorkload.bucket}',
           f'--set scenario={dlioWorkload.scenario}',
           f'--set dlio.numFilesTrain={dlioWorkload.numFilesTrain}',
           f'--set dlio.recordLength={dlioWorkload.recordLength}',
           f'--set dlio.batchSize={batchSize}',
+          f'--set instanceId={instanceId}',
       ]
 
       helm_command = ' '.join(commands)
@@ -60,7 +61,9 @@ def main(args) -> None:
   dlioWorkloads = dlio_workload.ParseTestConfigForDlioWorkloads(
       args.workload_config
   )
-  helmInstallCommands = createHelmInstallCommands(dlioWorkloads)
+  helmInstallCommands = createHelmInstallCommands(
+      dlioWorkloads, args.instance_id
+  )
   for helmInstallCommand in helmInstallCommands:
     print(f'{helmInstallCommand}')
     if not args.dry_run:
@@ -81,6 +84,11 @@ def main(args) -> None:
       help='Runs DLIO Unet3d tests using this JSON workload configuration.',
       required=True,
   )
+  parser.add_argument(
+      '--instance-id',
+      help='unique string ID for current test-run',
+      required=True,
+  )
   parser.add_argument(
       '-n',
       '--dry-run',
diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml
index 7a117da56b..74d36bd820 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml
+++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml
@@ -16,7 +16,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  name: dlio-tester-{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}-{{ .Values.scenario }}
+  name: dlio-tester-{{ .Values.scenario }}-{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}
 {{- if ne .Values.scenario "local-ssd" }}
   annotations:
     gke-gcsfuse/volumes: "true"
@@ -73,6 +73,8 @@ spec:
         sleep 300
         {{ end }}
 
+        outputDir=/logs/{{ .Values.instanceId }}/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}
+
         echo "Testing {{ .Values.scenario }}"
         mpirun -np 8 dlio_benchmark workload=unet3d_a100 \
         ++workload.train.epochs=4 \
@@ -84,14 +86,14 @@ spec:
         ++workload.reader.batch_size={{ .Values.dlio.batchSize }} \
         ++workload.dataset.record_length={{ .Values.dlio.recordLength }} \
         ++workload.reader.read_threads={{ .Values.dlio.readThreads }} \
-        ++workload.output.folder=/logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}
+        ++workload.output.folder=${outputDir}
 
         # dump the gcsfuse-mount-configuration to a file in output-directory.
         {{ if eq .Values.scenario "gcsfuse-generic"}}
-        echo "{{ .Values.gcsfuse.mountOptions }}" > /logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}/gcsfuse_mount_options
+        echo "{{ .Values.gcsfuse.mountOptions }}" > ${outputDir}/gcsfuse_mount_options
         {{ end }}
 
-        gsutil -m cp -R /logs gs://{{ .Values.bucketName }}/logs/$(date +"%Y-%m-%d-%H-%M")
+        gsutil -m cp -R /logs/{{ .Values.instanceId }} gs://{{ .Values.bucketName }}/logs/{{ .Values.instanceId }}/$(date +"%Y-%m-%d-%H-%M")
       volumeMounts:
         - name: dshm
           mountPath: /dev/shm
diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml
index cbaa9d0137..ef0b3a20ef 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml
+++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml
@@ -22,6 +22,7 @@ bucketName: gke-dlio-test-data
 # scenario controls the kind of storage that is used for the load testing. local-ssd means directly on LSSD; gcsfuse-generic means on a gcsfuse mount with gcsfuse.mountOptions sent from the caller; gcsfuse-no-file-cache and gcsfuse-file-cache mean as their name suggests.
 scenario: local-ssd
 nodeType: n2-standard-96
+instanceId: ldap-yyyymmdd-hhmmss
 
 resourceLimits:
   cpu: 0
diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml
index 368fba13af..d9c79eb8d4 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml
+++ b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml
@@ -16,7 +16,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  name: fio-tester-{{ .Values.fio.readType }}-{{ lower .Values.fio.fileSize }}-{{ lower .Values.fio.blockSize }}-{{ .Values.scenario }}
+  name: fio-tester-{{ .Values.instanceId }}-{{ .Values.scenario }}-{{ .Values.fio.readType }}-{{ lower .Values.fio.fileSize }}-{{ lower .Values.fio.blockSize }}-{{ .Values.fio.numThreads }}-{{ .Values.fio.filesPerThread }}
 {{- if ne .Values.scenario "local-ssd" }}
   annotations:
     gke-gcsfuse/volumes: "true"
@@ -45,7 +45,12 @@ spec:
         echo "Install dependencies..."
         apt-get update
         apt-get install -y libaio-dev gcc make git time wget
-
+
+        no_of_files_per_thread={{ .Values.fio.filesPerThread }}
+        block_size={{ .Values.fio.blockSize }}
+        file_size={{ .Values.fio.fileSize }}
+        num_of_threads={{ .Values.fio.numThreads }}
+
         {{ if eq .Values.scenario "local-ssd" }}
         echo "Installing gsutil..."
         apt-get update && apt-get install -y apt-transport-https ca-certificates gnupg curl
@@ -105,12 +110,8 @@ spec:
         echo "Setup default values..."
         epoch=4
-        no_of_files_per_thread={{ .Values.fio.filesPerThread }}
         read_type={{ .Values.fio.readType }}
         pause_in_seconds=20
-        block_size={{ .Values.fio.blockSize }}
-        file_size={{ .Values.fio.fileSize }}
-        num_of_threads={{ .Values.fio.numThreads }}
         workload_dir=/data
 
         # Cleaning the pagecache, dentries and inode cache before the starting the workload.
@@ -125,18 +126,18 @@ spec:
         time ls -R $workload_dir 1> /dev/null
 
         echo "Run fio tests..."
-        mkdir -p /data/fio-output/{{ .Values.scenario }}/$read_type
+        output_dir=/data/fio-output/{{ .Values.instanceId }}/${file_size}-{{ lower .Values.fio.blockSize}}-${num_of_threads}-${no_of_files_per_thread}/{{ .Values.scenario }}/$read_type
+        mkdir -p ${output_dir}
 
         # dump the gcsfuse-mount-configuration to a file in output-directory.
         {{ if eq .Values.scenario "gcsfuse-generic" }}
-        echo "{{ .Values.gcsfuse.mountOptions }}" > /data/fio-output/{{ .Values.scenario }}/$read_type/gcsfuse_mount_options
+        echo "{{ .Values.gcsfuse.mountOptions }}" > ${output_dir}/gcsfuse_mount_options
         {{ end }}
 
         for i in $(seq $epoch); do
-
           echo "[Epoch ${i}] start time:" `date +%s`
           free -mh # Memory usage before workload start.
-          NUMJOBS=$num_of_threads NRFILES=$no_of_files_per_thread FILE_SIZE=$file_size BLOCK_SIZE=$block_size READ_TYPE=$read_type DIR=$workload_dir fio ${filename} --alloc-size=1048576 --output-format=json --output="/data/fio-output/{{ .Values.scenario }}/${read_type}/epoch${i}.json"
+          NUMJOBS=$num_of_threads NRFILES=$no_of_files_per_thread FILE_SIZE=$file_size BLOCK_SIZE=$block_size READ_TYPE=$read_type DIR=$workload_dir fio ${filename} --alloc-size=1048576 --output-format=json --output="${output_dir}/epoch${i}.json"
           free -mh # Memory usage after workload completion.
           echo "[Epoch ${i}] end time:" `date +%s`
@@ -154,7 +155,7 @@ spec:
         done
 
         {{ if eq .Values.scenario "local-ssd" }}
-        gsutil -m cp -R /data/fio-output/local-ssd gs://{{ .Values.bucketName }}/fio-output
+        gsutil -m cp -R /data/fio-output/{{ .Values.instanceId }}/* gs://{{ .Values.bucketName }}/fio-output/{{ .Values.instanceId }}/
         {{ end }}
 
         echo "fio job completed!"
diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml
index 15111a740d..efd0b6f4a1 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml
+++ b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml
@@ -22,6 +22,7 @@ bucketName: gke-dlio-test-data
 # scenario controls the kind of storage that is used for the load testing. local-ssd means directly on LSSD; gcsfuse-generic means on a gcsfuse mount with gcsfuse.mountOptions sent from the caller; gcsfuse-no-file-cache and gcsfuse-file-cache mean as their name suggests.
 scenario: local-ssd
 nodeType: n2-standard-96
+instanceId: ldap-yyyymmdd-hhmmss
 
 resourceLimits:
   cpu: 0
diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py b/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py
index 1cfc6a64c9..a62063f2c3 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py
+++ b/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py
@@ -46,10 +46,13 @@
 }
 
 
-def downloadFioOutputs(fioWorkloads):
+def downloadFioOutputs(fioWorkloads: set, instanceId: str):
   for fioWorkload in fioWorkloads:
+    dstDir = (
+        _LOCAL_LOGS_LOCATION + "/" + instanceId + "/" + fioWorkload.fileSize
+    )
     try:
-      os.makedirs(_LOCAL_LOGS_LOCATION + "/" + fioWorkload.fileSize)
+      os.makedirs(dstDir)
     except FileExistsError:
       pass
 
@@ -62,8 +65,8 @@ def downloadFioOutputs(fioWorkloads):
             "cp",
             "-r",
             "--no-user-output-enabled",  # do not print names of objects being copied
-            f"gs://{fioWorkload.bucket}/fio-output",
-            _LOCAL_LOGS_LOCATION + "/" + fioWorkload.fileSize,
+            f"gs://{fioWorkload.bucket}/fio-output/{instanceId}/*",
+            dstDir,
         ],
         capture_output=False,
         text=True,
@@ -99,6 +102,11 @@ def downloadFioOutputs(fioWorkloads):
       ),
       required=True,
   )
+  parser.add_argument(
+      "--instance-id",
+      help="unique string ID for current test-run",
+      required=True,
+  )
   args = parser.parse_args()
 
   try:
@@ -109,10 +117,10 @@ def downloadFioOutputs(fioWorkloads):
   fioWorkloads = fio_workload.ParseTestConfigForFioWorkloads(
       args.workload_config
   )
-  downloadFioOutputs(fioWorkloads)
+  downloadFioOutputs(fioWorkloads, args.instance_id)
 
   """
-    "{read_type}-{mean_file_size}":
+    "{read_type}-{mean_file_size}-{bs}-{numjobs}-{nrfiles}":
         "mean_file_size": str
         "read_type": str
         "records":
@@ -126,7 +134,7 @@ def downloadFioOutputs(fioWorkloads):
   if not mash_installed:
     print("Mash is not installed, will skip parsing CPU and memory usage.")
 
-  for root, _, files in os.walk(_LOCAL_LOGS_LOCATION):
+  for root, _, files in os.walk(_LOCAL_LOGS_LOCATION + "/" + args.instance_id):
     for file in files:
       per_epoch_output = root + f"/{file}"
       if not per_epoch_output.endswith(".json"):
@@ -139,13 +147,6 @@ def downloadFioOutputs(fioWorkloads):
         with open(gcsfuse_mount_options_file) as f:
           gcsfuse_mount_options = f.read().strip()
 
-      print(f"Now parsing file {per_epoch_output} ...")
-      root_split = root.split("/")
-      mean_file_size = root_split[-4]
-      scenario = root_split[-2]
-      read_type = root_split[-1]
-      epoch = int(file.split(".")[0][-1])
-
       with open(per_epoch_output, "r") as f:
         try:
           per_epoch_output_data = json.load(f)
@@ -153,14 +154,36 @@ def downloadFioOutputs(fioWorkloads):
           print(f"failed to json-parse {per_epoch_output}, so skipping it.")
           continue
 
+      if (
+          not "jobs" in per_epoch_output_data
+          or not per_epoch_output_data["jobs"]
+          or not "job options" in per_epoch_output_data["jobs"][0]
+          or not "bs" in per_epoch_output_data["jobs"][0]["job options"]
+      ):
+        print(
+            f'Did not find "[jobs][0][job options][bs]" in {per_epoch_output},'
+            " so ignoring this file"
+        )
+        continue
+
+      print(f"Now parsing file {per_epoch_output} ...")
+      root_split = root.split("/")
+      mean_file_size = root_split[-4]
+      scenario = root_split[-2]
+      read_type = root_split[-1]
+      epoch = int(file.split(".")[0][-1])
+
       if "global options" not in per_epoch_output_data:
         print(f"field: 'global options' missing in {per_epoch_output}")
         continue
       global_options = per_epoch_output_data["global options"]
       nrfiles = int(global_options["nrfiles"])
       numjobs = int(global_options["numjobs"])
+      bs = per_epoch_output_data["jobs"][0]["job options"]["bs"]
 
key = "-".join([read_type, mean_file_size]) + key = "-".join( + [read_type, mean_file_size, bs, str(numjobs), str(nrfiles)] + ) if key not in output: output[key] = { "mean_file_size": mean_file_size, @@ -176,7 +199,7 @@ def downloadFioOutputs(fioWorkloads): r = record.copy() bs = per_epoch_output_data["jobs"][0]["job options"]["bs"] r["pod_name"] = ( - f"fio-tester-{read_type}-{mean_file_size.lower()}-{bs.lower()}-{scenario}" + f"fio-tester-{args.instance_id}-{scenario}-{read_type}-{mean_file_size.lower()}-{bs.lower()}-{numjobs}-{nrfiles}" ) r["epoch"] = epoch r["scenario"] = scenario @@ -230,13 +253,16 @@ def downloadFioOutputs(fioWorkloads): " (s),Throughput (MB/s),IOPS,Throughput over Local SSD (%),GCSFuse Lowest" " Memory (MB),GCSFuse Highest Memory (MB),GCSFuse Lowest CPU" " (core),GCSFuse Highest CPU" - " (core),Pod,Start,End,GcsfuseMoutOptions,BlockSize,FilesPerThread,NumThreads\n" + " (core),Pod,Start,End,GcsfuseMoutOptions,BlockSize,FilesPerThread,NumThreads,InstanceID\n" ) for key in output: record_set = output[key] for scenario in scenario_order: + if not record_set["records"][scenario]: + continue + for i in range(len(record_set["records"][scenario])): if ("local-ssd" in record_set["records"]) and ( len(record_set["records"]["local-ssd"]) @@ -260,7 +286,7 @@ def downloadFioOutputs(fioWorkloads): continue else: output_file.write( - f"{record_set['mean_file_size']},{record_set['read_type']},{scenario},{r['epoch']},{r['duration']},{r['throughput_mb_per_second']},{r['IOPS']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{r['blockSize']},{r['filesPerThread']},{r['numThreads']}\n" + f"{record_set['mean_file_size']},{record_set['read_type']},{scenario},{r['epoch']},{r['duration']},{r['throughput_mb_per_second']},{r['IOPS']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{r['blockSize']},{r['filesPerThread']},{r['numThreads']},{args.instance_id}\n" ) else: try: @@ -274,6 +300,6 @@ def downloadFioOutputs(fioWorkloads): continue else: output_file.write( - f"{record_set['mean_file_size']},{record_set['read_type']},{scenario},'Unknown',{r['epoch']},{r['duration']},{r['throughput_mb_per_second']},{r['IOPS']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{r['blockSize']},{r['filesPerThread']},{r['numThreads']}\n" + f"{record_set['mean_file_size']},{record_set['read_type']},{scenario},{r['epoch']},{r['duration']},{r['throughput_mb_per_second']},{r['IOPS']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{r['blockSize']},{r['filesPerThread']},{r['numThreads']},{args.instance_id}\n" ) output_file.close() diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py b/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py index 623e91f040..1c326b9f31 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py +++ b/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py @@ -33,15 +33,15 @@ def run_command(command: str): print(result.stderr) -def createHelmInstallCommands(fioWorkloads: set) -> list: - """Create 
-  """Create helm install commands for the given fioWorkload objects."""
+def createHelmInstallCommands(fioWorkloads: list, instanceId: str):
+  """Create helm install commands for the given set of fioWorkload objects."""
   helm_commands = []
   for fioWorkload in fioWorkloads:
     for readType in fioWorkload.readTypes:
       commands = [
           (
               'helm install'
-              f' fio-loading-test-{fioWorkload.fileSize.lower()}-{readType}-{fioWorkload.scenario} loading-test'
+              f' fio-load-{fioWorkload.scenario}-{readType}-{fioWorkload.fileSize.lower()}-{fioWorkload.blockSize.lower()}-{fioWorkload.numThreads}-{fioWorkload.filesPerThread} loading-test'
           ),
           f'--set bucketName={fioWorkload.bucket}',
           f'--set scenario={fioWorkload.scenario}',
@@ -50,6 +50,7 @@ def createHelmInstallCommands(fioWorkloads: set) -> list:
           f'--set fio.blockSize={fioWorkload.blockSize}',
           f'--set fio.filesPerThread={fioWorkload.filesPerThread}',
           f'--set fio.numThreads={fioWorkload.numThreads}',
+          f'--set instanceId={instanceId}',
       ]
 
       helm_command = ' '.join(commands)
@@ -61,7 +62,9 @@ def main(args) -> None:
   fioWorkloads = fio_workload.ParseTestConfigForFioWorkloads(
       args.workload_config
   )
-  helmInstallCommands = createHelmInstallCommands(fioWorkloads)
+  helmInstallCommands = createHelmInstallCommands(
+      fioWorkloads, args.instance_id
+  )
   for helmInstallCommand in helmInstallCommands:
     print(f'{helmInstallCommand}')
     if not args.dry_run:
@@ -82,6 +85,11 @@ def main(args) -> None:
       help='Runs FIO tests using this JSON workload configuration',
       required=True,
   )
+  parser.add_argument(
+      '--instance-id',
+      help='unique string ID for current test-run',
+      required=True,
+  )
   parser.add_argument(
       '-n',
       '--dry-run',
diff --git a/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh b/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh
index 1b87405345..fd98fdeec5 100755
--- a/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh
+++ b/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh
@@ -60,6 +60,7 @@ readonly gcsfuse_branch=garnitin/add-gke-load-testing/v1
 # GCSFuse configuration related
 readonly DEFAULT_GCSFUSE_MOUNT_OPTIONS="implicit-dirs"
 # Test runtime configuration
+readonly DEFAULT_INSTANCE_ID=${USER}-$(date +%Y%m%d-%H%M%S)
 readonly DEFAULT_POD_WAIT_TIME_IN_SECONDS=300
 
 function printHelp() {
@@ -86,7 +87,8 @@ function printHelp() {
   # GCSFuse configuration related
   echo "gcsfuse_mount_options=<\"comma-separated-gcsfuse-mount-options\" e.g. \""${DEFAULT_GCSFUSE_MOUNT_OPTIONS}"\">"
   # Test runtime configuration
-  echo "pod_wait_time_in_seconds=<number-of-seconds>"
+  echo "pod_wait_time_in_seconds=<number-of-seconds>"
+  echo "instance_id=<unique string ID for current test-run e.g. ldap-yyyymmdd-hhmmss>"
@@ ... @@
   echo "gcsfuse_mount_options=\"${gcsfuse_mount_options}\""
   # Test runtime configuration
   echo "pod_wait_time_in_seconds=\"${pod_wait_time_in_seconds}\""
+  echo "instance_id=\"${instance_id}\""
   echo ""
   echo ""
   echo ""
@@ -444,12 +448,12 @@ function deleteAllPods() {
 
 function deployAllFioHelmCharts() {
   echo "Deploying all fio helm charts ..."
-  cd "${gke_testing_dir}"/examples/fio && python3 ./run_tests.py --workload-config "${gke_testing_dir}"/examples/workloads.json && cd -
+  cd "${gke_testing_dir}"/examples/fio && python3 ./run_tests.py --workload-config "${gke_testing_dir}"/examples/workloads.json --instance-id ${instance_id} && cd -
 }
 
 function deployAllDlioHelmCharts() {
   echo "Deploying all dlio helm charts ..."
- cd "${gke_testing_dir}"/examples/dlio && python3 ./run_tests.py --workload-config "${gke_testing_dir}"/examples/workloads.json && cd - + cd "${gke_testing_dir}"/examples/dlio && python3 ./run_tests.py --workload-config "${gke_testing_dir}"/examples/workloads.json --instance-id ${instance_id} && cd - } function listAllHelmCharts() { @@ -519,14 +523,14 @@ function revertPodConfigsFilesAfterTestRuns() { function fetchAndParseFioOutputs() { echo "Fetching and parsing fio outputs ..." cd "${gke_testing_dir}"/examples/fio - python3 parse_logs.py --project-number=${project_number} --workload-config="${gke_testing_dir}"/examples/workloads.json + python3 parse_logs.py --project-number=${project_number} --workload-config="${gke_testing_dir}"/examples/workloads.json --instance-id ${instance_id} cd - } function fetchAndParseDlioOutputs() { echo "Fetching and parsing dlio outputs ..." cd "${gke_testing_dir}"/examples/dlio - python3 parse_logs.py --project-number=${project_number} --workload-config="${gke_testing_dir}"/examples/workloads.json + python3 parse_logs.py --project-number=${project_number} --workload-config="${gke_testing_dir}"/examples/workloads.json --instance-id ${instance_id} cd - }