Skip to content

Commit 529c8da

Browse files
committed
In compare_results.py, set threshold based on reporting units; refactor some methods; better documentation.
1 parent 26a0f8f commit 529c8da

File tree

2 files changed

+130
-56
lines changed

2 files changed

+130
-56
lines changed

.github/workflows/integration_tests.yml

+7-9
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ jobs:
5555
- name: Install Python dependencies
5656
run: |
5757
python -m pip install --upgrade pip
58-
pip install . psrecord
58+
pip install ".[dev]" psrecord
5959
- name: Run and profile workflow
6060
env:
6161
branch_ref: ${{ steps.branch_ref.outputs.branch_ref }}
@@ -70,19 +70,17 @@ jobs:
7070
else
7171
python tests/integration_testing/run_workflow.py --yaml tests/integration_testing/integration_test.yml
7272
fi
73-
rm -rf ./tests/integration_testing/results/*
7473
mv ./results/* ./tests/integration_testing/results/
7574
- name: Compare integration test results
7675
run: |
77-
#FIXME temporarily pull from ci_outputs
78-
git fetch origin master ci_outputs
76+
git fetch origin master
7977
branch_name="${{ github.ref }}"
8078
if [[ $(git diff --exit-code origin/master ./tests/integration_testing/results/agg_results.json ./tests/integration_testing/results/ecm_results.json) ]]; then
8179
mkdir tests/integration_testing/base_results
82-
git show origin/ci_outputs:tests/integration_testing/results/agg_results.json > tests/integration_testing/base_results/agg_results.json
83-
git show origin/ci_outputs:tests/integration_testing/results/ecm_results.json > tests/integration_testing/base_results/ecm_results.json
84-
git show origin/ci_outputs:tests/integration_testing/results/plots/tech_potential/Summary_Data-TP.xlsx > tests/integration_testing/base_results/Summary_Data-TP.xlsx
85-
git show origin/ci_outputs:tests/integration_testing/results/plots/max_adopt_potential/Summary_Data-MAP.xlsx > tests/integration_testing/base_results/Summary_Data-MAP.xlsx
80+
git show origin/master:tests/integration_testing/results/agg_results.json > tests/integration_testing/base_results/agg_results.json
81+
git show origin/master:tests/integration_testing/results/ecm_results.json > tests/integration_testing/base_results/ecm_results.json
82+
git show origin/master:tests/integration_testing/results/plots/tech_potential/Summary_Data-TP.xlsx > tests/integration_testing/base_results/Summary_Data-TP.xlsx
83+
git show origin/master:tests/integration_testing/results/plots/max_adopt_potential/Summary_Data-MAP.xlsx > tests/integration_testing/base_results/Summary_Data-MAP.xlsx
8684
8785
python tests/integration_testing/compare_results.py --base-dir tests/integration_testing/base_results --new-dir tests/integration_testing/results
8886
fi
@@ -98,7 +96,7 @@ jobs:
9896
git pull origin $branch_ref
9997
git add ./tests/integration_testing/results/*.json
10098
if [[ $(git diff --cached --exit-code) ]]; then
101-
git add ./tests/integration_testing/results/plots
99+
git add ./tests/integration_testing/results/plots/**/*.pdf
102100
git config --system user.email "[email protected]"
103101
git config --system user.name "GitHub Action"
104102
git commit -m "Upload results files from CI build"

tests/integration_testing/compare_results.py

+123-47
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
import json
44
import re
55
from pathlib import Path
6+
import logging
7+
from scout.config import LogConfig
8+
LogConfig.configure_logging()
9+
logger = logging.getLogger(__name__)
610

711

812
class ScoutCompare():
@@ -12,48 +16,71 @@ class ScoutCompare():
1216
"""
1317

1418
@staticmethod
15-
def load_json(file_path):
19+
def load_json(file_path: Path):
20+
"""Load json file as dictionary
21+
22+
Args:
23+
file_path (Path): filepath of json file
24+
25+
Returns:
26+
dict: json as a dictionary
27+
"""
1628
with open(file_path, 'r') as file:
1729
return json.load(file)
1830

1931
@staticmethod
20-
def load_summary_report(file_path):
32+
def load_summary_report(file_path: Path):
33+
"""Read in a summary report
34+
35+
Args:
36+
file_path (Path): filepath of summary report xlsx
37+
38+
Returns:
39+
pd.DataFrame: summary report DataFrame
40+
"""
2141
reports = pd.read_excel(file_path, sheet_name=None, index_col=list(range(5)))
2242
return reports
2343

24-
def compare_dict_keys(self, dict1, dict2, paths, path='', key_diffs=None):
44+
def compare_dict_keys(self,
45+
dict1: dict,
46+
dict2: dict,
47+
paths: list,
48+
path: str = '',
49+
key_diffs: pd.DataFrame = None):
2550
"""Compares nested keys across two dictionaries by recursively searching each level
2651
2752
Args:
2853
dict1 (dict): baseline dictionary to compare
2954
dict2 (dict): new dictionary to compare
3055
paths (list): paths to the original files from which the dictionaries are imported
31-
path (str, optional): current dictionary path at whcih to compare. Defaults to ''.
56+
path (str, optional): current dictionary path at which to compare. Defaults to ''.
3257
key_diffs (pd.DataFrame, optional): existing summary of difference. Defaults to None.
3358
3459
Returns:
3560
pd.DataFrame: summary of differences specifying the file, the unique keys, and the
3661
path that key is found at.
3762
"""
3863
if key_diffs is None:
39-
key_diffs = pd.DataFrame(columns=["Results file", "Unique key", "Found at"])
40-
keys1 = set(dict1.keys())
41-
keys2 = set(dict2.keys())
64+
key_diffs = pd.DataFrame(columns=["Results file", "Unique key(s)", "Found at"])
65+
keys1, keys2 = set(dict1.keys()), set(dict2.keys())
4266
only_in_dict1 = keys1 - keys2
4367
only_in_dict2 = keys2 - keys1
4468

69+
# Write report rows if keys differ
70+
diff_entries = []
4571
if only_in_dict1:
46-
new_row = pd.DataFrame({"Results file": f"{paths[0].parent.name}/{paths[0].name}",
47-
"Unique key": str(only_in_dict1),
48-
"Found at": path[2:]}, index=[0])
49-
key_diffs = pd.concat([key_diffs, new_row], ignore_index=True)
72+
diff_entries.extend([{"Results file": paths[0].as_posix(),
73+
"Unique key(s)": str(list(only_in_dict1)),
74+
"Found at": path[2:]}])
5075
if only_in_dict2:
51-
new_row = pd.DataFrame({"Results file": f"{paths[1].parent.name}/{paths[1].name}",
52-
"Unique key": str(only_in_dict2),
53-
"Found at": path[2:]}, index=[0])
54-
key_diffs = pd.concat([key_diffs, new_row], ignore_index=True)
76+
diff_entries.extend([{"Results file": paths[1].as_posix(),
77+
"Unique key(s)": str(list(only_in_dict2)),
78+
"Found at": path[2:]}])
79+
if diff_entries:
80+
key_diffs = pd.concat([key_diffs, pd.DataFrame(diff_entries)], ignore_index=True)
5581

56-
for key in keys1.intersection(keys2):
82+
# Recursively call if keys intersect
83+
for key in keys1 & keys2:
5784
if isinstance(dict1[key], dict) and isinstance(dict2[key], dict):
5885
key_diffs = self.compare_dict_keys(dict1[key],
5986
dict2[key],
@@ -63,54 +90,82 @@ def compare_dict_keys(self, dict1, dict2, paths, path='', key_diffs=None):
6390

6491
return key_diffs
6592

66-
def compare_dict_values(self, dict1, dict2, percent_threshold=10, abs_threshold=1000):
93+
def compare_dict_values(self, dict1, dict2, percent_threshold=10):
6794
"""Compares values across two dictionary by recursively searching keys until identifying
68-
values at common paths. Both thresholds must be met to report results.
95+
values at common paths. The percent difference is only reported if the percentage
96+
meets or exceeds the threshold and one or both values exceed the absolute value
97+
threshold, which depends on the units of the values.
6998
7099
Args:
71100
dict1 (dict): baseline dictionary to compare
72101
dict2 (dict): new dictionary to compare
73102
percent_threshold (int, optional): the percent difference threshold at which
74103
differences are reported. Defaults to 10.
75-
abs_threshold (int, optional): the abosolute difference threshold at which differences
76-
are reported. Defaults to 10.
77104
78105
Returns:
79106
pd.DataFrame: summary of percent differences that meet thresholds
80107
"""
81108
diff_report = {}
109+
abs_threshold_map = {"USD": 1000, "MMBtu": 1000, "MMTons": 10}
82110

83-
def compare_recursive(d1, d2, path=""):
111+
# Recursively navigate dicts until finding numeric values at the same location to compare
112+
def compare_recursive(d1, d2, path="", units=""):
84113
for key in d1.keys():
85114
current_path = f"{path}['{key}']"
115+
units = next((unit for unit in abs_threshold_map.keys() if unit in key), units)
116+
valid_nums = (int, float)
86117
if isinstance(d1[key], dict) and key in d2:
87-
compare_recursive(d1[key], d2[key], current_path)
88-
elif isinstance(d1[key], (int, float)) and key in d2:
89-
if isinstance(d2[key], (int, float)):
90-
val1 = d1[key]
91-
val2 = d2[key]
92-
if val1 != 0:
93-
percent_change = ((val2 - val1) / val1) * 100
94-
if (abs(percent_change) >= percent_threshold) and \
95-
(abs(val1) >= abs_threshold or abs(val2) >= abs_threshold):
96-
diff_report[current_path] = percent_change
118+
compare_recursive(d1[key], d2[key], current_path, units)
119+
elif isinstance(d1.get(key), valid_nums) and isinstance(d2.get(key), valid_nums):
120+
val1, val2 = d1[key], d2[key]
121+
if val1 == 0:
122+
percent_change = float("inf") if val2 != 0 else 0
123+
else:
124+
percent_change = ((val2 - val1) / val1) * 100
125+
abs_threshold = abs_threshold_map.get(units, float("inf"))
126+
if (abs(percent_change) >= percent_threshold) and \
127+
(abs(val1) >= abs_threshold or abs(val2) >= abs_threshold):
128+
diff_report[current_path] = {"base": val1,
129+
"new": val2,
130+
"percent_diff": percent_change}
97131

98132
compare_recursive(dict1, dict2)
99133
return diff_report
100134

101-
def split_json_key_path(self, path):
135+
def split_json_key_path(self, path: str):
136+
"""Parse a string of nested keys found in a results json file
137+
138+
Args:
139+
path (str): string of nested keys seperated by brackets
140+
141+
Returns:
142+
list: list of individual keys
143+
"""
102144
keys = re.findall(r"\['(.*?)'\]", path)
103145
if len(keys) == 5:
104146
keys[4:4] = [None, None, None]
105147
return keys
106148

107-
def write_dict_key_report(self, diff_report, output_path):
149+
def write_dict_key_report(self, diff_report: pd.DataFrame, output_path: Path):
150+
"""Writes a dictionary key report to a csv file
151+
152+
Args:
153+
diff_report (pd.DataFrame): report with dictionary key differences
154+
output_path (Path): csv output path
155+
"""
108156
if diff_report.empty:
157+
logger.info(f"No key differences found, {output_path} not written")
109158
return
110159
diff_report.to_csv(output_path, index=False)
111-
print(f"Wrote dictionary key report to {output_path}")
160+
logger.info(f"Wrote dictionary key report to {output_path}")
161+
162+
def write_dict_value_report(self, diff_report: pd.DataFrame, output_path: Path):
163+
"""Writes a dictionary value report to a csv file
112164
113-
def write_dict_value_report(self, diff_report, output_path):
165+
Args:
166+
diff_report (pd.DataFrame): report with dictionary value differences
167+
output_path (Path): csv output path
168+
"""
114169
col_headers = [
115170
"ECM",
116171
"Markets and Savings Type",
@@ -123,20 +178,30 @@ def write_dict_value_report(self, diff_report, output_path):
123178
]
124179
df = pd.DataFrame(columns=["Results path"], data=list(diff_report.keys()))
125180
if df.empty:
181+
logger.info(f"No changes above the threshold found, {output_path} not written")
126182
return
127183
df[col_headers] = df["Results path"].apply(self.split_json_key_path).apply(pd.Series)
128-
df["Percent difference"] = [round(diff, 2) for diff in diff_report.values()]
184+
df["Percent difference"] = [
185+
round(diff_dict["percent_diff"], 2) for diff_dict in diff_report.values()]
186+
df["Base value"] = [round(diff_dict["base"], 2) for diff_dict in diff_report.values()]
187+
df["New value"] = [round(diff_dict["new"], 2) for diff_dict in diff_report.values()]
129188
df = df.dropna(axis=1, how="all")
130189
df.to_csv(output_path, index=False)
131-
print(f"Wrote dictionary value report to {output_path}")
190+
logger.info(f"Wrote dictionary value report to {output_path}")
132191

133-
def compare_jsons(self, json1_path, json2_path, output_dir=True):
192+
def compare_jsons(self,
193+
json1_path: Path,
194+
json2_path: Path,
195+
percent_threshold: float,
196+
output_dir: Path = None):
134197
"""Compare two jsons and report differences in keys and in values
135198
136199
Args:
137200
json1_path (Path): baseline json file to compare
138201
json2_path (Path): new json file to compare
139-
write_reports (bool, optional): _description_. Defaults to True.
202+
percent_threshold (float): threshold for reporting percent difference if values
203+
output_dir (Path, optional): output directory where comparison reports are saved.
204+
Defaults to None.
140205
"""
141206
json1 = self.load_json(json1_path)
142207
json2 = self.load_json(json2_path)
@@ -148,18 +213,21 @@ def compare_jsons(self, json1_path, json2_path, output_dir=True):
148213
self.write_dict_key_report(key_diffs, output_dir / f"{json2_path.stem}_key_diffs.csv")
149214

150215
# Compare differences in json values
151-
val_diffs = self.compare_dict_values(json1, json2)
216+
val_diffs = self.compare_dict_values(json1, json2, percent_threshold=percent_threshold)
152217
self.write_dict_value_report(val_diffs, output_dir / f"{json2_path.stem}_value_diffs.csv")
153218

154-
def compare_summary_reports(self, report1_path, report2_path, output_dir=None):
219+
def compare_summary_reports(self,
220+
report1_path: Path,
221+
report2_path: Path,
222+
output_dir: Path = None):
155223
"""Compare Summary_Data-TP.xlsx and Summary_Data-MAP.xlsx with baseline files
156224
157225
Args:
158226
report1_path (Path): baseline summary report to compare
159227
report2_path (Path): new summary report to compare
160-
output_dir (Path, optional): _description_. Defaults to None.
228+
output_dir (Path, optional): output directory where comparison report is saved.
229+
Defaults to None.
161230
"""
162-
163231
reports1 = self.load_summary_report(report1_path)
164232
reports2 = self.load_summary_report(report2_path)
165233
if output_dir is None:
@@ -170,7 +238,7 @@ def compare_summary_reports(self, report1_path, report2_path, output_dir=None):
170238
diff = (100 * (report2 - report1)/report1).round(2)
171239
diff = diff.reset_index()
172240
diff.to_excel(writer, sheet_name=output_type, index=False)
173-
print(f"Wrote Summary_Data percent difference report to {output_path}")
241+
logger.info(f"Wrote Summary_Data percent difference report to {output_path}")
174242

175243

176244
def main():
@@ -194,10 +262,16 @@ def main():
194262
new_dir = args.new_dir.resolve()
195263
agg_json_base = base_dir / "agg_results.json"
196264
agg_json_new = new_dir / "agg_results.json"
197-
compare.compare_jsons(agg_json_base, agg_json_new, output_dir=new_dir)
265+
compare.compare_jsons(agg_json_base,
266+
agg_json_new,
267+
percent_threshold=args.threshold,
268+
output_dir=new_dir)
198269
ecm_json_base = base_dir / "ecm_results.json"
199270
ecm_json_new = new_dir / "ecm_results.json"
200-
compare.compare_jsons(ecm_json_base, ecm_json_new, output_dir=new_dir)
271+
compare.compare_jsons(ecm_json_base,
272+
ecm_json_new,
273+
percent_threshold=args.threshold,
274+
output_dir=new_dir)
201275

202276
summary_tp_base = base_dir / "Summary_Data-TP.xlsx"
203277
summary_tp_new = new_dir / "plots" / "tech_potential" / "Summary_Data-TP.xlsx"
@@ -208,7 +282,9 @@ def main():
208282
else:
209283
# Compare only as specified by the arguments
210284
if args.json_baseline and args.json_new:
211-
compare.compare_jsons(args.json_baseline, args.json_new)
285+
compare.compare_jsons(args.json_baseline,
286+
args.json_new,
287+
percent_threshold=args.threshold)
212288
if args.summary_baseline and args.summary_new:
213289
compare.compare_summary_reports(args.summary_baseline, args.summary_new)
214290

0 commit comments

Comments
 (0)