3
3
import json
4
4
import re
5
5
from pathlib import Path
6
+ import logging
7
+ from scout .config import LogConfig
8
+ LogConfig .configure_logging ()
9
+ logger = logging .getLogger (__name__ )
6
10
7
11
8
12
class ScoutCompare ():
@@ -12,48 +16,71 @@ class ScoutCompare():
12
16
"""
13
17
14
18
@staticmethod
15
- def load_json (file_path ):
19
+ def load_json (file_path : Path ):
20
+ """Load json file as dictionary
21
+
22
+ Args:
23
+ file_path (Path): filepath of json file
24
+
25
+ Returns:
26
+ dict: json as a dictionary
27
+ """
16
28
with open (file_path , 'r' ) as file :
17
29
return json .load (file )
18
30
19
31
@staticmethod
20
- def load_summary_report (file_path ):
32
+ def load_summary_report (file_path : Path ):
33
+ """Read in a summary report
34
+
35
+ Args:
36
+ file_path (Path): filepath of summary report xlsx
37
+
38
+ Returns:
39
+ pd.DataFrame: summary report DataFrame
40
+ """
21
41
reports = pd .read_excel (file_path , sheet_name = None , index_col = list (range (5 )))
22
42
return reports
23
43
24
- def compare_dict_keys (self , dict1 , dict2 , paths , path = '' , key_diffs = None ):
44
+ def compare_dict_keys (self ,
45
+ dict1 : dict ,
46
+ dict2 : dict ,
47
+ paths : list ,
48
+ path : str = '' ,
49
+ key_diffs : pd .DataFrame = None ):
25
50
"""Compares nested keys across two dictionaries by recursively searching each level
26
51
27
52
Args:
28
53
dict1 (dict): baseline dictionary to compare
29
54
dict2 (dict): new dictionary to compare
30
55
paths (list): paths to the original files from which the dictionaries are imported
31
- path (str, optional): current dictionary path at whcih to compare. Defaults to ''.
56
+ path (str, optional): current dictionary path at which to compare. Defaults to ''.
32
57
key_diffs (pd.DataFrame, optional): existing summary of difference. Defaults to None.
33
58
34
59
Returns:
35
60
pd.DataFrame: summary of differences specifying the file, the unique keys, and the
36
61
path that key is found at.
37
62
"""
38
63
if key_diffs is None :
39
- key_diffs = pd .DataFrame (columns = ["Results file" , "Unique key" , "Found at" ])
40
- keys1 = set (dict1 .keys ())
41
- keys2 = set (dict2 .keys ())
64
+ key_diffs = pd .DataFrame (columns = ["Results file" , "Unique key(s)" , "Found at" ])
65
+ keys1 , keys2 = set (dict1 .keys ()), set (dict2 .keys ())
42
66
only_in_dict1 = keys1 - keys2
43
67
only_in_dict2 = keys2 - keys1
44
68
69
+ # Write report rows if keys differ
70
+ diff_entries = []
45
71
if only_in_dict1 :
46
- new_row = pd .DataFrame ({"Results file" : f"{ paths [0 ].parent .name } /{ paths [0 ].name } " ,
47
- "Unique key" : str (only_in_dict1 ),
48
- "Found at" : path [2 :]}, index = [0 ])
49
- key_diffs = pd .concat ([key_diffs , new_row ], ignore_index = True )
72
+ diff_entries .extend ([{"Results file" : paths [0 ].as_posix (),
73
+ "Unique key(s)" : str (list (only_in_dict1 )),
74
+ "Found at" : path [2 :]}])
50
75
if only_in_dict2 :
51
- new_row = pd .DataFrame ({"Results file" : f"{ paths [1 ].parent .name } /{ paths [1 ].name } " ,
52
- "Unique key" : str (only_in_dict2 ),
53
- "Found at" : path [2 :]}, index = [0 ])
54
- key_diffs = pd .concat ([key_diffs , new_row ], ignore_index = True )
76
+ diff_entries .extend ([{"Results file" : paths [1 ].as_posix (),
77
+ "Unique key(s)" : str (list (only_in_dict2 )),
78
+ "Found at" : path [2 :]}])
79
+ if diff_entries :
80
+ key_diffs = pd .concat ([key_diffs , pd .DataFrame (diff_entries )], ignore_index = True )
55
81
56
- for key in keys1 .intersection (keys2 ):
82
+ # Recursively call if keys intersect
83
+ for key in keys1 & keys2 :
57
84
if isinstance (dict1 [key ], dict ) and isinstance (dict2 [key ], dict ):
58
85
key_diffs = self .compare_dict_keys (dict1 [key ],
59
86
dict2 [key ],
@@ -63,54 +90,82 @@ def compare_dict_keys(self, dict1, dict2, paths, path='', key_diffs=None):
63
90
64
91
return key_diffs
65
92
66
- def compare_dict_values (self , dict1 , dict2 , percent_threshold = 10 , abs_threshold = 1000 ):
93
+ def compare_dict_values (self , dict1 , dict2 , percent_threshold = 10 ):
67
94
"""Compares values across two dictionary by recursively searching keys until identifying
68
- values at common paths. Both thresholds must be met to report results.
95
+ values at common paths. The percent difference is only reported if the percentage
96
+ meets or exceeds the threshold and one or both values exceed the absolute value
97
+ threshold, which depends on the units of the values.
69
98
70
99
Args:
71
100
dict1 (dict): baseline dictionary to compare
72
101
dict2 (dict): new dictionary to compare
73
102
percent_threshold (int, optional): the percent difference threshold at which
74
103
differences are reported. Defaults to 10.
75
- abs_threshold (int, optional): the abosolute difference threshold at which differences
76
- are reported. Defaults to 10.
77
104
78
105
Returns:
79
106
pd.DataFrame: summary of percent differences that meet thresholds
80
107
"""
81
108
diff_report = {}
109
+ abs_threshold_map = {"USD" : 1000 , "MMBtu" : 1000 , "MMTons" : 10 }
82
110
83
- def compare_recursive (d1 , d2 , path = "" ):
111
+ # Recursively navigate dicts until finding numeric values at the same location to compare
112
+ def compare_recursive (d1 , d2 , path = "" , units = "" ):
84
113
for key in d1 .keys ():
85
114
current_path = f"{ path } ['{ key } ']"
115
+ units = next ((unit for unit in abs_threshold_map .keys () if unit in key ), units )
116
+ valid_nums = (int , float )
86
117
if isinstance (d1 [key ], dict ) and key in d2 :
87
- compare_recursive (d1 [key ], d2 [key ], current_path )
88
- elif isinstance (d1 [key ], (int , float )) and key in d2 :
89
- if isinstance (d2 [key ], (int , float )):
90
- val1 = d1 [key ]
91
- val2 = d2 [key ]
92
- if val1 != 0 :
93
- percent_change = ((val2 - val1 ) / val1 ) * 100
94
- if (abs (percent_change ) >= percent_threshold ) and \
95
- (abs (val1 ) >= abs_threshold or abs (val2 ) >= abs_threshold ):
96
- diff_report [current_path ] = percent_change
118
+ compare_recursive (d1 [key ], d2 [key ], current_path , units )
119
+ elif isinstance (d1 .get (key ), valid_nums ) and isinstance (d2 .get (key ), valid_nums ):
120
+ val1 , val2 = d1 [key ], d2 [key ]
121
+ if val1 == 0 :
122
+ percent_change = float ("inf" ) if val2 != 0 else 0
123
+ else :
124
+ percent_change = ((val2 - val1 ) / val1 ) * 100
125
+ abs_threshold = abs_threshold_map .get (units , float ("inf" ))
126
+ if (abs (percent_change ) >= percent_threshold ) and \
127
+ (abs (val1 ) >= abs_threshold or abs (val2 ) >= abs_threshold ):
128
+ diff_report [current_path ] = {"base" : val1 ,
129
+ "new" : val2 ,
130
+ "percent_diff" : percent_change }
97
131
98
132
compare_recursive (dict1 , dict2 )
99
133
return diff_report
100
134
101
- def split_json_key_path (self , path ):
135
+ def split_json_key_path (self , path : str ):
136
+ """Parse a string of nested keys found in a results json file
137
+
138
+ Args:
139
+ path (str): string of nested keys seperated by brackets
140
+
141
+ Returns:
142
+ list: list of individual keys
143
+ """
102
144
keys = re .findall (r"\['(.*?)'\]" , path )
103
145
if len (keys ) == 5 :
104
146
keys [4 :4 ] = [None , None , None ]
105
147
return keys
106
148
107
- def write_dict_key_report (self , diff_report , output_path ):
149
+ def write_dict_key_report (self , diff_report : pd .DataFrame , output_path : Path ):
150
+ """Writes a dictionary key report to a csv file
151
+
152
+ Args:
153
+ diff_report (pd.DataFrame): report with dictionary key differences
154
+ output_path (Path): csv output path
155
+ """
108
156
if diff_report .empty :
157
+ logger .info (f"No key differences found, { output_path } not written" )
109
158
return
110
159
diff_report .to_csv (output_path , index = False )
111
- print (f"Wrote dictionary key report to { output_path } " )
160
+ logger .info (f"Wrote dictionary key report to { output_path } " )
161
+
162
+ def write_dict_value_report (self , diff_report : pd .DataFrame , output_path : Path ):
163
+ """Writes a dictionary value report to a csv file
112
164
113
- def write_dict_value_report (self , diff_report , output_path ):
165
+ Args:
166
+ diff_report (pd.DataFrame): report with dictionary value differences
167
+ output_path (Path): csv output path
168
+ """
114
169
col_headers = [
115
170
"ECM" ,
116
171
"Markets and Savings Type" ,
@@ -123,20 +178,30 @@ def write_dict_value_report(self, diff_report, output_path):
123
178
]
124
179
df = pd .DataFrame (columns = ["Results path" ], data = list (diff_report .keys ()))
125
180
if df .empty :
181
+ logger .info (f"No changes above the threshold found, { output_path } not written" )
126
182
return
127
183
df [col_headers ] = df ["Results path" ].apply (self .split_json_key_path ).apply (pd .Series )
128
- df ["Percent difference" ] = [round (diff , 2 ) for diff in diff_report .values ()]
184
+ df ["Percent difference" ] = [
185
+ round (diff_dict ["percent_diff" ], 2 ) for diff_dict in diff_report .values ()]
186
+ df ["Base value" ] = [round (diff_dict ["base" ], 2 ) for diff_dict in diff_report .values ()]
187
+ df ["New value" ] = [round (diff_dict ["new" ], 2 ) for diff_dict in diff_report .values ()]
129
188
df = df .dropna (axis = 1 , how = "all" )
130
189
df .to_csv (output_path , index = False )
131
- print (f"Wrote dictionary value report to { output_path } " )
190
+ logger . info (f"Wrote dictionary value report to { output_path } " )
132
191
133
- def compare_jsons (self , json1_path , json2_path , output_dir = True ):
192
+ def compare_jsons (self ,
193
+ json1_path : Path ,
194
+ json2_path : Path ,
195
+ percent_threshold : float ,
196
+ output_dir : Path = None ):
134
197
"""Compare two jsons and report differences in keys and in values
135
198
136
199
Args:
137
200
json1_path (Path): baseline json file to compare
138
201
json2_path (Path): new json file to compare
139
- write_reports (bool, optional): _description_. Defaults to True.
202
+ percent_threshold (float): threshold for reporting percent difference if values
203
+ output_dir (Path, optional): output directory where comparison reports are saved.
204
+ Defaults to None.
140
205
"""
141
206
json1 = self .load_json (json1_path )
142
207
json2 = self .load_json (json2_path )
@@ -148,18 +213,21 @@ def compare_jsons(self, json1_path, json2_path, output_dir=True):
148
213
self .write_dict_key_report (key_diffs , output_dir / f"{ json2_path .stem } _key_diffs.csv" )
149
214
150
215
# Compare differences in json values
151
- val_diffs = self .compare_dict_values (json1 , json2 )
216
+ val_diffs = self .compare_dict_values (json1 , json2 , percent_threshold = percent_threshold )
152
217
self .write_dict_value_report (val_diffs , output_dir / f"{ json2_path .stem } _value_diffs.csv" )
153
218
154
- def compare_summary_reports (self , report1_path , report2_path , output_dir = None ):
219
+ def compare_summary_reports (self ,
220
+ report1_path : Path ,
221
+ report2_path : Path ,
222
+ output_dir : Path = None ):
155
223
"""Compare Summary_Data-TP.xlsx and Summary_Data-MAP.xlsx with baseline files
156
224
157
225
Args:
158
226
report1_path (Path): baseline summary report to compare
159
227
report2_path (Path): new summary report to compare
160
- output_dir (Path, optional): _description_. Defaults to None.
228
+ output_dir (Path, optional): output directory where comparison report is saved.
229
+ Defaults to None.
161
230
"""
162
-
163
231
reports1 = self .load_summary_report (report1_path )
164
232
reports2 = self .load_summary_report (report2_path )
165
233
if output_dir is None :
@@ -170,7 +238,7 @@ def compare_summary_reports(self, report1_path, report2_path, output_dir=None):
170
238
diff = (100 * (report2 - report1 )/ report1 ).round (2 )
171
239
diff = diff .reset_index ()
172
240
diff .to_excel (writer , sheet_name = output_type , index = False )
173
- print (f"Wrote Summary_Data percent difference report to { output_path } " )
241
+ logger . info (f"Wrote Summary_Data percent difference report to { output_path } " )
174
242
175
243
176
244
def main ():
@@ -194,10 +262,16 @@ def main():
194
262
new_dir = args .new_dir .resolve ()
195
263
agg_json_base = base_dir / "agg_results.json"
196
264
agg_json_new = new_dir / "agg_results.json"
197
- compare .compare_jsons (agg_json_base , agg_json_new , output_dir = new_dir )
265
+ compare .compare_jsons (agg_json_base ,
266
+ agg_json_new ,
267
+ percent_threshold = args .threshold ,
268
+ output_dir = new_dir )
198
269
ecm_json_base = base_dir / "ecm_results.json"
199
270
ecm_json_new = new_dir / "ecm_results.json"
200
- compare .compare_jsons (ecm_json_base , ecm_json_new , output_dir = new_dir )
271
+ compare .compare_jsons (ecm_json_base ,
272
+ ecm_json_new ,
273
+ percent_threshold = args .threshold ,
274
+ output_dir = new_dir )
201
275
202
276
summary_tp_base = base_dir / "Summary_Data-TP.xlsx"
203
277
summary_tp_new = new_dir / "plots" / "tech_potential" / "Summary_Data-TP.xlsx"
@@ -208,7 +282,9 @@ def main():
208
282
else :
209
283
# Compare only as specified by the arguments
210
284
if args .json_baseline and args .json_new :
211
- compare .compare_jsons (args .json_baseline , args .json_new )
285
+ compare .compare_jsons (args .json_baseline ,
286
+ args .json_new ,
287
+ percent_threshold = args .threshold )
212
288
if args .summary_baseline and args .summary_new :
213
289
compare .compare_summary_reports (args .summary_baseline , args .summary_new )
214
290
0 commit comments