-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpickle_find_pattern.py
175 lines (141 loc) · 5.49 KB
/
pickle_find_pattern.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
'''
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Load the provided data
with open("prof_database.pkl", "rb") as f:
data = pickle.load(f)
# Function to extract X and Y values from the dataset
def extract_data(dataset, key):
x = [point[0] for point in dataset[key]]
y = [point[1] for point in dataset[key]]
return np.array(x).reshape(-1, 1), np.array(y)
# Prepare data for linear regression
key = ('default', (4, 8))
all_gather_cost_dict = data[key].all_gather_cost_dict
for config, cost_data in all_gather_cost_dict.items():
X, y = extract_data(all_gather_cost_dict, config)
# Linear Regression
lr = LinearRegression()
lr.fit(X, y)
y_pred = lr.predict(X)
r2 = r2_score(y, y_pred)
# Print accuracy results
print(f"Configuration: {config}")
print(f"R2 score: {r2:.2f}")
print(f"Slope: {lr.coef_[0]}")
print(f"Intercept: {lr.intercept_}\n")
# Visualization
plt.scatter(X, y, label=f"{config} R2: {r2:.2f}")
plt.plot(X, y_pred)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("Ranks")
plt.ylabel("Cost")
# Move the legend outside of the plot
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.title("All Gather Cost vs Ranks for Different Configurations")
# Save the plot
plt.savefig("all_gather_cost_vs_ranks.png", bbox_inches='tight')
# Show the plot
plt.show()
'''
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from decimal import Decimal
from sklearn.preprocessing import StandardScaler
# Load the provided profiling database; each value exposes the
# *_cost_dict attributes iterated over below.
# NOTE(review): pickle.load executes arbitrary code from the file —
# only load a trusted prof_database.pkl.
with open("prof_database.pkl", "rb") as f:
    data = pickle.load(f)
# Function to extract X and Y values from the dataset
def extract_data(dataset, key):
x = [point[0] for point in dataset[key]]
#x = [Decimal(point[0]) for point in dataset[key]]
y = [point[1] for point in dataset[key]]
#y = [Decimal(point[1]) for point in dataset[key]]
# Replace extreme values with the maximum finite representable value for float64
x = np.clip(x, np.finfo(np.float64).min, np.finfo(np.float64).max)
y = np.clip(y, np.finfo(np.float64).min, np.finfo(np.float64).max)
# Replace NaN values with the mean of the non-NaN elements in the array
y = np.where(np.isnan(y), np.nanmean(y), y)
return np.array(x).reshape(-1, 1), np.array(y)
# Function to filter out infinity or large values from X and y
def filter_data(X, y):
X_flat = np.ravel(X)
overflow_mask = (X_flat < np.finfo(np.float32).max) & (y < np.finfo(np.float32).max)
return X[overflow_mask], y[overflow_mask]
# Function to apply log transformation to y values
def apply_log_transform(y):
return np.log(y)
# List of attributes
attributes = [
'all_gather_cost_dict',
'all_reduce_cost_dict',
'all_to_all_cost_dict',
'reduce_scatter_cost_dict',
'available_memory_per_device',
'dot_cost_dict',
'conv_cost_dict',
'op_cost_dict',
]
# Loop through all keys
for key in data.keys():
# Loop through all attributes
for attr in attributes:
attribute_dict = getattr(data[key], attr)
if not isinstance(attribute_dict, dict):
continue
for config, cost_data in attribute_dict.items():
X, y = extract_data(attribute_dict, config)
# Filter out infinity or large values from X and y
X, y = filter_data(X, y)
# Apply log transformation to y values
y = np.where(np.isnan(y), np.nanmean(y), y)
# Initialize the scaler
scaler = StandardScaler()
if X.size == 0 or y.size == 0:
print(f"Empty arrays encountered for {key}, {attr}, {config}. Skipping...")
continue
y_norm = scaler.fit_transform(y.reshape(-1, 1)).reshape(-1)
# Linear Regression
lr = LinearRegression()
lr.fit(X, y_norm)
y_pred_norm = lr.predict(X)
# Rescale predictions back to original scale
y_pred = scaler.inverse_transform(y_pred_norm.reshape(-1, 1)).reshape(-1)
r2 = r2_score(y, y_pred)
'''
# Linear Regression
lr = LinearRegression()
lr.fit(X, y)
y_pred = lr.predict(X)
r2 = r2_score(y, y_pred)
'''
# Print accuracy results
print(f"Key: {key}")
print(f"Attribute: {attr}")
print(f"Configuration: {config}")
print(f"R2 score: {r2:.2f}")
print(f"Slope: {lr.coef_[0]}")
print(f"Intercept: {lr.intercept_}\n")
# Visualization
plt.scatter(X, y, label=f"{config} R2: {r2:.2f}")
plt.plot(X, y_pred)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("Number of Parameters")
plt.ylabel("Cost")
# Move the legend outside of the plot
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.title(f"{attr.capitalize()} vs Ranks for Different Configurations ({key})")
# Save the plot
plt.savefig(f"{attr}_vs_ranks_{key}.png", bbox_inches='tight')
# Show the plot
plt.show()
# Clear the plot for the next attribute
plt.clf()