# tpe.py
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit


class TrainingSpeedEstimator:
    """Implements training speed estimators from

    Ru, Robin, et al. "Speedy Performance Estimation for Neural Architecture Search."
    Advances in Neural Information Processing Systems 34 (2021).
    """

    def __init__(self, E=1, gamma=0.999, norm=True):
        """
        Parameters
        ----------
        E : int
            number of most recent epochs summed for the TSE-E estimator;
            earlier epochs are discarded as "burn-in"
        gamma : float
            hyperparameter for the exponential moving average in TSE-EMA
        norm : bool
            whether to normalize the loss data (divide by the maximum loss
            value) before fitting a curve; normalization generally gives
            better results (default True)
        """
        self.E = E
        self.gamma = gamma
        self.normalize = norm

    def estimate(self, df_train, T, df_energy=None):
        """
        Parameters
        ----------
        df_train : pandas DataFrame
            dataframe with 'epoch' and 'train_loss_step' columns
        T : int
            number of epochs to consider in the estimation
        df_energy : pandas DataFrame, optional
            dataframe with GPU index, timestamps, and power draw from nvidia-smi

        Returns
        -------
        tpe_dict : dict
            results from the three TSE estimators of the training loss curve,
            plus the total epoch count and per-epoch / per-step energy use
        """
        B = len(df_train[df_train['epoch'] == T])  # number of steps (minibatches) in an epoch
        T_end = df_train.iloc[-1].epoch + 1  # total number of epochs

        # TSE: summed per-epoch mean training loss over the first T epochs
        tse = df_train[df_train['epoch'] < T].train_loss_step.sum() / B

        # TSE-E: the same sum restricted to the E most recent epochs up to T
        tsee = df_train[
            (df_train['epoch'] >= T - self.E + 1) & (df_train['epoch'] <= T)
        ].train_loss_step.sum() / B

        # TSE-EMA: exponential moving average of the per-epoch mean losses
        tseema = 0
        for t in range(0, T + 1):
            epoch_loss = df_train[df_train['epoch'] == t].train_loss_step.sum() / B
            tseema += epoch_loss * self.gamma ** (T - t)

        if df_energy is not None:
            # Integrate power draw separately for each GPU device index
            energies = []
            for idx in df_energy[' index'].unique():
                df0 = df_energy[df_energy[' index'] == idx].reset_index(drop=True)
                energies.append(self._compute_energy(df0))
            total_energy = np.sum(energies) / 1e3  # joules -> kilojoules
        else:
            total_energy = 0

        energy_per_epoch = total_energy / T_end
        energy_per_step = total_energy / len(df_train)

        tpe_dict = {
            'tse': tse,
            'tsee': tsee,
            'tseema': tseema,
            'T_end': T_end,
            'energy_per_epoch (kJ)': energy_per_epoch,
            'energy_per_step (kJ)': energy_per_step,
        }
        return tpe_dict

    def _compute_energy(self, df):
        """Integrate one GPU's power-draw time series; returns energy in joules."""
        ts = pd.to_datetime(df['timestamp'])
        ts = (ts - ts[0]).dt.total_seconds().to_numpy()

        # Quadrature by the trapezoidal rule: average adjacent power samples
        # and weight each pair by the time elapsed between them
        deltas = ts[1:] - ts[:-1]
        power = df[' power.draw [W]'].to_numpy()
        avg_powers = 0.5 * (power[1:] + power[:-1])
        energy = deltas * avg_powers  # watts * seconds = joules
        return np.sum(energy)
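

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the original module):
# builds a synthetic training log and a synthetic per-second power log with
# the column names the estimator expects, then prints the TPE summary.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n_epochs, steps_per_epoch = 10, 50
    n_steps = n_epochs * steps_per_epoch

    # Decaying noisy loss curve, logged once per minibatch step
    df_train = pd.DataFrame({
        "epoch": np.repeat(np.arange(n_epochs), steps_per_epoch),
        "train_loss_step": np.exp(-np.linspace(0, 3, n_steps)) + 0.05 * rng.random(n_steps),
    })

    # Synthetic stand-in for an nvidia-smi power log (one GPU, 60 samples);
    # note the leading spaces in the column names, as pd.read_csv would keep them
    df_energy = pd.DataFrame({
        " index": np.zeros(60, dtype=int),
        "timestamp": pd.Timestamp("2024-01-01") + pd.to_timedelta(np.arange(60), unit="s"),
        " power.draw [W]": 150.0 + 10.0 * rng.random(60),
    })

    estimator = TrainingSpeedEstimator(E=2, gamma=0.999)
    print(estimator.estimate(df_train, T=5, df_energy=df_energy))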