Skip to content

Commit 302d0ce

Browse files
committedSep 3, 2019
Using PCA for dimensionality reduction
1 parent acbf3fa commit 302d0ce

File tree

8 files changed

+2170
-44
lines changed

8 files changed

+2170
-44
lines changed
 

‎.gitignore

+3-2
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@ utils/cryptocurrency/credentials
5454

5555
# Confidential information
5656
credentials
57-
MinMaxScaler.pkl
58-
MinMaxScaler_predict.pkl
57+
58+
# Models
59+
scalers/*
5960
weights.h5
6061
models/regressor.h5

‎README.md

+16-7
Original file line numberDiff line numberDiff line change
@@ -31,28 +31,37 @@ the RNN:
3131
- ADX,
3232
- Stochastic RSI.
3333

34+
Once all features are collected, a PCA is computed, and the components that
35+
together explain 99% of the variance are kept. Let x be the number of components kept.
36+
3437
### Model
3538

3639
The model currently used is pretty basic. It's a stacked LSTM model taking
37-
an input of shape (2048, 60).
40+
an input of shape (2048, x).
3841

39-
There are 4 LSTM layers of 65 neurons each,with a relu activation function
40-
connected to a Dense layer (output layer) of 32 neurons with a linear
41-
activation function.
42-
Each LSTM layer has a Dropout rate of .2.
42+
There are 4 Gated Recurrent Unit (GRU) layers of 256, 128, 64 and 32
43+
neurons respectively, with a parametric ReLU activation function, connected
44+
to a Dense layer (output layer) of 32 neurons with a sigmoid activation
45+
function.
4346

4447
As of now, the regressor is using a classical mean squared error loss
45-
function, with a rmsprop optimizer, a batch size of 64 and 128 epochs.
48+
function, with a rmsprop optimizer, a batch size of 32 and 128 epochs.
4649

4750
![model](model.png)
4851

4952
*The model needs hyperparameter tuning.*
5053

54+
Here is the resulting price series, with historical prices (left) and
55+
predictions (right):
56+
57+
![prediction](prediction.png)
58+
5159
## Note
5260

5361
- Early and unstable,
5462
- Basic RNN still not tuned,
5563
- Google Trends integration should work in theory, but pytrends' API isn't
5664
working,
5765
- History is heavily limited by the quantity of social data, but this is
58-
related to CryptoCompare's API.
66+
related to CryptoCompare's API,
67+
- PCA is linear; why not use KernelPCA or autoencoders?

‎cpai.py

+23-8
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
import matplotlib.pyplot as plt
2727
import joblib
2828

29+
from keras.models import load_model
30+
2931
# Global variables
3032
N_FUTURE = 32
3133
N_PAST = 2048
@@ -35,6 +37,7 @@ def get_datasets(validation_set=False):
3537
data = hp.get_data()
3638
data.to_csv("tmp/data.csv", index=False)
3739

40+
time = data.time
3841
data = data.drop(columns=["time"])
3942

4043
if validation_set:
@@ -47,27 +50,33 @@ def get_datasets(validation_set=False):
4750
else:
4851
X_train, y_train = hp.preprocessing_pipeline(data, N_PAST, N_FUTURE)
4952

50-
return data, X_train, y_train
53+
return time, data, X_train, y_train
5154

5255

5356
def main():
5457
"""Here we go again... Main function, getting data,
5558
training model, and computing predictions."""
5659

5760
print("Getting X_train and y_train...")
58-
data, X_train, y_train = get_datasets()
61+
time, data, X_train, y_train = get_datasets()
5962

63+
#regressor = load_model("models/regressor.h5")
6064
print("Building regressor...")
61-
regressor, history = md.train_model(X_train,
62-
y_train,
63-
N_PAST,
64-
optimizer="rmsprop",
65-
batch_size=64,
66-
epochs=30)
65+
regressor = md.train_model(X_train,
66+
y_train,
67+
N_PAST,
68+
optimizer="rmsprop",
69+
batch_size=64,
70+
epochs=30)
6771
regressor.save("models/regressor.h5")
6872

6973
print("Getting last {} hours to predict next {} hours...".format(
7074
N_PAST, N_FUTURE))
75+
76+
timepred = np.concatenate(
77+
(time[-N_PAST:].values,
78+
[time.iloc[-1] + (1 + n) * 3600 for n in range(N_FUTURE)]))
79+
7180
last = data.iloc[-N_PAST:, :]
7281
last = hp.preprocessing_pipeline(last,
7382
N_PAST,
@@ -86,6 +95,12 @@ def main():
8695
plt.axvline(N_PAST, linestyle=":")
8796
plt.savefig("prediction.png")
8897
plt.show()
98+
99+
pd.DataFrame({
100+
"time": timepred,
101+
"prediction": prices[:, 0]
102+
}).to_csv("prediction.csv")
103+
89104
#prediction = regressor.predict(X_test)[0].reshape(-1, 1)
90105
#prediction = sc.inverse_transform(prediction)
91106

‎model.png

192 Bytes
Loading

‎prediction.csv

+2,081
Large diffs are not rendered by default.

‎prediction.png

-343 Bytes
Loading

‎utils/helpers.py

+38-18
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010
import utils.technicalanalysis.indicators as ind
1111
import joblib
1212

13-
from impyute.imputation.ts import moving_window
14-
from sklearn.preprocessing import MinMaxScaler
13+
from sklearn.preprocessing import MinMaxScaler, StandardScaler
14+
from sklearn.decomposition import PCA
1515

1616

1717
def impute_ts(X):
@@ -33,19 +33,18 @@ def merge_truncate(historical, social):
3333
return data
3434

3535

36-
def scale(X):
37-
sc = MinMaxScaler(feature_range=(0, 1))
38-
sc_predict = MinMaxScaler(feature_range=(0, 1))
39-
36+
def scale(X,
37+
scaler=MinMaxScaler(feature_range=(0, 1)),
38+
save=True,
39+
filename="MinMaxScaler"):
40+
sc = scaler
4041
X_scaled = sc.fit_transform(X)
4142

42-
if not os.path.exists("scalers/"):
43-
os.mkdir("scalers/")
44-
45-
joblib.dump(sc, "scalers/MinMaxScaler.pkl")
43+
if save:
44+
if not os.path.exists("scalers/"):
45+
os.mkdir("scalers/")
4646

47-
sc_predict.fit(X[:, 0:1])
48-
joblib.dump(sc_predict, "scalers/MinMaxScaler_predict.pkl")
47+
joblib.dump(sc, "scalers/{}.pkl".format(filename))
4948

5049
return X_scaled
5150

@@ -57,23 +56,44 @@ def preprocessing_pipeline(X, n_past, n_future, is_testing_set=False):
5756
columns_to_drop.append(col)
5857
preprocessed = X.drop(columns=columns_to_drop)
5958

60-
# for col in preprocessed.columns:
61-
# if (preprocessed[col] == 0).any():
62-
# preprocessed[col] = impute_ts(X[col])
63-
6459
preprocessed = preprocessed.astype(float)
6560
preprocessed = preprocessed.values
6661

62+
close = preprocessed[:, 0]
63+
preprocessed = preprocessed[:, 1:]
64+
6765
if is_testing_set:
68-
sc = joblib.load("scalers/MinMaxScaler.pkl")
69-
preprocessed = sc.transform(preprocessed)
66+
#! to update
67+
stdsc = joblib.load("scalers/StandardScaler.pkl")
68+
pca = joblib.load("scalers/pca.pkl")
69+
mmsc = joblib.load("scalers/MinMaxScaler.pkl")
70+
mmsc_pred = joblib.load("scalers/MinMaxScaler_predict.pkl")
71+
72+
preprocessed = stdsc.transform(preprocessed)
73+
preprocessed = pca.transform(preprocessed)
74+
preprocessed = mmsc.transform(preprocessed)
75+
76+
close = mmsc_pred.transform(close.reshape(-1, 1))
77+
preprocessed = np.concatenate([close, preprocessed], axis=1)
7078

7179
X_test = np.array([preprocessed])
7280

7381
return X_test
7482

7583
else:
84+
preprocessed = scale(preprocessed,
85+
scaler=StandardScaler(),
86+
save=True,
87+
filename="StandardScaler")
88+
89+
pca = PCA(.99)
90+
preprocessed = pca.fit_transform(preprocessed)
91+
joblib.dump(pca, "scalers/pca.pkl")
92+
93+
close = scale(close.reshape(-1, 1), filename="MinMaxScaler_predict")
7694
preprocessed = scale(preprocessed)
95+
preprocessed = np.concatenate((close, preprocessed), axis=1)
96+
7797
X_train = [
7898
preprocessed[i - n_past:i, :]
7999
for i in range(n_past,

‎utils/neuralnet/model.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -75,15 +75,15 @@ def train_model(X_train,
7575

7676
tb = TensorBoard('logs')
7777

78-
history = regressor.fit(X_train,
79-
y_train,
80-
epochs=epochs,
81-
callbacks=[es, rlr, mcp, tb],
82-
verbose=1,
83-
validation_split=validation_split,
84-
batch_size=batch_size)
85-
86-
return regressor, history
78+
regressor.fit(X_train,
79+
y_train,
80+
epochs=epochs,
81+
callbacks=[es, rlr, mcp, tb],
82+
verbose=1,
83+
validation_split=validation_split,
84+
batch_size=batch_size)
85+
86+
return regressor
8787

8888

8989
def tune(X_train, y_train, parameters, cv=4, n_jobs=-1):

0 commit comments

Comments
 (0)
Please sign in to comment.