clementpoiret · Sep 3, 2019
diff --git a/‎.gitignore
+3-2 b/‎.gitignore
+3-2
diff --git a/‎README.md
+16-7 b/‎README.md
+16-7
diff --git a/‎cpai.py
+23-8 b/‎cpai.py
+23-8
diff --git a/‎model.png
192 Bytes b/‎model.png
192 Bytes
diff --git a/‎prediction.csv
+2,081 b/‎prediction.csv
+2,081
diff --git a/‎prediction.png
-343 Bytes b/‎prediction.png
-343 Bytes
diff --git a/‎utils/helpers.py
+38-18 b/‎utils/helpers.py
+38-18
diff --git a/‎utils/neuralnet/model.py
+9-9 b/‎utils/neuralnet/model.py
+9-9
@@ -54,7 +54,8 @@ utils/cryptocurrency/credentials
 
 # Confidential informations
 credentials
-MinMaxScaler.pkl
-MinMaxScaler_predict.pkl
+
+# Models
+scalers/*
 weights.h5
 models/regressor.h5
@@ -31,28 +31,37 @@ the RNN:
 - ADX,
 - Stochastic RSI.
 
+Once all features are collected, a PCA is computed, and the components
+explaining 99% of the variance are kept. Let x be the number of components kept.
+
 ### Model
 
 Currently used model is pretty basic. It's a stacked LSTM model taking
-an input of shape (2048, 60).
+an input of shape (2048, x).
 
-There are 4 LSTM layers of 65 neurons each,with a relu activation function
-connected to a Dense layer (output layer) of 32 neurons with a linear
-activation function.
-Each LSTM layer has a Dropout rate of .2.
+There are 4 Gated Recurrent Unit (GRU) layers of 256, 128, 64 and 32
+neurons respectively, each with a parametric ReLU activation function,
+connected to a Dense layer (output layer) of 32 neurons with a sigmoid
+activation function.
 
 As of now, the regressor is using a classical mean squared error loss
-function, with a rmsprop optimizer, a batch size of 64 and 128 epochs.
+function, with a rmsprop optimizer, a batch size of 32 and 128 epochs.
 
 ![model](model.png)
 
 *The model needs hyperparameters tuning.*
 
+Here is the resulting prices array with historical price (left), and
+prediction (right):
+
+![prediction](prediction.png)
+
 ## Note
 
 - Early and unstable,
 - Basic RNN still not tuned,
 - Integrating Google Trends is theoretically working, but pytrends' API isn't
 working,
 - History is heavily limited by the quantity of social data, but this is
-related to CryptoCompare's API.
+related to CryptoCompare's API,
+- PCA is linear; why not use KernelPCA or autoencoders?
@@ -26,6 +26,8 @@
 import matplotlib.pyplot as plt
 import joblib
 
+from keras.models import load_model
+
 # Global variables
 N_FUTURE = 32
 N_PAST = 2048
@@ -35,6 +37,7 @@ def get_datasets(validation_set=False):
     data = hp.get_data()
     data.to_csv("tmp/data.csv", index=False)
 
+    time = data.time
     data = data.drop(columns=["time"])
 
     if validation_set:
@@ -47,27 +50,33 @@ def get_datasets(validation_set=False):
     else:
         X_train, y_train = hp.preprocessing_pipeline(data, N_PAST, N_FUTURE)
 
-    return data, X_train, y_train
+    return time, data, X_train, y_train
 
 
 def main():
     """Here we go again... Main function, getting data,
     training model, and computing predictions."""
 
     print("Getting X_train and y_train...")
-    data, X_train, y_train = get_datasets()
+    time, data, X_train, y_train = get_datasets()
 
+    #regressor = load_model("models/regressor.h5")
     print("Building regressor...")
-    regressor, history = md.train_model(X_train,
-                                        y_train,
-                                        N_PAST,
-                                        optimizer="rmsprop",
-                                        batch_size=64,
-                                        epochs=30)
+    regressor = md.train_model(X_train,
+                               y_train,
+                               N_PAST,
+                               optimizer="rmsprop",
+                               batch_size=64,
+                               epochs=30)
     regressor.save("models/regressor.h5")
 
     print("Getting last {} hours to predict next {} hours...".format(
         N_PAST, N_FUTURE))
+
+    timepred = np.concatenate(
+        (time[-N_PAST:].values,
+         [time.iloc[-1] + (1 + n) * 3600 for n in range(N_FUTURE)]))
+
     last = data.iloc[-N_PAST:, :]
     last = hp.preprocessing_pipeline(last,
                                      N_PAST,
@@ -86,6 +95,12 @@ def main():
     plt.axvline(N_PAST, linestyle=":")
     plt.savefig("prediction.png")
     plt.show()
+
+    pd.DataFrame({
+        "time": timepred,
+        "prediction": prices[:, 0]
+    }).to_csv("prediction.csv")
+
     #prediction = regressor.predict(X_test)[0].reshape(-1, 1)
     #prediction = sc.inverse_transform(prediction)
 
 
@@ -10,8 +10,8 @@
 import utils.technicalanalysis.indicators as ind
 import joblib
 
-from impyute.imputation.ts import moving_window
-from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.decomposition import PCA
 
 
 def impute_ts(X):
@@ -33,19 +33,18 @@ def merge_truncate(historical, social):
     return data
 
 
-def scale(X):
-    sc = MinMaxScaler(feature_range=(0, 1))
-    sc_predict = MinMaxScaler(feature_range=(0, 1))
-
+def scale(X,
+          scaler=MinMaxScaler(feature_range=(0, 1)),
+          save=True,
+          filename="MinMaxScaler"):
+    sc = scaler
     X_scaled = sc.fit_transform(X)
 
-    if not os.path.exists("scalers/"):
-        os.mkdir("scalers/")
-
-    joblib.dump(sc, "scalers/MinMaxScaler.pkl")
+    if save:
+        if not os.path.exists("scalers/"):
+            os.mkdir("scalers/")
 
-    sc_predict.fit(X[:, 0:1])
-    joblib.dump(sc_predict, "scalers/MinMaxScaler_predict.pkl")
+        joblib.dump(sc, "scalers/{}.pkl".format(filename))
 
     return X_scaled
 
@@ -57,23 +56,44 @@ def preprocessing_pipeline(X, n_past, n_future, is_testing_set=False):
             columns_to_drop.append(col)
     preprocessed = X.drop(columns=columns_to_drop)
 
-    # for col in preprocessed.columns:
-    #     if (preprocessed[col] == 0).any():
-    #         preprocessed[col] = impute_ts(X[col])
-
     preprocessed = preprocessed.astype(float)
     preprocessed = preprocessed.values
 
+    close = preprocessed[:, 0]
+    preprocessed = preprocessed[:, 1:]
+
     if is_testing_set:
-        sc = joblib.load("scalers/MinMaxScaler.pkl")
-        preprocessed = sc.transform(preprocessed)
+        #! to update
+        stdsc = joblib.load("scalers/StandardScaler.pkl")
+        pca = joblib.load("scalers/pca.pkl")
+        mmsc = joblib.load("scalers/MinMaxScaler.pkl")
+        mmsc_pred = joblib.load("scalers/MinMaxScaler_predict.pkl")
+
+        preprocessed = stdsc.transform(preprocessed)
+        preprocessed = pca.transform(preprocessed)
+        preprocessed = mmsc.transform(preprocessed)
+
+        close = mmsc_pred.transform(close.reshape(-1, 1))
+        preprocessed = np.concatenate([close, preprocessed], axis=1)
 
         X_test = np.array([preprocessed])
 
         return X_test
 
     else:
+        preprocessed = scale(preprocessed,
+                             scaler=StandardScaler(),
+                             save=True,
+                             filename="StandardScaler")
+
+        pca = PCA(.99)
+        preprocessed = pca.fit_transform(preprocessed)
+        joblib.dump(pca, "scalers/pca.pkl")
+
+        close = scale(close.reshape(-1, 1), filename="MinMaxScaler_predict")
         preprocessed = scale(preprocessed)
+        preprocessed = np.concatenate((close, preprocessed), axis=1)
+
         X_train = [
             preprocessed[i - n_past:i, :]
             for i in range(n_past,
 
@@ -75,15 +75,15 @@ def train_model(X_train,
 
     tb = TensorBoard('logs')
 
-    history = regressor.fit(X_train,
-                            y_train,
-                            epochs=epochs,
-                            callbacks=[es, rlr, mcp, tb],
-                            verbose=1,
-                            validation_split=validation_split,
-                            batch_size=batch_size)
-
-    return regressor, history
+    regressor.fit(X_train,
+                  y_train,
+                  epochs=epochs,
+                  callbacks=[es, rlr, mcp, tb],
+                  verbose=1,
+                  validation_split=validation_split,
+                  batch_size=batch_size)
+
+    return regressor
 
 
 def tune(X_train, y_train, parameters, cv=4, n_jobs=-1):