Added PCR and comparative visualisation with 2 components

gustavovelascoh · Oct 12, 2018 · f5ce8c6 · f5ce8c6
1 parent 86173a2
commit f5ce8c6
Show file tree

Hide file tree

Showing 2 changed files with 77 additions and 57 deletions.
diff --git a/data_vis.py b/data_vis.py
@@ -10,9 +10,9 @@ def plot_ds(ax, wls, xdata, ydata):
     '''
     Plot dataset
     ax: matplotlib Axes3DSubplot
-    wls: Numpy ndarray
-    xdata: Pandas DataFrame
-    ydata: Pandas Series
+    wls: Numpy ndarray: List of wavelength
+    xdata: Pandas DataFrame: Measurements
+    ydata: Pandas Series: Octane numbers
     '''
 
     wls_len = len(wls)
@@ -23,8 +23,7 @@ def plot_ds(ax, wls, xdata, ydata):
 
         ax.plot(on_ar, wls, vals)
 
-if __name__ == "__main__":
-
+def main():
     FILENAME = "octane.xlsx"
     oct_df = pandas.read_excel(FILENAME)
     FILENAME = "gasoline.csv"
@@ -64,3 +63,6 @@ def plot_ds(ax, wls, xdata, ydata):
     ax1.set_zlabel("Value")
 
     plt.show()
+
+if __name__ == "__main__":
+    main()
diff --git a/plsr_example.py b/plsr_example.py
@@ -4,85 +4,103 @@
 import pandas
 import matplotlib.pyplot as plt
 from sklearn.cross_decomposition import PLSRegression
+from sklearn.cross_decomposition import PLSCanonical
+from sklearn.decomposition import PCA
+from sklearn.linear_model import LinearRegression
 from sklearn.metrics import r2_score
 
 
-FILENAME = "octane.xlsx"
-
+def import_dataset(ds_name="octane"):
+    '''
+    ds_name: Name of the dataset ("octane", "gasoline")
+    Returns:
+    wls: Numpy ndarray: List of wavelength
+    xdata: Pandas DataFrame: Measurements
+    ydata: Pandas Series: Octane numbers
+    '''
+
+    if ds_name == "octane":
+        oct_df = pandas.read_excel("octane.xlsx")
+        wls = np.array([ int(i) for i in oct_df.columns.values[2:]])
+        xdata = oct_df.loc[:,'1100':]
+        ydata = oct_df['Octane number']
+    elif ds_name == "gasoline":
+        import re
+
+        gas_df = pandas.read_csv("gasoline.csv")
+        reg = re.compile('([0-9]+)')
+        wls = np.array([ int(reg.findall(i)[0]) for i in gas_df.columns.values[1:]])
+        xdata = gas_df.loc[:,'NIR.900 nm':]
+        ydata = gas_df['octane']
+    else:
+        exit("Invalid Dataset")
+
+    return wls, xdata, ydata
 
 if __name__ == "__main__":
-
-    data = pandas.read_excel(FILENAME)
-
-    print(type(data), np.shape(data))
-    print(data.loc[0:3,'Octane number':'1108']) 
-
-    wls = np.array([ int(i) for i in data.columns.values[2:]])
-    print(wls)
-    wls_len = len(wls)
 
-    x_data = data.loc[:,'1100':]
-    y_data = data['Octane number']
+    wls, xdata, ydata = import_dataset("gasoline")
 
-    total_variance_in_y = np.var(y_data, axis = 0)
+    total_variance_in_y = np.var(ydata, axis = 0)
     nc=10
 
+    pls = PLSRegression(n_components=nc)
+    pls2 = PLSRegression(n_components=2)
+    pca = PCA(n_components=2)
+    #pls2 = PLSCanonical(n_components=nc)
+    pls.fit(xdata, ydata)
+    pls2.fit(xdata, ydata)
+    pca.fit(xdata)
 
-    pls2 = PLSRegression(n_components=nc)
-    pls2.fit(x_data, y_data)
-
-    variance_in_y = np.var(pls2.y_scores_, axis = 0)
+    y2fit = pls2.predict(xdata)
 
-    print("xw ", pls2.x_weights_)
-    print("yw ", pls2.y_weights_)
-    print("xl ", pls2.x_loadings_)
-    print("yl ", pls2.y_loadings_)
-    print("yl ", pls2.y_scores_)
+    pca_scores = pca.transform(xdata)
+    pcr = LinearRegression().fit(pca_scores[:,0:2], ydata)
+    yPfit = pcr.predict(pca.transform(xdata))
 
+    variance_in_y = np.var(pls.y_scores_, axis = 0)
     fractions_of_explained_variance = variance_in_y / total_variance_in_y
 
+    TSS = np.sum((ydata-np.mean(ydata))**2 )
+
+
+    print(np.shape(y2fit[:,0]))
+    RSS_PLS = np.sum(np.subtract(ydata,y2fit[:,0])**2 )
+    r2PLS = 1 - RSS_PLS/TSS
+
+    RSS_PCR = np.sum((ydata-yPfit)**2 )
+    r2PCR = 1 - RSS_PCR/TSS
+
     r2_sum = 0
-    
+
     fev = []
 
     for i in range(0,nc):
-        Y_pred=np.dot(pls2.x_scores_[:,i].reshape(-1,1),
-                        pls2.y_loadings_[:,i].reshape(-1,1).T) * y_data.std(axis=0, ddof=1) + y_data.mean(axis=0)
-        r2_sum += round(r2_score(y_data,Y_pred),3) 
+        Y_pred=np.dot(pls.x_scores_[:,i].reshape(-1,1),
+                        pls.y_loadings_[:,i].reshape(-1,1).T) * ydata.std(axis=0, ddof=1) + ydata.mean(axis=0)
+        r2_sum += round(r2_score(ydata,Y_pred),3)
         fev.append(r2_sum)
-        print('R2 for %d component: %g' %(i+1,round(r2_score(y_data,Y_pred),3)))
+        print('R^2 for %d component: %g, cummulative: %g' %(i+1,round(r2_score(ydata,Y_pred),3), r2_sum))
 
-    print('R2 for all components (): %g' %r2_sum) #Sum of above
+    print('R^2 for all components (): %g' %r2_sum) #Sum of above
 
-    ax = plt.axes()
+    f, (ax, ax2) = plt.subplots(1,2)
 
 #    ax.plot([np.sum(fractions_of_explained_variance[0:i]) for i in range(len(fractions_of_explained_variance))],'-bo')
-    ax.plot(fev,'-bo')
+    ax.plot(range(1,11),fev,'-bo')
     ax.set_xlabel("Number of PLS Components")
     ax.set_ylabel("Percent Variance Explained in Y")
 
-    plt.show()
 
-    exit()
-    ax = plt.axes(projection='3d')
-
-    for idx, rdata in data.iterrows():
-        on = rdata['Octane number']
-        vals = rdata.loc['1100':]
-        print(on)
-        on_ar = np.full(wls_len, on)
-
-        print(on_ar, wls, vals)
-
-        ax.plot(on_ar, wls, vals)
-
-    ax.set_xlabel("Octane number")
-    ax.set_ylabel("Wavelength")
-    ax.set_zlabel("Value")
-    plt.show()
-
-    exit()
+    print("asdasd")
+    ax2.plot(ydata, y2fit, ' ob')
+    ax2.plot(ydata, yPfit, ' ^r')
+    ax2.set_xlim(83,90)
+    ax2.set_ylim(83,90)
+    ax2.set_xlabel("Observed Response")
+    ax2.set_ylabel("Fitted Response")
+    ax2.legend(["PLSR with 2 Components  (R2 = %2.4f)" % (r2PLS),
+                "PCR with 2 Components  (R2 = %2.4f)" % (r2PCR)])
 
 
-    ax.plot(data.loc[:,'Octane number'], data.loc[:,'1100':].T, ' .')
     plt.show()