diff --git a/data_vis.py b/data_vis.py index baadad8..14d6b4e 100644 --- a/data_vis.py +++ b/data_vis.py @@ -10,9 +10,9 @@ def plot_ds(ax, wls, xdata, ydata): ''' Plot dataset ax: matplotlib Axes3DSubplot - wls: Numpy ndarray - xdata: Pandas DataFrame - ydata: Pandas Series + wls: Numpy ndarray: List of wavelength + xdata: Pandas DataFrame: Measurements + ydata: Pandas Series: Octane numbers ''' wls_len = len(wls) @@ -23,8 +23,7 @@ def plot_ds(ax, wls, xdata, ydata): ax.plot(on_ar, wls, vals) -if __name__ == "__main__": - +def main(): FILENAME = "octane.xlsx" oct_df = pandas.read_excel(FILENAME) FILENAME = "gasoline.csv" @@ -64,3 +63,6 @@ def plot_ds(ax, wls, xdata, ydata): ax1.set_zlabel("Value") plt.show() + +if __name__ == "__main__": + main() diff --git a/plsr_example.py b/plsr_example.py index dac800a..7aa40de 100644 --- a/plsr_example.py +++ b/plsr_example.py @@ -4,85 +4,103 @@ import pandas import matplotlib.pyplot as plt from sklearn.cross_decomposition import PLSRegression +from sklearn.cross_decomposition import PLSCanonical +from sklearn.decomposition import PCA +from sklearn.linear_model import LinearRegression from sklearn.metrics import r2_score -FILENAME = "octane.xlsx" - +def import_dataset(ds_name="octane"): + ''' + ds_name: Name of the dataset ("octane", "gasoline") + Returns: + wls: Numpy ndarray: List of wavelength + xdata: Pandas DataFrame: Measurements + ydata: Pandas Series: Octane numbers + ''' + + if ds_name == "octane": + oct_df = pandas.read_excel("octane.xlsx") + wls = np.array([ int(i) for i in oct_df.columns.values[2:]]) + xdata = oct_df.loc[:,'1100':] + ydata = oct_df['Octane number'] + elif ds_name == "gasoline": + import re + + gas_df = pandas.read_csv("gasoline.csv") + reg = re.compile('([0-9]+)') + wls = np.array([ int(reg.findall(i)[0]) for i in gas_df.columns.values[1:]]) + xdata = gas_df.loc[:,'NIR.900 nm':] + ydata = gas_df['octane'] + else: + exit("Invalid Dataset") + + return wls, xdata, ydata if __name__ == "__main__": - - data = pandas.read_excel(FILENAME) - - print(type(data), np.shape(data)) - print(data.loc[0:3,'Octane number':'1108']) - - wls = np.array([ int(i) for i in data.columns.values[2:]]) - print(wls) - wls_len = len(wls) - x_data = data.loc[:,'1100':] - y_data = data['Octane number'] + wls, xdata, ydata = import_dataset("gasoline") - total_variance_in_y = np.var(y_data, axis = 0) + total_variance_in_y = np.var(ydata, axis = 0) nc=10 + pls = PLSRegression(n_components=nc) + pls2 = PLSRegression(n_components=2) + pca = PCA(n_components=2) + #pls2 = PLSCanonical(n_components=nc) + pls.fit(xdata, ydata) + pls2.fit(xdata, ydata) + pca.fit(xdata) - pls2 = PLSRegression(n_components=nc) - pls2.fit(x_data, y_data) - - variance_in_y = np.var(pls2.y_scores_, axis = 0) + y2fit = pls2.predict(xdata) - print("xw ", pls2.x_weights_) - print("yw ", pls2.y_weights_) - print("xl ", pls2.x_loadings_) - print("yl ", pls2.y_loadings_) - print("yl ", pls2.y_scores_) + pca_scores = pca.transform(xdata) + pcr = LinearRegression().fit(pca_scores[:,0:2], ydata) + yPfit = pcr.predict(pca.transform(xdata)) + variance_in_y = np.var(pls.y_scores_, axis = 0) fractions_of_explained_variance = variance_in_y / total_variance_in_y + TSS = np.sum((ydata-np.mean(ydata))**2 ) + + + print(np.shape(y2fit[:,0])) + RSS_PLS = np.sum(np.subtract(ydata,y2fit[:,0])**2 ) + r2PLS = 1 - RSS_PLS/TSS + + RSS_PCR = np.sum((ydata-yPfit)**2 ) + r2PCR = 1 - RSS_PCR/TSS + r2_sum = 0 - + fev = [] for i in range(0,nc): - Y_pred=np.dot(pls2.x_scores_[:,i].reshape(-1,1), - pls2.y_loadings_[:,i].reshape(-1,1).T) * y_data.std(axis=0, ddof=1) + y_data.mean(axis=0) - r2_sum += round(r2_score(y_data,Y_pred),3) + Y_pred=np.dot(pls.x_scores_[:,i].reshape(-1,1), + pls.y_loadings_[:,i].reshape(-1,1).T) * ydata.std(axis=0, ddof=1) + ydata.mean(axis=0) + r2_sum += round(r2_score(ydata,Y_pred),3) fev.append(r2_sum) - print('R2 for %d component: %g' %(i+1,round(r2_score(y_data,Y_pred),3))) + print('R^2 for %d component: %g, cummulative: %g' %(i+1,round(r2_score(ydata,Y_pred),3), r2_sum)) - print('R2 for all components (): %g' %r2_sum) #Sum of above + print('R^2 for all components (): %g' %r2_sum) #Sum of above - ax = plt.axes() + f, (ax, ax2) = plt.subplots(1,2) # ax.plot([np.sum(fractions_of_explained_variance[0:i]) for i in range(len(fractions_of_explained_variance))],'-bo') - ax.plot(fev,'-bo') + ax.plot(range(1,11),fev,'-bo') ax.set_xlabel("Number of PLS Components") ax.set_ylabel("Percent Variance Explained in Y") - plt.show() - exit() - ax = plt.axes(projection='3d') - - for idx, rdata in data.iterrows(): - on = rdata['Octane number'] - vals = rdata.loc['1100':] - print(on) - on_ar = np.full(wls_len, on) - - print(on_ar, wls, vals) - - ax.plot(on_ar, wls, vals) - - ax.set_xlabel("Octane number") - ax.set_ylabel("Wavelength") - ax.set_zlabel("Value") - plt.show() - - exit() + print("asdasd") + ax2.plot(ydata, y2fit, ' ob') + ax2.plot(ydata, yPfit, ' ^r') + ax2.set_xlim(83,90) + ax2.set_ylim(83,90) + ax2.set_xlabel("Observed Response") + ax2.set_ylabel("Fitted Response") + ax2.legend(["PLSR with 2 Components (R2 = %2.4f)" % (r2PLS), + "PCR with 2 Components (R2 = %2.4f)" % (r2PCR)]) - ax.plot(data.loc[:,'Octane number'], data.loc[:,'1100':].T, ' .') plt.show()