From f0b4f308699f17296a144ca27443b780aae21d2c Mon Sep 17 00:00:00 2001
From: gustavovelascoh
Date: Tue, 23 Oct 2018 14:36:57 +0100
Subject: [PATCH] Updated code. Updated readme

---
 README.md       |   4 +
 plsr_example.py | 204 ++++++++++++++++++++----------------------------
 2 files changed, 90 insertions(+), 118 deletions(-)

diff --git a/README.md b/README.md
index 3282418..57f4849 100644
--- a/README.md
+++ b/README.md
@@ -16,3 +16,7 @@ Preview:
 ![3dview](img/datasets_3d.png "3D plot of datasets")
 
 ![PLSR vs PCR](img/plsr-pcr.png "PLSR vs PCR comparison")
+
+# Resources:
+- [A simple explanation of Partial Least Squares](http://users.cecs.anu.edu.au/~kee/pls.pdf)
+- [Partial Least Squares. A tutorial](https://maths.ucd.ie/~brendan/chemometrics/PLS_TUTORIAL.pdf)
diff --git a/plsr_example.py b/plsr_example.py
index f248125..ad253df 100644
--- a/plsr_example.py
+++ b/plsr_example.py
@@ -41,36 +41,31 @@ def import_dataset(ds_name="octane"):
 
 if __name__ == "__main__":
     dataset = "gasoline"
+    dataset = "octane"
     wls, xdata, ydata = import_dataset(dataset)
-    xdata_norm = xdata-np.mean(xdata)
-    ydata_norm = ydata-np.mean(ydata)
 
     total_variance_in_y = np.var(ydata, axis = 0)
     nc=10
-    pls = PLSRegression(n_components=nc)
+    pls = PLSRegression(n_components=nc,)
     pls2 = PLSRegression(n_components=2)
     pls3 = PLSRegression(n_components=3)
-    plsC = PLSCanonical(n_components=nc)
-
     pca = PCA(n_components=nc)
     pca2 = PCA(n_components=2)
 
+    print("X shape: %s, Y shape: %s" % (np.shape(xdata),np.shape([ydata])))
+
+    print("Fitting PLS...")
     pls.fit(xdata, ydata)
     pls2.fit(xdata, ydata)
     pls3.fit(xdata, ydata)
-
-    plsC.fit(xdata_norm, ydata_norm)
-
+    print("Fitting PCA...")
     pca.fit(xdata)
     pca2.fit(xdata)
 
     yfit = pls.predict(xdata)
     y2fit = pls2.predict(xdata)
 
-    xcfit, ycfit = plsC.transform(xdata_norm, ydata_norm)
-
     pca_scores = pca.transform(xdata)
     pca2_scores = pca2.transform(xdata)
@@ -84,10 +79,6 @@ def import_dataset(ds_name="octane"):
     fractions_of_explained_variance = variance_in_y / total_variance_in_y
 
     TSS = np.sum((ydata-np.mean(ydata))**2 )
-    TSSc = np.sum((ydata_norm-np.mean(ydata_norm))**2 )
-
-    print(np.shape(y2fit[:,0]))
-    print(np.shape(ycfit))
 
     RSS_PLS2 = np.sum(np.subtract(ydata,y2fit[:,0])**2 )
     r2PLS2 = 1 - RSS_PLS2/TSS
@@ -101,19 +92,8 @@ def import_dataset(ds_name="octane"):
     RSS_PCR = np.sum((ydata-yPfit)**2 )
     r2PCR = 1 - RSS_PCR/TSS
 
-    RSS_PLSC = np.sum(np.subtract(ydata_norm,ycfit)**2 )
-    r2PLSC = 1 - RSS_PLSC/TSSc
-
-    #print("ycfit: ",ycfit)
-    #print("ydata_norm: ",ydata_norm)
-
-    print("r2PLS: %s, r2PLSC: %s" % (r2PLS, r2PLSC))
-
     r2_sum = 0
-    r2_sumc = 0
-
     fev = []
-    fevc = []
 
     for i in range(0,nc):
         Y_pred=np.dot(pls.x_scores_[:,i].reshape(-1,1),
@@ -122,31 +102,10 @@ def import_dataset(ds_name="octane"):
         fev.append(r2_sum)
         print('R^2 for %d component: %g, cummulative: %g' %(i+1,round(r2_score(ydata,Y_pred),3), r2_sum))
 
-        Y_predc=np.dot(plsC.x_scores_[:,i].reshape(-1,1),
-                       plsC.y_loadings_[:,i].reshape(-1,1).T)
-        #print("YPRED: ", Y_predc)
-        Y_predc=Y_predc - np.mean(Y_predc)
-        #print("YPRED: ", Y_predc)
-        r2_sumc += round(r2_score(ydata_norm,Y_predc),3)
-        fevc.append(r2_sumc)
-
     print('R^2 for all components (): %g' %r2_sum) #Sum of above
 
-    # for i in range(0,nc):
-    #     Y_pred=np.dot(pls.x_scores_[:,i].reshape(-1,1),
-    #                   pls.y_loadings_[:,i].reshape(-1,1).T) * ydata.std(axis=0, ddof=1) + ydata.mean(axis=0)
-    #     r2_sum += round(r2_score(ydata,Y_pred),3)
-    #     fev.append(r2_sum)
-    #     print('R^2 for %d component: %g, cummulative: %g' %(i+1,round(r2_score(ydata,Y_pred),3), r2_sum))
-    #
-    # print('R^2 for all components (): %g' %r2_sum) #Sum of above
-
-    #asdasd
     total_variance_in_x = np.sum(np.var(xdata, axis = 0))
-    print("shape: TotalXvar ", np.shape(total_variance_in_x))
-    print("shape: Xscores ", np.shape(pls.x_scores_))
-
 
     # variance in transformed X data for each latent vector:
     variance_in_x = []
     for i in range(0,nc):
@@ -158,29 +117,21 @@ def import_dataset(ds_name="octane"):
                       ,axis = 0) )
 
-    print("shape: Xvar ", np.shape(variance_in_x))
-
     # normalize variance by total variance:
     fractions_of_explained_variance = np.sum(variance_in_x / total_variance_in_x, axis=1)
 
-    # for i in range(1,11):
-    #     x_partial = pls.x_scores_[:,0:i] * pls.x_loadings_[0:i,:]
-
-    # CROSS VALIDATION STAGE
+    '''
+    CROSS VALIDATION STAGE
+    '''
    plscv_err = []
     pcrcv_err = []
     xdata_norm = xdata-np.mean(xdata)
 
-    print(np.shape(pca.singular_values_))
-    print(np.shape(pca.components_))
-
-
     for i in range(1,nc+1):
         pls_cv = PLSRegression(n_components=i)
         plsrcv = cross_validate(pls_cv, xdata, ydata, cv=10, scoring="neg_mean_squared_error")
         plscv_err.append(-1*np.mean(plsrcv["test_score"]))
-        print(i, np.mean(plsrcv["test_score"]))
 
         pca_cv = PCA(n_components=i)
         pca_cv.fit(xdata)
@@ -189,76 +140,93 @@ def import_dataset(ds_name="octane"):
         pcr_cv = LinearRegression().fit(pca_scores, ydata)
         pcacv = cross_validate(pcr_cv, pca_scores, ydata, cv=10, scoring="neg_mean_squared_error")
         pcrcv_err.append(-1*np.mean(pcacv["test_score"]))
-        print(i, np.mean(pcacv["test_score"]))
-
-
-    # Plotting section
-
-    f, axs = plt.subplots(3,3)
-
-# ax.plot([np.sum(fractions_of_explained_variance[0:i]) for i in range(len(fractions_of_explained_variance))],'-bo')
-    axs[0,0].plot(range(1,11),fev,'-bo')
-    #axs[0,0].plot(range(1,11),fevc,'-gx')
-    axs[0,0].set_xlabel("Number of PLS Components")
-    axs[0,0].set_ylabel("Percent Variance Explained in Y")
-    axs[0,1].plot(ydata, y2fit, ' ob')
-    axs[0,1].plot(ydata, yP2fit, ' ^r')
-    axs[0,1].plot(range(83,91), range(83,91), ':', color='#888888')
-    axs[0,1].set_xlim(83,90)
-    axs[0,1].set_ylim(83,90)
-    axs[0,1].set_xlabel("Observed Response")
-    axs[0,1].set_ylabel("Fitted Response")
-    axs[0,1].legend(["PLSR with 2 Components (R2 = %2.4f)" % (r2PLS2),
-                     "PCR with 2 Components (R2 = %2.4f)" % (r2PCR2)],
-                    prop={'size': 8})
-
-    axs[1,0].plot(range(1,11), 100*np.cumsum(fractions_of_explained_variance)/np.sum(fractions_of_explained_variance), "-ob")
-    axs[1,0].plot(range(1,11), 100*(np.cumsum(pca.explained_variance_ratio_)/np.sum(pca.explained_variance_ratio_)), '-^r')
-    axs[1,0].set_xlabel("Number of PLS/PCR Components")
-    axs[1,0].set_ylabel("Percent Variance Explained in X")
-    axs[1,0].legend(["PLSR", "PCR"],
+    '''
+    Plotting section
+    '''
+    # Axes for dataset visualisation
+    f_dataset, axs_ds = plt.subplots(1,2)
+    ds_ax = axs_ds[0]
+    dsc_ax = axs_ds[1]
+    # Axes for PLSR PCR comparisons
+    f_comp, axs_comp = plt.subplots(2,2)
+    yvar_ax = axs_comp[0,0]
+    xvar_ax = axs_comp[0,1]
+    comp2_ax = axs_comp[1,0]
+    comp10_ax = axs_comp[1,1]
+    # Axes for results visualisation
+    f_res, axs_res = plt.subplots(3,1)
+    mse_ax = axs_res[0]
+    plsc_ax = axs_res[1]
+    pcrc_ax = axs_res[2]
+
+    # Plot dataset (raw)
+    ds_ax.plot(wls, xdata.values.T, linewidth=0.5)
+    ds_ax.set_title("Dataset " + dataset)
+    ds_ax.set_xlabel("wavelength [nm]")
+    # Plot dataset (mean centered)
+    dsc_ax.plot(wls, xdata_norm.values.T, linewidth=0.5)
+    dsc_ax.set_title("Dataset " + dataset + " (mean-normalised)")
+    dsc_ax.set_xlabel("wavelength [nm]")
+    # Plot Variance in Y
+    yvar_ax.plot(range(1,11),fev,'-bo',fillstyle='none', linewidth=0.5)
+    yvar_ax.set_xlabel("PLS Components")
Components") + yvar_ax.set_ylabel("Variance Explained in Y [%]") + yvar_ax.set_xlim(1,10) + yvar_ax.set_ylim(0.30,1.00) + yvar_ax.legend(["PLSR"], prop={'size': 8}) - - axs[0,2].plot(ydata, yfit, ' ob') - axs[0,2].plot(ydata, yPfit, ' ^r') - axs[0,2].plot(range(83,91), range(83,91), ':', color='#888888') - axs[0,2].set_xlim(83,90) - axs[0,2].set_ylim(83,90) - axs[0,2].set_xlabel("Observed Response") - axs[0,2].set_ylabel("Fitted Response") - axs[0,2].legend(["PLSR (10 Components) (R2 = %2.4f)" % (r2PLS), - "PCR with (10 Components) (R2 = %2.4f)" % (r2PCR)], + # Plot Variance in X + xvar_ax.plot(range(1,11), 100*np.cumsum(fractions_of_explained_variance)/np.sum(fractions_of_explained_variance), "-ob") + xvar_ax.plot(range(1,11), 100*(np.cumsum(pca.explained_variance_ratio_)/np.sum(pca.explained_variance_ratio_)), '-^r') + xvar_ax.set_xlabel("PLS/PCR Components") + xvar_ax.set_ylabel("Variance Explained in X [%]") + xvar_ax.legend(["PLSR", "PCR"], + prop={'size': 8}) + # Plot comparison with 2 components + comp2_ax.plot(ydata, y2fit, ' ob') + comp2_ax.plot(ydata, yP2fit, ' ^r') + comp2_ax.plot(range(83,91), range(83,91), ':', color='#888888') + comp2_ax.set_xlim(83,90) + comp2_ax.set_ylim(83,90) + comp2_ax.set_xlabel("Observed Response") + comp2_ax.set_ylabel("Fitted Response") + comp2_ax.legend(["PLSR (2 Comp.) (R2: %2.3f)" % (r2PLS2), + "PCR (2 Comp.)(R2: %2.3f)" % (r2PCR2)], prop={'size': 8}) - - axs[1,1].plot(range(1,11), plscv_err, '-ob') - axs[1,1].plot(range(1,11), pcrcv_err, '-^r') - axs[1,1].legend(["PLSR", "PCR"], + # Plot comparison with 10 components + comp10_ax.plot(ydata, yfit, ' ob') + comp10_ax.plot(ydata, yPfit, ' ^r') + comp10_ax.plot(range(83,91), range(83,91), ':', color='#888888') + comp10_ax.set_xlim(83,90) + comp10_ax.set_ylim(83,90) + comp10_ax.set_xlabel("Observed Response") + comp10_ax.set_ylabel("Fitted Response") + comp10_ax.legend(["PLSR (10 Comp.) (R2: %2.3f)" % (r2PLS), + "PCR (10 Comp.) (R2: %2.3f)" % (r2PCR)], + prop={'size': 8}) + # Plot comparison of MSE + mse_ax.plot(range(1,11), plscv_err, '-ob') + mse_ax.plot(range(1,11), pcrcv_err, '-^r') + mse_ax.legend(["PLSR", "PCR"], prop={'size': 8}) - axs[1,1].set_ylabel("Estimated Mean Squared Prediction Error") - axs[1,1].set_xlabel("Number of PLS/PCR Components") - - axs[1,2].plot(pls3.x_weights_) - axs[1,2].set_xlabel("Variable") - axs[1,2].set_xlabel("PLS Weight") - axs[1,2].legend(["1st Component", + mse_ax.set_ylabel("Estimated Mean\nSquared Prediction Error") + mse_ax.set_xlabel("Number of PLS/PCR Components") + # Plot PCA component weights + plsc_ax.plot(pls3.x_weights_) + plsc_ax.set_xlabel("Variable") + plsc_ax.set_ylabel("PLS Weight") + plsc_ax.legend(["1st Component", "2nd Component", "3rd Component"], prop={'size': 8}) - - axs[2,2].plot(pca.components_.T[:,0:4]) - axs[2,2].set_xlabel("Variable") - axs[2,2].set_xlabel("PCA Loading") - axs[2,2].legend(["1st Component", + # Plot PLSR component loadings + pcrc_ax.plot(pca.components_.T[:,0:4]) + pcrc_ax.set_xlabel("Variable") + pcrc_ax.set_ylabel("PCA Loading") + pcrc_ax.legend(["1st Component", "2nd Component", "3rd Component", "4th Component"], prop={'size': 8}) - - axs[2,0].plot(xdata.values.T) - axs[2,0].set_title("Dataset " + dataset) - - axs[2,1].plot(xdata_norm.values.T) - axs[2,1].set_title("Dataset " + dataset + " (mean-normalised)") - plt.show()