From e09cc70a3934d10f9d0b54e50a269b7e975fff4b Mon Sep 17 00:00:00 2001 From: George Karypis Date: Fri, 8 Nov 2019 08:49:07 -0800 Subject: [PATCH] - Added support for negative test in slim_predict. - Re-organized the information that is displayed by the command-line programs. - Fixed mini-help typos. --- Makefile | 8 +-- include/slim.h | 2 +- src/programs/cmdline_learn.c | 23 ++++----- src/programs/cmdline_predict.c | 27 ++++++++-- src/programs/slim_learn.c | 14 +++--- src/programs/slim_predict.c | 90 +++++++++++++++++++++++++++++++--- src/programs/struct.h | 1 + 7 files changed, 127 insertions(+), 38 deletions(-) diff --git a/Makefile b/Makefile index 10af3fa..ab673bc 100644 --- a/Makefile +++ b/Makefile @@ -10,10 +10,10 @@ gklib_path = not-set bcls_path = not-set shared = not-set with_mkl = not-set -cc = /usr/bin/gcc -cxx = /usr/bin/g++ -#cc = gcc-mp-4.9 -#cxx = g++-mp-4.9 +cc = not-set +cxx = not-set +#cc = /usr/bin/gcc +#cxx = /usr/bin/g++ #=============================================================== # There should be no need to modify beyond this point diff --git a/include/slim.h b/include/slim.h index 6a39230..6fba4f1 100644 --- a/include/slim.h +++ b/include/slim.h @@ -50,7 +50,7 @@ typedef void slim_t; * Constant definitions *-------------------------------------------------------------------------*/ /* SLIM's version number */ -#define SLIM_VERSION "2.0pre1" +#define SLIM_VERSION "2.0" /* The maximum length of the options[] array */ #define SLIM_NOPTIONS 40 diff --git a/src/programs/cmdline_learn.c b/src/programs/cmdline_learn.c index 6199b5e..c2f9ec6 100644 --- a/src/programs/cmdline_learn.c +++ b/src/programs/cmdline_learn.c @@ -73,24 +73,18 @@ static char helpstr[][512] = { " csrnv - CSR format without ratings.", " cluto - Format used by CLUTO.", " ijv - One (row#, col#, val) per line.", - " " + " ", " -binarize", " Specifies that the ratings should be binarized.", " ", " -l1r=double", - " Specifies the L1 regularization parameter. 
The default value is " - "1.0.", - " ", - " -ipmdlfile=string", - " Specifies the file used to initialize the model.", + " Specifies the L1 regularization parameter. The default value is 1.0.", " ", " -l2r=double", - " Specifies the L2 regularization parameter. The default value is " - "1.0.", + " Specifies the L2 regularization parameter. The default value is 1.0.", " ", " -nnbrs=int", - " Selects FSLIM model and specifies the number of item nearest " - "neighbors", + " Selects FSLIM model and specifies the number of item nearest neighbors", " to be used. The default value is 0.", " ", " -simtype=string", @@ -116,12 +110,13 @@ static char helpstr[][512] = { " ", " -nthreads=int", " Specifies the number of threads to be used for estimation.", - " The default value is maximum number of threads available in the " - "machine.", + " The default value is maximum number of threads available in the machine.", + " ", + " -ipmdlfile=string", + " Specifies the file used to initialize the model.", " ", " -dbglvl=int", - " Specifies the debug level. The default value turns on info and " - "timing.", + " Specifies the debug level. 
The default value turns on info and timing.", " ", " -help", " Prints this message.", diff --git a/src/programs/cmdline_predict.c b/src/programs/cmdline_predict.c index 5e1535f..3d90f96 100644 --- a/src/programs/cmdline_predict.c +++ b/src/programs/cmdline_predict.c @@ -36,17 +36,27 @@ static gk_StringMap_t ifmt_options[] = { static char helpstr[][512] = { " ", " Usage:", - " slim_predict [options] model-file old-file [test-file]", + " slim_predict [options] model-file old-file [test-file] [neg-file]", " ", " Parameters:", " model-file", " The file that stores the model that was generated by slim_learn.", " ", " old-file", - " The file that stores the historical information for each user.", + " The file that stores the historical information for the users", + " for which recommendations are generated.", " ", " test-file", " The file that stores the hidden items for each user.", + " It is only used to evaluate the quality of the recommendations", + " and it should contain a row for each of the users in the old-file.", + " ", + " neg-file", + " The file that stores the negative items for each user.", + " It is used for evaluation purposes as follows: The hidden items", + " and the negative items are predicted, and the nrcmds highest", + " highest scoring items among them are returned as the recommendations.", + " This is list is then used to evaluate the performance.", " ", " Options:", " -ifmt=string", @@ -55,7 +65,7 @@ static char helpstr[][512] = { " csrnv - CSR format without ratings.", " cluto - Format used by CLUTO.", " ijv - One (row#, col#, val) per line.", - " " + " ", " -binarize", " Specifies that the ratings should be binarized.", " ", @@ -98,6 +108,7 @@ params_t *parse_cmdline(int argc, char *argv[]) { params->binarize = 0; params->outfile = NULL; params->tstfile = NULL; + params->negfile = NULL; params->nrcmds = 10; params->dbglvl = 0; @@ -145,7 +156,7 @@ params_t *parse_cmdline(int argc, char *argv[]) { } /* get the datafile */ - if (argc - gk_optind < 
1 || argc - gk_optind > 3) { + if (argc - gk_optind < 1 || argc - gk_optind > 4) { for (int i = 0; strlen(shorthelpstr[i]) > 0; i++) printf("%s\n", shorthelpstr[i]); exit(0); @@ -159,11 +170,17 @@ params_t *parse_cmdline(int argc, char *argv[]) { if (!gk_fexists(params->trnfile)) errexit("Input old file %s does not exist.\n", params->trnfile); - if (argc - gk_optind == 1) { + if (argc - gk_optind >= 1) { params->tstfile = gk_strdup(argv[gk_optind++]); if (!gk_fexists(params->tstfile)) errexit("Input test file %s does not exist.\n", params->tstfile); } + if (argc - gk_optind >= 1) { + params->negfile = gk_strdup(argv[gk_optind++]); + if (!gk_fexists(params->negfile)) + errexit("Input negative file %s does not exist.\n", params->negfile); + } + return params; } diff --git a/src/programs/slim_learn.c b/src/programs/slim_learn.c index 1c87744..3d886a7 100644 --- a/src/programs/slim_learn.c +++ b/src/programs/slim_learn.c @@ -33,12 +33,14 @@ int main(int argc, char *argv[]) { "------------------------------------------------------------------\n"); printf(" trnfile: %s, nrows: %d, ncols: %d, nnz: %zd\n", params->trnfile, tmat->nrows, tmat->ncols, tmat->rowptr[tmat->nrows]); - printf(" l1r: %.2le, l2r: %.2le, optTol: %.2le, niters: %d\n", params->l1r, - params->l2r, params->optTol, params->niters); - printf(" binarize: %d, nnbrs: %d, nthreads: %d, dbglvl: %d\n", - params->binarize, params->nnbrs, params->nthreads, params->dbglvl); - printf(" simtype: %s, mdlfile: %s\n", slim_simtypenames[params->simtype], - params->mdlfile); + printf(" l1r: %.2le, l2r: %.2le, binarize: %s\n", params->l1r, + params->l2r, (params->binarize == 0 ? 
"No" : "Yes")); + printf(" solver: %s, optTol: %.2le, niters: %d\n", + slim_algonames[params->algo], params->optTol, params->niters); + printf(" mdlfile: %s, nthreads: %d, dbglvl: %d\n", + params->mdlfile, params->nthreads, params->dbglvl); + printf(" simtype: %s, nnbrs: %d\n", + slim_simtypenames[params->simtype], params->nnbrs); printf("\nEstimating model...\n"); /* free any user-supplied ratings if set to be ignored */ diff --git a/src/programs/slim_predict.c b/src/programs/slim_predict.c index 3108ae5..a9590bc 100644 --- a/src/programs/slim_predict.c +++ b/src/programs/slim_predict.c @@ -14,14 +14,15 @@ /*************************************************************************/ int main(int argc, char *argv[]) { ssize_t zI; - int32_t iU, iR, nrcmds, nhits[3], ntrue[2]; + int32_t i, iU, iR, nrcmds, ask_nrcmds, ncands, nhits[3], ntrue[2]; int32_t nvalid, nvalid_head, nvalid_tail; float all_hr, head_hr, tail_hr; int is_tail_u, is_head_u; int32_t *rids, *rmarker, *fmarker; + gk_fkv_t *rcands, cand; float *rscores, hr[3], arhr, larhr, baseline; params_t *params; - gk_csr_t *oldmat, *tstmat = NULL, *model; + gk_csr_t *oldmat, *tstmat = NULL, *negmat = NULL, *model; int32_t ioptions[SLIM_NOPTIONS]; FILE *fpout = NULL; @@ -34,6 +35,8 @@ int main(int argc, char *argv[]) { oldmat = gk_csr_Read(params->trnfile, params->ifmt, params->readvals, 0); if (params->tstfile) tstmat = gk_csr_Read(params->tstfile, params->ifmt, params->readvals, 0); + if (params->negfile) + negmat = gk_csr_Read(params->negfile, params->ifmt, params->readvals, 0); printf( "------------------------------------------------------------------\n"); @@ -47,6 +50,9 @@ int main(int argc, char *argv[]) { if (tstmat) printf(" tstfile: %s, nrows: %d, ncols: %d, nnz: %zd\n", params->tstfile, tstmat->nrows, tstmat->ncols, tstmat->rowptr[tstmat->nrows]); + if (negmat) + printf(" negfile: %s, nrows: %d, ncols: %d, nnz: %zd\n", params->negfile, + negmat->nrows, negmat->ncols, negmat->rowptr[negmat->nrows]); if 
(params->outfile) printf(" outfile: %s\n", (params->outfile ? params->outfile : "No output")); @@ -62,18 +68,25 @@ int main(int argc, char *argv[]) { gk_free((void **)&oldmat->rowval, LTERM); if (tstmat) gk_free((void **)&tstmat->rowval, LTERM); + if (negmat) + gk_free((void **)&negmat->rowval, LTERM); } SLIM_iSetDefaults(ioptions); ioptions[SLIM_OPTION_DBGLVL] = params->dbglvl; - /* predict for each row in oldmat */ if (params->outfile) fpout = gk_fopen(params->outfile, "w", "outfile"); - rids = gk_i32malloc(params->nrcmds, "rids"); - rscores = gk_fmalloc(params->nrcmds, "rscores"); + /* if we are using a negative test, ask for a score for all non-supplied items */ + ask_nrcmds = (negmat ? model->nrows : params->nrcmds); + + /* allocate necessary arrays */ + rids = gk_i32malloc(ask_nrcmds, "rids"); + rscores = gk_fmalloc(ask_nrcmds, "rscores"); rmarker = (tstmat ? gk_i32smalloc(model->ncols, -1, "rmarker") : NULL); + rcands = (negmat ? gk_fkvmalloc(model->ncols, "rcands") : NULL); + // get head and tail columns, mark 0 for head items and 1 for items in tail fmarker = (tstmat ? SLIM_DetermineHeadAndTail( oldmat->nrows, gk_max(oldmat->ncols, tstmat->ncols), @@ -84,12 +97,72 @@ int main(int argc, char *argv[]) { arhr = 0.0; nvalid = nvalid_head = nvalid_tail = 0; + + /* predict for each row in oldmat */ for (iU = 0; iU < oldmat->nrows; iU++) { nrcmds = SLIM_GetTopN( model, oldmat->rowptr[iU + 1] - oldmat->rowptr[iU], oldmat->rowind + oldmat->rowptr[iU], - (oldmat->rowval ? 
oldmat->rowval + oldmat->rowptr[iU] : NULL), + ioptions, ask_nrcmds, rids, rscores); + + /* if negative test items, select the params->nrcmds from neg+pos test */ + if (negmat && nrcmds != SLIM_ERROR) { + for (zI = tstmat->rowptr[iU]; zI < tstmat->rowptr[iU + 1]; zI++) + rmarker[tstmat->rowind[zI]] = -2; + for (zI = negmat->rowptr[iU]; zI < negmat->rowptr[iU + 1]; zI++) + rmarker[negmat->rowind[zI]] = -2; + + /* select the neg+pos that were in the recommended list */ + for (ncands=0, iR=0; iRrowptr[iU]; zI < tstmat->rowptr[iU + 1]; zI++) { + if (rmarker[tstmat->rowind[zI]] != -3) { + rcands[ncands].val = tstmat->rowind[zI]; + rcands[ncands].key = 0.0; + ncands++; + } + rmarker[tstmat->rowind[zI]] = -1; + } + for (zI = negmat->rowptr[iU]; zI < negmat->rowptr[iU + 1]; zI++) { + if (rmarker[negmat->rowind[zI]] != -3) { + rcands[ncands].val = negmat->rowind[zI]; + rcands[ncands].key = 0.0; + ncands++; + } + rmarker[negmat->rowind[zI]] = -1; + } + //printf("ncands: %5d,", ncands); + + + /* shuffle prior to sorting */ + for (iR=0; iRnrcmds); + for (iR=0; iR 0 ? 
1.0 * nhits[0] / ntrue[0] : 0.0); @@ -171,7 +245,7 @@ int main(int argc, char *argv[]) { "------------------------------------------------------------------\n"); /* clean up */ - gk_free((void **)&rids, &rscores, &rmarker, &fmarker, LTERM); + gk_free((void **)&rids, &rscores, &rmarker, &fmarker, &rcands, LTERM); SLIM_FreeModel((slim_t **)&model); gk_csr_Free(&oldmat); if (tstmat) diff --git a/src/programs/struct.h b/src/programs/struct.h index 6b22bce..bf82693 100644 --- a/src/programs/struct.h +++ b/src/programs/struct.h @@ -17,6 +17,7 @@ the University of Minnesota typedef struct { char *trnfile; /*!< the file of historical preferences */ char *tstfile; /*!< the file to validate the recommendations */ + char *negfile; /*!< the file containing the negative test instances */ char *l12file; /*!< the file that contains the regularization values over which to search */ char *mdlfile; /*!< the model file during prediction */