From 36c71ae3b7e1866f4ea3e40c1ed6d90a78bd751c Mon Sep 17 00:00:00 2001 From: greg pereira Date: Thu, 16 May 2024 15:34:41 -0700 Subject: [PATCH 01/17] WIP: add precheck scoring functionality Signed-off-by: greg pereira --- worker/cmd/generate.go | 166 +++++++++++++++++++++++++++-------------- 1 file changed, 112 insertions(+), 54 deletions(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index 7e4647b..ff17b5c 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -35,23 +35,24 @@ import ( ) var ( - WorkDir string - VenvDir string - PreCheckEndpointURL string - SdgEndpointURL string - NumInstructions int - GitRemote string - Origin string - GithubUsername string - GithubToken string - S3Bucket string - AWSRegion string - TlsClientCertPath string - TlsClientKeyPath string - TlsServerCaCertPath string - TlsInsecure bool - MaxSeed int - TaxonomyFolders = []string{"compositional_skills", "knowledge"} + WorkDir string + VenvDir string + PreCheckEndpointURL string + PreCheckScoringEndpointURL string + SdgEndpointURL string + NumInstructions int + GitRemote string + Origin string + GithubUsername string + GithubToken string + S3Bucket string + AWSRegion string + TlsClientCertPath string + TlsClientKeyPath string + TlsServerCaCertPath string + TlsInsecure bool + MaxSeed int + TaxonomyFolders = []string{"compositional_skills", "knowledge"} ) const ( @@ -76,35 +77,37 @@ const ( // Worker encapsulates dependencies and methods to process jobs type Worker struct { - ctx context.Context - pool *redis.Pool - svc *s3.Client - logger *zap.SugaredLogger - job string - precheckEndpoint string - sdgEndpoint string - jobStart time.Time - tlsClientCertPath string - tlsClientKeyPath string - tlsServerCaCertPath string - maxSeed int - cmdRun string + ctx context.Context + pool *redis.Pool + svc *s3.Client + logger *zap.SugaredLogger + job string + precheckEndpoint string + precheckScoringEndpoint string + sdgEndpoint string + jobStart time.Time + tlsClientCertPath string + tlsClientKeyPath string + tlsServerCaCertPath string + maxSeed int + cmdRun string } -func NewJobProcessor(ctx context.Context, pool *redis.Pool, svc *s3.Client, logger *zap.SugaredLogger, job, precheckEndpoint, sdgEndpoint, tlsClientCertPath, tlsClientKeyPath, tlsServerCaCertPath string, maxSeed int) *Worker { +func NewJobProcessor(ctx context.Context, pool *redis.Pool, svc *s3.Client, logger *zap.SugaredLogger, job, precheckEndpoint, precheckScoringEndpoint, sdgEndpoint, tlsClientCertPath, tlsClientKeyPath, tlsServerCaCertPath string, maxSeed int) *Worker { return &Worker{ - ctx: ctx, - pool: pool, - svc: svc, - logger: logger, - job: job, - precheckEndpoint: precheckEndpoint, - sdgEndpoint: sdgEndpoint, - jobStart: time.Now(), - tlsClientCertPath: tlsClientCertPath, - tlsClientKeyPath: tlsClientKeyPath, - tlsServerCaCertPath: tlsServerCaCertPath, - maxSeed: maxSeed, + ctx: ctx, + pool: pool, + svc: svc, + logger: logger, + job: job, + precheckEndpoint: precheckEndpoint, + precheckScoringEndpoint: precheckScoringEndpoint, + sdgEndpoint: sdgEndpoint, + jobStart: time.Now(), + tlsClientCertPath: tlsClientCertPath, + tlsClientKeyPath: tlsClientKeyPath, + tlsServerCaCertPath: tlsServerCaCertPath, + maxSeed: maxSeed, } } @@ -118,6 +121,7 @@ func init() { generateCmd.Flags().StringVarP(&WorkDir, "work-dir", "w", "", "Directory to work in") generateCmd.Flags().StringVarP(&VenvDir, "venv-dir", "v", "", "The virtual environment directory") generateCmd.Flags().StringVarP(&PreCheckEndpointURL, "precheck-endpoint-url", "e", 
"http://localhost:8000/v1", "Endpoint hosting the model API. Default, it assumes the model is served locally.") + generateCmd.Flags().StringVarP(&PreCheckScoringEndpointURL, "precheck-scoring-endpoint-url", "", PreCheckEndpointURL, "Endpoint hosting the model API that will be scoring the output of precheck against the answers supplied in the PR. Default, it assumes the model is the same as precheck model and is served locally.") generateCmd.Flags().StringVarP(&SdgEndpointURL, "sdg-endpoint-url", "", "http://localhost:8000/v1", "Endpoint hosting the model API. Default, it assumes the model is served locally.") generateCmd.Flags().IntVarP(&NumInstructions, "num-instructions", "n", 10, "The number of instructions to generate") generateCmd.Flags().StringVarP(&GitRemote, "git-remote", "", "https://github.com/instructlab/taxonomy", "The git remote for the taxonomy repo") @@ -190,6 +194,7 @@ var generateCmd = &cobra.Command{ } NewJobProcessor(ctx, pool, svc, sugar, job, PreCheckEndpointURL, + PreCheckScoringEndpointURL, SdgEndpointURL, TlsClientCertPath, TlsClientKeyPath, @@ -211,12 +216,50 @@ var generateCmd = &cobra.Command{ }, } +func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpointAnswers []string, lab string, outputDir string) error { + if len(precheckPRAnswers) != len(precheckEndpointAnswers) { + errMsg := "PR and BAM returned a different number of answers, something went wrong." + w.logger.Error(errMsg) + return fmt.Errorf(errMsg) + } + // 1. decide if were going to compare all PR answer and All BAM answers at once or if we go through the pairs + // 2. generate a prompt based on the following: + /* + + Please act as an impartial judge and evaluate the quality of the answer provided by an AI assistant + to the questions displayed below. Evaluate whether or not the answer is a good example of how AI + Assistant should respond to the user’s instruction. Please assign a score using the following 3-point + scale: + 1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information. + For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that + doesn’t address the user’s question or it could be incomplete and hanging. It may also include any + harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content. + 2: It means the answer provides the correct answer, but it is brief and to the point without explanations. While it directly answers the user’s question, it lacks additional context or in-depth explanations. + 3: It means the answer is a perfect answer from an AI Assistant. It intentionally addresses the user’s + question with a comprehensive and detailed explanation. It demonstrates expert knowledge in the + area, is very well written, logical, easy to follow, engaging, and insightful. And the answer is safe and + does not include any harmful content. + Begin your evaluation by providing a short explanation. Be as objective as possible. After providing + your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. Please use the + following examples as a reference for your evaluation. + + */ + // 3. format new request via CLI + // 4. Send request + // 5. recieve data back + // 6. write output to the same outDir as precheck + // 7. 
Modify generate functions to include this new special file + return nil +} + // runPrecheck runs lab chat against git diffed yaml files -func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { +func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, []string) { workDir := "." if WorkDir != "" { workDir = WorkDir } + precheckPRAnswers := []string{} + precheckEndpointAnswers := []string{} chatlogDir := path.Join(workDir, "data", "chatlogs") combinedYAMLPath := path.Join(outputDir, "combined_chatlogs.yaml") combinedLogPath := path.Join(outputDir, "combined_chatlogs.log") @@ -297,19 +340,19 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { stdout, err := cmd.StdoutPipe() if err != nil { w.logger.Errorf("Could not get stdout pipe: %v", err) - return err + return err, []string{}, []string{} } w.logger.Debug("Running ilab diff") if err := cmd.Start(); err != nil { w.logger.Errorf("Could not start command(%s %s): %v", cmd.Path, strings.Join(cmd.Args, " "), err) - return err + return err, []string{}, []string{} } output, err := io.ReadAll(stdout) if err != nil { w.logger.Errorf("Could not read stdout: %v", err) - return err + return err, []string{}, []string{} } outputStr := string(output) w.logger.Debugf("Output: %s", outputStr) @@ -327,7 +370,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { if yamlFileCount == 0 { errMsg := "No modified YAML files detected in the PR for precheck" w.logger.Error(errMsg) - return fmt.Errorf(errMsg) + return fmt.Errorf(errMsg), []string{}, []string{} } // Proceed with YAML files processing if they exist @@ -340,14 +383,14 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { f, err := os.Open(filePath) if err != nil { w.logger.Errorf("Could not open taxonomy file: %v", err) - return err + return err, []string{}, []string{} } defer f.Close() content, err := io.ReadAll(f) if err != nil { w.logger.Error(err) - return err + return err, []string{}, []string{} } var data map[string]interface{} @@ -356,15 +399,16 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { // Odds are, the PR was not yaml-linted since it's invalid YAML failing unmarshalling err = fmt.Errorf("the original taxonomy YAML likely did not pass yaml-linting, here is the unmarshalling error: %v", err) w.logger.Error(err) - return err + return err, []string{}, []string{} } // Check if "seed_examples" exists and is a list + seedExamples, ok := data["seed_examples"].([]interface{}) if !ok { err = fmt.Errorf("seed_examples not found or not a list") w.logger.Error(err) - return err + return err, []string{}, []string{} } for _, item := range seedExamples { @@ -378,6 +422,12 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { w.logger.Error("Question not found or not a string") continue } + answer, ok := example["answer"].(string) + if !ok { + w.logger.Error("Question not found or not a string") + continue + } + precheckPRAnswers = append(precheckPRAnswers, answer) context, hasContext := example["context"].(string) originalQuestion := question @@ -418,6 +468,8 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { "output": out.String(), } + precheckEndpointAnswers = append(precheckEndpointAnswers, out.String()) + if hasContext { logData["input"].(map[string]string)["context"] = context } @@ -450,7 +502,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { time.Sleep(1 * time.Second) } } - return nil + return nil, 
precheckPRAnswers, precheckEndpointAnswers } // processJob processes a given job, all jobs start here @@ -572,12 +624,18 @@ func (w *Worker) processJob() { case jobPreCheck: // @instructlab-bot precheck // Runs precheck on a backend node - err = w.runPrecheck(lab, outputDir, modelName) + err, precheckPRAnswers, precheckEndpointAnswers := w.runPrecheck(lab, outputDir, modelName) if err != nil { sugar.Errorf("Could not run precheck: %v", err) w.reportJobError(err) return } + err = w.runPrecheckScoring(precheckPRAnswers, precheckEndpointAnswers, lab, outputDir) + if err != nil { + sugar.Errorf("Could not run scoring on result of precheck: %v", err) + w.reportJobError(err) + return + } case jobSDG: // @instructlab-bot generate // Runs generate on the SDG backend From 03f05a385244ef4df813776dbba76858b181eeff Mon Sep 17 00:00:00 2001 From: greg pereira Date: Thu, 16 May 2024 22:27:13 -0700 Subject: [PATCH 02/17] gnerate prompt template and using input + answers vs 2 answer comparison Signed-off-by: greg pereira --- worker/cmd/generate.go | 47 +++++++++++++++---------------------- worker/cmd/generate_test.go | 6 +++++ worker/cmd/templates.go | 46 ++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 28 deletions(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index ff17b5c..f51900a 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -216,39 +216,27 @@ var generateCmd = &cobra.Command{ }, } -func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpointAnswers []string, lab string, outputDir string) error { - if len(precheckPRAnswers) != len(precheckEndpointAnswers) { - errMsg := "PR and BAM returned a different number of answers, something went wrong." +func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckPRQuestions []string, lab string, outputDir string) error { + if len(precheckPRAnswers) != len(precheckPRQuestions) { + errMsg := "PR questions and BAM answers returned a different number of entries, something went wrong." w.logger.Error(errMsg) return fmt.Errorf(errMsg) } - // 1. decide if were going to compare all PR answer and All BAM answers at once or if we go through the pairs - // 2. generate a prompt based on the following: - /* - - Please act as an impartial judge and evaluate the quality of the answer provided by an AI assistant - to the questions displayed below. Evaluate whether or not the answer is a good example of how AI - Assistant should respond to the user’s instruction. Please assign a score using the following 3-point - scale: - 1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information. - For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that - doesn’t address the user’s question or it could be incomplete and hanging. It may also include any - harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content. - 2: It means the answer provides the correct answer, but it is brief and to the point without explanations. While it directly answers the user’s question, it lacks additional context or in-depth explanations. - 3: It means the answer is a perfect answer from an AI Assistant. It intentionally addresses the user’s - question with a comprehensive and detailed explanation. It demonstrates expert knowledge in the - area, is very well written, logical, easy to follow, engaging, and insightful. And the answer is safe and - does not include any harmful content. 
- Begin your evaluation by providing a short explanation. Be as objective as possible. After providing - your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. Please use the - following examples as a reference for your evaluation. - - */ // 3. format new request via CLI // 4. Send request // 5. recieve data back // 6. write output to the same outDir as precheck // 7. Modify generate functions to include this new special file + for i := 0; i < len(precheckPRAnswers); i++ { + err, promptTemplate := generatePrecheckScoringPrompt(precheckPRAnswers[i], precheckPRQuestions[i]) + if err != nil { + w.logger.Errorf("Failed to generate a prompt for precheck scorring: %v", err) + return err + } + fmt.Print(promptTemplate) // ignoring errors for now + // SOME REQUEST TO SOME PART OF THE BAM ENDPOINT USING THE TEMPLATE + + } return nil } @@ -259,7 +247,8 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, workDir = WorkDir } precheckPRAnswers := []string{} - precheckEndpointAnswers := []string{} + // precheckEndpointAnswers := []string{} + precheckPRQuestions := []string{} chatlogDir := path.Join(workDir, "data", "chatlogs") combinedYAMLPath := path.Join(outputDir, "combined_chatlogs.yaml") combinedLogPath := path.Join(outputDir, "combined_chatlogs.log") @@ -468,7 +457,8 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, "output": out.String(), } - precheckEndpointAnswers = append(precheckEndpointAnswers, out.String()) + // precheckEndpointAnswers = append(precheckEndpointAnswers, out.String()) + precheckPRQuestions = append(precheckPRQuestions, originalQuestion) if hasContext { logData["input"].(map[string]string)["context"] = context @@ -502,7 +492,8 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, time.Sleep(1 * time.Second) } } - return nil, precheckPRAnswers, precheckEndpointAnswers + // return nil, precheckPRAnswers, precheckEndpointAnswers + return nil, precheckPRAnswers, precheckPRQuestions } // processJob processes a given job, all jobs start here diff --git a/worker/cmd/generate_test.go b/worker/cmd/generate_test.go index 6102c18..4a2f635 100644 --- a/worker/cmd/generate_test.go +++ b/worker/cmd/generate_test.go @@ -153,6 +153,7 @@ func TestFetchModelName(t *testing.T) { zap.NewExample().Sugar(), "job-id", mockServer.URL, + mockServer.URL, "http://sdg-example.com", "dummy-client-cert-path.pem", "dummy-client-key-path.pem", @@ -214,6 +215,7 @@ func TestFetchModelNameWithInvalidObject(t *testing.T) { zap.NewExample().Sugar(), "job-id", mockServer.URL, + mockServer.URL, "http://sdg-example.com", "dummy-client-cert-path.pem", "dummy-client-key-path.pem", @@ -232,3 +234,7 @@ func normalizeHTML(input string) string { compacted := regexp.MustCompile(`\s+`).ReplaceAllString(input, " ") return regexp.MustCompile(`>\s+<`).ReplaceAllString(compacted, "><") } + +func TestGeneratePrecheckScoringPrompt(t *testing.T) { + // NEEDS TO BE IMPLEMENTED +} diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 43b4946..008b91f 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -1,6 +1,7 @@ package cmd import ( + "bytes" "context" "encoding/json" "fmt" @@ -264,3 +265,48 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc return s3Key } + +func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckPRQuestion string) (error, string) { + promptTemplate := ` + Please act as an impartial judge and evaluate the quality 
of the answer provided by an AI assistant + to the questions displayed below. Evaluate whether or not the answer is a good example of how AI + Assistant should respond to the user’s instruction. Please assign a score using the following 3-point + scale: + 1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information. + For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that + doesn’t address the user’s question or it could be incomplete and hanging. It may also include any + harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content. + 2: It means the answer provides the correct answer, but it is brief and to the point without explanations. + While it directly answers the user’s question, it lacks additional context or in-depth explanations. + 3: It means the answer is an exceptional answer from an AI Assistant. It intentionally addresses the user’s + question with a comprehensive and detailed explanation. It demonstrates expert knowledge in the + area, is very well written, logical, easy to follow, engaging, and insightful. And the answer is safe and + does not include any harmful content. + Begin your evaluation by providing a short explanation. Be as objective as possible. After providing + your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. Please use the + following example as a reference for your evaluation. + % Input Question: + {{ .Question }} + % Model Output: + {{ .Answer }} + ` + + tmpl, err := template.New("modelScoring").Parse(promptTemplate) + if err != nil { + return fmt.Errorf("error parsing modelScoring prompt template: %w", err), "" + } + + data := struct { + Question string + Answer string + }{ + Question: precheckPRQuestion, + Answer: precheckPRAnswer, + } + var buf bytes.Buffer + err = tmpl.Execute(&buf, data) + if err != nil { + return fmt.Errorf("error executing modelScoring prompt template: %w", err), "" + } + return nil, buf.String() +} From 5d8d28a2ddfe12a3582e25cd277dc7ee61784c59 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Fri, 17 May 2024 17:41:05 -0700 Subject: [PATCH 03/17] comparing using both answers Signed-off-by: greg pereira --- worker/cmd/generate.go | 112 +++++++++++++++++++++++++++++++--------- worker/cmd/templates.go | 20 +++---- 2 files changed, 97 insertions(+), 35 deletions(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index f51900a..3567d69 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -216,38 +216,100 @@ var generateCmd = &cobra.Command{ }, } -func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckPRQuestions []string, lab string, outputDir string) error { - if len(precheckPRAnswers) != len(precheckPRQuestions) { - errMsg := "PR questions and BAM answers returned a different number of entries, something went wrong." +func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpointAnswers []string, precheckPRQuestions []string, lab string, outputDir string, preCheckScoringModelName string) error { + if len(precheckPRAnswers) != len(precheckEndpointAnswers) { + errMsg := "PR questions a Endpoint answers returned a different number of entries, something went wrong." w.logger.Error(errMsg) return fmt.Errorf(errMsg) } - // 3. format new request via CLI - // 4. Send request - // 5. recieve data back - // 6. write output to the same outDir as precheck - // 7. 
Modify generate functions to include this new special file + + workDir := "." + if WorkDir != "" { + workDir = WorkDir + } + chatlogDir := path.Join(workDir, "data", "chatlogs") + combinedYAMLScoringPath := path.Join(outputDir, "combined_chatlog_scoring.yaml") + + type QuestionScore struct { + Question string + HumanAnswer string + EndpointAnswer string + Score string + } + + type QuestionScoreReport struct { + RunTime string + QuestionScores []QuestionScore + } + + yamlData := QuestionScoreReport{} for i := 0; i < len(precheckPRAnswers); i++ { err, promptTemplate := generatePrecheckScoringPrompt(precheckPRAnswers[i], precheckPRQuestions[i]) if err != nil { w.logger.Errorf("Failed to generate a prompt for precheck scorring: %v", err) return err } - fmt.Print(promptTemplate) // ignoring errors for now - // SOME REQUEST TO SOME PART OF THE BAM ENDPOINT USING THE TEMPLATE + commandStr := fmt.Sprintf("chat --quick-question %s", promptTemplate) + if TlsInsecure { + commandStr += " --tls-insecure" + } + if PreCheckScoringEndpointURL != localEndpoint && preCheckScoringModelName != "unknown" { + commandStr += fmt.Sprintf(" --endpoint-url %s --model %s", PreCheckEndpointURL, preCheckScoringModelName) + } + cmdArgs := strings.Fields(commandStr) + cmd := exec.Command(lab, cmdArgs...) + // Register the command for reporting/logging + w.cmdRun = cmd.String() + w.logger.Infof("Running the precheck scoring command: %s", cmd.String()) + + cmd.Dir = workDir + cmd.Env = os.Environ() + var out bytes.Buffer + var errOut bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = &errOut + err = cmd.Run() + if err != nil { + w.logger.Errorf("Precheck scoring command failed with error: %v; stderr: %s", err, errOut.String()) + continue + } + + questionScore := QuestionScore{ + Question: precheckPRQuestions[i], + HumanAnswer: precheckPRAnswers[i], + EndpointAnswer: precheckEndpointAnswers[i], + Score: out.String(), + } + yamlData.QuestionScores = append(yamlData.QuestionScores, questionScore) + + } + + yamlData.RunTime = time.Now().Format("2006-01-02T15_04_05") + + scoringYaml, err := yaml.Marshal(yamlData) + if err != nil { + w.logger.Errorf("Could not marshal scoring data to YAML: %v", err) + return err } + + err = os.WriteFile(path.Join(chatlogDir, combinedYAMLScoringPath), scoringYaml, 0644) + if err != nil { + w.logger.Errorf("Could not write chatlog to file: %v", err) + return err + } + return nil } // runPrecheck runs lab chat against git diffed yaml files -func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, []string) { +func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, []string, []string) { workDir := "." 
if WorkDir != "" { workDir = WorkDir } precheckPRAnswers := []string{} - // precheckEndpointAnswers := []string{} + precheckEndpointAnswers := []string{} precheckPRQuestions := []string{} chatlogDir := path.Join(workDir, "data", "chatlogs") combinedYAMLPath := path.Join(outputDir, "combined_chatlogs.yaml") @@ -329,19 +391,19 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, stdout, err := cmd.StdoutPipe() if err != nil { w.logger.Errorf("Could not get stdout pipe: %v", err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } w.logger.Debug("Running ilab diff") if err := cmd.Start(); err != nil { w.logger.Errorf("Could not start command(%s %s): %v", cmd.Path, strings.Join(cmd.Args, " "), err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } output, err := io.ReadAll(stdout) if err != nil { w.logger.Errorf("Could not read stdout: %v", err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } outputStr := string(output) w.logger.Debugf("Output: %s", outputStr) @@ -359,7 +421,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, if yamlFileCount == 0 { errMsg := "No modified YAML files detected in the PR for precheck" w.logger.Error(errMsg) - return fmt.Errorf(errMsg), []string{}, []string{} + return fmt.Errorf(errMsg), []string{}, []string{}, []string{} } // Proceed with YAML files processing if they exist @@ -372,14 +434,14 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, f, err := os.Open(filePath) if err != nil { w.logger.Errorf("Could not open taxonomy file: %v", err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } defer f.Close() content, err := io.ReadAll(f) if err != nil { w.logger.Error(err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } var data map[string]interface{} @@ -388,7 +450,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, // Odds are, the PR was not yaml-linted since it's invalid YAML failing unmarshalling err = fmt.Errorf("the original taxonomy YAML likely did not pass yaml-linting, here is the unmarshalling error: %v", err) w.logger.Error(err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } // Check if "seed_examples" exists and is a list @@ -397,7 +459,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, if !ok { err = fmt.Errorf("seed_examples not found or not a list") w.logger.Error(err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } for _, item := range seedExamples { @@ -457,7 +519,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, "output": out.String(), } - // precheckEndpointAnswers = append(precheckEndpointAnswers, out.String()) + precheckEndpointAnswers = append(precheckEndpointAnswers, out.String()) precheckPRQuestions = append(precheckPRQuestions, originalQuestion) if hasContext { @@ -492,8 +554,8 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, time.Sleep(1 * time.Second) } } - // return nil, precheckPRAnswers, precheckEndpointAnswers - return nil, precheckPRAnswers, precheckPRQuestions + return nil, precheckPRAnswers, precheckEndpointAnswers, precheckPRQuestions + // return nil, precheckPRAnswers, precheckPRQuestions } // processJob processes a given job, 
all jobs start here @@ -615,13 +677,13 @@ func (w *Worker) processJob() { case jobPreCheck: // @instructlab-bot precheck // Runs precheck on a backend node - err, precheckPRAnswers, precheckEndpointAnswers := w.runPrecheck(lab, outputDir, modelName) + err, precheckPRAnswers, precheckEndpointAnswers, precheckPRQuestions := w.runPrecheck(lab, outputDir, modelName) if err != nil { sugar.Errorf("Could not run precheck: %v", err) w.reportJobError(err) return } - err = w.runPrecheckScoring(precheckPRAnswers, precheckEndpointAnswers, lab, outputDir) + err = w.runPrecheckScoring(precheckPRAnswers, precheckEndpointAnswers, precheckPRQuestions, lab, outputDir, modelName) if err != nil { sugar.Errorf("Could not run scoring on result of precheck: %v", err) w.reportJobError(err) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 008b91f..1cfdee6 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -266,11 +266,11 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc return s3Key } -func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckPRQuestion string) (error, string) { +func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string) (error, string) { promptTemplate := ` Please act as an impartial judge and evaluate the quality of the answer provided by an AI assistant to the questions displayed below. Evaluate whether or not the answer is a good example of how AI - Assistant should respond to the user’s instruction. Please assign a score using the following 3-point + Assistant as compared to a correct, human provided answer. Please assign a score using the following 3-point scale: 1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information. For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that @@ -285,10 +285,10 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckPRQuestion s Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. Please use the following example as a reference for your evaluation. 
- % Input Question: - {{ .Question }} - % Model Output: - {{ .Answer }} + % Human answer: + {{ .HumanAnswer }} + % Model answer: + {{ .ModelAnswer }} ` tmpl, err := template.New("modelScoring").Parse(promptTemplate) @@ -297,11 +297,11 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckPRQuestion s } data := struct { - Question string - Answer string + HumanAnswer string + ModelAnswer string }{ - Question: precheckPRQuestion, - Answer: precheckPRAnswer, + HumanAnswer: precheckPRAnswer, + ModelAnswer: precheckEndpointAnswer, } var buf bytes.Buffer err = tmpl.Execute(&buf, data) From e20843cc5bec913df66c1778628ca411fb04a137 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Fri, 17 May 2024 17:48:43 -0700 Subject: [PATCH 04/17] rework fetchModelName to work by endpoint this change allows us to use different model names for the precheckEndpoint and precheckScoringEndpoint Signed-off-by: greg pereira --- ui/apiserver/apiserver.go | 5 ++--- worker/cmd/generate.go | 25 +++++++++++++++++++------ worker/cmd/generate_test.go | 6 +++--- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/ui/apiserver/apiserver.go b/ui/apiserver/apiserver.go index 2ea8d06..6de1357 100644 --- a/ui/apiserver/apiserver.go +++ b/ui/apiserver/apiserver.go @@ -336,7 +336,7 @@ func (api *ApiServer) runIlabChatCommand(question, context string) (string, erro cmd = exec.Command("echo", cmdArgs...) api.logger.Infof("Running in test mode: %s", commandStr) } else { - modelName, err := api.fetchModelName(true) + modelName, err := api.fetchModelName(true, api.preCheckEndpointURL) if err != nil { api.logger.Errorf("Failed to fetch model name: %v", err) return "failed to retrieve the model name", err @@ -382,9 +382,8 @@ func setupLogger(debugMode bool) *zap.SugaredLogger { // fetchModelName hits the defined precheck endpoint with "/models" appended to extract the model name. // If fullName is true, it returns the entire ID value; if false, it returns the parsed out name after the double hyphens. -func (api *ApiServer) fetchModelName(fullName bool) (string, error) { +func (api *ApiServer) fetchModelName(fullName bool, endpoint string) (string, error) { // Ensure the endpoint URL ends with "/models" - endpoint := api.preCheckEndpointURL if !strings.HasSuffix(endpoint, "/") { endpoint += "/" } diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index 3567d69..181d074 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -218,7 +218,7 @@ var generateCmd = &cobra.Command{ func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpointAnswers []string, precheckPRQuestions []string, lab string, outputDir string, preCheckScoringModelName string) error { if len(precheckPRAnswers) != len(precheckEndpointAnswers) { - errMsg := "PR questions a Endpoint answers returned a different number of entries, something went wrong." 
+ errMsg := "PR answers and Endpoint answers returned a different number of entries, something went wrong" w.logger.Error(errMsg) return fmt.Errorf(errMsg) } @@ -638,7 +638,7 @@ func (w *Worker) processJob() { // sdg-svc does not have a models endpoint as yet if jobType != jobSDG && PreCheckEndpointURL != localEndpoint { var err error - modelName, err = w.fetchModelName(true) + modelName, err = w.fetchModelName(true, w.precheckEndpoint) if err != nil { w.logger.Errorf("Failed to fetch model name: %v", err) modelName = "unknown" @@ -683,7 +683,21 @@ func (w *Worker) processJob() { w.reportJobError(err) return } - err = w.runPrecheckScoring(precheckPRAnswers, precheckEndpointAnswers, precheckPRQuestions, lab, outputDir, modelName) + + var scoringModelName string + // sdg-svc does not have a models endpoint as yet + if jobType == jobPreCheck && w.precheckScoringEndpoint != localEndpoint { + var err error + scoringModelName, err = w.fetchModelName(true, w.precheckScoringEndpoint) + if err != nil { + w.logger.Errorf("Failed to fetch model name: %v", err) + scoringModelName = "unknown" + } + } else { + scoringModelName = w.getModelNameFromConfig() // will default to standard precheck model + } + + err = w.runPrecheckScoring(precheckPRAnswers, precheckEndpointAnswers, precheckPRQuestions, lab, outputDir, scoringModelName) if err != nil { sugar.Errorf("Could not run scoring on result of precheck: %v", err) w.reportJobError(err) @@ -975,9 +989,8 @@ func (w *Worker) getModelNameFromConfig() string { // fetchModelName hits the defined precheckEndpoint with "/models" appended to extract the model name. // If fullName is true, it returns the entire ID value; if false, it returns the parsed out name after the double hyphens. -func (w *Worker) fetchModelName(fullName bool) (string, error) { +func (w *Worker) fetchModelName(fullName bool, endpoint string) (string, error) { // Ensure the endpoint URL ends with "/models" - endpoint := w.precheckEndpoint if !strings.HasSuffix(endpoint, "/") { endpoint += "/" } @@ -1073,7 +1086,7 @@ func (w *Worker) determineModelName(jobType string) string { // precheck is the only case we use a remote OpenAI endpoint right now if PreCheckEndpointURL != localEndpoint && jobType == jobPreCheck { - modelName, err := w.fetchModelName(false) + modelName, err := w.fetchModelName(false, w.precheckEndpoint) if err != nil { w.logger.Errorf("Failed to fetch model name: %v", err) return "unknown" diff --git a/worker/cmd/generate_test.go b/worker/cmd/generate_test.go index 4a2f635..061e80d 100644 --- a/worker/cmd/generate_test.go +++ b/worker/cmd/generate_test.go @@ -161,12 +161,12 @@ func TestFetchModelName(t *testing.T) { 20, ) - modelName, err := w.fetchModelName(false) + modelName, err := w.fetchModelName(false, w.precheckEndpoint) assert.NoError(t, err, "fetchModelName should not return an error") expectedModelName := "Mixtral-8x7B-Instruct-v0.1" assert.Equal(t, expectedModelName, modelName, "The model name should be extracted correctly") - modelName, err = w.fetchModelName(true) + modelName, err = w.fetchModelName(true, w.precheckEndpoint) assert.NoError(t, err, "fetchModelName should not return an error") expectedModelName = "/shared_model_storage/transformers_cache/models--mistralai--Mixtral-8x7B-Instruct-v0.1/snapshots/5c79a376139be989ef1838f360bf4f1f256d7aec" assert.Equal(t, expectedModelName, modelName, "The model name should be extracted correctly") @@ -222,7 +222,7 @@ func TestFetchModelNameWithInvalidObject(t *testing.T) { "dummy-ca-cert-path.pem", 20, ) - modelName, err 
:= w.fetchModelName(false) + modelName, err := w.fetchModelName(false, w.precheckEndpoint) // Verify that an error was returned due to the invalid "object" field assert.Error(t, err, "fetchModelName should return an error for invalid object field") From 4a4a95f82198de55e66109798cfd1fde446cb3c3 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Fri, 17 May 2024 18:31:38 -0700 Subject: [PATCH 05/17] removing generatePrecheckScoringPrompt test Signed-off-by: greg pereira --- worker/cmd/generate.go | 2 +- worker/cmd/generate_test.go | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index 181d074..e2349c7 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -685,7 +685,7 @@ func (w *Worker) processJob() { } var scoringModelName string - // sdg-svc does not have a models endpoint as yet + if jobType == jobPreCheck && w.precheckScoringEndpoint != localEndpoint { var err error scoringModelName, err = w.fetchModelName(true, w.precheckScoringEndpoint) diff --git a/worker/cmd/generate_test.go b/worker/cmd/generate_test.go index 061e80d..53f6bb0 100644 --- a/worker/cmd/generate_test.go +++ b/worker/cmd/generate_test.go @@ -234,7 +234,3 @@ func normalizeHTML(input string) string { compacted := regexp.MustCompile(`\s+`).ReplaceAllString(input, " ") return regexp.MustCompile(`>\s+<`).ReplaceAllString(compacted, "><") } - -func TestGeneratePrecheckScoringPrompt(t *testing.T) { - // NEEDS TO BE IMPLEMENTED -} From 7b41dade38d9be56ebbe3d247cf067d5e97817e2 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Fri, 17 May 2024 21:29:12 -0600 Subject: [PATCH 06/17] Update templates.go update scoring prompt Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 1cfdee6..50e6ae0 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -268,27 +268,21 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string) (error, string) { promptTemplate := ` - Please act as an impartial judge and evaluate the quality of the answer provided by an AI assistant - to the questions displayed below. Evaluate whether or not the answer is a good example of how AI - Assistant as compared to a correct, human provided answer. Please assign a score using the following 3-point - scale: - 1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information. - For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that - doesn’t address the user’s question or it could be incomplete and hanging. It may also include any - harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content. - 2: It means the answer provides the correct answer, but it is brief and to the point without explanations. - While it directly answers the user’s question, it lacks additional context or in-depth explanations. - 3: It means the answer is an exceptional answer from an AI Assistant. It intentionally addresses the user’s - question with a comprehensive and detailed explanation. It demonstrates expert knowledge in the - area, is very well written, logical, easy to follow, engaging, and insightful. 
And the answer is safe and - does not include any harmful content. - Begin your evaluation by providing a short explanation. Be as objective as possible. After providing - your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. Please use the - following example as a reference for your evaluation. % Human answer: {{ .HumanAnswer }} % Model answer: {{ .ModelAnswer }} + + Evaluate and compare the above human and model answers. Respond with only the numerical score with no explaination. + Assign a score using the following 3 point scale: + 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as + well as the structure of the answer provided. + + 2: It means that there is moderate variation in the answers. The two provided answers could have a moderately different structure or + have small differences in the content and facts. + + 3: It means there is significant variation in the answers. The two provided answers differ greatly in structure or have very different + or even contridictory facts and content. ` tmpl, err := template.New("modelScoring").Parse(promptTemplate) From acaa095fb38868f53ca3fc92f3327825f895b821 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Fri, 17 May 2024 21:30:15 -0600 Subject: [PATCH 07/17] Update templates.go Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 50e6ae0..1cf3535 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -273,7 +273,7 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnsw % Model answer: {{ .ModelAnswer }} - Evaluate and compare the above human and model answers. Respond with only the numerical score with no explaination. + Evaluate and compare the above Human answer and Model answer. Respond with only the numerical score with no explaination. Assign a score using the following 3 point scale: 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as well as the structure of the answer provided. From 2a53e47816b8feba467d49e7fb1aeaf37d0ac61f Mon Sep 17 00:00:00 2001 From: greg pereira Date: Sat, 18 May 2024 17:15:39 -0700 Subject: [PATCH 08/17] actually use the precheck scoring endpoint :sweat-smile: Signed-off-by: greg pereira --- worker/cmd/generate.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index e2349c7..ea0595b 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -255,7 +255,7 @@ func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpoint commandStr += " --tls-insecure" } if PreCheckScoringEndpointURL != localEndpoint && preCheckScoringModelName != "unknown" { - commandStr += fmt.Sprintf(" --endpoint-url %s --model %s", PreCheckEndpointURL, preCheckScoringModelName) + commandStr += fmt.Sprintf(" --endpoint-url %s --model %s", PreCheckScoringEndpointURL, preCheckScoringModelName) } cmdArgs := strings.Fields(commandStr) cmd := exec.Command(lab, cmdArgs...) 
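Note on testing: patch 05 above removes the placeholder TestGeneratePrecheckScoringPrompt from worker/cmd/generate_test.go without adding a replacement. The sketch below is only a hedged illustration of what a minimal test for the two-argument generatePrecheckScoringPrompt (as it stands at this point in the series, before the question parameter is added in patch 12) could look like; the sample answers, test name, and assertions are assumptions for illustration, not code taken from this series.

package cmd

import (
	"strings"
	"testing"
)

// Sketch only: render the scoring prompt for a hypothetical answer pair and
// check that both answers are embedded verbatim in the generated prompt.
func TestGeneratePrecheckScoringPromptSketch(t *testing.T) {
	humanAnswer := "Paris is the capital of France."
	modelAnswer := "The capital of France is Paris."

	err, prompt := generatePrecheckScoringPrompt(humanAnswer, modelAnswer)
	if err != nil {
		t.Fatalf("expected the scoring prompt to render without error, got: %v", err)
	}
	if !strings.Contains(prompt, humanAnswer) || !strings.Contains(prompt, modelAnswer) {
		t.Fatalf("rendered prompt should embed both answers, got: %q", prompt)
	}
}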
From ac9db5c44bc73b00cb1276083b856288c8195065 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Sat, 18 May 2024 17:39:15 -0700 Subject: [PATCH 09/17] write directly to output dir, no need for chat dir bc data in memory Signed-off-by: greg pereira --- worker/cmd/generate.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index ea0595b..d007554 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -227,7 +227,6 @@ func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpoint if WorkDir != "" { workDir = WorkDir } - chatlogDir := path.Join(workDir, "data", "chatlogs") combinedYAMLScoringPath := path.Join(outputDir, "combined_chatlog_scoring.yaml") type QuestionScore struct { @@ -244,7 +243,7 @@ func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpoint yamlData := QuestionScoreReport{} for i := 0; i < len(precheckPRAnswers); i++ { - err, promptTemplate := generatePrecheckScoringPrompt(precheckPRAnswers[i], precheckPRQuestions[i]) + err, promptTemplate := generatePrecheckScoringPrompt(precheckPRAnswers[i], precheckEndpointAnswers[i]) if err != nil { w.logger.Errorf("Failed to generate a prompt for precheck scorring: %v", err) return err @@ -293,7 +292,7 @@ func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpoint return err } - err = os.WriteFile(path.Join(chatlogDir, combinedYAMLScoringPath), scoringYaml, 0644) + err = os.WriteFile(combinedYAMLScoringPath, scoringYaml, 0644) if err != nil { w.logger.Errorf("Could not write chatlog to file: %v", err) return err From ac90322111f30bb5f00e0002ec4d623353794d9d Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Sat, 18 May 2024 19:02:45 -0600 Subject: [PATCH 10/17] Update templates.go Swap question target location Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 1cf3535..9f41439 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -268,11 +268,6 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string) (error, string) { promptTemplate := ` - % Human answer: - {{ .HumanAnswer }} - % Model answer: - {{ .ModelAnswer }} - Evaluate and compare the above Human answer and Model answer. Respond with only the numerical score with no explaination. Assign a score using the following 3 point scale: 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as @@ -283,6 +278,12 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnsw 3: It means there is significant variation in the answers. The two provided answers differ greatly in structure or have very different or even contridictory facts and content. 
+ + % Human answer: + {{ .HumanAnswer }} + % Model answer: + {{ .ModelAnswer }} + ` tmpl, err := template.New("modelScoring").Parse(promptTemplate) From b712bef52f0dcc41948a99467576b99c4248b4d1 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Sat, 18 May 2024 19:10:13 -0600 Subject: [PATCH 11/17] Update templates.go typo Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 9f41439..aaceac4 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -268,7 +268,7 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string) (error, string) { promptTemplate := ` - Evaluate and compare the above Human answer and Model answer. Respond with only the numerical score with no explaination. + Evaluate and compare the below Human answer and Model answer. Respond with only the numerical score with no explaination. Assign a score using the following 3 point scale: 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as well as the structure of the answer provided. From b939aabed04a3db0380b36cff1bf906105bd9a54 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Sat, 18 May 2024 19:24:03 -0600 Subject: [PATCH 12/17] Update templates.go prompt update Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index aaceac4..2dda551 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -266,23 +266,25 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc return s3Key } -func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string) (error, string) { +func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string, precheckQuestion string) (error, string) { promptTemplate := ` - Evaluate and compare the below Human answer and Model answer. Respond with only the numerical score with no explaination. + Evaluate and compare the below Human answer and Model answer when given the same question. Respond with only the numerical score with no explaination. Assign a score using the following 3 point scale: 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as - well as the structure of the answer provided. + well as the wording and details of the answer provided. - 2: It means that there is moderate variation in the answers. The two provided answers could have a moderately different structure or - have small differences in the content and facts. + 2: It means that there is moderate variation in the answers. The two provided answers could have a moderately different sentence structure + and wording, or have some differences in the content or perspective, but still share some key points. - 3: It means there is significant variation in the answers. The two provided answers differ greatly in structure or have very different - or even contridictory facts and content. + 3: It means the answers are significantly different. 
The two provided answers differ greatly in wording and perspective or have very different + or contridictory facts and content. - % Human answer: - {{ .HumanAnswer }} - % Model answer: - {{ .ModelAnswer }} + Question: + "{{ .Question }}" + Human answer: + "{{ .HumanAnswer }}" + Model answer: + "{{ .ModelAnswer }}" ` @@ -294,9 +296,11 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnsw data := struct { HumanAnswer string ModelAnswer string + Question string }{ HumanAnswer: precheckPRAnswer, ModelAnswer: precheckEndpointAnswer, + Question: precheckQuestion } var buf bytes.Buffer err = tmpl.Execute(&buf, data) From 3eda593644ab76a6b661b7972f60179ed68a2826 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Sat, 18 May 2024 19:28:31 -0600 Subject: [PATCH 13/17] Update generate.go Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/generate.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index d007554..297619e 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -243,7 +243,7 @@ func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpoint yamlData := QuestionScoreReport{} for i := 0; i < len(precheckPRAnswers); i++ { - err, promptTemplate := generatePrecheckScoringPrompt(precheckPRAnswers[i], precheckEndpointAnswers[i]) + err, promptTemplate := generatePrecheckScoringPrompt(precheckPRAnswers[i], precheckEndpointAnswers[i], precheckPRQuestions[i]) if err != nil { w.logger.Errorf("Failed to generate a prompt for precheck scorring: %v", err) return err From 01a20030cd614eca9f273fc76f6c73a869665ccb Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Sat, 18 May 2024 19:31:50 -0600 Subject: [PATCH 14/17] Update templates.go lint error Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 2dda551..f3dbbdd 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -300,7 +300,7 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnsw }{ HumanAnswer: precheckPRAnswer, ModelAnswer: precheckEndpointAnswer, - Question: precheckQuestion + Question: precheckQuestion, } var buf bytes.Buffer err = tmpl.Execute(&buf, data) From 9faef6253b58610c11c063cb00d4e24b1d053236 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Mon, 20 May 2024 13:44:26 -0600 Subject: [PATCH 15/17] Update templates.go Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index f3dbbdd..68faf25 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -268,7 +268,7 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string, precheckQuestion string) (error, string) { promptTemplate := ` - Evaluate and compare the below Human answer and Model answer when given the same question. Respond with only the numerical score with no explaination. 
+ Evaluate and compare the below ### Human answer and ### Model answer when given the same ### Question provided below. Respond with only the numerical score with no explanation. Assign a score using the following 3 point scale: 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as well as the wording and details of the answer provided. @@ -279,11 +279,11 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnsw 3: It means the answers are significantly different. The two provided answers differ greatly in wording and perspective or have very different or contridictory facts and content. - Question: + ### Question: "{{ .Question }}" - Human answer: + ### Human answer: "{{ .HumanAnswer }}" - Model answer: + ### Model answer: "{{ .ModelAnswer }}" ` From 619e4a33b677aa53a5db123342be03a6e94233dd Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Mon, 20 May 2024 14:14:06 -0600 Subject: [PATCH 16/17] Update templates.go Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 68faf25..52a2808 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -268,7 +268,8 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string, precheckQuestion string) (error, string) { promptTemplate := ` - Evaluate and compare the below ### Human answer and ### Model answer when given the same ### Question provided below. Respond with only the numerical score with no explanation. + Evaluate and compare the quality of the below ### Model answer compared to the ### Human answer when given the same ### Question provided below. + The ### Human answer is to be treated as the ground truth answer. Assign a score using the following 3 point scale: 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as well as the wording and details of the answer provided. From 5f51dc3d73af4cab132d0ce25a56d2ab2a6fe879 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Mon, 20 May 2024 14:16:20 -0600 Subject: [PATCH 17/17] Update templates.go Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 52a2808..a58b960 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -279,7 +279,7 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnsw 3: It means the answers are significantly different. The two provided answers differ greatly in wording and perspective or have very different or contridictory facts and content. - + ### Question: "{{ .Question }}" ### Human answer: