From 36c71ae3b7e1866f4ea3e40c1ed6d90a78bd751c Mon Sep 17 00:00:00 2001 From: greg pereira Date: Thu, 16 May 2024 15:34:41 -0700 Subject: [PATCH 01/17] WIP: add precheck scoring functionality Signed-off-by: greg pereira --- worker/cmd/generate.go | 166 +++++++++++++++++++++++++++-------------- 1 file changed, 112 insertions(+), 54 deletions(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index 7e4647b..ff17b5c 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -35,23 +35,24 @@ import ( ) var ( - WorkDir string - VenvDir string - PreCheckEndpointURL string - SdgEndpointURL string - NumInstructions int - GitRemote string - Origin string - GithubUsername string - GithubToken string - S3Bucket string - AWSRegion string - TlsClientCertPath string - TlsClientKeyPath string - TlsServerCaCertPath string - TlsInsecure bool - MaxSeed int - TaxonomyFolders = []string{"compositional_skills", "knowledge"} + WorkDir string + VenvDir string + PreCheckEndpointURL string + PreCheckScoringEndpointURL string + SdgEndpointURL string + NumInstructions int + GitRemote string + Origin string + GithubUsername string + GithubToken string + S3Bucket string + AWSRegion string + TlsClientCertPath string + TlsClientKeyPath string + TlsServerCaCertPath string + TlsInsecure bool + MaxSeed int + TaxonomyFolders = []string{"compositional_skills", "knowledge"} ) const ( @@ -76,35 +77,37 @@ const ( // Worker encapsulates dependencies and methods to process jobs type Worker struct { - ctx context.Context - pool *redis.Pool - svc *s3.Client - logger *zap.SugaredLogger - job string - precheckEndpoint string - sdgEndpoint string - jobStart time.Time - tlsClientCertPath string - tlsClientKeyPath string - tlsServerCaCertPath string - maxSeed int - cmdRun string + ctx context.Context + pool *redis.Pool + svc *s3.Client + logger *zap.SugaredLogger + job string + precheckEndpoint string + precheckScoringEndpoint string + sdgEndpoint string + jobStart time.Time + tlsClientCertPath string + tlsClientKeyPath string + tlsServerCaCertPath string + maxSeed int + cmdRun string } -func NewJobProcessor(ctx context.Context, pool *redis.Pool, svc *s3.Client, logger *zap.SugaredLogger, job, precheckEndpoint, sdgEndpoint, tlsClientCertPath, tlsClientKeyPath, tlsServerCaCertPath string, maxSeed int) *Worker { +func NewJobProcessor(ctx context.Context, pool *redis.Pool, svc *s3.Client, logger *zap.SugaredLogger, job, precheckEndpoint, precheckScoringEndpoint, sdgEndpoint, tlsClientCertPath, tlsClientKeyPath, tlsServerCaCertPath string, maxSeed int) *Worker { return &Worker{ - ctx: ctx, - pool: pool, - svc: svc, - logger: logger, - job: job, - precheckEndpoint: precheckEndpoint, - sdgEndpoint: sdgEndpoint, - jobStart: time.Now(), - tlsClientCertPath: tlsClientCertPath, - tlsClientKeyPath: tlsClientKeyPath, - tlsServerCaCertPath: tlsServerCaCertPath, - maxSeed: maxSeed, + ctx: ctx, + pool: pool, + svc: svc, + logger: logger, + job: job, + precheckEndpoint: precheckEndpoint, + precheckScoringEndpoint: precheckScoringEndpoint, + sdgEndpoint: sdgEndpoint, + jobStart: time.Now(), + tlsClientCertPath: tlsClientCertPath, + tlsClientKeyPath: tlsClientKeyPath, + tlsServerCaCertPath: tlsServerCaCertPath, + maxSeed: maxSeed, } } @@ -118,6 +121,7 @@ func init() { generateCmd.Flags().StringVarP(&WorkDir, "work-dir", "w", "", "Directory to work in") generateCmd.Flags().StringVarP(&VenvDir, "venv-dir", "v", "", "The virtual environment directory") generateCmd.Flags().StringVarP(&PreCheckEndpointURL, "precheck-endpoint-url", "e", 
"http://localhost:8000/v1", "Endpoint hosting the model API. Default, it assumes the model is served locally.") + generateCmd.Flags().StringVarP(&PreCheckScoringEndpointURL, "precheck-scoring-endpoint-url", "", PreCheckEndpointURL, "Endpoint hosting the model API that will be scoring the output of precheck against the answers supplied in the PR. Default, it assumes the model is the same as precheck model and is served locally.") generateCmd.Flags().StringVarP(&SdgEndpointURL, "sdg-endpoint-url", "", "http://localhost:8000/v1", "Endpoint hosting the model API. Default, it assumes the model is served locally.") generateCmd.Flags().IntVarP(&NumInstructions, "num-instructions", "n", 10, "The number of instructions to generate") generateCmd.Flags().StringVarP(&GitRemote, "git-remote", "", "https://github.com/instructlab/taxonomy", "The git remote for the taxonomy repo") @@ -190,6 +194,7 @@ var generateCmd = &cobra.Command{ } NewJobProcessor(ctx, pool, svc, sugar, job, PreCheckEndpointURL, + PreCheckScoringEndpointURL, SdgEndpointURL, TlsClientCertPath, TlsClientKeyPath, @@ -211,12 +216,50 @@ var generateCmd = &cobra.Command{ }, } +func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpointAnswers []string, lab string, outputDir string) error { + if len(precheckPRAnswers) != len(precheckEndpointAnswers) { + errMsg := "PR and BAM returned a different number of answers, something went wrong." + w.logger.Error(errMsg) + return fmt.Errorf(errMsg) + } + // 1. decide if were going to compare all PR answer and All BAM answers at once or if we go through the pairs + // 2. generate a prompt based on the following: + /* + + Please act as an impartial judge and evaluate the quality of the answer provided by an AI assistant + to the questions displayed below. Evaluate whether or not the answer is a good example of how AI + Assistant should respond to the user’s instruction. Please assign a score using the following 3-point + scale: + 1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information. + For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that + doesn’t address the user’s question or it could be incomplete and hanging. It may also include any + harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content. + 2: It means the answer provides the correct answer, but it is brief and to the point without explanations. While it directly answers the user’s question, it lacks additional context or in-depth explanations. + 3: It means the answer is a perfect answer from an AI Assistant. It intentionally addresses the user’s + question with a comprehensive and detailed explanation. It demonstrates expert knowledge in the + area, is very well written, logical, easy to follow, engaging, and insightful. And the answer is safe and + does not include any harmful content. + Begin your evaluation by providing a short explanation. Be as objective as possible. After providing + your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. Please use the + following examples as a reference for your evaluation. + + */ + // 3. format new request via CLI + // 4. Send request + // 5. recieve data back + // 6. write output to the same outDir as precheck + // 7. 
Modify generate functions to include this new special file + return nil +} + // runPrecheck runs lab chat against git diffed yaml files -func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { +func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, []string) { workDir := "." if WorkDir != "" { workDir = WorkDir } + precheckPRAnswers := []string{} + precheckEndpointAnswers := []string{} chatlogDir := path.Join(workDir, "data", "chatlogs") combinedYAMLPath := path.Join(outputDir, "combined_chatlogs.yaml") combinedLogPath := path.Join(outputDir, "combined_chatlogs.log") @@ -297,19 +340,19 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { stdout, err := cmd.StdoutPipe() if err != nil { w.logger.Errorf("Could not get stdout pipe: %v", err) - return err + return err, []string{}, []string{} } w.logger.Debug("Running ilab diff") if err := cmd.Start(); err != nil { w.logger.Errorf("Could not start command(%s %s): %v", cmd.Path, strings.Join(cmd.Args, " "), err) - return err + return err, []string{}, []string{} } output, err := io.ReadAll(stdout) if err != nil { w.logger.Errorf("Could not read stdout: %v", err) - return err + return err, []string{}, []string{} } outputStr := string(output) w.logger.Debugf("Output: %s", outputStr) @@ -327,7 +370,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { if yamlFileCount == 0 { errMsg := "No modified YAML files detected in the PR for precheck" w.logger.Error(errMsg) - return fmt.Errorf(errMsg) + return fmt.Errorf(errMsg), []string{}, []string{} } // Proceed with YAML files processing if they exist @@ -340,14 +383,14 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { f, err := os.Open(filePath) if err != nil { w.logger.Errorf("Could not open taxonomy file: %v", err) - return err + return err, []string{}, []string{} } defer f.Close() content, err := io.ReadAll(f) if err != nil { w.logger.Error(err) - return err + return err, []string{}, []string{} } var data map[string]interface{} @@ -356,15 +399,16 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { // Odds are, the PR was not yaml-linted since it's invalid YAML failing unmarshalling err = fmt.Errorf("the original taxonomy YAML likely did not pass yaml-linting, here is the unmarshalling error: %v", err) w.logger.Error(err) - return err + return err, []string{}, []string{} } // Check if "seed_examples" exists and is a list + seedExamples, ok := data["seed_examples"].([]interface{}) if !ok { err = fmt.Errorf("seed_examples not found or not a list") w.logger.Error(err) - return err + return err, []string{}, []string{} } for _, item := range seedExamples { @@ -378,6 +422,12 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { w.logger.Error("Question not found or not a string") continue } + answer, ok := example["answer"].(string) + if !ok { + w.logger.Error("Question not found or not a string") + continue + } + precheckPRAnswers = append(precheckPRAnswers, answer) context, hasContext := example["context"].(string) originalQuestion := question @@ -418,6 +468,8 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { "output": out.String(), } + precheckEndpointAnswers = append(precheckEndpointAnswers, out.String()) + if hasContext { logData["input"].(map[string]string)["context"] = context } @@ -450,7 +502,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) error { time.Sleep(1 * time.Second) } } - return nil + return nil, 
precheckPRAnswers, precheckEndpointAnswers } // processJob processes a given job, all jobs start here @@ -572,12 +624,18 @@ func (w *Worker) processJob() { case jobPreCheck: // @instructlab-bot precheck // Runs precheck on a backend node - err = w.runPrecheck(lab, outputDir, modelName) + err, precheckPRAnswers, precheckEndpointAnswers := w.runPrecheck(lab, outputDir, modelName) if err != nil { sugar.Errorf("Could not run precheck: %v", err) w.reportJobError(err) return } + err = w.runPrecheckScoring(precheckPRAnswers, precheckEndpointAnswers, lab, outputDir) + if err != nil { + sugar.Errorf("Could not run scoring on result of precheck: %v", err) + w.reportJobError(err) + return + } case jobSDG: // @instructlab-bot generate // Runs generate on the SDG backend From 03f05a385244ef4df813776dbba76858b181eeff Mon Sep 17 00:00:00 2001 From: greg pereira Date: Thu, 16 May 2024 22:27:13 -0700 Subject: [PATCH 02/17] gnerate prompt template and using input + answers vs 2 answer comparison Signed-off-by: greg pereira --- worker/cmd/generate.go | 47 +++++++++++++++---------------------- worker/cmd/generate_test.go | 6 +++++ worker/cmd/templates.go | 46 ++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 28 deletions(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index ff17b5c..f51900a 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -216,39 +216,27 @@ var generateCmd = &cobra.Command{ }, } -func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpointAnswers []string, lab string, outputDir string) error { - if len(precheckPRAnswers) != len(precheckEndpointAnswers) { - errMsg := "PR and BAM returned a different number of answers, something went wrong." +func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckPRQuestions []string, lab string, outputDir string) error { + if len(precheckPRAnswers) != len(precheckPRQuestions) { + errMsg := "PR questions and BAM answers returned a different number of entries, something went wrong." w.logger.Error(errMsg) return fmt.Errorf(errMsg) } - // 1. decide if were going to compare all PR answer and All BAM answers at once or if we go through the pairs - // 2. generate a prompt based on the following: - /* - - Please act as an impartial judge and evaluate the quality of the answer provided by an AI assistant - to the questions displayed below. Evaluate whether or not the answer is a good example of how AI - Assistant should respond to the user’s instruction. Please assign a score using the following 3-point - scale: - 1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information. - For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that - doesn’t address the user’s question or it could be incomplete and hanging. It may also include any - harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content. - 2: It means the answer provides the correct answer, but it is brief and to the point without explanations. While it directly answers the user’s question, it lacks additional context or in-depth explanations. - 3: It means the answer is a perfect answer from an AI Assistant. It intentionally addresses the user’s - question with a comprehensive and detailed explanation. It demonstrates expert knowledge in the - area, is very well written, logical, easy to follow, engaging, and insightful. And the answer is safe and - does not include any harmful content. 
- Begin your evaluation by providing a short explanation. Be as objective as possible. After providing - your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. Please use the - following examples as a reference for your evaluation. - - */ // 3. format new request via CLI // 4. Send request // 5. recieve data back // 6. write output to the same outDir as precheck // 7. Modify generate functions to include this new special file + for i := 0; i < len(precheckPRAnswers); i++ { + err, promptTemplate := generatePrecheckScoringPrompt(precheckPRAnswers[i], precheckPRQuestions[i]) + if err != nil { + w.logger.Errorf("Failed to generate a prompt for precheck scorring: %v", err) + return err + } + fmt.Print(promptTemplate) // ignoring errors for now + // SOME REQUEST TO SOME PART OF THE BAM ENDPOINT USING THE TEMPLATE + + } return nil } @@ -259,7 +247,8 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, workDir = WorkDir } precheckPRAnswers := []string{} - precheckEndpointAnswers := []string{} + // precheckEndpointAnswers := []string{} + precheckPRQuestions := []string{} chatlogDir := path.Join(workDir, "data", "chatlogs") combinedYAMLPath := path.Join(outputDir, "combined_chatlogs.yaml") combinedLogPath := path.Join(outputDir, "combined_chatlogs.log") @@ -468,7 +457,8 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, "output": out.String(), } - precheckEndpointAnswers = append(precheckEndpointAnswers, out.String()) + // precheckEndpointAnswers = append(precheckEndpointAnswers, out.String()) + precheckPRQuestions = append(precheckPRQuestions, originalQuestion) if hasContext { logData["input"].(map[string]string)["context"] = context @@ -502,7 +492,8 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, time.Sleep(1 * time.Second) } } - return nil, precheckPRAnswers, precheckEndpointAnswers + // return nil, precheckPRAnswers, precheckEndpointAnswers + return nil, precheckPRAnswers, precheckPRQuestions } // processJob processes a given job, all jobs start here diff --git a/worker/cmd/generate_test.go b/worker/cmd/generate_test.go index 6102c18..4a2f635 100644 --- a/worker/cmd/generate_test.go +++ b/worker/cmd/generate_test.go @@ -153,6 +153,7 @@ func TestFetchModelName(t *testing.T) { zap.NewExample().Sugar(), "job-id", mockServer.URL, + mockServer.URL, "http://sdg-example.com", "dummy-client-cert-path.pem", "dummy-client-key-path.pem", @@ -214,6 +215,7 @@ func TestFetchModelNameWithInvalidObject(t *testing.T) { zap.NewExample().Sugar(), "job-id", mockServer.URL, + mockServer.URL, "http://sdg-example.com", "dummy-client-cert-path.pem", "dummy-client-key-path.pem", @@ -232,3 +234,7 @@ func normalizeHTML(input string) string { compacted := regexp.MustCompile(`\s+`).ReplaceAllString(input, " ") return regexp.MustCompile(`>\s+<`).ReplaceAllString(compacted, "><") } + +func TestGeneratePrecheckScoringPrompt(t *testing.T) { + // NEEDS TO BE IMPLEMENTED +} diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 43b4946..008b91f 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -1,6 +1,7 @@ package cmd import ( + "bytes" "context" "encoding/json" "fmt" @@ -264,3 +265,48 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc return s3Key } + +func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckPRQuestion string) (error, string) { + promptTemplate := ` + Please act as an impartial judge and evaluate the quality 
of the answer provided by an AI assistant + to the questions displayed below. Evaluate whether or not the answer is a good example of how AI + Assistant should respond to the user’s instruction. Please assign a score using the following 3-point + scale: + 1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information. + For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that + doesn’t address the user’s question or it could be incomplete and hanging. It may also include any + harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content. + 2: It means the answer provides the correct answer, but it is brief and to the point without explanations. + While it directly answers the user’s question, it lacks additional context or in-depth explanations. + 3: It means the answer is an exceptional answer from an AI Assistant. It intentionally addresses the user’s + question with a comprehensive and detailed explanation. It demonstrates expert knowledge in the + area, is very well written, logical, easy to follow, engaging, and insightful. And the answer is safe and + does not include any harmful content. + Begin your evaluation by providing a short explanation. Be as objective as possible. After providing + your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. Please use the + following example as a reference for your evaluation. + % Input Question: + {{ .Question }} + % Model Output: + {{ .Answer }} + ` + + tmpl, err := template.New("modelScoring").Parse(promptTemplate) + if err != nil { + return fmt.Errorf("error parsing modelScoring prompt template: %w", err), "" + } + + data := struct { + Question string + Answer string + }{ + Question: precheckPRQuestion, + Answer: precheckPRAnswer, + } + var buf bytes.Buffer + err = tmpl.Execute(&buf, data) + if err != nil { + return fmt.Errorf("error executing modelScoring prompt template: %w", err), "" + } + return nil, buf.String() +} From 5d8d28a2ddfe12a3582e25cd277dc7ee61784c59 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Fri, 17 May 2024 17:41:05 -0700 Subject: [PATCH 03/17] comparing using both answers Signed-off-by: greg pereira --- worker/cmd/generate.go | 112 +++++++++++++++++++++++++++++++--------- worker/cmd/templates.go | 20 +++---- 2 files changed, 97 insertions(+), 35 deletions(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index f51900a..3567d69 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -216,38 +216,100 @@ var generateCmd = &cobra.Command{ }, } -func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckPRQuestions []string, lab string, outputDir string) error { - if len(precheckPRAnswers) != len(precheckPRQuestions) { - errMsg := "PR questions and BAM answers returned a different number of entries, something went wrong." +func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpointAnswers []string, precheckPRQuestions []string, lab string, outputDir string, preCheckScoringModelName string) error { + if len(precheckPRAnswers) != len(precheckEndpointAnswers) { + errMsg := "PR questions a Endpoint answers returned a different number of entries, something went wrong." w.logger.Error(errMsg) return fmt.Errorf(errMsg) } - // 3. format new request via CLI - // 4. Send request - // 5. recieve data back - // 6. write output to the same outDir as precheck - // 7. 
Modify generate functions to include this new special file + + workDir := "." + if WorkDir != "" { + workDir = WorkDir + } + chatlogDir := path.Join(workDir, "data", "chatlogs") + combinedYAMLScoringPath := path.Join(outputDir, "combined_chatlog_scoring.yaml") + + type QuestionScore struct { + Question string + HumanAnswer string + EndpointAnswer string + Score string + } + + type QuestionScoreReport struct { + RunTime string + QuestionScores []QuestionScore + } + + yamlData := QuestionScoreReport{} for i := 0; i < len(precheckPRAnswers); i++ { err, promptTemplate := generatePrecheckScoringPrompt(precheckPRAnswers[i], precheckPRQuestions[i]) if err != nil { w.logger.Errorf("Failed to generate a prompt for precheck scorring: %v", err) return err } - fmt.Print(promptTemplate) // ignoring errors for now - // SOME REQUEST TO SOME PART OF THE BAM ENDPOINT USING THE TEMPLATE + commandStr := fmt.Sprintf("chat --quick-question %s", promptTemplate) + if TlsInsecure { + commandStr += " --tls-insecure" + } + if PreCheckScoringEndpointURL != localEndpoint && preCheckScoringModelName != "unknown" { + commandStr += fmt.Sprintf(" --endpoint-url %s --model %s", PreCheckEndpointURL, preCheckScoringModelName) + } + cmdArgs := strings.Fields(commandStr) + cmd := exec.Command(lab, cmdArgs...) + // Register the command for reporting/logging + w.cmdRun = cmd.String() + w.logger.Infof("Running the precheck scoring command: %s", cmd.String()) + + cmd.Dir = workDir + cmd.Env = os.Environ() + var out bytes.Buffer + var errOut bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = &errOut + err = cmd.Run() + if err != nil { + w.logger.Errorf("Precheck scoring command failed with error: %v; stderr: %s", err, errOut.String()) + continue + } + + questionScore := QuestionScore{ + Question: precheckPRQuestions[i], + HumanAnswer: precheckPRAnswers[i], + EndpointAnswer: precheckEndpointAnswers[i], + Score: out.String(), + } + yamlData.QuestionScores = append(yamlData.QuestionScores, questionScore) + + } + + yamlData.RunTime = time.Now().Format("2006-01-02T15_04_05") + + scoringYaml, err := yaml.Marshal(yamlData) + if err != nil { + w.logger.Errorf("Could not marshal scoring data to YAML: %v", err) + return err } + + err = os.WriteFile(path.Join(chatlogDir, combinedYAMLScoringPath), scoringYaml, 0644) + if err != nil { + w.logger.Errorf("Could not write chatlog to file: %v", err) + return err + } + return nil } // runPrecheck runs lab chat against git diffed yaml files -func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, []string) { +func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, []string, []string) { workDir := "." 
if WorkDir != "" { workDir = WorkDir } precheckPRAnswers := []string{} - // precheckEndpointAnswers := []string{} + precheckEndpointAnswers := []string{} precheckPRQuestions := []string{} chatlogDir := path.Join(workDir, "data", "chatlogs") combinedYAMLPath := path.Join(outputDir, "combined_chatlogs.yaml") @@ -329,19 +391,19 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, stdout, err := cmd.StdoutPipe() if err != nil { w.logger.Errorf("Could not get stdout pipe: %v", err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } w.logger.Debug("Running ilab diff") if err := cmd.Start(); err != nil { w.logger.Errorf("Could not start command(%s %s): %v", cmd.Path, strings.Join(cmd.Args, " "), err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } output, err := io.ReadAll(stdout) if err != nil { w.logger.Errorf("Could not read stdout: %v", err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } outputStr := string(output) w.logger.Debugf("Output: %s", outputStr) @@ -359,7 +421,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, if yamlFileCount == 0 { errMsg := "No modified YAML files detected in the PR for precheck" w.logger.Error(errMsg) - return fmt.Errorf(errMsg), []string{}, []string{} + return fmt.Errorf(errMsg), []string{}, []string{}, []string{} } // Proceed with YAML files processing if they exist @@ -372,14 +434,14 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, f, err := os.Open(filePath) if err != nil { w.logger.Errorf("Could not open taxonomy file: %v", err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } defer f.Close() content, err := io.ReadAll(f) if err != nil { w.logger.Error(err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } var data map[string]interface{} @@ -388,7 +450,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, // Odds are, the PR was not yaml-linted since it's invalid YAML failing unmarshalling err = fmt.Errorf("the original taxonomy YAML likely did not pass yaml-linting, here is the unmarshalling error: %v", err) w.logger.Error(err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } // Check if "seed_examples" exists and is a list @@ -397,7 +459,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, if !ok { err = fmt.Errorf("seed_examples not found or not a list") w.logger.Error(err) - return err, []string{}, []string{} + return err, []string{}, []string{}, []string{} } for _, item := range seedExamples { @@ -457,7 +519,7 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, "output": out.String(), } - // precheckEndpointAnswers = append(precheckEndpointAnswers, out.String()) + precheckEndpointAnswers = append(precheckEndpointAnswers, out.String()) precheckPRQuestions = append(precheckPRQuestions, originalQuestion) if hasContext { @@ -492,8 +554,8 @@ func (w *Worker) runPrecheck(lab, outputDir, modelName string) (error, []string, time.Sleep(1 * time.Second) } } - // return nil, precheckPRAnswers, precheckEndpointAnswers - return nil, precheckPRAnswers, precheckPRQuestions + return nil, precheckPRAnswers, precheckEndpointAnswers, precheckPRQuestions + // return nil, precheckPRAnswers, precheckPRQuestions } // processJob processes a given job, 
all jobs start here @@ -615,13 +677,13 @@ func (w *Worker) processJob() { case jobPreCheck: // @instructlab-bot precheck // Runs precheck on a backend node - err, precheckPRAnswers, precheckEndpointAnswers := w.runPrecheck(lab, outputDir, modelName) + err, precheckPRAnswers, precheckEndpointAnswers, precheckPRQuestions := w.runPrecheck(lab, outputDir, modelName) if err != nil { sugar.Errorf("Could not run precheck: %v", err) w.reportJobError(err) return } - err = w.runPrecheckScoring(precheckPRAnswers, precheckEndpointAnswers, lab, outputDir) + err = w.runPrecheckScoring(precheckPRAnswers, precheckEndpointAnswers, precheckPRQuestions, lab, outputDir, modelName) if err != nil { sugar.Errorf("Could not run scoring on result of precheck: %v", err) w.reportJobError(err) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 008b91f..1cfdee6 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -266,11 +266,11 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc return s3Key } -func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckPRQuestion string) (error, string) { +func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string) (error, string) { promptTemplate := ` Please act as an impartial judge and evaluate the quality of the answer provided by an AI assistant to the questions displayed below. Evaluate whether or not the answer is a good example of how AI - Assistant should respond to the user’s instruction. Please assign a score using the following 3-point + Assistant as compared to a correct, human provided answer. Please assign a score using the following 3-point scale: 1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information. For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that @@ -285,10 +285,10 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckPRQuestion s Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. Please use the following example as a reference for your evaluation. 
- % Input Question: - {{ .Question }} - % Model Output: - {{ .Answer }} + % Human answer: + {{ .HumanAnswer }} + % Model answer: + {{ .ModelAnswer }} ` tmpl, err := template.New("modelScoring").Parse(promptTemplate) @@ -297,11 +297,11 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckPRQuestion s } data := struct { - Question string - Answer string + HumanAnswer string + ModelAnswer string }{ - Question: precheckPRQuestion, - Answer: precheckPRAnswer, + HumanAnswer: precheckPRAnswer, + ModelAnswer: precheckEndpointAnswer, } var buf bytes.Buffer err = tmpl.Execute(&buf, data) From e20843cc5bec913df66c1778628ca411fb04a137 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Fri, 17 May 2024 17:48:43 -0700 Subject: [PATCH 04/17] rework fetchModelName to work by endpoint this change allows us to use different model names for the precheckEndpoint and precheckScoringEndpoint Signed-off-by: greg pereira --- ui/apiserver/apiserver.go | 5 ++--- worker/cmd/generate.go | 25 +++++++++++++++++++------ worker/cmd/generate_test.go | 6 +++--- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/ui/apiserver/apiserver.go b/ui/apiserver/apiserver.go index 2ea8d06..6de1357 100644 --- a/ui/apiserver/apiserver.go +++ b/ui/apiserver/apiserver.go @@ -336,7 +336,7 @@ func (api *ApiServer) runIlabChatCommand(question, context string) (string, erro cmd = exec.Command("echo", cmdArgs...) api.logger.Infof("Running in test mode: %s", commandStr) } else { - modelName, err := api.fetchModelName(true) + modelName, err := api.fetchModelName(true, api.preCheckEndpointURL) if err != nil { api.logger.Errorf("Failed to fetch model name: %v", err) return "failed to retrieve the model name", err @@ -382,9 +382,8 @@ func setupLogger(debugMode bool) *zap.SugaredLogger { // fetchModelName hits the defined precheck endpoint with "/models" appended to extract the model name. // If fullName is true, it returns the entire ID value; if false, it returns the parsed out name after the double hyphens. -func (api *ApiServer) fetchModelName(fullName bool) (string, error) { +func (api *ApiServer) fetchModelName(fullName bool, endpoint string) (string, error) { // Ensure the endpoint URL ends with "/models" - endpoint := api.preCheckEndpointURL if !strings.HasSuffix(endpoint, "/") { endpoint += "/" } diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index 3567d69..181d074 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -218,7 +218,7 @@ var generateCmd = &cobra.Command{ func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpointAnswers []string, precheckPRQuestions []string, lab string, outputDir string, preCheckScoringModelName string) error { if len(precheckPRAnswers) != len(precheckEndpointAnswers) { - errMsg := "PR questions a Endpoint answers returned a different number of entries, something went wrong." 
+ errMsg := "PR answers and Endpoint answers returned a different number of entries, something went wrong" w.logger.Error(errMsg) return fmt.Errorf(errMsg) } @@ -638,7 +638,7 @@ func (w *Worker) processJob() { // sdg-svc does not have a models endpoint as yet if jobType != jobSDG && PreCheckEndpointURL != localEndpoint { var err error - modelName, err = w.fetchModelName(true) + modelName, err = w.fetchModelName(true, w.precheckEndpoint) if err != nil { w.logger.Errorf("Failed to fetch model name: %v", err) modelName = "unknown" @@ -683,7 +683,21 @@ func (w *Worker) processJob() { w.reportJobError(err) return } - err = w.runPrecheckScoring(precheckPRAnswers, precheckEndpointAnswers, precheckPRQuestions, lab, outputDir, modelName) + + var scoringModelName string + // sdg-svc does not have a models endpoint as yet + if jobType == jobPreCheck && w.precheckScoringEndpoint != localEndpoint { + var err error + scoringModelName, err = w.fetchModelName(true, w.precheckScoringEndpoint) + if err != nil { + w.logger.Errorf("Failed to fetch model name: %v", err) + scoringModelName = "unknown" + } + } else { + scoringModelName = w.getModelNameFromConfig() // will default to standard precheck model + } + + err = w.runPrecheckScoring(precheckPRAnswers, precheckEndpointAnswers, precheckPRQuestions, lab, outputDir, scoringModelName) if err != nil { sugar.Errorf("Could not run scoring on result of precheck: %v", err) w.reportJobError(err) @@ -975,9 +989,8 @@ func (w *Worker) getModelNameFromConfig() string { // fetchModelName hits the defined precheckEndpoint with "/models" appended to extract the model name. // If fullName is true, it returns the entire ID value; if false, it returns the parsed out name after the double hyphens. -func (w *Worker) fetchModelName(fullName bool) (string, error) { +func (w *Worker) fetchModelName(fullName bool, endpoint string) (string, error) { // Ensure the endpoint URL ends with "/models" - endpoint := w.precheckEndpoint if !strings.HasSuffix(endpoint, "/") { endpoint += "/" } @@ -1073,7 +1086,7 @@ func (w *Worker) determineModelName(jobType string) string { // precheck is the only case we use a remote OpenAI endpoint right now if PreCheckEndpointURL != localEndpoint && jobType == jobPreCheck { - modelName, err := w.fetchModelName(false) + modelName, err := w.fetchModelName(false, w.precheckEndpoint) if err != nil { w.logger.Errorf("Failed to fetch model name: %v", err) return "unknown" diff --git a/worker/cmd/generate_test.go b/worker/cmd/generate_test.go index 4a2f635..061e80d 100644 --- a/worker/cmd/generate_test.go +++ b/worker/cmd/generate_test.go @@ -161,12 +161,12 @@ func TestFetchModelName(t *testing.T) { 20, ) - modelName, err := w.fetchModelName(false) + modelName, err := w.fetchModelName(false, w.precheckEndpoint) assert.NoError(t, err, "fetchModelName should not return an error") expectedModelName := "Mixtral-8x7B-Instruct-v0.1" assert.Equal(t, expectedModelName, modelName, "The model name should be extracted correctly") - modelName, err = w.fetchModelName(true) + modelName, err = w.fetchModelName(true, w.precheckEndpoint) assert.NoError(t, err, "fetchModelName should not return an error") expectedModelName = "/shared_model_storage/transformers_cache/models--mistralai--Mixtral-8x7B-Instruct-v0.1/snapshots/5c79a376139be989ef1838f360bf4f1f256d7aec" assert.Equal(t, expectedModelName, modelName, "The model name should be extracted correctly") @@ -222,7 +222,7 @@ func TestFetchModelNameWithInvalidObject(t *testing.T) { "dummy-ca-cert-path.pem", 20, ) - modelName, err 
:= w.fetchModelName(false) + modelName, err := w.fetchModelName(false, w.precheckEndpoint) // Verify that an error was returned due to the invalid "object" field assert.Error(t, err, "fetchModelName should return an error for invalid object field") From 4a4a95f82198de55e66109798cfd1fde446cb3c3 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Fri, 17 May 2024 18:31:38 -0700 Subject: [PATCH 05/17] removing generatePrecheckScoringPrompt test Signed-off-by: greg pereira --- worker/cmd/generate.go | 2 +- worker/cmd/generate_test.go | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index 181d074..e2349c7 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -685,7 +685,7 @@ func (w *Worker) processJob() { } var scoringModelName string - // sdg-svc does not have a models endpoint as yet + if jobType == jobPreCheck && w.precheckScoringEndpoint != localEndpoint { var err error scoringModelName, err = w.fetchModelName(true, w.precheckScoringEndpoint) diff --git a/worker/cmd/generate_test.go b/worker/cmd/generate_test.go index 061e80d..53f6bb0 100644 --- a/worker/cmd/generate_test.go +++ b/worker/cmd/generate_test.go @@ -234,7 +234,3 @@ func normalizeHTML(input string) string { compacted := regexp.MustCompile(`\s+`).ReplaceAllString(input, " ") return regexp.MustCompile(`>\s+<`).ReplaceAllString(compacted, "><") } - -func TestGeneratePrecheckScoringPrompt(t *testing.T) { - // NEEDS TO BE IMPLEMENTED -} From 7b41dade38d9be56ebbe3d247cf067d5e97817e2 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Fri, 17 May 2024 21:29:12 -0600 Subject: [PATCH 06/17] Update templates.go update scoring prompt Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 1cfdee6..50e6ae0 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -268,27 +268,21 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string) (error, string) { promptTemplate := ` - Please act as an impartial judge and evaluate the quality of the answer provided by an AI assistant - to the questions displayed below. Evaluate whether or not the answer is a good example of how AI - Assistant as compared to a correct, human provided answer. Please assign a score using the following 3-point - scale: - 1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information. - For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that - doesn’t address the user’s question or it could be incomplete and hanging. It may also include any - harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content. - 2: It means the answer provides the correct answer, but it is brief and to the point without explanations. - While it directly answers the user’s question, it lacks additional context or in-depth explanations. - 3: It means the answer is an exceptional answer from an AI Assistant. It intentionally addresses the user’s - question with a comprehensive and detailed explanation. It demonstrates expert knowledge in the - area, is very well written, logical, easy to follow, engaging, and insightful. 
And the answer is safe and - does not include any harmful content. - Begin your evaluation by providing a short explanation. Be as objective as possible. After providing - your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above. Please use the - following example as a reference for your evaluation. % Human answer: {{ .HumanAnswer }} % Model answer: {{ .ModelAnswer }} + + Evaluate and compare the above human and model answers. Respond with only the numerical score with no explaination. + Assign a score using the following 3 point scale: + 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as + well as the structure of the answer provided. + + 2: It means that there is moderate variation in the answers. The two provided answers could have a moderately different structure or + have small differences in the content and facts. + + 3: It means there is significant variation in the answers. The two provided answers differ greatly in structure or have very different + or even contridictory facts and content. ` tmpl, err := template.New("modelScoring").Parse(promptTemplate) From acaa095fb38868f53ca3fc92f3327825f895b821 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Fri, 17 May 2024 21:30:15 -0600 Subject: [PATCH 07/17] Update templates.go Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 50e6ae0..1cf3535 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -273,7 +273,7 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnsw % Model answer: {{ .ModelAnswer }} - Evaluate and compare the above human and model answers. Respond with only the numerical score with no explaination. + Evaluate and compare the above Human answer and Model answer. Respond with only the numerical score with no explaination. Assign a score using the following 3 point scale: 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as well as the structure of the answer provided. From 2a53e47816b8feba467d49e7fb1aeaf37d0ac61f Mon Sep 17 00:00:00 2001 From: greg pereira Date: Sat, 18 May 2024 17:15:39 -0700 Subject: [PATCH 08/17] actually use the precheck scoring endpoint :sweat-smile: Signed-off-by: greg pereira --- worker/cmd/generate.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index e2349c7..ea0595b 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -255,7 +255,7 @@ func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpoint commandStr += " --tls-insecure" } if PreCheckScoringEndpointURL != localEndpoint && preCheckScoringModelName != "unknown" { - commandStr += fmt.Sprintf(" --endpoint-url %s --model %s", PreCheckEndpointURL, preCheckScoringModelName) + commandStr += fmt.Sprintf(" --endpoint-url %s --model %s", PreCheckScoringEndpointURL, preCheckScoringModelName) } cmdArgs := strings.Fields(commandStr) cmd := exec.Command(lab, cmdArgs...) 
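Note on testing: patch 05 above removes the placeholder TestGeneratePrecheckScoringPrompt from worker/cmd/generate_test.go without adding a replacement. The sketch below is only a hedged illustration of what a minimal test for the two-argument generatePrecheckScoringPrompt (as it stands at this point in the series, before the question parameter is added in patch 12) could look like; the sample answers, test name, and assertions are assumptions for illustration, not code taken from this series.

package cmd

import (
	"strings"
	"testing"
)

// Sketch only: render the scoring prompt for a hypothetical answer pair and
// check that both answers are embedded verbatim in the generated prompt.
func TestGeneratePrecheckScoringPromptSketch(t *testing.T) {
	humanAnswer := "Paris is the capital of France."
	modelAnswer := "The capital of France is Paris."

	err, prompt := generatePrecheckScoringPrompt(humanAnswer, modelAnswer)
	if err != nil {
		t.Fatalf("expected the scoring prompt to render without error, got: %v", err)
	}
	if !strings.Contains(prompt, humanAnswer) || !strings.Contains(prompt, modelAnswer) {
		t.Fatalf("rendered prompt should embed both answers, got: %q", prompt)
	}
}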
From ac9db5c44bc73b00cb1276083b856288c8195065 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Sat, 18 May 2024 17:39:15 -0700 Subject: [PATCH 09/17] write directly to output dir, no need for chat dir bc data in memory Signed-off-by: greg pereira --- worker/cmd/generate.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index ea0595b..d007554 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -227,7 +227,6 @@ func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpoint if WorkDir != "" { workDir = WorkDir } - chatlogDir := path.Join(workDir, "data", "chatlogs") combinedYAMLScoringPath := path.Join(outputDir, "combined_chatlog_scoring.yaml") type QuestionScore struct { @@ -244,7 +243,7 @@ func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpoint yamlData := QuestionScoreReport{} for i := 0; i < len(precheckPRAnswers); i++ { - err, promptTemplate := generatePrecheckScoringPrompt(precheckPRAnswers[i], precheckPRQuestions[i]) + err, promptTemplate := generatePrecheckScoringPrompt(precheckPRAnswers[i], precheckEndpointAnswers[i]) if err != nil { w.logger.Errorf("Failed to generate a prompt for precheck scorring: %v", err) return err @@ -293,7 +292,7 @@ func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpoint return err } - err = os.WriteFile(path.Join(chatlogDir, combinedYAMLScoringPath), scoringYaml, 0644) + err = os.WriteFile(combinedYAMLScoringPath, scoringYaml, 0644) if err != nil { w.logger.Errorf("Could not write chatlog to file: %v", err) return err From ac90322111f30bb5f00e0002ec4d623353794d9d Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Sat, 18 May 2024 19:02:45 -0600 Subject: [PATCH 10/17] Update templates.go Swap question target location Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 1cf3535..9f41439 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -268,11 +268,6 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string) (error, string) { promptTemplate := ` - % Human answer: - {{ .HumanAnswer }} - % Model answer: - {{ .ModelAnswer }} - Evaluate and compare the above Human answer and Model answer. Respond with only the numerical score with no explaination. Assign a score using the following 3 point scale: 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as @@ -283,6 +278,12 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnsw 3: It means there is significant variation in the answers. The two provided answers differ greatly in structure or have very different or even contridictory facts and content. 
+ + % Human answer: + {{ .HumanAnswer }} + % Model answer: + {{ .ModelAnswer }} + ` tmpl, err := template.New("modelScoring").Parse(promptTemplate) From b712bef52f0dcc41948a99467576b99c4248b4d1 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Sat, 18 May 2024 19:10:13 -0600 Subject: [PATCH 11/17] Update templates.go typo Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 9f41439..aaceac4 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -268,7 +268,7 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string) (error, string) { promptTemplate := ` - Evaluate and compare the above Human answer and Model answer. Respond with only the numerical score with no explaination. + Evaluate and compare the below Human answer and Model answer. Respond with only the numerical score with no explaination. Assign a score using the following 3 point scale: 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as well as the structure of the answer provided. From b939aabed04a3db0380b36cff1bf906105bd9a54 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Sat, 18 May 2024 19:24:03 -0600 Subject: [PATCH 12/17] Update templates.go prompt update Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index aaceac4..2dda551 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -266,23 +266,25 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc return s3Key } -func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string) (error, string) { +func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string, precheckQuestion string) (error, string) { promptTemplate := ` - Evaluate and compare the below Human answer and Model answer. Respond with only the numerical score with no explaination. + Evaluate and compare the below Human answer and Model answer when given the same question. Respond with only the numerical score with no explaination. Assign a score using the following 3 point scale: 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as - well as the structure of the answer provided. + well as the wording and details of the answer provided. - 2: It means that there is moderate variation in the answers. The two provided answers could have a moderately different structure or - have small differences in the content and facts. + 2: It means that there is moderate variation in the answers. The two provided answers could have a moderately different sentence structure + and wording, or have some differences in the content or perspective, but still share some key points. - 3: It means there is significant variation in the answers. The two provided answers differ greatly in structure or have very different - or even contridictory facts and content. + 3: It means the answers are significantly different. 
The two provided answers differ greatly in wording and perspective or have very different + or contridictory facts and content. - % Human answer: - {{ .HumanAnswer }} - % Model answer: - {{ .ModelAnswer }} + Question: + "{{ .Question }}" + Human answer: + "{{ .HumanAnswer }}" + Model answer: + "{{ .ModelAnswer }}" ` @@ -294,9 +296,11 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnsw data := struct { HumanAnswer string ModelAnswer string + Question string }{ HumanAnswer: precheckPRAnswer, ModelAnswer: precheckEndpointAnswer, + Question: precheckQuestion } var buf bytes.Buffer err = tmpl.Execute(&buf, data) From 3eda593644ab76a6b661b7972f60179ed68a2826 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Sat, 18 May 2024 19:28:31 -0600 Subject: [PATCH 13/17] Update generate.go Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/generate.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/cmd/generate.go b/worker/cmd/generate.go index d007554..297619e 100644 --- a/worker/cmd/generate.go +++ b/worker/cmd/generate.go @@ -243,7 +243,7 @@ func (w *Worker) runPrecheckScoring(precheckPRAnswers []string, precheckEndpoint yamlData := QuestionScoreReport{} for i := 0; i < len(precheckPRAnswers); i++ { - err, promptTemplate := generatePrecheckScoringPrompt(precheckPRAnswers[i], precheckEndpointAnswers[i]) + err, promptTemplate := generatePrecheckScoringPrompt(precheckPRAnswers[i], precheckEndpointAnswers[i], precheckPRQuestions[i]) if err != nil { w.logger.Errorf("Failed to generate a prompt for precheck scorring: %v", err) return err From 01a20030cd614eca9f273fc76f6c73a869665ccb Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Sat, 18 May 2024 19:31:50 -0600 Subject: [PATCH 14/17] Update templates.go lint error Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 2dda551..f3dbbdd 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -300,7 +300,7 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnsw }{ HumanAnswer: precheckPRAnswer, ModelAnswer: precheckEndpointAnswer, - Question: precheckQuestion + Question: precheckQuestion, } var buf bytes.Buffer err = tmpl.Execute(&buf, data) From 9faef6253b58610c11c063cb00d4e24b1d053236 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Mon, 20 May 2024 13:44:26 -0600 Subject: [PATCH 15/17] Update templates.go Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index f3dbbdd..68faf25 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -268,7 +268,7 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string, precheckQuestion string) (error, string) { promptTemplate := ` - Evaluate and compare the below Human answer and Model answer when given the same question. Respond with only the numerical score with no explaination. 
+ Evaluate and compare the below ### Human answer and ### Model answer when given the same ### Question provided below. Respond with only the numerical score with no explanation. Assign a score using the following 3 point scale: 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as well as the wording and details of the answer provided. @@ -279,11 +279,11 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnsw 3: It means the answers are significantly different. The two provided answers differ greatly in wording and perspective or have very different or contridictory facts and content. - Question: + ### Question: "{{ .Question }}" - Human answer: + ### Human answer: "{{ .HumanAnswer }}" - Model answer: + ### Model answer: "{{ .ModelAnswer }}" ` From 619e4a33b677aa53a5db123342be03a6e94233dd Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Mon, 20 May 2024 14:14:06 -0600 Subject: [PATCH 16/17] Update templates.go Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 68faf25..52a2808 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -268,7 +268,8 @@ func generateFormattedYAML(ctx context.Context, outputDir, filename string, svc func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnswer string, precheckQuestion string) (error, string) { promptTemplate := ` - Evaluate and compare the below ### Human answer and ### Model answer when given the same ### Question provided below. Respond with only the numerical score with no explanation. + Evaluate and compare the quality of the below ### Model answer compared to the ### Human answer when given the same ### Question provided below. + The ### Human answer is to be treated as the ground truth answer. Assign a score using the following 3 point scale: 1: It means that the answers are identical or nearly identical, based on both the content of the two provided answers as well as the wording and details of the answer provided. From 5f51dc3d73af4cab132d0ce25a56d2ab2a6fe879 Mon Sep 17 00:00:00 2001 From: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> Date: Mon, 20 May 2024 14:16:20 -0600 Subject: [PATCH 17/17] Update templates.go Signed-off-by: Mingxuan Zhao <43148277+mingxzhao@users.noreply.github.com> --- worker/cmd/templates.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/cmd/templates.go b/worker/cmd/templates.go index 52a2808..a58b960 100644 --- a/worker/cmd/templates.go +++ b/worker/cmd/templates.go @@ -279,7 +279,7 @@ func generatePrecheckScoringPrompt(precheckPRAnswer string, precheckEndpointAnsw 3: It means the answers are significantly different. The two provided answers differ greatly in wording and perspective or have very different or contridictory facts and content. - + ### Question: "{{ .Question }}" ### Human answer: