diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 40c5778..de00ae9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -76,7 +76,9 @@ jobs: if_no_artifact_found: warn - name: Evaluate - run: go test -json -run TestEval ./... | evals | tee evals.txt >> $GITHUB_STEP_SUMMARY + run: | + go test -run TestEval ./... + evals | tee evals.txt >> $GITHUB_STEP_SUMMARY - name: Upload evals.db uses: actions/upload-artifact@v4 diff --git a/.gitignore b/.gitignore index 2ff9792..f25d53a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /.env*.local /cover.out -evals.db +/evals.db +/evals.jsonl diff --git a/Makefile b/Makefile index ffb5730..4b9291b 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,8 @@ cover: .PHONY: evaluate evaluate: - @go test -json -run TestEval ./... | evals + go test -run TestEval ./... + evals | glow .PHONY: lint lint: diff --git a/eval/run.go b/eval/run.go index 22e6568..9478fb3 100644 --- a/eval/run.go +++ b/eval/run.go @@ -3,7 +3,10 @@ package eval import ( "encoding/json" "os" + "path" + "path/filepath" "strings" + "sync" "testing" "time" ) @@ -72,36 +75,82 @@ func (e *E) Score(s Sample, scorer Scorer) Result { } type logLine struct { + Name string Sample Sample Result Result Duration time.Duration } -const ( - startDelimiter = "EVALRESULT🌜" - endDelimiter = "🌛EVALRESULT" -) +var evalsFileLock sync.Mutex +var evalsFileOnce sync.Once -// Log a [Sample] and [Result]. +// Log a [Sample] and [Result] to evals.jsonl. // This effectively logs the eval name, sample, and result, along with timing information. // TODO include token information? 
func (e *E) Log(s Sample, r Result) { e.T.Helper() l := logLine{ + Name: e.T.Name(), Sample: s, Result: r, Duration: time.Since(e.start), } - e.T.Log(startDelimiter + mustJSON(l) + endDelimiter) + e.T.Logf("%+v", l) + + evalsFileLock.Lock() + defer evalsFileLock.Unlock() + + dir := findProjectRoot(e.T) + path := path.Join(dir, "evals.jsonl") + + evalsFileOnce.Do(func() { + if err := os.Remove(path); err != nil && !os.IsNotExist(err) { + e.T.Fatal(err) + } + }) + + f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) + if err != nil { + e.T.Fatal(err) + } + defer func() { + _ = f.Close() + }() + + if _, err := f.Write(mustJSON(l)); err != nil { + e.T.Fatal(err) + } } -func mustJSON(l logLine) string { +func mustJSON(l logLine) []byte { b, err := json.Marshal(l) if err != nil { panic(err) } + b = append(b, '\n') - return string(b) + return b +} + +func findProjectRoot(t *testing.T) string { + t.Helper() + + dir, err := os.Getwd() + if err != nil { + t.Fatal(err) + } + + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + + parent := filepath.Dir(dir) + if parent == dir { + t.Fatal("could not find go.mod file") + } + dir = parent + } } diff --git a/go.mod b/go.mod index 430da91..80525d9 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/openai/openai-go v0.1.0-alpha.45 google.golang.org/api v0.216.0 maragu.dev/env v0.2.0 - maragu.dev/evals v0.0.0-20250110140605-d045751e4b4d + maragu.dev/evals v0.0.0-20250114114008-6c73fea1551c maragu.dev/is v0.2.0 ) diff --git a/go.sum b/go.sum index 80fde73..af1ad63 100644 --- a/go.sum +++ b/go.sum @@ -128,8 +128,8 @@ maragu.dev/env v0.2.0 h1:nQKitDEB65ArZsh6E7vxzodOqY9bxEVFdBg+tskS1ys= maragu.dev/env v0.2.0/go.mod h1:t5CCbaEnjCM5mewiAVVzTS4N+oXTus2+SRnzKQbQVME= maragu.dev/errors v0.3.0 h1:huI+n+ddMfVgQFD+cEqIPaozUlfz3TkfgpkssNip5G0= maragu.dev/errors v0.3.0/go.mod h1:cygLiyNnq4ofF3whYscilo2ecUADCaUQXwvwFrMOhmM= -maragu.dev/evals 
v0.0.0-20250110140605-d045751e4b4d h1:q8VMFOPZIJNnbFYRK7OWRWVvDrl3RtyCUYOg5/Fhlmg= -maragu.dev/evals v0.0.0-20250110140605-d045751e4b4d/go.mod h1:zEwfRDpHBpRcgkG9pBzoZlnETLYRt9Shj+cIAFNizW8= +maragu.dev/evals v0.0.0-20250114114008-6c73fea1551c h1:huPj1S5RhqgpbBAd3aCLfdVie3ZsU8Du7kepL2ZtDUQ= +maragu.dev/evals v0.0.0-20250114114008-6c73fea1551c/go.mod h1:+2Y3dYZ6oANM+cL88kFxaPD1H7rq3FXOrI3NOeNKaZ8= maragu.dev/is v0.2.0 h1:poeuVEA5GG3vrDpGmzo2KjWtIMZmqUyvGnOB0/pemig= maragu.dev/is v0.2.0/go.mod h1:bviaM5S0fBshCw7wuumFGTju/izopZ/Yvq4g7Klc7y8= maragu.dev/migrate v0.6.0 h1:gJLAIVaRh9z9sN55Q2sWwScpEH+JsT6N0L1DnzedXFE=