Skip to content

Commit

Permalink
Write evals to evals.jsonl instead of test output (#30)
Browse files Browse the repository at this point in the history
Turns out parsing the test output, even with the `-json` flag, is quite
cumbersome!
  • Loading branch information
markuswustenberg authored Jan 14, 2025
1 parent 1345a9f commit 222b1eb
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 14 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ jobs:
if_no_artifact_found: warn

- name: Evaluate
run: go test -json -run TestEval ./... | evals | tee evals.txt >> $GITHUB_STEP_SUMMARY
run: |
go test -run TestEval ./...
evals | tee evals.txt >> $GITHUB_STEP_SUMMARY
- name: Upload evals.db
uses: actions/upload-artifact@v4
Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/.env*.local
/cover.out
evals.db
/evals.db
/evals.jsonl
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ cover:

.PHONY: evaluate
evaluate:
@go test -json -run TestEval ./... | evals
go test -run TestEval ./...
evals | glow

.PHONY: lint
lint:
Expand Down
65 changes: 57 additions & 8 deletions eval/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ package eval
import (
"encoding/json"
"os"
"path"
"path/filepath"
"strings"
"sync"
"testing"
"time"
)
Expand Down Expand Up @@ -72,36 +75,82 @@ func (e *E) Score(s Sample, scorer Scorer) Result {
}

// logLine is the record that Log builds for every logged eval and passes to mustJSON for encoding.
type logLine struct {
// Name is the name of the running test (taken from e.T.Name() in Log).
Name string
// Sample is the sample that was handed to Log.
Sample Sample
// Result is the result that was handed to Log.
Result Result
// Duration is the time elapsed since e.start at the moment Log was called.
Duration time.Duration
}

// Delimiters that Log places around the JSON-encoded logLine in the message it passes to e.T.Log.
const (
startDelimiter = "EVALRESULT🌜"
endDelimiter = "🌛EVALRESULT"
)
// evalsFileLock is held by Log while it removes, opens, and appends to the evals.jsonl file.
var evalsFileLock sync.Mutex
// evalsFileOnce makes Log remove a pre-existing evals.jsonl file at most once.
var evalsFileOnce sync.Once

// Log a [Sample] and [Result].
// Log a [Sample] and [Result] to evals.jsonl in the project root (the nearest directory at or above the working directory that contains go.mod).
// This effectively logs the eval name, sample, and result, along with timing information.
// Any error while removing, opening, or writing the file fails the test via e.T.Fatal.
// TODO include token information?
func (e *E) Log(s Sample, r Result) {
e.T.Helper()

// Name comes from the running test; Duration is the time elapsed since e.start.
l := logLine{
Name: e.T.Name(),
Sample: s,
Result: r,
Duration: time.Since(e.start),
}

e.T.Log(startDelimiter + mustJSON(l) + endDelimiter)
e.T.Logf("%+v", l)

// Hold the package-level lock until Log returns, so only one Log call at a time touches the file.
evalsFileLock.Lock()
defer evalsFileLock.Unlock()

// NOTE(review): the local variable below shadows the imported path package, and path.Join always
// joins with forward slashes, whereas findProjectRoot builds paths with filepath.Join. Consider
// filepath.Join here (the "path" import would then need to be dropped if nothing else uses it).
dir := findProjectRoot(e.T)
path := path.Join(dir, "evals.jsonl")

// Only the first call through evalsFileOnce removes an existing evals.jsonl; a missing file is not treated as an error.
evalsFileOnce.Do(func() {
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
e.T.Fatal(err)
}
})

// Open for appending, creating the file if it does not exist yet.
f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
if err != nil {
e.T.Fatal(err)
}
// The error from Close is deliberately ignored.
defer func() {
_ = f.Close()
}()

// Append the JSON-encoded entry to the file.
if _, err := f.Write(mustJSON(l)); err != nil {
e.T.Fatal(err)
}
}

// mustJSON encodes l with json.Marshal and appends a trailing newline to the encoded bytes.
// It panics if encoding fails.
func mustJSON(l logLine) string {
func mustJSON(l logLine) []byte {
b, err := json.Marshal(l)
if err != nil {
panic(err)
}
// Terminate the entry with a newline.
b = append(b, '\n')

return string(b)
return b
}

// findProjectRoot returns the closest directory that contains a go.mod file,
// starting at the current working directory and walking up through its
// parents.
//
// It fails the test immediately (t.Fatal) when the working directory cannot
// be determined, or when the top of the directory tree is reached without
// finding a go.mod file.
func findProjectRoot(t *testing.T) string {
	t.Helper()

	cwd, err := os.Getwd()
	if err != nil {
		t.Fatal(err)
	}

	for current := cwd; ; {
		_, statErr := os.Stat(filepath.Join(current, "go.mod"))
		if statErr == nil {
			return current
		}

		// filepath.Dir returns its argument unchanged once there is no
		// parent left to climb to, which means the search is exhausted.
		next := filepath.Dir(current)
		if next == current {
			t.Fatal("could not find go.mod file")
		}
		current = next
	}
}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ require (
github.com/openai/openai-go v0.1.0-alpha.45
google.golang.org/api v0.216.0
maragu.dev/env v0.2.0
maragu.dev/evals v0.0.0-20250110140605-d045751e4b4d
maragu.dev/evals v0.0.0-20250114114008-6c73fea1551c
maragu.dev/is v0.2.0
)

Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ maragu.dev/env v0.2.0 h1:nQKitDEB65ArZsh6E7vxzodOqY9bxEVFdBg+tskS1ys=
maragu.dev/env v0.2.0/go.mod h1:t5CCbaEnjCM5mewiAVVzTS4N+oXTus2+SRnzKQbQVME=
maragu.dev/errors v0.3.0 h1:huI+n+ddMfVgQFD+cEqIPaozUlfz3TkfgpkssNip5G0=
maragu.dev/errors v0.3.0/go.mod h1:cygLiyNnq4ofF3whYscilo2ecUADCaUQXwvwFrMOhmM=
maragu.dev/evals v0.0.0-20250110140605-d045751e4b4d h1:q8VMFOPZIJNnbFYRK7OWRWVvDrl3RtyCUYOg5/Fhlmg=
maragu.dev/evals v0.0.0-20250110140605-d045751e4b4d/go.mod h1:zEwfRDpHBpRcgkG9pBzoZlnETLYRt9Shj+cIAFNizW8=
maragu.dev/evals v0.0.0-20250114114008-6c73fea1551c h1:huPj1S5RhqgpbBAd3aCLfdVie3ZsU8Du7kepL2ZtDUQ=
maragu.dev/evals v0.0.0-20250114114008-6c73fea1551c/go.mod h1:+2Y3dYZ6oANM+cL88kFxaPD1H7rq3FXOrI3NOeNKaZ8=
maragu.dev/is v0.2.0 h1:poeuVEA5GG3vrDpGmzo2KjWtIMZmqUyvGnOB0/pemig=
maragu.dev/is v0.2.0/go.mod h1:bviaM5S0fBshCw7wuumFGTju/izopZ/Yvq4g7Klc7y8=
maragu.dev/migrate v0.6.0 h1:gJLAIVaRh9z9sN55Q2sWwScpEH+JsT6N0L1DnzedXFE=
Expand Down

0 comments on commit 222b1eb

Please sign in to comment.