-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_eval.sh
executable file
·105 lines (81 loc) · 3.6 KB
/
run_eval.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env bash
#
# Runs the ghostwriter binary against every scenario found under evaluations/
# for each configured test case, and collects the artifacts plus a Markdown
# summary in a timestamped directory under evaluation_results/.

# Timestamp identifying this run; it also names the output directory.
datetime=$(date +%F_%H-%M-%S)
echo "Setting up eval $datetime"

# Everything produced by this run is written below this directory.
outdir_base="evaluation_results/$datetime"
mkdir -p "$outdir_base"
results="$outdir_base/results.md"

# Collect scenario names, one per subdirectory of evaluations/.
# A glob is used instead of parsing `ls` output so that names containing
# whitespace or glob characters stay intact and regular files that happen to
# sit in evaluations/ are not mistaken for scenarios.
scenarios=()
for scenario_dir in evaluations/*/; do
  # When nothing matches, the pattern is left as a literal string; skip it.
  [[ -d "$scenario_dir" ]] || continue
  scenario_dir=${scenario_dir%/}
  scenarios+=("${scenario_dir##*/}")
done
unset scenario_dir
if (( ${#scenarios[@]} == 0 )); then
  echo "Warning: no scenario directories found under evaluations/" >&2
fi

# Number of times each scenario/test-case combination is executed.
attempt_count=3
# Test cases: maps a test-case name (used as the results heading and as an
# output directory component) to the extra command-line flags handed to
# ghostwriter. Each model appears twice: plain ("_no_seg") and with
# --apply-segmentation ("_with_seg").
# Keys must be unique: assigning to an existing key silently replaces the
# earlier entry, which would drop that test case from the run.
declare -A test_case_params
test_case_params["claude_sonnet_latest_no_seg"]="--model claude-3-5-sonnet-latest"
test_case_params["claude_sonnet_latest_with_seg"]="--apply-segmentation --model claude-3-5-sonnet-latest"
test_case_params["gpt-4o-mini_no_seg"]="--model gpt-4o-mini"
test_case_params["gpt-4o-mini_with_seg"]="--apply-segmentation --model gpt-4o-mini"
test_case_params["gpt-4o_no_seg"]="--model gpt-4o"
test_case_params["gpt-4o_with_seg"]="--apply-segmentation --model gpt-4o"
test_case_params["gemini-2-flash_no_seg"]="--model gemini-2.0-flash-exp"
test_case_params["gemini-2-flash_with_seg"]="--apply-segmentation --model gemini-2.0-flash-exp"
# Disabled test cases, kept for reference:
# test_case_params["gemini-1206-flash_no_seg"]="--model gemini-exp-1206"
# test_case_params["gemini-1206-flash_with_seg"]="--apply-segmentation --model gemini-exp-1206"
test_case_params["gemini-1.5-pro_no_seg"]="--model gemini-1.5-pro"
test_case_params["gemini-1.5-pro_with_seg"]="--apply-segmentation --model gemini-1.5-pro"
# Start the results file (truncating any previous content) with a title
# line for this run followed by a blank line.
{
  echo "# Ghostwriter evaluation results $datetime"
  echo ""
} > "$results"

# Work out how many individual runs this evaluation will perform.
scenario_count=${#scenarios[@]}
test_case_count=${#test_case_params[@]}
total_tests=$((scenario_count * test_case_count * attempt_count))

# Report the plan once, to both the console and the results file.
echo "There are $scenario_count scenarios and $test_case_count test cases with $attempt_count attempts ($total_tests total tests)." \
  | tee -a "$results"
# Run every scenario against every test case, attempt_count times each, and
# append the outcome of each run to the results file.
for scenario in "${scenarios[@]}"; do
  echo "Running scenario $scenario"
  {
    echo "## Test: $scenario"
    echo ""
  } >> "$results"

  # Iterate over the test-case names (the keys of test_case_params).
  for case_name in "${!test_case_params[@]}"; do
    params=${test_case_params[$case_name]}
    # Split the flag string into separate arguments explicitly, so the
    # command line does not depend on unquoted expansion (which would also
    # subject the flags to glob expansion).
    read -r -a param_args <<< "$params"

    # Each test case gets its own sub-heading in the results file.
    echo "### $case_name" >> "$results"

    for ((attempt = 1; attempt <= attempt_count; attempt++)); do
      # Every attempt writes into its own directory.
      outdir="$outdir_base/$scenario/$case_name/$attempt"
      mkdir -p "$outdir"

      # Run the test case against the scenario's input image.
      echo "Running scenario $scenario with params $params attempt $attempt"
      ./target/release/ghostwriter \
        --input-png "evaluations/$scenario/input.png" \
        --save-screenshot "$outdir/input.png" \
        --model-output-file "$outdir/result.json" \
        --output-file "$outdir/result.out" \
        --save-bitmap "$outdir/result.png" \
        --no-draw \
        --no-draw-progress \
        --no-loop \
        --no-trigger \
        "${param_args[@]}"

      # Create a merged image with the new part in red: white in the result
      # bitmap is made transparent, the rest is colorized red, and the
      # result is composited over the original input.
      if [[ -f "$outdir/result.png" ]]; then
        convert \
          \( "evaluations/$scenario/input.png" -colorspace RGB \) \
          \( "$outdir/result.png" -type truecolormatte -transparent white -fill red -colorize 100 \) \
          -compose Over \
          -composite \
          "$outdir/merged-output.png"
      fi

      if [[ -f "$outdir/merged-output.png" ]]; then
        # No trailing newline, so the images of successive attempts end up
        # on the same line of the Markdown file.
        printf '%s' "<img src='../../$outdir/merged-output.png' border=1 width=200 />" >> "$results"
      else
        # No merged image is available: show the unmodified input image
        # followed by the textual output in a fenced block.
        {
          echo "<img src='../../evaluations/$scenario/input.png' border=1 width=200 />"
          echo ""
          echo '```'
          cat "$outdir/result.out"
          echo ""
          echo '```'
        } >> "$results"
      fi

      echo "Sleeping for 10 seconds to avoid rate limiting"
      sleep 10
    done

    # Terminate the line of images written for this test case.
    echo "" >> "$results"
  done
done