# report_example.py
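"""Generate per-example HTML reports for a benchmark from eval-arena pass@1 results.

Builds model-vs-model battle statistics, per-example tables, and a model-by-example
figure, then renders them through templates/template_example.html.
"""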
import numpy as np
import pandas as pd
import plotly.express as px
from jinja2 import Template

from arena import model_table, pass1_to_battle, example_table


def get_anchor(benchmark_id: str, example_id: str):
    """Return an HTML anchor linking example_id to its external viewer page,
    or the bare example_id if no viewer exists for this benchmark."""
    def get_link():
        if benchmark_id in ['humaneval', 'humaneval+', 'mbpp', 'mbpp+']:
            # expecting ids like HumanEval/93 or Mbpp/622
            group, num = example_id.split('/')
            return f'https://crux-eval.github.io/eval-arena/evalplus/{group}/{num}.html'
        elif benchmark_id in ['CRUXEval-input', 'CRUXEval-output']:
            num = example_id.replace(benchmark_id + '/', '')
            return f'https://crux-eval.github.io/demo.html?id={int(num) + 1}'
        else:
            return ''

    link = get_link()
    if link != '':
        return f'<a href="{link}">{example_id}</a>'
    else:
        return example_id


def fig_example_vs_model(result, all_stats, ex_table):
    """Plot a model-by-example grid of pass1, ordered by example accuracy and model strength."""
    df = result[['model', 'example_id', 'pass1']].merge(ex_table[['example_id', 'acc']], on='example_id')
    df = df.merge(all_stats[['model', 'pass1']], on='model', suffixes=['_ex', '_model'])
    df.sort_values(by=['acc', 'example_id', 'pass1_model', 'model'], inplace=True)
    fig = px.scatter(df, y='example_id', x='model', color='pass1_ex',
                     opacity=0.75,
                     color_continuous_scale=["red", "yellow", "green"],
                     hover_data=['acc', 'model', 'example_id'])
    fig.update_xaxes(autorange="reversed")
    fig.update_traces(marker={'symbol': 'square'})
    fig.update_layout(
        width=900, height=1200,
        xaxis=dict(side="top"),
    )
    return fig


def get_example_level_results(benchmark_id, result):
    """Compute per-example statistics and figures for one benchmark and return them as a dict of HTML fragments."""
    battles = pass1_to_battle(result)
    battles_no_ties = battles[battles["winner"].str.contains("model_")]
    all_stats = model_table(battles_no_ties, result)
    ex_table = example_table(result, all_stats)
    ex_table['example_link'] = ex_table['example_id'].apply(lambda x: get_anchor(benchmark_id, x))

    outputs = {}
    outputs['result table'] = all_stats.sort_values(by='elo', ascending=False).to_html(float_format='%10.3f')
    outputs['fig_min_elo_solve'] = px.histogram(ex_table, x='min_elo', marginal='rug', title='min ELO to solve').to_html(full_html=False)
    outputs['table_histogram_accs'] = px.histogram(ex_table, x='acc', marginal='rug', title='accuracy on examples').to_html(full_html=False)

    # examples that no model solves
    no_solve = ex_table[ex_table['num_solved'] == 0]
    outputs['list_no_solve'] = sorted(no_solve['example_link'].to_list())

    # examples solved by exactly one model; copy the slice to avoid chained-assignment warnings
    one_solve = ex_table[ex_table['num_solved'] == 1].copy()
    one_solve['model'] = one_solve['models'].apply(lambda x: x[0])
    one_solve = one_solve.sort_values(by='min_elo', ascending=False)
    one_solve = one_solve[['example_link', 'model', 'min_elo']]
    outputs['table_one_solve'] = one_solve.to_html(escape=False, float_format='%10.3f', index=False)

    # examples whose outcomes are least correlated with overall model strength (suspect items)
    list_suspect = ex_table.sort_values(by='tau', ascending=True).head(10)
    outputs['table_suspect'] = list_suspect[['example_link', 'acc', 'tau']].to_html(escape=False, float_format='%10.3f', index=False)
    print(benchmark_id, 'anti-correlated prop', np.mean(ex_table['tau'] <= 0))

    outputs['fig_example_vs_model'] = fig_example_vs_model(result, all_stats, ex_table)
    return outputs


def gen_example_report(benchmark_id: str, raw_results: pd.DataFrame, OUTPUT_PATH):
    """Render the example-level report for one benchmark into OUTPUT_PATH."""
    outputs = get_example_level_results(benchmark_id, raw_results)

    template_path = r"templates/template_example.html"
    output_path = rf"{OUTPUT_PATH}/ex_{benchmark_id}.html"
    with open(template_path, encoding="utf-8") as template_file:
        j2_template = Template(template_file.read())
    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.write(j2_template.render({'benchmark_id': benchmark_id, 'outputs': outputs}))

    # also write the standalone example-vs-model figure
    with open(f'{OUTPUT_PATH}/ex_v_model_{benchmark_id}.html', 'wt') as f:
        f.write(outputs['fig_example_vs_model'].to_html())
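

# --- Usage sketch (not part of the original module) ---
# A minimal, hypothetical example of driving gen_example_report. The CSV path,
# its column layout ('model', 'example_id', 'pass1'), and the output directory
# are assumptions inferred from how the functions above consume `raw_results`;
# adjust them to your own data.
if __name__ == '__main__':
    import os

    OUTPUT_PATH = 'reports'  # hypothetical output directory
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # One row per (model, example_id) with a pass1 score; file name is illustrative only.
    raw_results = pd.read_csv('data/humaneval_results.csv')
    gen_example_report('humaneval', raw_results, OUTPUT_PATH)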