diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
index e34d046a6..39ec3cb35 100644
--- a/fastchat/serve/monitor/monitor.py
+++ b/fastchat/serve/monitor/monitor.py
@@ -249,7 +249,41 @@ def get_full_table(arena_df, model_table_df, model_to_score):
 
 
 def arena_hard_process(leaderboard_table_file, filepath):
-    arena_hard = pd.read_csv(filepath)
+    with open(filepath, "rb") as f:  # the results file is now a pickle, not a CSV
+        combined_arena_hard = pickle.load(f)  # dict of "normal" + "style_control" results
+
+    arena_hard_normal = pd.DataFrame(combined_arena_hard["normal"])
+    arena_hard_style = pd.DataFrame(combined_arena_hard["style_control"])
+
+    normal_rankings = recompute_final_ranking(arena_hard_normal)
+    style_rankings = recompute_final_ranking(arena_hard_style)
+
+    arena_hard_normal.insert(
+        loc=0, column="Rank* (UB)", value=normal_rankings
+    )
+    arena_hard_style.insert(
+        loc=0, column="Rank (StyleCtrl)", value=style_rankings
+    )
+
+    arena_hard_normal["avg_tokens"] = arena_hard_normal["avg_tokens"].astype(int)
+
+    # merge the style-control rankings into the normal leaderboard
+    combined_df = pd.merge(
+        arena_hard_normal,
+        arena_hard_style[["model", "Rank (StyleCtrl)"]],
+        on="model",
+        how="left",
+    )
+
+    # move "Rank (StyleCtrl)" to position 1 (second column)
+    columns = list(combined_df.columns)
+    columns.insert(1, columns.pop(columns.index("Rank (StyleCtrl)")))
+    combined_df = combined_df[columns]
+
+    combined_df["Rank (StyleCtrl)"] = combined_df["Rank (StyleCtrl)"].astype(
+        int
+    )  # the merge upcasts the rankings to float; convert back to int
+
     leaderboard_table = pd.read_csv(leaderboard_table_file)
     links = leaderboard_table.get("Link")
     display_name = leaderboard_table.get("Model")
@@ -265,23 +299,21 @@ def arena_hard_process(leaderboard_table_file, filepath):
         info[model_name[i]] = model_info
 
     organization = []
-    for i in range(len(arena_hard)):
+    for i in range(len(combined_df)):
         assert (
-            arena_hard.loc[i, "model"] in info
-        ), f"need to update leaderboard_table info by adding {arena_hard.loc[i, 'model']}"
-        organization.append(info[arena_hard.loc[i, "model"]]["org"])
-        link = info[arena_hard.loc[i, "model"]]["link"]
-        arena_hard.loc[i, "model"] = model_hyperlink(
-            info[arena_hard.loc[i, "model"]]["display"], link
+            combined_df.loc[i, "model"] in info
+        ), f"need to update leaderboard_table info by adding {combined_df.loc[i, 'model']}"
+        organization.append(info[combined_df.loc[i, "model"]]["org"])
+        link = info[combined_df.loc[i, "model"]]["link"]
+        combined_df.loc[i, "model"] = model_hyperlink(
+            info[combined_df.loc[i, "model"]]["display"], link
         )
 
-    arena_hard.insert(
-        loc=len(arena_hard.columns), column="Organization", value=organization
+    combined_df.insert(
+        loc=len(combined_df.columns), column="Organization", value=organization
     )
-    rankings = recompute_final_ranking(arena_hard)
-    arena_hard.insert(loc=0, column="Rank* (UB)", value=rankings)
 
-    return arena_hard
+    return combined_df
 
 
 def get_arena_table(
@@ -398,6 +430,28 @@ def highlight_rank_max(s):
     return elo_dataframe.style.apply(highlight_rank_max, subset=["Delta"])
 
 
+# highlight the style-control rank cell in red (rank got worse)
+def highlight_red(s):
+    return ["color: red; font-weight: bold" for _ in s]
+
+
+# highlight the style-control rank cell in green (rank improved)
+def highlight_green(s):
+    return ["color: green; font-weight: bold" for _ in s]
+
+
+# decide whether the style-control rank should be green (+1), red (-1), or plain (0)
+def compare_func(row):
+    if row["Rank (StyleCtrl)"] is None:
+        return 0
+    if row["Rank (StyleCtrl)"] == row["Rank* (UB)"]:
(UB)"]: + return 0 + elif row["Rank (StyleCtrl)"] < row["Rank* (UB)"]: + return 1 + else: + return -1 + + def update_overall_leaderboard_df(arena_table_vals): columns = [ "Rank* (UB)", @@ -412,30 +466,38 @@ def update_overall_leaderboard_df(arena_table_vals): ] elo_dataframe = pd.DataFrame(arena_table_vals, columns=columns) - def highlight_red(s): - return [("color: red; font-weight: bold") for v in s] + comparison = elo_dataframe.apply( + compare_func, + axis=1, + ) + indices_red = [i for i, value in enumerate(comparison) if value == -1] + indices_green = [i for i, value in enumerate(comparison) if value == 1] - def highlight_green(s): - return [("color: green; font-weight: bold") for v in s] + return elo_dataframe.style.apply( + highlight_red, subset=pd.IndexSlice[indices_red, ["Rank (StyleCtrl)"]] + ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]]) - def compare_func(row): - if row["Rank (StyleCtrl)"] is None: - return 0 - if row["Rank (StyleCtrl)"] == row["Rank* (UB)"]: - return 0 - elif row["Rank (StyleCtrl)"] < row["Rank* (UB)"]: - return 1 - else: - return -1 - comparison = elo_dataframe.apply( +def update_hard_leaderboard_df(arena_table_vals): + columns = [ + "Rank* (UB)", + "Rank (StyleCtrl)", + "Model", + "Win-rate", + "95% CI", + "Average Tokens", + "Organization", + ] + dataframe = pd.DataFrame(arena_table_vals, columns=columns) + + comparison = dataframe.apply( compare_func, axis=1, ) indices_red = [i for i, value in enumerate(comparison) if value == -1] indices_green = [i for i, value in enumerate(comparison) if value == 1] - return elo_dataframe.style.apply( + return dataframe.style.apply( highlight_red, subset=pd.IndexSlice[indices_red, ["Rank (StyleCtrl)"]] ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]]) @@ -493,9 +555,9 @@ def update_leaderboard_and_plots(category, filters): arena_values = get_arena_table( arena_df, model_table_df, - arena_subset_df=arena_subset_df - if category != "Overall" - else arena_overall_sc_df, + arena_subset_df=( + arena_subset_df if category != "Overall" else arena_overall_sc_df + ), hidden_models=( None if len(filters) > 0 and "Show Deprecated" in filters @@ -1013,12 +1075,14 @@ def build_leaderboard_tab( dataFrame = arena_hard_process( leaderboard_table_file, arena_hard_leaderboard ) + date = dataFrame["date"][0] dataFrame = dataFrame.drop( columns=["rating_q025", "rating_q975", "date"] ) dataFrame["CI"] = dataFrame.CI.map(ast.literal_eval) - dataFrame["CI"] = dataFrame.CI.map(lambda x: f"+{x[1]}/-{x[0]}") + dataFrame["CI"] = dataFrame.CI.map(lambda x: f"+{x[1]}/{x[0]}") + dataFrame = dataFrame.rename( columns={ "model": "Model", @@ -1027,23 +1091,49 @@ def build_leaderboard_tab( "avg_tokens": "Average Tokens", } ) + dataFrame["Win-rate"] = dataFrame["Win-rate"].apply( + lambda x: f"{x:g}" if pd.notnull(x) else x + ) + model_to_score = {} for i in range(len(dataFrame)): - model_to_score[dataFrame.loc[i, "Model"]] = dataFrame.loc[ - i, "Win-rate" - ] + model_name = dataFrame.loc[i, "Model"] + win_rate = dataFrame.loc[i, "Win-rate"] + + model_to_score[model_name] = win_rate + md = arena_hard_title(date) gr.Markdown(md, elem_id="leaderboard_markdown") + + dataFrame = update_hard_leaderboard_df(dataFrame) gr.DataFrame( dataFrame, datatype=[ - "markdown" if col == "Model" else "str" - for col in dataFrame.columns + "number", + "number", + "markdown", + "str", + "str", + "number", + "str", ], elem_id="arena_hard_leaderboard", height=1000, wrap=True, - column_widths=[70, 190, 80, 
+                    column_widths=[70, 70, 190, 80, 80, 90, 150],
+                )
+                gr.Markdown(
+                    f"""
+***Rank (UB)**: model's ranking (upper bound), defined as one plus the number of models that are statistically better than the target model.
+Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (at a 95% confidence level).
+See Figure 1 below for a visualization of the confidence intervals of model scores.
+
+**Rank (StyleCtrl)**: model's ranking with style control, which accounts for factors like response length and markdown usage to decouple model performance from these potential confounding variables.
+See the [blog post](https://blog.lmarena.ai/blog/2024/style-control/) for further details.
+
+Note: in each category, we exclude models with fewer than 300 votes, as their confidence intervals can be large.
+""",
+                    elem_id="leaderboard_markdown",
                 )
 
         with gr.Tab("Full Leaderboard", id=4):
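
Reviewer notes. `arena_hard_process` now reads a pickled dict instead of a CSV. Below is a minimal sketch of the payload shape it appears to assume: the `"normal"` / `"style_control"` keys and the `model`, `avg_tokens`, `rating_q025`, `rating_q975`, `CI`, and `date` columns come from this patch, while the `score` name and all concrete values are illustrative assumptions (the `score` -> `Win-rate` rename sits outside the visible hunk context).

```python
# Hypothetical writer for the file arena_hard_process() consumes; the dict
# keys are from the patch, the column names/values are assumptions.
import pickle

import pandas as pd

row = {
    "model": "gpt-4o-2024-08-06",  # must exist in the leaderboard_table info dict
    "score": 79.2,                 # renamed to "Win-rate" downstream (assumed)
    "rating_q025": 77.1,           # dropped in build_leaderboard_tab
    "rating_q975": 81.3,
    "CI": "(1.9, 2.1)",            # parsed later with ast.literal_eval
    "avg_tokens": 576.0,           # cast to int inside arena_hard_process
    "date": "2025-01-01",          # read before the column is dropped
}
payload = {"normal": pd.DataFrame([row]), "style_control": pd.DataFrame([row])}

with open("arena_hard_leaderboard.pkl", "wb") as f:
    pickle.dump(payload, f)
```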
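The new footnote defines Rank* (UB) as one plus the number of models that are statistically better than the target model. `recompute_final_ranking` itself is not part of this diff, so the sketch below only illustrates that stated rule, under the assumption that the frame carries `rating_q025` / `rating_q975` bounds; it is not the actual implementation.

```python
# Illustration of the "Rank* (UB)" rule from the leaderboard note, not the
# real recompute_final_ranking(): a model's rank is 1 + the number of models
# whose lower CI bound exceeds its upper CI bound.
import pandas as pd


def upper_bound_ranking(df: pd.DataFrame) -> list:
    ranks = []
    for _, row in df.iterrows():
        # count models that are statistically better at 95% confidence
        better = int((df["rating_q025"] > row["rating_q975"]).sum())
        ranks.append(better + 1)
    return ranks


df = pd.DataFrame(
    {"rating_q025": [77.0, 70.0, 69.0], "rating_q975": [81.0, 74.0, 73.0]}
)
print(upper_bound_ranking(df))  # [1, 2, 2]: the last two models are tied
```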
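The highlighting added in `update_overall_leaderboard_df` / `update_hard_leaderboard_df` hinges on `compare_func` plus two chained `Styler.apply` calls over `pd.IndexSlice` subsets; here is a self-contained toy run of the same mechanics with made-up ranks. One point worth double-checking in review: a missing style-control rank usually arrives as NaN rather than None, so the `is None` guard in `compare_func` may never fire (though the earlier `astype(int)` would already raise on NaN).

```python
# Toy demo of the StyleCtrl highlighting mechanics; the data is made up.
import pandas as pd

df = pd.DataFrame(
    {
        "Rank* (UB)": [1, 2, 3],
        "Rank (StyleCtrl)": [1, 3, 2],
        "Model": ["model-a", "model-b", "model-c"],
    }
)


def compare_func(row):
    # +1 -> style control improves the rank (green); -1 -> worsens it (red)
    if row["Rank (StyleCtrl)"] is None:
        return 0
    if row["Rank (StyleCtrl)"] == row["Rank* (UB)"]:
        return 0
    return 1 if row["Rank (StyleCtrl)"] < row["Rank* (UB)"] else -1


comparison = df.apply(compare_func, axis=1)
indices_green = [i for i, v in enumerate(comparison) if v == 1]
indices_red = [i for i, v in enumerate(comparison) if v == -1]

styled = df.style.apply(
    lambda s: ["color: green; font-weight: bold"] * len(s),
    subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]],
).apply(
    lambda s: ["color: red; font-weight: bold"] * len(s),
    subset=pd.IndexSlice[indices_red, ["Rank (StyleCtrl)"]],
)
print(styled.to_html())  # model-c's rank cell is green, model-b's is red
```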