Konstantin Chernyshev committed on
Commit
aac20b5
·
1 Parent(s): 1589444

chore: update r1/o3-mini, fix buttons

Browse files
Files changed (3) hide show
  1. app.py +61 -59
  2. data/u_math_eval_results.json +56 -17
  3. src/populate.py +21 -9
app.py CHANGED
@@ -45,7 +45,7 @@ def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) ->
45
  ) -> tuple[pd.DataFrame, list[str], str, str]:
46
  always_here_cols = [c.pretty_name for c in columns_dict.values() if c.never_hidden]
47
  selected_columns = [
48
- c.pretty_name for c in columns_dict.values() if current_tag in c.tags and c not in always_here_cols
49
  ]
50
  # keep the order of the columns
51
  filtered_df = full_df[[c for c in full_df.columns if c in (always_here_cols + selected_columns)]]
@@ -66,7 +66,6 @@ def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) ->
66
  if filter_name == "All":
67
  return full_df[current_df.columns]
68
  else:
69
- # actually filter by emoji
70
  query_symbol = filter_name[0]
71
  filtered_df = full_df[full_df[columns_dict["model_type_symbol"].pretty_name] == query_symbol]
72
  return filtered_df[current_df.columns]
@@ -77,7 +76,6 @@ def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) ->
77
  if filter_name == "All":
78
  return full_df[current_df.columns]
79
  else:
80
- # actually filter by emoji
81
  query_symbol = filter_name[0]
82
  filtered_df = full_df[full_df[columns_dict["model_size_symbol"].pretty_name] == query_symbol]
83
  return filtered_df[current_df.columns]
@@ -92,62 +90,64 @@ def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) ->
92
  return filtered_df[current_df.columns]
93
 
94
  with gr.Column() as col:
95
- # Add the controls
96
- with gr.Accordion("➡️ See All Columns", open=False):
97
- columns_to_select_visibility = [
98
- c.pretty_name for c in columns_dict.values() if not c.fully_hidden and not c.never_hidden
99
- ]
100
- all_columns_selector = gr.CheckboxGroup(
101
- choices=columns_to_select_visibility,
102
- value=[
103
- c.pretty_name
104
- for c in columns_dict.values()
105
- if c.pretty_name in columns_to_select_visibility and c.displayed_by_default
106
- ],
107
- label="Select Columns to Display:",
108
- interactive=True,
109
- container=False,
110
- )
111
  with gr.Row():
112
- with gr.Column():
113
- search_bar = gr.Textbox(
114
- placeholder="🔍 Search for your model and press ENTER...",
115
- show_label=False,
116
- elem_id="search-bar",
117
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
- # collect all column tags and create buttons for them
120
- all_tags = {}
121
- with gr.Column(variant="panel"):
122
- gr.Markdown("Select Columns:")
123
- for c in columns_dict.values():
124
- for tag in c.tags:
125
- if tag not in all_tags:
126
- all_tags[tag] = gr.Button(tag, interactive=True, size="sm")
127
-
128
- model_type_filter_selector = gr.Radio(
129
- label="Filter model types:",
130
- choices=["All", "💙 Open-Weights", "🟥 Proprietary"],
131
- value="All",
132
- elem_id="model-type-filter",
133
- interactive=True,
134
- )
135
-
136
- model_size_filter_selector = gr.Radio(
137
- label="Filter model sizes:",
138
- choices=["All", "🛴 Tiny (<5B)", "🚗 Small (5-50B)", "🚚 Medium (50-100B)", "🚀 Large (>100B)"],
139
- value="All",
140
- elem_id="model-size-filter",
141
- interactive=True,
142
- )
143
-
144
- model_family_filter_selector = gr.Radio(
145
- label="Filter model families:",
146
- choices=["All"] + list(dataframe[columns_dict["model_family"].pretty_name].unique()),
147
- value="All",
148
- elem_id="model-family-filter",
149
- interactive=True,
150
- )
151
 
152
  # create the hidden and visible dataframes to display
153
  hidden_leaderboard_df = gr.components.Dataframe(
@@ -162,7 +162,8 @@ def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) ->
162
  elem_id="leaderboard-df",
163
  interactive=False,
164
  )
165
- # add the callbacks
 
166
  all_columns_selector.change(
167
  fn=filter_dataframe_by_selected_columns,
168
  inputs=[hidden_leaderboard_df, all_columns_selector],
@@ -188,6 +189,7 @@ def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) ->
188
  inputs=[hidden_leaderboard_df, leaderboard_df, model_family_filter_selector],
189
  outputs=[leaderboard_df],
190
  )
 
191
  for tag, button in all_tags.items():
192
  button.click(
193
  fn=filter_dataframe_by_selected_tag_columns,
@@ -195,7 +197,7 @@ def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) ->
195
  outputs=[leaderboard_df, all_columns_selector, model_type_filter_selector, model_size_filter_selector],
196
  )
197
 
198
- # reload the leaderboard on the first load
199
  filter_dataframe_by_selected_columns(dataframe, all_columns_selector.value)
200
  return col
201
 
 
45
  ) -> tuple[pd.DataFrame, list[str], str, str]:
46
  always_here_cols = [c.pretty_name for c in columns_dict.values() if c.never_hidden]
47
  selected_columns = [
48
+ c.pretty_name for c in columns_dict.values() if current_tag in c.tags and c.pretty_name not in always_here_cols
49
  ]
50
  # keep the order of the columns
51
  filtered_df = full_df[[c for c in full_df.columns if c in (always_here_cols + selected_columns)]]
 
66
  if filter_name == "All":
67
  return full_df[current_df.columns]
68
  else:
 
69
  query_symbol = filter_name[0]
70
  filtered_df = full_df[full_df[columns_dict["model_type_symbol"].pretty_name] == query_symbol]
71
  return filtered_df[current_df.columns]
 
76
  if filter_name == "All":
77
  return full_df[current_df.columns]
78
  else:
 
79
  query_symbol = filter_name[0]
80
  filtered_df = full_df[full_df[columns_dict["model_size_symbol"].pretty_name] == query_symbol]
81
  return filtered_df[current_df.columns]
 
90
  return filtered_df[current_df.columns]
91
 
92
  with gr.Column() as col:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  with gr.Row():
94
+ with gr.Column(scale=8):
95
+ with gr.Accordion("➡️ See All Columns", open=False):
96
+ columns_to_select_visibility = [
97
+ c.pretty_name for c in columns_dict.values() if not c.fully_hidden and not c.never_hidden
98
+ ]
99
+ all_columns_selector = gr.CheckboxGroup(
100
+ choices=columns_to_select_visibility,
101
+ value=[
102
+ c.pretty_name
103
+ for c in columns_dict.values()
104
+ if c.pretty_name in columns_to_select_visibility and c.displayed_by_default
105
+ ],
106
+ label="Select Columns to Display:",
107
+ interactive=True,
108
+ container=False,
109
+ )
110
+ with gr.Column(variant='panel'):
111
+ gr.Markdown("Visible Columns:", elem_id="visible-columns-label")
112
+ all_tags = {}
113
+ with gr.Row():
114
+ for c in columns_dict.values():
115
+ for tag in c.tags:
116
+ if tag not in all_tags:
117
+ all_tags[tag] = gr.Button(tag, interactive=True, size="sm", variant="secondary", min_width=100)
118
+ with gr.Column(scale=8):
119
+ with gr.Row():
120
+ search_bar = gr.Textbox(
121
+ placeholder="🔍 Search for your model and press ENTER...",
122
+ show_label=False,
123
+ elem_id="search-bar",
124
+ )
125
 
126
+ with gr.Row():
127
+ model_type_filter_selector = gr.Dropdown(
128
+ label="Filter model types:",
129
+ choices=["All", "💙 Open-Weights", "🟥 Proprietary"],
130
+ value="All",
131
+ elem_id="model-type-filter",
132
+ interactive=True,
133
+ multiselect=False,
134
+ )
135
+ model_size_filter_selector = gr.Dropdown(
136
+ label="Filter model sizes:",
137
+ choices=["All", "🛴 Tiny (<5B)", "🚗 Small (5-50B)", "🚚 Medium (50-100B)", "🚀 Large (>100B)"],
138
+ value="All",
139
+ elem_id="model-size-filter",
140
+ interactive=True,
141
+ multiselect=False,
142
+ )
143
+ model_family_filter_selector = gr.Dropdown(
144
+ label="Filter model families:",
145
+ choices=["All"] + list(dataframe[columns_dict["model_family"].pretty_name].unique()),
146
+ value="All",
147
+ elem_id="model-family-filter",
148
+ interactive=True,
149
+ multiselect=False,
150
+ )
 
 
 
 
 
 
 
151
 
152
  # create the hidden and visible dataframes to display
153
  hidden_leaderboard_df = gr.components.Dataframe(
 
162
  elem_id="leaderboard-df",
163
  interactive=False,
164
  )
165
+
166
+ # Add the callbacks
167
  all_columns_selector.change(
168
  fn=filter_dataframe_by_selected_columns,
169
  inputs=[hidden_leaderboard_df, all_columns_selector],
 
189
  inputs=[hidden_leaderboard_df, leaderboard_df, model_family_filter_selector],
190
  outputs=[leaderboard_df],
191
  )
192
+ # Wire up each visible-column button to filter by tag
193
  for tag, button in all_tags.items():
194
  button.click(
195
  fn=filter_dataframe_by_selected_tag_columns,
 
197
  outputs=[leaderboard_df, all_columns_selector, model_type_filter_selector, model_size_filter_selector],
198
  )
199
 
200
+ # On first load, show the default columns
201
  filter_dataframe_by_selected_columns(dataframe, all_columns_selector.value)
202
  return col
203
 
data/u_math_eval_results.json CHANGED
@@ -1403,42 +1403,81 @@
1403
  0.5
1404
  ]
1405
  },
1406
- {
1407
  "model_name": "deepseek-ai/DeepSeek-R1",
1408
  "judge_model_name": "gpt-4o-2024-08-06",
1409
  "u_math": [
1410
- 63.2727,
1411
- 73.6667,
1412
- 16.5
1413
  ],
1414
  "algebra": [
1415
- 0.8,
1416
- 0.96,
1417
  0.0
1418
  ],
1419
  "differential_calc": [
1420
- 0.4818,
1421
- 0.6667,
1422
- 0.0857
1423
  ],
1424
  "integral_calc": [
1425
- 0.3606,
1426
- 0.4,
1427
- 0.2586
1428
  ],
1429
  "multivariable_calculus": [
1430
- 0.6124,
1431
- 0.6667,
1432
  0.3214
1433
  ],
1434
  "precalculus_review": [
1435
- 0.9188,
1436
- 0.9733,
1437
  0.1
1438
  ],
1439
  "sequences_series": [
1440
- 0.7468,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1441
  0.7533,
 
 
 
 
 
 
 
 
 
 
1442
  0.5
1443
  ]
1444
  },
 
1403
  0.5
1404
  ]
1405
  },
1406
+ {
1407
  "model_name": "deepseek-ai/DeepSeek-R1",
1408
  "judge_model_name": "gpt-4o-2024-08-06",
1409
  "u_math": [
1410
+ 68.3636,
1411
+ 79.0,
1412
+ 20.5
1413
  ],
1414
  "algebra": [
1415
+ 0.8056,
1416
+ 0.9667,
1417
  0.0
1418
  ],
1419
  "differential_calc": [
1420
+ 0.5136,
1421
+ 0.7067,
1422
+ 0.1
1423
  ],
1424
  "integral_calc": [
1425
+ 0.4471,
1426
+ 0.48,
1427
+ 0.3621
1428
  ],
1429
  "multivariable_calculus": [
1430
+ 0.6966,
1431
+ 0.7667,
1432
  0.3214
1433
  ],
1434
  "precalculus_review": [
1435
+ 0.925,
1436
+ 0.98,
1437
  0.1
1438
  ],
1439
  "sequences_series": [
1440
+ 0.8377,
1441
+ 0.84,
1442
+ 0.75
1443
+ ]
1444
+ },
1445
+ {
1446
+ "model_name": "o3-mini",
1447
+ "judge_model_name": "gpt-4o-2024-08-06",
1448
+ "u_math": [
1449
+ 68.9091,
1450
+ 79.6667,
1451
+ 20.5
1452
+ ],
1453
+ "algebra": [
1454
+ 0.7944,
1455
+ 0.94,
1456
+ 0.0667
1457
+ ],
1458
+ "differential_calc": [
1459
+ 0.5455,
1460
+ 0.7667,
1461
+ 0.0714
1462
+ ],
1463
+ "integral_calc": [
1464
+ 0.4904,
1465
+ 0.5267,
1466
+ 0.3966
1467
+ ],
1468
+ "multivariable_calculus": [
1469
+ 0.6854,
1470
  0.7533,
1471
+ 0.3214
1472
+ ],
1473
+ "precalculus_review": [
1474
+ 0.9,
1475
+ 0.96,
1476
+ 0.0
1477
+ ],
1478
+ "sequences_series": [
1479
+ 0.8247,
1480
+ 0.8333,
1481
  0.5
1482
  ]
1483
  },
src/populate.py CHANGED
@@ -11,19 +11,31 @@ UNKNOWN_MODEL_SHOW_SIZE = 150
11
  PERCENT_ROUND_DIGITS = 1
12
 
13
 
 
 
 
 
14
  def get_hf_model_info_card_or_none(model_name: str) -> ModelInfo | None:
 
 
15
  try:
16
  info = model_info(repo_id=model_name)
 
17
  return info
18
  except Exception:
 
19
  return None
20
 
21
 
22
  def get_hf_hub_config_or_none(model_name: str) -> AutoConfig | None:
 
 
23
  try:
24
  config = AutoConfig.from_pretrained(model_name, revision="main", trust_remote_code=True)
 
25
  return config
26
  except Exception:
 
27
  return None
28
 
29
 
@@ -169,9 +181,9 @@ U_MATH_COLUMNS_DICT = {
169
  "rank": Field("Rank", "number", never_hidden=True),
170
  **MODEL_COLUMNS_DICT,
171
  "judge_model_name": Field("Judge Model Name", "markdown", displayed_by_default=False),
172
- "u_math_acc": Field("U-MATH Acc", "rate", never_hidden=True, tags=["u_math"]),
173
- "u_math_text_acc": Field("U-MATH Text Acc", "rate", tags=["u_math", "text"]),
174
- "u_math_visual_acc": Field("U-MATH Visual Acc", "rate", tags=["u_math", "visual"]),
175
  "differential_calc_acc": Field("Diff Calc Acc", "rate", displayed_by_default=False, tags=["subjects"]),
176
  "differential_calc_text_acc": Field("Diff Calc Text Acc", "rate", displayed_by_default=False, tags=["text"]),
177
  "differential_calc_visual_acc": Field(
@@ -208,11 +220,11 @@ MU_MATH_COLUMNS_DICT = {
208
  "rank": Field("Rank", "number", never_hidden=True),
209
  **MODEL_COLUMNS_DICT,
210
  "extract_model_name": Field("Extract Model Name", "markdown", displayed_by_default=False),
211
- "mu_math_f1": Field("μ-MATH F1", "rate", never_hidden=True, tags=["mu_math", "splits"]),
212
- "mu_math_tpr": Field("μ-MATH TPR", "rate", displayed_by_default=False, tags=["mu_math"]),
213
- "mu_math_tnr": Field("μ-MATH TNR", "rate", displayed_by_default=False, tags=["mu_math"]),
214
- "mu_math_ppv": Field("μ-MATH PPV", "rate", displayed_by_default=False, tags=["mu_math"]),
215
- "mu_math_npv": Field("μ-MATH NPV", "rate", displayed_by_default=False, tags=["mu_math"]),
216
  "GPT-4o_f1": Field("GPT-4o Subset F1", "rate", tags=["splits"]),
217
  "GPT-4o_tpr": Field("GPT-4o Subset TPR", "rate", displayed_by_default=False),
218
  "GPT-4o_tnr": Field("GPT-4o Subset TNR", "rate", displayed_by_default=False),
@@ -243,7 +255,7 @@ U_MATH_AND_MU_MATH_COLUMNS_DICT = {
243
  "u_math_visual_acc": Field("U-MATH Visual Acc", "rate", displayed_by_default=False, tags=["u_math"]),
244
  "judge_model_name": Field("Judge Model Name", "markdown", displayed_by_default=False),
245
  "extract_model_name": Field("Extract Model Name", "markdown", displayed_by_default=False),
246
- "mu_math_f1": Field("μ-MATH F1", "rate", tags=["main", "u_math", "mu_math"]),
247
  "mu_math_tpr": Field("μ-MATH TPR", "rate", displayed_by_default=False, tags=["mu_math"]),
248
  "mu_math_tnr": Field("μ-MATH TNR", "rate", displayed_by_default=False, tags=["mu_math"]),
249
  "mu_math_ppv": Field("μ-MATH PPV", "rate", displayed_by_default=False, tags=["mu_math"]),
 
11
  PERCENT_ROUND_DIGITS = 1
12
 
13
 
14
+ MODEL_CONFIG_CACHE = {}
15
+ MODEL_CARD_CACHE = {}
16
+
17
+
18
  def get_hf_model_info_card_or_none(model_name: str) -> ModelInfo | None:
19
+ if model_name in MODEL_CARD_CACHE:
20
+ return MODEL_CARD_CACHE[model_name]
21
  try:
22
  info = model_info(repo_id=model_name)
23
+ MODEL_CARD_CACHE[model_name] = info
24
  return info
25
  except Exception:
26
+ MODEL_CARD_CACHE[model_name] = None
27
  return None
28
 
29
 
30
  def get_hf_hub_config_or_none(model_name: str) -> AutoConfig | None:
31
+ if model_name in MODEL_CONFIG_CACHE:
32
+ return MODEL_CONFIG_CACHE[model_name]
33
  try:
34
  config = AutoConfig.from_pretrained(model_name, revision="main", trust_remote_code=True)
35
+ MODEL_CONFIG_CACHE[model_name] = config
36
  return config
37
  except Exception:
38
+ MODEL_CONFIG_CACHE[model_name] = None
39
  return None
40
 
41
 
 
181
  "rank": Field("Rank", "number", never_hidden=True),
182
  **MODEL_COLUMNS_DICT,
183
  "judge_model_name": Field("Judge Model Name", "markdown", displayed_by_default=False),
184
+ "u_math_acc": Field("U-MATH Acc", "rate", never_hidden=True, tags=["default"]),
185
+ "u_math_text_acc": Field("U-MATH Text Acc", "rate", tags=["default", "text"]),
186
+ "u_math_visual_acc": Field("U-MATH Visual Acc", "rate", tags=["default", "visual"]),
187
  "differential_calc_acc": Field("Diff Calc Acc", "rate", displayed_by_default=False, tags=["subjects"]),
188
  "differential_calc_text_acc": Field("Diff Calc Text Acc", "rate", displayed_by_default=False, tags=["text"]),
189
  "differential_calc_visual_acc": Field(
 
220
  "rank": Field("Rank", "number", never_hidden=True),
221
  **MODEL_COLUMNS_DICT,
222
  "extract_model_name": Field("Extract Model Name", "markdown", displayed_by_default=False),
223
+ "mu_math_f1": Field("μ-MATH F1", "rate", never_hidden=True, tags=["default", "splits"]),
224
+ "mu_math_tpr": Field("μ-MATH TPR", "rate", displayed_by_default=False, tags=["default"]),
225
+ "mu_math_tnr": Field("μ-MATH TNR", "rate", displayed_by_default=False, tags=["default"]),
226
+ "mu_math_ppv": Field("μ-MATH PPV", "rate", displayed_by_default=False, tags=["default"]),
227
+ "mu_math_npv": Field("μ-MATH NPV", "rate", displayed_by_default=False, tags=["default"]),
228
  "GPT-4o_f1": Field("GPT-4o Subset F1", "rate", tags=["splits"]),
229
  "GPT-4o_tpr": Field("GPT-4o Subset TPR", "rate", displayed_by_default=False),
230
  "GPT-4o_tnr": Field("GPT-4o Subset TNR", "rate", displayed_by_default=False),
 
255
  "u_math_visual_acc": Field("U-MATH Visual Acc", "rate", displayed_by_default=False, tags=["u_math"]),
256
  "judge_model_name": Field("Judge Model Name", "markdown", displayed_by_default=False),
257
  "extract_model_name": Field("Extract Model Name", "markdown", displayed_by_default=False),
258
+ "mu_math_f1": Field("μ-MATH F1", "rate", tags=["main", "u_math", "default"]),
259
  "mu_math_tpr": Field("μ-MATH TPR", "rate", displayed_by_default=False, tags=["mu_math"]),
260
  "mu_math_tnr": Field("μ-MATH TNR", "rate", displayed_by_default=False, tags=["mu_math"]),
261
  "mu_math_ppv": Field("μ-MATH PPV", "rate", displayed_by_default=False, tags=["mu_math"]),