Spaces:
Running
Running
update with gemini3 results
#14
by
JessicaOjo
- opened
data/community_results/New Results - June2025.csv
CHANGED
|
@@ -78,3 +78,10 @@ GPT-5 (Aug),afrimmlu,1.0,90.6,85.8,82.8,84.6,82.0,82.6,82.0,86.4,83.0,85.2,90.0,
|
|
| 78 |
GPT-5 (Aug),injongointent,4.0,88.3,90.9,95.8,89.2,79.8,87.3,87.8,83.8,94.7,77.2,93.3,89.4,80.6,87.5,85.0,87.3
|
| 79 |
GPT-5 (Aug),sib,3.0,91.2,89.2,90.2,90.2,89.7,88.2,85.8,88.2,88.7,90.2,91.2,90.7,79.9,87.3,89.7,88.5
|
| 80 |
GPT-5 (Aug),belebele,5.0,89.2,90.3,82.3,78.3,86.9,81.2,77.6,81.3,85.6,86.4,94.6,87.3,69.1,80.4,85.3,83.3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
GPT-5 (Aug),injongointent,4.0,88.3,90.9,95.8,89.2,79.8,87.3,87.8,83.8,94.7,77.2,93.3,89.4,80.6,87.5,85.0,87.3
|
| 79 |
GPT-5 (Aug),sib,3.0,91.2,89.2,90.2,90.2,89.7,88.2,85.8,88.2,88.7,90.2,91.2,90.7,79.9,87.3,89.7,88.5
|
| 80 |
GPT-5 (Aug),belebele,5.0,89.2,90.3,82.3,78.3,86.9,81.2,77.6,81.3,85.6,86.4,94.6,87.3,69.1,80.4,85.3,83.3
|
| 81 |
+
Gemini 3 Pro,afrixnli,3.0,90.8,84.3,81.8,83.8,78.3,31.0,81.3,84.0,83.2,80.2,80.3,84.3,71.8,78.2,81.2,77.4
|
| 82 |
+
Gemini 3 Pro,afrimgsm,2.0,84.4,82.8,75.2,80.8,71.6,63.6,66.4,83.6,76.4,68.4,88.0,66.4,56.4,82.4,66.0,73.4
|
| 83 |
+
Gemini 3 Pro,flores - en_xx,3.0,69.8,42.1,50.8,44.1,52.6,49.8,44.0,45.4,48.3,53.5,61.5,53.0,32.9,28.6,56.1,47.3
|
| 84 |
+
Gemini 3 Pro,afrimmlu,1.0,88.4,89.6,82.8,88.4,83.6,85.0,85.6,88.0,86.8,82.8,89.0,89.4,77.0,88.0,88.8,86.1
|
| 85 |
+
Gemini 3 Pro,injongointent,4.0,88.1,92.0,95.9,92.2,81.1,91.9,87.7,86.7,94.5,78.0,91.4,89.1,87.5,90.5,85.3,88.8
|
| 86 |
+
Gemini 3 Pro,sib,3.0,87.3,87.7,88.2,87.7,89.7,88.7,85.3,89.2,87.7,89.2,89.2,87.7,84.8,86.8,88.2,87.9
|
| 87 |
+
Gemini 3 Pro,belebele,5.0,76.3,76.8,54.3,63.1,75.4,71.3,74.7,68.9,73.8,61.9,74.9,78.2,72.1,73.3,77.6,71.2
|
data/leaderboard_json/afrobench_lite.json
CHANGED
|
@@ -23,7 +23,8 @@
|
|
| 23 |
"Claude 4.5 Sonnet": 69.9,
|
| 24 |
"Gemini-2.5 Flash": 69.3,
|
| 25 |
"Gemini-2.5 Pro": 72.5,
|
| 26 |
-
"GPT-5 (Aug)": 83.3
|
|
|
|
| 27 |
}
|
| 28 |
},
|
| 29 |
"Intent": {
|
|
@@ -50,7 +51,8 @@
|
|
| 50 |
"Claude 4.5 Sonnet": 79.3,
|
| 51 |
"Gemini-2.5 Flash": 87.4,
|
| 52 |
"Gemini-2.5 Pro": 88.0,
|
| 53 |
-
"GPT-5 (Aug)": 87.3
|
|
|
|
| 54 |
}
|
| 55 |
},
|
| 56 |
"MT(en/fr-xx)": {
|
|
@@ -77,7 +79,8 @@
|
|
| 77 |
"Claude 4.5 Sonnet": 45.2,
|
| 78 |
"Gemini-2.5 Flash": 45.3,
|
| 79 |
"Gemini-2.5 Pro": 46.3,
|
| 80 |
-
"GPT-5 (Aug)": 44.8
|
|
|
|
| 81 |
}
|
| 82 |
},
|
| 83 |
"MMLU": {
|
|
@@ -104,7 +107,8 @@
|
|
| 104 |
"Claude 4.5 Sonnet": 74.0,
|
| 105 |
"Gemini-2.5 Flash": 67.3,
|
| 106 |
"Gemini-2.5 Pro": 77.4,
|
| 107 |
-
"GPT-5 (Aug)": 83.3
|
|
|
|
| 108 |
}
|
| 109 |
},
|
| 110 |
"Math": {
|
|
@@ -131,7 +135,8 @@
|
|
| 131 |
"Claude 4.5 Sonnet": 69.7,
|
| 132 |
"Gemini-2.5 Flash": 69.3,
|
| 133 |
"Gemini-2.5 Pro": 73.2,
|
| 134 |
-
"GPT-5 (Aug)": 73.7
|
|
|
|
| 135 |
}
|
| 136 |
},
|
| 137 |
"Topic": {
|
|
@@ -158,7 +163,8 @@
|
|
| 158 |
"Claude 4.5 Sonnet": 83.3,
|
| 159 |
"Gemini-2.5 Flash": 86.8,
|
| 160 |
"Gemini-2.5 Pro": 87.9,
|
| 161 |
-
"GPT-5 (Aug)": 88.5
|
|
|
|
| 162 |
}
|
| 163 |
},
|
| 164 |
"RC": {
|
|
@@ -185,7 +191,8 @@
|
|
| 185 |
"Claude 4.5 Sonnet": 72.8,
|
| 186 |
"Gemini-2.5 Flash": 41.6,
|
| 187 |
"Gemini-2.5 Pro": 76.4,
|
| 188 |
-
"GPT-5 (Aug)": 83.3
|
|
|
|
| 189 |
}
|
| 190 |
}
|
| 191 |
}
|
|
|
|
| 23 |
"Claude 4.5 Sonnet": 69.9,
|
| 24 |
"Gemini-2.5 Flash": 69.3,
|
| 25 |
"Gemini-2.5 Pro": 72.5,
|
| 26 |
+
"GPT-5 (Aug)": 83.3,
|
| 27 |
+
"Gemini 3 Pro": 77.4
|
| 28 |
}
|
| 29 |
},
|
| 30 |
"Intent": {
|
|
|
|
| 51 |
"Claude 4.5 Sonnet": 79.3,
|
| 52 |
"Gemini-2.5 Flash": 87.4,
|
| 53 |
"Gemini-2.5 Pro": 88.0,
|
| 54 |
+
"GPT-5 (Aug)": 87.3,
|
| 55 |
+
"Gemini 3 Pro": 88.8
|
| 56 |
}
|
| 57 |
},
|
| 58 |
"MT(en/fr-xx)": {
|
|
|
|
| 79 |
"Claude 4.5 Sonnet": 45.2,
|
| 80 |
"Gemini-2.5 Flash": 45.3,
|
| 81 |
"Gemini-2.5 Pro": 46.3,
|
| 82 |
+
"GPT-5 (Aug)": 44.8,
|
| 83 |
+
"Gemini 3 Pro": 47.3
|
| 84 |
}
|
| 85 |
},
|
| 86 |
"MMLU": {
|
|
|
|
| 107 |
"Claude 4.5 Sonnet": 74.0,
|
| 108 |
"Gemini-2.5 Flash": 67.3,
|
| 109 |
"Gemini-2.5 Pro": 77.4,
|
| 110 |
+
"GPT-5 (Aug)": 83.3,
|
| 111 |
+
"Gemini 3 Pro": 86.1
|
| 112 |
}
|
| 113 |
},
|
| 114 |
"Math": {
|
|
|
|
| 135 |
"Claude 4.5 Sonnet": 69.7,
|
| 136 |
"Gemini-2.5 Flash": 69.3,
|
| 137 |
"Gemini-2.5 Pro": 73.2,
|
| 138 |
+
"GPT-5 (Aug)": 73.7,
|
| 139 |
+
"Gemini 3 Pro": 73.4
|
| 140 |
}
|
| 141 |
},
|
| 142 |
"Topic": {
|
|
|
|
| 163 |
"Claude 4.5 Sonnet": 83.3,
|
| 164 |
"Gemini-2.5 Flash": 86.8,
|
| 165 |
"Gemini-2.5 Pro": 87.9,
|
| 166 |
+
"GPT-5 (Aug)": 88.5,
|
| 167 |
+
"Gemini 3 Pro": 87.9
|
| 168 |
}
|
| 169 |
},
|
| 170 |
"RC": {
|
|
|
|
| 191 |
"Claude 4.5 Sonnet": 72.8,
|
| 192 |
"Gemini-2.5 Flash": 41.6,
|
| 193 |
"Gemini-2.5 Pro": 76.4,
|
| 194 |
+
"GPT-5 (Aug)": 83.3,
|
| 195 |
+
"Gemini 3 Pro": 71.2
|
| 196 |
}
|
| 197 |
}
|
| 198 |
}
|
data/leaderboard_json/lite_language_scores.json
CHANGED
|
@@ -366,5 +366,21 @@
|
|
| 366 |
"wol": 63.4,
|
| 367 |
"yor": 75.4,
|
| 368 |
"zul": 80.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
}
|
| 370 |
}
|
|
|
|
| 366 |
"wol": 63.4,
|
| 367 |
"yor": 75.4,
|
| 368 |
"zul": 80.0
|
| 369 |
+
},
|
| 370 |
+
"Gemini 3 Pro": {
|
| 371 |
+
"amh": 79.3,
|
| 372 |
+
"hau": 75.6,
|
| 373 |
+
"ibo": 77.2,
|
| 374 |
+
"kin": 76.0,
|
| 375 |
+
"lin": 68.8,
|
| 376 |
+
"lug": 75.0,
|
| 377 |
+
"orm": 78.0,
|
| 378 |
+
"sna": 78.7,
|
| 379 |
+
"sot": 73.4,
|
| 380 |
+
"swa": 82.0,
|
| 381 |
+
"xho": 78.3,
|
| 382 |
+
"wol": 68.9,
|
| 383 |
+
"yor": 75.4,
|
| 384 |
+
"zul": 77.6
|
| 385 |
}
|
| 386 |
}
|