diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..14590d7adf3c46085d5573b07072eec6c664e12a --- /dev/null +++ b/.gitignore @@ -0,0 +1,123 @@ +# Environment variables and secrets +.env +.env.local +.env.*.local +*.env + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Virtual environments +venv/ +venv311/ +env/ +ENV/ +env.bak/ +venv.bak/ +.venv/ + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + + + +# OS generated files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Logs +*.log +logs/ +log/ + +# Temporary files +tmp/ +temp/ +.tmp/ + + + +# Cache directories +.cache/ +cache/ +__pycache__/ + + +# API keys and sensitive files +config/secrets.yaml +config/api_keys.yaml +*.key +*.pem +*.p12 +*.pfx + +# Local configuration overrides +config/local_config.yaml +config/dev_config.yaml + +# Backup files +*.bak +*.backup +*.old + +# Dependencies +node_modules/ + +# Compiled files +*.com +*.class +*.dll +*.exe +*.o +*.so + +# Package files +*.7z +*.dmg +*.gz +*.iso +*.jar +*.rar +*.tar +*.zip + +# Database files +*.db +*.sqlite +*.sqlite3 + +# Editor files +.sublime-* +*.sublime-workspace +*.sublime-project diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..b5685772804c8af4235a8504dc6752bfc9ae5d1d --- /dev/null +++ b/Makefile @@ -0,0 +1,13 @@ +.PHONY: style format + + +style: + python -m black --line-length 119 . + python -m isort . + ruff check --fix . + + +quality: + python -m black --check --line-length 119 . + python -m isort --check-only . + ruff check . diff --git a/README.md b/README.md index 1d5c062e120a5b374a1923a202fa419efd737f63..687b6b7892426186a48c3865734abcebe725f22a 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,21 @@ --- title: Mezura -emoji: 👀 -colorFrom: red -colorTo: green +emoji: 🥇 +colorFrom: green +colorTo: indigo sdk: gradio -sdk_version: 5.35.0 +sdk_version: 5.29.0 app_file: app.py pinned: false license: apache-2.0 + +# OAuth configuration +hf_oauth: true +hf_oauth_client_id: "${OAUTH_CLIENT_ID}" +hf_oauth_client_secret: "${OAUTH_CLIENT_SECRET}" +hf_oauth_expiration_minutes: 30 +hf_oauth_scopes: + - email --- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/api/__init__.py b/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..04c7820654da4f86cd7f9d18531a78725b84f496 --- /dev/null +++ b/api/__init__.py @@ -0,0 +1,6 @@ +""" +API package for Mezura evaluation system. +""" +from api.client import get_api_client, APIClient + +__all__ = ["get_api_client", "APIClient"] \ No newline at end of file diff --git a/api/client.py b/api/client.py new file mode 100644 index 0000000000000000000000000000000000000000..c8e26704dad80d0e594420b7855be9b1f3571929 --- /dev/null +++ b/api/client.py @@ -0,0 +1,37 @@ +""" +Base API client module for Mezura. 
+""" + +import logging +from typing import Dict, Any, Optional, Type + +# Create logger +logger = logging.getLogger(__name__) + +class APIClient: + """Base class for API clients.""" + + def __init__(self): + """Initialize the API client.""" + pass + +def get_api_client(evaluation_type: str = None) -> Optional[Any]: + """ + Get the appropriate API client for the given evaluation type. + + Args: + evaluation_type: The type of evaluation (unused, always returns AirflowClient) + + Returns: + Optional[Any]: An instance of the AirflowClient, or None if not available + """ + try: + # Import here to avoid circular imports + from api.clients.airflow_client import AirflowClient + + # Always return AirflowClient as it's the only client used in the application + return AirflowClient() + + except Exception as e: + logger.error(f"Error creating API client: {e}") + return None \ No newline at end of file diff --git a/api/clients/__init__.py b/api/clients/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..081c8a8f67cb3ac226e0507994d0c58dc180c42b --- /dev/null +++ b/api/clients/__init__.py @@ -0,0 +1,3 @@ +""" +API clients package. +""" diff --git a/api/clients/airflow_client.py b/api/clients/airflow_client.py new file mode 100644 index 0000000000000000000000000000000000000000..2b263e8698fb1f1e47386c89c2f2a636335c0e08 --- /dev/null +++ b/api/clients/airflow_client.py @@ -0,0 +1,322 @@ +""" +Airflow API client for Mezura. +""" +import requests +import logging +import json +import os +from typing import Dict, Any, Optional +import re + +from api.config import get_api_config, get_api_config_for_type, get_airflow_config + +# Set up logger +logger = logging.getLogger(__name__) + +class AirflowClient: + """ + Client for interacting with Airflow API + """ + + def __init__(self): + """Initialize the Airflow API client""" + # Get Airflow configuration from config module + airflow_config = get_airflow_config() + + # Set the base URL for Airflow API + self.airflow_base_url = airflow_config.get("base_url") + if not self.airflow_base_url: + raise ValueError("Airflow base URL not found in configuration") + + # Get auth credentials from config + auth = airflow_config.get("auth", {}) + self.username = auth.get("username") + self.password = auth.get("password") + + # Validate required auth credentials + if not self.username or not self.password: + error_msg = "Airflow authentication credentials not found in configuration" + + # Check if environment variables are set properly + if auth.get("use_env", False): + username_env = auth.get("env_username", "MEZURA_API_USERNAME") + password_env = auth.get("env_password", "MEZURA_API_PASSWORD") + + username_exists = os.environ.get(username_env) is not None + password_exists = os.environ.get(password_env) is not None + + if not username_exists or not password_exists: + missing_vars = [] + if not username_exists: + missing_vars.append(username_env) + if not password_exists: + missing_vars.append(password_env) + + error_msg = f"Required environment variables not set: {', '.join(missing_vars)}" + + raise ValueError(error_msg) + + # Get timeout and retry settings + self.timeout = airflow_config.get("timeout", 30) + self.retry_attempts = airflow_config.get("retry_attempts", 3) + + logger.info(f"Airflow API client initialized with base URL: {self.airflow_base_url}") + # SECURITY: Commented out to prevent potential credential exposure + # logger.info(f"Using auth credentials: {self.username}:***") + + def send_dag_request(self, dag_id: str, conf: Dict[str, 
Any]) -> Dict[str, Any]: + """ + Sends a request to start a DAG run. + + Args: + dag_id: The ID of the DAG to run + conf: The configuration for the DAG run + + Returns: + Dict[str, Any]: DAG run response + """ + try: + # Airflow API endpoint for triggering DAGs + airflow_endpoint = f"{self.airflow_base_url}/api/v1/dags/{dag_id}/dagRuns" + + # Create a proper copy of the configuration + conf_copy = conf.copy() if conf else {} + + # Sanitize the configuration to ensure all values are serializable + # This is especially important for username which might be an object in some cases + if "username" in conf_copy: + # Username is required - if it's None, this is an error condition + # We'll convert it to string if needed but not remove it + if conf_copy["username"] is None: + logger.error("Username is None but required for API request") + raise ValueError("Username is required for benchmark submission") + # If username is not a primitive type, convert it to string + elif not isinstance(conf_copy["username"], (str, int, float, bool)): + conf_copy["username"] = str(conf_copy["username"]) + + # Double check for "Logout (username)" format + username_str = str(conf_copy["username"]) + logout_pattern = re.compile(r'Logout \(([^)]+)\)') + match = logout_pattern.search(username_str) + if match: + conf_copy["username"] = match.group(1) + # Handle any string with parentheses + elif '(' in username_str and ')' in username_str: + try: + start = username_str.rindex('(') + 1 + end = username_str.find(')', start) + if start < end: + extracted = username_str[start:end].strip() + if extracted: # Only use if not empty + conf_copy["username"] = extracted + except: + pass # Keep original if extraction fails + else: + # Username is required + logger.error("Username field missing from request configuration") + raise ValueError("Username is required for benchmark submission") + + # Create request payload - e-posta değerini olduğu gibi kullan + payload = {"conf": conf_copy} + + # Log the request we're about to send - maske YOK, gerçek değerleri loglanacak + # logger.info(f"Sending POST request to: {airflow_endpoint}") + # SECURITY: Commented out to prevent potential credential exposure + # logger.info(f"Request payload: {json.dumps(payload)}") + + # Send the request + response = requests.post( + airflow_endpoint, + json=payload, + auth=(self.username, self.password), + timeout=self.timeout, + headers={ + 'Content-Type': 'application/json', + 'Accept': 'application/json' + } + ) + + # Log the response + logger.info(f"Response status code: {response.status_code}") + + # Check if request was successful + if response.status_code in (200, 201): + try: + data = response.json() + logger.info(f"Response data: {json.dumps(data)}") + + run_id = data.get("dag_run_id", "unknown") + logger.info(f"DAG run triggered: {run_id}") + + return { + "run_id": run_id, + "status": "submitted", + "dag_id": dag_id + } + except Exception as e: + logger.error(f"Error parsing response: {e}") + return { + "error": f"Error parsing response: {str(e)}", + "status": "error", + "dag_id": dag_id + } + else: + error_msg = f"API Error: {response.status_code}, {response.text}" + logger.error(error_msg) + return { + "error": error_msg, + "status": "error", + "dag_id": dag_id + } + + except Exception as e: + error_msg = f"Request failed: {str(e)}" + logger.error(error_msg) + return { + "error": error_msg, + "status": "error", + "dag_id": dag_id + } + + def send_status_request(self, dag_id: str, run_id: str) -> Dict[str, Any]: + """ + Sends a status request to 
check the status of a DAG run. + + Args: + dag_id: The ID of the DAG + run_id: The DAG run ID returned by the send_dag_request method + + Returns: + Dict[str, Any]: Status information + """ + try: + # Airflow DAG run status endpoint + status_url = f"{self.airflow_base_url}/api/v1/dags/{dag_id}/dagRuns/{run_id}" + + # Log the request + logger.info(f"Checking status for DAG run: {run_id}, URL: {status_url}") + + # Send the request + response = requests.get( + status_url, + auth=(self.username, self.password), + timeout=self.timeout, + headers={'Accept': 'application/json'} + ) + + # Log response status + logger.info(f"Status response code: {response.status_code}") + + if response.status_code == 200: + try: + data = response.json() + state = data.get("state", "unknown") + + # Map Airflow states to Mezura states + status_mapping = { + "running": "running", + "success": "completed", + "failed": "failed", + "queued": "pending" + } + + status_info = { + "status": status_mapping.get(state, "unknown"), + "progress": 100 if state == "success" else 0, + "current_step": state, + "error": None if state != "failed" else "DAG execution failed", + "run_id": run_id, + "dag_id": dag_id + } + + logger.info(f"DAG run status: {state}") + + return status_info + + except Exception as e: + error_msg = f"Error parsing status response: {str(e)}" + logger.error(error_msg) + return { + "status": "error", + "error": error_msg, + "run_id": run_id, + "dag_id": dag_id + } + else: + error_msg = f"Status API Error: {response.status_code}, {response.text}" + logger.error(error_msg) + return { + "status": "error", + "error": error_msg, + "run_id": run_id, + "dag_id": dag_id + } + + except Exception as e: + error_msg = f"Status request failed: {str(e)}" + logger.error(error_msg) + return { + "status": "error", + "error": error_msg, + "run_id": run_id, + "dag_id": dag_id + } + + def send_logs_request(self, dag_id: str, run_id: str, task_id: str = "process_results") -> Dict[str, Any]: + """ + Sends a request to get the logs of a DAG run. 
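        Example (illustrative sketch; the DAG id and conf values below are placeholders
        based on other modules in this repo, not guaranteed defaults):

            client = AirflowClient()
            run = client.send_dag_request(
                dag_id="accept_request_dag",
                conf={"hf_repo": "org/model", "username": "hf-user"},
            )
            # Fetch logs for the run that was just triggered
            logs = client.send_logs_request("accept_request_dag", run["run_id"])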
+ + Args: + dag_id: The ID of the DAG + run_id: The DAG run ID + task_id: The task ID to get logs for, defaults to process_results + + Returns: + Dict[str, Any]: Log information + """ + try: + # Log endpoint URL + logs_url = f"{self.airflow_base_url}/api/v1/dags/{dag_id}/dagRuns/{run_id}/taskInstances/{task_id}/logs" + + # Log the request + logger.info(f"Getting logs for DAG run ID: {run_id}, URL: {logs_url}") + + # Send the request + response = requests.get( + logs_url, + auth=(self.username, self.password), + timeout=self.timeout, + headers={'Accept': 'application/json'} + ) + + # Log response status + logger.info(f"Logs response code: {response.status_code}") + + if response.status_code == 200: + return { + "logs": response.text, + "status": "success", + "run_id": run_id, + "dag_id": dag_id + } + else: + error_msg = f"Logs API Error: {response.status_code}, {response.text}" + logger.error(error_msg) + return { + "status": "error", + "error": error_msg, + "run_id": run_id, + "dag_id": dag_id, + "logs": "Failed to retrieve logs" + } + + except Exception as e: + error_msg = f"Logs request failed: {str(e)}" + logger.error(error_msg) + return { + "status": "error", + "error": error_msg, + "run_id": run_id, + "dag_id": dag_id, + "logs": "Failed to retrieve logs due to an error" + } \ No newline at end of file diff --git a/api/config.py b/api/config.py new file mode 100644 index 0000000000000000000000000000000000000000..3fc3cebcf999347835348cb3b246eda7f09e12a2 --- /dev/null +++ b/api/config.py @@ -0,0 +1,155 @@ +""" +API configuration module. +""" +import os +import logging +import yaml +from typing import Dict, Any, Optional +import base64 + +logger = logging.getLogger(__name__) + +# Cache config for performance +_config_cache = None + +# Store the list of supported base models +_supported_base_models = [] + +def get_api_config() -> Dict[str, Any]: + """ + Returns the API configuration. + + Returns: + Dict[str, Any]: API configuration + """ + global _config_cache + + # Use cache if available + if _config_cache is not None: + return _config_cache + + # Try to load config from file + config_path = os.environ.get("MEZURA_CONFIG_PATH", "config/api_config.yaml") + if os.path.exists(config_path): + try: + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + _config_cache = config + logger.info(f"Loaded API configuration from {config_path}") + return config + except Exception as e: + logger.error(f"Error loading config from {config_path}: {e}") + raise RuntimeError(f"Failed to load configuration from {config_path}: {e}") + else: + # If config file not found, raise error + error_msg = f"Configuration file not found at {config_path}" + logger.error(error_msg) + raise FileNotFoundError(error_msg) + +def get_api_config_for_type(evaluation_type: str) -> Dict[str, Any]: + """ + Get API configuration for a specific evaluation type. 
+ + Args: + evaluation_type: Evaluation type (e.g., "evalmix") + + Returns: + Dict[str, Any]: API configuration for the specified type + """ + config = get_api_config() + + # Convert evaluation type to config key + api_type = evaluation_type.lower().replace("-", "_") + + # Get API configuration + if "apis" in config and api_type in config["apis"]: + type_config = config["apis"][api_type] + logger.debug(f"Using config for {evaluation_type}: {type_config}") + return type_config + + # Get default configuration if not found + if "default" in config: + logger.warning(f"No configuration found for {evaluation_type}, using default") + return config["default"] + + # If no default config either, return empty dict + logger.warning(f"No configuration found for {evaluation_type} and no default config") + return {} + +def get_airflow_config() -> Dict[str, Any]: + """ + Get Airflow API configuration. + + Returns: + Dict[str, Any]: Airflow API configuration + """ + config = get_api_config() + + # Get Airflow config from the loaded yaml configuration + if "apis" in config and "airflow" in config["apis"]: + airflow_config = config["apis"]["airflow"] + logger.debug(f"Using Airflow config from YAML: {airflow_config}") + + # --- Load base_url from environment if available --- + env_base_url = os.environ.get("AIRFLOW_URL") + if env_base_url: + airflow_config["base_url"] = env_base_url + logger.info(f"Loaded Airflow base_url from environment variable AIRFLOW_URL: {env_base_url}") + else: + logger.info(f"Using Airflow base_url from YAML config: {airflow_config.get('base_url')}") + # --- END base_url env logic --- + + # Check if credentials should be loaded from environment + auth_config = airflow_config.get("auth", {}) + if auth_config.get("use_env", False): + # Get environment variable names + username_env = auth_config.get("env_username", "MEZURA_API_USERNAME") + password_env = auth_config.get("env_password", "MEZURA_API_PASSWORD") + + # Log environment variable names + logger.info(f"Looking for credentials in environment variables: {username_env}, {password_env}") + + # Check if environment variables are set + username = os.environ.get(username_env) + password = os.environ.get(password_env) + + # SECURITY: Commented out to prevent potential credential exposure + # Directly access environment variables for better logging + # all_env_vars = os.environ.keys() + # logger.info(f"Available environment variables: {', '.join(all_env_vars)}") + + # Log results of environment variable check + logger.info(f"Username variable '{username_env}' found: {username is not None}") + logger.info(f"Password variable '{password_env}' found: {password is not None}") + + # Update auth config with credentials from environment + if username and password: + auth_config["username"] = username + auth_config["password"] = password + # Update the auth config in airflow_config + airflow_config["auth"] = auth_config + + return airflow_config + + # If not found in config, log warning and return empty dict + logger.warning("Airflow configuration not found in config file") + return {} + +def update_base_model_list(models): + """ + Updates the list of supported base models. + + Args: + models (list): List of supported model names + """ + global _supported_base_models + _supported_base_models = models + +def get_base_model_list(): + """ + Returns the current list of supported base models. 
+ + Returns: + list: List of supported model names + """ + return _supported_base_models \ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..d5493d3d589b33e8d42325db06da6bb136e4a753 --- /dev/null +++ b/app.py @@ -0,0 +1,712 @@ +import gradio as gr +import pandas as pd +import os +import sys +import traceback +import logging + +# Disable SSL verification for curl requests if needed +os.environ['CURL_CA_BUNDLE'] = '' + +# Configure minimal logging first thing - before any imports +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) + +# Disable all potentially sensitive loggers immediately +logging.getLogger("httpx").setLevel(logging.ERROR) +logging.getLogger("urllib3").setLevel(logging.ERROR) +logging.getLogger("matplotlib").setLevel(logging.WARNING) +logging.getLogger("huggingface_hub").setLevel(logging.ERROR) + +# Minimize the OAuth imports to prevent errors +from gradio.oauth import OAuthProfile + +from src.display.about import ( + CITATION_BUTTON_LABEL, + CITATION_BUTTON_TEXT, + EVALUATION_QUEUE_TEXT, + INTRODUCTION_TEXT, + LLM_BENCHMARKS_TEXT, + TITLE, +) + +from src.display.css_html_js import custom_css + +from src.utils import ( + restart_space, + load_benchmark_results, + create_benchmark_plots, + create_combined_leaderboard_table, + create_evalmix_table, + create_light_eval_table, + create_raw_details_table, + create_human_arena_table, + update_supported_base_models +) + +# Pipelines utils fonksiyonlarını import et +from pipelines.utils.common import search_and_filter +from pipelines.unified_benchmark import submit_unified_benchmark + +# Evaluation types +EVAL_TYPES = ["EvalMix", "RAG-Judge", "Light-Eval", "Arena", "Snake-Bench"] + +# Initialize OAuth configuration +OAUTH_CLIENT_ID = os.getenv("OAUTH_CLIENT_ID") +OAUTH_CLIENT_SECRET = os.getenv("OAUTH_CLIENT_SECRET") +OAUTH_SCOPES = os.getenv("OAUTH_SCOPES", "email") +OPENID_PROVIDER_URL = os.getenv("OPENID_PROVIDER_URL") +SESSION_TIMEOUT_MINUTES = int(os.getenv("HF_OAUTH_EXPIRATION_MINUTES", 30)) + +def format_dataframe(df, is_light_eval_detail=False): + """ + Float değerleri 2 ondalık basamağa yuvarla, + 'file' sütununu kaldır ve kolon isimlerini düzgün formata getir + + Args: + df: DataFrame to format + is_light_eval_detail: If True, use 4 decimal places for light eval detail results + """ + if df.empty: + return df + + # 'file' sütununu kaldır + if 'file' in df.columns: + df = df.drop(columns=['file']) + + # Specifically remove problematic columns + columns_to_remove = ["run_id", "user_id", "total_success_references", "Total Success References", "total_eval_samples", + "total_samples", "samples_number"] + for col in columns_to_remove: + if col in df.columns: + df = df.drop(columns=[col]) + + # Float değerleri yuvarlama - light eval detail için 4 hane, diğerleri için 2 hane + decimal_places = 4 if is_light_eval_detail else 2 + for column in df.columns: + try: + if pd.api.types.is_float_dtype(df[column]): + df[column] = df[column].round(decimal_places) + except: + continue + + # Kolon isimlerini düzgün formata getir + column_mapping = {} + for col in df.columns: + # Skip run_id and user_id fields + if col.lower() in ["run_id", "user_id"]: + continue + + # Special handling for Turkish Semantic column + if "turkish_semantic" in col.lower(): + column_mapping[col] = "Turkish Semantic" + continue + + # Special handling for Multilingual Semantic column + if "multilingual_semantic" in col.lower(): + 
column_mapping[col] = "Multilingual Semantic" + continue + + # Skip already well-formatted columns or columns that contain special characters + if col == "Model Name" or " " in col: + # Still process column if it contains "mean" + if " mean" in col.lower(): + cleaned_col = col.replace(" mean", "").replace(" Mean", "") + column_mapping[col] = cleaned_col + continue + + # model_name column should be Model Name + if col == "model_name": + column_mapping[col] = "Model Name" + continue + + # Remove the word "mean" from column names (case insensitive) + cleaned_col = col.replace(" mean", "").replace("_mean", "") + + # Format column name by replacing underscores with spaces and capitalizing each word + formatted_col = " ".join([word.capitalize() for word in cleaned_col.replace("_", " ").split()]) + column_mapping[col] = formatted_col + + # Rename columns with the mapping + if column_mapping: + df = df.rename(columns=column_mapping) + + return df + +# User authentication function +def check_user_login(profile): + if profile is None: + return False, "Please log in with your Hugging Face account to submit models for benchmarking." + + # In some environments, profile may be a string instead of a profile object + if isinstance(profile, str): + if profile == "": + return False, "Please log in with your Hugging Face account to submit models for benchmarking." + return True, f"Logged in as {profile}" + + # Normal case where profile is an object with username attribute + return True, f"Logged in as {profile.username}" + +def create_demo(): + # Get logger for this function + logger = logging.getLogger("mezura") + + with gr.Blocks(css=custom_css) as demo: + # Update supported base models at startup + logger.info("Updating supported base models at startup...") + update_supported_base_models() + logger.info("Base models updated successfully") + + gr.Markdown(TITLE) + gr.Markdown(INTRODUCTION_TEXT) + + # Hidden session state to track login expiration + session_expiry = gr.State(None) + + try: + # Benchmark sonuçlarını yükle + benchmark_results = load_benchmark_results() + default_plots = create_benchmark_plots(benchmark_results, "avg") + + # State variable to track login state across page refreshes + login_state = gr.State(value=False) + + with gr.Tabs() as tabs: + with gr.TabItem("🏆 LLM Benchmark", elem_id="llm-benchmark-tab"): + gr.Markdown("## Model Evaluation Results") + gr.Markdown("This screen shows model performance across different evaluation categories.") + + # Remove the separate refresh button row + # Instead, combine search and refresh in one row + with gr.Row(): + search_input = gr.Textbox( + label="🔍 Search for your model (separate multiple queries with `;`) and press ENTER...", + placeholder="Enter model name or evaluation information...", + show_label=False + ) + # # Update refresh button to be orange with "Refresh Results" text + # refresh_button = gr.Button("🔄 Refresh Results", variant="primary") + + # # Status display for refresh results + # refresh_status = gr.Markdown("", visible=False) + + # Benchmark tablarını semboller içeren tab grubuyla göster + with gr.Tabs() as benchmark_tabs: + with gr.TabItem("🏆 Leaderboard"): + # Birleşik leaderboard tablosu - avg_json dosyalarındaki tüm bilgileri göster + # Only use default data (avg files) for the leaderboard + combined_df = create_combined_leaderboard_table(benchmark_results) + # Float değerleri formatlama + combined_df = format_dataframe(combined_df) + + # Tüm sütunları göster + if not combined_df.empty: + leaderboard_df = combined_df.copy() 
+ else: + leaderboard_df = pd.DataFrame({"Model Name": ["No data available"]}) + + # Orijinal veriyi saklayacak state değişkeni + original_leaderboard_data = gr.State(value=leaderboard_df) + + combined_table = gr.DataFrame( + value=leaderboard_df, + label="Model Performance Comparison", + interactive=False, + column_widths=["300px", "165px" ,"165px", "120px", "120px", "180px", "220px", "100px", "100px", "120px"] + + ) + + with gr.TabItem("🏟️ Auto Arena"): + # Arena sonuçları - detail dosyalarını kullan + arena_details_df = create_raw_details_table(benchmark_results, "arena") + arena_details_df = format_dataframe(arena_details_df) + + if arena_details_df.empty: + arena_details_df = pd.DataFrame({"model_name": ["No data available"]}) + + arena_table = gr.DataFrame( + value=arena_details_df, + label="Arena Detailed Results", + interactive=False, + column_widths=["300px", "150px", "110px", "110px", "180px", "100px", "120px"] + + ) + + with gr.TabItem("👥 Human Arena"): + # Human Arena sonuçları - detail dosyalarını kullan + human_arena_data = benchmark_results["raw"]["human_arena"] + if human_arena_data: + human_arena_df = create_human_arena_table(human_arena_data) + else: + human_arena_df = pd.DataFrame() + + human_arena_df = format_dataframe(human_arena_df) + + if human_arena_df.empty: + human_arena_df = pd.DataFrame({"Model Name": ["No data available"]}) + + human_arena_table = gr.DataFrame( + value=human_arena_df, + label="Human Arena Results", + interactive=False, + column_widths=["300px", "150px", "110px", "110px", "110px", "156px", "169px", "100px", "120px"] + + ) + + with gr.TabItem("📚 Retrieval"): + # RAG Judge sonuçları - detail dosyalarını kullan + rag_details_df = create_raw_details_table(benchmark_results, "retrieval") + rag_details_df = format_dataframe(rag_details_df) + + if rag_details_df.empty: + rag_details_df = pd.DataFrame({"model_name": ["No data available"]}) + + rag_table = gr.DataFrame( + value=rag_details_df, + label="Retrieval Detailed Results", + interactive=False, + column_widths=["300px", "180px", "180px", "180px", "143px", "190px", "100px", "120px"] + + ) + + with gr.TabItem("⚡ Light Eval"): + # Light Eval sonuçları - detail dosyalarını kullan + light_details_data = benchmark_results["raw"]["light_eval"] + if light_details_data: + light_details_df = create_light_eval_table(light_details_data, is_detail=True) + else: + light_details_df = pd.DataFrame() + + light_details_df = format_dataframe(light_details_df, is_light_eval_detail=True) + + if light_details_df.empty: + light_details_df = pd.DataFrame({"model_name": ["No data available"]}) + + light_table = gr.DataFrame( + value=light_details_df, + label="Light Eval Detailed Results", + interactive=False, + column_widths=["300px", "110px", "110px", "143px", "130px", "130px", "110px", "110px", "100px", "120px"] + + ) + + with gr.TabItem("📋 EvalMix"): + # Hybrid Benchmark sonuçları - detail dosyalarını kullan + hybrid_details_df = create_raw_details_table(benchmark_results, "evalmix") + hybrid_details_df = format_dataframe(hybrid_details_df) + + if hybrid_details_df.empty: + hybrid_details_df = pd.DataFrame({"model_name": ["No data available"]}) + + hybrid_table = gr.DataFrame( + value=hybrid_details_df, + label="EvalMix Detailed Results", + interactive=False, + column_widths=["300px", "180px", "230px", "143px", "110px", "110px", "110px", "110px", "169px", "220px" ,"100px", "120px"] + + ) + + with gr.TabItem("🐍 𝐒𝐧𝐚𝐤𝐞 𝐁𝐞𝐧𝐜𝐡"): + # Snake Benchmark sonuçları - detail dosyalarını kullan + snake_details_df = 
create_raw_details_table(benchmark_results, "snake") + snake_details_df = format_dataframe(snake_details_df) + + if snake_details_df.empty: + snake_details_df = pd.DataFrame({"model_name": ["No data available"]}) + + snake_table = gr.DataFrame( + value=snake_details_df, + label="Snake Benchmark Detailed Results", + interactive=False, + column_widths=["300px", "130px", "110px", "117px", "110px", "110px", "110px", "117px", "100px", "120px"] + + ) + + # with gr.TabItem("📊 LM-Harness"): + # # LM Harness sonuçları - detail dosyalarını kullan + # lmharness_details_df = create_raw_details_table(benchmark_results, "lm_harness") + # lmharness_details_df = format_dataframe(lmharness_details_df) + # + # if lmharness_details_df.empty: + # lmharness_details_df = pd.DataFrame({"model_name": ["No data available"]}) + # + # lmharness_table = gr.DataFrame( + # value=lmharness_details_df, + # label="LM Harness Detailed Results", + # interactive=False + # ) + + + # # Refresh butonu bağlantısı + # refresh_button.click( + # refresh_leaderboard, + # inputs=[], + # outputs=[ + # refresh_status, + # combined_table, + # hybrid_table, + # rag_table, + # light_table, + # arena_table, + # lmharness_table, + # snake_table + # ] + # ) + + # Tüm sekmeler için ortak arama fonksiyonu + def search_all_tabs(query, original_data): + """ + Tüm sekmelerde arama yapar + """ + if not query or query.strip() == "": + # Boş arama - orijinal veriyi döndür + return (original_data, arena_details_df, human_arena_df, + rag_details_df, light_details_df, hybrid_details_df, snake_details_df) + + # Arama var - tüm sekmeleri filtrele + return ( + search_and_filter(query, original_data, "All"), + search_and_filter(query, arena_details_df, "All"), + search_and_filter(query, human_arena_df, "All"), + search_and_filter(query, rag_details_df, "All"), + search_and_filter(query, light_details_df, "All"), + search_and_filter(query, hybrid_details_df, "All"), + search_and_filter(query, snake_details_df, "All") + ) + + # Arama fonksiyonu - tüm sekmeleri güncelle + search_input.change( + search_all_tabs, + inputs=[search_input, original_leaderboard_data], + outputs=[combined_table, arena_table, human_arena_table, rag_table, light_table, hybrid_table, snake_table] + ) + + with gr.TabItem("ℹ️ About", elem_id="about-tab"): + gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") + + with gr.TabItem("📊 Datasets", elem_id="datasets-tab"): + gr.Markdown("## Benchmark Datasets") + gr.Markdown(""" + This section provides detailed information about the datasets used in our evaluation benchmarks. + Each dataset has been carefully selected and adapted to provide comprehensive model evaluation across different domains and capabilities. + """) + + # Create and display the datasets table + datasets_html = """ +
| Dataset | Evaluation Task | Language | Description |
|---|---|---|---|
| malhajar/mmlu_tr-v0.2 | Lighteval MMLU | Turkish | Turkish adaptation of MMLU (Massive Multitask Language Understanding) v0.2 covering 57 academic subjects, including mathematics, physics, chemistry, biology, history, law, and computer science. Tests knowledge and reasoning across multiple domains with multiple-choice questions. |
| malhajar/truthful_qa-tr-v0.2 | Lighteval TruthfulQA | Turkish | Turkish version of TruthfulQA (v0.2) designed to measure truthfulness and resistance to generating false information. Contains questions that humans often answer incorrectly due to misconceptions or false beliefs, testing the model's ability to provide accurate information. |
| malhajar/winogrande-tr-v0.2 | Lighteval WinoGrande | Turkish | Turkish adaptation of WinoGrande (v0.2) focusing on commonsense reasoning through pronoun-resolution tasks. Tests the model's ability to understand context, make logical inferences, and resolve ambiguous pronouns in everyday scenarios. |
| malhajar/hellaswag_tr-v0.2 | Lighteval HellaSwag | Turkish | Turkish version of HellaSwag (v0.2) for commonsense-reasoning evaluation. Tests the model's ability to predict plausible continuations of everyday scenarios and activities, requiring an understanding of common sense and typical human behavior. |
| malhajar/arc-tr-v0.2 | Lighteval ARC | Turkish | Turkish adaptation of ARC (AI2 Reasoning Challenge) v0.2 focusing on science reasoning and question answering. Contains grade-school science questions that require reasoning beyond simple factual recall, covering physics, chemistry, biology, and earth science. |
| malhajar/gsm8k_tr-v0.2 | Lighteval GSM8K | Turkish | Turkish version of GSM8K (Grade School Math 8K) v0.2 for mathematical-reasoning evaluation. Contains grade-school math word problems that require multi-step reasoning, arithmetic operations, and logical problem solving to reach the correct numerical answer. |
| newmindai/mezura-eval-data | Auto-Arena | Turkish | The mezura-eval dataset is a Turkish-language legal-text dataset designed for evaluation tasks with RAG context support. Its subsets cover domains such as Environmental Law, Tax Law, Data Protection Law, and Health Law, each containing annotated samples. Every row includes structured fields such as category, concept, input, and contextual information drawn from sources like official decisions. |
| newmindai/mezura-eval-data | EvalMix | Turkish | The same mezura-eval dataset described above, used here for the EvalMix task: Turkish legal texts with RAG context support, annotated samples across the same law domains, and structured category/concept/input/context fields drawn from sources like official decisions. |
| newmindai/mezura-eval-data | Retrieval | Turkish | The same mezura-eval dataset described above, used here for the Retrieval task: Turkish legal texts with RAG context support, annotated samples across the same law domains, and structured category/concept/input/context fields drawn from sources like official decisions. |
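The Lighteval datasets above are public Hugging Face datasets. A minimal sketch for inspecting one of them locally (illustrative only; configuration and split names may differ per dataset and are not taken from this repo):

    from datasets import load_dataset

    # Load the Turkish MMLU adaptation listed above and show its splits/columns.
    ds = load_dataset("malhajar/mmlu_tr-v0.2")
    print(ds)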
Authentication required. Please log in with your Hugging Face account to submit models.
" + ) + ) + + # Log successful authentication + try: + + if hasattr(profile, 'name'): + username = profile.name + elif hasattr(profile, 'username'): + username = profile.username + else: + username = str(profile) + + logger.info(f"User authenticated: {username}") + except Exception as e: + logger.info(f"LOGIN - Error inspecting profile: {str(e)}") + + # User is logged in - show form, hide error + return ( + gr.update(visible=True), + gr.update(visible=False, value="") + ) + + # Connect login button to visibility toggle + login_button.click( + fn=toggle_form_visibility, + inputs=[login_button], + outputs=[login_dependent_content, auth_error] + ) + + # Check visibility on page load + demo.load( + fn=toggle_form_visibility, + inputs=[login_button], + outputs=[login_dependent_content, auth_error] + ) + + # Handle submission with authentication check + def submit_model(model, base_model, reasoning, email, profile): + # Authentication check + if profile is None: + logging.warning("Unauthorized submission attempt with no profile") + return "Authentication required. Please log in with your Hugging Face account.
" + + # IMPORTANT: In local development, Gradio returns "Sign in with Hugging Face" string + # This is NOT a real authentication, just a placeholder for local testing + if isinstance(profile, str) and profile == "Sign in with Hugging Face": + # Block submission in local dev with mock auth + return "⚠️ HF authentication required.
" + + # Email is required + if not email or email.strip() == "": + return "Email address is required to receive benchmark results.
" + + # Check if the model is a merged model (not supported) + try: + from src.submission.check_validity import determine_model_type + model_type, _ = determine_model_type(model) + if model_type == "merged_model" or model_type == "merge": + return "Merged models are not supported yet. Please submit an adapter model instead.
" + except Exception as e: + # If error checking model type, continue with submission + logging.warning(f"Error checking model type: {str(e)}") + + # Call the benchmark function with profile information + # base_model validasyonunu kaldırdık ama parametre olarak yine de gönderiyoruz + result_message, _ = submit_unified_benchmark(model, base_model, reasoning, email, profile) + logging.info(f"Submission processed for model: {model}") + return result_message + + # Connect submit button + submit_button.click( + fn=submit_model, + inputs=[ + model_to_evaluate, + base_model_dropdown, + reasoning_checkbox, + email_input, + login_button + ], + outputs=[result_output] + ) + + except Exception as e: + traceback.print_exc() + gr.Markdown(f"## Error: An issue occurred while loading the LLM Benchmark screen") + gr.Markdown(f"Error message: {str(e)}") + gr.Markdown("Please check your configuration and try again.") + + # Citation information at the bottom + gr.Markdown("---") + with gr.Accordion(CITATION_BUTTON_LABEL, open=False): + gr.Textbox( + value=CITATION_BUTTON_TEXT, + lines=10, + show_copy_button=True, + label=None + ) + + return demo + +if __name__ == "__main__": + # Get app logger + logger = logging.getLogger("mezura") + + # Additional sensitive filter for remaining logs + class SensitiveFilter(logging.Filter): + def filter(self, record): + msg = record.getMessage().lower() + # Filter out messages with tokens, URLs with sign= in them, etc + sensitive_patterns = ["token", "__sign=", "request", "auth", "http request"] + return not any(pattern in msg.lower() for pattern in sensitive_patterns) + + # Apply the filter to all loggers + for logger_name in logging.root.manager.loggerDict: + logging.getLogger(logger_name).addFilter(SensitiveFilter()) + + try: + logger.info("Creating demo...") + demo = create_demo() + logger.info("Launching demo on 0.0.0.0...") + + # Add options to fix the session.pop error + demo.launch( + server_name="0.0.0.0", + server_port=7860 + ) + + except FileNotFoundError as e: + logger.critical(f"Configuration file not found: {e}") + print(f"\n\nERROR: Configuration file not found. Please ensure config/api_config.yaml exists.\n{e}\n") + sys.exit(1) + except ValueError as e: + logger.critical(f"Configuration error: {e}") + print(f"\n\nERROR: Invalid configuration. Please check your config/api_config.yaml file.\n{e}\n") + sys.exit(1) + except Exception as e: + logger.critical(f"Could not launch demo: {e}", exc_info=True) diff --git a/config/api_config.yaml b/config/api_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a94b0f1aad1311d02f5612a4e36785c832298a98 --- /dev/null +++ b/config/api_config.yaml @@ -0,0 +1,28 @@ +# API Configuration +# This file contains API endpoints and other configuration settings. +# When endpoints change, you can update this file to use new endpoints without changing your code. 
+ +# API configuration for each evaluation type +# API configuration for each evaluation type +apis: + # Airflow configuration + airflow: + # base_url is now loaded from the AIRFLOW_URL environment variable + base_url: "${AIRFLOW_URL}" + timeout: 300 + retry_attempts: 3 + auth: + type: "basic" + # Credentials will be loaded from environment variables + use_env: true + env_username: "MEZURA_API_USERNAME" + env_password: "MEZURA_API_PASSWORD" + +# General configuration +default: + timeout: 300 + retry_attempts: 3 + log_level: "INFO" + headers: + accept: "application/json" + content-type: "application/json" \ No newline at end of file diff --git a/helpers_modular.py b/helpers_modular.py new file mode 100644 index 0000000000000000000000000000000000000000..379558dcbdba9c152aec7d098a191027d863f1f4 --- /dev/null +++ b/helpers_modular.py @@ -0,0 +1,35 @@ +import gradio as gr +import logging + +# Import common utilities +from pipelines.utils.common import ( + set_ui_component, + get_ui_component, + set_evaluation_type, + get_evaluation_type, + update_evaluation_params, + search_and_filter +) + +# Import API utilities +from pipelines.utils.api import ( + update_status, + fetch_logs, + fetch_results +) + +# Set up logger +logger = logging.getLogger(__name__) + +# Set functions to make available to the app +__all__ = [ + "set_ui_component", + "get_ui_component", + "set_evaluation_type", + "get_evaluation_type", + "update_evaluation_params", + "search_and_filter", + "update_status", + "fetch_logs", + "fetch_results" +] \ No newline at end of file diff --git a/pipelines/__init__.py b/pipelines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..602f637fa3feee8b9e0e4890fa617fd4cd8b5199 --- /dev/null +++ b/pipelines/__init__.py @@ -0,0 +1,3 @@ +""" +Pipelines package for Mezura evaluation +""" \ No newline at end of file diff --git a/pipelines/benchmark_configs.py b/pipelines/benchmark_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..0060bed9bdcd5caa90f5c6730ad31fc4d3f5a82c --- /dev/null +++ b/pipelines/benchmark_configs.py @@ -0,0 +1,28 @@ +""" +Benchmark configuration utilities. +""" +from typing import Dict, Any + +def get_unified_benchmark_config(hf_repo: str, base_model: str = None) -> Dict[str, Any]: + """ + Get configuration for Unified Benchmark Pipeline. + + Args: + hf_repo: Model repository to evaluate + base_model: Base model information + + Returns: + Dict with configuration for the benchmark + """ + return { + "conf": { + "hf_repo": hf_repo, + "base_model": base_model, + "model_name": "mezura-test-model", + "model_type": None, # Will be determined and set later + "reasoning": False, # Default to False, will be overridden later + "email": None, # Default to None, will be overridden if provided + "user_id": None, # Generated from username only + "request_id": None # Generated from repo_id, base_model and reasoning + } + } \ No newline at end of file diff --git a/pipelines/unified_benchmark.py b/pipelines/unified_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..6a3a2010eae2b5e709fa9bee3eee9ea9d5bd7e8c --- /dev/null +++ b/pipelines/unified_benchmark.py @@ -0,0 +1,486 @@ +""" +Unified Benchmark Pipeline + +This module provides a unified interface to run all benchmark pipelines +with a single model name. 
+""" + +import logging +import json +import os +import re +from typing import Dict, Any, List, Optional, Tuple +import traceback +import hashlib + +# Import email validator library +try: + from email_validator import validate_email, EmailNotValidError + logger = logging.getLogger(__name__) +except ImportError as e: + # If library is not installed, log warning + validate_email = None + EmailNotValidError = Exception + logger = logging.getLogger(__name__) + logger.warning(f"email-validator library not installed: {e}") + +from api.clients.airflow_client import AirflowClient +from api.config import get_api_config_for_type, get_airflow_config +from pipelines.benchmark_configs import get_unified_benchmark_config +from src.utils import log_model_submission + +# Import formatting utilities +from src.display.formatting import styled_error, styled_message, styled_warning +from src.submission.check_model_type import check_model_type +from src.submission.check_validity import determine_model_type + +# Set up logger +logger = logging.getLogger(__name__) +# Configure logging to be minimal +logging.getLogger("api.clients.airflow_client").setLevel(logging.ERROR) +logging.getLogger("mezura.http").setLevel(logging.ERROR) +logging.getLogger("api.config").setLevel(logging.ERROR) + +# Get DAG IDs from configuration +def get_dag_id(benchmark_type: str) -> str: + """ + Get the DAG ID for a benchmark type from configuration + + Args: + benchmark_type: Type of benchmark + + Returns: + str: DAG ID for the benchmark + """ + config = get_api_config_for_type(benchmark_type) + return config.get("dag_id", f"{benchmark_type}_benchmark") + +# Map benchmark types to their DAG IDs +DAG_IDS = { + "unified": "accept_request_dag", + "hybrid": "evalmix", + # "lmharness": "lmharness_benchmark", # LM Harness removed + "rag": "rag_benchmark", + "snake": "snake_benchmark", + "arena": "arena_evaluation", + "light": "lighteval_benchmark" +} + +class BenchmarkRunner: + """ + Runner class for unified benchmark execution + """ + + def __init__(self): + """Initialize the benchmark runner""" + self.client = AirflowClient() + + def run_all_benchmarks(self, hf_repo: str, base_model: str = None) -> Dict[str, Any]: + """ + Run the unified benchmark pipeline for a single model + + Args: + hf_repo: Name of the model repository to evaluate + base_model: Base model information (optional) + + Returns: + Dict[str, Any]: Dictionary with benchmark results + """ + # Log the benchmark start + logger.info("Preparing benchmark task") + + # Get the unified benchmark configuration + benchmark_config = get_unified_benchmark_config(hf_repo, base_model) + + # Send the benchmark request + try: + logger.info("Submitting benchmark task to execution system") + result = self.client.send_dag_request( + dag_id=DAG_IDS["unified"], + conf=benchmark_config["conf"] + ) + + return { + "status": "success", + "message": f"Benchmark started successfully", + "results": {"unified": result} + } + + except Exception as e: + logger.error("Benchmark submission failed") + return { + "status": "error", + "message": f"Error running benchmark: {str(e)}", + "results": {} + } + + def run_all_benchmarks_with_config(self, benchmark_config: Dict[str, Any]) -> Dict[str, Any]: + """ + Run the unified benchmark pipeline for a single model with a given benchmark configuration + + Args: + benchmark_config: Dictionary with benchmark configuration + + Returns: + Dict[str, Any]: Dictionary with benchmark results + """ + # Get the model name + model_name = benchmark_config.get("conf", 
{}).get("repo_id", "model") + if "hf_repo" in benchmark_config.get("conf", {}): + model_name = benchmark_config["conf"]["hf_repo"] + logger.info(f"Preparing benchmark with configuration for model: {model_name}") + + # SECURITY: Commented out to prevent potential credential exposure + # logger.info(f"Benchmark configuration: {json.dumps(benchmark_config)}") + + # SECURITY: Commented out to prevent potential credential exposure + # logger.info(f"POST payload: {json.dumps(benchmark_config['conf'])}") + + # Add specific logging for base model and repo ID + repo_id = benchmark_config.get('conf', {}).get('repo_id', 'unknown') + base_model = benchmark_config.get('conf', {}).get('base_model', 'unknown') + # logger.info(f"SENDING TO AIRFLOW - REPO_ID: {repo_id}, BASE_MODEL: {base_model}") + + # Log to dedicated submissions log file + log_model_submission(repo_id, base_model) + + # Send the benchmark request + try: + logger.info("Submitting benchmark task to execution system") + result = self.client.send_dag_request( + dag_id=DAG_IDS["unified"], + conf=benchmark_config["conf"] + ) + + return { + "status": "success", + "message": "Benchmark started successfully", + "results": {"unified": result} + } + + except Exception as e: + logger.error(f"Benchmark submission failed: {str(e)}") + logger.error(f"Exception details: {traceback.format_exc()}") + return { + "status": "error", + "message": f"Error running benchmark: {str(e)}", + "results": {} + } + +# Email validation function with email-validator +def is_valid_email(email: str) -> bool: + """ + Validate email using email-validator library + + Args: + email: Email string to validate + + Returns: + bool: True if email is valid according to email-validator + """ + if not email: + return False + + # Use email-validator library if available + if validate_email is not None: + try: + # Validate the email (no deliverability check needed for our case) + emailinfo = validate_email(email, check_deliverability=False) + logger.info(f"Email validation successful") + # Store the normalized form of the email address + return True + except EmailNotValidError as e: + # Log the specific validation error + logger.info(f"Email validation failed") + return False + + # If library not installed, fall back to simple regex validation + logger.warning("Using fallback email validation") + basic_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$') + if bool(basic_pattern.match(email)): + logger.info(f"Email validation successful") + return True + + logger.info(f"Email validation failed") + return False + +def submit_unified_benchmark(hf_repo: str, base_model: str = None, reasoning: bool = False, email: str = None, profile=None): + """ + Submit unified benchmark for a single model + + Args: + hf_repo: Name of the model repository to evaluate + base_model: Base model information (artık kullanılmıyor) + reasoning: Whether to enable reasoning capability during evaluation + email: Email address for notification (required) + profile: User profile object from Hugging Face login (required) + + Returns: + Tuple[str, Dict]: A tuple containing: + - result_text: A markdown string with the status message + - dag_run_info: A dictionary with the DAG run information + """ + try: + + # Verify user is logged in before allowing submission + if profile is None: + return styled_error("Authentication required. 
Please log in with your Hugging Face account to submit models."), {} + + # Get username from profile for logging + username = None + try: + if hasattr(profile, 'username'): + username = profile.username + elif hasattr(profile, 'name'): + username = profile.name + elif isinstance(profile, str) and profile.strip(): + username = profile + + # Extract username from "Logout (username)" format if present + logout_pattern = re.compile(r'Logout \(([^)]+)\)') + match = logout_pattern.search(username) + if match: + username = match.group(1) + + # Also try other common formats + # Handle any string with parentheses containing the username + elif '(' in username and ')' in username: + # Extract text between last opening and first closing parenthesis + start = username.rindex('(') + 1 + end = username.find(')', start) + if start < end: + extracted = username[start:end].strip() + if extracted: # Only use if not empty + username = extracted + # If none of the above conditions are met, keep username as None + + # If username is not None, ensure it's a string + if username is not None: + if not isinstance(username, str): + username = str(username) + except Exception as e: + username = None + logger.warning(f"Failed to extract username from profile: {str(e)}") + + # Log successful auth + logger.info(f"Submission authorized for user: {username}") + logger.info(f"Benchmark process started") + + # Validate email if provided + valid_email = None + if email: + try: + # Use the full email validation to get normalized form + if validate_email is not None: + # Validate and normalize the email + emailinfo = validate_email(email, check_deliverability=False) + valid_email = emailinfo.normalized # Use normalized form + logger.info(f"Email validation completed") + else: + # Fallback if library not available + is_valid = is_valid_email(email) + if is_valid: + valid_email = email + logger.info(f"Email validation completed") + else: + logger.warning(f"Email validation failed") + return styled_warning("Invalid email address. Please enter a valid email address."), {} + except EmailNotValidError as e: + logger.warning(f"Email validation failed") + return styled_warning(f"Invalid email address: {str(e)}"), {} + else: + # Email is required + logger.warning(f"Email required but not provided") + return styled_warning("Please provide an email address to receive benchmark results."), {} + + # First, analyze the model to get information without displaying details + _, model_data = get_model_information(hf_repo, display_full_info=False) + + # Base model algılama ve kontrol işlemleri tamamen kaldırıldı + + # Determine model type + model_type, type_message = determine_model_type(hf_repo) + logger.info(f"Model type determination completed") + + if model_type == "unknown": + return styled_warning(f"Could not determine model type. Benchmark not submitted."), {} + + # New check: Don't allow merged models + if model_type == "merge" or model_type == "merged_model": + logger.warning(f"Merged model detected. Currently not supported.") + return styled_warning(f"Merged models are not supported yet. 
Benchmark not submitted."), {} + + # Step 3: Generate a model evaluation name - short, with "eval", under 28 chars + # Extract short name from repo ID + if "/" in hf_repo: + org, model_name = hf_repo.split("/", 1) + short_name = model_name + else: + short_name = hf_repo + + # Clean the name and make it shorter if needed + # Remove special characters and replace with hyphens + clean_name = re.sub(r'[^a-zA-Z0-9]', '-', short_name) + # Truncate if too long + if len(clean_name) > 20: # Leave room for eval suffix + clean_name = clean_name[:20] + + # Add eval suffix if not already present + if "eval" not in clean_name.lower(): + eval_name = f"{clean_name}-eval" + else: + eval_name = clean_name + + # Ensure the final name is under 28 characters + if len(eval_name) > 28: + eval_name = eval_name[:28] + + logger.info(f"Evaluation name generation completed") + + # Create benchmark runner + runner = BenchmarkRunner() + + # Get the benchmark configuration and add model type parameter + benchmark_config = get_unified_benchmark_config(hf_repo, base_model) + + # Make sure hf_repo is set correctly in the configuration + if "conf" in benchmark_config: + # Ensure hf_repo is set properly + benchmark_config["conf"]["hf_repo"] = hf_repo + # Also set repo_id for backwards compatibility + benchmark_config["conf"]["repo_id"] = hf_repo + + # Add model type and model name to the configuration + benchmark_config["conf"]["model_type"] = model_type + benchmark_config["conf"]["unique_model_name"] = eval_name + benchmark_config["conf"]["reasoning"] = reasoning + # Set base_model + benchmark_config["conf"]["base_model"] = base_model + + # Add email if valid + if valid_email: + benchmark_config["conf"]["email"] = valid_email + + # Create a unique user_id based ONLY on username + if username is not None: + # Ensure username is a simple string + if not isinstance(username, str): + username = str(username) + # Limit username length to avoid issues + if len(username) > 100: + username = username[:100] + + # Create a unique hash from username only + user_id = hashlib.md5(username.encode()).hexdigest() + # Add user_id to the configuration + benchmark_config["conf"]["user_id"] = user_id + + # Create a separate request_id based on repo_id, base_model and reasoning + request_hash_input = f"{hf_repo}_{base_model}_{reasoning}" + request_id = hashlib.md5(request_hash_input.encode()).hexdigest() + # Add request_id to the configuration + benchmark_config["conf"]["request_id"] = request_id + + # Still add username for backward compatibility + benchmark_config["conf"]["username"] = username + else: + # Username is required for the request, so don't proceed + logger.error("Username not available, cannot submit benchmark request") + return styled_error("Authentication error. Username not available."), {} + + # Execute the unified benchmark request + logger.info("Submitting benchmark task") + results = runner.run_all_benchmarks_with_config(benchmark_config) + + # Format result for UI display + dag_run_info = {} + if results.get("status") == "success" and "unified" in results.get("results", {}): + unified_result = results["results"]["unified"] + if "run_id" in unified_result: + dag_run_info = { + "dag_run_id": unified_result["run_id"], + "dag_id": DAG_IDS["unified"], + "status": "queued" + } + + # Create simple success/error message + if results["status"] == "success": + success_msg = f"Benchmark started for {hf_repo} (Type: {model_type})" + if valid_email: + success_msg += f". Results will be sent to {valid_email}." 
+ result_message = styled_message(success_msg) + logger.info("Benchmark successfully submitted") + else: + # Log the error but show simplified message + logger.error(f"Benchmark submission failed") + result_message = styled_error("Failed to start benchmark") + + # Return message and run info + return result_message, dag_run_info + + except Exception as e: + # Log the full error + logger.error(f"Error during benchmark submission: {str(e)}") + logger.error(f"Exception details: {traceback.format_exc()}") + # Return simplified error message + return styled_error("An error occurred while submitting the benchmark"), {} + +def get_model_information(hf_repo: str, display_full_info: bool = True) -> Tuple[str, dict]: + """ + Get model type and information. + + Args: + hf_repo: Model repository ID + display_full_info: Whether to include detailed information in the returned message + + Returns: + Tuple[str, dict]: A tuple containing: + - message: Formatted message about the model + - model_info: Dictionary with model information + """ + try: + logger.info("Analyzing model information") + model_data = check_model_type(hf_repo) + + if "error" in model_data.get("info", {}): + error_message = model_data["info"]["error"] + logger.error("Model analysis failed") + return styled_error("Failed to analyze model"), {} + + # If we don't need to display full info, return minimal message + if not display_full_info: + logger.info("Model analysis completed") + return f"Model analysis completed.", model_data + + model_type = model_data.get("model_type", "unknown") + info = model_data.get("info", {}) + + # Format a nice message with full information + message = f"Type: {model_type.capitalize()}
" + + if "base_model" in info: + message += f"Base Model: {info['base_model']}
" + message += "{error}
" + + +def styled_warning(warn): + return f"{warn}
" + + +def styled_message(message): + return f"{message}
" + + +def has_no_nan_values(df, columns): + return df[columns].notna().all(axis=1) + + +def has_nan_values(df, columns): + return df[columns].isna().any(axis=1) diff --git a/src/display/utils.py b/src/display/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..081464fe0802f2a2797c62d7654b6bfd447f69cb --- /dev/null +++ b/src/display/utils.py @@ -0,0 +1,157 @@ +from dataclasses import dataclass, make_dataclass +from enum import Enum + +import pandas as pd + +from src.display.about import Tasks + +def fields(raw_class): + return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"] + + +# These classes are for user facing column names, +# to avoid having to change them all around the code +# when a modif is needed +@dataclass +class ColumnContent: + name: str + type: str + displayed_by_default: bool + hidden: bool = False + never_hidden: bool = False + dummy: bool = False + +## Leaderboard columns +auto_eval_column_dict = [] +# Init +auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)]) +auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)]) +#Scores +auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]) +for task in Tasks: + auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]) +# Model information +auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)]) +auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)]) +auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)]) +auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)]) +auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)]) +auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)]) +auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)]) +auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)]) +auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]) +# Dummy column for the search bar (hidden by the custom CSS) +auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)]) + +# We use make dataclass to dynamically fill the scores from Tasks +AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True) + +## For the queue columns in the submission tab +@dataclass(frozen=True) +class EvalQueueColumn: # Queue column + model = ColumnContent("model", "markdown", True) + evaluation_type = ColumnContent("evaluation_type", "str", True) + private = ColumnContent("private", "bool", True) + status = ColumnContent("status", "str", True) + revision = ColumnContent("revision", "str", True) + +## All the model information that we might need +@dataclass +class ModelDetails: + name: str + display_name: str = "" + symbol: str = "" # emoji + + +class ModelType(Enum): + PT = ModelDetails(name="pretrained", symbol="🟢") + FT = ModelDetails(name="fine-tuned", symbol="🔶") + IFT = ModelDetails(name="instruction-tuned", symbol="⭕") + RL = ModelDetails(name="RL-tuned", symbol="🟦") + 
Unknown = ModelDetails(name="", symbol="?") + + def to_str(self, separator=" "): + return f"{self.value.symbol}{separator}{self.value.name}" + + @staticmethod + def from_str(type): + if "fine-tuned" in type or "🔶" in type: + return ModelType.FT + if "pretrained" in type or "🟢" in type: + return ModelType.PT + if "RL-tuned" in type or "🟦" in type: + return ModelType.RL + if "instruction-tuned" in type or "⭕" in type: + return ModelType.IFT + return ModelType.Unknown + +class WeightType(Enum): + Adapter = ModelDetails("Adapter") + Original = ModelDetails("Original") + Delta = ModelDetails("Delta") + +class Precision(Enum): + float16 = ModelDetails("float16") + bfloat16 = ModelDetails("bfloat16") + qt_8bit = ModelDetails("8bit") + qt_4bit = ModelDetails("4bit") + qt_GPTQ = ModelDetails("GPTQ") + Unknown = ModelDetails("?") + + def from_str(precision): + if precision in ["torch.float16", "float16"]: + return Precision.float16 + if precision in ["torch.bfloat16", "bfloat16"]: + return Precision.bfloat16 + if precision in ["8bit"]: + return Precision.qt_8bit + if precision in ["4bit"]: + return Precision.qt_4bit + if precision in ["GPTQ", "None"]: + return Precision.qt_GPTQ + return Precision.Unknown + +# Column selection +COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden] +TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden] +COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden] +TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden] + +EVAL_COLS = [c.name for c in fields(EvalQueueColumn)] +EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)] + +BENCHMARK_COLS = [t.value.col_name for t in Tasks] + +NUMERIC_INTERVALS = { + "?": pd.Interval(-1, 0, closed="right"), + "~1.5": pd.Interval(0, 2, closed="right"), + "~3": pd.Interval(2, 4, closed="right"), + "~7": pd.Interval(4, 9, closed="right"), + "~13": pd.Interval(9, 20, closed="right"), + "~35": pd.Interval(20, 45, closed="right"), + "~60": pd.Interval(45, 70, closed="right"), + "70+": pd.Interval(70, 10000, closed="right"), +} + +# Global variable to track current view type for leaderboard filtering +current_view_type = "all" # Default view type is "all" + +def set_view_type(view_type): + """ + Set the current view type for leaderboard filtering. + Used to determine which columns to show in the leaderboard. + + Args: + view_type: The view type to set, e.g. "all", "light_eval", etc. + """ + global current_view_type + current_view_type = view_type + +def get_view_type(): + """ + Get the current view type for leaderboard filtering. + + Returns: + The current view type. 
+ """ + return current_view_type diff --git a/src/envs.py b/src/envs.py new file mode 100644 index 0000000000000000000000000000000000000000..929128f24bfbc25863a48ff398b9b8959ce76165 --- /dev/null +++ b/src/envs.py @@ -0,0 +1,25 @@ +import os +import dotenv +import logging + +# .env dosyasını yükle +dotenv.load_dotenv() + +from huggingface_hub import HfApi + +# TOKEN is automatically set when running in Hugging Face Spaces +# For local development, use the token from the .env file +TOKEN = os.environ.get("HFTOKEN", os.environ.get("HF_TOKEN", os.environ.get("HUGGINGFACE_TOKEN", os.environ.get("TOKEN", None)))) + +# Replace these with your custom repository paths +OWNER = "newmindai" +REPO_ID = f"{OWNER}/mezura" + +CACHE_PATH = os.getenv("HF_HOME", ".") + +# Initialize API if TOKEN is available, otherwise set to None +try: + API = HfApi(token=TOKEN) +except Exception as e: + logging.warning("Could not initialize HfApi") + API = None diff --git a/src/leaderboard/read_evals.py b/src/leaderboard/read_evals.py new file mode 100644 index 0000000000000000000000000000000000000000..7d4c156f6010836fe35315f1ef62044d81348402 --- /dev/null +++ b/src/leaderboard/read_evals.py @@ -0,0 +1,258 @@ +import glob +import json +import math +import os +from dataclasses import dataclass + +import dateutil +import numpy as np + +from src.display.formatting import make_clickable_model +from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType +from src.submission.check_validity import is_model_on_hub + + +@dataclass +class EvalResult: + eval_name: str # org_model_precision (uid) + full_model: str # org/model (path on hub) + org: str + model: str + revision: str # commit hash, "" if main + results: dict + precision: Precision = Precision.Unknown + model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ... + weight_type: WeightType = WeightType.Original # Original or Adapter + architecture: str = "Unknown" + license: str = "?" + likes: int = 0 + num_params: int = 0 + date: str = "" # submission date of request file + still_on_hub: bool = False + + @classmethod + def init_from_json_file(self, json_filepath): + """Inits the result from the specific model result file""" + with open(json_filepath) as fp: + data = json.load(fp) + + config = data.get("config") + + # Eğer config None ise, boş bir sözlük ile devam et + if config is None: + print(f"Warning: Config is None in {json_filepath}") + config = {} + + # Precision + precision = Precision.from_str(config.get("model_dtype")) + + # Get model and org + org_and_model = config.get("model_name", config.get("model_args", None)) + + # Eğer org_and_model None ise, varsayılan bir değer kullan + if org_and_model is None: + print(f"Warning: org_and_model is None in {json_filepath}") + org_and_model = "unknown/unknown" + + org_and_model = org_and_model.split("/", 1) + + if len(org_and_model) == 1: + org = None + model = org_and_model[0] + result_key = f"{model}_{precision.value.name}" + else: + org = org_and_model[0] + model = org_and_model[1] + result_key = f"{org}_{model}_{precision.value.name}" + full_model = "/".join(org_and_model) + + still_on_hub, _, model_config = is_model_on_hub( + full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False + ) + architecture = "?" 
+ if model_config is not None: + architectures = getattr(model_config, "architectures", None) + if architectures: + architecture = ";".join(architectures) + + # Extract results available in this file (some results are split in several files) + results = {} + + # model_evaluation_mlflow_result.json dosyasındaki metrikleri oku + if "results" in data: + # OpenAI Scores + if "openai_scores" in data["results"]: + openai_scores = data["results"]["openai_scores"] + results["OpenAI-Scores"] = { + "accuracy": openai_scores.get("accuracy", 0), + "relevance": openai_scores.get("relevance", 0), + "coherence": openai_scores.get("coherence", 0) + } + + # BLEU Score + if "bleu" in data["results"]: + results["BLEU"] = { + "mean": data["results"]["bleu"].get("mean", 0) + } + + # ROUGE Scores + if "rouge" in data["results"]: + rouge = data["results"]["rouge"] + results["ROUGE"] = { + "rouge1_mean": rouge.get("rouge1", {}).get("mean", 0), + "rouge2_mean": rouge.get("rouge2", {}).get("mean", 0), + "rougeL_mean": rouge.get("rougeL", {}).get("mean", 0) + } + + # BERT Score + if "bert_score" in data["results"]: + bert_score = data["results"]["bert_score"] + results["BERT-Score"] = { + "precision_mean": bert_score.get("precision", {}).get("mean", 0), + "recall_mean": bert_score.get("recall", {}).get("mean", 0), + "f1_mean": bert_score.get("f1", {}).get("mean", 0) + } + + # Cosine Similarity + if "cosine_similarity_turkish" in data["results"]: + results["Cosine-Similarity"] = { + "turkish_mean": data["results"]["cosine_similarity_turkish"].get("mean", 0), + "multilingual_mean": data["results"].get("cosine_similarity_multilingual", {}).get("mean", 0) + } + + # Evaluation Metrics + if "evaluation_metrics" in data["results"]: + eval_metrics = data["results"]["evaluation_metrics"] + results["Evaluation-Metrics"] = { + "total_samples": eval_metrics.get("total_samples", 0), + "avg_input_length": eval_metrics.get("avg_input_length", 0), + "avg_prediction_length": eval_metrics.get("avg_prediction_length", 0), + "avg_reference_length": eval_metrics.get("avg_reference_length", 0) + } + + # Eski format için geriye dönük uyumluluk + task_results = {} + for task in Tasks: + task = task.value + benchmark = task.benchmark + metric = task.metric + + if benchmark in results and metric in results[benchmark]: + task_results[benchmark] = results[benchmark][metric] * 100.0 + + return self( + eval_name=result_key, + full_model=full_model, + org=org, + model=model, + results=task_results, + precision=precision, + revision= config.get("model_sha", ""), + still_on_hub=still_on_hub, + architecture=architecture + ) + + def update_with_request_file(self, requests_path): + """Finds the relevant request file for the current model and updates info with it""" + request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name) + + try: + with open(request_file, "r") as f: + request = json.load(f) + self.model_type = ModelType.from_str(request.get("model_type", "")) + self.weight_type = WeightType[request.get("weight_type", "Original")] + self.license = request.get("license", "?") + self.likes = request.get("likes", 0) + self.num_params = request.get("params", 0) + self.date = request.get("submitted_time", "") + except Exception: + print(f"Could not find request file for {self.org}/{self.model}") + + def to_dict(self): + """Converts the Eval Result to a dict compatible with our dataframe display""" + average = sum([v for v in self.results.values() if v is not None]) / len(Tasks) + data_dict = { + "eval_name": 
self.eval_name, # not a column, just a save name, + AutoEvalColumn.precision.name: self.precision.value.name, + AutoEvalColumn.model_type.name: self.model_type.value.name, + AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol, + AutoEvalColumn.weight_type.name: self.weight_type.value.name, + AutoEvalColumn.architecture.name: self.architecture, + AutoEvalColumn.model.name: make_clickable_model(self.full_model), + AutoEvalColumn.dummy.name: self.full_model, + AutoEvalColumn.revision.name: self.revision, + AutoEvalColumn.average.name: average, + AutoEvalColumn.license.name: self.license, + AutoEvalColumn.likes.name: self.likes, + AutoEvalColumn.params.name: self.num_params, + AutoEvalColumn.still_on_hub.name: self.still_on_hub, + } + + for task in Tasks: + data_dict[task.value.col_name] = self.results[task.value.benchmark] + + return data_dict + + +def get_request_file_for_model(requests_path, model_name, precision): + """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED""" + request_files = os.path.join( + requests_path, + f"{model_name}_eval_request_*.json", + ) + request_files = glob.glob(request_files) + + # Select correct request file (precision) + request_file = "" + request_files = sorted(request_files, reverse=True) + for tmp_request_file in request_files: + with open(tmp_request_file, "r") as f: + req_content = json.load(f) + if ( + req_content["status"] in ["FINISHED"] + and req_content["precision"] == precision.split(".")[-1] + ): + request_file = tmp_request_file + return request_file + + +def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]: + """From the path of the results folder root, extract all needed info for results""" + model_result_filepaths = [] + + for root, _, files in os.walk(results_path): + # We should only have json files in model results + if len(files) == 0 or any([not f.endswith(".json") for f in files]): + continue + + # Sort the files by date + try: + files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7]) + except dateutil.parser._parser.ParserError: + files = [files[-1]] + + for file in files: + model_result_filepaths.append(os.path.join(root, file)) + + eval_results = {} + for model_result_filepath in model_result_filepaths: + # Creation of result + eval_result = EvalResult.init_from_json_file(model_result_filepath) + eval_result.update_with_request_file(requests_path) + + # Store results of same eval together + eval_name = eval_result.eval_name + if eval_name in eval_results.keys(): + eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None}) + else: + eval_results[eval_name] = eval_result + + results = [] + for v in eval_results.values(): + try: + v.to_dict() # we test if the dict version is complete + results.append(v) + except KeyError: # not all eval values present + continue + + return results diff --git a/src/populate.py b/src/populate.py new file mode 100644 index 0000000000000000000000000000000000000000..4649d6111e3d15cdf6a7538d8fe66fd65ef228d4 --- /dev/null +++ b/src/populate.py @@ -0,0 +1,71 @@ +import json +import os + +import pandas as pd + +from src.display.formatting import has_no_nan_values, make_clickable_model +from src.display.utils import AutoEvalColumn, EvalQueueColumn +from src.leaderboard.read_evals import get_raw_eval_results + + +def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame: + raw_data = 
get_raw_eval_results(results_path, requests_path) + all_data_json = [v.to_dict() for v in raw_data] + + df = pd.DataFrame.from_records(all_data_json) + df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False) + df = df[cols].round(decimals=2) + + # filter out if any of the benchmarks have not been produced + df = df[has_no_nan_values(df, benchmark_cols)] + return raw_data, df + + +def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]: + try: + entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")] + all_evals = [] + + for entry in entries: + try: + if ".json" in entry: + file_path = os.path.join(save_path, entry) + with open(file_path) as fp: + data = json.load(fp) + + data[EvalQueueColumn.model.name] = make_clickable_model(data["model"]) + data[EvalQueueColumn.revision.name] = data.get("revision", "main") + + all_evals.append(data) + elif ".md" not in entry: + # this is a folder + try: + sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")] + for sub_entry in sub_entries: + try: + file_path = os.path.join(save_path, entry, sub_entry) + with open(file_path) as fp: + data = json.load(fp) + + data[EvalQueueColumn.model.name] = make_clickable_model(data["model"]) + data[EvalQueueColumn.revision.name] = data.get("revision", "main") + all_evals.append(data) + except Exception as e: + print(f"Warning: Could not process file {sub_entry}: {e}") + except Exception as e: + print(f"Warning: Could not process directory {entry}: {e}") + except Exception as e: + print(f"Warning: Could not process entry {entry}: {e}") + + pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]] + running_list = [e for e in all_evals if e["status"] == "RUNNING"] + finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"] + df_pending = pd.DataFrame.from_records(pending_list, columns=cols) + df_running = pd.DataFrame.from_records(running_list, columns=cols) + df_finished = pd.DataFrame.from_records(finished_list, columns=cols) + return df_finished[cols], df_running[cols], df_pending[cols] + except Exception as e: + print(f"Warning: Could not process evaluation queue: {e}") + # Boş veri çerçeveleri döndür + df_empty = pd.DataFrame(columns=cols) + return df_empty, df_empty, df_empty diff --git a/src/submission/check_model_type.py b/src/submission/check_model_type.py new file mode 100644 index 0000000000000000000000000000000000000000..fe0ca5b9cb081955c567fd26c1a75ecb39b02f9e --- /dev/null +++ b/src/submission/check_model_type.py @@ -0,0 +1,348 @@ +""" +Module for checking model type and extracting model information. +""" +import os +import json +from typing import Dict, Any, Tuple, List, Optional +import logging + +from huggingface_hub import HfApi, ModelCard +from huggingface_hub.hf_api import ModelInfo + +# Set up logger +logger = logging.getLogger(__name__) + +class ModelTypeChecker: + """ + Class for determining model type and extracting information. + """ + + def __init__(self, token: Optional[str] = None): + """ + Initialize the model type checker. + + Args: + token: HF API token (optional) + """ + self.api = HfApi(token=token) + + def get_model_info(self, repo_id: str) -> Tuple[dict, str]: + """ + Get model information from Hugging Face Hub. 
+ + Args: + repo_id: Model repository ID + + Returns: + Tuple[dict, str]: A tuple containing: + - model_info: Dictionary with model information + - model_type: Detected model type + """ + model_type = "unknown" + model_info = {} + + try: + # Get basic model info + info = self.api.model_info(repo_id=repo_id) + model_info["id"] = info.modelId + model_info["author"] = info.author + model_info["downloads"] = info.downloads + model_info["likes"] = info.likes + + # Check file structure + files = self.api.list_repo_files(repo_id=repo_id) + files_set = set(files) + + # Look for adapter configuration + if "adapter_config.json" in files_set: + model_type = "adapter" + # Read adapter_config.json to extract base model information + try: + adapter_config_url = f"https://huggingface.co/{repo_id}/raw/main/adapter_config.json" + import requests + response = requests.get(adapter_config_url) + if response.status_code == 200: + adapter_config = response.json() + # Check different possible keys for base model info in adapter config + base_model = adapter_config.get("base_model_name_or_path") or \ + adapter_config.get("base_model_name") or \ + adapter_config.get("model_name") or \ + adapter_config.get("base_model") or \ + adapter_config.get("_name_or_path") + if base_model: + model_info["base_model"] = base_model + logger.info(f"Found base_model in adapter_config.json: {base_model}") + except Exception as e: + logger.warning(f"Could not load adapter_config.json for {repo_id}: {e}") + + # Check for merge configuration + elif "config.json" in files_set: + model_type = "merged_model" + # Try to read merge configuration to extract base models + merge_file = "config.json" + try: + merge_config_url = f"https://huggingface.co/{repo_id}/raw/main/{merge_file}" + import requests + response = requests.get(merge_config_url) + if response.status_code == 200: + merge_config = response.json() + # Extract base models from merge config + if "models" in merge_config: + base_models = [model.get("model") for model in merge_config["models"] if "model" in model] + if base_models: + model_info["base_models"] = base_models + # Use the first model as the primary base model + model_info["base_model"] = base_models[0] + except Exception as e: + logger.warning(f"Could not load {merge_file} for {repo_id}: {e}") + + # Check for typical model files + elif "pytorch_model.bin" in files_set or any(f.endswith(".safetensors") for f in files_set): + # If model type is still unknown, default to fine-tune + if model_type == "unknown": + model_type = "fine-tune" + + # Check for config.json to extract architecture and base model info + if "config.json" in files_set: + try: + config_url = f"https://huggingface.co/{repo_id}/raw/main/config.json" + import requests + response = requests.get(config_url) + if response.status_code == 200: + config = response.json() + model_info["architectures"] = config.get("architectures", []) + + # Check if there's a base model reference + base_model = config.get("_name_or_path") + if base_model and "base_model" not in model_info: + model_info["base_model"] = base_model + except Exception as e: + logger.warning(f"Could not load config.json for {repo_id}: {e}") + + return model_info, model_type + + except Exception as e: + logger.error(f"Error getting model info for {repo_id}: {e}") + return {"error": str(e)}, "error" + + def analyze_model(self, repo_id: str) -> dict: + """ + Analyze model type and extract structured information. 
+ + Args: + repo_id: Model repository ID + + Returns: + dict: Structured model information + """ + model_info, model_type = self.get_model_info(repo_id) + + result = { + "repo_id": repo_id, + "model_type": model_type, + "info": model_info + } + + # Add additional insights based on model type + if model_type == "adapter": + result["insights"] = "This is an adapter model. It requires a base model to run." + elif model_type == "merged_model": + result["insights"] = "This is a merged model, created by combining multiple models." + elif model_type == "fine-tune": + result["insights"] = "This is a fine-tuned model, based on a pre-trained model." + elif model_type == "base": + result["insights"] = "This is a base/foundation model." + + return result + + def validate_base_model(self, repo_id: str, expected_base_model: str) -> Tuple[bool, str]: + """ + Validate that the model's base model matches the expected base model. + + Args: + repo_id: Model repository ID + expected_base_model: The expected base model ID + + Returns: + Tuple[bool, str]: A tuple containing: + - is_valid: Whether the base model matches + - message: Validation message or error + """ + try: + # First get model info + model_info, model_type = self.get_model_info(repo_id) + + # Check for error in model info + if "error" in model_info: + return False, f"Could not validate model: {model_info['error']}" + + # Get base model from model info + actual_base_model = None + + # Check if base_model is already extracted + if "base_model" in model_info: + actual_base_model = model_info["base_model"] + logger.info(f"Found base_model in model_info: {actual_base_model}") + + # If no base model found, log the issue + if not actual_base_model: + logger.warning(f"No base model detected for {repo_id}") + if model_type == "adapter": + return False, f"Could not detect base model in adapter configuration" + else: + return False, f"Base model not specified in model configuration" + + # Handle case where actual_base_model is a list + if isinstance(actual_base_model, list): + if len(actual_base_model) > 0: + actual_base_model = actual_base_model[0] + logger.info(f"Using first base model from list: {actual_base_model}") + else: + return False, f"Base model list is empty" + + # Normalize both model names for comparison + def normalize_model_name(name): + # Convert to lowercase and strip whitespace + name = str(name).lower().strip() + + # Handle organization prefix + parts = name.split("/") + model_name = parts[-1] + org_name = parts[0] if len(parts) > 1 else "" + + # Special handling for known organizations + if "meta-llama" in org_name or "llama" in org_name: + # Different variations of Meta-Llama org names should be treated the same + org_name = "meta-llama" + + # Standardize version format: replace '.' 
with '-' + # Example: llama-3.1-8b-instruct -> llama-3-1-8b-instruct + model_name = model_name.replace(".", "-") + + # Standardize Meta-Llama vs Llama naming + model_name = model_name.replace("meta-llama-", "llama-") + model_name = model_name.replace("metallama-", "llama-") + model_name = model_name.replace("meta-llama", "llama") + + # Replace multiple hyphens with single hyphen + while "--" in model_name: + model_name = model_name.replace("--", "-") + + # For comparison purposes, just return model name without organization + return model_name + + norm_actual = normalize_model_name(actual_base_model) + norm_expected = normalize_model_name(expected_base_model) + + logger.info(f"Comparing normalized model names: '{norm_actual}' vs '{norm_expected}'") + + # Check for exact normalized match + if norm_actual == norm_expected: + return True, f"Base model validation successful (exact match)" + + # Check for one being a subset of the other (partial match) + if norm_actual in norm_expected or norm_expected in norm_actual: + return True, f"Base model validation successful (partial match)" + + # Allow for specific version differences + # Extract model family and size + # Example: llama-3-1-8b-instruct -> llama, 3-1, 8b, instruct + def extract_model_parts(name): + parts = name.split("-") + family = parts[0] if parts else "" + + # Enhanced version extraction + version_parts = [] + for p in parts[1:]: + # Check if this part is a version number (digits with possible hyphens) + if p.replace("-", "").isdigit() or (len(p) > 0 and p[0].isdigit()): + version_parts.append(p) + # Stop collecting version parts once we hit a non-version part + else: + break + + version = "-".join(version_parts) + + # Handle common model size formats + size = next((p for p in parts if p.endswith("b") or p.endswith("B")), "") + + # Convert size to lowercase standard + if size: + size = size.lower() + + # Get variant (like "instruct", "chat", etc.) 
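+                # Worked example for "llama-3-1-8b-instruct": family="llama",
+                # version_parts=["3", "1", "8b"] (the size token also starts with a digit, so it is
+                # swept into the version), size="8b", and the leftover "instruct" becomes the variant.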
+ variant_parts = [] + for p in parts: + if p not in [family] and p != size and p not in version_parts: + if not p.replace("-", "").isdigit(): + variant_parts.append(p) + + variant = "-".join(variant_parts) + + return family, version, size, variant + + actual_family, actual_version, actual_size, actual_variant = extract_model_parts(norm_actual) + expected_family, expected_version, expected_size, expected_variant = extract_model_parts(norm_expected) + + logger.info(f"Extracted model parts - Actual: family={actual_family}, version={actual_version}, size={actual_size}, variant={actual_variant}") + logger.info(f"Extracted model parts - Expected: family={expected_family}, version={expected_version}, size={expected_size}, variant={expected_variant}") + + # Make comparisons less strict for known model families + if actual_family == expected_family: + # For all models, check if it's the same family and has the same size + if actual_size == expected_size: + return True, f"Base model validation successful ({actual_family} family match with same size)" + # If sizes don't match but it's the same family, still accept it + return True, f"Base model validation successful ({actual_family} family match)" + + # If same family, size, and similar variant (e.g., both include "instruct") + if (actual_family == expected_family and + actual_size == expected_size and + (not actual_variant or not expected_variant or + actual_variant in expected_variant or expected_variant in actual_variant or + "instruct" in actual_variant and "instruct" in expected_variant)): + return True, f"Base model validation successful (family and size match)" + + # Log what didn't match for debugging + logger.warning(f"Base model validation failed: {actual_base_model} vs {expected_base_model}") + logger.warning(f"Normalized: {norm_actual} vs {norm_expected}") + logger.warning(f"Family: {actual_family} vs {expected_family}") + logger.warning(f"Size: {actual_size} vs {expected_size}") + logger.warning(f"Variant: {actual_variant} vs {expected_variant}") + + return False, f"Base model validation failed: expected {expected_base_model}, got {actual_base_model}" + + except Exception as e: + logger.error(f"Error validating base model: {e}") + return False, f"Base model validation error: {str(e)}" + +def check_model_type(repo_id: str, token: Optional[str] = None) -> dict: + """ + Check model type and extract information. + + Args: + repo_id: Model repository ID + token: HF API token (optional) + + Returns: + dict: Model information and type + """ + checker = ModelTypeChecker(token=token) + return checker.analyze_model(repo_id) + +def validate_base_model(repo_id: str, expected_base_model: str, token: Optional[str] = None) -> Tuple[bool, str]: + """ + Validate that a model's base model matches the expected base model. 
+ + Args: + repo_id: Model repository ID + expected_base_model: The expected base model ID + token: HF API token (optional) + + Returns: + Tuple[bool, str]: A tuple containing: + - is_valid: Whether the base model matches + - message: Validation message or error + """ + checker = ModelTypeChecker(token=token) + return checker.validate_base_model(repo_id, expected_base_model) \ No newline at end of file diff --git a/src/submission/check_validity.py b/src/submission/check_validity.py new file mode 100644 index 0000000000000000000000000000000000000000..6f305da508bd810ce07b3d32c7c60d88423f7f21 --- /dev/null +++ b/src/submission/check_validity.py @@ -0,0 +1,275 @@ +import json +import os +import re +from collections import defaultdict +from datetime import datetime, timedelta, timezone +import logging + +import huggingface_hub +from huggingface_hub import ModelCard, HfApi, hf_hub_download +from huggingface_hub.hf_api import ModelInfo +from transformers import AutoConfig +from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) # Only show INFO and above, hide DEBUG messages + +def check_model_card(repo_id: str) -> tuple[bool, str]: + """Checks if the model card and license exist and have been filled""" + try: + card = ModelCard.load(repo_id) + except huggingface_hub.utils.EntryNotFoundError: + return False, "Please add a model card to your model to explain how you trained/fine-tuned it." + + # Enforce license metadata + if card.data.license is None: + if not ("license_name" in card.data and "license_link" in card.data): + return False, ( + "License not found. Please add a license to your model card using the `license` metadata or a" + " `license_name`/`license_link` pair." + ) + + # Enforce card content + if len(card.text) < 200: + return False, "Please add a description to your model card, it is too short." + + return True, "" + + +def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]: + """Makes sure the model is on the hub, and uses a valid configuration (in the latest transformers version)""" + try: + config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token) + if test_tokenizer: + tokenizer_config = get_tokenizer_config(model_name) + if tokenizer_config is not None: + tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None) + else: + tokenizer_class_candidate = config.tokenizer_class + + + tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate) + if tokenizer_class is None: + return ( + False, + f"uses {tokenizer_class_candidate}, which is not in a transformers release, therefore not supported at the moment.", + None + ) + return True, None, config + + except ValueError: + return ( + False, + "needs to be launched with `trust_remote_code=True`. 
For safety reason, we do not allow these models to be automatically submitted to the leaderboard.", + None + ) + + except Exception as e: + return False, "was not found on hub!", None + + +def get_model_size(model_info: ModelInfo, precision: str): + """Gets the model size from the configuration, or the model name if the configuration does not contain the information.""" + try: + model_size = round(model_info.safetensors["total"] / 1e9, 3) + except (AttributeError, TypeError): + return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py + + size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1 + model_size = size_factor * model_size + return model_size + +def get_model_arch(model_info: ModelInfo): + """Gets the model architecture from the configuration""" + return model_info.config.get("architectures", "Unknown") + +def already_submitted_models(requested_models_dir: str) -> set[str]: + depth = 1 + file_names = [] + users_to_submission_dates = defaultdict(list) + + for root, _, files in os.walk(requested_models_dir): + current_depth = root.count(os.sep) - requested_models_dir.count(os.sep) + if current_depth == depth: + for file in files: + if not file.endswith(".json"): + continue + with open(os.path.join(root, file), "r") as f: + info = json.load(f) + file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}") + + # Select organisation + if info["model"].count("/") == 0 or "submitted_time" not in info: + continue + organisation, _ = info["model"].split("/") + users_to_submission_dates[organisation].append(info["submitted_time"]) + + return set(file_names), users_to_submission_dates + +def check_adapter_config_and_base_model(model_name: str, base_model: str, token: str = None) -> tuple[bool, str]: + """ + Checks if the model exists on HuggingFace and is accessible. + + Args: + model_name: Name of the model to check + base_model: Expected base model name (not used anymore) + token: HuggingFace API token (optional) + + Returns: + Tuple[bool, str]: A tuple containing: + - is_valid: Whether the model exists and is accessible + - error_message: Error message if the model is invalid + """ + try: + # Check if model exists on HuggingFace + # First try without token (for public models) + try: + # Try to access public model without token + api_public = HfApi() + model_info = api_public.model_info(repo_id=model_name) + logger.debug(f"Successfully accessed model {model_name}") + return True, None + except Exception as e: + logger.debug(f"Could not access model without token") + # If that fails, try with token (for private models) + if token: + try: + api_with_token = HfApi(token=token) + model_info = api_with_token.model_info(repo_id=model_name) + logger.debug(f"Successfully accessed model {model_name} with authentication") + return True, None + except Exception as e: + return False, f"Model {model_name} not found or not accessible: {str(e)}" + else: + return False, f"Model {model_name} not found or not accessible: {str(e)}" + except Exception as e: + return False, f"Error validating model: {str(e)}" + +def has_adapter_config(model_name: str, token: str = None) -> tuple[bool, str]: + """ + Checks if the model repository contains adapter configuration files. 
+ + Args: + model_name: Name of the model to check + token: HuggingFace API token (optional) + + Returns: + Tuple[bool, str]: A tuple containing: + - has_adapter: Whether the model contains adapter configuration + - message: Additional information or error message + """ + try: + # Initialize API with or without token + api = HfApi(token=token) if token else HfApi() + + # Get the list of files in the repository + repo_files = api.list_repo_files(repo_id=model_name) + + # Check for specific adapter configuration files + adapter_files = [ + "adapter_config.json", + "adapter_model.bin", + "adapter_model.safetensors", + "adapter.json", + "adapter.safetensors", + "adapter.bin" + ] + + # Look for specific adapter files + found_adapter_files = [] + for file in repo_files: + file_lower = file.lower() + if any(adapter_file.lower() in file_lower for adapter_file in adapter_files): + found_adapter_files.append(file) + + # Check if we found adapter configuration + has_adapter = len(found_adapter_files) > 0 + + if has_adapter: + adapter_files_str = ", ".join(found_adapter_files) + return True, f"Found adapter configuration: {adapter_files_str}" + else: + return False, "No adapter configuration found" + + except Exception as e: + return False, f"Error checking for adapter configuration: {str(e)}" + +def has_safetensor_model(model_name: str, token: str = None) -> tuple[bool, str]: + """ + Checks if the model repository contains safetensor model files. + + Args: + model_name: Name of the model to check + token: HuggingFace API token (optional) + + Returns: + Tuple[bool, str]: A tuple containing: + - has_safetensor: Whether the model contains safetensor model files + - message: Additional information or error message + """ + try: + # Initialize API with or without token + api = HfApi(token=token) if token else HfApi() + + # Get the list of files in the repository + repo_files = api.list_repo_files(repo_id=model_name) + + # Look for safetensor model files (model_*.safetensors) + safetensor_files = [] + model_pattern = "model_" + safetensor_extension = ".safetensors" + + for file in repo_files: + file_lower = file.lower() + if model_pattern in file_lower and file_lower.endswith(safetensor_extension): + safetensor_files.append(file) + + # Check if we found any safetensor model files + has_safetensor = len(safetensor_files) > 0 + + if has_safetensor: + safetensor_files_str = ", ".join(safetensor_files) + return True, f"Found safetensor model files: {safetensor_files_str}" + else: + # If no model_*.safetensors files, check for any .safetensors files + any_safetensor_files = [file for file in repo_files if file.lower().endswith(safetensor_extension)] + + if any_safetensor_files: + safetensor_files_str = ", ".join(any_safetensor_files) + return True, f"Found safetensor files: {safetensor_files_str}" + else: + return False, "No safetensor model files found" + + except Exception as e: + return False, f"Error checking for safetensor model files: {str(e)}" + +def determine_model_type(model_name: str, token: str = None) -> tuple[str, str]: + """ + Determines the type of model based on the files in the repository. 
+ + Args: + model_name: Name of the model to check + token: HuggingFace API token (optional) + + Returns: + Tuple[str, str]: A tuple containing: + - model_type: Type of model (adapter, merged_model, unknown) + - message: Additional information or details + """ + try: + # Check for adapter configuration + has_adapter, adapter_message = has_adapter_config(model_name, token) + + # Check for safetensor model files + has_safetensor, safetensor_message = has_safetensor_model(model_name, token) + + # Determine model type based on checks + if has_adapter: + return "adapter", adapter_message + elif has_safetensor: + return "merged_model", safetensor_message + else: + return "unknown", "Could not determine model type: no adapter config or safetensor model files found" + + except Exception as e: + return "unknown", f"Error determining model type: {str(e)}" diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bfd037f180aa3152b34a11ad2c4a984539e99aca --- /dev/null +++ b/src/utils.py @@ -0,0 +1,1305 @@ +import pandas as pd +import json +import os +import glob +import gradio as gr +import traceback +import re +import plotly.express as px +import plotly.graph_objects as go +from src.envs import API, TOKEN, REPO_ID +import requests +import logging +from datetime import datetime +from dotenv import load_dotenv + +# Logger setup +logger = logging.getLogger("mezura.utils") + +# Setup a dedicated logger for tracking model submissions +submission_logger = logging.getLogger("mezura.submissions") +submission_handler = logging.FileHandler("submissions.log") +submission_formatter = logging.Formatter('%(asctime)s - %(message)s') +submission_handler.setFormatter(submission_formatter) +submission_logger.addHandler(submission_handler) +submission_logger.setLevel(logging.INFO) + +# Model metadata lookup table - centralized for all benchmark functions +MODEL_METADATA_LOOKUP = { + "mistralai/Magistral-Small-2506": {"license": "Apache 2.0", "dtype": "bfloat16"}, + "newmindai/Qwen2.5-72B-Instruct": {"license": "Qwen", "dtype": "bfloat16"}, + "Qwen/Qwen2.5-72B-Instruct": {"license": "Qwen", "dtype": "bfloat16"}, + "deepseek-ai/DeepSeek-R1": {"license": "MIT", "dtype": "bfloat16"}, + "Qwen/Qwen3-32B": {"license": "Qwen", "dtype": "bfloat16"}, + "newmindai/QwQ-32B-r1": {"license": "Apache 2.0", "dtype": "bfloat16"}, + "google/gemma-3-27b-it": {"license": "Gemma", "dtype": "bfloat16"}, + "Qwen/Qwen3-14B": {"license": "Apache 2.0", "dtype": "bfloat16"}, + "newmindai/Llama-3.3-70b-Instruct": {"license": "Llama-3.3", "dtype": "bfloat16"}, + "Qwen/QwQ-32B": {"license": "Apache 2.0", "dtype": "bfloat16"}, + "microsoft/phi-4": {"license": "MIT", "dtype": "bfloat16"}, + "meta-llama/Meta-Llama-3.1-70B-Instruct": {"license": "Llama 3.1", "dtype": "bfloat16"}, + "grok-3": {"license": "Proprietary", "dtype": "Unknown"}, + "grok-3-mini-fast": {"license": "Proprietary", "dtype": "Unknown"}, + "meta-llama/Llama-3.3-70B-Instruct": {"license": "Llama-3.3", "dtype": "bfloat16"}, + "meta-llama/Llama-3.3-70b-Instruct": {"license": "Llama 3.3", "dtype": "bfloat16"}, # lowercase b variant + "newmindai/Qwen2.5-72b-Instruct": {"license": "Qwen", "dtype": "bfloat16"}, # lowercase b variant + "grok-3-mini-fast-beta": {"license": "Proprietary", "dtype": "Unknown"}, # beta variant + # Legacy entries for backward compatibility + "deepseek-r1-distill-llama-70b": {"license": "MIT", "dtype": "bfloat16"}, + "qwen-qwq-32b": {"license": "Apache 2.0", "dtype": "bfloat16"} +} + +def 
log_model_submission(repo_id, base_model): + """ + Logs model submission details to a dedicated log file + + Args: + repo_id: The repository ID of the model + base_model: The base model used + """ + submission_logger.info(f"SUBMISSION - REPO_ID: {repo_id}, BASE_MODEL: {base_model}") + +def restart_space(): + try: + if API is not None: + API.restart_space(repo_id=REPO_ID, token=TOKEN) + else: + print("Warning: API is None, cannot restart space") + except Exception as e: + print(f"Warning: Could not restart space: {e}") + + +def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame: + # Always include model and model_type_symbol columns + selected_columns = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + + # Add selected columns + for column in columns: + if column in df.columns: + selected_columns.append(column) + + # Add dummy column for search + selected_columns.append(AutoEvalColumn.dummy.name) + + return df[selected_columns] + + +def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame: + if not query: + return filtered_df + + # Split query by ; and filter for each part + queries = query.split(";") + filtered_dfs = [] + + for q in queries: + q = q.strip() + if not q: + continue + filtered_dfs.append(filtered_df[filtered_df[AutoEvalColumn.dummy.name].str.contains(q, case=False)]) + + if not filtered_dfs: + return filtered_df + + # Combine all filtered dataframes + return pd.concat(filtered_dfs).drop_duplicates() + + +def filter_models( + df: pd.DataFrame +) -> pd.DataFrame: + # Show all models + filtered_df = df.copy() + + # Always filter out deleted models + filtered_df = filtered_df[filtered_df[AutoEvalColumn.still_on_hub.name]] + + return filtered_df + + +# Yeni fonksiyonlar +def load_benchmark_results(): + """ + Load benchmark results from local files + """ + results = { + "avg": { + "evalmix": [], + "light_eval": [], + "snake": [], + "retrieval": [], + "arena": [], + "human_arena": [] + }, + "raw": { + "evalmix": [], + "light_eval": [], + "snake": [], + "retrieval": [], + "arena": [], + "human_arena": [] + } + } + + # Define benchmark types to look for + benchmark_types = ["evalmix", "light_eval", "snake", "retrieval", "arena", "human_arena"] # "lm_harness" removed + + # Load raw JSON files (detailed results) + for benchmark_type in benchmark_types: + dir_path = f"result/{benchmark_type}" + # if benchmark_type == "lm_harness" and not os.path.exists(dir_path): + # dir_path = "result/lmharness" + + # Skip if directory doesn't exist + if not os.path.exists(dir_path): + continue + + # Load avg files for leaderboard + avg_files = glob.glob(f"{dir_path}/avg_*.json") + + for file in avg_files: + try: + with open(file, "r") as f: + data = json.load(f) + + # Handle different data formats + if isinstance(data, list): + # If data is a list, convert it to a dictionary + if benchmark_type == "arena" and len(data) > 0: + # For arena, create a dictionary with model_name + processed_data = { + "model_name": f"Model {os.path.basename(file).replace('avg_', '').split('.')[0]}", + "file": os.path.basename(file) + } + + # Add metrics from the list if available + if len(data) > 0: + for i, item in enumerate(data): + if isinstance(item, dict): + for key, value in item.items(): + processed_data[f"item_{i}_{key}"] = value + + data = processed_data + else: + # For other types, create a dictionary with model_name + data = {"model_name": f"Model {os.path.basename(file).replace('avg_', '').split('.')[0]}"} + else: + # Ensure data is a dictionary + if not 
isinstance(data, dict): + data = {"model_name": f"Model {os.path.basename(file).replace('avg_', '').split('.')[0]}"} + + # Add file information + data["file"] = os.path.basename(file) + + # Ensure model_name exists + if "model_name" not in data or not data["model_name"]: + # Extract model ID from filename + file_name = os.path.basename(file) + model_id = file_name.replace("avg_", "").split(".")[0] + data["model_name"] = f"Model {model_id}" + + # Format the model name nicely for display + if "model_name" in data: + data["model_name"] = format_model_name(data["model_name"]) + + results["avg"][benchmark_type].append(data) + except Exception as e: + print(f"Error loading {benchmark_type} avg file: {file} - {e}") + + # Load detail files for pipeline-specific views + detail_files = glob.glob(f"{dir_path}/detail_*.json") + + for file in detail_files: + try: + with open(file, "r") as f: + data = json.load(f) + + # Handle different data formats + if isinstance(data, list): + # If data is a list, convert it to a dictionary + if benchmark_type == "arena" and len(data) > 0: + # For arena, create a dictionary with model_name + processed_data = { + "model_name": f"Model {os.path.basename(file).replace('detail_', '').split('.')[0]}", + "file": os.path.basename(file) + } + + # Add metrics from the list if available + if len(data) > 0: + for i, item in enumerate(data): + if isinstance(item, dict): + for key, value in item.items(): + processed_data[f"item_{i}_{key}"] = value + + data = processed_data + else: + # For other types, create a dictionary with model_name + data = {"model_name": f"Model {os.path.basename(file).replace('detail_', '').split('.')[0]}"} + else: + # Ensure data is a dictionary + if not isinstance(data, dict): + data = {"model_name": f"Model {os.path.basename(file).replace('detail_', '').split('.')[0]}"} + + # Add file information + data["file"] = os.path.basename(file) + + # Ensure model_name exists + if "model_name" not in data or not data["model_name"]: + # Extract model ID from filename + file_name = os.path.basename(file) + model_id = file_name.replace("detail_", "").split(".")[0] + data["model_name"] = f"Model {model_id}" + + # Format the model name nicely for display + if "model_name" in data: + data["model_name"] = format_model_name(data["model_name"]) + + results["raw"][benchmark_type].append(data) + + # Also add to default results to ensure we have all models in the leaderboard + # This ensures that models from detail files are also shown in the leaderboard + # Create a simplified version with just the model name and basic metrics + simplified_data = {"model_name": data["model_name"], "file": data["file"]} + + # Extract key metrics based on benchmark type + if benchmark_type == "retrieval": + # For RAG Judge, extract RAG_success_rate and average_judge_score if available + if "RAG_success_rate" in data: + simplified_data["RAG_success_rate"] = data["RAG_success_rate"] + if "average_judge_score" in data: + simplified_data["average_judge_score"] = data["average_judge_score"] + + # Add to default results if not already present + if not any(item.get("model_name") == data["model_name"] for item in results["avg"][benchmark_type]): + results["avg"][benchmark_type].append(simplified_data) + except Exception as e: + print(f"Error loading {benchmark_type} detail file: {file} - {e}") + + return results + +def format_model_name(model_name): + """ + Formats model names for better display in leaderboards: + - Replaces underscores with spaces + - Preserves original casing + + Args: + 
model_name: Original model name string + + Returns: + str: Formatted model name + """ + if not model_name: + return model_name + + # Split model name by organization/model if present + if "/" in model_name: + org, name = model_name.split("/", 1) + # Format the model part only - replace underscores with spaces but preserve casing + formatted_name = name.replace("_", " ") + return f"{org}/{formatted_name}" + else: + # Format the whole name - replace underscores with spaces but preserve casing + return model_name.replace("_", " ") + +def create_evalmix_table(data): + """ + Hybrid benchmark sonuçlarından tablo oluşturur + """ + if not data: + return pd.DataFrame() + + # Apply model name formatting and add metadata from lookup table + for item in data: + if "model_name" in item: + raw_model_name = item["model_name"] + item["model_name"] = format_model_name(raw_model_name) + + # Always use lookup table values for metadata (override JSON values) + for field in ["dtype", "license"]: + if raw_model_name in MODEL_METADATA_LOOKUP: + item[field] = MODEL_METADATA_LOOKUP[raw_model_name][field] + else: + defaults = {"dtype": "unknown", "license": "Unknown"} + item[field] = defaults[field] + + df = pd.DataFrame(data) + + # Remove the file column if present + if 'file' in df.columns: + df = df.drop(columns=['file']) + + # Remove all sample count columns + sample_columns = ["total_samples", "Total Samples", "samples_number"] + for col in sample_columns: + if col in df.columns: + df = df.drop(columns=[col]) + + if "model_name" in df.columns: + df = df.sort_values(by="model_name") + + # Ortalama metrik ekle - now handling the case when judge_metric is not available + if all(col in df.columns for col in ["lexical_metric", "semantic_metric"]): + if "judge_metric" in df.columns: + df["average_score"] = df[["lexical_metric", "semantic_metric", "judge_metric"]].mean(axis=1).round(2) + else: + df["average_score"] = df[["lexical_metric", "semantic_metric"]].mean(axis=1).round(2) + + # Float değerleri 2 ondalık basamağa yuvarla + for column in df.columns: + try: + if pd.api.types.is_float_dtype(df[column]): + df[column] = df[column].round(2) + except: + continue + + # Format column names for better display + column_mapping = {} + for col in df.columns: + # Skip already well-formatted columns + if col == "model_name": + column_mapping[col] = "Model Name" + continue + + # Special handling for Turkish and Multilingual Semantic + if "turkish_semantic" in col.lower(): + column_mapping[col] = "Turkish Semantic" + continue + + if "multilingual_semantic" in col.lower(): + column_mapping[col] = "Multilingual Semantic" + continue + + # Special handling for certain columns + if col == "average_score": + column_mapping[col] = "Average Score" + continue + if col == "lexical_metric": + column_mapping[col] = "Lexical Score" + continue + if col == "semantic_metric": + column_mapping[col] = "Semantic Score" + continue + if col == "judge_metric": + column_mapping[col] = "Judge Score" + continue + if col == "openai_accuracy": + column_mapping[col] = "OpenAI Accuracy" + continue + if col == "dtype": + column_mapping[col] = "Dtype" + continue + if col == "license": + column_mapping[col] = "License" + continue + + # Format column name + formatted_col = " ".join([word.capitalize() for word in col.replace("_", " ").split()]) + column_mapping[col] = formatted_col + + # Rename DataFrame columns + df = df.rename(columns=column_mapping) + + # Sort by openai_accuracy if present, otherwise use Average Score + if "Turkish Semantic" in df.columns: 
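+    # In practice the EvalMix table is ordered by the "Turkish Semantic" score (or the raw
+    # "turkish_semantic" column) in descending order whenever that metric is available.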
+ df = df.sort_values(by="Turkish Semantic", ascending=False) + elif "turkish_semantic" in df.columns: + df = df.sort_values(by="turkish_semantic", ascending=False) + + # Define desired column order for EvalMix - metadata columns at the end + desired_cols = [ + "Model Name", + "Turkish Semantic", + "Multilingual Semantic", + "Average Score", + "Lexical Score", + "Semantic Score", + "Judge Score", + "OpenAI Accuracy", + "Dtype", + "License" + ] + + # Filter out columns that don't exist in the DataFrame + final_cols = [col for col in desired_cols if col in df.columns] + + # Add any remaining columns that weren't in the desired list + remaining_cols = [col for col in df.columns if col not in final_cols] + final_cols.extend(remaining_cols) + + # Set the new column order + df = df[final_cols] + + return df + +def create_light_eval_table(data, is_detail=False): + """ + Creates a table from Light Eval results + + Args: + data: Light eval data + is_detail: If True, keep 4 decimal places for detail results + """ + if not data: + return pd.DataFrame() + + # Light eval sonuçları farklı formatta, düzenleme gerekiyor + formatted_data = [] + for item in data: + model_data = {"model_name": format_model_name(item.get("model_name", "Bilinmeyen Model"))} + + # Add specific metrics we're interested in + metrics = [ + "overall_average", + "mmlu_average", + "truthfulqa", + "winogrande", + "hellaswag", + "gsm8k", + "arc_challenge", + "dtype", + "license" + # Removed total_samples + ] + + for metric in metrics: + try: + if metric in ["dtype", "license"]: + # Always use lookup table for metadata (override JSON values) + raw_model_name = item.get("model_name", "") + if raw_model_name in MODEL_METADATA_LOOKUP: + model_data[metric] = MODEL_METADATA_LOOKUP[raw_model_name][metric] + else: + # Default values for unknown models + defaults = {"dtype": "unknown", "license": "Unknown"} + model_data[metric] = defaults[metric] + elif metric in item: + if metric == "overall_average" and item[metric] == "N/A": + model_data[metric] = "N/A" + elif isinstance(item[metric], str) and item[metric] != "N/A": + model_data[metric] = float(item[metric]) + else: + model_data[metric] = item[metric] + else: + model_data[metric] = "N/A" + except Exception as e: + if metric in ["dtype", "license"]: + defaults = {"dtype": "unknown", "license": "Unknown"} + model_data[metric] = defaults[metric] + else: + model_data[metric] = item.get(metric, "N/A") + + formatted_data.append(model_data) + + # Create DataFrame + df = pd.DataFrame(formatted_data) + + # Remove the file column if present + if 'file' in df.columns: + df = df.drop(columns=['file']) + + # Try to convert metrics to float with error handling (only numeric columns) + numeric_cols = ["overall_average", "mmlu_average", "truthfulqa", "winogrande", "hellaswag", "gsm8k", "arc_challenge"] + for col in numeric_cols: + if col in df.columns: + try: + # Convert column to float but keep "N/A" as is + df[col] = df[col].apply(lambda x: float(x) if isinstance(x, (int, float)) or (isinstance(x, str) and x != "N/A") else x) + except Exception as e: + pass # Keep original values if conversion fails + + # Sort by overall_average if available + if "overall_average" in df.columns: + # For sorting, replace non-numeric values with NaN temporarily + sort_col = pd.to_numeric(df["overall_average"], errors="coerce") + # Sort with NaN at the end + df = df.iloc[sort_col.fillna(-1).argsort(kind="stable").iloc[::-1]] + + # Float değerleri yuvarlama - detail için 4 hane, avg için 2 hane + decimal_places = 4 if 
is_detail else 2 + for column in df.columns: + try: + if pd.api.types.is_float_dtype(df[column]): + df[column] = df[column].round(decimal_places) + except: + continue + + # Format column names according to user request + column_mapping = { + "model_name": "Model Name", + "overall_average": "Overall", + "mmlu_average": "MMLU", + "truthfulqa": "Truthfulqa", + "winogrande": "Winogrande", + "hellaswag": "Hellaswag", + "gsm8k": "Gsm8k", + "arc_challenge": "ARC", + "dtype": "Dtype", + "license": "License" + } + + # Rename DataFrame columns + df = df.rename(columns=column_mapping) + + # Define desired column order for Light-Eval - metadata columns at the end + desired_cols = [ + "Model Name", + "Overall", + "MMLU", + "Truthfulqa", + "Winogrande", + "Hellaswag", + "Gsm8k", + "ARC", + "Dtype", + "License" + ] + + # Filter out columns that don't exist in the DataFrame + final_cols = [col for col in desired_cols if col in df.columns] + + # Add any remaining columns that weren't in the desired list + remaining_cols = [col for col in df.columns if col not in final_cols] + final_cols.extend(remaining_cols) + + # Set the new column order + df = df[final_cols] + + return df + +def create_benchmark_plots(benchmark_data, data_type="avg"): + """ + Benchmark verilerinden grafikler oluşturur + + Args: + benchmark_data: Benchmark verileri + data_type: "avg" veya "raw" olabilir + """ + plots = {} + + # Hybrid Benchmark için çubuk grafik + if benchmark_data[data_type]["evalmix"]: + df = create_evalmix_table(benchmark_data[data_type]["evalmix"]) + if not df.empty and all(col in df.columns for col in ["model_name", "lexical_metric", "semantic_metric"]): + # Determine which metrics are available + metrics = ["lexical_metric", "semantic_metric"] + if "judge_metric" in df.columns: + metrics.append("judge_metric") + + # Veriyi uzun formata dönüştür + plot_df = pd.melt( + df, + id_vars=["model_name"], + value_vars=metrics, + var_name="Metrik", + value_name="Değer" + ) + + # Metrik isimlerini daha okunabilir hale getir + plot_df["Metrik"] = plot_df["Metrik"].replace({ + "lexical_metric": "Lexical Metric", + "semantic_metric": "Semantic Metric", + "judge_metric": "Judge Metric" + }) + + fig = px.bar( + plot_df, + x="model_name", + y="Değer", + color="Metrik", + title="Hybrid Benchmark Results", + labels={"model_name": "Model", "Değer": "Score"}, + barmode="group" + ) + plots["evalmix"] = fig + + # Light Eval için radar grafik + if benchmark_data[data_type]["light_eval"]: + df = create_light_eval_table(benchmark_data[data_type]["light_eval"]) + if not df.empty: + # Ortalama ve total_samples sütunlarını hariç tut + metric_cols = [col for col in df.columns if col not in ["model_name", "Ortalama", "file", "overall_average", "total_samples"]] + if metric_cols: + fig = go.Figure() + + for _, row in df.iterrows(): + fig.add_trace(go.Scatterpolar( + r=[row[col] for col in metric_cols], + theta=metric_cols, + fill='toself', + name=row.get("model_name", "Unknown Model") + )) + + fig.update_layout( + polar=dict( + radialaxis=dict( + visible=True, + range=[0, 1] + ) + ), + title="Light Eval Results", + showlegend=True + ) + plots["light_eval"] = fig + + return plots + +def create_combined_leaderboard_table(benchmark_data): + """ + Creates a combined leaderboard table from avg JSON data + """ + # Define benchmark types to include in the leaderboard + benchmark_types = ["evalmix", "light_eval", "retrieval", "arena", "human_arena"] # "lm_harness" removed + + all_models = {} + + # Process each benchmark type - exclude snake + for 
benchmark_type in benchmark_types: + # For human_arena, use raw data since there are no avg files + if benchmark_type == "human_arena": + data_source = benchmark_data["raw"][benchmark_type] + else: + data_source = benchmark_data["avg"][benchmark_type] + + # Skip if no data for this benchmark type + if not data_source: + continue + + # Process each model in this benchmark type + for item in data_source: + model_name = item.get("model_name", "") + if not model_name: + continue + + # Format the model name + formatted_model_name = format_model_name(model_name) + + # Create entry for this model if it doesn't exist + if formatted_model_name not in all_models: + all_models[formatted_model_name] = {"model_name": formatted_model_name} + + # Add metadata fields using lookup table + for field in ["dtype", "license"]: + if model_name in MODEL_METADATA_LOOKUP: + all_models[formatted_model_name][field] = MODEL_METADATA_LOOKUP[model_name][field] + else: + defaults = {"dtype": "unknown", "license": "Unknown"} + all_models[formatted_model_name][field] = defaults[field] + + # Extract only the fields we care about for each benchmark type + if benchmark_type == "evalmix": + if "lexical_metric" in item: + all_models[formatted_model_name]["Lexical"] = round(item.get("lexical_metric", 0), 2) + if "semantic_metric" in item: + all_models[formatted_model_name]["Multilingual Semantic"] = round(item.get("semantic_metric", 0), 2) + # Extract Turkish Semantic score if available + if "turkish_semantic" in item: + all_models[formatted_model_name]["Turkish Semantic"] = round(item.get("turkish_semantic", 0), 2) + elif "turkish_semantic_" in item: + all_models[formatted_model_name]["Turkish Semantic"] = round(item.get("turkish_semantic_", 0), 2) + elif "nlp_metrics" in item and "cosine_similarity_turkish" in item.get("nlp_metrics", {}): + turkish_sim = item.get("nlp_metrics", {}).get("cosine_similarity_turkish", {}).get("mean", 0) + all_models[formatted_model_name]["Turkish Semantic"] = round(turkish_sim, 2) + + # Extract Multilingual Semantic explicitly if available + if "multilingual_semantic" in item: + all_models[formatted_model_name]["Multilingual Semantic"] = round(item.get("multilingual_semantic", 0), 2) + elif "multilingual_semantic_" in item: + all_models[formatted_model_name]["Multilingual Semantic"] = round(item.get("multilingual_semantic_", 0), 2) + elif "nlp_metrics" in item and "cosine_similarity_multilingual" in item.get("nlp_metrics", {}): + multi_sim = item.get("nlp_metrics", {}).get("cosine_similarity_multilingual", {}).get("mean", 0) + all_models[formatted_model_name]["Multilingual Semantic"] = round(multi_sim, 2) + + # Extract BERTScore F1 if available + if "bert_score" in item and isinstance(item.get("bert_score"), dict) and "f1" in item.get("bert_score", {}): + bert_f1 = item.get("bert_score", {}).get("f1", {}).get("mean", 0) + all_models[formatted_model_name]["BERTScore F1"] = round(bert_f1, 2) + elif "nlp_metrics" in item and "bert_score" in item.get("nlp_metrics", {}): + bert_f1 = item.get("nlp_metrics", {}).get("bert_score", {}).get("f1", {}).get("mean", 0) + all_models[formatted_model_name]["BERTScore F1"] = round(bert_f1, 2) + # Remove dtype and license from JSON - use only lookup table values + elif benchmark_type == "light_eval": + if "overall_average" in item: + try: + if isinstance(item["overall_average"], str) and item["overall_average"] != "N/A": + avg_value = float(item["overall_average"]) + else: + avg_value = item["overall_average"] + all_models[formatted_model_name]["Light Eval"] = 
round(avg_value, 2) + except (ValueError, TypeError): + all_models[formatted_model_name]["Light Eval"] = item["overall_average"] + # Remove dtype and license from JSON - use only lookup table values + elif benchmark_type == "retrieval": + if "RAG_success_rate" in item: + avg_value = item["RAG_success_rate"] # Convert to percentage + all_models[formatted_model_name]["Retrieval"] = round(avg_value, 2) + # Remove dtype and license from JSON - use only lookup table values + elif benchmark_type == "arena": + if "Melo Score" in item: + all_models[formatted_model_name]["Auto Elo Score"] = round(item.get("Melo Score", 0), 2) + # Remove dtype and license from JSON - use only lookup table values + elif benchmark_type == "human_arena": + if "elo_rating" in item: + all_models[formatted_model_name]["Human Elo Score"] = round(item.get("elo_rating", 0), 2) + # Remove dtype and license from JSON - use only lookup table values + + # Create DataFrame from the collected data + if all_models: + df = pd.DataFrame(list(all_models.values())) + + # Rename model_name column to be more user-friendly + if "model_name" in df.columns: + df = df.rename(columns={"model_name": "Model Name"}) + + # Rename metadata columns to proper case + column_mapping = { + "dtype": "Dtype", + "license": "License" + } + df = df.rename(columns=column_mapping) + + # Make sure to remove the file column if it's present + if 'file' in df.columns: + df = df.drop(columns=['file']) + + # Remove run_id and user_id fields if present + for field in ['run_id', 'user_id', 'Run Id', 'User Id']: + if field in df.columns: + df = df.drop(columns=[field]) + + # Define the exact columns we want to display in the order we want them + display_cols = [ + "Auto Elo Score", + "Human Elo Score", + "Retrieval", + "Light Eval", + "Turkish Semantic", + "Multilingual Semantic", + "Lexical", + "Dtype", + "License" + ] + valid_display_cols = [col for col in display_cols if col in df.columns] + + # Fill NaN values with 0 + for col in valid_display_cols: + df[col] = df[col].fillna(0) + + # Explicitly reorder columns to match the UI display order exactly as in the screenshot + desired_order = ["Model Name", "Auto Elo Score", "Human Elo Score", "Retrieval", "Light Eval", "Turkish Semantic", "Multilingual Semantic", "Lexical", "Dtype", "License"] + + # Filter out columns that don't exist in the DataFrame + actual_order = [col for col in desired_order if col in df.columns] + + # Reorder columns + if len(actual_order) > 0: + df = df[actual_order] + + # Sort by Auto Elo Score if available, otherwise by Human Elo Score + if "Auto Elo Score" in df.columns: + df = df.sort_values(by="Auto Elo Score", ascending=False) + elif "Human Elo Score" in df.columns: + df = df.sort_values(by="Human Elo Score", ascending=False) + + # Float değerleri 2 ondalık basamağa yuvarla + for column in df.columns: + try: + if pd.api.types.is_float_dtype(df[column]): + df[column] = df[column].round(2) + except: + continue + + return df + + return pd.DataFrame() + +def create_raw_details_table(benchmark_data, benchmark_type): + """ + Creates a detailed table from raw JSON data for a specific benchmark type + """ + if not benchmark_data["raw"][benchmark_type]: + return pd.DataFrame() + + # Flatten the raw data + flattened_data = [] + + for item in benchmark_data["raw"][benchmark_type]: + raw_model_name = item.get("model_name", "Unknown Model") + flat_item = { + "file": item.get("file", ""), + "model_name": format_model_name(raw_model_name) + } + + # Always use lookup table values for metadata (override 
JSON values) + for field in ["dtype", "license"]: + if raw_model_name in MODEL_METADATA_LOOKUP: + flat_item[field] = MODEL_METADATA_LOOKUP[raw_model_name][field] + else: + defaults = {"dtype": "unknown", "license": "Unknown"} + flat_item[field] = defaults[field] + + # Define metadata fields to exclude - especially for LightEval + excluded_fields = ["file", "job_id", "start_time", "end_time", "run_id", "user_id", + "total_samples", "Total Samples", "samples_number", "sample_count", "eval_samples", + "total_success_references", "Total Success References", "total_eval_samples", + "provider", "Provider"] # Exclude provider fields + + # For LightEval, also exclude mmlu_tasks field + if benchmark_type == "light_eval": + excluded_fields.append("mmlu_tasks") + + # Add top-level fields (skip metadata fields and dtype/license which come from lookup table) + for key, value in item.items(): + if key not in excluded_fields and key not in ["dtype", "license"] and not key.startswith("_") and not isinstance(value, (dict, list)): + flat_item[key] = value + + # Flatten nested fields + for key, value in item.items(): + if key.startswith("_") or key in excluded_fields: + # Skip metadata fields + continue + elif isinstance(value, dict): + # Flatten nested dictionaries + _flatten_dict(value, flat_item, prefix=key) + elif isinstance(value, list) and all(isinstance(x, dict) for x in value): + # Flatten list of dictionaries + for i, sub_dict in enumerate(value): + _flatten_dict(sub_dict, flat_item, prefix=f"{key}_{i}") + + flattened_data.append(flat_item) + + # Create DataFrame + df = pd.DataFrame(flattened_data) + + # Ensure model_name is first column + if "model_name" in df.columns: + cols = ["model_name"] + [col for col in df.columns if col != "model_name"] + df = df[cols] + + # Float değerleri 2 ondalık basamağa yuvarla + for column in df.columns: + try: + if pd.api.types.is_float_dtype(df[column]): + df[column] = df[column].round(2) + except: + continue + + # Remove the file column + if 'file' in df.columns: + df = df.drop(columns=['file']) + + # Format column names for better display based on benchmark type + column_mapping = { + "model_name": "Model Name", + "dtype": "Dtype", + "license": "License" + } + + # Use specific column mappings for each benchmark type + if benchmark_type == "arena": + # Arena benchmark column mappings + custom_columns = { + "Melo Score": "Auto Elo Score", + "Win Rate": "Win Rate", + "95%(CI)": "95% CI", + "Response Tokens Average": "Completion Tokens", + "dtype": "Dtype", + "Licance": "License", + } + column_mapping.update(custom_columns) + + elif benchmark_type == "retrieval": + # RAG benchmark column mappings + custom_columns = { + "RAG_success_rate": "Rag Success Rate", + "max_correct_references": "Max Correct Ref.", + "total_false_positives": "Hallucinate Ref.", + "total_missed_references": "Missed Ref.", + "average_judge_score": "Legal Judge Score" + # Removed "samples_number": "Total Samples" + } + column_mapping.update(custom_columns) + + elif benchmark_type == "evalmix": + # Hybrid/EvalMix benchmark column mappings + custom_columns = { + "turkish_semantic_mean": "Turkish Semantic", + "turkish_semantic": "Turkish Semantic", + "multilingual_semantic_mean": "Multilingual Semantic", + "multilingual_semantic": "Multilingual Semantic", + "judge_metric": "Judge Score", + "bleu mean": "BLEU", + "rouge1 mean": "ROUGE-1", + "rouge2 mean": "ROUGE-2", + "rougeL mean": "ROUGE-L", + "bert_score f1 mean": "BERTScore F1", + "dtype": "Dtype", + "license": "License", + "bert_score precision 
mean": "BERTScore Precision" + # Removed "total_samples": "Total Samples" + } + column_mapping.update(custom_columns) + + # Calculate Judge Average Score from OpenAI scores if they exist + if all(col in df.columns for col in ["openai_accuracy", "openai_relevance", "openai_coherence"]): + df["judge_average_score"] = df[["openai_accuracy", "openai_relevance", "openai_coherence"]].mean(axis=1).round(2) + column_mapping["judge_average_score"] = "Judge Score" + + # Remove individual OpenAI score columns + columns_to_drop = ["openai_accuracy", "openai_relevance", "openai_coherence"] + for col in columns_to_drop: + if col in df.columns: + df = df.drop(columns=[col]) + + elif benchmark_type == "light_eval": + # Light Eval benchmark column mappings + custom_columns = { + "overall_average": "Overall", + "mmlu_average": "MMLU", + "truthfulqa": "Truthfulqa", + "winogrande": "Winogrande", + "hellaswag": "Hellaswag", + "gsm8k": "Gsm8k", + "arc_challenge": "ARC", + "dtype": "Dtype", + "license": "License" + } + column_mapping.update(custom_columns) + + elif benchmark_type == "snake": + # Snake benchmark column mappings + custom_columns = { + "elo": "Elo Rating", + "win_rate": "Win Rate", + "draw_rate": "Draw Rate", + "dtype": "Dtype", + "license": "License" + } + column_mapping.update(custom_columns) + + + + # For any columns not specifically mapped, use the default formatting + for col in df.columns: + if col not in column_mapping: + # Remove "mean" from column names + cleaned_col = col.replace(" mean", "") + # Format column name with default formatting + formatted_col = " ".join([word.capitalize() for word in cleaned_col.replace("_", " ").split()]) + column_mapping[col] = formatted_col + + # Rename DataFrame columns + df = df.rename(columns=column_mapping) + + # Drop specific columns based on benchmark type + if benchmark_type == "retrieval" and "Success Ref." 
in df.columns: + df = df.drop(columns=["Success Ref."]) + # Drop "Total Success References" column if it exists + if "Total Success References" in df.columns: + df = df.drop(columns=["Total Success References"]) + + # Sort by specific metrics based on benchmark type - AFTER column renaming + if benchmark_type == "arena" and "Auto Elo Score" in df.columns: + df = df.sort_values(by="Auto Elo Score", ascending=False) + + # Define desired column order for Arena - metadata columns at the end + desired_cols = [ + "Model Name", + "Auto Elo Score", + "Win Rate", + "95% CI", + "Completion Tokens", + "Dtype", + "License" + ] + + # Filter out columns that don't exist in the DataFrame + final_cols = [col for col in desired_cols if col in df.columns] + + # Add any remaining columns that weren't in the desired list + remaining_cols = [col for col in df.columns if col not in final_cols] + final_cols.extend(remaining_cols) + + # Set the new column order + df = df[final_cols] + + elif benchmark_type == "retrieval" and "Rag Success Rate" in df.columns: + df = df.sort_values(by="Rag Success Rate", ascending=False) + + # Define desired column order for Retrieval - metadata columns at the end + desired_cols = [ + "Model Name", + "Rag Success Rate", + "Max Correct Ref.", + "Hallucinate Ref.", + "Missed Ref.", + "Legal Judge Score", + "Dtype", + "License" + ] + + # Filter out columns that don't exist in the DataFrame + final_cols = [col for col in desired_cols if col in df.columns] + + # Add any remaining columns that weren't in the desired list + remaining_cols = [col for col in df.columns if col not in final_cols] + final_cols.extend(remaining_cols) + + # Set the new column order + df = df[final_cols] + elif benchmark_type == "evalmix": + if "Turkish Semantic" in df.columns: + df = df.sort_values(by="Turkish Semantic", ascending=False) + + # Define desired column order + desired_cols = [ + "Model Name", + "Turkish Semantic", + "Multilingual Semantic", + "Judge Score", + "BLEU", + "ROUGE-1", + "ROUGE-2", + "ROUGE-L", + "BERTScore F1", + "BERTScore Precision", + "BERTScore Recall", + "Dtype", + "License" + # "Total Samples" removed + ] + + # Filter out columns that don't exist in the DataFrame + final_cols = [col for col in desired_cols if col in df.columns] + + # Set the new column order + df = df[final_cols] + + # elif benchmark_type == "lm_harness" and "Overall" in df.columns: + # df = df.sort_values(by="Overall", ascending=False) + elif benchmark_type == "light_eval" and "Overall" in df.columns: + df = df.sort_values(by="Overall", ascending=False) + elif benchmark_type == "snake": + # Sort by Elo or Elo Rating if available + if "Elo Rating" in df.columns: + df = df.sort_values(by="Elo Rating", ascending=False) + elif "Elo" in df.columns: + df = df.sort_values(by="Elo", ascending=False) + + # Define desired column order for Snake - metadata columns at the end + desired_cols = [ + "Model Name", + "Elo Rating", + "Win Rate", + "Draw Rate", + "Wins", + "Losses", + "Ties", + "Loss Rate", + "Dtype", + "License" + ] + + # Filter out columns that don't exist in the DataFrame + final_cols = [col for col in desired_cols if col in df.columns] + + # Add any remaining columns that weren't in the desired list + remaining_cols = [col for col in df.columns if col not in final_cols] + final_cols.extend(remaining_cols) + + # Set the new column order + df = df[final_cols] + + return df + +def _flatten_dict(d, target_dict, prefix=""): + """ + Flattens nested dictionaries + + Args: + d: Dictionary to flatten + target_dict: 
Target dictionary to add flattened values to + prefix: Key prefix + """ + # List of fields to exclude when flattening + excluded_fields = ["total_success_references", "total_eval_samples", + "details", "metadata", "config", "logs"] + + # List of special field name transformations + special_field_mappings = { + "turkish_semantic_mean": "turkish_semantic", + "turkish_semantic_ mean": "turkish_semantic", + "multilingual_semantic_mean": "multilingual_semantic" + } + + for key, value in d.items(): + # Skip excluded fields + if key in excluded_fields: + continue + + # Apply special field name transformations + transformed_key = special_field_mappings.get(key, key) + + new_key = f"{prefix}_{transformed_key}" if prefix else transformed_key + + if isinstance(value, dict): + # Flatten nested dictionaries + _flatten_dict(value, target_dict, new_key) + elif isinstance(value, list) and all(isinstance(x, dict) for x in value): + # Flatten list of dictionaries + for i, sub_dict in enumerate(value): + _flatten_dict(sub_dict, target_dict, f"{new_key}_{i}") + elif isinstance(value, list) and len(value) > 0: + # Convert simple lists to string + try: + # For numeric lists, calculate mean and std + if all(isinstance(x, (int, float)) for x in value): + import numpy as np + target_dict[f"{new_key}_mean"] = round(sum(value) / len(value), 2) + if len(value) > 1: + target_dict[f"{new_key}_std"] = round(np.std(value), 2) + else: + # For non-numeric lists, convert to string + target_dict[new_key] = str(value) + except: + # Fallback to string representation + target_dict[new_key] = str(value) + else: + # Add other values directly + # Float değerleri yuvarla + if isinstance(value, float): + target_dict[new_key] = round(value, 2) + else: + target_dict[new_key] = value + +def update_supported_base_models(): + """ + Updates the list of supported base models by querying API. + This function is called when the application starts to keep the base model list up to date. 
+ """ + try: + import requests + import json + import re + from dotenv import load_dotenv + import os + + # Load environment variables from .env file + load_dotenv() + + # Get API key from environment variable + api_key = os.getenv("API_KEY") + if not api_key: + logger.error("API_KEY not found in environment variables") + return None + + # API endpoint and headers + url = os.getenv("API_URL") + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}" + } + + # Test payload with non-existent model + payload = { + "source": "FILE_ID_BURAYA_GELECEK", + "base_model": "non-existent-model/fake-model-123", + "name": "test-invalid-model", + "description": "Desteklenen modelleri görmeye çalışıyorum" + } + + # Make the request + response = requests.post(url, headers=headers, json=payload) + + # Extract supported models from error message + if response.status_code != 200: + error_detail = response.json().get("detail", "") + # Extract the list of supported models using regex + match = re.search(r"list of supported models: \[(.*?)\]", error_detail) + if match: + supported_models_str = match.group(1) + # Parse the list of models without filtering out 'fast' models + supported_models = [model.strip("'") for model in supported_models_str.split(", ")] + + # Update the base model list in the configuration + from api.config import update_base_model_list + update_base_model_list(supported_models) + + logger.info(f"Successfully updated supported base models: {supported_models}") + return supported_models + else: + logger.error("Could not extract supported models from API response") + return None + else: + logger.error("Unexpected successful response from API") + return None + + except Exception as e: + logger.error(f"Error updating supported base models: {str(e)}") + return None + +def create_human_arena_table(data): + """ + Create Human Arena results table from detail data + """ + if not data: + return pd.DataFrame() + + # Apply model name formatting and add metadata from lookup table + for item in data: + if "model_name" in item: + raw_model_name = item["model_name"] + item["model_name"] = format_model_name(raw_model_name) + + # Always use lookup table values for metadata (override JSON values) + for field in ["dtype", "license"]: + if raw_model_name in MODEL_METADATA_LOOKUP: + item[field] = MODEL_METADATA_LOOKUP[raw_model_name][field] + else: + defaults = {"dtype": "unknown", "license": "Unknown"} + item[field] = defaults[field] + + df = pd.DataFrame(data) + + # Ensure model_name is first column + if "model_name" in df.columns: + cols = ["model_name"] + [col for col in df.columns if col != "model_name"] + df = df[cols] + + # Define column mapping for better display + column_mapping = { + 'model_name': 'Model Name', + 'elo_rating': 'Human Elo Score', + 'wins': 'Wins', + 'losses': 'Losses', + 'ties': 'Ties', + 'total_games': 'Total Games', + 'win_rate': 'Win Rate (%)', + 'votes': 'Votes', + 'dtype': 'Dtype', + 'license': 'License', + 'evaluation_date': 'Evaluation Date', + 'evaluation_type': 'Type' + } + + # Rename columns + df = df.rename(columns=column_mapping) + + # Remove file, run_id, evaluation_date, evaluation_type, votes, and provider columns if present + columns_to_remove = ['file', 'run_id', 'Evaluation Date', 'Type', 'provider', 'Provider', 'Votes'] + for col in columns_to_remove: + if col in df.columns: + df = df.drop(columns=[col]) + + # Sort by Human Elo Score in descending order + if 'Human Elo Score' in df.columns: + df = df.sort_values(by='Human Elo Score', 
ascending=False) + + # Round numeric columns + numeric_cols = ['Human Elo Score', 'Win Rate (%)'] + for col in numeric_cols: + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors='coerce').round(2) + + return df + diff --git a/submissions.log b/submissions.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
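# --- Illustrative sketch (not part of the diff): how the table builders above treat
# model names and metadata. format_model_name is copied from the code above; the
# lookup table entry and the metadata_for helper are hypothetical stand-ins for
# MODEL_METADATA_LOOKUP and the inline dtype/license fallback logic.
MODEL_METADATA_LOOKUP = {"acme/llama_3_8b": {"dtype": "bfloat16", "license": "Apache-2.0"}}

def format_model_name(model_name: str) -> str:
    # Keep the org prefix, de-underscore only the model part, preserve casing.
    if not model_name:
        return model_name
    if "/" in model_name:
        org, name = model_name.split("/", 1)
        return f"{org}/{name.replace('_', ' ')}"
    return model_name.replace("_", " ")

def metadata_for(raw_name: str) -> dict:
    # Metadata always comes from the lookup table, with fixed defaults otherwise.
    return MODEL_METADATA_LOOKUP.get(raw_name, {"dtype": "unknown", "license": "Unknown"})

print(format_model_name("acme/llama_3_8b"))  # -> "acme/llama 3 8b"
print(metadata_for("acme/llama_3_8b"))       # -> {'dtype': 'bfloat16', 'license': 'Apache-2.0'}
print(metadata_for("unknown/model"))         # -> {'dtype': 'unknown', 'license': 'Unknown'}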
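# --- Illustrative sketch: an equivalent formulation of the "sort by overall_average
# but tolerate 'N/A'" step in create_light_eval_table. Strings that are not numeric
# are coerced to NaN, pushed to the bottom, and the frame is reordered by that key.
# The data below is invented for the example.
import pandas as pd

df = pd.DataFrame({"model_name": ["a", "b", "c"],
                   "overall_average": [0.42, "N/A", 0.57]})

# Coerce to numeric (invalid strings -> NaN), rank NaN last via a -1 sentinel.
sort_key = pd.to_numeric(df["overall_average"], errors="coerce").fillna(-1)
df = df.loc[sort_key.sort_values(ascending=False, kind="stable").index]
print(df)  # rows ordered c (0.57), a (0.42), b (N/A)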
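# --- Illustrative sketch: the wide-to-long reshape used for the Hybrid Benchmark
# bar chart in create_benchmark_plots. pd.melt turns one row per model into one
# row per (model, metric), which lets px.bar draw grouped bars per metric.
# The scores below are made-up placeholders.
import pandas as pd
import plotly.express as px

df = pd.DataFrame({
    "model_name": ["model-a", "model-b"],
    "lexical_metric": [0.61, 0.58],
    "semantic_metric": [0.74, 0.79],
})

plot_df = pd.melt(df, id_vars=["model_name"],
                  value_vars=["lexical_metric", "semantic_metric"],
                  var_name="Metric", value_name="Score")

fig = px.bar(plot_df, x="model_name", y="Score", color="Metric",
             barmode="group", title="Hybrid Benchmark Results")
# fig.show()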
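# --- Illustrative sketch: the merge pattern behind create_combined_leaderboard_table.
# One dict entry per model accumulates a column per benchmark type, and the dict
# values then become the rows of a single leaderboard DataFrame. The benchmark
# data and scores below are invented placeholders.
import pandas as pd

benchmark_data = {
    "light_eval": [{"model_name": "model-a", "overall_average": 0.41}],
    "retrieval":  [{"model_name": "model-a", "RAG_success_rate": 0.88}],
}

all_models = {}
for benchmark, rows in benchmark_data.items():
    for row in rows:
        # Create the model's entry on first sight, then add one column per benchmark.
        entry = all_models.setdefault(row["model_name"], {"Model Name": row["model_name"]})
        if benchmark == "light_eval":
            entry["Light Eval"] = round(row["overall_average"], 2)
        elif benchmark == "retrieval":
            entry["Retrieval"] = round(row["RAG_success_rate"], 2)

leaderboard = pd.DataFrame(list(all_models.values()))
print(leaderboard)  # one row for model-a with Light Eval and Retrieval columns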
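# --- Illustrative sketch: a trimmed version of what _flatten_dict produces for the
# raw-details tables. Nested dicts become underscore-joined keys and numeric lists
# are summarized as a rounded mean; the full function above also handles lists of
# dicts, exclusions, and key renames. Input data here is invented.
def flatten(d, out, prefix=""):
    for key, value in d.items():
        new_key = f"{prefix}_{key}" if prefix else key
        if isinstance(value, dict):
            flatten(value, out, new_key)              # recurse into nested dicts
        elif isinstance(value, list) and value and all(isinstance(x, (int, float)) for x in value):
            out[f"{new_key}_mean"] = round(sum(value) / len(value), 2)
        else:
            out[new_key] = round(value, 2) if isinstance(value, float) else value

flat = {}
flatten({"bert_score": {"f1": {"mean": 0.8134}}, "bleu": [0.21, 0.27]}, flat)
print(flat)  # {'bert_score_f1_mean': 0.81, 'bleu_mean': 0.24}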
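# --- Illustrative sketch: the error-message parsing in update_supported_base_models.
# The API's error detail is assumed to embed the supported models as a quoted,
# comma-separated list that the regex above can capture; the detail string below
# is a fabricated example of that assumed format, not a real API response.
import re

error_detail = "Unsupported base_model. See list of supported models: ['org/model-a', 'org/model-b']"
match = re.search(r"list of supported models: \[(.*?)\]", error_detail)
if match:
    supported = [m.strip("'") for m in match.group(1).split(", ")]
    print(supported)  # ['org/model-a', 'org/model-b']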