Upload 5 files
A Comprehensive RAG Score Calculation Metric Added
- app.py +1 -1
- src/.DS_Store +0 -0
- src/display/about.py +15 -5
- src/utils.py +63 -7
- utils/rag_score_calculator.py +171 -0
app.py
CHANGED

@@ -267,7 +267,7 @@ def create_demo():
                 value=rag_details_df,
                 label="Retrieval Detailed Results",
                 interactive=False,
-                column_widths=["
+                column_widths=["280px", "120px", "140px", "140px", "140px", "120px", "160px", "100px", "120px"]

             )
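For context, a minimal sketch of how the widened `gr.Dataframe` call might look in isolation, assuming a recent Gradio 4.x build where `Dataframe` accepts `column_widths`; the surrounding `create_demo()` layout and the real `rag_details_df` are not part of this diff, so both are stand-ins here:

```python
import gradio as gr
import pandas as pd

# Hypothetical 9-column frame standing in for rag_details_df (built elsewhere in app.py).
rag_details_df = pd.DataFrame(columns=[f"col_{i}" for i in range(9)])

with gr.Blocks() as demo:
    gr.Dataframe(
        value=rag_details_df,
        label="Retrieval Detailed Results",
        interactive=False,
        # One width per column, in display order; fixed pixel widths keep the table readable.
        column_widths=["280px", "120px", "140px", "140px", "140px",
                       "120px", "160px", "100px", "120px"],
    )

# demo.launch()  # uncomment to render the table locally
```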
src/.DS_Store
ADDED

Binary file (6.15 kB)
src/display/about.py
CHANGED

@@ -199,11 +199,21 @@ An evaluation system designed to assess Retrieval-Augmented Generation (RAG) cap
 - **Content Safety**: Evaluates content safety and appropriateness
 
 **Judge Model**: nvidia/Llama-3_1-Nemotron-Ultra-253B-v1
-
-
-
-
-
+
+**RAG Score Calculation**
+The RAG Score is a comprehensive metric that combines multiple performance indicators using dynamic normalization across all models. The formula weights different aspects of retrieval performance:
+
+**Formula Components:**
+- **RAG Success Rate** (0.9 weight): Direct percentage of successful retrievals (higher is better)
+- **Normalized False Positives** (0.9 weight): Hallucinated references, min-max normalized (lower is better)
+- **Normalized Max Correct References** (0.1 weight): Maximum correct retrievals, min-max normalized (higher is better)
+- **Normalized Missed References** (0.1 weight): Relevant documents not retrieved, min-max normalized (lower is better)
+
+**Final Score Formula:**
+```
+RAG Score = (0.9 × RAG_success_rate + 0.9 × norm_false_positives +
+             0.1 × norm_max_correct + 0.1 × norm_missed_refs) ÷ 2.0
+```
 
 ### 6. 👥 Human Arena
 Human Arena is a community-driven evaluation platform where language models are compared through human preferences and voting. This evaluation method captures real-world user preferences and provides insights into model performance from a human perspective.
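As a quick sanity check of the formula added above, here is a small worked example in Python. The component values are made up, and it assumes `RAG_success_rate` is already on a 0-1 scale, like the normalized terms:

```python
# Hypothetical component values, all on a 0-1 scale (higher is better after normalization).
rag_success_rate = 0.82      # fraction of successful retrievals
norm_false_positives = 0.90  # 1 - min-max-normalized hallucinated references
norm_max_correct = 0.75      # min-max-normalized max correct references
norm_missed_refs = 0.95      # 1 - min-max-normalized missed references

rag_score = (0.9 * rag_success_rate + 0.9 * norm_false_positives +
             0.1 * norm_max_correct + 0.1 * norm_missed_refs) / 2.0

print(round(rag_score, 4))  # 0.859
```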
src/utils.py
CHANGED

@@ -12,6 +12,7 @@ import requests
 import logging
 from datetime import datetime
 from dotenv import load_dotenv
+from utils.rag_score_calculator import RAGScoreCalculator
 
 # Logger setup
 logger = logging.getLogger("mezura.utils")

@@ -144,6 +145,26 @@ def load_benchmark_results():
     # Define benchmark types to look for
     benchmark_types = ["evalmix", "light_eval", "snake", "retrieval", "arena", "human_arena"]  # "lm_harness" removed
 
+    # Initialize RAG Score calculator for runtime calculation
+    rag_calculator = None
+    rag_scores_cache = {}  # Cache for RAG scores by run_id
+    try:
+        rag_calculator = RAGScoreCalculator()
+        if rag_calculator.stats:
+            logger.info("RAG Score calculator initialized successfully")
+            # Pre-calculate RAG scores from detail files
+            for data in rag_calculator.all_data:
+                run_id = data.get('run_id')
+                if run_id:
+                    rag_score = rag_calculator.calculate_rag_score(data)
+                    rag_scores_cache[run_id] = rag_score
+            logger.info(f"Pre-calculated {len(rag_scores_cache)} RAG scores")
+        else:
+            logger.warning("No RAG statistics available for score calculation")
+    except Exception as e:
+        logger.warning(f"Could not initialize RAG Score calculator: {e}")
+        rag_calculator = None
+
     # Load raw JSON files (detailed results)
     for benchmark_type in benchmark_types:
         dir_path = f"result/{benchmark_type}"

@@ -202,6 +223,15 @@ def load_benchmark_results():
                     if "model_name" in data:
                         data["model_name"] = format_model_name(data["model_name"])
 
+                    # Add pre-calculated RAG Score for retrieval data (from detail files cache)
+                    if benchmark_type == "retrieval" and rag_scores_cache:
+                        run_id = data.get('run_id')
+                        if run_id and run_id in rag_scores_cache:
+                            data["RAG_score"] = rag_scores_cache[run_id]
+                            logger.debug(f"Added cached RAG_score {rag_scores_cache[run_id]} for avg file {data.get('model_name', 'unknown')}")
+                        else:
+                            logger.debug(f"No cached RAG_score found for run_id: {run_id}")
+
                     results["avg"][benchmark_type].append(data)
                 except Exception as e:
                     print(f"Error loading {benchmark_type} avg file: {file} - {e}")

@@ -254,6 +284,15 @@ def load_benchmark_results():
                     if "model_name" in data:
                         data["model_name"] = format_model_name(data["model_name"])
 
+                    # Add pre-calculated RAG Score for retrieval data (from cache)
+                    if benchmark_type == "retrieval" and rag_scores_cache:
+                        run_id = data.get('run_id')
+                        if run_id and run_id in rag_scores_cache:
+                            data["RAG_score"] = rag_scores_cache[run_id]
+                            logger.debug(f"Added cached RAG_score {rag_scores_cache[run_id]} for detail file {data.get('model_name', 'unknown')}")
+                        else:
+                            logger.debug(f"No cached RAG_score found for detail run_id: {run_id}")
+
                     results["raw"][benchmark_type].append(data)
 
                     # Also add to default results to ensure we have all models in the leaderboard

@@ -263,7 +302,10 @@ def load_benchmark_results():
 
                     # Extract key metrics based on benchmark type
                     if benchmark_type == "retrieval":
-                        # For RAG Judge, extract RAG_success_rate and average_judge_score if available
+                        # For RAG Judge, extract RAG_score, RAG_success_rate and average_judge_score if available
+                        # RAG_score should be available since we just calculated it above
+                        if "RAG_score" in data:
+                            simplified_data["RAG_score"] = data["RAG_score"]
                         if "RAG_success_rate" in data:
                             simplified_data["RAG_success_rate"] = data["RAG_success_rate"]
                         if "average_judge_score" in data:

@@ -655,8 +697,8 @@ def create_combined_leaderboard_table(benchmark_data):
 
     # Process each benchmark type - exclude snake
     for benchmark_type in benchmark_types:
-        # For human_arena, use raw data since
-        if benchmark_type
+        # For human_arena and retrieval, use raw data since avg files don't have complete info
+        if benchmark_type in ["human_arena", "retrieval"]:
             data_source = benchmark_data["raw"][benchmark_type]
         else:
             data_source = benchmark_data["avg"][benchmark_type]

@@ -730,8 +772,12 @@ def create_combined_leaderboard_table(benchmark_data):
                 all_models[formatted_model_name]["Light Eval"] = item["overall_average"]
                 # Remove dtype and license from JSON - use only lookup table values
             elif benchmark_type == "retrieval":
-                if
-
+                # Prefer RAG_score if available, otherwise use RAG_success_rate
+                if "RAG_score" in item:
+                    avg_value = item["RAG_score"]
+                    all_models[formatted_model_name]["Retrieval"] = round(avg_value, 4)  # Higher precision for RAG Score
+                elif "RAG_success_rate" in item:
+                    avg_value = item["RAG_success_rate"]
                     all_models[formatted_model_name]["Retrieval"] = round(avg_value, 2)
                 # Remove dtype and license from JSON - use only lookup table values
             elif benchmark_type == "arena":

@@ -911,6 +957,7 @@ def create_raw_details_table(benchmark_data, benchmark_type):
     elif benchmark_type == "retrieval":
         # RAG benchmark column mappings
         custom_columns = {
+            "RAG_score": "RAG Score",
             "RAG_success_rate": "Rag Success Rate",
             "max_correct_references": "Max Correct Ref.",
             "total_false_positives": "Hallucinate Ref.",

@@ -1023,12 +1070,21 @@ def create_raw_details_table(benchmark_data, benchmark_type):
         # Set the new column order
         df = df[final_cols]
 
-    elif benchmark_type == "retrieval"
-
+    elif benchmark_type == "retrieval":
+        # Sort by RAG Score if available, otherwise by Rag Success Rate
+        if "RAG Score" in df.columns:
+            df = df.sort_values(by="RAG Score", ascending=False)
+            primary_metric = "RAG Score"
+        elif "Rag Success Rate" in df.columns:
+            df = df.sort_values(by="Rag Success Rate", ascending=False)
+            primary_metric = "Rag Success Rate"
+        else:
+            primary_metric = None
 
         # Define desired column order for Retrieval - metadata columns at the end
         desired_cols = [
             "Model Name",
+            "RAG Score",
             "Rag Success Rate",
             "Max Correct Ref.",
             "Hallucinate Ref.",
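To make the new data flow concrete, here is a minimal sketch of the two steps the hunks above add, taken in isolation: attaching a cached score by `run_id` in `load_benchmark_results`, then preferring `RAG_score` over `RAG_success_rate` when filling the leaderboard cell. The record and cache values are hypothetical:

```python
# Hypothetical cache produced by the pre-calculation loop, keyed by run_id.
rag_scores_cache = {"run_42": 0.9169}

# A retrieval record as it might look after being loaded from result/retrieval.
item = {"run_id": "run_42", "model_name": "model-a", "RAG_success_rate": 0.82}

# Step 1: attach the cached RAG_score when the record's run_id is known.
run_id = item.get("run_id")
if run_id and run_id in rag_scores_cache:
    item["RAG_score"] = rag_scores_cache[run_id]

# Step 2: the leaderboard prefers RAG_score (4 decimals) over RAG_success_rate (2 decimals).
if "RAG_score" in item:
    retrieval_cell = round(item["RAG_score"], 4)
elif "RAG_success_rate" in item:
    retrieval_cell = round(item["RAG_success_rate"], 2)

print(retrieval_cell)  # 0.9169
```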
utils/rag_score_calculator.py
ADDED

@@ -0,0 +1,171 @@
+import json
+import os
+from typing import Dict, List, Tuple
+
+class RAGScoreCalculator:
+    """
+    Dynamic RAG Score calculator that calculates scores at runtime
+    without modifying the original JSON files.
+    """
+
+    def __init__(self, retrieval_dir: str = "result/retrieval"):
+        self.retrieval_dir = retrieval_dir
+        self.stats = None
+        self.all_data = None
+        self._load_and_analyze()
+
+    def _load_and_analyze(self):
+        """Load all retrieval detail files and calculate normalization statistics."""
+        self.all_data = []
+        detail_files = [f for f in os.listdir(self.retrieval_dir) if f.startswith('detail_')]
+
+        if not detail_files:
+            print("Warning: No detail files found in retrieval directory")
+            return
+
+        for filename in detail_files:
+            filepath = os.path.join(self.retrieval_dir, filename)
+            try:
+                with open(filepath, 'r') as f:
+                    data = json.load(f)
+                self.all_data.append(data)
+            except Exception as e:
+                print(f"Error loading {filename}: {e}")
+                continue
+
+        if not self.all_data:
+            print("Warning: No valid data loaded from detail files")
+            return
+
+        # Calculate normalization statistics
+        self._calculate_stats()
+
+    def _calculate_stats(self):
+        """Calculate min/max statistics for normalization."""
+        if not self.all_data:
+            return
+
+        # Extract values for analysis
+        rag_success_rates = [d.get('RAG_success_rate', 0) for d in self.all_data]
+        max_correct_refs = [d.get('max_correct_references', 0) for d in self.all_data]
+        false_positives = [d.get('total_false_positives', 0) for d in self.all_data]
+        missed_refs = [d.get('total_missed_references', 0) for d in self.all_data]
+
+        # Calculate min/max for normalization
+        self.stats = {
+            'rag_success_rate': {
+                'min': min(rag_success_rates),
+                'max': max(rag_success_rates)
+            },
+            'max_correct_references': {
+                'min': min(max_correct_refs),
+                'max': max(max_correct_refs)
+            },
+            'total_false_positives': {
+                'min': min(false_positives),
+                'max': max(false_positives)
+            },
+            'total_missed_references': {
+                'min': 0,  # Fixed minimum value
+                'max': 7114  # Fixed maximum value
+            }
+        }
+
+    def normalize_value(self, value, min_val, max_val, higher_is_better=True):
+        """Normalize a value to 0-1 range."""
+        if max_val == min_val:
+            return 1.0  # If all values are the same, return 1
+
+        normalized = (value - min_val) / (max_val - min_val)
+
+        if not higher_is_better:
+            normalized = 1 - normalized  # Flip for "lower is better" metrics
+
+        return normalized
+
+    def calculate_rag_score(self, data: Dict) -> float:
+        """Calculate the RAG score for a single model's data."""
+        if not self.stats:
+            print("Warning: No statistics available for normalization")
+            return 0.0
+
+        # Extract values with defaults
+        rag_success_rate = data.get('RAG_success_rate', 0)
+        max_correct_refs = data.get('max_correct_references', 0)
+        false_positives = data.get('total_false_positives', 0)
+        missed_refs = data.get('total_missed_references', 0)
+
+        # Normalize values (0-1)
+        norm_max_correct = self.normalize_value(
+            max_correct_refs,
+            self.stats['max_correct_references']['min'],
+            self.stats['max_correct_references']['max'],
+            higher_is_better=True
+        )
+
+        norm_false_positives = self.normalize_value(
+            false_positives,
+            self.stats['total_false_positives']['min'],
+            self.stats['total_false_positives']['max'],
+            higher_is_better=False  # Lower is better
+        )
+
+        norm_missed_refs = self.normalize_value(
+            missed_refs,
+            self.stats['total_missed_references']['min'],
+            self.stats['total_missed_references']['max'],
+            higher_is_better=False  # Lower is better
+        )
+
+        # Calculate weighted score
+        # Weights: rag_success_rate=0.9, false_positives=0.9, max_correct=0.1, missed_refs=0.1
+        rag_score = (
+            0.9 * rag_success_rate +
+            0.9 * norm_false_positives +
+            0.1 * norm_max_correct +
+            0.1 * norm_missed_refs
+        ) / 2.0  # Divide by 2 since total weights = 2.0
+
+        return round(rag_score, 4)
+
+    def get_normalization_info(self) -> Dict:
+        """Get current normalization statistics for debugging."""
+        return {
+            'stats': self.stats,
+            'total_files': len(self.all_data) if self.all_data else 0,
+            'retrieval_dir': self.retrieval_dir
+        }
+
+    def refresh_stats(self):
+        """Refresh statistics by reloading data - call this when new data is added."""
+        print("Refreshing RAG Score normalization statistics...")
+        self._load_and_analyze()
+        return self.stats is not None
+
+def main():
+    """Main function for testing RAG score calculations."""
+    calculator = RAGScoreCalculator()
+
+    print("RAG Score Calculator (Runtime Only)")
+    print("===================================")
+
+    # Show normalization info
+    info = calculator.get_normalization_info()
+    print(f"Total files: {info['total_files']}")
+    print(f"Retrieval directory: {info['retrieval_dir']}")
+
+    if info['stats']:
+        print("\nNormalization ranges:")
+        for metric, data in info['stats'].items():
+            print(f"  {metric}: {data['min']} - {data['max']}")
+
+        print("\nSample RAG Score calculations:")
+        for i, data in enumerate(calculator.all_data[:5]):  # Show first 5
+            rag_score = calculator.calculate_rag_score(data)
+            model_name = data.get('model_name', 'Unknown')
+            print(f"  {model_name}: {rag_score}")
+    else:
+        print("\n❌ No statistics available for normalization")
+
+if __name__ == "__main__":
+    main()
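For completeness, here is a self-contained way to exercise the new class without touching `result/retrieval`: write a couple of synthetic `detail_*.json` files into a temporary directory and score them. The field names come from the code above; the values are made up, and the script assumes it is run from the repository root so `utils.rag_score_calculator` is importable:

```python
import json
import os
import tempfile

from utils.rag_score_calculator import RAGScoreCalculator

# Two synthetic detail records containing only the fields the calculator reads.
records = [
    {"run_id": "run-a", "model_name": "model-a", "RAG_success_rate": 0.82,
     "max_correct_references": 950, "total_false_positives": 40,
     "total_missed_references": 300},
    {"run_id": "run-b", "model_name": "model-b", "RAG_success_rate": 0.61,
     "max_correct_references": 700, "total_false_positives": 180,
     "total_missed_references": 1200},
]

with tempfile.TemporaryDirectory() as tmp:
    for rec in records:
        with open(os.path.join(tmp, f"detail_{rec['run_id']}.json"), "w") as f:
            json.dump(rec, f)

    calc = RAGScoreCalculator(retrieval_dir=tmp)  # min/max stats computed across both files
    for rec in records:
        print(rec["model_name"], calc.calculate_rag_score(rec))
```

Because normalization is min-max over whatever detail files are present, the same record can receive a different score once more files are added; `refresh_stats()` exists to recompute the ranges in that case.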