from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    # Task1
    Task1_Title_Search_Rate = Task("(Task1) Title Search Rate", "(Task1) Title Search Rate", "(T1) Title Search Rate (%)")
    Task1_Precision = Task("(Task1) Precision", "(Task1) Precision", "(T1) Precision (%)")
    Task1_Overlap = Task("(Task1) Overlap", "(Task1) Overlap", "(T1) Overlap (%)")
    Task1_Precision_First_Author = Task("(Task1) Precision (First Author)", "(Task1) Precision (First Author)", "(T1) Precision (First Author) (%)")
    Task1_Overlap_First_Author = Task("(Task1) Overlap (First Author)", "(Task1) Overlap (First Author)", "(T1) Overlap (First Author) (%)")
    # Task2
    Task2_Similarity = Task("(Task2) Similarity", "(Task2) Similarity", "(T2) Similarity (%)")
    Task2_Entail_TRUE = Task("(Task2) Entail (TRUE)", "(Task2) Entail (TRUE)", "(T2) Entail (TRUE %)")
    Task2_Entail_GPT_4o = Task("(Task2) Entail (GPT-4o)", "(Task2) Entail (GPT-4o)", "(T2) Entail (GPT-4o %)")
    Task2_ROUGE_1 = Task("(Task2) ROUGE-1", "(Task2) ROUGE-1", "(T2) ROUGE-1 (%)")
    Task2_ROUGE_2 = Task("(Task2) ROUGE-2", "(Task2) ROUGE-2", "(T2) ROUGE-2 (%)")
    Task2_ROUGE_L = Task("(Task2) ROUGE-L", "(Task2) ROUGE-L", "(T2) ROUGE-L (%)")
    # Task3
    Task3_Precision = Task("(Task3) Precision", "(Task3) Precision", "(T3) Precision (%)")
    Task3_Title_Search_Rate = Task("(Task3) Title Search Rate", "(Task3) Title Search Rate", "(T3) Title Search Rate (%)")
    Task3_Overlap = Task("(Task3) Overlap", "(Task3) Overlap", "(T3) Overlap (%)")
    Task3_KPR = Task("(Task3) KPR", "(Task3) KPR", "(T3) KPR (%)")
    Task3_ROUGE_1 = Task("(Task3) ROUGE-1", "(Task3) ROUGE-1", "(T3) ROUGE-1 (%)")
    Task3_ROUGE_2 = Task("(Task3) ROUGE-2", "(Task3) ROUGE-2", "(T3) ROUGE-2 (%)")
    Task3_ROUGE_L = Task("(Task3) ROUGE-L", "(Task3) ROUGE-L", "(T3) ROUGE-L (%)")
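

# Illustrative only: a minimal sketch of how the Tasks enum is typically consumed in
# Hugging Face leaderboard templates to build the displayed columns. The actual app
# code for this Space may do this elsewhere or differently; `build_display_columns`
# is a name introduced here purely for illustration.
def build_display_columns() -> list[str]:
    # Collect the human-readable column names declared on each Task.
    return [task.value.col_name for task in Tasks]

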
# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">LLM-Based Automated Literature Review Evaluation Benchmark</h1>"""
# What does your leaderboard evaluate?
# Which evaluations are you running? How can people reproduce what you have?
INTRODUCTION_TEXT = """
This leaderboard evaluates Large Language Models (LLMs) on their ability to perform automated literature review tasks, including reference generation, abstract writing, and review composition.<br>
It is based on the study: <b>Large Language Models for Automated Literature Review: An Evaluation of Reference Generation, Abstract Writing, and Review Composition.</b><br>
The leaderboard measures how well different models generate references and produce factually consistent, stylistically appropriate academic text.<br><br>
"""
# <table>
# <tr>
# <td align="center">
# <img src="https://huggingface.co/spaces/XuemeiTang/LLM4LitReview_Benchmark/resolve/main/llm_litReview_images/acc_score.png" width="150"><br>
# Reference Generation: Precision
# </td>
# <td align="center">
# <img src="https://huggingface.co/spaces/XuemeiTang/LLM4LitReview_Benchmark/resolve/main/llm_litReview_images/t2_true_entailment.png" width="150"><br>
# Abstract Writing: True
# </td>
# <td align="center">
# <img src="https://huggingface.co/spaces/XuemeiTang/LLM4LitReview_Benchmark/resolve/main/llm_litReview_images/acc_score_t3.png" width="150"><br>
# Review Composition: Precision
# </td>
# <td align="center">
# <img src="https://huggingface.co/spaces/XuemeiTang/LLM4LitReview_Benchmark/resolve/main/llm_litReview_images/kpr_score.png" width="150"><br>
# Literature Review Writing: KPR
# </td>
# </tr>
# </table>
# """
EVALUATION_QUEUE_TEXT = """"""
# Which evaluations are you running? How can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
## Introduction
The **LLM4LitReview Leaderboard** is dedicated to evaluating the capabilities of large language models (LLMs) in automating academic writing tasks, specifically literature review generation.
We focus on three subtasks:
1. **Reference Generation**: accuracy and validity of citations.
2. **Abstract Writing**: semantic coverage and factual consistency.
3. **Review Composition**: accuracy and validity of citations, together with semantic coverage and factual consistency.
This benchmark provides a structured and reproducible framework for assessing how close LLMs are to human-level academic writing quality.
---
## Evaluation Dataset
Our evaluation dataset includes:
- 1,015 academic papers sampled from open-access journals across multiple disciplines.
- For each paper, models generate the following (an illustrative sketch of the inputs appears after this list):
  - A reference list based on the given title and keywords
  - An abstract based on the given title and keywords
  - A short literature review paragraph based on the given title, keywords, and abstract
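
The templates below are purely illustrative and are **not** the prompts used in the paper or by this leaderboard; they only show what the inputs to the three subtasks could look like.

```python
# Purely illustrative prompt templates; not the prompts used in the paper.
title = "An Example Paper Title"       # hypothetical input
keywords = ["topic one", "topic two"]  # hypothetical input
abstract = "..."                       # the paper's abstract (placeholder)

reference_prompt = (
    f"List the references you would cite in a paper titled '{title}' "
    f"with keywords: {', '.join(keywords)}."
)
abstract_prompt = (
    f"Write an abstract for a paper titled '{title}' "
    f"with keywords: {', '.join(keywords)}."
)
review_prompt = (
    f"Write a short literature review paragraph for a paper titled '{title}' "
    f"with keywords {', '.join(keywords)}, based on this abstract: {abstract}"
)
```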
---
## Metrics Explained
- **Precision**: Proportion of generated references that correspond to real, verifiable academic papers.
- **Title Search Rate**: Whether a generated reference can be retrieved by its title on Semantic Scholar.
- **Overlap Rate**: Overlap between the references cited by the LLM and those cited by the human authors.
- **Similarity**: Semantic similarity between the human-written and LLM-generated texts (e.g., abstracts).
- **Entailment (TRUE %)**: Factual consistency between the LLM-generated and human-written texts, judged with the TRUE NLI model.
- **Entailment (GPT-4o %)**: Factual consistency between the LLM-generated and human-written texts, judged with GPT-4o.
- **ROUGE-1, ROUGE-2, ROUGE-L**: Standard n-gram overlap metrics against the human-written texts (a rough reproduction sketch follows this list).
- **KPR (Key Point Recall)**: Proportion of key points from the source documents that are captured in the generated text.
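
The sketch below is a rough illustration of how two of these metrics could be approximated; it is not the benchmark's official evaluation code. It assumes the public Semantic Scholar Graph API and the `rouge-score` package, and the paper's exact matching rules and configuration may differ.

```python
# Rough reproduction sketch; the official evaluation pipeline may differ.
import requests
from rouge_score import rouge_scorer  # pip install rouge-score


def title_found_on_semantic_scholar(title: str) -> bool:
    # Returns True if the generated reference title gets any hit on Semantic Scholar.
    # The paper may apply a stricter title-matching rule than "any hit".
    resp = requests.get(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        params={"query": title, "fields": "title", "limit": 1},
        timeout=30,
    )
    resp.raise_for_status()
    return bool(resp.json().get("data"))


def rouge_f1(human_text: str, generated_text: str) -> dict:
    # ROUGE-1/2/L F1 (in %) between a human-written and an LLM-generated text.
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = scorer.score(human_text, generated_text)
    return {name: round(score.fmeasure * 100, 2) for name, score in scores.items()}
```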
---
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@misc{tang2025largelanguagemodelsautomated,
title={Large Language Models for Automated Literature Review: An Evaluation of Reference Generation, Abstract Writing, and Review Composition},
author={Xuemei Tang and Xufeng Duan and Zhenguang G. Cai},
year={2025},
eprint={2412.13612},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2412.13612},
}"""