from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    Task1_Title_Search_Rate = Task("(Task1) Title Search Rate", "(Task1) Title Search Rate", "(T1) Title Search Rate (%)")
    Task1_Precision = Task("(Task1) Precision", "(Task1) Precision", "(T1) Precision (%)")
    Task1_Overlap = Task("(Task1) Overlap", "(Task1) Overlap", "(T1) Overlap (%)")
    Task1_Precision_First_Author = Task("(Task1) Precision (First Author)", "(Task1) Precision (First Author)", "(T1) Precision (First Author) (%)")
    Task1_Overlap_First_Author = Task("(Task1) Overlap (First Author)", "(Task1) Overlap (First Author)", "(T1) Overlap (First Author) (%)")
    # Task2
    Task2_Similarity = Task("(Task2) Similarity", "(Task2) Similarity", "(T2) Similarity (%)")
    Task2_Entail_TRUE = Task("(Task2) Entail (TRUE)", "(Task2) Entail (TRUE)", "(T2) Entail (TRUE %)")
    Task2_Entail_GPT_4o = Task("(Task2) Entail (GPT-4o)", "(Task2) Entail (GPT-4o)", "(T2) Entail (GPT-4o %)")
    Task2_ROUGE_1 = Task("(Task2) ROUGE-1", "(Task2) ROUGE-1", "(T2) ROUGE-1 (%)")
    Task2_ROUGE_2 = Task("(Task2) ROUGE-2", "(Task2) ROUGE-2", "(T2) ROUGE-2 (%)")
    Task2_ROUGE_L = Task("(Task2) ROUGE-L", "(Task2) ROUGE-L", "(T2) ROUGE-L (%)")

    # Task3
    Task3_Precision = Task("(Task3) Precision", "(Task3) Precision", "(T3) Precision (%)")
    Task3_Title_Search_Rate = Task("(Task3) Title Search Rate", "(Task3) Title Search Rate", "(T3) Title Search Rate (%)")
    Task3_Overlap = Task("(Task3) Overlap", "(Task3) Overlap", "(T3) Overlap (%)")
    Task3_KPR = Task("(Task3) KPR", "(Task3) KPR", "(T3) KPR (%)")
    Task3_ROUGE_1 = Task("(Task3) ROUGE-1", "(Task3) ROUGE-1", "(T3) ROUGE-1 (%)")
    Task3_ROUGE_2 = Task("(Task3) ROUGE-2", "(Task3) ROUGE-2", "(T3) ROUGE-2 (%)")
    Task3_ROUGE_L = Task("(Task3) ROUGE-L", "(Task3) ROUGE-L", "(T3) ROUGE-L (%)")
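

# A minimal sketch of how Tasks could map a parsed results JSON onto leaderboard columns,
# assuming the JSON has the form {benchmark: {metric: score}} as suggested by the comment
# in Tasks above; the real results-file layout may differ, and this helper is hypothetical.
def leaderboard_row(results: dict) -> dict:
    """Map a results dict onto the display columns declared in Tasks (illustrative sketch)."""
    row = {}
    for task in Tasks:
        # task.value.benchmark / task.value.metric are the assumed JSON keys;
        # task.value.col_name is the column header shown on the leaderboard.
        row[task.value.col_name] = results.get(task.value.benchmark, {}).get(task.value.metric)
    return row
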

# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">LLM-Based Automated Literature Review Evaluation Benchmark</h1>"""

# What does your leaderboard evaluate?
# Which evaluations are you running? how can people reproduce what you have?

INTRODUCTION_TEXT = """
This leaderboard evaluates Large Language Models (LLMs) on their ability to perform automated literature review tasks, including reference generation, abstract writing, and review composition.<br>
It is based on the study: <b>Large Language Models for Automated Literature Review: An Evaluation of Reference Generation, Abstract Writing, and Review Composition.</b><br>
The leaderboard measures how well different models generate valid references and produce factually consistent, stylistically appropriate academic text.<br><br>
"""


# <table>
# <tr>
#     <td align="center">
#         <img src="https://huggingface.co/spaces/XuemeiTang/LLM4LitReview_Benchmark/resolve/main/llm_litReview_images/acc_score.png" width="150"><br>
#         Reference Generation: Precision
#     </td>
#     <td align="center">
#         <img src="https://huggingface.co/spaces/XuemeiTang/LLM4LitReview_Benchmark/resolve/main/llm_litReview_images/t2_true_entailment.png" width="150"><br>
#         Abstract Writing: True
#     </td>
#     <td align="center">
#         <img src="https://huggingface.co/spaces/XuemeiTang/LLM4LitReview_Benchmark/resolve/main/llm_litReview_images/acc_score_t3.png" width="150"><br>
#         Review Composition: Precision
#     </td>
#     <td align="center">
#         <img src="https://huggingface.co/spaces/XuemeiTang/LLM4LitReview_Benchmark/resolve/main/llm_litReview_images/kpr_score.png" width="150"><br>
#         Literature Review Writing: KPR
#     </td>
# </tr>
# </table>
# """

EVALUATION_QUEUE_TEXT = """"""
# Which evaluations are you running? How can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
## Introduction

The **LLM4LitReview Leaderboard** is dedicated to evaluating the capabilities of large language models (LLMs) in automating academic writing tasks, specifically literature review generation.

We focus on three subtasks:
1. **Reference Generation** – accuracy and validity of citations.
2. **Abstract Writing** – semantic coverage and factual consistency.
3. **Review Composition** – accuracy and validity of citations, together with semantic coverage and factual consistency.

This benchmark provides a structured and reproducible framework for assessing how close LLMs are to human-level academic writing quality.


---

## Evaluation Dataset

Our evaluation dataset includes:
- 1015 academic papers sampled from open-access journals across multiple disciplines.
- For each paper, models generate:
  - A reference list based on the given title and keywords
  - An abstract based on the given title and keywords
  - A short literature review paragraph based on the provided title, keywords, and abstract

---

## Metrics Explained
- **Precision** – Proportion of generated references that correspond to real, verifiable academic papers.
- **Title Search Rate** – Whether a generated reference can be found by a title search in Semantic Scholar.
- **Overlap** – Overlap between the references cited by the LLM and those cited in the human-written paper.
- **Similarity** – Semantic similarity between the LLM-generated and the human-written abstract.
- **Entailment (TRUE %)** – Factual consistency between the LLM-generated and the human-written text, judged by the TRUE NLI model.
- **Entailment (GPT-4o %)** – Factual consistency between the LLM-generated and the human-written text, judged by GPT-4o.
- **ROUGE-1, ROUGE-2, ROUGE-L** – Standard text-generation metrics based on n-gram overlap with the human-written text.
- **KPR (Key Point Recall)** – How well key points from the source documents are captured in the generated text.

---

"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@misc{tang2025largelanguagemodelsautomated,
      title={Large Language Models for Automated Literature Review: An Evaluation of Reference Generation, Abstract Writing, and Review Composition}, 
      author={Xuemei Tang and Xufeng Duan and Zhenguang G. Cai},
      year={2025},
      eprint={2412.13612},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2412.13612}, 
}"""