Spaces:

XuemeiTang
/

LLM4LitReview_Benchmark

Running

tangtang

Update space1

03ff9a5 about 2 months ago

6.44 kB

	from dataclasses import dataclass
	from enum import Enum

	@dataclass
	class Task:
	benchmark: str
	metric: str
	col_name: str


	class Tasks(Enum):
	# task_key in the json file, metric_key in the json file, name to display in the leaderboard
	Task1_Title_Search_Rate = Task("(Task1) Title Search Rate", "(Task1) Title Search Rate","(T1) Title Search Rate (%)")
	Task1_Precision = Task("(Task1) Precision", "(Task1) Precision","(T1) Precision (%)")
	Task1_Overlap = Task("(Task1) Overlap", "(Task1) Overlap","(T1) Overlap (%)")
	Task1_Precision_First_Author = Task("(Task1) Precision (First Author)", "(Task1) Precision (First Author)","(T1) Precision (First Author) (%)")
	Task1_Overlap_First_Author = Task("(Task1) Overlap (First Author)", "(Task1) Overlap (First Author)","(T1) Overlap (First Author) (%)")
	# Task2
	Task2_Similarity = Task("(Task2) Similarity", "(Task2) Similarity", "(T2) Similarity (%)")
	Task2_Entail_TRUE = Task("(Task2) Entail (TRUE)", "(Task2) Entail (TRUE)", "(T2) Entail (TRUE %)")
	Task2_Entail_GPT_4o = Task("(Task2) Entail (GPT-4o)", "(Task2) Entail (GPT-4o)", "(T2) Entail (GPT-4o %)")
	Task2_ROUGE_1 = Task("(Task2) ROUGE-1", "(Task2) ROUGE-1", "(T2) ROUGE-1 (%)")
	Task2_ROUGE_2 = Task("(Task2) ROUGE-2", "(Task2) ROUGE-2", "(T2) ROUGE-2 (%)")
	Task2_ROUGE_L = Task("(Task2) ROUGE-L", "(Task2) ROUGE-L", "(T2) ROUGE-L (%)")

	# Task3
	Task3_Precision = Task("(Task3) Precision", "(Task3) Precision", "(T3) Precision (%)")
	Task3_Title_Search_Rate = Task("(Task3) Title Search Rate", "(Task3) Title Search Rate", "(T3) Title Search Rate (%)")
	Task3_Overlap = Task("(Task3) Overlap", "(Task3) Overlap", "(T3) Overlap (%)")
	Task3_KPR = Task("(Task3) KPR", "(Task3) KPR", "(T3) KPR (%)")
	Task3_ROUGE_1 = Task("(Task3) ROUGE-1", "(Task3) ROUGE-1", "(T3) ROUGE-1 (%)")
	Task3_ROUGE_2 = Task("(Task3) ROUGE-2", "(Task3) ROUGE-2", "(T3) ROUGE-2 (%)")
	Task3_ROUGE_L = Task("(Task3) ROUGE-L", "(Task3) ROUGE-L", "(T3) ROUGE-L (%)")

	# Your leaderboard name
	TITLE = """<h1 align="center" id="space-title">LLM-Based Automated Literature Review Evaluation Benchmark</h1>"""

	# What does your leaderboard evaluate?
	# Which evaluations are you running? how can people reproduce what you have?

	INTRODUCTION_TEXT = """
	This leaderboard evaluates Large Language Models (LLMs) on their ability to perform automated literature review tasks, including reference generation, abstract writing, and review composition.<br>
	It is based on the study: <b>Large Language Models for Automated Literature Review: An Evaluation of Reference Generation, Abstract Writing, and Review Composition.</b><br>
	The leaderboard measures how well different models perform in references generation, factually consistent, and stylistically appropriate academic texts.<br><br>
	"""


	# <table>
	# <tr>
	# <td align="center">
	# <img src="https://huggingface.co/spaces/XuemeiTang/LLM4LitReview_Benchmark/resolve/main/llm_litReview_images/acc_score.png" width="150"><br>
	# Reference Generation: Precision
	# </td>
	# <td align="center">
	# <img src="https://huggingface.co/spaces/XuemeiTang/LLM4LitReview_Benchmark/resolve/main/llm_litReview_images/t2_true_entailment.png" width="150"><br>
	# Abstract Writing: True
	# </td>
	# <td align="center">
	# <img src="https://huggingface.co/spaces/XuemeiTang/LLM4LitReview_Benchmark/resolve/main/llm_litReview_images/acc_score_t3.png" width="150"><br>
	# Review Composition: Precision
	# </td>
	# <td align="center">
	# <img src="https://huggingface.co/spaces/XuemeiTang/LLM4LitReview_Benchmark/resolve/main/llm_litReview_images/kpr_score.png" width="150"><br>
	# Literature Review Writing: KPR
	# </td>
	# </tr>
	# </table>
	# """

	EVALUATION_QUEUE_TEXT = """"""
	# Which evaluations are you running? How can people reproduce what you have?
	LLM_BENCHMARKS_TEXT = """
	## Introduction

	The LLM4LitReview Leaderboard is dedicated to evaluating the capabilities of large language models (LLMs) in automating academic writing tasks, specifically literature review generation.

	We focus on three subtasks:
	1. Reference Generation – accuracy and validity of citations.
	2. Abstract Writing – semantic coverage and factual consistency.
	3. Review Composition – accuracy and validity of citations and semantic coverage and factual consistency.

	This benchmark provides a structured and reproducible framework for assessing how close LLMs are to human-level academic writing quality.


	---

	## Evaluation Dataset

	Our evaluation dataset includes:
	- 1015 academic papers sampled from open-access journals across multiple disciplines.
	- For each paper, models generate:
	- A reference list according to the given title and key words
	- An abstract summarizing according to the given title and key words
	- A short literature review paragraph based on the provided title, keywords, and abstract

	---

	## Metrics Explained
	- Precision – Precision of references that correspond to real, verifiable academic papers.
	- Title_search_rate – Whether the generated paper can be searched by title in Semantic Scholar.
	- Overlap_rate – LLM-cited vs human-cited references.
	- Similarity – Semantic similarity between model and human-generated texts (human-written and LLM-generated abstract).
	- Entailment (TRUE %) – Factual consistency between model and human-generated texts (Use NLI model TRUE as the evaluator).
	- Entailment (GPT-4o%) – Factual consistency between model and human-generated texts (Use GPT-4o as the evaluator).
	- ROUGE-1, ROUGE-2, ROUGE-L – Standard metrics for evaluating text generation quality based on n-gram Overlap with human-written texts.
	- KPR (Key Point Recall) – Measures how well key points from source documents are captured in the generated text.
	---

	"""

	CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
	CITATION_BUTTON_TEXT = r"""
	@misc{tang2025largelanguagemodelsautomated,
	title={Large Language Models for Automated Literature Review: An Evaluation of Reference Generation, Abstract Writing, and Review Composition},
	author={Xuemei Tang and Xufeng Duan and Zhenguang G. Cai},
	year={2025},
	eprint={2412.13612},
	archivePrefix={arXiv},
	primaryClass={cs.CL},
	url={https://arxiv.org/abs/2412.13612},
	}"""