Aidan Phillips committed
Commit · 0885169
Parent(s): f5893dd

accuracy scoring pretty good
- categories/accuracy.py +146 -0
- categories/fluency.py +103 -66
- scorer.ipynb +44 -22
categories/accuracy.py
CHANGED
@@ -0,0 +1,146 @@
+import string
+
+import torch
+from scipy.spatial.distance import cosine
+from simalign import SentenceAligner
+from transformers import AutoModel, AutoTokenizer
+
+# setup global variables on import (bad practice, but whatever)
+# --------------------------------------------------------------
+
+aligner = SentenceAligner(model="distilbert-base-multilingual-cased", layer=6)
+tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
+model = AutoModel.from_pretrained("distilbert-base-multilingual-cased")
+
+
+def accuracy(src_sentence: str, trg_sentence: str) -> dict:
+    """
+    Calculate the accuracy of a translation by comparing the source and target
+    sentences.
+
+    Parameters:
+        src_sentence (str): The source sentence.
+        trg_sentence (str): The target sentence.
+
+    Returns:
+        dict: A dictionary containing the accuracy score and errors.
+    """
+    # Preprocess both sentences
+    src_sentence = __preprocess_text(src_sentence)
+    trg_sentence = __preprocess_text(trg_sentence)
+
+    r = __get_alignment_score(src_sentence, trg_sentence)
+    score = __get_bertscore(src_sentence, trg_sentence)
+
+    res = {"score": __bertscore_to_percentage(score), "errors": r}
+    return res
+
+
+def __preprocess_text(text: str) -> str:
+    """
+    Remove punctuation and convert text to lowercase.
+
+    Parameters:
+        text (str): The text to preprocess.
+
+    Returns:
+        str: The preprocessed text.
+    """
+    # Remove punctuation
+    text = text.translate(str.maketrans("", "", string.punctuation))
+    # Convert to lowercase
+    text = text.lower()
+    return text
+
+
+def __get_bertscore(src_sentence: str, trg_sentence: str) -> float:
+    """
+    Get a BERTScore-style similarity between two sentences: the cosine
+    similarity of their mean-pooled DistilBERT embeddings.
+
+    Parameters:
+        src_sentence (str): The source sentence.
+        trg_sentence (str): The target sentence.
+
+    Returns:
+        float: The cosine similarity.
+    """
+    # Tokenize and generate embeddings
+    inputs_src = tokenizer(
+        src_sentence, return_tensors="pt", padding=True, truncation=True
+    )
+    inputs_trg = tokenizer(
+        trg_sentence, return_tensors="pt", padding=True, truncation=True
+    )
+
+    with torch.no_grad():
+        outputs_src = model(**inputs_src)
+        outputs_trg = model(**inputs_trg)
+
+    # Get sentence embeddings by averaging token embeddings (from last hidden state)
+    src_embedding = torch.mean(outputs_src.last_hidden_state, dim=1).squeeze().numpy()
+    trg_embedding = torch.mean(outputs_trg.last_hidden_state, dim=1).squeeze().numpy()
+
+    # Calculate cosine similarity (1 - cosine distance)
+    similarity = 1 - cosine(src_embedding, trg_embedding)
+
+    return similarity
+
+
+def __bertscore_to_percentage(similarity: float) -> float:
+    """
+    Convert the cosine similarity to a percentage score (0-100).
+
+    Parameters:
+        similarity (float): The cosine similarity from __get_bertscore.
+
+    Returns:
+        float: A score from 0 to 100.
+    """
+    # Scale the similarity to a percentage, clamping negative values to 0
+    # (cosine similarity is rarely negative)
+    scaled_score = max((similarity / 2) * 100, 0)
+    return round(scaled_score, 2)
+
+
+def __get_alignment_score(src_sentence: str, trg_sentence: str) -> list:
+    """
+    Align words between the source and target sentences and flag any word that
+    could not be aligned.
+
+    Parameters:
+        src_sentence (str): The source sentence.
+        trg_sentence (str): The target sentence.
+
+    Returns:
+        list: Possible mistranslations.
+    """
+    src_list = src_sentence.split()
+    trg_list = trg_sentence.split()
+
+    # The output is a dictionary with different matching methods.
+    # Each method has a list of pairs indicating the indexes of aligned words
+    # (the alignments are zero-indexed).
+    alignments = aligner.get_word_aligns(src_list, trg_list)
+
+    src_aligns = {x[0] for x in alignments["inter"]}
+    trg_aligns = {x[1] for x in alignments["inter"]}
+
+    mistranslations = []
+    for i in range(len(src_list)):
+        if i not in src_aligns:
+            mistranslations.append(
+                {
+                    "start": i,
+                    "end": i,
+                    "message": f"Word {src_list[i]} possibly mistranslated or omitted",
+                }
+            )
+
+    for i in range(len(trg_list)):
+        if i not in trg_aligns:
+            mistranslations.append(
+                {
+                    "start": i,
+                    "end": i,
+                    "message": f"Word {trg_list[i]} possibly mistranslated or added erroneously",
+                }
+            )
+
+    return mistranslations
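For context, a minimal sketch of how the new module is meant to be called. The English target sentence here is my own example, not from the commit, and the score and error values depend on the model weights, so treat the output as illustrative:

```python
from categories.accuracy import accuracy

# Hypothetical sentence pair; "Das ist ein Test." is German for "This is a test."
result = accuracy("Das ist ein Test.", "This is a test.")

print(result["score"])        # percentage derived from embedding cosine similarity
for err in result["errors"]:  # words simalign could not align between the sentences
    print(err["start"], err["end"], err["message"])
```

Each error uses the same {"start", "end", "message"} shape as the fluency checkers, with indices into the whitespace-split sentence, so results from all checkers can be concatenated downstream (the notebook below does exactly that).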
categories/fluency.py
CHANGED
@@ -1,28 +1,29 @@
 import language_tool_python
-from transformers import AutoTokenizer, AutoModelForMaskedLM
-import torch
 import numpy as np
 import spacy
+import torch
 import wordfreq
+from transformers import AutoModelForMaskedLM, AutoTokenizer
 
 # setup global variables on import (bad practice, but whatever)
+# --------------------------------------------------------------
 
 # grammar checker
+tool = language_tool_python.LanguageTool("en-US")
 
 # masked language model and tokenizer from huggingface
-model_name="distilbert-base-multilingual-cased"
+model_name = "distilbert-base-multilingual-cased"
 model = AutoModelForMaskedLM.from_pretrained(model_name)
 model.eval()
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)  # tokenizer
 
 # spacy model for parsing
 nlp = spacy.load("en_core_web_sm")
 
+
+def __get_rarity(word: str, lang: str = "en") -> float:
     """
+    Returns the rarity of a word in the given language. wordfreq returns a value
     between 0 and 1, where 1 is the most common word. Therefore, taking the log results
     in a value between 0 (log 1 = 0) and -27.63 (log 1e-12). We then negate it so super
     rare words have a high score and common words have a low score.
@@ -30,20 +31,21 @@ def __get_rarity(word, lang="en") -> float:
     Parameters:
         word (str): The word to check.
         lang (str): The language to check. Default is "en".
+
     Returns:
         float: The rarity of the word.
     """
     return -np.log(wordfreq.word_frequency(word, lang) + 1e-12)
 
+
+def __produce_groupings(offset_mapping: list, input_ids: list) -> list:
     """
     Produce groupings of tokens that are part of the same word.
 
     Parameters:
         offset_mapping (list): The offset mapping of the tokens.
         input_ids (list): The input ids of the tokens.
+
     Returns:
         list: A list of groupings of tokens.
     """
@@ -64,10 +66,11 @@ def __produce_groupings(offset_mapping, input_ids):
     # Append final group
     if current_group:
         res.append(current_group)
+
     return res
 
+
+def pseudo_perplexity(text: str, threshold: float = 4, max_len: int = 128) -> dict:
     """
     Calculate the pseudo-perplexity of a text using a masked language model. Return all
     words that exceed a threshold of "adjusted awkwardness". The threshold is a measure
@@ -77,7 +80,7 @@ def pseudo_perplexity(text, threshold=4, max_len=128):
         text (str): The text to check.
         threshold (float): The threshold for awkwardness. Default is 4.
         max_len (int): The maximum length of the text. Default is 128.
+
     Returns:
         dict: A dictionary containing the score and errors.
     """
@@ -94,7 +97,7 @@ def pseudo_perplexity(text, threshold=4, max_len=128):
     for group in word_groups:
         # Skip special tokens (CLS and SEP)
         if group[0] == 0 or group[-1] == len(input_ids) - 1:
+            continue
 
         # Mask the word group
         masked = input_ids.clone()
@@ -119,7 +122,9 @@ def pseudo_perplexity(text, threshold=4, max_len=128):
         word_loss = -np.sum(log_probs) / len(log_probs)
         # Adjust the loss based on the rarity of the word
         word = tokenizer.decode(input_ids[group[0]])
+        word_loss -= 0.6 * __get_rarity(
+            word
+        )  # subtract scaled rarity so rare words are penalized less
         loss_values.append(word_loss)
 
     # Structure the results for output
@@ -129,22 +134,24 @@ def pseudo_perplexity(text, threshold=4, max_len=128):
     for i, l in enumerate(loss_values):
         if l < threshold:
             continue
+        errors.append(
+            {
+                "start": i,
+                "end": i,
+                "message": f"Adjusted likelihood {l} over threshold {threshold}",
+            }
+        )
 
-    res = {
-        "score": __fluency_score(average_loss),
-        "errors": errors
-    }
+    res = {"score": __fluency_score(average_loss), "errors": errors}
 
     return res
 
+
+def __fluency_score(
+    loss: float, midpoint: float = 5.0, steepness: float = 0.3
+) -> float:
     """
+    Transform the loss into a score from 0 to 100. Steepness controls how quickly the
     score drops as loss increases and midpoint controls the loss at which the score is
     50.
 
@@ -152,20 +159,21 @@ def __fluency_score(loss, midpoint=5, steepness=0.3):
        loss (float): The loss to transform.
        midpoint (float): The loss at which the score is 50. Default is 5.
        steepness (float): The steepness of the curve. Default is 0.3.
+
     Returns:
         float: The score from 0 to 100.
     """
     score = 100 / (1 + np.exp(steepness * (loss - midpoint)))
     return round(score, 2)
 
+
+def grammar_errors(text: str) -> dict:
     """
     Check the grammar of a text using a grammar checker and a structural grammar check.
 
     Parameters:
         text (str): The text to check.
+
     Returns:
         dict: A dictionary containing the score and errors.
     """
@@ -195,83 +203,112 @@ def grammar_errors(text) -> tuple[int, list[str]]:
 
     grammar_score = len(r) / len(text.split())
 
-    res = {
-        "score": __grammar_score_from_prob(grammar_score),
-        "errors": r
-    }
+    res = {"score": __grammar_score_from_prob(grammar_score), "errors": r}
 
     return res
 
+
+def __grammar_score_from_prob(error_ratio: float) -> float:
     """
     Transform the number of errors divided by words into a score from 0 to 100.
+
+    Parameters:
+        error_ratio (float): The ratio of errors to words.
+
+    Returns:
+        float: The score from 0 to 100.
     """
-    score = 100*(1-error_ratio)
+    score = 100 * (1 - error_ratio)
    return round(score, 2)
 
 
-def __check_structural_grammar(text):
+def __check_structural_grammar(text: str) -> list:
+    """
+    Check the structural grammar of a text using spaCy.
+
+    Parameters:
+        text (str): The text to check.
+
+    Returns:
+        list: A list of structural grammar errors.
+    """
     doc = nlp(text)
     issues = []
 
     # 1. Missing main verb (ROOT)
+    root_verbs = [
+        tok for tok in doc if tok.dep_ == "ROOT" and tok.pos_ in {"VERB", "AUX"}
+    ]
     if not root_verbs:
         root_root = [tok for tok in doc if tok.dep_ == "ROOT"]
         token = root_root[0] if root_root else doc[0]
+        issues.append(
+            {
+                "start": token.i,
+                "end": token.i + 1,
+                "message": "Sentence is missing a main verb (no ROOT verb).",
+            }
+        )
 
     # 2. Verb(s) present but no subject
     verbs = [tok for tok in doc if tok.pos_ in {"VERB", "AUX"}]
     subjects = [tok for tok in doc if tok.dep_ in {"nsubj", "nsubjpass"}]
     if verbs and not subjects:
         for verb in verbs:
+            issues.append(
+                {
+                    "start": verb.i,
+                    "end": verb.i + 1,
+                    "message": "Sentence has verb(s) but no subject (possible fragment).",
+                }
+            )
 
     # 3. Dangling prepositions
     for tok in doc:
         if tok.pos_ == "ADP" and len(list(tok.children)) == 0:
+            issues.append(
+                {
+                    "start": tok.i,
+                    "end": tok.i + 1,
+                    "message": f"Dangling preposition '{tok.text}' (no object or complement).",
+                }
+            )
 
     # 4. Noun pile-up (no verbs, all tokens are nominal)
+    if not any(tok.pos_ in {"VERB", "AUX"} for tok in doc) and all(
+        tok.pos_ in {"NOUN", "PROPN", "ADJ", "DET", "NUM"}
+        for tok in doc
+        if tok.is_alpha
+    ):
         token = doc[0]
+        issues.append(
+            {
+                "start": token.i,
+                "end": token.i + 1,
+                "message": "Sentence lacks a verb or any verbal structure (nominal phrase pile-up).",
+            }
+        )
 
     # 5. Multiple ROOTs (possible run-on)
     root_count = sum(1 for tok in doc if tok.dep_ == "ROOT")
     if root_count > 1:
         for tok in doc:
             if tok.dep_ == "ROOT":
+                issues.append(
+                    {
+                        "start": tok.i,
+                        "end": tok.i + 1,
+                        "message": "Sentence has multiple ROOTs — possible run-on sentence.",
+                    }
+                )
 
     return issues
 
 
+# Unit tests can go here eventually
 def main():
     pass
 
+
 if __name__ == "__main__":
     main()
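A small sanity check on the new scoring curve (not part of the commit): the logistic in __fluency_score maps an average adjusted loss onto a 0-100 score, pinned to exactly 50 at the midpoint. The values printed below follow directly from the formula:

```python
import numpy as np

def fluency_score(loss: float, midpoint: float = 5.0, steepness: float = 0.3) -> float:
    # Same logistic mapping as __fluency_score above
    return round(100 / (1 + np.exp(steepness * (loss - midpoint))), 2)

for loss in [0.0, 2.5, 5.0, 7.5, 10.0]:
    print(loss, fluency_score(loss))  # loss == midpoint -> 50.0; lower loss -> higher score
```

Raising steepness makes the score fall faster around the midpoint, and the rarity adjustment in pseudo_perplexity (word_loss -= 0.6 * __get_rarity(word)) lowers the loss of rare words before this mapping and the awkwardness threshold are applied.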
scorer.ipynb
CHANGED
@@ -4,78 +4,100 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-04-08 22:18:10,848 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: distilbert-base-multilingual-cased\n",
+      "Initialized the EmbeddingLoader with model: distilbert-base-multilingual-cased\n"
+     ]
+    }
+   ],
    "source": [
+    "from categories.fluency import *\n",
+    "from categories.accuracy import *"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Sentence: The cat sat the quickly up apples banana.\n"
      ]
     }
    ],
    "source": [
+    "src_sent = \"Das ist ein Test.\"  # Example source sentence (German for \"This is a test.\")\n",
+    "trg_sent = input(f\"{src_sent}: \")  # Prompt the user for a translation\n",
     "\n",
+    "if trg_sent == \"\":\n",
+    "    trg_sent = \"The cat sat the quickly up apples banana.\"\n",
     "\n",
+    "print(\"Sentence:\", trg_sent)  # Print the input sentence\n",
     "\n",
+    "err = grammar_errors(trg_sent)  # Run the grammar checks\n",
+    "flu = pseudo_perplexity(trg_sent, threshold=3.1)  # Run the fluency check\n",
+    "acc = accuracy(src_sent, trg_sent)  # Run the accuracy check against the source"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "An apostrophe may be missing.: apples banana.\n",
+      "Adjusted likelihood 4.8056646935577145 over threshold 3.1: sat\n",
+      "Adjusted likelihood 4.473408069089179 over threshold 3.1: the\n",
+      "Adjusted likelihood 4.732453441503642 over threshold 3.1: quickly\n",
+      "Adjusted likelihood 5.1115574262487735 over threshold 3.1: apples\n",
+      "Word ist possibly mistranslated or omitted: cat\n",
+      "Word ein possibly mistranslated or omitted: sat\n",
+      "Word sat possibly mistranslated or added erroneously: sat\n",
+      "Word the possibly mistranslated or added erroneously: the\n",
+      "Word quickly possibly mistranslated or added erroneously: quickly\n",
+      "Word up possibly mistranslated or added erroneously: up\n",
+      "Word apples possibly mistranslated or added erroneously: apples\n",
+      "Word banana possibly mistranslated or added erroneously: banana.\n"
      ]
     }
    ],
    "source": [
-    "combined_err = err[\"errors\"] + flu[\"errors\"] # Combine the error counts from both functions\n",
+    "combined_err = err[\"errors\"] + flu[\"errors\"] + acc[\"errors\"]  # Combine the errors from all three checkers\n",
     "\n",
     "for e in combined_err:\n",
+    "    substr = \" \".join(trg_sent.split(\" \")[e[\"start\"]:e[\"end\"]+1])\n",
     "    print(f\"{e['message']}: {substr}\")  # Print the error messages\n"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Fluency Score: 76.61500000000001\n",
+      "Accuracy Score: 24.45\n"
      ]
     }
    ],
    "source": [
     "fluency_score = 0.5 * err[\"score\"] + 0.5 * flu[\"score\"]  # Calculate the fluency score\n",
+    "print(\"Fluency Score:\", round(fluency_score, 2))  # Print the fluency score\n",
+    "\n",
+    "print(\"Accuracy Score:\", acc[\"score\"])  # Print the accuracy score"
    ]
   }
  ],
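If the category scores eventually need to be collapsed into a single translation score, a weighted mean is the natural next step. A minimal sketch, with weights that are my assumption rather than anything in this commit:

```python
def overall_score(fluency: float, accuracy: float, w_fluency: float = 0.7) -> float:
    """Weighted average of the category scores; the 0.7/0.3 weights are illustrative."""
    return round(w_fluency * fluency + (1 - w_fluency) * accuracy, 2)

# Using the rounded scores from the notebook run above:
print(overall_score(76.62, 24.45))  # 60.97 with the assumed weights
```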