Update app.py
app.py
CHANGED

@@ -1,4 +1,4 @@
-# backend/main.py
+# app.py (was: backend/main.py)
 import os
 import re
 import io

@@ -39,6 +39,7 @@ except Exception:
 
 # GECToR (neural grammatical error correction)
 try:
+    # This is the official import path from the gotutiyan/gector README
     from gector import GECToR, predict as gector_predict, load_verb_dict
 except Exception:
     GECToR = None
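
Note: the loading block below also uses torch and transformers.AutoTokenizer, which this import hunk does not touch; presumably both are already imported near the top of app.py for the other models. If they are not, the file would additionally need these two imports (a minimal sketch, assuming torch and transformers are already installed in the image):

import torch
from transformers import AutoTokenizer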

@@ -383,55 +384,42 @@ except Exception as e:
 
 # ------------------ GECToR LOADING (Neural GEC) ------------------
 GEC_MODEL = None
+GEC_TOKENIZER = None
+GEC_ENCODE = None
+GEC_DECODE = None
+GEC_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-try:
-    print("[GECToR] Initializing model... (This may take 30s)")
-    GEC_MODEL = GecBERTModel(
-        vocab_path="/app/data",  # Directory containing verb-form-vocab.txt
-        model_paths=["/app/data/gector_model.th"],
-        model_name='roberta-base',
-        max_len=50,
-        min_len=3,
-        iterations=5,
-        min_error_probability=0.0,
-        lowercase_tokens=0,
-        special_tokens_fix=1,
-        log=False,
-        is_ensemble=0,
-        weigths=None,
-        confidence=0,
-        del_confidence=0
-    )
-except Exception as e:
-    GEC_MODEL = None
-    print(f"[GECToR] Failed to load. Error: {e}")
-    print("[GECToR] Ensure you updated Dockerfile to download 'gector_model.th'")
+if GECToR is not None and gector_predict is not None and load_verb_dict is not None:
+    try:
+        print("[GECToR] Initializing model... (This may take a bit on first run)")
+        GEC_MODEL_ID = os.getenv("GEC_MODEL_ID", "gotutiyan/gector-roberta-base-5k")
+        VERB_DICT_PATH = os.getenv("GEC_VERB_DICT", "/app/data/verb-form-vocab.txt")
+
+        GEC_MODEL = GECToR.from_pretrained(GEC_MODEL_ID).to(GEC_DEVICE)
+        GEC_TOKENIZER = AutoTokenizer.from_pretrained(GEC_MODEL_ID)
+        GEC_ENCODE, GEC_DECODE = load_verb_dict(VERB_DICT_PATH)
+
+        print(f"[GECToR] Model & verb dict loaded: {GEC_MODEL_ID}")
+    except Exception as e:
+        print(f"[GECToR] Failed to load. Error: {e}")
+        GEC_MODEL = None
+        GEC_TOKENIZER = None
+        GEC_ENCODE = None
+        GEC_DECODE = None
+else:
+    print("[GECToR] Library not available; skipping neural GEC.")
 
 def gector_correct(text: str):
     """
-    Run neural grammatical error correction using GECToR.
+    Run neural grammatical error correction using GECToR (gotutiyan implementation).
     """
-
-    if GEC_MODEL is None:
+    if GEC_MODEL is None or GEC_TOKENIZER is None or GEC_ENCODE is None or GEC_DECODE is None:
         print("[GECToR] Model not loaded, skipping.")
-        return text, 0, len(text.split())
+        return text, 0, len(text.split()) if text.strip() else 0
 
-    # 2. Safety Truncate (Server protection)
     parts = text.strip().split()
+    # Safety truncate (protect server)
     if len(parts) > 1000:
         text_proc = " ".join(parts[:1000])
     else:
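
The new loading path follows the gotutiyan/gector README and can be smoke-tested outside the app. A minimal sketch, assuming the gector package is installed, the Hugging Face model id gotutiyan/gector-roberta-base-5k is reachable, and verb-form-vocab.txt sits at the path used above (all carried over from the diff's env-var defaults):

import torch
from transformers import AutoTokenizer
from gector import GECToR, predict, load_verb_dict

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "gotutiyan/gector-roberta-base-5k"

model = GECToR.from_pretrained(model_id).to(device)               # weights from the HF Hub
tokenizer = AutoTokenizer.from_pretrained(model_id)               # matching tokenizer
encode, decode = load_verb_dict("/app/data/verb-form-vocab.txt")  # verb-form lookup tables

# predict() takes raw sentence strings and returns corrected strings.
out = predict(model, tokenizer, ["this are a test ."], encode, decode,
              keep_confidence=0.0, min_error_prob=0.0, n_iteration=5, batch_size=2)
print(out[0])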

@@ -440,37 +428,31 @@ def gector_correct(text: str):
     if not text_proc:
         return text_proc, 0, 0
 
-
-    # GECToR expects a list of token lists: [['Hello', 'world'], ['How', 'are', 'you']]
-    sentences = re.split(r"(?<=[.!?])\s+", text_proc)
-    batch = [s.strip().split() for s in sentences if s.strip()]
-
-    if not batch:
-        return text_proc, 0, 0
+    srcs = [text_proc]
 
     try:
-
-
-
-
-
-
+        corrected_list = gector_predict(
+            GEC_MODEL,
+            GEC_TOKENIZER,
+            srcs,
+            GEC_ENCODE,
+            GEC_DECODE,
+            keep_confidence=0.0,
+            min_error_prob=0.0,
+            n_iteration=5,
+            batch_size=2,
         )
-
-
-
-
-
-
-
-        original_words = len(text_proc.split())
-        corrections = sum(1 for a, b in zip(text_proc.split(), corrected_text.split()) if a != b)
-
+        corrected_text = corrected_list[0]
+
+        orig_tokens = text_proc.split()
+        corr_tokens = corrected_text.split()
+        corrections = sum(1 for a, b in zip(orig_tokens, corr_tokens) if a != b)
+        original_words = len(orig_tokens)
+
         return corrected_text, corrections, original_words
 
     except Exception as e:
         print(f"[GECToR] Prediction error: {e}")
-        # Fallback to original text if crash
         return text_proc, 0, len(text_proc.split())
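
The helper keeps its (corrected_text, corrections, original_words) return contract, so existing callers are unaffected. A hypothetical call:

corrected, n_fixed, n_words = gector_correct("She go to school yesterday.")
# corrected: GECToR output (or the input text if the model never loaded)
# n_fixed:   count of positions where the corrected token differs from the original
# n_words:   token count of the (possibly truncated) input

One caveat: the zip-based count only compares aligned positions, so corrections that insert or delete tokens shift the alignment and can inflate n_fixed (every later position mismatches) or undercount it (zip stops at the shorter sequence).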

@@ -692,8 +674,6 @@ def corpus_plagiarism_combined(text: str):
     for row in top:
         matches.append({
             "title": corpus_titles[row["index"]],
-
-
             "score": round(row["combined"] * 100, 2),
             "tfidf_score": round(row["tfidf"] * 100, 2) if row["tfidf"] is not None else None,
             "semantic_score": round(row["semantic"] * 100, 2) if row["semantic"] is not None else None,
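
For context, each matches entry comes out shaped like the following (values hypothetical; a sub-score stays None when that scorer is unavailable):

{"title": "Some corpus document", "score": 87.5, "tfidf_score": 91.2, "semantic_score": 83.9}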

@@ -965,4 +945,4 @@ def api_history(user=Depends(get_current_user)):
 
 @app.get("/")
 def read_root():
-    return {"status": "Backend is running with 16GB RAM!"}
+    return {"status": "Backend is running with GECToR + 16GB RAM!"}