Update app.py
app.py
CHANGED

@@ -1,4 +1,4 @@
-# backend/main.py
+# app.py (was: backend/main.py)
 import os
 import re
 import io

@@ -39,6 +39,7 @@ except Exception:
 
 # GECToR (neural grammatical error correction)
 try:
+    # This is the official import path from the gotutiyan/gector README
     from gector import GECToR, predict as gector_predict, load_verb_dict
 except Exception:
     GECToR = None
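
Note: the loading block below also uses torch and transformers.AutoTokenizer, which this import hunk does not touch; presumably both are already imported near the top of app.py for the other models. If they are not, the file would additionally need these two imports (a minimal sketch, assuming torch and transformers are already installed in the image):

import torch
from transformers import AutoTokenizer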

@@ -383,55 +384,42 @@ except Exception as e:
 
 # ------------------ GECToR LOADING (Neural GEC) ------------------
 GEC_MODEL = None
+GEC_TOKENIZER = None
+GEC_ENCODE = None
+GEC_DECODE = None
+GEC_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-try:
-    print("[GECToR] Initializing model... (This may take 30s)")
-    GEC_MODEL = GecBERTModel(
-        vocab_path="/app/data",  # Directory containing verb-form-vocab.txt
-        model_paths=["/app/data/gector_model.th"],
-        model_name='roberta-base',
-        max_len=50,
-        min_len=3,
-        iterations=5,
-        min_error_probability=0.0,
-        lowercase_tokens=0,
-        special_tokens_fix=1,
-        log=False,
-        is_ensemble=0,
-        weigths=None,
-        confidence=0,
-        del_confidence=0
-    )
-except Exception as e:
-    GEC_MODEL = None
-    print(f"[GECToR] Failed to load. Error: {e}")
-    print("[GECToR] Ensure you updated Dockerfile to download 'gector_model.th'")
+if GECToR is not None and gector_predict is not None and load_verb_dict is not None:
+    try:
+        print("[GECToR] Initializing model... (This may take a bit on first run)")
+        GEC_MODEL_ID = os.getenv("GEC_MODEL_ID", "gotutiyan/gector-roberta-base-5k")
+        VERB_DICT_PATH = os.getenv("GEC_VERB_DICT", "/app/data/verb-form-vocab.txt")
+
+        GEC_MODEL = GECToR.from_pretrained(GEC_MODEL_ID).to(GEC_DEVICE)
+        GEC_TOKENIZER = AutoTokenizer.from_pretrained(GEC_MODEL_ID)
+        GEC_ENCODE, GEC_DECODE = load_verb_dict(VERB_DICT_PATH)
+
+        print(f"[GECToR] Model & verb dict loaded: {GEC_MODEL_ID}")
+    except Exception as e:
+        print(f"[GECToR] Failed to load. Error: {e}")
+        GEC_MODEL = None
+        GEC_TOKENIZER = None
+        GEC_ENCODE = None
+        GEC_DECODE = None
+else:
+    print("[GECToR] Library not available; skipping neural GEC.")
 
 def gector_correct(text: str):
     """
-    Run neural grammatical error correction using GECToR.
+    Run neural grammatical error correction using GECToR (gotutiyan implementation).
     """
-
-    if GEC_MODEL is None:
+    if GEC_MODEL is None or GEC_TOKENIZER is None or GEC_ENCODE is None or GEC_DECODE is None:
         print("[GECToR] Model not loaded, skipping.")
-        return text, 0, len(text.split())
+        return text, 0, len(text.split()) if text.strip() else 0
 
-    # 2. Safety Truncate (Server protection)
     parts = text.strip().split()
+    # Safety truncate (protect server)
     if len(parts) > 1000:
         text_proc = " ".join(parts[:1000])
     else:
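
The new loading path follows the gotutiyan/gector README and can be smoke-tested outside the app. A minimal sketch, assuming the gector package is installed, the Hugging Face model id gotutiyan/gector-roberta-base-5k is reachable, and verb-form-vocab.txt sits at the path used above (all carried over from the diff's env-var defaults):

import torch
from transformers import AutoTokenizer
from gector import GECToR, predict, load_verb_dict

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "gotutiyan/gector-roberta-base-5k"

model = GECToR.from_pretrained(model_id).to(device)               # weights from the HF Hub
tokenizer = AutoTokenizer.from_pretrained(model_id)               # matching tokenizer
encode, decode = load_verb_dict("/app/data/verb-form-vocab.txt")  # verb-form lookup tables

# predict() takes raw sentence strings and returns corrected strings.
out = predict(model, tokenizer, ["this are a test ."], encode, decode,
              keep_confidence=0.0, min_error_prob=0.0, n_iteration=5, batch_size=2)
print(out[0])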

@@ -440,37 +428,31 @@ def gector_correct(text: str):
     if not text_proc:
         return text_proc, 0, 0
 
-
-    # GECToR expects a list of token lists: [['Hello', 'world'], ['How', 'are', 'you']]
-    sentences = re.split(r"(?<=[.!?])\s+", text_proc)
-    batch = [s.strip().split() for s in sentences if s.strip()]
-
-    if not batch:
-        return text_proc, 0, 0
+    srcs = [text_proc]
 
     try:
-
-
-
-
-
-
+        corrected_list = gector_predict(
+            GEC_MODEL,
+            GEC_TOKENIZER,
+            srcs,
+            GEC_ENCODE,
+            GEC_DECODE,
+            keep_confidence=0.0,
+            min_error_prob=0.0,
+            n_iteration=5,
+            batch_size=2,
         )
-
-
-
-
-
-
-
-        original_words = len(text_proc.split())
-        corrections = sum(1 for a, b in zip(text_proc.split(), corrected_text.split()) if a != b)
-
+        corrected_text = corrected_list[0]
+
+        orig_tokens = text_proc.split()
+        corr_tokens = corrected_text.split()
+        corrections = sum(1 for a, b in zip(orig_tokens, corr_tokens) if a != b)
+        original_words = len(orig_tokens)
+
         return corrected_text, corrections, original_words
 
     except Exception as e:
         print(f"[GECToR] Prediction error: {e}")
-        # Fallback to original text if crash
         return text_proc, 0, len(text_proc.split())
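
The helper keeps its (corrected_text, corrections, original_words) return contract, so existing callers are unaffected. A hypothetical call:

corrected, n_fixed, n_words = gector_correct("She go to school yesterday.")
# corrected: GECToR output (or the input text if the model never loaded)
# n_fixed:   count of positions where the corrected token differs from the original
# n_words:   token count of the (possibly truncated) input

One caveat: the zip-based count only compares aligned positions, so corrections that insert or delete tokens shift the alignment and can inflate n_fixed (every later position mismatches) or undercount it (zip stops at the shorter sequence).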

@@ -692,8 +674,6 @@ def corpus_plagiarism_combined(text: str):
     for row in top:
         matches.append({
             "title": corpus_titles[row["index"]],
-
-
             "score": round(row["combined"] * 100, 2),
             "tfidf_score": round(row["tfidf"] * 100, 2) if row["tfidf"] is not None else None,
             "semantic_score": round(row["semantic"] * 100, 2) if row["semantic"] is not None else None,
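
For context, each matches entry comes out shaped like the following (values hypothetical; a sub-score stays None when that scorer is unavailable):

{"title": "Some corpus document", "score": 87.5, "tfidf_score": 91.2, "semantic_score": 83.9}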

@@ -965,4 +945,4 @@ def api_history(user=Depends(get_current_user)):
 
 @app.get("/")
 def read_root():
-    return {"status": "Backend is running with 16GB RAM!"}
+    return {"status": "Backend is running with GECToR + 16GB RAM!"}