GopalKrushnaMahapatra committed on
Commit
ab44c3c
·
verified ·
1 Parent(s): e7d6b5a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -67
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # backend/main.py
2
  import os
3
  import re
4
  import io
@@ -39,6 +39,7 @@ except Exception:
39
 
40
  # GECToR (neural grammatical error correction)
41
  try:
 
42
  from gector import GECToR, predict as gector_predict, load_verb_dict
43
  except Exception:
44
  GECToR = None
@@ -383,55 +384,42 @@ except Exception as e:
383
 
384
  # ------------------ GECToR LOADING (Neural GEC) ------------------
385
  GEC_MODEL = None
 
 
 
 
386
 
387
- try:
388
- # Import specific classes from the installed library
389
- from gector.gec_model import GecBERTModel
390
- from gector.utils.helpers import load_verb_dict
391
-
392
- print("[GECToR] Initializing model... (This may take 30s)")
393
-
394
- GEC_MODEL = GecBERTModel(
395
- vocab_path="/app/data", # Directory containing verb-form-vocab.txt
396
- model_paths=["/app/data/gector_model.th"],
397
- model_name='roberta-base',
398
- max_len=50,
399
- min_len=3,
400
- iterations=5,
401
- min_error_probability=0.0,
402
- lowercase_tokens=0,
403
- special_tokens_fix=1,
404
- log=False,
405
- is_ensemble=0,
406
- weigths=None,
407
- confidence=0,
408
- del_confidence=0
409
- )
410
 
411
- # 2. Load and Attach the Verb Dictionary
412
- # This maps verb forms (e.g., "go" -> "gone")
413
- encode, decode = load_verb_dict("/app/data/verb-form-vocab.txt")
414
- GEC_MODEL.encode = encode
415
- GEC_MODEL.decode = decode
416
-
417
- print(f"[GECToR] Model & Verb Dict Loaded Successfully!")
 
 
 
 
 
 
418
 
419
- except Exception as e:
420
- GEC_MODEL = None
421
- print(f"[GECToR] Failed to load. Error: {e}")
422
- print("[GECToR] Ensure you updated Dockerfile to download 'gector_model.th'")
423
 
424
  def gector_correct(text: str):
425
  """
426
- Run neural grammatical error correction using GECToR.
427
  """
428
- # 1. Check if model is loaded
429
- if GEC_MODEL is None:
430
  print("[GECToR] Model not loaded, skipping.")
431
- return text, 0, len(text.split())
432
 
433
- # 2. Safety Truncate (Server protection)
434
  parts = text.strip().split()
 
435
  if len(parts) > 1000:
436
  text_proc = " ".join(parts[:1000])
437
  else:
@@ -440,37 +428,31 @@ def gector_correct(text: str):
440
  if not text_proc:
441
  return text_proc, 0, 0
442
 
443
- # 3. Split into sentences and then tokens
444
- # GECToR expects a list of token lists: [['Hello', 'world'], ['How', 'are', 'you']]
445
- sentences = re.split(r"(?<=[.!?])\s+", text_proc)
446
- batch = [s.strip().split() for s in sentences if s.strip()]
447
-
448
- if not batch:
449
- return text_proc, 0, 0
450
 
451
  try:
452
- # 4. Run Prediction
453
- # We pass the encode/decode maps we loaded earlier
454
- final_batch, total_updates = GEC_MODEL.handle_batch(
455
- batch,
456
- encode_mapping=GEC_MODEL.encode,
457
- decode_mapping=GEC_MODEL.decode
 
 
 
 
458
  )
459
-
460
- # 5. Reconstruct Text
461
- corrected_sentences = [" ".join(tokens) for tokens in final_batch]
462
- corrected_text = " ".join(corrected_sentences)
463
-
464
- # 6. Count Corrections
465
- # Simple word-by-word comparison
466
- original_words = len(text_proc.split())
467
- corrections = sum(1 for a, b in zip(text_proc.split(), corrected_text.split()) if a != b)
468
-
469
  return corrected_text, corrections, original_words
470
 
471
  except Exception as e:
472
  print(f"[GECToR] Prediction error: {e}")
473
- # Fallback to original text if crash
474
  return text_proc, 0, len(text_proc.split())
475
 
476
 
@@ -692,8 +674,6 @@ def corpus_plagiarism_combined(text: str):
692
  for row in top:
693
  matches.append({
694
  "title": corpus_titles[row["index"]],
695
-
696
-
697
  "score": round(row["combined"] * 100, 2),
698
  "tfidf_score": round(row["tfidf"] * 100, 2) if row["tfidf"] is not None else None,
699
  "semantic_score": round(row["semantic"] * 100, 2) if row["semantic"] is not None else None,
@@ -965,4 +945,4 @@ def api_history(user=Depends(get_current_user)):
965
 
966
  @app.get("/")
967
  def read_root():
968
- return {"status": "Backend is running with 16GB RAM!"}
 
1
+ # app.py (was: backend/main.py)
2
  import os
3
  import re
4
  import io
 
39
 
40
  # GECToR (neural grammatical error correction)
41
  try:
42
+ # This is the official import path from gotutiyan/gector README
43
  from gector import GECToR, predict as gector_predict, load_verb_dict
44
  except Exception:
45
  GECToR = None
 
384
 
385
  # ------------------ GECToR LOADING (Neural GEC) ------------------
386
  GEC_MODEL = None
387
+ GEC_TOKENIZER = None
388
+ GEC_ENCODE = None
389
+ GEC_DECODE = None
390
+ GEC_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
391
 
392
+ if GECToR is not None and gector_predict is not None and load_verb_dict is not None:
393
+ try:
394
+ print("[GECToR] Initializing model... (This may take a bit on first run)")
395
+ GEC_MODEL_ID = os.getenv("GEC_MODEL_ID", "gotutiyan/gector-roberta-base-5k")
396
+ VERB_DICT_PATH = os.getenv("GEC_VERB_DICT", "/app/data/verb-form-vocab.txt")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
 
398
+ GEC_MODEL = GECToR.from_pretrained(GEC_MODEL_ID).to(GEC_DEVICE)
399
+ GEC_TOKENIZER = AutoTokenizer.from_pretrained(GEC_MODEL_ID)
400
+ GEC_ENCODE, GEC_DECODE = load_verb_dict(VERB_DICT_PATH)
401
+
402
+ print(f"[GECToR] Model & verb dict loaded: {GEC_MODEL_ID}")
403
+ except Exception as e:
404
+ print(f"[GECToR] Failed to load. Error: {e}")
405
+ GEC_MODEL = None
406
+ GEC_TOKENIZER = None
407
+ GEC_ENCODE = None
408
+ GEC_DECODE = None
409
+ else:
410
+ print("[GECToR] Library not available; skipping neural GEC.")
411
 
 
 
 
 
412
 
413
  def gector_correct(text: str):
414
  """
415
+ Run neural grammatical error correction using GECToR (gotutiyan implementation).
416
  """
417
+ if GEC_MODEL is None or GEC_TOKENIZER is None or GEC_ENCODE is None or GEC_DECODE is None:
 
418
  print("[GECToR] Model not loaded, skipping.")
419
+ return text, 0, len(text.split()) if text.strip() else 0
420
 
 
421
  parts = text.strip().split()
422
+ # Safety truncate (protect server)
423
  if len(parts) > 1000:
424
  text_proc = " ".join(parts[:1000])
425
  else:
 
428
  if not text_proc:
429
  return text_proc, 0, 0
430
 
431
+ srcs = [text_proc]
 
 
 
 
 
 
432
 
433
  try:
434
+ corrected_list = gector_predict(
435
+ GEC_MODEL,
436
+ GEC_TOKENIZER,
437
+ srcs,
438
+ GEC_ENCODE,
439
+ GEC_DECODE,
440
+ keep_confidence=0.0,
441
+ min_error_prob=0.0,
442
+ n_iteration=5,
443
+ batch_size=2,
444
  )
445
+ corrected_text = corrected_list[0]
446
+
447
+ orig_tokens = text_proc.split()
448
+ corr_tokens = corrected_text.split()
449
+ corrections = sum(1 for a, b in zip(orig_tokens, corr_tokens) if a != b)
450
+ original_words = len(orig_tokens)
451
+
 
 
 
452
  return corrected_text, corrections, original_words
453
 
454
  except Exception as e:
455
  print(f"[GECToR] Prediction error: {e}")
 
456
  return text_proc, 0, len(text_proc.split())
457
 
458
 
 
674
  for row in top:
675
  matches.append({
676
  "title": corpus_titles[row["index"]],
 
 
677
  "score": round(row["combined"] * 100, 2),
678
  "tfidf_score": round(row["tfidf"] * 100, 2) if row["tfidf"] is not None else None,
679
  "semantic_score": round(row["semantic"] * 100, 2) if row["semantic"] is not None else None,
 
945
 
946
  @app.get("/")
947
  def read_root():
948
+ return {"status": "Backend is running with GECToR + 16GB RAM!"}