Spaces:

gagan3012
/

summarization

Runtime error

Dean committed on Aug 5, 2021

Commit

c6912f8

Merge commit '8d1f074f67512e839e8d290ade59fc8fe73f7c9c' into fix-mlflow

Incorporating latest version from Gagan

# Conflicts:
# dvc.yaml
# reports/evaluation_metrics.txt
# src/data/process_data.py
# src/models/evaluate_model.py
# src/models/model.py
# src/models/train_model.py

Files changed (13) hide show

Makefile +1 -0
dvc.yaml +10 -3
params.yml +6 -2
reports/{metrics.csv → evaluation_metrics.csv} +0 -0
src/__init__.py +12 -0
src/data/make_dataset.py +11 -12
src/data/process_data.py +9 -9
src/models/__init__.py +4 -1
src/models/evaluate_model.py +5 -5
src/models/model.py +176 -122
src/models/predict_model.py +3 -4
src/models/train_model.py +21 -11
src/visualization/visualize.py +32 -0

Makefile CHANGED Viewed

@@ -35,6 +35,7 @@ clean:
 ## Lint using flake8
 lint:
 	flake8 src
 ## Upload Data to default DVC remote
 push:

 ## Lint using flake8
 lint:
 	flake8 src
+	black src
 ## Upload Data to default DVC remote
 push:

dvc.yaml CHANGED Viewed

@@ -32,8 +32,6 @@ stages:
     outs:
       - models:
           persist: true
-      - reports/training_params.yml:
-          cache: false
     metrics:
       - reports/training_metrics.csv:
           cache: false
@@ -45,6 +43,15 @@ stages:
       - models
       - src/models/evaluate_model.py
     metrics:
-      - reports/metrics.csv:
           cache: false

     outs:
       - models:
           persist: true
     metrics:
       - reports/training_metrics.csv:
           cache: false
       - models
       - src/models/evaluate_model.py
     metrics:
+      - reports/evaluation_metrics.csv:
+          cache: false
+  visualize:
+    cmd: streamlit run src/visualization/visualize.py
+    deps:
+      - models
+      - src/visualization/visualize.py
+      - params.yml
+    metrics:
+      - reports/visualization_metrics.csv:
           cache: false

params.yml CHANGED Viewed

@@ -1,3 +1,4 @@
 data: cnn_dailymail
 batch_size: 2
 num_workers: 2
@@ -8,5 +9,8 @@ epochs: 5
 source_dir: src
 model_dir: models
 metric: rouge
-split: 0.02
-use_gpu: True

+name: summarsiation
 data: cnn_dailymail
 batch_size: 2
 num_workers: 2
 source_dir: src
 model_dir: models
 metric: rouge
+split: 0.001
+use_gpu: True
+visualise: True
+hf_username: gagan3012
+upload_to_hf: True

reports/{metrics.csv → evaluation_metrics.csv} RENAMED Viewed

File without changes

src/__init__.py CHANGED Viewed

	@@ -0,0 +1,12 @@

+import os  # noqa: F401
+import sys  # noqa: F401
+from src.data.make_dataset import make_dataset  # noqa: F401
+from src.data.process_data import process_data  # noqa: F401
+from src.models.evaluate_model import evaluate_model  # noqa: F401
+from src.models.model import Summarization  # noqa: F401
+from src.models.predict_model import predict_model  # noqa: F401
+from src.models.train_model import train_model  # noqa: F401
+from src.visualization.visualize import visualize  # noqa: F401
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))  # noqa: F401

src/data/make_dataset.py CHANGED Viewed

@@ -5,22 +5,21 @@ import os
 import pprint
-def make_dataset(dataset='cnn_dailymail', split='train'):
     """make dataset for summarisation"""
-    if not os.path.exists('data/raw'):
-        os.makedirs('data/raw')
-    dataset = load_dataset(dataset, '3.0.0', split=split)
     df = pd.DataFrame()
-    df['article'] = dataset['article']
-    df['highlights'] = dataset['highlights']
-    df.to_csv('data/raw/{}.csv'.format(split))
-if __name__ == '__main__':
     with open("params.yml") as f:
         params = yaml.safe_load(f)
     pprint.pprint(params)
-    make_dataset(dataset=params['data'], split='train')
-    make_dataset(dataset=params['data'], split='test')
-    make_dataset(dataset=params['data'], split='validation')

 import pprint
+def make_dataset(dataset="cnn_dailymail", split="train"):
     """make dataset for summarisation"""
+    if not os.path.exists("data/raw"):
+        os.makedirs("data/raw")
+    dataset = load_dataset(dataset, "3.0.0", split=split)
     df = pd.DataFrame()
+    df["article"] = dataset["article"]
+    df["highlights"] = dataset["highlights"]
+    df.to_csv("data/raw/{}.csv".format(split))
+if __name__ == "__main__":
     with open("params.yml") as f:
         params = yaml.safe_load(f)
     pprint.pprint(params)
+    make_dataset(dataset=params["data"], split="train")
+    make_dataset(dataset=params["data"], split="test")
+    make_dataset(dataset=params["data"], split="validation")

src/data/process_data.py CHANGED Viewed

@@ -3,18 +3,18 @@ import yaml
 import os
-def process_data(split='train'):
     with open("params.yml") as f:
         params = yaml.safe_load(f)
-    df = pd.read_csv('data/raw/{}.csv'.format(split))
-    df.columns = ['Unnamed: 0', 'input_text', 'output_text']
-    df = df.sample(frac=params['split'], replace=True, random_state=1)
-    df.to_csv('data/processed/{}.csv'.format(split))
-if __name__ == '__main__':
-    process_data(split='train')
-    process_data(split='test')
-    process_data(split='validation')

 import os
+def process_data(split="train"):
     with open("params.yml") as f:
         params = yaml.safe_load(f)
+    df = pd.read_csv("data/raw/{}.csv".format(split))
+    df.columns = ["Unnamed: 0", "input_text", "output_text"]
+    df = df.sample(frac=params["split"], replace=True, random_state=1)
+    df.to_csv("data/processed/{}.csv".format(split))
+if __name__ == "__main__":
+    process_data(split="train")
+    process_data(split="test")
+    process_data(split="validation")

src/models/__init__.py CHANGED Viewed

	@@ -1 +1,4 @@
1	- from .model import Summarization

+from .model import Summarization  # noqa: F401
+from .train_model import train_model  # noqa: F401
+from .predict_model import predict_model  # noqa: F401
+from .evaluate_model import evaluate_model  # noqa: F401

src/models/evaluate_model.py CHANGED Viewed

@@ -13,14 +13,14 @@ def evaluate_model():
     with open("params.yml") as f:
         params = yaml.safe_load(f)
-    test_df = pd.read_csv('data/processed/test.csv')[:25]
     model = Summarization()
-    model.load_model(model_type=params['model_type'], model_dir=params['model_dir'])
-    results = model.evaluate(test_df=test_df, metrics=params['metric'])
-    with dagshub_logger(metrics_path='reports/metrics.csv', should_log_hparams=False) as logger:
         logger.log_metrics(results)
-if __name__ == '__main__':
     evaluate_model()

     with open("params.yml") as f:
         params = yaml.safe_load(f)
+    test_df = pd.read_csv("data/processed/test.csv")[:25]
     model = Summarization()
+    model.load_model(model_type=params["model_type"], model_dir=params["model_dir"])
+    results = model.evaluate(test_df=test_df, metrics=params["metric"])
+    with dagshub_logger(metrics_path='reports/evaluation_metrics.csv', should_log_hparams=False) as logger:
         logger.log_metrics(results)
+if __name__ == "__main__":
+    # Run the evaluation stage when this file is executed as a script.
     evaluate_model()

src/models/model.py CHANGED Viewed

@@ -1,9 +1,17 @@
 import torch
 import pandas as pd
 from transformers import (
     AdamW,
     T5ForConditionalGeneration,
-    T5TokenizerFast as T5Tokenizer, MT5Tokenizer, MT5ForConditionalGeneration, ByT5Tokenizer,
 )
 from torch.utils.data import Dataset, DataLoader
 import pytorch_lightning as pl
@@ -27,11 +35,11 @@ class DataModule(Dataset):
     """
     def __init__(
-            self,
-            data: pd.DataFrame,
-            tokenizer: T5Tokenizer,
-            source_max_token_len: int = 512,
-            target_max_token_len: int = 512,
     ):
         """
         :param data:
@@ -71,9 +79,7 @@ class DataModule(Dataset):
         )
         labels = output_encoding["input_ids"]
-        labels[
-            labels == 0
-            ] = -100
         return dict(
             keywords=data_row["input_text"],
@@ -87,15 +93,15 @@ class DataModule(Dataset):
 class PLDataModule(LightningDataModule):
     def __init__(
-            self,
-            train_df: pd.DataFrame,
-            test_df: pd.DataFrame,
-            tokenizer: T5Tokenizer,
-            source_max_token_len: int = 512,
-            target_max_token_len: int = 512,
-            batch_size: int = 4,
-            split: float = 0.1,
-            num_workers: int = 2
     ):
         """
         :param data_df:
@@ -130,28 +136,45 @@ class PLDataModule(LightningDataModule):
         )
     def train_dataloader(self):
-        """ training dataloader """
         return DataLoader(
-            self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers
         )
     def test_dataloader(self):
-        """ test dataloader """
         return DataLoader(
-            self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers
         )
     def val_dataloader(self):
-        """ validation dataloader """
         return DataLoader(
-            self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers
         )
 class LightningModel(LightningModule):
-    """ PyTorch Lightning Model class"""
-    def __init__(self, tokenizer, model, learning_rate, adam_epsilon, weight_decay, output: str = "outputs"):
         """
         initiates a PyTorch Lightning Model
         Args:
@@ -168,7 +191,7 @@ class LightningModel(LightningModule):
         self.weight_decay = weight_decay
     def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
-        """ forward step """
         output = self.model(
             input_ids,
             attention_mask=attention_mask,
@@ -179,7 +202,7 @@ class LightningModel(LightningModule):
         return output.loss, output.logits
     def training_step(self, batch, batch_size):
-        """ training step """
         input_ids = batch["keywords_input_ids"]
         attention_mask = batch["keywords_attention_mask"]
         labels = batch["labels"]
@@ -195,7 +218,7 @@ class LightningModel(LightningModule):
         return loss
     def validation_step(self, batch, batch_size):
-        """ validation step """
         input_ids = batch["keywords_input_ids"]
         attention_mask = batch["keywords_attention_mask"]
         labels = batch["labels"]
@@ -211,7 +234,7 @@ class LightningModel(LightningModule):
         return loss
     def test_step(self, batch, batch_size):
-        """ test step """
         input_ids = batch["keywords_input_ids"]
         attention_mask = batch["keywords_attention_mask"]
         labels = batch["labels"]
@@ -228,29 +251,41 @@ class LightningModel(LightningModule):
         return loss
     def configure_optimizers(self):
-        """ configure optimizers """
         model = self.model
         no_decay = ["bias", "LayerNorm.weight"]
         optimizer_grouped_parameters = [
             {
-                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                 "weight_decay": self.weight_decay,
             },
             {
-                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                 "weight_decay": 0.0,
             },
         ]
-        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
         self.opt = optimizer
         return [optimizer]
 class Summarization:
-    """ Custom Summarization class """
     def __init__(self) -> None:
-        """ initiates Summarization class """
         pass
     def from_pretrained(self, model_type="t5", model_name="t5-base") -> None:
@@ -277,20 +312,20 @@ class Summarization:
             )
     def train(
-            self,
-            train_df: pd.DataFrame,
-            eval_df: pd.DataFrame,
-            source_max_token_len: int = 512,
-            target_max_token_len: int = 512,
-            batch_size: int = 8,
-            max_epochs: int = 5,
-            use_gpu: bool = True,
-            outputdir: str = "models",
-            early_stopping_patience_epochs: int = 0,  # 0 to disable early stopping feature
-            learning_rate: float = 0.0001,
-            adam_epsilon: float = 0.01,
-            num_workers: int = 2,
-            weight_decay: float = 0.0001
     ):
         """
         trains T5/MT5 model on custom dataset
@@ -322,8 +357,12 @@ class Summarization:
         )
         self.T5Model = LightningModel(
-            tokenizer=self.tokenizer, model=self.model, output=outputdir,
-            learning_rate=learning_rate, adam_epsilon=adam_epsilon, weight_decay=weight_decay
         )
         logger = DAGsHubLogger(metrics_path='reports/training_metrics.csv',
@@ -359,7 +398,7 @@ class Summarization:
             trainer.fit(self.T5Model, self.data_module)
     def load_model(
-            self, model_type: str = 't5', model_dir: str = "models", use_gpu: bool = False
     ):
         """
         loads a checkpoint for inferencing/prediction
@@ -388,16 +427,15 @@ class Summarization:
             if torch.cuda.is_available():
                 self.device = torch.device("cuda")
             else:
-                raise Exception("exception ---> no gpu found. set use_gpu=False, to use CPU")
         else:
             self.device = torch.device("cpu")
         self.model = self.model.to(self.device)
-    def save_model(
-            self,
-            model_dir="models"
-    ):
         """
         Save model to dir
         :param model_dir:
@@ -408,19 +446,19 @@ class Summarization:
         self.model.save_pretrained(path)
     def predict(
-            self,
-            source_text: str,
-            max_length: int = 512,
-            num_return_sequences: int = 1,
-            num_beams: int = 2,
-            top_k: int = 50,
-            top_p: float = 0.95,
-            do_sample: bool = True,
-            repetition_penalty: float = 2.5,
-            length_penalty: float = 1.0,
-            early_stopping: bool = True,
-            skip_special_tokens: bool = True,
-            clean_up_tokenization_spaces: bool = True,
     ):
         """
         generates prediction for T5/MT5 model
@@ -463,14 +501,10 @@ class Summarization:
         )
         return preds
-    def evaluate(
-            self,
-            test_df: pd.DataFrame,
-            metrics: str = "rouge"
-    ):
         metric = load_metric(metrics)
-        input_text = test_df['input_text']
-        references = test_df['output_text']
         references = references.to_list()
         predictions = [self.predict(x) for x in tqdm(input_text)]
@@ -478,49 +512,69 @@ class Summarization:
         results = metric.compute(predictions=predictions, references=references)
         output = {
-            'Rouge 1': {
-                'Rouge_1 Low Precision': results["rouge1"].low.precision,
-                'Rouge_1 Low recall': results["rouge1"].low.recall,
-                'Rouge_1 Low F1': results["rouge1"].low.fmeasure,
-                'Rouge_1 Mid Precision': results["rouge1"].mid.precision,
-                'Rouge_1 Mid recall': results["rouge1"].mid.recall,
-                'Rouge_1 Mid F1': results["rouge1"].mid.fmeasure,
-                'Rouge_1 High Precision': results["rouge1"].high.precision,
-                'Rouge_1 High recall': results["rouge1"].high.recall,
-                'Rouge_1 High F1': results["rouge1"].high.fmeasure,
-            },
-            'Rouge 2': {
-                'Rouge_2 Low Precision': results["rouge2"].low.precision,
-                'Rouge_2 Low recall': results["rouge2"].low.recall,
-                'Rouge_2 Low F1': results["rouge2"].low.fmeasure,
-                'Rouge_2 Mid Precision': results["rouge2"].mid.precision,
-                'Rouge_2 Mid recall': results["rouge2"].mid.recall,
-                'Rouge_2 Mid F1': results["rouge2"].mid.fmeasure,
-                'Rouge_2 High Precision': results["rouge2"].high.precision,
-                'Rouge_2 High recall': results["rouge2"].high.recall,
-                'Rouge_2 High F1': results["rouge2"].high.fmeasure,
-            },
-            'Rouge L': {
-                'Rouge_L Low Precision': results["rougeL"].low.precision,
-                'Rouge_L Low recall': results["rougeL"].low.recall,
-                'Rouge_L Low F1': results["rougeL"].low.fmeasure,
-                'Rouge_L Mid Precision': results["rougeL"].mid.precision,
-                'Rouge_L Mid recall': results["rougeL"].mid.recall,
-                'Rouge_L Mid F1': results["rougeL"].mid.fmeasure,
-                'Rouge_L High Precision': results["rougeL"].high.precision,
-                'Rouge_L High recall': results["rougeL"].high.recall,
-                'Rouge_L High F1': results["rougeL"].high.fmeasure,
-            },
-            'rougeLsum': {
-                'rougeLsum Low Precision': results["rougeLsum"].low.precision,
-                'rougeLsum Low recall': results["rougeLsum"].low.recall,
-                'rougeLsum Low F1': results["rougeLsum"].low.fmeasure,
-                'rougeLsum Mid Precision': results["rougeLsum"].mid.precision,
-                'rougeLsum Mid recall': results["rougeLsum"].mid.recall,
-                'rougeLsum Mid F1': results["rougeLsum"].mid.fmeasure,
-                'rougeLsum High Precision': results["rougeLsum"].high.precision,
-                'rougeLsum High recall': results["rougeLsum"].high.recall,
-                'rougeLsum High F1': results["rougeLsum"].high.fmeasure,
-            }
         }
         return output

+import shutil
+from getpass import getpass
+from pathlib import Path
 import torch
 import pandas as pd
+from huggingface_hub import HfApi, Repository
 from transformers import (
     AdamW,
     T5ForConditionalGeneration,
+    T5TokenizerFast as T5Tokenizer,
+    MT5Tokenizer,
+    MT5ForConditionalGeneration,
+    ByT5Tokenizer,
 )
 from torch.utils.data import Dataset, DataLoader
 import pytorch_lightning as pl
     """
     def __init__(
+        self,
+        data: pd.DataFrame,
+        tokenizer: T5Tokenizer,
+        source_max_token_len: int = 512,
+        target_max_token_len: int = 512,
     ):
         """
         :param data:
         )
         labels = output_encoding["input_ids"]
+        labels[labels == 0] = -100
         return dict(
             keywords=data_row["input_text"],
 class PLDataModule(LightningDataModule):
     def __init__(
+        self,
+        train_df: pd.DataFrame,
+        test_df: pd.DataFrame,
+        tokenizer: T5Tokenizer,
+        source_max_token_len: int = 512,
+        target_max_token_len: int = 512,
+        batch_size: int = 4,
+        split: float = 0.1,
+        num_workers: int = 2,
     ):
         """
         :param data_df:
         )
     def train_dataloader(self):
+        """training dataloader"""
         return DataLoader(
+            self.train_dataset,
+            batch_size=self.batch_size,
+            shuffle=True,
+            num_workers=self.num_workers,
         )
     def test_dataloader(self):
+        """test dataloader"""
         return DataLoader(
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
         )
     def val_dataloader(self):
+        """validation dataloader"""
         return DataLoader(
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
         )
 class LightningModel(LightningModule):
+    """PyTorch Lightning Model class"""
+    def __init__(
+        self,
+        tokenizer,
+        model,
+        learning_rate,
+        adam_epsilon,
+        weight_decay,
+        output: str = "outputs",
+    ):
         """
         initiates a PyTorch Lightning Model
         Args:
         self.weight_decay = weight_decay
     def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
+        """forward step"""
         output = self.model(
             input_ids,
             attention_mask=attention_mask,
         return output.loss, output.logits
     def training_step(self, batch, batch_size):
+        """training step"""
         input_ids = batch["keywords_input_ids"]
         attention_mask = batch["keywords_attention_mask"]
         labels = batch["labels"]
         return loss
     def validation_step(self, batch, batch_size):
+        """validation step"""
         input_ids = batch["keywords_input_ids"]
         attention_mask = batch["keywords_attention_mask"]
         labels = batch["labels"]
         return loss
     def test_step(self, batch, batch_size):
+        """test step"""
         input_ids = batch["keywords_input_ids"]
         attention_mask = batch["keywords_attention_mask"]
         labels = batch["labels"]
         return loss
     def configure_optimizers(self):
+        """configure optimizers"""
         model = self.model
         no_decay = ["bias", "LayerNorm.weight"]
         optimizer_grouped_parameters = [
             {
+                "params": [
+                    p
+                    for n, p in model.named_parameters()
+                    if not any(nd in n for nd in no_decay)
+                ],
                 "weight_decay": self.weight_decay,
             },
             {
+                "params": [
+                    p
+                    for n, p in model.named_parameters()
+                    if any(nd in n for nd in no_decay)
+                ],
                 "weight_decay": 0.0,
             },
         ]
+        optimizer = AdamW(
+            optimizer_grouped_parameters,
+            lr=self.learning_rate,
+            eps=self.adam_epsilon,
+        )
         self.opt = optimizer
         return [optimizer]
 class Summarization:
+    """Custom Summarization class"""
     def __init__(self) -> None:
+        """initiates Summarization class"""
         pass
     def from_pretrained(self, model_type="t5", model_name="t5-base") -> None:
             )
     def train(
+        self,
+        train_df: pd.DataFrame,
+        eval_df: pd.DataFrame,
+        source_max_token_len: int = 512,
+        target_max_token_len: int = 512,
+        batch_size: int = 8,
+        max_epochs: int = 5,
+        use_gpu: bool = True,
+        outputdir: str = "models",
+        early_stopping_patience_epochs: int = 0,  # 0 to disable early stopping feature
+        learning_rate: float = 0.0001,
+        adam_epsilon: float = 0.01,
+        num_workers: int = 2,
+        weight_decay: float = 0.0001,
     ):
         """
         trains T5/MT5 model on custom dataset
         )
         self.T5Model = LightningModel(
+            tokenizer=self.tokenizer,
+            model=self.model,
+            output=outputdir,
+            learning_rate=learning_rate,
+            adam_epsilon=adam_epsilon,
+            weight_decay=weight_decay,
         )
         logger = DAGsHubLogger(metrics_path='reports/training_metrics.csv',
             trainer.fit(self.T5Model, self.data_module)
     def load_model(
+        self, model_type: str = "t5", model_dir: str = "models", use_gpu: bool = False
     ):
         """
         loads a checkpoint for inferencing/prediction
             if torch.cuda.is_available():
                 self.device = torch.device("cuda")
             else:
+                raise Exception(
+                    "exception ---> no gpu found. set use_gpu=False, to use CPU"
+                )
         else:
             self.device = torch.device("cpu")
         self.model = self.model.to(self.device)
+    def save_model(self, model_dir="models"):
         """
         Save model to dir
         :param model_dir:
         self.model.save_pretrained(path)
     def predict(
+        self,
+        source_text: str,
+        max_length: int = 512,
+        num_return_sequences: int = 1,
+        num_beams: int = 2,
+        top_k: int = 50,
+        top_p: float = 0.95,
+        do_sample: bool = True,
+        repetition_penalty: float = 2.5,
+        length_penalty: float = 1.0,
+        early_stopping: bool = True,
+        skip_special_tokens: bool = True,
+        clean_up_tokenization_spaces: bool = True,
     ):
         """
         generates prediction for T5/MT5 model
         )
         return preds
+    def evaluate(self, test_df: pd.DataFrame, metrics: str = "rouge"):
         metric = load_metric(metrics)
+        input_text = test_df["input_text"]
+        references = test_df["output_text"]
         references = references.to_list()
         predictions = [self.predict(x) for x in tqdm(input_text)]
         results = metric.compute(predictions=predictions, references=references)
         output = {
+            "Rouge_1 Low Precision": results["rouge1"].low.precision,
+            "Rouge_1 Low recall": results["rouge1"].low.recall,
+            "Rouge_1 Low F1": results["rouge1"].low.fmeasure,
+            "Rouge_1 Mid Precision": results["rouge1"].mid.precision,
+            "Rouge_1 Mid recall": results["rouge1"].mid.recall,
+            "Rouge_1 Mid F1": results["rouge1"].mid.fmeasure,
+            "Rouge_1 High Precision": results["rouge1"].high.precision,
+            "Rouge_1 High recall": results["rouge1"].high.recall,
+            "Rouge_1 High F1": results["rouge1"].high.fmeasure,
+            "Rouge_2 Low Precision": results["rouge2"].low.precision,
+            "Rouge_2 Low recall": results["rouge2"].low.recall,
+            "Rouge_2 Low F1": results["rouge2"].low.fmeasure,
+            "Rouge_2 Mid Precision": results["rouge2"].mid.precision,
+            "Rouge_2 Mid recall": results["rouge2"].mid.recall,
+            "Rouge_2 Mid F1": results["rouge2"].mid.fmeasure,
+            "Rouge_2 High Precision": results["rouge2"].high.precision,
+            "Rouge_2 High recall": results["rouge2"].high.recall,
+            "Rouge_2 High F1": results["rouge2"].high.fmeasure,
+            "Rouge_L Low Precision": results["rougeL"].low.precision,
+            "Rouge_L Low recall": results["rougeL"].low.recall,
+            "Rouge_L Low F1": results["rougeL"].low.fmeasure,
+            "Rouge_L Mid Precision": results["rougeL"].mid.precision,
+            "Rouge_L Mid recall": results["rougeL"].mid.recall,
+            "Rouge_L Mid F1": results["rougeL"].mid.fmeasure,
+            "Rouge_L High Precision": results["rougeL"].high.precision,
+            "Rouge_L High recall": results["rougeL"].high.recall,
+            "Rouge_L High F1": results["rougeL"].high.fmeasure,
+            "rougeLsum Low Precision": results["rougeLsum"].low.precision,
+            "rougeLsum Low recall": results["rougeLsum"].low.recall,
+            "rougeLsum Low F1": results["rougeLsum"].low.fmeasure,
+            "rougeLsum Mid Precision": results["rougeLsum"].mid.precision,
+            "rougeLsum Mid recall": results["rougeLsum"].mid.recall,
+            "rougeLsum Mid F1": results["rougeLsum"].mid.fmeasure,
+            "rougeLsum High Precision": results["rougeLsum"].high.precision,
+            "rougeLsum High recall": results["rougeLsum"].high.recall,
+            "rougeLsum High F1": results["rougeLsum"].high.fmeasure,
         }
         return output
+    def upload(self, hf_username, model_name):
+        hf_password = getpass("Enter your HuggingFace password")
+        if Path("./models").exists():
+            shutil.rmtree("./models")
+        token = HfApi().login(username=hf_username, password=hf_password)
+        del hf_password
+        model_url = HfApi().create_repo(token=token, name=model_name, exist_ok=True)
+        model_repo = Repository(
+            "./model",
+            clone_from=model_url,
+            use_auth_token=token,
+            git_email=f"{hf_username}@users.noreply.huggingface.co",
+            git_user=hf_username,
+        )
+        readme_txt = f"""
+            ---
+            Summarisation model {model_name}
+            """.strip()
+        (Path(model_repo.local_dir) / "README.md").write_text(readme_txt)
+        self.save_model()
+        commit_url = model_repo.push_to_hub()
+        print("Check out your model at:")
+        print(commit_url)
+        print(f"https://huggingface.co/{hf_username}/{model_name}")

src/models/predict_model.py CHANGED Viewed

@@ -11,14 +11,13 @@ def predict_model(text):
     with open("params.yml") as f:
         params = yaml.safe_load(f)
     model = Summarization()
-    model.load_model(model_type=params['model_type'], model_dir=params['model_dir'])
     pre_summary = model.predict(text)
     return pre_summary
-if __name__ == '__main__':
-    text = pd.load_csv('data/processed/test.csv')['input_text'][0]
     pre_summary = predict_model(text)
     print(pre_summary)

     with open("params.yml") as f:
         params = yaml.safe_load(f)
     model = Summarization()
+    model.load_model(model_type=params["model_type"], model_dir=params["model_dir"])
     pre_summary = model.predict(text)
     return pre_summary
+if __name__ == "__main__":
+    text = pd.load_csv("data/processed/test.csv")["input_text"][0]
     pre_summary = predict_model(text)
     print(pre_summary)

src/models/train_model.py CHANGED Viewed

@@ -12,22 +12,32 @@ def train_model():
         params = yaml.safe_load(f)
     # Load the data
-    train_df = pd.read_csv('data/processed/train.csv')
-    eval_df = pd.read_csv('data/processed/validation.csv')
-    train_df = train_df.sample(frac=params['split'], replace=True, random_state=1)
-    eval_df = eval_df.sample(frac=params['split'], replace=True, random_state=1)
     model = Summarization()
-    model.from_pretrained(model_type=params['model_type'], model_name=params['model_name'])
-    model.train(train_df=train_df, eval_df=eval_df,
-                batch_size=params['batch_size'], max_epochs=params['epochs'],
-                use_gpu=params['use_gpu'], learning_rate=float(params['learning_rate']),
-                num_workers=int(params['num_workers']))
-    model.save_model(model_dir=params['model_dir'])
-if __name__ == '__main__':
     train_model()

         params = yaml.safe_load(f)
     # Load the data
+    train_df = pd.read_csv("data/processed/train.csv")
+    eval_df = pd.read_csv("data/processed/validation.csv")
+    train_df = train_df.sample(frac=params["split"], replace=True, random_state=1)
+    eval_df = eval_df.sample(frac=params["split"], replace=True, random_state=1)
     model = Summarization()
+    model.from_pretrained(
+        model_type=params["model_type"], model_name=params["model_name"]
+    )
+    model.train(
+        train_df=train_df,
+        eval_df=eval_df,
+        batch_size=params["batch_size"],
+        max_epochs=params["epochs"],
+        use_gpu=params["use_gpu"],
+        learning_rate=float(params["learning_rate"]),
+        num_workers=int(params["num_workers"]),
+    )
+    model.save_model(model_dir=params["model_dir"])
+    if params["upload_to_hf"]:
+        model.upload(hf_username=params["hf_username"], model_name=params["name"])
+if __name__ == "__main__":
+    # Run the training stage when this file is executed as a script.
     train_model()

src/visualization/visualize.py CHANGED Viewed

	@@ -0,0 +1,32 @@

+import streamlit as st
+import yaml
+from models import predict_model
+def visualize():
+    st.write("# Summarization  UI")
+    st.markdown(
+        """
+        *For additional questions and inquiries, please contact **Gagan Bhatia** via [LinkedIn](
+        https://www.linkedin.com/in/gbhatia30/) or [Github](https://github.com/gagan3012).*
+        """
+    )
+    text = st.text_area("Enter text here")
+    if st.button("Generate Summary"):
+        with st.spinner("Connecting the Dots..."):
+            sumtext = predict_model(text=text)
+        st.write("# Generated Summary:")
+        st.write("{}".format(sumtext))
+        with open("reports/visualization_metrics.txt", "w") as file1:
+            file1.writelines(text)
+            file1.writelines(sumtext)
+if __name__ == "__main__":
+    with open("params.yml") as f:
+        params = yaml.safe_load(f)
+    if params["visualise"]:
+        visualize()