Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import plotly.express as px | |
# Cap on how many emails are read from the CSV.
num_rows = 50

# Rows the CSV parser cannot handle are skipped rather than aborting the load.
df = pd.read_csv(
    'emails_cleaned.csv',
    nrows=num_rows,
    on_bad_lines='skip',
)
def get_message(Series: pd.Series, header_lines: int = 15):
    """Extract the body text of every raw email in *Series*.

    Each message is split on newlines, the first ``header_lines`` lines are
    discarded, and the remaining lines are re-joined and stripped of
    surrounding whitespace.

    Args:
        Series: Raw email messages, one string per entry.
        header_lines: Number of leading lines to drop from each message.
            Defaults to 15, the value previously hard-coded here.

    Returns:
        A ``pd.Series`` of body strings sharing the index of *Series*.
    """
    bodies = [
        # Re-join with '\n' (not '') so the last word of one line is not
        # fused with the first word of the next.
        '\n'.join(message.split('\n')[header_lines:]).strip()
        for message in Series
    ]
    # Build the result in one go with an explicit object dtype instead of
    # filling a dtype-less empty Series cell by cell.
    return pd.Series(bodies, index=Series.index, dtype=object)
def get_date(Series: pd.Series):
    """Parse the date found on the second line of every raw email.

    The second line of each message is taken, stripped, and every
    occurrence of ``'Date: '`` is removed; the resulting strings are then
    converted with ``pd.to_datetime``. A progress message is printed before
    the conversion.
    """
    raw_dates = [
        # [1:2] keeps only the second line (or nothing for one-line messages)
        ''.join(message.split('\n')[1:2]).strip().replace('Date: ', '')
        for message in Series
    ]
    date_strings = pd.Series(raw_dates, index=Series.index, dtype=object)
    print('Done parsing, converting to datetime format..')
    return pd.to_datetime(date_strings)
def get_sender_and_receiver(Series: pd.Series):
    """Extract sender and recipient header values from every raw email.

    Header values are read from fixed line positions of each message
    (split on newlines): line 2 for ``From:``, line 3 for ``To:``, line 10
    for ``X-cc:`` and line 11 for ``X-bcc:``. The header prefix is removed
    from each value.

    Args:
        Series: Raw email messages, one string per entry.

    Returns:
        A tuple ``(sender, recipient1, recipient2, recipient3)`` of
        ``pd.Series``, each sharing the index of *Series*.

    Raises:
        IndexError: If a message has fewer than 12 lines.
    """
    # (line position, prefix to remove) in the order the results are returned
    fields = ((2, 'From: '), (3, 'To: '), (10, 'X-cc: '), (11, 'X-bcc: '))
    columns = [[] for _ in fields]
    for message in Series:
        lines = message.split('\n')
        for column, (position, prefix) in zip(columns, fields):
            column.append(lines[position].replace(prefix, ''))
    # Values are collected positionally and the index attached afterwards.
    # The previous `series[row] = ...` assignment was label-based, so a
    # non-default index produced misaligned, enlarged results.
    sender, recipient1, recipient2, recipient3 = (
        pd.Series(column, index=Series.index, dtype=object)
        for column in columns
    )
    return sender, recipient1, recipient2, recipient3
def get_subject(Series: pd.Series):
    """Extract the subject of every raw email.

    The subject is read from line 4 of each message (split on newlines)
    with the ``'Subject: '`` prefix removed.

    Args:
        Series: Raw email messages, one string per entry.

    Returns:
        A ``pd.Series`` of subject strings sharing the index of *Series*.

    Raises:
        IndexError: If a message has fewer than 5 lines.
    """
    subjects = [
        message.split('\n')[4].replace('Subject: ', '')
        for message in Series
    ]
    # Attach the index after collecting values positionally. The previous
    # `result[row] = ...` was label-based and broke on non-default indexes.
    return pd.Series(subjects, index=Series.index, dtype=object)
def get_folder(Series: pd.Series):
    """Extract the mailbox folder of every raw email.

    The folder is read from line 12 of each message (split on newlines)
    with the ``'X-Folder: '`` prefix removed.

    Args:
        Series: Raw email messages, one string per entry.

    Returns:
        A ``pd.Series`` of folder strings sharing the index of *Series*.

    Raises:
        IndexError: If a message has fewer than 13 lines.
    """
    folders = [
        message.split('\n')[12].replace('X-Folder: ', '')
        for message in Series
    ]
    # Attach the index after collecting values positionally. The previous
    # `result[row] = ...` was label-based and broke on non-default indexes.
    return pd.Series(folders, index=Series.index, dtype=object)
# Derive the structured columns from the raw message text.
df['text'] = get_message(df.message)
(
    df['sender'],
    df['recipient1'],
    df['recipient2'],
    df['recipient3'],
) = get_sender_and_receiver(df.message)
df['Subject'] = get_subject(df.message)
df['folder'] = get_folder(df.message)
df['date'] = get_date(df.message)

# The raw columns are no longer needed once everything has been extracted.
df = df.drop(columns=['message', 'file'])
import chromadb
from chromadb.utils import embedding_functions

# Client with default settings (presumably in-memory; confirm against the
# installed chromadb version).
chroma_client = chromadb.Client()

# Sentence-transformer model used to embed both documents and queries.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="paraphrase-MiniLM-L3-v2",
)

# Collection that will hold one document per email.
collection_minilm = chroma_client.create_collection(
    name="emails_minilm",
    embedding_function=sentence_transformer_ef,
)
# Add every email to the collection, one document per DataFrame row.
for i in df.index:
    print(i)  # progress indicator
    # Metadata key -> DataFrame column it is read from.
    metadata = {
        key: df.loc[i, column]
        for key, column in (
            ("sender", 'sender'),
            ("recipient1", 'recipient1'),
            ("recipient2", 'recipient2'),
            ("recipient3", 'recipient3'),
            ("subject", 'Subject'),
            ("folder", 'folder'),
        )
    }
    # The date is stored in its string form.
    metadata["date"] = str(df.loc[i, 'date'])
    collection_minilm.add(
        documents=df.loc[i, 'text'],
        metadatas=[metadata],
        ids=str(i),
    )
# Sample query against the freshly populated collection.
results = collection_minilm.query(
    query_texts=["this is a document"],
    include=['distances', 'metadatas', 'documents'],
    n_results=2,
)
# Bare expression: displays the value in a notebook, no effect in a script.
results
| import gradio as gr | |
| import ast | |
def _first_query_hits(values):
    """Return the hit list belonging to the first query text, or ``[]``."""
    if not values:
        return []
    first = values[0]
    # Query results are nested per query text ([[...]]); a flat list is
    # accepted as well.
    if isinstance(first, (list, tuple)):
        return list(first)
    return list(values)


def create_output(dictionary, number):
    """Format up to *number* query hits as a printable text report.

    Args:
        dictionary: Result mapping of a collection query. ``'ids'`` is
            required for any output; ``'documents'`` and ``'metadatas'``
            are used when present, otherwise each hit is fetched from
            ``collection_minilm`` by id.
        number: Maximum number of hits to render. Converted with ``int()``
            so a float such as ``4.0`` is accepted.

    Returns:
        One string containing a block per hit: a dashed rule, the subject
        line, the message text and a closing dashed rule. Returns ``""``
        when there are no hits.
    """
    # Never read past the hits that were actually returned, even when more
    # results were requested than exist.
    hit_ids = _first_query_hits(dictionary.get('ids'))[:max(int(number), 0)]
    documents = _first_query_hits(dictionary.get('documents'))
    metadatas = _first_query_hits(dictionary.get('metadatas'))

    entries = []
    for position, hit_id in enumerate(hit_ids):
        if position < len(documents) and position < len(metadatas):
            document = documents[position]
            metadata = metadatas[position]
        else:
            # The query result did not carry the payload; look it up by id.
            fetched = collection_minilm.get(ids=[hit_id])
            document = fetched['documents'][0]
            metadata = fetched['metadatas'][0]
        subject = (metadata or {}).get('subject', '')
        # Values are used directly rather than via str()/literal_eval, so
        # the message is shown as text and not as a list repr. The stray
        # '"' that used to follow the subject is gone.
        entries.append(
            "---------------\n"
            f"SUBJECT: {subject}\n"
            "MESSAGE: \n"
            f"{document}\n"
            "---------------"
        )
    return "".join(entries)
def query_chromadb_advanced(question, numberOfResults):
    """Search the email collection and return the hits as formatted text.

    Args:
        question: Query text (a single string, or a list of query strings).
        numberOfResults: How many hits to return. Converted with ``int()``
            because the Gradio "number" input supplies a float.

    Returns:
        The text produced by ``create_output`` for the hits, or ``""`` when
        fewer than one result is requested.
    """
    n_results = int(numberOfResults)
    if n_results < 1:
        # Nothing requested: skip the query instead of sending an invalid
        # result count to the collection.
        return ""
    query_texts = [question] if isinstance(question, str) else question
    results = collection_minilm.query(
        query_texts=query_texts,
        n_results=n_results,
    )
    return create_output(results, n_results)
# Run one query up front so a failure in the search path shows up before
# the UI is started.
result_advance = query_chromadb_advanced("bank", 4)

# Text box for the question, numeric field for the number of results.
interface_options = {
    "fn": query_chromadb_advanced,
    "inputs": ["text", "number"],
    "outputs": "text",
    "title": "Email Dataset Interface",
    "description": "Insert the question or the key word to find the topic correlated in the dataset",
}
iface = gr.Interface(**interface_options)
iface.launch(share=True)