Spaces:

UKURIKIYEYEZU
/

Help_chatbot

Build error

App Files Files Community

Help_chatbot / app.py

UKURIKIYEYEZU

Update app.py

919d528 verified 8 months ago

raw

history blame contribute delete

23.6 kB

	import os
	import PyPDF2
	from PyPDF2 import PdfReader
	import pandas as pd

	# Embedding model: the HuggingFace embedding wrapper created here is later
	# handed to the Chroma vector store as its embedding function.
	from langchain_huggingface import HuggingFaceEmbeddings
	embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")



	# Directory scanned for knowledge-base files (spreadsheets here, PDFs further down).
	folder_path = "./"
	# Flat list of text snippets harvested from the spreadsheets.
	context_data = []

	# Every entry in the folder; the PDF section below reuses this listing.
	files = os.listdir(folder_path)

	# CSV and Excel files, matched case-insensitively so that "DATA.CSV" is
	# picked up the same way the PDF filter treats "REPORT.PDF".
	data_files = [f for f in files if f.lower().endswith(('.csv', '.xlsx', '.xls'))]

	# Load each file and collect the text held in its third column.
	for file_number, file in enumerate(data_files, 1):
	    print(f"\nProcessing file {file_number}: {file}")
	    file_path = os.path.join(folder_path, file)

	    try:
	        # Choose the pandas reader from the (lower-cased) extension.
	        if file.lower().endswith('.csv'):
	            df = pd.read_csv(file_path)
	        else:
	            df = pd.read_excel(file_path)

	        # Take the non-empty values of the column at position 2 as strings.
	        context_data.extend(df.iloc[:, 2].dropna().astype(str).tolist())

	    except Exception as e:
	        # Best effort: a file that cannot be read (or has fewer than three
	        # columns) is reported and skipped instead of aborting start-up.
	        print(f"Error processing file {file}: {str(e)}")




	import os
	import PyPDF2
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.schema import Document

	def extract_text_from_pdf(pdf_path):
	    """Return the concatenated text of every page in a local PDF.

	    Pages for which the reader yields no text contribute an empty string.
	    Any failure (missing file, unreadable PDF, ...) is printed and an empty
	    string is returned instead of raising.
	    """
	    try:
	        with open(pdf_path, "rb") as handle:
	            pages = PyPDF2.PdfReader(handle).pages
	            pieces = [page.extract_text() or "" for page in pages]
	        return "".join(pieces)
	    except Exception as e:
	        print(f"Error with {pdf_path}: {e}")
	        return ""

	# Names of the PDF files in the scanned folder (extension match ignores case).
	pdf_files = [name for name in files if name.lower().endswith(".pdf")]

	# Wrap every PDF that yields text in a Document tagged with its file name.
	documents = []
	for file in pdf_files:
	    print(f"Processing: {file}")
	    pdf_path = os.path.join(folder_path, file)
	    text = extract_text_from_pdf(pdf_path)
	    if not text:
	        # Nothing extracted (or extraction failed): leave this file out.
	        continue
	    documents.append(Document(page_content=text, metadata={"source": file}))

	# Cut the documents into overlapping pieces, preferring paragraph, line,
	# sentence and clause boundaries in that order.
	text_splitter = RecursiveCharacterTextSplitter(
	    chunk_size=500,
	    chunk_overlap=50,
	    separators=['\n\n', '\n', '.', ','],
	)
	chunks = text_splitter.split_documents(documents)
	# Plain strings only; the Document metadata is dropped here.
	text_only_chunks = [piece.page_content for piece in chunks]


	from urllib.parse import urljoin, urlparse
	import requests
	from io import BytesIO

	from bs4 import BeautifulSoup
	from langchain_core.prompts import ChatPromptTemplate
	import gradio as gr


	def scrape_websites(base_urls):
	    """Scrape each base URL and the same-host pages it links to (one level deep).

	    Args:
	        base_urls: iterable of URL strings; blank entries are skipped.

	    Returns:
	        dict mapping each successfully read URL to its text. HTML pages are
	        stored as cleaned body text, links ending in ".pdf" as the text
	        extracted from the PDF. If an unexpected error occurs anywhere in
	        the run, it is printed and an empty dict is returned.
	    """
	    try:
	        visited_links = set()  # URLs already attempted, to avoid repeats
	        content_by_url = {}    # URL -> extracted text

	        for base_url in base_urls:
	            if not base_url.strip():
	                continue  # Skip empty or whitespace-only URLs

	            print(f"Scraping base URL: {base_url}")
	            html_content = fetch_page_content(base_url)
	            if not html_content:
	                # Without the page there are no links to follow either.
	                continue

	            content_by_url[base_url] = clean_body_content(html_content)
	            visited_links.add(base_url)

	            # Follow every internal link found on the base page.
	            soup = BeautifulSoup(html_content, "html.parser")
	            for link in extract_internal_links(base_url, soup):
	                if link in visited_links:
	                    continue
	                # Mark the link before fetching so that a link which fails
	                # is not requested again for every later base URL.
	                visited_links.add(link)

	                # PDFs go straight to the PDF extractor; downloading them as
	                # HTML text first would only produce decoded binary noise.
	                if link.lower().endswith('.pdf'):
	                    print(f"Extracting PDF content from: {link}")
	                    pdf_content = extract_pdf_text(link)
	                    if pdf_content:
	                        content_by_url[link] = pdf_content
	                    continue

	                print(f"Scraping link: {link}")
	                page_content = fetch_page_content(link)
	                if page_content:
	                    content_by_url[link] = clean_body_content(page_content)

	        return content_by_url

	    except Exception as e:
	        print(f"Error during scraping: {e}")
	        return {}


	def fetch_page_content(url):
	    """Download a page and return its body as text, or None on a request error.

	    The request uses a 10 second timeout and non-success HTTP statuses are
	    treated as errors. Errors are printed, not raised.
	    """
	    try:
	        reply = requests.get(url, timeout=10)
	        reply.raise_for_status()
	    except requests.exceptions.RequestException as err:
	        print(f"Error fetching {url}: {err}")
	        return None
	    return reply.text


	def extract_internal_links(base_url, soup):
	    """Collect the absolute, same-host URLs linked from a parsed page.

	    Args:
	        base_url: URL of the page; relative hrefs are resolved against it.
	        soup: parsed document exposing ``find_all("a", href=True)``.

	    Returns:
	        set of absolute URLs on the same host as ``base_url``. The
	        ``#fragment`` part is removed, so in-page anchors such as
	        ``/page#top`` and ``/page#contact`` collapse into one entry instead
	        of making the scraper download the same page several times.
	    """
	    # Local import: the file-level import only brings in urljoin and urlparse.
	    from urllib.parse import urldefrag

	    links = set()
	    for anchor in soup.find_all("a", href=True):
	        # Resolve relative hrefs, then strip the fragment.
	        full_url = urldefrag(urljoin(base_url, anchor["href"])).url
	        if is_internal_link(base_url, full_url):
	            links.add(full_url)
	    return links


	def is_internal_link(base_url, link_url):
	    """Return True when both URLs have the same network location (host[:port])."""
	    return urlparse(base_url).netloc == urlparse(link_url).netloc


	def extract_pdf_text(pdf_url):
	    """Download a PDF and return the text of all its pages.

	    Args:
	        pdf_url: URL of the PDF document.

	    Returns:
	        The concatenated page text, or None when the download fails, the
	        PDF cannot be read, or no text could be extracted. Errors are
	        printed, not raised.
	    """
	    try:
	        # Same 10 second timeout as fetch_page_content, so an unresponsive
	        # server cannot stall the whole scrape.
	        response = requests.get(pdf_url, timeout=10)
	        response.raise_for_status()

	        # Parse the PDF directly from the downloaded bytes.
	        with BytesIO(response.content) as file:
	            reader = PdfReader(file)
	            # "or ''" guards against pages for which no text comes back, so
	            # one such page does not discard the text of the whole document.
	            pdf_text = "".join(page.extract_text() or "" for page in reader.pages)

	        return pdf_text if pdf_text else None
	    except requests.exceptions.RequestException as e:
	        print(f"Error fetching PDF {pdf_url}: {e}")
	        return None
	    except Exception as e:
	        print(f"Error reading PDF {pdf_url}: {e}")
	        return None


	def clean_body_content(html_content):
	    """Reduce an HTML document to its visible text, one non-empty line per row.

	    <script> and <style> elements are removed first; the remaining text is
	    split into lines, each line is stripped, and blank lines are dropped.
	    """
	    parsed = BeautifulSoup(html_content, "html.parser")

	    # Drop elements whose content is code or styling, not readable text.
	    for tag in parsed(["script", "style"]):
	        tag.extract()

	    raw_text = parsed.get_text(separator="\n")
	    stripped_lines = (line.strip() for line in raw_text.splitlines())
	    return "\n".join(line for line in stripped_lines if line)



	# if __name__ == "__main__":
	# website = [
	# #"https://www.rib.gov.rw/index.php?id=371",
	# "https://haguruka.org.rw/our-work/"
	# ]
	# all_content = scrape_websites(website)

	# # Temporary list to store (url, content) tuples
	# temp_list = []

	# # Process and store each URL with its content
	# for url, content in all_content.items():
	# temp_list.append((url, content))



	# processed_texts = []

	# # Process each element in the temporary list
	# for element in temp_list:
	# if isinstance(element, tuple):
	# url, content = element # Unpack the tuple
	# processed_texts.append(f"url: {url}, content: {content}")
	# elif isinstance(element, str):
	# processed_texts.append(element)
	# else:
	# processed_texts.append(str(element))

	# def chunk_string(s, chunk_size=2000):
	# return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]

	# # List to store the chunks
	# chunked_texts = []

	# for text in processed_texts:
	# chunked_texts.extend(chunk_string(text))

	# Texts that get embedded into the vector store. At the moment only the
	# spreadsheet values gathered in context_data are included.
	data = []
	data.extend(context_data)
	# Disabled sources: PDF chunks and scraped-website chunks, each added only
	# when not already present in data.
	#data.extend([item for item in text_only_chunks if item not in data])
	#data.extend([item for item in chunked_texts if item not in data])



	#from langchain_community.vectorstores import Chroma
	from langchain_chroma import Chroma



	# Chroma collection that holds the embedded knowledge-base texts. No
	# persist_directory is passed, so nothing here asks for on-disk storage.
	vectorstore = Chroma(
	    collection_name="GBV_data_set",
	    embedding_function=embed_model,
	)

	# Add data to the vector store. Skipped when no spreadsheet text was
	# loaded: an empty batch gives the store nothing to embed and, depending on
	# the chromadb version, may be rejected with an error during start-up.
	if data:
	    vectorstore.add_texts(data)
	else:
	    print("No context data found; the vector store starts empty.")


	# API key for the OpenRouter client, read from the "V1" environment
	# variable (None when the variable is not set).
	api= os.environ.get('V1')





	from openai import OpenAI
	from langchain_core.prompts import PromptTemplate
	from langchain_core.output_parsers import StrOutputParser
	from langchain_core.runnables import RunnablePassthrough
	import gradio as gr
	from typing import Iterator
	import time



	# Default prompt template for the GBV support chatbot. It expects four
	# placeholders: {conversation_history}, {context}, {first_name} and {question}.
	# Note: chatbot_interface() rebinds this global to a stricter template
	# before it builds the RAG chain.
	template = ("""
	You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your primary goal is to provide emotionally intelligent support while maintaining appropriate boundaries.
	You are a conversational AI. Respond directly and naturally to the user's input without displaying any system messages, backend processes, or 'thinking...' responses. Only provide the final response in a human-like and engaging manner.

	When responding follow these guidelines:

	1. Emotional Intelligence
	- Validate feelings without judgment (e.g., "It is completely understandable to feel this way")
	- Offer reassurance when appropriate, always centered on empowerment
	- Adjust your tone based on the emotional state conveyed

	2. Personalized Communication
	- Avoid contractions (e.g., use I am instead of I'm)
	- Incorporate thoughtful pauses or reflective questions when the conversation involves difficult topics
	- Use selective emojis (😊, 🤗, ❤️) only when tone-appropriate and not during crisis discussions
	- Balance warmth with professionalism

	3. Conversation Management
	- Refer to {conversation_history} to maintain continuity and avoid repetition
	- Keep responses concise unless greater detail is explicitly requested
	- Use clear paragraph breaks for readability
	- Prioritize immediate concerns before addressing secondary issues

	4. Information Delivery
	- Extract only relevant information from {context} that directly addresses the question
	- Present information in accessible, non-technical language
	- Organize resource recommendations in order of relevance and accessibility
	- Provide links [URL] only when specifically requested, prefaced with clear descriptions
	- When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"

	5. Safety and Ethics
	- Prioritize user safety in all responses
	- Never generate speculative content about their specific situation
	- Avoid phrases that could minimize experiences or create pressure
	- Include gentle reminders about professional help when discussing serious issues

	Your response should balance emotional support with practical guidance.

	Context: {context}
	User's Question: {question}
	Your Response:
	""")

	# Prompt object built from the default template. create_rag_chain() builds
	# its own PromptTemplate from the template it receives, so this
	# module-level object is not referenced elsewhere in the visible code.
	rag_prompt = PromptTemplate.from_template(template)

	# Retriever over the Chroma collection; passed to create_rag_chain() to look
	# up context documents for each question.
	retriever = vectorstore.as_retriever()

	import requests

	# Hugging Face access token, read from the "Token" environment variable
	# (None when the variable is not set).
	API_TOKEN = os.environ.get('Token')

	# Translation model queried by translate_text().
	model_name = "facebook/nllb-200-distilled-600M"

	# Inference endpoint for that model.
	url = f"https://api-inference.huggingface.co/models/{model_name}"

	# Send an Authorization header only when a token is configured; formatting
	# a missing token would otherwise transmit the literal "Bearer None".
	headers = {"Authorization": f"Bearer {API_TOKEN}"} if API_TOKEN else {}

	def translate_text(text, src_lang, tgt_lang):
	    """Translate text through the Hugging Face inference endpoint.

	    Args:
	        text: text to translate.
	        src_lang: source language code sent as ``src_lang`` (e.g. "eng_Latn").
	        tgt_lang: target language code sent as ``tgt_lang`` (e.g. "kin_Latn").

	    Returns:
	        The translated text. The original ``text`` is returned unchanged
	        when it is empty or blank, when the request fails or times out,
	        when the service answers with a non-200 status, or when the response
	        does not have the expected shape.
	    """
	    # Nothing to translate: avoid a pointless network round trip.
	    if not text or not text.strip():
	        return text

	    try:
	        response = requests.post(
	            url,
	            headers=headers,
	            json={
	                "inputs": text,
	                "parameters": {
	                    "src_lang": src_lang,
	                    "tgt_lang": tgt_lang
	                }
	            },
	            # Bound the wait so a stalled endpoint cannot hang the chat.
	            timeout=30,
	        )
	    except requests.exceptions.RequestException as e:
	        # Connection problems and timeouts fall back to the original text,
	        # the same as an error status does.
	        print(f"Translation error: {e}")
	        return text

	    if response.status_code != 200:
	        print(f"Translation error: {response.status_code}, {response.text}")
	        return text  # Return original text if translation fails

	    try:
	        result = response.json()
	        # The payload is either a list of results or a single result object.
	        if isinstance(result, list):
	            result = result[0]
	        return result['translation_text']
	    except (ValueError, LookupError, TypeError) as e:
	        # Invalid JSON, an empty list, or a payload without 'translation_text'.
	        print(f"Translation error: unexpected response ({e})")
	        return text


	class OpenRouterLLM:
	    """Thin streaming client for chat completions served through OpenRouter."""

	    def __init__(self, key: str):
	        """Create the OpenAI-compatible client pointed at OpenRouter.

	        Args:
	            key: OpenRouter API key.

	        Raises:
	            Exception: any error raised while building the client is printed
	                and re-raised.
	        """
	        try:
	            self.client = OpenAI(
	                base_url="https://openrouter.ai/api/v1",
	                api_key=key
	            )
	            # Extra HTTP headers sent along with every completion request.
	            self.headers = {
	                "HTTP-Referer": "http://localhost:3000",
	                "X-Title": "Local Development"
	            }
	        except Exception as e:
	            print(f"Initialization error: {e}")
	            raise

	    def stream(self, prompt: str) -> Iterator[str]:
	        """Yield the model's reply to ``prompt`` piece by piece.

	        The prompt is sent as a single user message. If the request or the
	        stream fails, one final string of the form "Streaming error: ..." is
	        yielded instead of raising.
	        """
	        try:
	            completion = self.client.chat.completions.create(
	                # Alternatives tried previously:
	                #   deepseek/deepseek-r1-distill-llama-70b:free
	                #   google/gemini-2.5-pro-exp-03-25:free
	                model="meta-llama/llama-3.3-70b-instruct:free",
	                messages=[{"role": "user", "content": prompt}],
	                stream=True,
	                # Pass on the headers prepared in __init__; they were built
	                # there but never attached to the request.
	                extra_headers=self.headers,
	            )

	            for chunk in completion:
	                # Some chunks carry no choices; indexing them would raise and
	                # inject an error message into the middle of the reply.
	                if not chunk.choices:
	                    continue
	                content = getattr(chunk.choices[0].delta, "content", None)
	                if content:
	                    yield content
	        except Exception as e:
	            yield f"Streaming error: {str(e)}"


	class UserSession:
	    """Holds the current user, the welcome message and the conversation history."""

	    def __init__(self, llm: "OpenRouterLLM"):
	        """Create an empty session.

	        Args:
	            llm: object with a ``stream(prompt)`` method, used to generate
	                the welcome message.
	        """
	        self.current_user = None
	        self.welcome_message = None   # HTML-wrapped, translated welcome for display
	        self._welcome_plain = None    # untranslated plain-text welcome for the LLM history
	        self.conversation_history = []
	        self.llm = llm

	    def set_user(self, user_info):
	        """Register the user, generate the welcome and restart the history.

	        Args:
	            user_info: dict describing the user; the "Nickname" key is used
	                for the welcome message ("Guest" when it is missing).
	        """
	        self.current_user = user_info
	        self._welcome_plain = None
	        self.set_welcome_message(user_info.get("Nickname", "Guest"))
	        # The history is later pasted into the LLM prompt, and every other
	        # entry in it is plain text in the system language. Seed it with the
	        # plain welcome, not the translated HTML <div> shown to the user.
	        opening = self._welcome_plain or self.get_welcome_message()
	        self.conversation_history = [
	            {"role": "assistant", "content": opening},
	        ]

	    def get_user(self):
	        """Return the user info dict passed to set_user, or None."""
	        return self.current_user

	    def set_welcome_message(self, Nickname, src_lang="eng_Latn", tgt_lang="kin_Latn"):
	        """Set a dynamic welcome message using the OpenRouterLLM.

	        The message is generated from a fixed prompt, translated from
	        ``src_lang`` to ``tgt_lang`` and wrapped in a styled <div>.
	        """
	        prompt = (
	            f"Create a very brief welcome message for {Nickname}. "
	            f"The message should: "
	            f"1. Welcome {Nickname} warmly and professionally. "
	            f"2. Emphasize that this is a safe and trusted space. "
	            f"3. Highlight specialized support for gender-based violence (GBV) and legal assistance. "
	            f"4. Use a tone that is warm, reassuring, and professional. "
	            f"5. Keep the message concise and impactful."
	        )

	        # Collect the whole streamed reply into one string.
	        welcome = "".join(self.llm.stream(prompt))
	        self._welcome_plain = welcome
	        welcome_text = translate_text(welcome, src_lang, tgt_lang)

	        # Format the message with HTML styling
	        self.welcome_message = (
	            f"<div style='font-size: 20px;'>"
	            f"{welcome_text}"
	            f"</div>"
	        )

	    def get_welcome_message(self):
	        """Return the HTML welcome message, or None if none was generated yet."""
	        return self.welcome_message

	    def add_to_history(self, role, message):
	        """Add a message to the conversation history"""
	        self.conversation_history.append({"role": role, "content": message})

	    def get_conversation_history(self):
	        """Get the full conversation history"""
	        return self.conversation_history

	    def get_formatted_history(self):
	        """Get conversation history formatted as a string for the LLM.

	        Each entry becomes "User: ..." or "Assistant: ..." followed by a
	        blank line; any role other than "user" is labelled "Assistant".
	        """
	        return "".join(
	            f"{'User' if entry['role'] == 'user' else 'Assistant'}: {entry['content']}\n\n"
	            for entry in self.conversation_history
	        )

	# Module-level wiring: one OpenRouter client and one UserSession. The
	# handlers defined below (collect_user_info, the closure built by
	# create_rag_chain, rag_memory_stream) all use this same user_session object.
	# NOTE(review): because the session is a single global, concurrent visitors
	# would presumably share one nickname and one history; confirm whether
	# per-user state is needed.
	api_key =api
	llm_instance = OpenRouterLLM(key=api_key)
	#llm_instance = model
	user_session = UserSession(llm_instance)


	def collect_user_info(Nickname):
	    """Register the nickname and switch the UI from registration to chat.

	    Args:
	        Nickname: text entered in the nickname box. Leading and trailing
	            whitespace is ignored, so a blank or whitespace-only entry counts
	            as missing.

	    Returns:
	        A 4-tuple, in the order expected by the click handler's outputs:
	        the message to display, an update for the chatbot container, an
	        update for the registration container, and the initial chat
	        history. When the nickname is missing, the chat stays hidden and the
	        history is empty.
	    """
	    nickname = (Nickname or "").strip()
	    if not nickname:
	        return "Nickname is required to proceed.", gr.update(visible=False), gr.update(visible=True), []

	    # Store user info for chat session
	    user_info = {
	        "Nickname": nickname,
	        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
	    }

	    # Set user in session (this also generates the welcome message)
	    user_session.set_user(user_info)
	    welcome_message = user_session.get_welcome_message()

	    # Open the conversation with the welcome message as an assistant turn.
	    chat_history = add_initial_message([(None, welcome_message)])

	    # Show the chat, hide the registration form.
	    return welcome_message, gr.update(visible=True), gr.update(visible=False), chat_history

	# Hook for seeding the conversation with an extra opening message.
	def add_initial_message(chatbot):
	    """Return the chat history unchanged.

	    No additional opening message is appended at present, so the caller
	    gets back the very same list it passed in.
	    """
	    history = chatbot
	    return history

	# Create RAG chain with user context and conversation history
	def create_rag_chain(retriever, template, api_key):
	    """Build the retrieval-augmented answer function.

	    Args:
	        retriever: object whose ``invoke(question)`` returns documents that
	            expose ``page_content``.
	        template: prompt template text with the placeholders ``context``,
	            ``question``, ``first_name`` and ``conversation_history``.
	        api_key: key handed to OpenRouterLLM.

	    Returns:
	        A function taking ``{"question": ...}`` and returning the LLM's
	        stream of response pieces.
	    """
	    llm = OpenRouterLLM(api_key)
	    rag_prompt = PromptTemplate.from_template(template)

	    def stream_func(input_dict):
	        question = input_dict["question"]

	        # Look up supporting documents and merge their text, one per line.
	        retrieved_docs = retriever.invoke(question)
	        context_str = "\n".join(doc.page_content for doc in retrieved_docs)

	        # Nickname of the registered user, "User" when nobody registered.
	        nickname = (user_session.get_user() or {}).get("Nickname", "User")

	        # Fill the template with context, question, name and prior turns.
	        prompt = rag_prompt.format(
	            context=context_str,
	            question=question,
	            first_name=nickname,
	            conversation_history=user_session.get_formatted_history(),
	        )

	        # Hand back the generator; the caller consumes the stream.
	        return llm.stream(prompt)

	    return stream_func

	# def rag_memory_stream(message, history):
	# # Add user message to history
	# user_session.add_to_history("user", message)

	# # Initialize with empty response
	# partial_text = ""
	# full_response = ""

	# # Use the rag_chain with the question
	# for new_text in rag_chain({"question": message}):
	# partial_text += new_text
	# full_response = partial_text
	# yield partial_text

	# # After generating the complete response, add it to history
	# user_session.add_to_history("assistant", full_response)


	def rag_memory_stream(message, history, user_lang="kin_Latn", system_lang="eng_Latn"):
	    """Answer one chat message, translating on the way in and on the way out.

	    The message is translated from ``user_lang`` to ``system_lang`` and
	    recorded in the session history. The complete answer is then collected
	    from ``rag_chain``, translated back to ``user_lang`` and yielded as a
	    single item. The untranslated answer is what gets stored in the
	    history. ``history`` is accepted but not used.
	    """
	    english_message = translate_text(message, user_lang, system_lang)
	    user_session.add_to_history("user", english_message)

	    # Drain the whole stream first: translation needs the complete answer.
	    full_response = "".join(rag_chain({"question": english_message}))

	    translated_response = translate_text(full_response, system_lang, user_lang)
	    user_session.add_to_history("assistant", full_response)

	    yield translated_response



	import gradio as gr


	# Re-binds api_key to the same value it was already given further up the file.
	api_key = api

	def chatbot_interface():
	"""Build and return the Gradio Blocks app (registration form plus chat).

	Side effects: rebinds the module-level ``template`` to the context-only
	prompt below and sets the module-level ``rag_chain`` used by
	rag_memory_stream.

	NOTE(review): the indentation of this function was lost in this copy of
	the file, so which statements sit inside each ``with`` block cannot be
	read off the text; confirm against the original layout before editing.
	"""
	# Local copy of the OpenRouter key, passed to create_rag_chain below.
	api_key = api

	global template

	# Stricter prompt than the module-level default: answers must come only
	# from the retrieved context. Uses the same four placeholders
	# ({conversation_history}, {context}, {question}, {first_name}).
	template = """
	You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your responses must be based EXCLUSIVELY on the information provided in the context. Your primary goal is to provide emotionally intelligent support while maintaining appropriate boundaries.

	Previous conversation: {conversation_history}
	Context information: {context}
	User's Question: {question}

	When responding follow these guidelines:

	1. Strict Context Adherence
	- Only use information that appears in the provided {context}
	- If the answer is not found in the context, state "I don't have that information in my available resources" rather than generating a response

	2. Personalized Communication
	- Avoid contractions (e.g., use I am instead of I'm)
	- Incorporate thoughtful pauses or reflective questions when the conversation involves difficult topics
	- Use selective emojis (😊, 🤗, ❤️) only when tone-appropriate and not during crisis discussions
	- Balance warmth with professionalism

	3. Emotional Intelligence
	- Validate feelings without judgment
	- Offer reassurance when appropriate, always centered on empowerment
	- Adjust your tone based on the emotional state conveyed

	4. Conversation Management
	- Refer to {conversation_history} to maintain continuity and avoid repetition
	- Use clear paragraph breaks for readability

	5. Information Delivery
	- Extract only relevant information from {context} that directly addresses the question
	- Present information in accessible, non-technical language
	- When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"


	6. Safety and Ethics
	- Do not generate any speculative content or advice not supported by the context
	- If the context contains safety information, prioritize sharing that information

	Your response must come entirely from the provided context, maintaining the supportive tone while never introducing information from outside the provided materials.

	Context: {context}
	User's Question: {question}
	Your Response:
	"""


	# Publish the chain as a global: rag_memory_stream looks it up by name.
	global rag_chain
	rag_chain = create_rag_chain(retriever, template, api_key)

	with gr.Blocks() as demo:
	# User registration section (the only part visible at first)
	with gr.Column(visible=True, elem_id="registration_container") as registration_container:
	gr.Markdown("### Your privacy matters to us! Just share a nickname you feel comfy with to start chatting..")

	with gr.Row():
	first_name = gr.Textbox(
	label="Nickname",
	placeholder="Enter your Nickname You feel comfy",
	scale=1,
	elem_id="input_nickname"
	)

	with gr.Row():
	submit_btn = gr.Button("Start Chatting", variant="primary", scale=2)

	# Receives the first value returned by collect_user_info (welcome or error text).
	response_message = gr.Markdown()

	# Chatbot section (initially hidden; collect_user_info makes it visible)
	with gr.Column(visible=False, elem_id="chatbot_container") as chatbot_container:
	chat_interface = gr.ChatInterface(
	fn=rag_memory_stream,
	title="Chat with GBVR",
	fill_height=True
	)

	# Footer with version info
	gr.Markdown("Ijwi ry'Ubufasha Chatbot v1.0.0 © 2025")

	# Handle user registration: the four outputs match, in order, the
	# 4-tuple returned by collect_user_info.
	submit_btn.click(
	collect_user_info,
	inputs=[first_name],
	outputs=[response_message, chatbot_container, registration_container, chat_interface.chatbot]
	)

	# Custom stylesheet, assigned to the Blocks object after it is built.
	demo.css = """
	:root {
	--background: #f0f0f0;
	--text: #000000;
	}

	body, .gradio-container {
	margin: 0;
	padding: 0;
	width: 100vw;
	height: 100vh;
	display: flex;
	flex-direction: column;
	justify-content: center;
	align-items: center;
	background: var(--background);
	color: var(--text);
	}

	.gradio-container {
	max-width: 100%;
	max-height: 100%;
	}

	.gr-box {
	background: var(--background);
	color: var(--text);
	border-radius: 12px;
	padding: 2rem;
	border: 1px solid rgba(0, 0, 0, 0.1);
	box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
	}

	.gr-button-primary {
	background: var(--background);
	color: var(--text);
	padding: 12px 24px;
	border-radius: 8px;
	transition: all 0.3s ease;
	border: 1px solid rgba(0, 0, 0, 0.1);
	}

	.gr-button-primary:hover {
	transform: translateY(-1px);
	box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
	}

	footer {
	text-align: center;
	color: var(--text);
	opacity: 0.7;
	padding: 1rem;
	font-size: 0.9em;
	}

	.gr-markdown h3 {
	color: var(--text);
	margin-bottom: 1rem;
	}

	.registration-markdown, .chat-title h1 {
	color: var(--text);
	}
	"""

	# The caller is responsible for launching the returned app.
	return demo

	# Launch the interface
	if __name__ == "__main__":
	    # Build the interface first, then start it with share=True.
	    app = chatbot_interface()
	    app.launch(share=True)