import matplotlib
matplotlib.use('Agg')  # Use non-GUI backend (server-side rendering only)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import time
import faiss
from sentence_transformers import SentenceTransformer, util
from sklearn.decomposition import PCA
import textwrap
from sklearn.metrics.pairwise import cosine_similarity

from utils.model_loader import load_embedding_model
from utils.helpers import fig_to_html, df_to_html_table


def vector_embeddings_handler(text_input, search_query=""):
    """Show vector embeddings and semantic search capabilities.

    Builds an HTML report for *text_input*: splits it into sentence-level
    segments, embeds each segment with the shared SentenceTransformer model,
    and renders processing statistics plus a semantic-search introduction.
    Any failure during model loading / embedding is caught and rendered as an
    error box instead of raising.

    Args:
        text_input: Raw text to segment and embed.
        search_query: Accepted for interface compatibility; not used here
            (searching is handled by ``perform_semantic_search``).

    Returns:
        str: The assembled HTML fragments joined with newlines.

    NOTE(review): the original HTML markup inside these string literals was
    lost when the file was mangled; the tags below are a minimal
    reconstruction around the surviving text content — confirm against the
    rendering templates.
    """
    output_html = []

    # Result-area container and report header.
    output_html.append('<div class="result-area">')
    output_html.append('<h2>Vector Embeddings Analysis Results</h2>')
    output_html.append("""
    <div class="success-box">
      <h3>Embeddings Generated Successfully!</h3>
      <p>Your text has been processed and converted into high-dimensional
      vector representations.</p>
    </div>
    """)

    try:
        model = load_embedding_model()

        # Split the text into chunks (sentences); the module-level spacy
        # import is used (the original re-imported spacy here redundantly).
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(text_input)
        sentences = [sent.text.strip() for sent in doc.sents
                     if len(sent.text.strip()) > 10]

        # If we have too few sentences, fall back to fixed-size word chunks.
        # BUGFIX: the original dropped the trailing partial chunk
        # (``if i+chunk_size <= len(words)``), losing text and producing an
        # empty list for short inputs; every chunk is kept now.
        if len(sentences) < 3:
            words = text_input.split()
            chunk_size = max(10, len(words) // 3)
            sentences = [' '.join(words[i:i + chunk_size])
                         for i in range(0, len(words), chunk_size)]

        # Limit to 10 segments to avoid overwhelming the visualization.
        if len(sentences) > 10:
            sentences = sentences[:10]

        # Create embeddings: one vector per text segment.
        embeddings = model.encode(sentences)

        # Text statistics (characters, segments, vector shape).
        output_html.append(f"""
        <div class="stats-box">
          <h3>Processing Statistics</h3>
          <div><strong>{len(text_input)}</strong> Characters</div>
          <div><strong>{len(sentences)}</strong> Text Segments</div>
          <div><strong>{embeddings.shape[1]}</strong> Vector Dimensions</div>
          <div><strong>{embeddings.shape[0]}</strong> Embedding Vectors</div>
        </div>
        """)

        # Text segments display (show max 6 segments).
        output_html.append('<h3>Text Segments</h3>')
        for i, sentence in enumerate(sentences[:6]):
            output_html.append(f"""
            <div class="segment">
              <strong>Segment {i + 1}</strong>
              <p>{sentence}</p>
            </div>
            """)

        # Semantic search interface introduction.
        output_html.append("""
        <h3>Semantic Search</h3>
        <p>Search for content by meaning, not just keywords. The system will
        find the most semantically similar text segments.</p>
        <p>Try these example searches:</p>
        """)
    except Exception as e:
        # Best-effort: render the failure inline rather than raising, so the
        # page still returns usable HTML.
        output_html.append(f"""
        <div class="error-box">
          <h3>Error</h3>
          <p>Could not generate embeddings: {str(e)}</p>
        </div>
        """)

    # Close result-area div.
    output_html.append('</div>')

    # Add About section at the end.
    output_html.append(get_about_section())

    return '\n'.join(output_html)
def perform_semantic_search(context, query):
    """Perform semantic search on the given context with the query.

    Splits *context* into sentences, embeds sentences and *query* with the
    shared SentenceTransformer model, and ranks sentences by cosine
    similarity to the query.

    Args:
        context: Text corpus to search within.
        query: Natural-language search query.

    Returns:
        dict: ``{"success": True, "results": [{"text": str, "score": float},
        ...]}`` with at most the top 5 matches (descending score), or
        ``{"success": False, "error": str}`` when anything fails.
    """
    try:
        model = load_embedding_model()

        # Split context into sentences; uses the module-level spacy import
        # (the original redundantly re-imported spacy and util here).
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(context)
        sentences = [sent.text.strip() for sent in doc.sents
                     if len(sent.text.strip()) > 5]

        # Create embeddings for all sentences and the query.
        sentence_embeddings = model.encode(sentences)
        query_embedding = model.encode([query])[0]

        # Cosine similarity of the query against every sentence.
        similarities = util.pytorch_cos_sim(
            query_embedding, sentence_embeddings)[0].cpu().numpy()

        # Pair each sentence with its score and sort best-first.
        results = [(text, float(score))
                   for text, score in zip(sentences, similarities)]
        results.sort(key=lambda pair: pair[1], reverse=True)

        # Return top 5 results.
        return {
            "success": True,
            "results": [{"text": text, "score": score}
                        for text, score in results[:5]],
        }
    except Exception as e:
        return {
            "success": False,
            "error": str(e),
        }


def get_about_section():
    """Generate the About Vector Embeddings section.

    NOTE(review): the HTML tags in the original literal were lost when the
    file was mangled; the markup below is a minimal reconstruction around
    the surviving text — confirm against the rendering templates.
    """
    return """
    <div class="about-section">
      <h2>About Vector Embeddings</h2>
      <h3>What are Vector Embeddings?</h3>
      <p>Vector embeddings are numerical representations of text that capture
      semantic meaning in high-dimensional space. They convert words,
      sentences, or documents into dense vectors where similar content has
      similar vector representations.</p>
      <h3>Applications of Vector Embeddings:</h3>
      <h3>How It Works:</h3>
      <p>Our system uses the SentenceTransformer model to create embeddings
      that capture the semantic meaning of your text. The cosine similarity
      between vectors determines how related different pieces of content are,
      enabling powerful semantic search capabilities.</p>
    </div>
    """