from pathlib import Path
import os

import cv2
import gradio as gr
from fastrtc import (
    AdditionalOutputs,
    Stream,
    WebRTC,
    get_cloudflare_turn_credentials,
    get_current_context,
)

from app.agent import build_agent
from app.config import env
from app.memory import Memory, Message
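
# Per-session state: each WebRTC connection gets its own Memory (agent plus chat
# history), created lazily in get_session_memory() and keyed by the webrtc_id.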
session_memories = {}


def get_session_memory(session_id: str | None = None) -> Memory:
    if session_id not in session_memories:
        session_memories[session_id] = Memory(build_agent())
        welcome_message = "Now I can see. Feel free to ask me about anything!"
        session_memories[session_id].chat.append(Message.assistant(welcome_message))
    return session_memories[session_id]
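

# Per-frame callback for the WebRTC stream: converts the frame to RGB, enqueues it
# for the agent tied to this session, mirrors any tool activity into the chat
# history, and returns the frame plus AdditionalOutputs for the chatbot and state.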
def video_handler(frame):
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    rtcid = get_current_context().webrtc_id
    mem = get_session_memory(rtcid)
    if (s := mem.enqueue(frame)):
        if mem.chat.history[-1].metadata.get('status') == 'pending':
            mem.chat.history[-1] = Message.tool(s.gr, title=s.sender, status=s.status)
        else:
            mem.chat.append(Message.tool(s.gr, title=s.sender, status=s.status))
    return frame, AdditionalOutputs(mem.chat.messages, rtcid)
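

# Textbox callback: refuses input until a camera session exists, otherwise hands
# the user's message to the session's agent (unless it is still busy) and returns
# the updated chat history.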
def chat_handler(text, webrtc_state):
    if webrtc_state is None:
        return "", [{"role": "assistant", "content": "Please start your camera first to begin the conversation."}], webrtc_state
    mem = get_session_memory(webrtc_state)
    if not mem.is_running:
        mem.receive(text.strip())
    return "", mem.chat.messages, webrtc_state


if __name__ == "__main__":
    print("Starting Perceptual Copilot...")
    print(f"HF Spaces: {os.getenv('SPACE_ID') is not None}")
    print(f"Environment check - API_KEY: {'✅' if env.api_key else '❌'}")
    print(f"Environment check - END_LANG: {'✅' if env.end_lang else '❌'}")
    print(f"Environment check - OpenAI Client: {'✅' if env.client else '❌'}")
    with gr.Blocks(
        title="🤖 Perceptual Copilot - AI Vision Assistant",
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="orange",
            neutral_hue="slate",
            font=("system-ui", "sans-serif"),
        ),
        css=Path("styles.css").read_text(),
    ) as demo:
        # Header section with sleek styling
        gr.Markdown("""
        <div class="ultra-sleek-header">
            <h1 class="hero-title">
                <span class="title-primary">Perceptual</span>
                <span class="title-accent">Copilot</span>
            </h1>
            <p class="hero-subtitle">
                <span class="status-dot"></span>
                An experimental prototype that integrates OpenAI agents with visual tools to process real-time video streams.
            </p>
            <div class="feature-pills">
                <span class="pill">Real-time streaming</span>
                <span class="pill">Visual Agent</span>
                <span class="pill">Large vision language model</span>
                <span class="pill">Reasoning</span>
            </div>
        </div>
        """, elem_classes="ultra-sleek-header")

        state = gr.State(value=None)

        # Main interface with improved layout
        with gr.Row(equal_height=True):
            with gr.Column(scale=1, elem_classes="video-container"):
                video = WebRTC(
                    label="🎥 Camera Stream",
                    rtc_configuration=get_cloudflare_turn_credentials(hf_token=env.hf_token),
                    track_constraints={
                        "width": {"exact": 600},
                        "height": {"exact": 600},
                        "aspectRatio": {"exact": 1},
                    },
                    mode="send",
                    modality="video",
                    mirror_webcam=True,
                    width=600,
                    height=600,
                )
| with gr.Column(scale=1, elem_classes="chat-container"): | |
| gr.Markdown("### π¬ Chat") | |
| chatbot = gr.Chatbot( | |
| type="messages", | |
| height=450, | |
| label="π€ AI Assistant", | |
| placeholder="Chat history will appear here...", | |
| show_label=False, | |
| ) | |
| with gr.Row(elem_classes="items-center"): | |
| textbox = gr.Textbox( | |
| placeholder="π Question goes here, press ENTER to send", | |
| lines=1, | |
| show_label=False, | |
| ) | |

        # Event handlers
        video.stream(
            fn=video_handler,
            inputs=[video],
            outputs=[video],
            concurrency_limit=10,
        )
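        # AdditionalOutputs emitted by video_handler (chat messages + webrtc_id)
        # are routed to the chatbot and to the session-id state.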
        video.on_additional_outputs(
            fn=lambda messages, webrtc_id: (messages, webrtc_id),
            outputs=[chatbot, state],
        )

        # Chat handler for textbox
        textbox.submit(
            chat_handler,
            inputs=[textbox, state],
            outputs=[textbox, chatbot, state],
        )

        # Enhanced instructions section
        with gr.Column(elem_classes="instructions-container"):
            gr.Markdown("""
            ## Get Started

            **Quick Reminder:**
            1. Allow camera access when prompted
            2. Wait for the camera to initialize and the first message to appear
            3. 💡 **Tip:** If you find it hard to see the interface, please turn off night mode for better visibility
            """)

            with gr.Row():
                with gr.Column():
                    gr.Markdown("""
                    ### 💡 Example Prompts

                    **General Vision:**
                    - *"What do you see in front of me?"*
                    - *"What's the overall environment like?"*

                    **Text & Documents:**
                    - *"Read the text in this document"*
                    - *"Extract the code snippet from this image"*

                    **Object Recognition:**
                    - *"What objects are visible?"*
                    - *"Help me identify this item"*
                    """)
                with gr.Column():
                    gr.Markdown("""
                    ### Current Capabilities

                    **Available Features:**
                    - **OCR** - Text extraction and reading
                    - **Q&A** - Visual question answering
                    - **Caption** - Scene description and analysis
                    - **Localization** - Object detection and positioning
                    - **Time** - Current time and temporal context

                    **More Coming Soon:**
                    We're continuously adding new capabilities to enhance your visual AI experience.

                    **⚠️ Important Note:**
                    All models are self-hosted. Please avoid abuse of the system.
                    """)

    demo.queue(default_concurrency_limit=None)
    demo.launch()