from pathlib import Path
import os

import cv2
import gradio as gr
from fastrtc import (
    AdditionalOutputs,
    Stream,
    WebRTC,
    get_cloudflare_turn_credentials,
    get_current_context,
)

from app.agent import build_agent
from app.config import env
from app.memory import Memory, Message
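
# Per-session state: each WebRTC connection gets its own Memory (agent plus chat
# history), created lazily in get_session_memory() and keyed by the webrtc_id.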
session_memories = {}


def get_session_memory(session_id: str | None = None) -> Memory:
    if session_id not in session_memories:
        session_memories[session_id] = Memory(build_agent())
        welcome_message = "Now I can see. Feel free to ask me about anything!"
        session_memories[session_id].chat.append(Message.assistant(welcome_message))
    return session_memories[session_id]
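

# Per-frame callback for the WebRTC stream: converts the frame to RGB, enqueues it
# for the agent tied to this session, mirrors any tool activity into the chat
# history, and returns the frame plus AdditionalOutputs for the chatbot and state.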
def video_handler(frame):
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    rtcid = get_current_context().webrtc_id
    mem = get_session_memory(rtcid)
    if (s := mem.enqueue(frame)):
        if mem.chat.history[-1].metadata.get('status') == 'pending':
            mem.chat.history[-1] = Message.tool(s.gr, title=s.sender, status=s.status)
        else:
            mem.chat.append(Message.tool(s.gr, title=s.sender, status=s.status))
    return frame, AdditionalOutputs(mem.chat.messages, rtcid)
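

# Textbox callback: refuses input until a camera session exists, otherwise hands
# the user's message to the session's agent (unless it is still busy) and returns
# the updated chat history.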
def chat_handler(text, webrtc_state):
    if webrtc_state is None:
        return "", [{"role": "assistant", "content": "Please start your camera first to begin the conversation."}], webrtc_state
    mem = get_session_memory(webrtc_state)
    if not mem.is_running:
        mem.receive(text.strip())
    return "", mem.chat.messages, webrtc_state


if __name__ == "__main__":
    print("Starting Perceptual Copilot...")
    print(f"HF Spaces: {os.getenv('SPACE_ID') is not None}")
    print(f"Environment check - API_KEY: {'✅' if env.api_key else '❌'}")
    print(f"Environment check - END_LANG: {'✅' if env.end_lang else '❌'}")
    print(f"Environment check - OpenAI Client: {'✅' if env.client else '❌'}")
    with gr.Blocks(
        title="🤖 Perceptual Copilot - AI Vision Assistant",
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="orange",
            neutral_hue="slate",
            font=("system-ui", "sans-serif"),
        ),
        css=Path("styles.css").read_text(),
    ) as demo:
        # Header section with sleek styling
        gr.Markdown("""
        <div class="ultra-sleek-header">
            <h1 class="hero-title">
                <span class="title-primary">Perceptual</span>
                <span class="title-accent">Copilot</span>
            </h1>
            <p class="hero-subtitle">
                <span class="status-dot"></span>
                An experimental prototype that integrates OpenAI agents with visual tools to process real-time video streams.
            </p>
            <div class="feature-pills">
                <span class="pill">Real-time streaming</span>
                <span class="pill">Visual Agent</span>
                <span class="pill">Large vision language model</span>
                <span class="pill">Reasoning</span>
            </div>
        </div>
        """, elem_classes="ultra-sleek-header")

        state = gr.State(value=None)

        # Main interface with improved layout
        with gr.Row(equal_height=True):
            with gr.Column(scale=1, elem_classes="video-container"):
                video = WebRTC(
                    label="🎥 Camera Stream",
                    rtc_configuration=get_cloudflare_turn_credentials(hf_token=env.hf_token),
                    track_constraints={
                        "width": {"exact": 600},
                        "height": {"exact": 600},
                        "aspectRatio": {"exact": 1},
                    },
                    mode="send",
                    modality="video",
                    mirror_webcam=True,
                    width=600,
                    height=600,
                )
| with gr.Column(scale=1, elem_classes="chat-container"): | |
| gr.Markdown("### π¬ Chat") | |
| chatbot = gr.Chatbot( | |
| type="messages", | |
| height=450, | |
| label="π€ AI Assistant", | |
| placeholder="Chat history will appear here...", | |
| show_label=False, | |
| ) | |
| with gr.Row(elem_classes="items-center"): | |
| textbox = gr.Textbox( | |
| placeholder="π Question goes here, press ENTER to send", | |
| lines=1, | |
| show_label=False, | |
| ) | |

        # Event handlers
        video.stream(
            fn=video_handler,
            inputs=[video],
            outputs=[video],
            concurrency_limit=10,
        )
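        # AdditionalOutputs emitted by video_handler (chat messages + webrtc_id)
        # are routed to the chatbot and to the session-id state.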
        video.on_additional_outputs(
            fn=lambda messages, webrtc_id: (messages, webrtc_id),
            outputs=[chatbot, state],
        )

        # Chat handler for textbox
        textbox.submit(
            chat_handler,
            inputs=[textbox, state],
            outputs=[textbox, chatbot, state],
        )

        # Enhanced instructions section
        with gr.Column(elem_classes="instructions-container"):
            gr.Markdown("""
            ## Get Started

            **Quick Reminder:**
            1. Allow camera access when prompted
            2. Wait for the camera to initialize and the first message to appear
            3. 💡 **Tip:** If you find it hard to see the interface, please turn off night mode for better visibility
            """)

            with gr.Row():
                with gr.Column():
                    gr.Markdown("""
                    ### 💡 Example Prompts

                    **General Vision:**
                    - *"What do you see in front of me?"*
                    - *"What's the overall environment like?"*

                    **Text & Documents:**
                    - *"Read the text in this document"*
                    - *"Extract the code snippet from this image"*

                    **Object Recognition:**
                    - *"What objects are visible?"*
                    - *"Help me identify this item"*
                    """)
                with gr.Column():
                    gr.Markdown("""
                    ### Current Capabilities

                    **Available Features:**
                    - **OCR** - Text extraction and reading
                    - **Q&A** - Visual question answering
                    - **Caption** - Scene description and analysis
                    - **Localization** - Object detection and positioning
                    - **Time** - Current time and temporal context

                    **More Coming Soon:**
                    We're continuously adding new capabilities to enhance your visual AI experience.

                    **⚠️ Important Note:**
                    All models are self-hosted. Please avoid abuse of the system.
                    """)

    demo.queue(default_concurrency_limit=None)
    demo.launch()