base upload #204, opened by onkar127
- agentsList.py +340 -0
- app.py +22 -5
- tools/__init__.py +19 -0
- tools/chess_tools.py +126 -0
- tools/classifier_tool.py +89 -0
- tools/content_retriever_tool.py +89 -0
- tools/get_attachment_tool.py +77 -0
- tools/google_search_tools.py +90 -0
- tools/speech_recognition_tool.py +113 -0
- tools/youtube_video_tool.py +383 -0
agentsList.py
ADDED
@@ -0,0 +1,340 @@
from typing import TypedDict, Optional
from langgraph.graph import StateGraph, START, END
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from rich.console import Console
from smolagents import (
    CodeAgent,
    ToolCallingAgent,
    OpenAIServerModel,
    AgentLogger,
    LogLevel,
    Panel,
    Text,
)
from tools import (
    GetAttachmentTool,
    GoogleSearchTool,
    GoogleSiteSearchTool,
    ContentRetrieverTool,
    YoutubeVideoTool,
    SpeechRecognitionTool,
    ClassifierTool,
    ImageToChessBoardFENTool,
    chess_engine_locator,
)
import openai
import backoff


def create_genai_agent(verbosity: int = LogLevel.INFO):
    get_attachment_tool = GetAttachmentTool()
    speech_recognition_tool = SpeechRecognitionTool()
    env_tools = [
        get_attachment_tool,
    ]
    model = OpenAIServerModel(model_id="gpt-4.1")
    console = Console(record=True)
    logger = AgentLogger(level=verbosity, console=console)
    steps_buffer = []

    def capture_step_log(agent) -> None:
        steps_buffer.append(console.export_text(clear=True))

    agents = {
        agent.name: agent
        for agent in [
            ToolCallingAgent(
                name="general_assistant",
                description="Answers questions to the best of its knowledge and common reasoning, grounded in already known information. Can understand multimedia including audio and video files and YouTube.",
                model=model,
                tools=env_tools
                + [
                    speech_recognition_tool,
                    YoutubeVideoTool(
                        client=model.client,
                        speech_recognition_tool=speech_recognition_tool,
                        frames_interval=3,
                        chunk_duration=60,
                        debug=True,
                    ),
                    ClassifierTool(
                        client=model.client,
                        model_id="gpt-4.1-mini",
                    ),
                ],
                logger=logger,
                step_callbacks=[capture_step_log],
            ),
            ToolCallingAgent(
                name="web_researcher",
                description="Answers questions that require grounding in unknown information through search on web sites and other online resources.",
                tools=env_tools
                + [
                    GoogleSearchTool(),
                    GoogleSiteSearchTool(),
                    ContentRetrieverTool(),
                ],
                model=model,
                planning_interval=3,
                max_steps=9,
                logger=logger,
                step_callbacks=[capture_step_log],
            ),
            CodeAgent(
                name="data_analyst",
                description="Data analyst with advanced skills in statistics, handling tabular data and related Python packages.",
                tools=env_tools,
                additional_authorized_imports=[
                    "numpy",
                    "pandas",
                    "tabulate",
                    "matplotlib",
                    "seaborn",
                ],
                model=model,
                logger=logger,
                step_callbacks=[capture_step_log],
            ),
            CodeAgent(
                name="chess_player",
                description="Chess grandmaster empowered by chess engine. Always thinks at least 100 steps ahead.",
                tools=env_tools
                + [
                    ImageToChessBoardFENTool(client=model.client),
                    chess_engine_locator,
                ],
                additional_authorized_imports=[
                    "chess",
                    "chess.engine",
                ],
                model=model,
                logger=logger,
                step_callbacks=[capture_step_log],
            ),
        ]
    }

    class GAIATask(TypedDict):
        task_id: Optional[str]
        question: str
        steps: list[str]
        agent: Optional[str]
        raw_answer: Optional[str]
        final_answer: Optional[str]

    llm = ChatOpenAI(model="gpt-4.1")
    logger = AgentLogger(level=verbosity)

    @backoff.on_exception(backoff.expo, openai.RateLimitError, max_time=60, max_tries=6)
    def llm_invoke_with_retry(messages):
        response = llm.invoke(messages)
        return response

    def read_question(state: GAIATask):
        logger.log_task(
            content=state["question"].strip(),
            subtitle=f"LangGraph with {type(llm).__name__} - {llm.model_name}",
            level=LogLevel.INFO,
            title="Final Assignment Agent for Hugging Face Agents Course",
        )
        get_attachment_tool.attachment_for(state["task_id"])

        return {
            "steps": [],
            "agent": None,
            "raw_answer": None,
            "final_answer": None,
        }

    def select_agent(state: GAIATask):
        agents_description = "\n\n".join(
            [
                f"AGENT NAME: {a.name}\nAGENT DESCRIPTION: {a.description}"
                for a in agents.values()
            ]
        )

        prompt = f"""\
You are a general AI assistant.

I will provide you a question and a list of agents with their descriptions.
Your task is to select the most appropriate agent to answer the question.
You can select one of the agents or decide that no agent is needed.

If the question has an attachment, only an agent can answer it.

QUESTION:
{state["question"]}

{agents_description}

Now, return the name of the agent you selected or "no agent needed" if you think that no agent is needed.
"""

        response = llm_invoke_with_retry([HumanMessage(content=prompt)])
        agent_name = response.content.strip()

        if agent_name in agents:
            logger.log(
                f"Agent {agent_name} selected for solving the task.",
                level=LogLevel.DEBUG,
            )
            return {
                "agent": agent_name,
                "steps": state.get("steps", [])
                + [
                    f"Agent '{agent_name}' selected for task execution.",
                ],
            }
        elif agent_name == "no agent needed":
            logger.log(
                "No appropriate agent found in the list. No agent will be used.",
                level=LogLevel.DEBUG,
            )
            return {
                "agent": None,
                "steps": state.get("steps", [])
                + [
                    "A decision is made to solve the task directly without invoking any agent.",
                ],
            }
        else:
            logger.log(
                f"[bold red]Warning to user: Unexpected agent name '{agent_name}' selected. No agent will be used.[/bold red]",
                level=LogLevel.INFO,
            )
            return {
                "agent": None,
                "steps": state.get("steps", [])
                + [
                    f"Attempt to select non-existing agent '{agent_name}'. No agent will be used.",
                ],
            }

    def delegate_to_agent(state: GAIATask):
        agent_name = state.get("agent", None)
        if not agent_name:
            raise ValueError("Agent not selected.")
        if agent_name not in agents:
            raise ValueError(f"Agent '{agent_name}' is not available.")

        logger.log(
            Panel(Text(f"Calling agent: {agent_name}.")),
            level=LogLevel.INFO,
        )

        agent = agents[agent_name]
        agent_answer = agent.run(task=state["question"])
        steps = [f"Agent '{agent_name}' step:\n{s}" for s in steps_buffer]
        steps_buffer.clear()
        return {
            "raw_answer": agent_answer,
            "steps": state.get("steps", []) + steps,
        }

    def one_shot_answering(state: GAIATask):
        response = llm_invoke_with_retry([HumanMessage(content=state.get("question"))])
        return {
            "raw_answer": response.content,
            "steps": state.get("steps", [])
            + [
                f"One-shot answer:\n{response.content}",
            ],
        }

    def refine_answer(state: GAIATask):
        question = state.get("question")
        answer = state.get("raw_answer", None)
        if not answer:
            return {"final_answer": "No answer."}

        prompt = f"""\
You are a general AI assistant.

I will provide you a question and the correct answer to it. The answer is correct but may be too verbose or not follow the rules below.
Your task is to rephrase the answer according to the rules below.

The answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.

If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.

If you are asked for a comma separated list, use a space after the comma and before the next element of the list unless directly specified otherwise in the question.
Check the question context to decide whether letter case matters. Do not change case if not prescribed by other rules or the question.
If you are not asked for a list, capitalize the first letter of the answer unless it changes the meaning of the answer.
If the answer is a number, use digits only, not words, unless directly specified otherwise in the question.
If the answer is not a full sentence, do not add a period at the end.

Preserve all items if the answer is a list.

QUESTION:
{question}

ANSWER:
{answer}
"""
        response = llm_invoke_with_retry([HumanMessage(content=prompt)])
        refined_answer = response.content.strip()
        logger.log(
            Text(f"GAIA final answer: {refined_answer}", style="bold #d4b702"),
            level=LogLevel.INFO,
        )
        return {
            "final_answer": refined_answer,
            "steps": state.get("steps", [])
            + [
                "Refining the answer according to GAIA benchmark rules.",
                f"FINAL ANSWER: {response.content}",
            ],
        }

    def route_task(state: GAIATask) -> str:
        if state.get("agent") in agents:
            return "agent selected"
        else:
            return "no agent matched"

    # Create the graph
    gaia_graph = StateGraph(GAIATask)

    # Add nodes
    gaia_graph.add_node("read_question", read_question)
    gaia_graph.add_node("select_agent", select_agent)
    gaia_graph.add_node("delegate_to_agent", delegate_to_agent)
    gaia_graph.add_node("one_shot_answering", one_shot_answering)
    gaia_graph.add_node("refine_answer", refine_answer)

    # Start the edges
    gaia_graph.add_edge(START, "read_question")
    # Add edges - defining the flow
    gaia_graph.add_edge("read_question", "select_agent")

    # Add conditional branching from select_agent
    gaia_graph.add_conditional_edges(
        "select_agent",
        route_task,
        {"agent selected": "delegate_to_agent", "no agent matched": "one_shot_answering"},
    )

    # Add the final edges
    gaia_graph.add_edge("delegate_to_agent", "refine_answer")
    gaia_graph.add_edge("one_shot_answering", "refine_answer")
    gaia_graph.add_edge("refine_answer", END)

    gaia = gaia_graph.compile()
    return gaia
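Note on usage: the compiled graph routes read_question -> select_agent -> (delegate_to_agent | one_shot_answering) -> refine_answer. A minimal sketch of driving it outside the Gradio app, assuming OPENAI_API_KEY is configured; the question and task_id below are placeholders, not part of the diff:

from agentsList import create_genai_agent

# Builds and compiles the StateGraph defined above.
gaia = create_genai_agent()

# task_id can be None for questions without an attachment.
result = gaia.invoke({
    "task_id": None,
    "question": "What is the capital of France?",
})

print(result["final_answer"])   # refined, GAIA-formatted answer
for step in result["steps"]:    # captured per-step logs
    print(step)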
app.py
CHANGED
@@ -3,21 +3,26 @@ import gradio as gr
 import requests
 import inspect
 import pandas as pd
+import agentsList
 
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 # --- Basic Agent Definition ---
-# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
 class BasicAgent:
     def __init__(self):
+        self.genaiAgent = agentsList.create_genai_agent()
         print("BasicAgent initialized.")
-    def __call__(self, question: str) -> str:
+    def __call__(self, task_id: str, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-        fixed_answer = "This is a default answer."
-        print(f"Agent returning fixed answer: {fixed_answer}")
-        return fixed_answer
+        task = self.genaiAgent.invoke({
+            "task_id": task_id,
+            "question": question,
+        })
+        final_answer = task.get("final_answer")
+        print(f"Agent returning final answer: {final_answer}")
+        return task["final_answer"]
 
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
@@ -34,6 +39,18 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         print("User not logged in.")
         return "Please Login to Hugging Face with the button.", None
 
+    # --- Allow only space owner to run agent to avoid misuse ---
+    if not space_id.startswith(username.strip()):
+        print("User is not an owner of the space. Please duplicate space and configure OPENAI_API_KEY, HF_TOKEN, GOOGLE_SEARCH_API_KEY, and GOOGLE_SEARCH_ENGINE_ID environment variables.")
+        return "Please duplicate space to your account to run the agent.", None
+
+    # --- Check for required environment variables ---
+    required_env_vars = ["OPENAI_API_KEY", "HF_TOKEN", "GOOGLE_SEARCH_API_KEY", "GOOGLE_SEARCH_ENGINE_ID"]
+    missing_env_vars = [var for var in required_env_vars if not os.getenv(var)]
+    if missing_env_vars:
+        print(f"Missing environment variables: {', '.join(missing_env_vars)}")
+        return f"Missing environment variables: {', '.join(missing_env_vars)}", None
+
     api_url = DEFAULT_API_URL
     questions_url = f"{api_url}/questions"
     submit_url = f"{api_url}/submit"
@@ -80,7 +97,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            submitted_answer = agent(question_text)
+            submitted_answer = agent(task_id=task_id, question=question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
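A quick local smoke test of the new BasicAgent wiring, outside run_and_submit_all, could look like this (hypothetical snippet, not part of the diff; the task id is a placeholder and real ids come from the scoring API's /questions endpoint):

import os
from app import BasicAgent

# Assumes OPENAI_API_KEY (and, for web search, GOOGLE_SEARCH_API_KEY /
# GOOGLE_SEARCH_ENGINE_ID) are already exported in the environment.
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY must be set"

agent = BasicAgent()
answer = agent(task_id="demo-task-id", question="How many continents are there?")
print(answer)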
tools/__init__.py
ADDED
@@ -0,0 +1,19 @@
from .get_attachment_tool import GetAttachmentTool
from .google_search_tools import GoogleSearchTool, GoogleSiteSearchTool
from .content_retriever_tool import ContentRetrieverTool
from .speech_recognition_tool import SpeechRecognitionTool
from .youtube_video_tool import YoutubeVideoTool
from .classifier_tool import ClassifierTool
from .chess_tools import ImageToChessBoardFENTool, chess_engine_locator

__all__ = [
    "GetAttachmentTool",
    "GoogleSearchTool",
    "GoogleSiteSearchTool",
    "ContentRetrieverTool",
    "SpeechRecognitionTool",
    "YoutubeVideoTool",
    "ClassifierTool",
    "ImageToChessBoardFENTool",
    "chess_engine_locator",
]
tools/chess_tools.py
ADDED
@@ -0,0 +1,126 @@
from smolagents import Tool, tool
from openai import OpenAI
import shutil


@tool
def chess_engine_locator() -> str | None:
    """
    Get the path to the chess engine binary. Can be used with chess.engine.SimpleEngine.popen_uci function from chess.engine Python module.
    Returns:
        str: Path to the chess engine.
    """
    path = shutil.which("stockfish")
    return path if path else None


class ImageToChessBoardFENTool(Tool):
    name = "image_to_chess_board_fen"
    description = """Convert a chessboard image to board part of the FEN."""
    inputs = {
        "image_url": {
            "type": "string",
            "description": "Public URL of the image (preferred) or base64 encoded image in data URL format.",
        }
    }
    output_type = "string"

    def __init__(self, client: OpenAI | None = None, **kwargs):
        self.client = client if client is not None else OpenAI()
        super().__init__(**kwargs)

    def attachment_for(self, task_id: str | None):
        self.task_id = task_id

    def forward(self, image_url: str) -> str:
        """
        Convert a chessboard image to board part of the FEN.
        Args:
            image_url (str): Public URL of the image (preferred) or base64 encoded image in data URL format.
        Returns:
            str: Board part of the FEN.
        """
        client = self.client

        response = client.responses.create(
            model="gpt-4.1",
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": "Describe the position of the pieces on the chessboard from the image. Please, nothing else but description.",
                        },
                        {"type": "input_image", "image_url": image_url},
                    ],
                }
            ],
        )

        response = client.responses.create(
            model="gpt-4.1",
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": "Describe the position of the pieces on the chessboard from the image. Please, nothing else but description.",
                        },
                    ],
                }
            ]
            + response.output
            + [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": """\
Write down all positions with known pieces.
Use a standard one-letter code to name pieces.

It is important to use the correct case for the piece code. Use upper case for white and lower case for black.
It is important to include information about all the mentioned positions.

Describe each position in a new line.
Follow format: <piece><position> (piece first, then position, no spaces)
Return nothing but lines with positions.
""",
                        },
                    ],
                }
            ],
        )
        board_pos = response.output_text

        pos_dict = {}
        for pos_str in board_pos.splitlines():
            pos_str = pos_str.strip()
            if len(pos_str) != 3:
                continue
            piece = pos_str[0]
            pos = pos_str[1:3]
            pos_dict[pos] = piece

        board_fen = ""
        for rank in range(8, 0, -1):
            empty = 0
            for file_c in range(ord("a"), ord("h") + 1):
                file = chr(file_c)
                square = file + str(rank)
                if square in pos_dict:
                    if empty > 0:
                        board_fen += str(empty)
                        empty = 0
                    board_fen += pos_dict[square]
                else:
                    empty += 1
            if empty > 0:
                board_fen += str(empty)
            if rank != 1:
                board_fen += "/"

        return board_fen
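The tool returns only the piece-placement field of the FEN. A sketch of how the chess_player agent is expected to combine it with chess_engine_locator and python-chess (the image URL is a placeholder, and the side to move, castling rights, and clocks are assumptions made up for the example):

import chess
import chess.engine

from tools.chess_tools import ImageToChessBoardFENTool, chess_engine_locator

board_fen = ImageToChessBoardFENTool().forward("https://example.com/board.png")  # placeholder URL

# Complete the FEN with assumed side to move / castling / move counters.
board = chess.Board(f"{board_fen} w - - 0 1")

engine_path = chess_engine_locator()
if engine_path:
    with chess.engine.SimpleEngine.popen_uci(engine_path) as engine:
        result = engine.play(board, chess.engine.Limit(time=1.0))
        print(board.san(result.move))  # best move in algebraic notation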
tools/classifier_tool.py
ADDED
@@ -0,0 +1,89 @@
from smolagents import Tool
from openai import OpenAI


class ClassifierTool(Tool):
    name = "open_classifier"
    description = """Classifies given items into given categories from perspective of specific knowledge area."""
    inputs = {
        "knowledge_area": {
            "type": "string",
            "description": "The knowledge area that should be used for classification.",
        },
        "environment": {  # context makes models too verbose
            "type": "string",
            "description": "Couple words that describe environment or location in which items should be classified in case of plural meaning or if only part of item relevant for classification.",
        },
        "categories": {
            "type": "string",
            "description": "Comma separated list of categories to distribute objects.",
        },
        "items": {
            "type": "string",
            "description": "Comma separated list of items to be classified. Please include adjectives if available.",
        },
    }
    output_type = "string"

    def __init__(
        self,
        client: OpenAI | None = None,
        model_id: str = "gpt-4.1-mini",
        **kwargs,
    ):
        self.client = client or OpenAI()
        self.model_id = model_id

        super().__init__(**kwargs)

    def forward(
        self, knowledge_area: str, environment: str, categories: str, items: str
    ) -> str:
        response = self.client.responses.create(
            model=self.model_id,
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": self._prompt(
                                knowledge_area=knowledge_area,
                                context=environment,
                                categories=categories,
                                items=items,
                            ),
                        },
                    ],
                }
            ],
        )
        answer = response.output_text
        return answer

    def _prompt(
        self, knowledge_area: str, context: str, categories: str, items: str
    ) -> str:
        return f"""\
You are {knowledge_area} classifier located in {context} context.
I will provide you a list of items and a list of categories and context in which items should be considered.

Your task is to classify the items into the categories.
Use context to determine the meaning of the items and decide if you need to classify entire item or only part of it.

Do not miss any item and do not add any item to the list of categories.
Use highest probability category for each item.
You can add category "Other" if you are not sure about the classification.

Use only considerations from the {knowledge_area} perspective.
Explain your reasoning from {knowledge_area} perspective in {context} context and then provide final answer.
Important: Do not allow {context} to influence your judgment for classification.

ITEMS: {items}
CATEGORIES: {categories}

Now provide your reasoning and finalize it with the classification in the following format:
Category 1: items list
Category 2: items list
Other (if needed): items list
"""
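Example call mirroring the tool's input schema (the values are made up; OPENAI_API_KEY is assumed to be set):

from tools.classifier_tool import ClassifierTool

classifier = ClassifierTool()
print(
    classifier.forward(
        knowledge_area="botany",
        environment="grocery list",
        categories="fruits, vegetables",
        items="sweet potatoes, fresh basil, plums, green beans",
    )
)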
tools/content_retriever_tool.py
ADDED
@@ -0,0 +1,89 @@
from smolagents import Tool
from docling.document_converter import DocumentConverter
from docling.chunking import HierarchicalChunker
from sentence_transformers import SentenceTransformer, util
import torch


class ContentRetrieverTool(Tool):
    name = "retrieve_content"
    description = """Retrieve the content of a webpage or document in markdown format. Supports PDF, DOCX, XLSX, HTML, images, and more."""
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL or local path of the webpage or document to retrieve.",
        },
        "query": {
            "type": "string",
            "description": "The subject on the page you are looking for. The shorter the more relevant content is returned.",
        },
    }
    output_type = "string"

    def __init__(
        self,
        model_name: str | None = None,
        threshold: float = 0.2,
        **kwargs,
    ):
        self.threshold = threshold
        self._document_converter = DocumentConverter()
        self._model = SentenceTransformer(
            model_name if model_name is not None else "all-MiniLM-L6-v2"
        )
        self._chunker = HierarchicalChunker()

        super().__init__(**kwargs)

    def forward(self, url: str, query: str) -> str:
        document = self._document_converter.convert(url).document

        chunks = list(self._chunker.chunk(dl_doc=document))
        if len(chunks) == 0:
            return "No content found."

        chunks_text = [chunk.text for chunk in chunks]
        chunks_with_context = [self._chunker.contextualize(chunk) for chunk in chunks]
        chunks_context = [
            chunks_with_context[i].replace(chunks_text[i], "").strip()
            for i in range(len(chunks))
        ]

        chunk_embeddings = self._model.encode(chunks_text, convert_to_tensor=True)
        context_embeddings = self._model.encode(chunks_context, convert_to_tensor=True)
        query_embedding = self._model.encode(
            [term.strip() for term in query.split(",") if term.strip()],
            convert_to_tensor=True,
        )

        selected_indices = []  # aggregate indexes across chunks and context matches and for all queries
        for embeddings in [
            context_embeddings,
            chunk_embeddings,
        ]:
            # Compute cosine similarities (returns 1D tensor)
            for cos_scores in util.pytorch_cos_sim(query_embedding, embeddings):
                # Convert to softmax probabilities
                probabilities = torch.nn.functional.softmax(cos_scores, dim=0)
                # Sort by probability descending
                sorted_indices = torch.argsort(probabilities, descending=True)
                # Accumulate until total probability reaches threshold

                cumulative = 0.0
                for i in sorted_indices:
                    cumulative += probabilities[i].item()
                    selected_indices.append(i.item())
                    if cumulative >= self.threshold:
                        break

        selected_indices = list(
            dict.fromkeys(selected_indices)
        )  # remove duplicates and preserve order
        selected_indices = selected_indices[
            ::-1
        ]  # make most relevant items last for better focus

        if len(selected_indices) == 0:
            return "No content found."

        return "\n\n".join([chunks_with_context[idx] for idx in selected_indices])
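Rough usage sketch; the URL and query are placeholders and the first run downloads the sentence-transformers model:

from tools.content_retriever_tool import ContentRetrieverTool

retriever = ContentRetrieverTool(threshold=0.2)
# Returns the contextualized chunks most similar to the comma-separated query terms.
print(
    retriever.forward(
        url="https://en.wikipedia.org/wiki/Mercedes_Sosa",
        query="studio albums",
    )
)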
tools/get_attachment_tool.py
ADDED
@@ -0,0 +1,77 @@
from smolagents import Tool
import requests
from urllib.parse import urljoin
import base64
import tempfile


class GetAttachmentTool(Tool):
    name = "get_attachment"
    description = """Retrieves attachment for current task in specified format."""
    inputs = {
        "fmt": {
            "type": "string",
            "description": "Format to retrieve attachment. Options are: URL (preferred), DATA_URL, LOCAL_FILE_PATH, TEXT. URL returns the URL of the file, DATA_URL returns a base64 encoded data URL, LOCAL_FILE_PATH returns a local file path to the downloaded file, and TEXT returns the content of the file as text.",
            "nullable": True,
            "default": "URL",
        }
    }
    output_type = "string"

    def __init__(
        self,
        agent_evaluation_api: str | None = None,
        task_id: str | None = None,
        **kwargs,
    ):
        self.agent_evaluation_api = (
            agent_evaluation_api
            if agent_evaluation_api is not None
            else "https://agents-course-unit4-scoring.hf.space/"
        )
        self.task_id = task_id
        super().__init__(**kwargs)

    def attachment_for(self, task_id: str | None):
        self.task_id = task_id

    def forward(self, fmt: str = "URL") -> str:
        fmt = fmt.upper()
        assert fmt in ["URL", "DATA_URL", "LOCAL_FILE_PATH", "TEXT"]

        if not self.task_id:
            return ""

        file_url = urljoin(self.agent_evaluation_api, f"files/{self.task_id}")
        if fmt == "URL":
            return file_url

        response = requests.get(
            file_url,
            headers={
                "Content-Type": "application/json",
                "Accept": "application/json",
            },
        )
        if 400 <= response.status_code < 500:
            return ""

        response.raise_for_status()
        mime = response.headers.get("content-type", "text/plain")
        if fmt == "TEXT":
            if mime.startswith("text/"):
                return response.text
            else:
                raise ValueError(
                    f"Content of file type {mime} cannot be retrieved as TEXT."
                )
        elif fmt == "DATA_URL":
            return f"data:{mime};base64,{base64.b64encode(response.content).decode('utf-8')}"
        elif fmt == "LOCAL_FILE_PATH":
            with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
                tmp_file.write(response.content)
                return tmp_file.name
        else:
            raise ValueError(
                f"Unsupported format: {fmt}. Supported formats are URL, DATA_URL, LOCAL_FILE_PATH, and TEXT."
            )
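Usage sketch; in the pipeline the task id is set once per question by read_question, and the id below is a placeholder:

from tools.get_attachment_tool import GetAttachmentTool

attachment = GetAttachmentTool()
attachment.attachment_for("some-task-id")  # placeholder; real ids come from the scoring API
print(attachment.forward(fmt="URL"))                      # public URL of the attachment
local_path = attachment.forward(fmt="LOCAL_FILE_PATH")    # downloads to a temp file
print(local_path)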
tools/google_search_tools.py
ADDED
@@ -0,0 +1,90 @@
from smolagents import Tool
from googleapiclient.discovery import build
import os


class GoogleSearchTool(Tool):
    name = "web_search"
    description = """Performs a google web search for query then returns top search results in markdown format."""
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform search.",
        },
    }
    output_type = "string"

    skip_forward_signature_validation = True

    def __init__(
        self,
        api_key: str | None = None,
        search_engine_id: str | None = None,
        num_results: int = 10,
        **kwargs,
    ):
        api_key = api_key if api_key is not None else os.getenv("GOOGLE_SEARCH_API_KEY")
        if not api_key:
            raise ValueError(
                "Please set the GOOGLE_SEARCH_API_KEY environment variable."
            )
        search_engine_id = (
            search_engine_id
            if search_engine_id is not None
            else os.getenv("GOOGLE_SEARCH_ENGINE_ID")
        )
        if not search_engine_id:
            raise ValueError(
                "Please set the GOOGLE_SEARCH_ENGINE_ID environment variable."
            )

        self.cse = build("customsearch", "v1", developerKey=api_key).cse()
        self.cx = search_engine_id
        self.num = num_results
        super().__init__(**kwargs)

    def _collect_params(self) -> dict:
        return {}

    def forward(self, query: str, *args, **kwargs) -> str:
        params = {
            "q": query,
            "cx": self.cx,
            "fields": "items(title,link,snippet)",
            "num": self.num,
        }

        params = params | self._collect_params(*args, **kwargs)

        response = self.cse.list(**params).execute()
        if "items" not in response:
            return "No results found."

        result = "\n\n".join(
            [
                f"[{item['title']}]({item['link']})\n{item['snippet']}"
                for item in response["items"]
            ]
        )
        return result


class GoogleSiteSearchTool(GoogleSearchTool):
    name = "site_search"
    description = """Performs a google search within the website for query then returns top search results in markdown format."""
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform search.",
        },
        "site": {
            "type": "string",
            "description": "The domain of the site on which to search.",
        },
    }

    def _collect_params(self, site: str) -> dict:
        return {
            "siteSearch": site,
            "siteSearchFilter": "i",
        }
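Both tools need GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_ENGINE_ID in the environment; a minimal sketch with made-up queries:

from tools.google_search_tools import GoogleSearchTool, GoogleSiteSearchTool

search = GoogleSearchTool(num_results=5)
print(search.forward("GAIA benchmark level 1 questions"))

# The site restriction is passed through _collect_params as a keyword argument.
site_search = GoogleSiteSearchTool(num_results=5)
print(site_search.forward("discography", site="en.wikipedia.org"))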
tools/speech_recognition_tool.py
ADDED
@@ -0,0 +1,113 @@
from smolagents import Tool
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, logging
import warnings


class SpeechRecognitionTool(Tool):
    name = "speech_to_text"
    description = """Transcribes speech from audio."""

    inputs = {
        "audio": {
            "type": "string",
            "description": "Path to the audio file to transcribe.",
        },
        "with_time_markers": {
            "type": "boolean",
            "description": "Whether to include timestamps in the transcription output. Each timestamp appears on its own line in the format [float, float], indicating the number of seconds elapsed from the start of the audio.",
            "nullable": True,
            "default": False,
        },
    }
    output_type = "string"

    chunk_length_s = 30

    def __new__(cls, *args, **kwargs):
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        model_id = "openai/whisper-large-v3-turbo"
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        model.to(device)
        processor = AutoProcessor.from_pretrained(model_id)

        logging.set_verbosity_error()
        warnings.filterwarnings(
            "ignore",
            category=FutureWarning,
            message=r".*The input name `inputs` is deprecated.*",
        )
        cls.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
            chunk_length_s=cls.chunk_length_s,
            return_timestamps=True,
        )

        return super().__new__(cls, *args, **kwargs)

    def forward(self, audio: str, with_time_markers: bool = False) -> str:
        """
        Transcribes speech from audio.

        Args:
            audio (str): Path to the audio file to transcribe.
            with_time_markers (bool): Whether to include timestamps in the transcription output. Each timestamp appears on its own line in the format [float], indicating the number of seconds elapsed from the start of the audio.

        Returns:
            str: The transcribed text.
        """
        result = self.pipe(audio)
        if not with_time_markers:
            return result["text"].strip()

        txt = ""
        for chunk in self._normalize_chunks(result["chunks"]):
            txt += f"[{chunk['start']:.2f}]\n{chunk['text']}\n[{chunk['end']:.2f}]\n"
        return txt.strip()

    def transcribe(self, audio, **kwargs):
        result = self.pipe(audio, **kwargs)
        return self._normalize_chunks(result["chunks"])

    def _normalize_chunks(self, chunks):
        chunk_length_s = self.chunk_length_s
        absolute_offset = 0.0
        chunk_offset = 0.0
        normalized = []

        for chunk in chunks:
            timestamp_start = chunk["timestamp"][0]
            timestamp_end = chunk["timestamp"][1]
            if timestamp_start < chunk_offset:
                absolute_offset += chunk_length_s
            chunk_offset = timestamp_start
            absolute_start = absolute_offset + timestamp_start

            if timestamp_end < timestamp_start:
                absolute_offset += chunk_length_s
            absolute_end = absolute_offset + timestamp_end
            chunk_offset = timestamp_end

            chunk_text = chunk["text"].strip()
            if chunk_text:
                normalized.append(
                    {
                        "start": absolute_start,
                        "end": absolute_end,
                        "text": chunk_text,
                    }
                )

        return normalized
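Quick sketch, assuming a local audio file (the path is a placeholder); the first instantiation downloads the Whisper checkpoint:

from tools.speech_recognition_tool import SpeechRecognitionTool

stt = SpeechRecognitionTool()
text = stt.forward("strawberry_pie.mp3")                             # plain transcript
timed = stt.forward("strawberry_pie.mp3", with_time_markers=True)    # with [start]/[end] markers
print(timed)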
tools/youtube_video_tool.py
ADDED
@@ -0,0 +1,383 @@
from smolagents import Tool
from openai import OpenAI
from .speech_recognition_tool import SpeechRecognitionTool
from io import BytesIO
import yt_dlp
import av
import torchaudio
import subprocess
import requests
import base64


class YoutubeVideoTool(Tool):
    name = "youtube_video"
    description = """Process the video and return the requested information from it."""
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL of the YouTube video.",
        },
        "query": {
            "type": "string",
            "description": "The question to answer.",
        },
    }
    output_type = "string"

    def __init__(
        self,
        video_quality: int = 360,
        frames_interval: int | float | None = 2,
        chunk_duration: int | float | None = 20,
        speech_recognition_tool: SpeechRecognitionTool | None = None,
        client: OpenAI | None = None,
        model_id: str = "gpt-4.1-mini",
        debug: bool = False,
        **kwargs,
    ):
        self.video_quality = video_quality
        self.speech_recognition_tool = speech_recognition_tool
        self.frames_interval = frames_interval
        self.chunk_duration = chunk_duration

        self.client = client or OpenAI()
        self.model_id = model_id

        self.debug = debug

        super().__init__(**kwargs)

    def forward(self, url: str, query: str):
        """
        Process the video and return the requested information.
        Args:
            url (str): The URL of the YouTube video.
            query (str): The question to answer.
        Returns:
            str: Answer to the query.
        """
        answer = ""
        for chunk in self._split_video_into_chunks(url):
            prompt = self._prompt(
                chunk,
                query,
                answer,
            )
            response = self.client.responses.create(
                model="gpt-4.1-mini",
                input=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "input_text",
                                "text": prompt,
                            },
                            *[
                                {
                                    "type": "input_image",
                                    "image_url": f"data:image/jpeg;base64,{frame}",
                                }
                                for frame in self._base64_frames(chunk["frames"])
                            ],
                        ],
                    }
                ],
            )
            answer = response.output_text
            if self.debug:
                print(
                    f"CHUNK {chunk['start']} - {chunk['end']}:\n\n{prompt}\n\nANSWER:\n{answer}"
                )

        if answer.strip() == "I need to keep watching":
            answer = ""
        return answer

    def _prompt(self, chunk, query, aggregated_answer):
        prompt = [
            f"""\
These are some frames of a video that I want to upload.
I will ask a question about the entire video, but I will only show the last part of it.
Aggregate the answer about the entire video; use information about previous parts but do not reference the previous parts in the answer directly.

Ground your answer based on the video title, description, captions, video frames or the answer from previous parts.
If no evidence is presented just say "I need to keep watching".

VIDEO TITLE:
{chunk["title"]}

VIDEO DESCRIPTION:
{chunk["description"]}

FRAMES SUBTITLES:
{chunk["captions"]}"""
        ]

        if aggregated_answer:
            prompt.append(f"""\
Here is the answer to the same question based on the previous video parts:

BASED ON PREVIOUS PARTS:
{aggregated_answer}""")

        prompt.append(f"""\

QUESTION:
{query}""")

        return "\n\n".join(prompt)

    def _split_video_into_chunks(
        self, url: str, with_captions: bool = True, with_frames: bool = True
    ):
        video = self._process_video(
            url, with_captions=with_captions, with_frames=with_frames
        )
        video_duration = video["duration"]
        chunk_duration = self.chunk_duration or video_duration

        chunk_start = 0.0
        while chunk_start < video_duration:
            chunk_end = min(chunk_start + chunk_duration, video_duration)
            chunk = self._get_video_chunk(video, chunk_start, chunk_end)
            yield chunk
            chunk_start += chunk_duration

    def _get_video_chunk(self, video, start, end):
        chunk_captions = [
            c for c in video["captions"] if c["start"] <= end and c["end"] >= start
        ]
        chunk_frames = [
            f
            for f in video["frames"]
            if f["timestamp"] >= start and f["timestamp"] <= end
        ]

        return {
            "title": video["title"],
            "description": video["description"],
            "start": start,
            "end": end,
            "captions": "\n".join([c["text"] for c in chunk_captions]),
            "frames": chunk_frames,
        }

    def _process_video(
        self, url: str, with_captions: bool = True, with_frames: bool = True
    ):
        lang = "en"
        info = self._get_video_info(url, lang)

        if with_captions:
            captions = self._extract_captions(
                lang, info.get("subtitles", {}), info.get("automatic_captions", {})
            )
            if not captions and self.speech_recognition_tool:
                audio_url = self._select_audio_format(info["formats"])
                audio = self._capture_audio(audio_url)
                waveform, sample_rate = torchaudio.load(audio)
                assert sample_rate == 16000
                waveform_np = waveform.squeeze().numpy()
                captions = self.speech_recognition_tool.transcribe(waveform_np)
        else:
            captions = []

        if with_frames:
            video_url = self._select_video_format(info["formats"], 360)["url"]
            frames = self._capture_video_frames(video_url, self.frames_interval)
        else:
            frames = []

        return {
            "id": info["id"],
            "title": info["title"],
            "description": info["description"],
            "duration": info["duration"],
            "captions": captions,
            "frames": frames,
        }

    def _get_video_info(self, url: str, lang: str):
        ydl_opts = {
            "quiet": True,
            "skip_download": True,
            "format": "bestvideo[ext=mp4][height<=360]+bestaudio[ext=m4a]/best[height<=360]",
            "forceurl": True,
            "noplaylist": True,
            "writesubtitles": True,
            "writeautomaticsub": True,
            "subtitlesformat": "vtt",
            "subtitleslangs": [lang],
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)

        return info

    def _extract_captions(self, lang, subtitles, auto_captions):
        caption_tracks = subtitles.get(lang) or auto_captions.get(lang) or []

        structured_captions = []

        srt_track = next(
            (track for track in caption_tracks if track["ext"] == "srt"), None
        )
        vtt_track = next(
            (track for track in caption_tracks if track["ext"] == "vtt"), None
        )

        if srt_track:
            import pysrt

            response = requests.get(srt_track["url"])
            response.raise_for_status()
            srt_data = response.content.decode("utf-8")

            def to_sec(t):
                return (
                    t.hours * 3600 + t.minutes * 60 + t.seconds + t.milliseconds / 1000
                )

            structured_captions = [
                {
                    "start": to_sec(sub.start),
                    "end": to_sec(sub.end),
                    "text": sub.text.strip(),
                }
                for sub in pysrt.from_str(srt_data)
            ]
        if vtt_track:
            import webvtt
            from io import StringIO

            response = requests.get(vtt_track["url"])
            response.raise_for_status()
            vtt_data = response.text

            vtt_file = StringIO(vtt_data)

            def to_sec(t):
                """Convert 'HH:MM:SS.mmm' to float seconds"""
                h, m, s = t.split(":")
                s, ms = s.split(".")
                return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000

            for caption in webvtt.read_buffer(vtt_file):
                structured_captions.append(
                    {
                        "start": to_sec(caption.start),
                        "end": to_sec(caption.end),
                        "text": caption.text.strip(),
                    }
                )
        return structured_captions

    def _select_video_format(self, formats, video_quality):
        video_format = next(
            f
            for f in formats
            if f.get("vcodec") != "none" and f.get("height") == video_quality
        )
        return video_format

    def _capture_video_frames(self, video_url, capture_interval_sec=None):
        ffmpeg_cmd = [
            "ffmpeg",
            "-i",
            video_url,
            "-f",
            "matroska",  # container format
            "-",
        ]

        process = subprocess.Popen(
            ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL
        )

        container = av.open(process.stdout)
        stream = container.streams.video[0]
        time_base = stream.time_base

        frames = []
        next_capture_time = 0
        for frame in container.decode(stream):
            if frame.pts is None:
                continue

            timestamp = float(frame.pts * time_base)
            if capture_interval_sec is None or timestamp >= next_capture_time:
                frames.append(
                    {
                        "timestamp": timestamp,
                        "image": frame.to_image(),  # PIL image
                    }
                )
                if capture_interval_sec is not None:
                    next_capture_time += capture_interval_sec

        process.terminate()
        return frames

    def _base64_frames(self, frames):
        base64_frames = []
        for f in frames:
            buffered = BytesIO()
            f["image"].save(buffered, format="JPEG")
            encoded = base64.b64encode(buffered.getvalue()).decode("utf-8")
            base64_frames.append(encoded)
        return base64_frames

    def _select_audio_format(self, formats):
        audio_formats = [
            f
            for f in formats
            if f.get("vcodec") == "none"
            and f.get("acodec")
            and f.get("acodec") != "none"
        ]

        if not audio_formats:
            raise ValueError("No valid audio-only formats found.")

        # Prefer m4a > webm, highest abr first
        preferred_exts = ["m4a", "webm"]

        def sort_key(f):
            ext_score = (
                preferred_exts.index(f["ext"]) if f["ext"] in preferred_exts else 99
            )
            abr = f.get("abr") or 0
            return (ext_score, -abr)

        audio_formats.sort(key=sort_key)
        return audio_formats[0]["url"]

    def _capture_audio(self, audio_url) -> BytesIO:
        audio_buffer = BytesIO()
        ffmpeg_audio_cmd = [
            "ffmpeg",
            "-i",
            audio_url,
            "-f",
            "wav",
            "-acodec",
            "pcm_s16le",  # Whisper prefers PCM
            "-ac",
            "1",  # Mono
            "-ar",
            "16000",  # 16kHz for Whisper
            "-",
        ]

        result = subprocess.run(
            ffmpeg_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        if result.returncode != 0:
            raise RuntimeError("ffmpeg failed:\n" + result.stderr.decode())

        audio_buffer = BytesIO(result.stdout)
        audio_buffer.seek(0)
        return audio_buffer
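Usage sketch (the video URL and question are illustrative); requires ffmpeg on PATH, yt-dlp access to the video, and, for videos without captions, a SpeechRecognitionTool instance:

from tools.speech_recognition_tool import SpeechRecognitionTool
from tools.youtube_video_tool import YoutubeVideoTool

yt = YoutubeVideoTool(
    speech_recognition_tool=SpeechRecognitionTool(),
    frames_interval=3,
    chunk_duration=60,
)
print(
    yt.forward(
        url="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        query="What species of bird appears in the video?",
    )
)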