Spaces:

fathindifa
/

food_indo

Sleeping

File size: 2,397 Bytes

import gradio as gr
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

def load_model():
    # Load model and processor
    model_id = "fathindifa/food-caption-blip2"
    processor = Blip2Processor.from_pretrained(model_id)
    model = Blip2ForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float32,
        device_map="auto"  # Will use GPU if available, otherwise CPU
    )
    model.eval()
    return processor, model

processor, model = load_model()

def generate_caption(image, prompt="", max_length=32, temperature=0.7):
    """Generate caption for the input image."""
    if image is None:
        return "Please upload an image."
    
    # Ensure image is in RGB
    image = Image.fromarray(image).convert("RGB")
    
    # Process image
    inputs = processor(images=image, text=prompt, return_tensors="pt")
    
    # Move inputs to same device as model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate caption
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            num_beams=5,
            temperature=temperature
        )
    
    # Decode and return caption
    caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return caption

# Create Gradio interface
iface = gr.Interface(
    fn=generate_caption,
    inputs=[
        gr.Image(label="Upload Food Image"),
        gr.Textbox(label="Optional Prompt", placeholder="Enter prompt to guide the caption (optional)", value=""),
        gr.Slider(minimum=10, maximum=50, value=32, step=1, label="Maximum Caption Length"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
    ],
    outputs=gr.Textbox(label="Generated Caption"),
    title="🍜 Food Image Captioning",
    description="Upload a food image and get an automatically generated caption! This demo uses a fine-tuned BLIP2 model specifically trained for food image captioning.",
    article="""
    ### Tips:
    - Try different prompts to guide the caption generation
    - Adjust temperature for more/less creative captions
    - Increase max length for longer descriptions
    """,
    flagging_mode="never",
    cache_examples=False
)

# Launch the interface
if __name__ == "__main__":
    iface.launch()