import gradio as gr
from PIL import Image
import torch
from torchvision import transforms
from transformers import BlipForConditionalGeneration, AutoTokenizer

# Load model and tokenizer
model_name = "hackergeek/radiology-image-captioning"
model = BlipForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Manual preprocessing
preprocess = transforms.Compose([
    transforms.Resize((384, 384)),  # BLIP models usually expect 384x384
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

def generate_caption(image):
    """
    Generate radiology caption for a PIL image.
    """
    if not isinstance(image, Image.Image):
        image = Image.open(image).convert("RGB")
    else:
        image = image.convert("RGB")
    
    pixel_values = preprocess(image).unsqueeze(0)  # add batch dimension
    with torch.no_grad():
        outputs = model.generate(pixel_values=pixel_values)
    caption = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return caption

# Gradio Interface
title = "Radiology Image Captioning"
description = (
    "Upload a radiology image (X-ray, CT, MRI) and get an automatic caption "
    "generated by the `hackergeek/radiology-image-captioning` model."
)

iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil", label="Upload Radiology Image"),
    outputs=gr.Textbox(label="Generated Caption"),
    title=title,
    description=description,
    examples=[
        ["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/medical_xray.png"]
    ]
)

if __name__ == "__main__":
    iface.launch()