import gradio as gr
import torch
from PIL import Image
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

# --- Configuration ---
MODEL_PATH = "zhangbaoxin/qwen3-vl-2b-package_unsloth_finetune"

# --- Model and Processor Loading ---
print("Loading model and processor... This will take a few minutes on a CPU.")
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    dtype="auto",      # Let transformers pick the dtype stored in the checkpoint
    device_map="auto"  # Places the model on CPU here, or on a GPU when one is available
)
print("Model and processor loaded successfully.")

# --- Inference Function ---
def process_and_generate(image_input, text_prompt):
    """
    Processes the image and text prompt, and generates a response from the model.
    """
    if image_input is None or not text_prompt.strip():
        return "Please provide both an image and a text prompt."

    # Convert Gradio's numpy array to a PIL Image; force RGB so RGBA or
    # grayscale uploads don't trip up the image processor
    pil_image = Image.fromarray(image_input).convert("RGB")

    # Prepare the messages payload for the model
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    print("Processing inputs and generating response... This will be slow.")
    try:
        # Preparation for inference
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        )
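        # apply_chat_template returns a dict-like batch (input_ids, attention_mask,
        # and the processed image tensors) that can be passed straight to generate()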
        inputs = inputs.to(model.device)

        # Inference: Generation of the output
        generated_ids = model.generate(**inputs, max_new_tokens=1024)
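        # Greedy decoding by default; an illustrative (untuned) alternative would
        # be sampling, e.g. model.generate(**inputs, max_new_tokens=1024,
        # do_sample=True, temperature=0.7)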

        # To get only the new tokens, we trim the input IDs from the generated IDs
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        
        # Decode the trimmed IDs to text
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        
        # batch_decode returns a list; return the first (and only) element
        return output_text[0]
        
    except Exception as e:
        return f"An error occurred during generation: {str(e)}"

# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen3-VL-2B-Instruct CPU Demo
        This Space runs the `zhangbaoxin/qwen3-vl-2b-package_unsloth_finetune` fine-tune of `Qwen/Qwen3-VL-2B-Instruct` using the standard `transformers` library.
        **Warning:** Running this on a free CPU Space is **very slow**. Duplicate this Space to run your own private copy.
        """
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            # The default prompt is kept in Chinese (presumably what the fine-tune
            # expects). English translation: "Question: Does this image contain a
            # package? Options: A. There is a package. B. No package (carpet, fire
            # hydrant, etc.). Please select the correct answer from the options above."
            text_prompt = gr.Textbox(label="Prompt", value="Question:\n此图片有包裹吗?\nOptions:\nA. 有包裹.\nB. 没有包裹(地毯,消防栓等).\nPlease select the correct answer from the options above.")
            submit_button = gr.Button("Generate Response")
        with gr.Column():
            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)

    submit_button.click(
        fn=process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text
    )



if __name__ == "__main__":
    demo.launch()
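    # Optional: for long CPU generations, Gradio's request queue helps keep
    # concurrent requests from timing out; an alternative launch would be:
    #   demo.queue().launch()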