import time
from PIL import Image
import numpy as np
import torch
from transformers import AutoProcessor, AutoModel
import onnxruntime as ort

# Set random seed
np.random.seed(42)

def load_model(model_path):
    """
    Load an ONNX model and measure loading time.

    Args:
        model_path (str): Path to the ONNX model file.

    Returns:
        ort.InferenceSession: Loaded ONNX model session.
    """
    start_time = time.time()
    session = ort.InferenceSession(model_path)
    elapsed_time = time.time() - start_time
    print(f"Model loading time: {elapsed_time:.2f} seconds")
    return session

def model_inference(sess, inputs):
    """
    Perform inference using the given ONNX session and measure inference time.

    Args:
        sess (ort.InferenceSession): ONNX model session.
        inputs (list or np.ndarray): Input data.

    Returns:
        np.ndarray: Inference results.
    """
    input_name = sess.get_inputs()[0].name
    start_time = time.time()
    outputs = [sess.run(None, {input_name: np.array([i])})[1] for i in inputs]
    outputs = np.array(outputs).reshape(-1, 1152)  # Adjust output shape if needed
    elapsed_time = time.time() - start_time
    print(f"Model inference time: {elapsed_time:.2f} seconds")
    return outputs


# Load processor and prepare inputs
processor = AutoProcessor.from_pretrained("./tokenizer")
image = Image.open("./000000039769.jpg")
texts = ["a photo of 2 cats", "a photo of 2 dogs"]
inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

# Load vision and text models
start_time = time.time()
vision_sess = load_model("./onnx/siglip-so400m-patch14-384_vision.onnx")
text_sess = load_model("./onnx/siglip-so400m-patch14-384_text.onnx")
print(f"Total model loading time: {time.time() - start_time:.2f} seconds")

# Run inference on both models
start_time = time.time()
vision_outputs = model_inference(vision_sess, inputs['pixel_values'].numpy())
text_outputs = model_inference(text_sess, inputs['input_ids'].numpy())
print(f"Total inference time: {time.time() - start_time:.2f} seconds")

# Post-processing
image_embeds = torch.tensor(vision_outputs)
text_embeds = torch.tensor(text_outputs)

# Normalize features
image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

# Compute similarity logits
logit_scale = np.random.randn(1)[0]  # Replace with trained value if available
logit_bias = np.random.randn(1)[0]

logits_per_text = (
    np.matmul(text_embeds, image_embeds.t().numpy()) * np.exp(logit_scale)
    + logit_bias
)
logits_per_image = logits_per_text.T

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

probs = sigmoid(logits_per_image)  # Get probabilities
print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")