qqc1989 committed
Commit f752476 · verified · Parent: 436956c

Upload 11 files
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *.axmodel filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+000000039769.jpg filter=lfs diff=lfs merge=lfs -text
000000039769.jpg ADDED

Git LFS Details

  • SHA256: dea9e7ef97386345f7cff32f9055da4982da5471c48d575146c796ab4563b04e
  • Pointer size: 131 Bytes
  • Size of remote file: 173 kB
config.json ADDED
File without changes
python/inference_axmodel.py ADDED
@@ -0,0 +1,87 @@
+import time
+from PIL import Image
+import numpy as np
+import torch
+from transformers import AutoProcessor
+import axengine as ort
+
+# Set random seed (only affects the placeholder logit_scale/logit_bias below)
+np.random.seed(42)
+
+def load_model(model_path):
+    """
+    Load a compiled .axmodel and measure loading time.
+
+    Args:
+        model_path (str): Path to the .axmodel file.
+
+    Returns:
+        ort.InferenceSession: Loaded axengine session.
+    """
+    start_time = time.time()
+    session = ort.InferenceSession(model_path)
+    elapsed_time = time.time() - start_time
+    print(f"Model loading time: {elapsed_time:.2f} seconds")
+    return session
+
+def model_inference(sess, inputs):
+    """
+    Perform inference with the given session and measure inference time.
+
+    Args:
+        sess (ort.InferenceSession): axengine model session.
+        inputs (np.ndarray): Batched input data; each item is run separately.
+
+    Returns:
+        np.ndarray: Embeddings of shape (batch, 1152).
+    """
+    input_name = sess.get_inputs()[0].name
+    start_time = time.time()
+    # Run one sample at a time (the compiled model expects batch size 1)
+    # and keep output index 1, the pooled embedding.
+    outputs = [sess.run(None, {input_name: np.array([i])})[1] for i in inputs]
+    outputs = np.array(outputs).reshape(-1, 1152)  # 1152 = SigLIP so400m hidden size
+    elapsed_time = time.time() - start_time
+    print(f"Model inference time: {elapsed_time:.2f} seconds")
+    return outputs
+
+
+# Load processor and prepare inputs
+processor = AutoProcessor.from_pretrained("./tokenizer")
+image = Image.open("./000000039769.jpg")
+texts = ["a photo of 2 cats", "a photo of 2 dogs"]
+inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
+
+# Load vision and text models
+start_time = time.time()
+vision_sess = load_model("./ax650/siglip_vision_u16_fcu8.axmodel")
+text_sess = load_model("./ax650/siglip_text_u16.axmodel")
+print(f"Total model loading time: {time.time() - start_time:.2f} seconds")
+
+# Run inference on both models
+start_time = time.time()
+vision_outputs = model_inference(vision_sess, inputs['pixel_values'].numpy())
+text_outputs = model_inference(text_sess, inputs['input_ids'].numpy().astype(np.int32))
+print(f"Total inference time: {time.time() - start_time:.2f} seconds")
+
+# Post-processing
+image_embeds = torch.tensor(vision_outputs)
+text_embeds = torch.tensor(text_outputs)
+
+# Normalize features
+image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+# Compute similarity logits
+logit_scale = np.random.randn(1)[0]  # Random placeholder; replace with the trained value
+logit_bias = np.random.randn(1)[0]   # Random placeholder; replace with the trained value
+
+logits_per_text = (
+    np.matmul(text_embeds, image_embeds.t().numpy()) * np.exp(logit_scale)
+    + logit_bias
+)
+logits_per_image = logits_per_text.T
+
+def sigmoid(x):
+    return 1 / (1 + np.exp(-x))
+
+probs = sigmoid(logits_per_image)  # Get probabilities
+print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
python/inference_onnx.py ADDED
@@ -0,0 +1,87 @@
+import time
+from PIL import Image
+import numpy as np
+import torch
+from transformers import AutoProcessor
+import onnxruntime as ort
+
+# Set random seed (only affects the placeholder logit_scale/logit_bias below)
+np.random.seed(42)
+
+def load_model(model_path):
+    """
+    Load an ONNX model and measure loading time.
+
+    Args:
+        model_path (str): Path to the ONNX model file.
+
+    Returns:
+        ort.InferenceSession: Loaded ONNX model session.
+    """
+    start_time = time.time()
+    session = ort.InferenceSession(model_path)
+    elapsed_time = time.time() - start_time
+    print(f"Model loading time: {elapsed_time:.2f} seconds")
+    return session
+
+def model_inference(sess, inputs):
+    """
+    Perform inference using the given ONNX session and measure inference time.
+
+    Args:
+        sess (ort.InferenceSession): ONNX model session.
+        inputs (np.ndarray): Batched input data; each item is run separately.
+
+    Returns:
+        np.ndarray: Embeddings of shape (batch, 1152).
+    """
+    input_name = sess.get_inputs()[0].name
+    start_time = time.time()
+    # Run one sample at a time (the exported model expects batch size 1)
+    # and keep output index 1, the pooled embedding.
+    outputs = [sess.run(None, {input_name: np.array([i])})[1] for i in inputs]
+    outputs = np.array(outputs).reshape(-1, 1152)  # 1152 = SigLIP so400m hidden size
+    elapsed_time = time.time() - start_time
+    print(f"Model inference time: {elapsed_time:.2f} seconds")
+    return outputs
+
+
+# Load processor and prepare inputs
+processor = AutoProcessor.from_pretrained("./tokenizer")
+image = Image.open("./000000039769.jpg")
+texts = ["a photo of 2 cats", "a photo of 2 dogs"]
+inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
+
+# Load vision and text models
+start_time = time.time()
+vision_sess = load_model("./onnx/siglip-so400m-patch14-384_vision.onnx")
+text_sess = load_model("./onnx/siglip-so400m-patch14-384_text.onnx")
+print(f"Total model loading time: {time.time() - start_time:.2f} seconds")
+
+# Run inference on both models
+start_time = time.time()
+vision_outputs = model_inference(vision_sess, inputs['pixel_values'].numpy())
+text_outputs = model_inference(text_sess, inputs['input_ids'].numpy())
+print(f"Total inference time: {time.time() - start_time:.2f} seconds")
+
+# Post-processing
+image_embeds = torch.tensor(vision_outputs)
+text_embeds = torch.tensor(text_outputs)
+
+# Normalize features
+image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+# Compute similarity logits
+logit_scale = np.random.randn(1)[0]  # Random placeholder; replace with the trained value
+logit_bias = np.random.randn(1)[0]   # Random placeholder; replace with the trained value
+
+logits_per_text = (
+    np.matmul(text_embeds, image_embeds.t().numpy()) * np.exp(logit_scale)
+    + logit_bias
+)
+logits_per_image = logits_per_text.T
+
+def sigmoid(x):
+    return 1 / (1 + np.exp(-x))
+
+probs = sigmoid(logits_per_image)  # Get probabilities
+print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
python/requirements.txt ADDED
@@ -0,0 +1,5 @@
+Pillow
+requests
+transformers
+sentencepiece
+torch
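Note that the scripts above also import numpy, onnxruntime (inference_onnx.py), and axengine (inference_axmodel.py), which are not listed here; numpy is pulled in transitively by torch/transformers, but onnxruntime and the Axera axengine runtime presumably need to be installed separately on the target device.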
tokenizer/config.json ADDED
@@ -0,0 +1,25 @@
+{
+    "architectures": [
+        "SiglipModel"
+    ],
+    "initializer_factor": 1.0,
+    "model_type": "siglip",
+    "text_config": {
+        "hidden_size": 1152,
+        "intermediate_size": 4304,
+        "model_type": "siglip_text_model",
+        "num_attention_heads": 16,
+        "num_hidden_layers": 27
+    },
+    "torch_dtype": "float32",
+    "transformers_version": "4.37.0.dev0",
+    "vision_config": {
+        "hidden_size": 1152,
+        "image_size": 384,
+        "intermediate_size": 4304,
+        "model_type": "siglip_vision_model",
+        "num_attention_heads": 16,
+        "num_hidden_layers": 27,
+        "patch_size": 14
+    }
+}
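The shared hidden_size of 1152 is the embedding width both inference scripts hard-code in their reshape(-1, 1152) post-processing step; if a differently sized SigLIP variant were substituted, that reshape would have to change to match.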
tokenizer/preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
+{
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+        0.5,
+        0.5,
+        0.5
+    ],
+    "image_processor_type": "SiglipImageProcessor",
+    "image_std": [
+        0.5,
+        0.5,
+        0.5
+    ],
+    "processor_class": "SiglipProcessor",
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "size": {
+        "height": 384,
+        "width": 384
+    }
+}
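These settings amount to bicubic-resizing to 384×384 (resample 3 is PIL's bicubic), scaling by 1/255 (0.00392156862745098), then normalizing with mean and std 0.5 per channel. A minimal numpy sketch of the equivalent transform; the real SiglipImageProcessor also handles channel conversion and batching details:

# Manual equivalent of the preprocessor settings above (a sketch).
import numpy as np
from PIL import Image

img = Image.open("./000000039769.jpg").convert("RGB")
img = img.resize((384, 384), resample=Image.BICUBIC)          # "resample": 3
x = np.asarray(img).astype(np.float32) * 0.00392156862745098  # rescale to [0, 1]
x = (x - 0.5) / 0.5                                           # normalize, mean = std = 0.5
x = x.transpose(2, 0, 1)[None]                                # HWC -> NCHW, add batch dim
print(x.shape)  # (1, 3, 384, 384)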
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+{
+    "eos_token": {
+        "content": "</s>",
+        "lstrip": true,
+        "normalized": false,
+        "rstrip": true,
+        "single_word": false
+    },
+    "pad_token": {
+        "content": "</s>",
+        "lstrip": true,
+        "normalized": false,
+        "rstrip": true,
+        "single_word": false
+    },
+    "unk_token": {
+        "content": "<unk>",
+        "lstrip": true,
+        "normalized": false,
+        "rstrip": true,
+        "single_word": false
+    }
+}
tokenizer/spiece.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e5036bed065526c3c212dfbe288752391797c4bb1a284aa18c9a0b23fcaf8ec
+size 798330
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
+{
+    "added_tokens_decoder": {
+        "1": {
+            "content": "</s>",
+            "lstrip": true,
+            "normalized": false,
+            "rstrip": true,
+            "single_word": false,
+            "special": true
+        },
+        "2": {
+            "content": "<unk>",
+            "lstrip": true,
+            "normalized": false,
+            "rstrip": true,
+            "single_word": false,
+            "special": true
+        }
+    },
+    "additional_special_tokens": [],
+    "clean_up_tokenization_spaces": true,
+    "do_lower_case": true,
+    "eos_token": "</s>",
+    "model_input_names": [
+        "input_ids"
+    ],
+    "model_max_length": 64,
+    "pad_token": "</s>",
+    "processor_class": "SiglipProcessor",
+    "sp_model_kwargs": {},
+    "tokenizer_class": "SiglipTokenizer",
+    "unk_token": "<unk>"
+}
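With model_max_length of 64 and model_input_names limited to input_ids, the padding="max_length" calls in the inference scripts always produce fixed-shape (batch, 64) token ids, padded with the </s> pad token, which is what the fixed-shape text model expects. A quick sketch of a shape check, assuming this tokenizer directory:

# Verify the fixed text-input shape implied by the tokenizer config above.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("./tokenizer")
enc = processor(text=["a photo of 2 cats", "a photo of 2 dogs"],
                padding="max_length", return_tensors="np")
print(enc["input_ids"].shape)  # (2, 64): padded up to model_max_length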