---
library_name: transformers
pipeline_tag: mask-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- facebook/sam3
---

This tiny model is intended for debugging. It is randomly initialized, using a configuration adapted from [facebook/sam3](https://huggingface.co/facebook/sam3).

### Example usage:

```python
import requests
import torch
from PIL import Image

from transformers import Sam3Model, Sam3Processor

model_id = "tiny-random/sam3"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Sam3Model.from_pretrained(model_id).to(device)
processor = Sam3Processor.from_pretrained(model_id)

kitchen_url = "http://images.cocodataset.org/val2017/000000136466.jpg"
kitchen_image = Image.open(requests.get(kitchen_url, stream=True).raw).convert("RGB")

# Segment "handle" but exclude the oven handle using a negative box
text = "handle"
# Negative box covering the oven handle area (xyxy): [40, 183, 318, 204]
oven_handle_box = [40, 183, 318, 204]
input_boxes = [[oven_handle_box]]

inputs = processor(
    images=kitchen_image,
    text=text,
    input_boxes=input_boxes,
    input_boxes_labels=[[0]],  # 0 = negative (exclude this region)
    return_tensors="pt",
).to(device)

with torch.no_grad():
    outputs = model(**inputs)

# Post-process results
results = processor.post_process_instance_segmentation(
    outputs,
    threshold=0.5,
    mask_threshold=0.5,
    target_sizes=inputs.get("original_sizes").tolist(),
)[0]
print(results)
# This will segment pot handles but exclude the oven handle
```
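
SAM3 can also be prompted with a concept phrase alone. The minimal sketch below (an addition, not part of the original card) reuses the `model`, `processor`, `kitchen_image`, and `device` defined above, and assumes the processor accepts text without any geometric prompts; since this checkpoint is randomly initialized, the outputs are meaningless and only exercise the code path.

```python
# Text-only concept prompting: no boxes or points, just a phrase.
inputs = processor(images=kitchen_image, text="handle", return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)
results = processor.post_process_instance_segmentation(
    outputs,
    threshold=0.5,
    mask_threshold=0.5,
    target_sizes=inputs.get("original_sizes").tolist(),
)[0]
print(results)
```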

### Code to create this repo:

```python
import json

import safetensors.torch
import torch
from huggingface_hub import hf_hub_download
from transformers import Sam3Config, Sam3Model, Sam3Processor, set_seed

source_model_id = "facebook/sam3"
save_folder = "/tmp/tiny-random/sam3"

processor = Sam3Processor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)

# Shrink every sub-config of the source model to tiny dimensions.
with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)

HIDDEN_SIZE = 16
INTERMEDIATE_SIZE = 32
NUM_ATTENTION_HEADS = 2

config_json['detector_config']['detr_decoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['detector_config']['detr_encoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['detector_config']['geometry_encoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['detector_config']['mask_decoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['detector_config']['text_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
    'projection_dim': HIDDEN_SIZE,
    'num_hidden_layers': 2,
})
config_json['detector_config']['vision_config']['backbone_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
    'fpn_hidden_size': HIDDEN_SIZE,
    'global_attn_indexes': [1, 3, 5, 7],
    'num_hidden_layers': 8,
})
config_json['detector_config']['vision_config'].update({
    'fpn_hidden_size': HIDDEN_SIZE,
})
config_json['tracker_config']['mask_decoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'iou_head_hidden_dim': HIDDEN_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['tracker_config'].update({
    'mask_downsampler_embed_dim': HIDDEN_SIZE,
    'memory_attention_feed_forward_hidden_size': HIDDEN_SIZE,
    'memory_attention_hidden_size': HIDDEN_SIZE,
    'memory_encoder_hidden_size': HIDDEN_SIZE,
    'memory_fuser_embed_dim': HIDDEN_SIZE,
    'memory_fuser_intermediate_dim': INTERMEDIATE_SIZE,
})
config_json['tracker_config']['prompt_encoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['tracker_config']['vision_config']['backbone_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
    'global_attn_indexes': [1, 3, 5, 7],
    'num_hidden_layers': 8,
})
config_json['tracker_config']['vision_config'].update({
    'fpn_hidden_size': HIDDEN_SIZE,
})

with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)

config = Sam3Config.from_pretrained(save_folder, trust_remote_code=True)
print(config)

# Build the tiny model and re-initialize all weights with a fixed seed.
torch.set_default_dtype(torch.float32)
model = Sam3Model(config)
set_seed(42)
model = model.cpu()
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.1)
        print(name, p.shape)
model.save_pretrained(save_folder)

# Workaround: model.save_pretrained has a bug here, so re-save the weights directly.
safetensors.torch.save_file(
    tensors=model.state_dict(),
    filename=f"{save_folder}/model.safetensors",
)
```
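
As a quick sanity check (an addition, not part of the original creation script), the saved folder can be reloaded to confirm the tiny checkpoint round-trips; `save_folder` and `model` are the variables defined above.

```python
# Reload the tiny checkpoint and verify the weights match the in-memory model.
reloaded = Sam3Model.from_pretrained(save_folder)
orig_state = model.state_dict()
for name, tensor in reloaded.state_dict().items():
    assert torch.equal(tensor, orig_state[name]), f"mismatch in {name}"
print("round-trip OK:", sum(p.numel() for p in reloaded.parameters()), "parameters")
```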

### Printing the model:

```text
Sam3Model(
  (vision_encoder): Sam3VisionModel(
    (backbone): Sam3ViTModel(
      (embeddings): Sam3ViTEmbeddings(
        (patch_embeddings): Sam3ViTPatchEmbeddings(
          (projection): Conv2d(3, 16, kernel_size=(14, 14), stride=(14, 14), bias=False)
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (layer_norm): LayerNorm((16,), eps=1e-06, elementwise_affine=True)
      (layers): ModuleList(
        (0-7): 8 x Sam3ViTLayer(
          (layer_norm1): LayerNorm((16,), eps=1e-06, elementwise_affine=True)
          (rotary_emb): Sam3ViTRotaryEmbedding()
          (attention): Sam3ViTRoPEAttention(
            (q_proj): Linear(in_features=16, out_features=16, bias=True)
            (k_proj): Linear(in_features=16, out_features=16, bias=True)
            (v_proj): Linear(in_features=16, out_features=16, bias=True)
            (o_proj): Linear(in_features=16, out_features=16, bias=True)
          )
          (layer_norm2): LayerNorm((16,), eps=1e-06, elementwise_affine=True)
          (mlp): Sam3MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=16, out_features=32, bias=True)
            (fc2): Linear(in_features=32, out_features=16, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (neck): Sam3VisionNeck(
      (position_encoding): Sam3SinePositionEmbedding()
      (fpn_layers): ModuleList(
        (0): Sam3FPNLayer(
          (scale_layers): ModuleList(
            (0): ConvTranspose2d(16, 8, kernel_size=(2, 2), stride=(2, 2))
            (1): GELU(approximate='none')
            (2): ConvTranspose2d(8, 4, kernel_size=(2, 2), stride=(2, 2))
          )
          (proj1): Conv2d(4, 16, kernel_size=(1, 1), stride=(1, 1))
          (proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
        (1): Sam3FPNLayer(
          (scale_layers): ModuleList(
            (0): ConvTranspose2d(16, 8, kernel_size=(2, 2), stride=(2, 2))
          )
          (proj1): Conv2d(8, 16, kernel_size=(1, 1), stride=(1, 1))
          (proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
        (2): Sam3FPNLayer(
          (scale_layers): ModuleList()
          (proj1): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
          (proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
        (3): Sam3FPNLayer(
          (scale_layers): ModuleList(
            (0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
          )
          (proj1): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
          (proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
      )
    )
  )
  (text_encoder): CLIPTextModelWithProjection(
    (text_model): CLIPTextTransformer(
      (embeddings): CLIPTextEmbeddings(
        (token_embedding): Embedding(49408, 16)
        (position_embedding): Embedding(32, 16)
      )
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x CLIPEncoderLayer(
            (self_attn): CLIPAttention(
              (k_proj): Linear(in_features=16, out_features=16, bias=True)
              (v_proj): Linear(in_features=16, out_features=16, bias=True)
              (q_proj): Linear(in_features=16, out_features=16, bias=True)
              (out_proj): Linear(in_features=16, out_features=16, bias=True)
            )
            (layer_norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): GELUActivation()
              (fc1): Linear(in_features=16, out_features=32, bias=True)
              (fc2): Linear(in_features=32, out_features=16, bias=True)
            )
            (layer_norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (final_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    )
    (text_projection): Linear(in_features=16, out_features=16, bias=False)
  )
  (text_projection): Linear(in_features=16, out_features=16, bias=True)
  (geometry_encoder): Sam3GeometryEncoder(
    (position_encoding): Sam3SinePositionEmbedding()
    (label_embed): Embedding(2, 16)
    (cls_embed): Embedding(1, 16)
    (boxes_direct_project): Linear(in_features=4, out_features=16, bias=True)
    (boxes_pool_project): Conv2d(16, 16, kernel_size=(7, 7), stride=(1, 1))
    (boxes_pos_enc_project): Linear(in_features=18, out_features=16, bias=True)
    (vision_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (final_proj): Linear(in_features=16, out_features=16, bias=True)
    (prompt_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (layers): ModuleList(
      (0-2): 3 x Sam3GeometryEncoderLayer(
        (layer_norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (self_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (cross_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (layer_norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (mlp): Sam3MLP(
          (activation_fn): ReLU()
          (fc1): Linear(in_features=16, out_features=32, bias=True)
          (fc2): Linear(in_features=32, out_features=16, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (layer_norm3): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      )
    )
    (output_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
  )
  (detr_encoder): Sam3DetrEncoder(
    (layers): ModuleList(
      (0-5): 6 x Sam3DetrEncoderLayer(
        (layer_norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (self_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (cross_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (layer_norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (mlp): Sam3MLP(
          (activation_fn): ReLU()
          (fc1): Linear(in_features=16, out_features=32, bias=True)
          (fc2): Linear(in_features=32, out_features=16, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (layer_norm3): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (detr_decoder): Sam3DetrDecoder(
    (layers): ModuleList(
      (0-5): 6 x Sam3DetrDecoderLayer(
        (self_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (self_attn_dropout): Dropout(p=0.1, inplace=False)
        (self_attn_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (text_cross_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (text_cross_attn_dropout): Dropout(p=0.1, inplace=False)
        (text_cross_attn_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (vision_cross_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (vision_cross_attn_dropout): Dropout(p=0.1, inplace=False)
        (vision_cross_attn_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (mlp): Sam3MLP(
          (activation_fn): ReLU()
          (fc1): Linear(in_features=16, out_features=32, bias=True)
          (fc2): Linear(in_features=32, out_features=16, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (mlp_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (output_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (box_head): Sam3DecoderMLP(
      (layer1): Linear(in_features=16, out_features=16, bias=True)
      (layer2): Linear(in_features=16, out_features=16, bias=True)
      (layer3): Linear(in_features=16, out_features=4, bias=True)
    )
    (query_embed): Embedding(200, 16)
    (reference_points): Embedding(200, 4)
    (presence_token): Embedding(1, 16)
    (presence_head): Sam3DecoderMLP(
      (layer1): Linear(in_features=16, out_features=16, bias=True)
      (layer2): Linear(in_features=16, out_features=16, bias=True)
      (layer3): Linear(in_features=16, out_features=1, bias=True)
    )
    (presence_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (ref_point_head): Sam3DecoderMLP(
      (layer1): Linear(in_features=32, out_features=16, bias=True)
      (layer2): Linear(in_features=16, out_features=16, bias=True)
    )
    (box_rpb_embed_x): Sam3DecoderMLP(
      (layer1): Linear(in_features=2, out_features=16, bias=True)
      (layer2): Linear(in_features=16, out_features=2, bias=True)
    )
    (box_rpb_embed_y): Sam3DecoderMLP(
      (layer1): Linear(in_features=2, out_features=16, bias=True)
      (layer2): Linear(in_features=16, out_features=2, bias=True)
    )
    (position_encoding): Sam3SinePositionEmbedding()
  )
  (mask_decoder): Sam3MaskDecoder(
    (pixel_decoder): Sam3PixelDecoder(
      (conv_layers): ModuleList(
        (0-2): 3 x Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (norms): ModuleList(
        (0-2): 3 x GroupNorm(8, 16, eps=1e-05, affine=True)
      )
    )
    (mask_embedder): Sam3MaskEmbedder(
      (layers): ModuleList(
        (0-2): 3 x Linear(in_features=16, out_features=16, bias=True)
      )
      (activation): ReLU()
    )
    (instance_projection): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
    (semantic_projection): Conv2d(16, 1, kernel_size=(1, 1), stride=(1, 1))
    (prompt_cross_attn): Sam3Attention(
      (q_proj): Linear(in_features=16, out_features=16, bias=True)
      (k_proj): Linear(in_features=16, out_features=16, bias=True)
      (v_proj): Linear(in_features=16, out_features=16, bias=True)
      (o_proj): Linear(in_features=16, out_features=16, bias=True)
    )
    (prompt_cross_attn_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (prompt_cross_attn_dropout): Dropout(p=0.0, inplace=False)
  )
  (dot_product_scoring): Sam3DotProductScoring(
    (text_mlp): Sam3DecoderMLP(
      (layer1): Linear(in_features=16, out_features=32, bias=True)
      (layer2): Linear(in_features=32, out_features=16, bias=True)
    )
    (text_mlp_dropout): Dropout(p=0.1, inplace=False)
    (text_mlp_out_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (text_proj): Linear(in_features=16, out_features=16, bias=True)
    (query_proj): Linear(in_features=16, out_features=16, bias=True)
  )
)
```