{
  "architectures": [
    "JinaVLMForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "configuration_jvlm.JinaVLMConfig",
    "AutoModel": "modeling_jvlm.JinaVLM",
    "AutoModelForCausalLM": "modeling_jvlm.JinaVLMForConditionalGeneration"
  },
  "bos_token_id": 151643,
  "dtype": "float32",
  "eos_token_id": 151643,
  "model_type": "jvlm",
  "pad_token_id": 151643,
  "text_config": {
    "additional_vocab_size": 128,
    "block_config": {
      "attn_config": {
        "clip_qkv": null,
        "dropout": 0.0,
        "fp32": false,
        "head_dim": 128,
        "inner_lnorm": false,
        "k_bias": false,
        "k_lnorm": true,
        "lnorm_config": {
          "bias": false,
          "eps": 1e-06,
          "type": "rms",
          "with_affine": true
        },
        "n_heads": 16,
        "n_kv_heads": 8,
        "o_bias": false,
        "o_lnorm": false,
        "q_bias": false,
        "q_lnorm": true,
        "qkv_lnorm_on_heads": true,
        "sliding_window": -1,
        "softmax_scale": null,
        "v_bias": false,
        "v_lnorm": false
      },
      "attn_lscale_init": null,
      "ffn_config": {
        "activation_type": "silu",
        "bias": false,
        "gated_activation": true,
        "inner_lnorm": false,
        "lnorm_config": {
          "bias": false,
          "eps": 1e-06,
          "type": "rms",
          "with_affine": true
        },
        "ratio": 4,
        "size": 6144
      },
      "ffn_lscale_init": null,
      "lnorm_config": {
        "bias": false,
        "eps": 1e-06,
        "type": "rms",
        "with_affine": true
      },
      "postnorm": false,
      "residual_dropout": 0.0,
      "residual_path_dropout": 0.0,
      "residual_response_dropout": 0.1
    },
    "embedding_dropout": 0.0,
    "embedding_size": 151936,
    "hidden_size": 2048,
    "max_position_embeddings": null,
    "max_sequence_length": 40960,
    "model_type": "jvlm",
    "n_layers": 28,
    "normalize_input_embeds": false,
    "num_hidden_layers": 28,
    "partial_rotary_factor": 1.0,
    "rope": true,
    "rope_partial_factor": 1.0,
    "rope_scaling": null,
    "rope_theta": 1000000,
    "tie_word_embeddings": false,
    "vocab_size": 151936
  },
  "tie_word_embeddings": false,
  "transformers_version": "4.57.1",
  "vision_config": {
    "block_config": {
      "attn_config": {
        "clip_qkv": null,
        "dropout": 0.0,
        "fp32": false,
        "head_dim": 72,
        "inner_lnorm": false,
        "k_bias": true,
        "k_lnorm": false,
        "lnorm_config": {
          "bias": true,
          "eps": 1e-06,
          "type": "default",
          "with_affine": true
        },
        "n_heads": 16,
        "n_kv_heads": null,
        "o_bias": true,
        "o_lnorm": false,
        "q_bias": true,
        "q_lnorm": false,
        "qkv_lnorm_on_heads": false,
        "sliding_window": -1,
        "softmax_scale": null,
        "v_bias": true,
        "v_lnorm": false
      },
      "attn_lscale_init": null,
      "ffn_config": {
        "activation_type": "gelu_pytorch_tanh",
        "bias": true,
        "gated_activation": false,
        "inner_lnorm": false,
        "lnorm_config": {
          "bias": true,
          "eps": 1e-06,
          "type": "default",
          "with_affine": true
        },
        "ratio": 4,
        "size": 4304
      },
      "ffn_lscale_init": null,
      "lnorm_config": {
        "bias": true,
        "eps": 1e-06,
        "type": "default",
        "with_affine": true
      },
      "postnorm": false,
      "residual_dropout": 0.0,
      "residual_path_dropout": 0.0,
      "residual_response_dropout": 0.0
    },
    "hidden_size": 1152,
    "input_size": [
      378,
      378
    ],
    "linear_patch_embedding": true,
    "model_type": "jvlm",
    "n_channels": 3,
    "n_layers": 27,
    "output_size": 2048,
    "patch_dropout": 0.0,
    "patch_embedding_bias": true,
    "patch_size": 14,
    "positional_interpolation": "bicubic",
    "post_lnorm": true,
    "pre_lnorm": false,
    "use_absolute_positional_embeddings": true,
    "use_cls_token": false,
    "vit_layers": [
      -4,
      -10
    ],
    "vl_connector_config": {
      "attn_pooling_config": {
        "clip_qkv": null,
        "dropout": 0.0,
        "fp32": false,
        "head_dim": 72,
        "inner_lnorm": false,
        "k_bias": true,
        "k_lnorm": false,
        "lnorm_config": {
          "bias": true,
          "eps": 1e-06,
          "type": "default",
          "with_affine": true
        },
        "n_heads": 16,
        "n_kv_heads": null,
        "o_bias": true,
        "o_lnorm": false,
        "q_bias": true,
        "q_lnorm": false,
        "qkv_lnorm_on_heads": false,
        "sliding_window": -1,
        "softmax_scale": null,
        "v_bias": true,
        "v_lnorm": false
      },
      "feature_dropout": 0.0,
      "mlp_projector_config": {
        "activation_type": "silu",
        "bias": false,
        "gated_activation": true,
        "inner_lnorm": false,
        "lnorm_config": {
          "bias": false,
          "eps": 1e-06,
          "type": "rms",
          "with_affine": true
        },
        "ratio": 4,
        "size": 6144
      },
      "padding_embed_type": "pad_and_partial_pad",
      "pooling_h": 2,
      "pooling_type": "attention_meanq",
      "pooling_w": 2,
      "projector_dropout": 0.0,
      "projector_type": "mlp",
      "spatial_merge_size": 2
    }
  }
}