# Copyright 2025 Jina AI. All rights reserved.

from dataclasses import asdict, dataclass, is_dataclass
from enum import Enum
from typing import Any, Dict, Optional, Tuple, Type, Union

from transformers import PretrainedConfig


class StrEnum(str, Enum):
    def __str__(self) -> str:
        return self.value

    def __repr__(self) -> str:
        return f"'{str(self)}'"


class ActivationType(StrEnum):
    gelu = 'gelu'
    gelu_10 = 'gelu_10'
    gelu_fast = 'gelu_fast'
    gelu_new = 'gelu_new'
    gelu_python = 'gelu_python'
    gelu_pytorch_tanh = 'gelu_pytorch_tanh'
    gelu_accurate = 'gelu_accurate'
    laplace = 'laplace'
    leaky_relu = 'leaky_relu'
    linear = 'linear'
    mish = 'mish'
    quick_gelu = 'quick_gelu'
    relu = 'relu'
    relu2 = 'relu2'
    relu6 = 'relu6'
    sigmoid = 'sigmoid'
    silu = 'silu'
    swish = 'swish'
    tanh = 'tanh'
    prelu = 'prelu'
    xielu = 'xielu'


class ImagePaddingEmbedType(StrEnum):
    pad_and_partial_pad = 'pad_and_partial_pad'
    pad_embed = 'pad_embed'
    regress = 'regress'


class ImagePooling2DType(StrEnum):
    attention = 'attention'
    attention_meanq = 'attention_meanq'
    attention_2wide = 'attention_2wide'
    none = 'none'
    stack = 'stack'
    token_merger = 'token_merger'


class ImageProjectionType(StrEnum):
    mlp = 'mlp'
    mlpx2 = '2mlp'
    linear = 'linear'


class LayerNormType(StrEnum):
    default = 'default'
    low_precision = 'low_precision'
    rms = 'rms'


def _resolve_subconfig(obj: Union[None, Dict[str, Any], Any], cls: Type[Any]) -> Any:
    if isinstance(obj, dict):
        return cls(**obj)
    elif obj is None:
        return cls()
    return obj
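
# Illustrative sketch (not part of the original file): ``_resolve_subconfig``
# accepts three input forms. A dict is expanded into the target config class,
# ``None`` falls back to that class's defaults, and an existing instance is
# passed through unchanged, e.g.:
#
#   _resolve_subconfig({'type': 'rms', 'eps': 1e-6}, JinaLNormConfig)  # dict -> JinaLNormConfig
#   _resolve_subconfig(None, JinaLNormConfig)                          # None -> JinaLNormConfig()
#   _resolve_subconfig(JinaLNormConfig(bias=True), JinaLNormConfig)    # instance returned as-is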

@dataclass
class JinaLNormConfig:
    """Layer Norm configuration.

    Args:
        type (`LayerNormType`, *optional*, defaults to `LayerNormType.default`):
            The layernorm implementation to use.
        with_affine (`bool`, *optional*, defaults to `True`):
            Whether to include bias and weight parameters for the layer norms.
            This only affects layer norms that are immediately followed by a linear
            layer in the forward pass, so everything except QK-norms. To turn off
            affines for QK norms as well, set :attr:`attention_layer_norm_with_affine`
            to ``False``.
        eps (`float`, *optional*, defaults to `None`):
            Epsilon for layer norms.
        bias (`bool`, *optional*, defaults to `None`):
            Whether or not to include bias parameters in layer norm.
    """

    type: LayerNormType = LayerNormType.default
    with_affine: bool = True
    eps: Optional[float] = None
    bias: Optional[bool] = None

@dataclass
class JinaFFNConfig:
    """Feed Forward Network configuration.

    Args:
        activation_type (`ActivationType`, *optional*, defaults to `'silu'`):
            The activation function to use within the MLP layer.
        ratio (`int`, *optional*, defaults to 4):
            The ratio of the MLP dimensionality to ``model_dim``.
        size (`int`, *optional*, defaults to `None`):
            This is only used when ``ratio`` is not set. Sets the exact hidden size
            for the MLP. Otherwise the inner MLP hidden size is set to
            `ratio * model_dim`.
        bias (`bool`, *optional*, defaults to `False`):
            Add bias to the MLP layers.
        inner_lnorm (`bool`, *optional*, defaults to `False`):
            Add inner layer normalization to the FFN module.
        gated_activation (`bool`, *optional*, defaults to `True`):
            Use a gated activation in the MLP, i.e. SwiGLU rather than a standard
            activation. Combine with the silu activation type for SwiGLU.
        lnorm_config (`JinaLNormConfig`, *optional*, defaults to `JinaLNormConfig()`):
            The inner layernorm configuration.
    """

    activation_type: ActivationType = ActivationType.silu
    ratio: int = 4
    size: Optional[int] = None
    bias: bool = False
    inner_lnorm: bool = False
    gated_activation: bool = True
    lnorm_config: Union[None, Dict[str, Any], JinaLNormConfig] = None

    def __post_init__(self):
        self.lnorm_config = _resolve_subconfig(self.lnorm_config, JinaLNormConfig)
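
# Illustrative note (not part of the original file): with the defaults above, the
# inner MLP width is ``ratio * model_dim`` (e.g. a hypothetical ``model_dim=768``
# gives ``4 * 768 = 3072``); ``size`` takes effect only when ``ratio`` is not set.
# The default silu activation combined with ``gated_activation=True`` corresponds
# to SwiGLU:
#
#   swiglu_ffn = JinaFFNConfig(activation_type=ActivationType.silu, gated_activation=True)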

@dataclass
class JinaAttentionConfig:
    """Attention module configuration.

    Args:
        head_dim (`int`, *optional*, defaults to `None`):
            Head dimensionality.
        n_heads (`int`, *optional*, defaults to 12):
            The number of self-attention heads.
        n_kv_heads (`int`, *optional*, defaults to `None`):
            The number of heads to use for keys and values. Defaults to `n_heads`.
            Set this to ``None`` or ``n_heads`` for normal multi-head attention.
            Set this to 1 for multi-query attention.
            Set it to some in-between value for Llama2-style grouped query attention.
        softmax_scale (`float`, *optional*, defaults to `None`):
            Attention softmax scale. If set to `None`, the default inverse of head
            dimension is used.
        sliding_window (`int`, *optional*, defaults to -1):
            Attention sliding window size. If set to -1, no sliding window attention
            is used.
        fp32 (`bool`, *optional*, defaults to `False`):
            Compute attention in float32.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability within the attention modules.
        q_bias (`bool`, *optional*, defaults to `False`):
            Add bias to the query projection.
        k_bias (`bool`, *optional*, defaults to `False`):
            Add bias to the key projection.
        v_bias (`bool`, *optional*, defaults to `False`):
            Add bias to the value projection.
        o_bias (`bool`, *optional*, defaults to `False`):
            Add bias to the output projection.
        q_lnorm (`bool`, *optional*, defaults to `False`):
            Add layer normalization to the query projection.
        k_lnorm (`bool`, *optional*, defaults to `False`):
            Add layer normalization to the key projection.
        v_lnorm (`bool`, *optional*, defaults to `False`):
            Add layer normalization to the value projection.
        qkv_lnorm_on_heads (`bool`, *optional*, defaults to `False`):
            If enabled, Q, K and V layer norms are applied on the heads.
        o_lnorm (`bool`, *optional*, defaults to `False`):
            Add layer normalization to the output projection.
        inner_lnorm (`bool`, *optional*, defaults to `False`):
            Add inner layer normalization to the attention module.
        clip_qkv (`float`, *optional*, defaults to `None`):
            Clip QKV to this value when set.
        lnorm_config (`JinaLNormConfig`, *optional*, defaults to `JinaLNormConfig()`):
            The inner layernorm configuration.
    """

    head_dim: Optional[int] = None
    n_heads: int = 12
    n_kv_heads: Optional[int] = None
    softmax_scale: Optional[float] = None
    sliding_window: int = -1
    fp32: bool = False
    dropout: float = 0.0
    q_bias: bool = False
    k_bias: bool = False
    v_bias: bool = False
    o_bias: bool = False
    q_lnorm: bool = False
    k_lnorm: bool = False
    v_lnorm: bool = False
    qkv_lnorm_on_heads: bool = False
    o_lnorm: bool = False
    inner_lnorm: bool = False
    clip_qkv: Optional[float] = None
    lnorm_config: Union[None, Dict[str, Any], JinaLNormConfig] = None

    def __post_init__(self):
        self.lnorm_config = _resolve_subconfig(self.lnorm_config, JinaLNormConfig)
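
# Illustrative sketch (not part of the original file): ``n_kv_heads`` alone selects
# the attention variant, as described in the docstring above.
#
#   mha = JinaAttentionConfig(n_heads=12)                 # n_kv_heads=None -> multi-head attention
#   mqa = JinaAttentionConfig(n_heads=12, n_kv_heads=1)   # multi-query attention
#   gqa = JinaAttentionConfig(n_heads=12, n_kv_heads=4)   # grouped-query (3 query heads per KV head)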

@dataclass
class JinaTransformerBlockConfig:
    """Transformer model block configuration.

    Args:
        attn_config (`JinaAttentionConfig`, *optional*, defaults to
            `JinaAttentionConfig()`):
            The attention module configuration.
        attn_lscale_init (`float`, *optional*, defaults to `None`):
            Initial value of layer scale gamma in the attention module.
        ffn_config (`JinaFFNConfig`, *optional*, defaults to `JinaFFNConfig()`):
            The FFN module configuration.
        ffn_lscale_init (`float`, *optional*, defaults to `None`):
            Initial value of layer scale gamma in the FFN module.
        residual_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the MLP and attention output within each block.
        residual_response_dropout (`float`, *optional*, defaults to 0.0):
            Dropout applied only to loss/response tokens.
        residual_path_dropout (`float`, *optional*, defaults to 0.0):
            The drop-path (stochastic depth) probability for the MLP and attention
            branches within each block.
        postnorm (`bool`, *optional*, defaults to `False`):
            Apply norm after the attention/feedforward layers rather than before, as
            introduced in the Swin transformer paper (Liu et al.).
        lnorm_config (`JinaLNormConfig`, *optional*, defaults to `JinaLNormConfig()`):
            The inner layernorm configuration.
    """

    attn_config: Union[None, Dict[str, Any], JinaAttentionConfig] = None
    attn_lscale_init: Optional[float] = None
    ffn_config: Union[None, Dict[str, Any], JinaFFNConfig] = None
    ffn_lscale_init: Optional[float] = None
    residual_dropout: float = 0.0
    residual_response_dropout: float = 0.0
    residual_path_dropout: float = 0.0
    postnorm: bool = False
    lnorm_config: Union[None, Dict[str, Any], JinaLNormConfig] = None

    def __post_init__(self):
        self.attn_config = _resolve_subconfig(self.attn_config, JinaAttentionConfig)
        self.ffn_config = _resolve_subconfig(self.ffn_config, JinaFFNConfig)
        self.lnorm_config = _resolve_subconfig(self.lnorm_config, JinaLNormConfig)
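
# Illustrative sketch (not part of the original file): nested sub-configs can be
# passed as plain dicts and are resolved recursively in ``__post_init__``:
#
#   block = JinaTransformerBlockConfig(
#       attn_config={'n_heads': 16, 'n_kv_heads': 4},
#       ffn_config={'ratio': 4, 'gated_activation': True},
#   )
#   # block.attn_config is a JinaAttentionConfig, block.ffn_config a JinaFFNConfig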

@dataclass
class JinaVLConnectorConfig:
    """Vision Language Connector configuration.

    Args:
        padding_embed_type (`ImagePaddingEmbedType`, *optional*, defaults to `None`):
            The type of image padding embedding to use.
        pooling_type (`ImagePooling2DType`, *optional*, defaults to
            `ImagePooling2DType.attention`):
            The type of 2D pooling to use for image features.
        projector_type (`ImageProjectionType`, *optional*, defaults to
            `ImageProjectionType.mlp`):
            The type of image projector to use.
        attn_pooling_config (`JinaAttentionConfig`, *optional*, defaults to
            `JinaAttentionConfig()`):
            The attention pooling configuration.
        mlp_projector_config (`JinaFFNConfig`, *optional*, defaults to
            `JinaFFNConfig()`):
            The MLP projector configuration.
        pooling_h (`int`, *optional*, defaults to 2):
            The height of the pooling grid.
        pooling_w (`int`, *optional*, defaults to 2):
            The width of the pooling grid.
        spatial_merge_size (`int`, *optional*, defaults to 2):
            The spatial merge size for image features.
        projector_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the image projector.
        feature_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the image features.
    """

    padding_embed_type: Optional[ImagePaddingEmbedType] = None
    pooling_type: ImagePooling2DType = ImagePooling2DType.attention
    projector_type: ImageProjectionType = ImageProjectionType.mlp
    attn_pooling_config: Optional[JinaAttentionConfig] = None
    mlp_projector_config: Optional[JinaFFNConfig] = None
    pooling_h: int = 2
    pooling_w: int = 2
    spatial_merge_size: int = 2
    projector_dropout: float = 0.0
    feature_dropout: float = 0.0

    def __post_init__(self):
        self.attn_pooling_config = _resolve_subconfig(
            self.attn_pooling_config, JinaAttentionConfig
        )
        self.mlp_projector_config = _resolve_subconfig(
            self.mlp_projector_config, JinaFFNConfig
        )
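
# Illustrative sketch (assumptions, not part of the original file): like the block
# config, the connector's sub-configs may be supplied as dicts; with the default
# ``pooling_h=2`` and ``pooling_w=2`` each pooling step presumably merges a 2x2
# window of patch features into one image token before projection.
#
#   connector = JinaVLConnectorConfig(
#       pooling_type=ImagePooling2DType.attention_meanq,
#       attn_pooling_config={'n_heads': 16},
#   )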

class PretrainedConfigWithDataclasses(PretrainedConfig):
    """PretrainedConfig base class with dataclass support."""

    def to_dict(self) -> Dict[str, Any]:
        return {
            key: asdict(value) if is_dataclass(value) else value
            for key, value in super().to_dict().items()
        }
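
# Illustrative note (not part of the original file): ``to_dict`` turns
# dataclass-valued attributes (block configs, connector configs, ...) into plain
# nested dicts via ``dataclasses.asdict``, so the resulting config serializes like
# any other ``PretrainedConfig``:
#
#   cfg = JinaVLMVisionConfig()
#   d = cfg.to_dict()
#   # d['block_config'] is a plain dict, e.g. d['block_config']['attn_config']['n_heads'] == 12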

class JinaVLMVisionConfig(PretrainedConfigWithDataclasses):
    """JinaVLM Vision Model configuration.

    Args:
        block_config (`JinaTransformerBlockConfig`, *optional*, defaults to
            `JinaTransformerBlockConfig()`):
            The transformer block configuration to use within the ViT.
        vl_connector_config (`JinaVLConnectorConfig`, *optional*, defaults to
            `JinaVLConnectorConfig()`):
            The VLC (Vision Language Connector) configuration.
        n_layers (`int`, *optional*, defaults to 12):
            The number of layers/blocks.
        hidden_size (`int`, *optional*, defaults to 768):
            The hidden size of the model.
        input_size (`Tuple[int, int]`, *optional*, defaults to `None`):
            The input image size as (height, width). If not set, the model can
            process variable size images.
        patch_size (`int`, *optional*, defaults to 14):
            The patch size to use.
        n_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        linear_patch_embedding (`bool`, *optional*, defaults to `True`):
            Use a faster linear layer for patch embedding rather than a convolutional
            layer. Requires a fixed input size.
        patch_embedding_bias (`bool`, *optional*, defaults to `True`):
            Add bias to the patch embedding layer.
        patch_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability to use right after the patch embedding layer.
        pre_lnorm (`bool`, *optional*, defaults to `False`):
            Apply layer normalization to the features before feeding them into each
            transformer block.
        post_lnorm (`bool`, *optional*, defaults to `False`):
            Apply layer normalization to the features after passing them through each
            transformer block.
        use_absolute_positional_embeddings (`bool`, *optional*, defaults to `True`):
            Use absolute positional embeddings rather than RoPE.
        use_cls_token (`bool`, *optional*, defaults to `True`):
            Use a class token.
        positional_interpolation (`str`, *optional*, defaults to `'bicubic'`):
            The interpolation mode to use when interpolating the positional embeddings.
        vit_layers (`Tuple[int, ...]`, *optional*, defaults to `(-1,)`):
            The layers of the ViT to use for image features.
        output_size (`int`, *optional*, defaults to 768):
            The output size of the vision model, after the connector.
    """

    model_type = 'jvlm'
    base_config_key = 'vision_config'

    def __init__(
        self,
        block_config: Optional[JinaTransformerBlockConfig] = None,
        vl_connector_config: Optional[JinaVLConnectorConfig] = None,
        n_layers: int = 12,
        hidden_size: int = 768,
        input_size: Optional[Tuple[int, int]] = None,
        patch_size: int = 14,
        n_channels: int = 3,
        linear_patch_embedding: bool = True,
        patch_embedding_bias: bool = True,
        patch_dropout: float = 0.0,
        pre_lnorm: bool = False,
        post_lnorm: bool = False,
        use_absolute_positional_embeddings: bool = True,
        use_cls_token: bool = True,
        positional_interpolation: str = 'bicubic',
        vit_layers: Tuple[int, ...] = (-1,),
        output_size: int = 768,
        **kwargs: Any,
    ):
        self.block_config = _resolve_subconfig(block_config, JinaTransformerBlockConfig)
        self.vl_connector_config = _resolve_subconfig(
            vl_connector_config, JinaVLConnectorConfig
        )
        super().__init__(**kwargs)
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.patch_size = patch_size
        self.n_channels = n_channels
        self.linear_patch_embedding = linear_patch_embedding
        self.patch_embedding_bias = patch_embedding_bias
        self.patch_dropout = patch_dropout
        self.pre_lnorm = pre_lnorm
        self.post_lnorm = post_lnorm
        self.use_absolute_positional_embeddings = use_absolute_positional_embeddings
        self.use_cls_token = use_cls_token
        self.positional_interpolation = positional_interpolation
        self.vit_layers = vit_layers
        self.output_size = output_size
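
# Illustrative note (hypothetical numbers, not part of the original file): with
# ``patch_size=14``, a 336x336 input would be split into (336 / 14) ** 2 = 576
# patches, plus one class token when ``use_cls_token=True``. Leaving
# ``input_size=None`` keeps the model flexible to variable image sizes, although,
# per the ``linear_patch_embedding`` docstring, the linear patch-embedding path
# requires a fixed input size.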

class JinaVLMTextConfig(PretrainedConfigWithDataclasses):
    """JinaVLM Text Model configuration.

    Args:
        block_config (`JinaTransformerBlockConfig`, *optional*, defaults to
            `JinaTransformerBlockConfig()`):
            Decoder LM transformer block configuration.
        n_layers (`int`, *optional*, defaults to 12):
            The number of layers/blocks.
        hidden_size (`int`, *optional*, defaults to 768):
            The hidden size of the model.
        embedding_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for embeddings.
        max_sequence_length (`int`, *optional*, defaults to 1024):
            The maximum input sequence length supported by the model.
        max_position_embeddings (`int`, *optional*, defaults to `None`):
            The maximum number of positional embeddings to use in the RoPE cache.
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the model.
        embedding_size (`int`, *optional*, defaults to 50304):
            The number of embeddings, i.e. the number of tokens. If set to ``None`` it
            will default to ``vocab_size``. If ``vocab_size`` is not a multiple of 128,
            setting this to the next multiple of 128 that's greater than ``vocab_size``
            can improve throughput substantially.
        additional_vocab_size (`int`, *optional*, defaults to `None`):
            New tokens to add to the embeddings as part of the vision/language
            connector.
        normalize_input_embeds (`bool`, *optional*, defaults to `False`):
            Normalize input embeddings (both for text and images) before feeding them
            into the model.
        rope (`bool`, *optional*, defaults to `False`):
            Use rotary positional embeddings (RoPE).
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        partial_rotary_factor (`float`, *optional*, defaults to 1.0):
            The fraction of hidden dimensions to apply RoPE to. For example, a value of
            0.5 will apply RoPE to half of the hidden dimensions, leaving the other half
            unmodified.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings.
            NOTE: if you apply a new rope type and you expect the model to work on
            longer `max_position_embeddings`, we recommend updating this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear',
                    'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the
                    original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to
                    apply to the RoPE embeddings. In most scaling types, a `factor` of
                    x will enable the model to handle sequences of length x * the
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original
                    max position embeddings used during pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied
                    on the attention computation. If unspecified, it defaults to the
                    value recommended by the implementation, using the `factor` field
                    to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for
                    extrapolation (only) in the linear ramp function. If unspecified,
                    it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for
                    interpolation (only) in the linear ramp function. If unspecified,
                    it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to
                    short contexts (<`original_max_position_embeddings`). Must be a
                    list of numbers with the same length as the hidden size divided by
                    the number of attention heads divided by 2.
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to
                    long contexts (>`original_max_position_embeddings`). Must be a list
                    of numbers with the same length as the hidden size divided by the
                    number of attention heads divided by 2.
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency
                    components of the RoPE.
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency
                    components of the RoPE.
    """

    model_type = 'jvlm'
    base_config_key = 'text_config'

    def __init__(
        self,
        block_config: Optional[JinaTransformerBlockConfig] = None,
        n_layers: int = 12,
        hidden_size: int = 768,
        embedding_dropout: float = 0.0,
        max_sequence_length: int = 1024,
        max_position_embeddings: Optional[int] = None,
        vocab_size: int = 50257,
        embedding_size: Optional[int] = 50304,
        additional_vocab_size: Optional[int] = None,
        normalize_input_embeds: bool = False,
        rope: bool = False,
        partial_rotary_factor: float = 1.0,
        rope_theta: float = 1000000.0,
        rope_scaling: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ):
        self.block_config = _resolve_subconfig(block_config, JinaTransformerBlockConfig)
        super().__init__(**kwargs)
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding_dropout = embedding_dropout
        self.max_sequence_length = max_sequence_length
        self.max_position_embeddings = max_position_embeddings
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.additional_vocab_size = additional_vocab_size
        self.normalize_input_embeds = normalize_input_embeds
        self.rope = rope
        self.partial_rotary_factor = partial_rotary_factor
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
    # Needed for vLLM
    @property
    def num_attention_heads(self) -> int:
        return self.block_config.attn_config.n_heads
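
# Worked note (not part of the original file): the default ``embedding_size=50304``
# is the default ``vocab_size=50257`` rounded up to the next multiple of 128
# (393 * 128 = 50304), i.e. exactly the throughput-friendly padding that the
# ``embedding_size`` docstring describes.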

class JinaVLMConfig(PretrainedConfig):
    """JinaVLM configuration.

    Args:
        text_config (`JinaVLMTextConfig`, *optional*, defaults to
            `JinaVLMTextConfig()`):
            The text model configuration.
        vision_config (`JinaVLMVisionConfig`, *optional*, defaults to
            `JinaVLMVisionConfig()`):
            The vision model configuration.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie the input and output word embeddings.
    """

    model_type = 'jvlm'
    sub_configs = {
        'vision_config': JinaVLMVisionConfig,
        'text_config': JinaVLMTextConfig,
    }

    def __init__(
        self,
        text_config: Optional[JinaVLMTextConfig] = None,
        vision_config: Optional[JinaVLMVisionConfig] = None,
        tie_word_embeddings: bool = False,
        **kwargs,
    ):
        self.text_config = _resolve_subconfig(
            text_config, self.sub_configs['text_config']
        )
        self.vision_config = _resolve_subconfig(
            vision_config, self.sub_configs['vision_config']
        )
        if self.text_config.hidden_size != self.vision_config.output_size:
            raise ValueError(
                f'Text hidden size ({self.text_config.hidden_size}) and '
                f'vision output size ({self.vision_config.output_size}) must be '
                'the same for JinaVL'
            )
        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)


JinaVLMConfig.register_for_auto_class()
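
# Illustrative sketch (hypothetical sizes, not part of the original file): the text
# and vision configs can be passed as instances or as plain dicts; the constructor
# requires the text hidden size to match the vision connector's output size and
# raises a ValueError otherwise.
#
#   config = JinaVLMConfig(
#       text_config={'hidden_size': 1024, 'n_layers': 24},
#       vision_config={'hidden_size': 1152, 'output_size': 1024},
#   )
#   # JinaVLMConfig(text_config={'hidden_size': 768}, vision_config={'output_size': 1024})
#   # would raise: Text hidden size (768) and vision output size (1024) must be the same ...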