|
|
|
|
|
|
|
|
from dataclasses import asdict, dataclass, is_dataclass |
|
|
from enum import Enum |
|
|
from typing import Any, Dict, Optional, Tuple, Type, Union |
|
|
|
|
|
from transformers import PretrainedConfig |
|
|
|
|
|
|
|
|
class StrEnum(str, Enum): |
|
|
def __str__(self) -> str: |
|
|
return self.value |
|
|
|
|
|
def __repr__(self) -> str: |
|
|
return f"'{str(self)}'" |
|
|
|
|
|
|
|
|
class ActivationType(StrEnum): |
|
|
gelu = 'gelu' |
|
|
gelu_10 = 'gelu_10' |
|
|
gelu_fast = 'gelu_fast' |
|
|
gelu_new = 'gelu_new' |
|
|
gelu_python = 'gelu_python' |
|
|
gelu_pytorch_tanh = 'gelu_pytorch_tanh' |
|
|
gelu_accurate = 'gelu_accurate' |
|
|
laplace = 'laplace' |
|
|
leaky_relu = 'leaky_relu' |
|
|
linear = 'linear' |
|
|
mish = 'mish' |
|
|
quick_gelu = 'quick_gelu' |
|
|
relu = 'relu' |
|
|
relu2 = 'relu2' |
|
|
relu6 = 'relu6' |
|
|
sigmoid = 'sigmoid' |
|
|
silu = 'silu' |
|
|
swish = 'swish' |
|
|
tanh = 'tanh' |
|
|
prelu = 'prelu' |
|
|
xielu = 'xielu' |
|
|
|
|
|
|
|
|
class ImagePaddingEmbedType(StrEnum): |
|
|
pad_and_partial_pad = 'pad_and_partial_pad' |
|
|
pad_embed = 'pad_embed' |
|
|
regress = 'regress' |
|
|
|
|
|
|
|
|
class ImagePooling2DType(StrEnum): |
|
|
attention = 'attention' |
|
|
attention_meanq = 'attention_meanq' |
|
|
attention_2wide = 'attention_2wide' |
|
|
none = 'none' |
|
|
stack = 'stack' |
|
|
token_merger = 'token_merger' |
|
|
|
|
|
|
|
|
class ImageProjectionType(StrEnum): |
|
|
mlp = 'mlp' |
|
|
mlpx2 = '2mlp' |
|
|
linear = 'linear' |
|
|
|
|
|
|
|
|
class LayerNormType(StrEnum): |
|
|
default = 'default' |
|
|
low_precision = 'low_precision' |
|
|
rms = 'rms' |
|
|
|
|
|
|
|
|
def _resolve_subconfig(obj: Union[None, Dict[str, Any], Any], cls: Type[Any]) -> Any: |
|
|
if isinstance(obj, dict): |
|
|
return cls(**obj) |
|
|
elif obj is None: |
|
|
return cls() |
|
|
return obj |
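
# Illustrative resolution behaviour (a sketch, not executed here): a dict of
# kwargs is expanded into a fresh config, ``None`` falls back to the defaults,
# and an existing instance is passed through unchanged, e.g.
#
#   _resolve_subconfig({'eps': 1e-6}, JinaLNormConfig)  -> JinaLNormConfig(eps=1e-6)
#   _resolve_subconfig(None, JinaLNormConfig)           -> JinaLNormConfig()
#   _resolve_subconfig(cfg, JinaLNormConfig)            -> cfg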
|
|
|
|
|
|
|
|
@dataclass |
|
|
class JinaLNormConfig: |
|
|
"""Layer Norm configuration. |
|
|
|
|
|
Args: |
|
|
type (`LayerNormType`, *optional*, defaults to `LayerNormType.default`): |
|
|
The layernorm implementation to use. |
|
|
with_affine (`bool`, *optional*, defaults to `True`): |
|
|
Whether to include bias and weight parameters for the layer norms. |
|
|
This only affects layer norms that are immediately followed by a linear |
|
|
layer in the forward pass, so everything except QK-norms. To turn off |
|
|
affines for QK norms as well, set :attr:`attention_layer_norm_with_affine` |
|
|
to ``False``. |
|
|
eps (`float`, *optional*, defaults to `None`): |
|
|
Epsilon for layer norms. |
|
|
bias (`bool`, *optional*, defaults to `None`): |
|
|
Whether or not to include bias parameters in layer norm. |
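
Example (an illustrative sketch; the values are arbitrary and only use the
fields defined in this dataclass):

```python
norm_cfg = JinaLNormConfig(type=LayerNormType.rms, eps=1e-6)
```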
|
|
""" |
|
|
|
|
|
type: LayerNormType = LayerNormType.default |
|
|
with_affine: bool = True |
|
|
eps: Optional[float] = None |
|
|
bias: Optional[bool] = None |
|
|
|
|
|
|
|
|
@dataclass |
|
|
class JinaFFNConfig: |
|
|
"""Feed Forward Network configuration. |
|
|
|
|
|
Args: |
|
|
activation_type (`ActivationType`, *optional*, defaults to `'silu'`): |
|
|
The activation function to use within the MLP layer. |
|
|
ratio (`int`, *optional*, defaults to 4): |
|
|
The ratio of the MLP dimensionality to ``model_dim``. |
|
|
size (`int`, *optional*, defaults to `None`): |
|
|
Only used when ``ratio`` is not set; sets the exact hidden size for the MLP.


Otherwise the inner MLP hidden size is `ratio * model_dim`.
|
|
bias (`bool`, *optional*, defaults to `False`): |
|
|
Add bias to the MLP layers. |
|
|
inner_lnorm (`bool`, *optional*, defaults to `False`): |
|
|
Add inner layer normalization to the FFN module. |
|
|
gated_activation (`bool`, *optional*, defaults to `True`): |
|
|
Use a gated activation in the MLP, i.e. a GLU variant rather than a


standard activation. Combine with the ``silu`` activation type to obtain SwiGLU.
|
|
lnorm_config (`JinaLNormConfig`, *optional*, defaults to `JinaLNormConfig()`): |
|
|
The inner layernorm configuration. |
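
Example (an illustrative sketch; the values are arbitrary, and sub-configs may
be given either as dataclass instances or as plain dicts, which
``__post_init__`` resolves):

```python
# SwiGLU-style FFN: gated activation combined with silu; the hidden size is
# ratio * model_dim unless ``size`` is set explicitly.
ffn_cfg = JinaFFNConfig(
    activation_type=ActivationType.silu,
    ratio=4,
    gated_activation=True,
    lnorm_config={'type': LayerNormType.rms, 'eps': 1e-6},
)
```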
|
|
""" |
|
|
|
|
|
activation_type: ActivationType = ActivationType.silu |
|
|
ratio: int = 4 |
|
|
size: Optional[int] = None |
|
|
bias: bool = False |
|
|
inner_lnorm: bool = False |
|
|
gated_activation: bool = True |
|
|
lnorm_config: Union[None, Dict[str, Any], JinaLNormConfig] = None |
|
|
|
|
|
def __post_init__(self): |
|
|
self.lnorm_config = _resolve_subconfig(self.lnorm_config, JinaLNormConfig) |
|
|
|
|
|
|
|
|
@dataclass |
|
|
class JinaAttentionConfig: |
|
|
"""Attention module configuration. |
|
|
|
|
|
Args: |
|
|
head_dim (`int`, *optional*, defaults to `None`): |
|
|
Head dimensionality. |
|
|
n_heads (`int`, *optional*, defaults to 12): |
|
|
The number of self-attention heads. |
|
|
n_kv_heads (`int`, *optional*, defaults to `None`): |
|
|
The number of heads to use for keys and values. Defaults to `n_heads`. |
|
|
Set this to ``None`` or ``n_heads`` for normal multi-head attention. |
|
|
Set this to 1 for multi-query attention. |
|
|
Set it to some in-between value for Llama2-style grouped query attention. |
|
|
softmax_scale (`float`, *optional*, defaults to `None`): |
|
|
Attention softmax scale. If set to `None`, the default inverse of head |
|
|
dimension is used. |
|
|
sliding_window (`int`, *optional*, defaults to -1): |
|
|
Attention sliding window size. If set to -1, no sliding window attention |
|
|
is used. |
|
|
fp32 (`bool`, *optional*, defaults to `False`): |
|
|
Compute attention in float32. |
|
|
dropout (`float`, *optional*, defaults to 0.0): |
|
|
The dropout probability within the attention modules. |
|
|
q_bias (`bool`, *optional*, defaults to `False`): |
|
|
Add bias to the query projection. |
|
|
k_bias (`bool`, *optional*, defaults to `False`): |
|
|
Add bias to the key projection. |
|
|
v_bias (`bool`, *optional*, defaults to `False`): |
|
|
Add bias to the value projection. |
|
|
o_bias (`bool`, *optional*, defaults to `False`): |
|
|
Add bias to the output projection. |
|
|
q_lnorm (`bool`, *optional*, defaults to `False`): |
|
|
Add layer normalization to the query projection. |
|
|
k_lnorm (`bool`, *optional*, defaults to `False`): |
|
|
Add layer normalization to the key projection. |
|
|
v_lnorm (`bool`, *optional*, defaults to `False`): |
|
|
Add layer normalization to the value projection. |
|
|
qkv_lnorm_on_heads (`bool`, *optional*, defaults to `False`): |
|
|
If enabled, the Q, K and V layer norms are applied on the individual heads.
|
|
o_lnorm (`bool`, *optional*, defaults to `False`): |
|
|
Add layer normalization to the output projection. |
|
|
inner_lnorm (`bool`, *optional*, defaults to `False`): |
|
|
Add inner layer normalization to the attention module. |
|
|
clip_qkv (`float`, *optional*, defaults to `None`): |
|
|
Clip QKV to this value when set. |
|
|
lnorm_config (`JinaLNormConfig`, *optional*, defaults to `JinaLNormConfig()`): |
|
|
The inner layernorm configuration. |
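
Example (an illustrative sketch; the head counts are arbitrary):

```python
# Grouped-query attention: 16 query heads sharing 4 key/value heads, with
# per-head Q/K layer norms. Setting n_kv_heads=1 would give multi-query attention.
attn_cfg = JinaAttentionConfig(
    n_heads=16,
    n_kv_heads=4,
    q_lnorm=True,
    k_lnorm=True,
    qkv_lnorm_on_heads=True,
)
```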
|
|
""" |
|
|
|
|
|
head_dim: Optional[int] = None |
|
|
n_heads: int = 12 |
|
|
n_kv_heads: Optional[int] = None |
|
|
softmax_scale: Optional[float] = None |
|
|
sliding_window: int = -1 |
|
|
fp32: bool = False |
|
|
dropout: float = 0.0 |
|
|
q_bias: bool = False |
|
|
k_bias: bool = False |
|
|
v_bias: bool = False |
|
|
o_bias: bool = False |
|
|
q_lnorm: bool = False |
|
|
k_lnorm: bool = False |
|
|
v_lnorm: bool = False |
|
|
qkv_lnorm_on_heads: bool = False |
|
|
o_lnorm: bool = False |
|
|
inner_lnorm: bool = False |
|
|
clip_qkv: Optional[float] = None |
|
|
lnorm_config: Union[None, Dict[str, Any], JinaLNormConfig] = None |
|
|
|
|
|
def __post_init__(self): |
|
|
self.lnorm_config = _resolve_subconfig(self.lnorm_config, JinaLNormConfig) |
|
|
|
|
|
|
|
|
@dataclass |
|
|
class JinaTransformerBlockConfig: |
|
|
"""Transformer model block configuration. |
|
|
|
|
|
Args: |
|
|
attn_config (`JinaAttentionConfig`, *optional*, defaults to |
|
|
`JinaAttentionConfig()`): |
|
|
The attention module configuration. |
|
|
attn_lscale_init (`float`, *optional*, defaults to `None`): |
|
|
Initial value of layer scale gamma in the attention module. |
|
|
ffn_config (`JinaFFNConfig`, *optional*, defaults to `JinaFFNConfig()`): |
|
|
The FFN module configuration.
|
|
ffn_lscale_init (`float`, *optional*, defaults to `None`): |
|
|
Initial value of layer scale gamma in the FFN module. |
|
|
residual_dropout (`float`, *optional*, defaults to 0.0): |
|
|
The dropout probability for the MLP and attention output within each block. |
|
|
residual_response_dropout (`float`, *optional*, defaults to 0.0): |
|
|
Dropout applied only to loss/response tokens. |
|
|
residual_path_dropout (`float`, *optional*, defaults to 0.0): |
|
|
The stochastic-depth (drop path) probability for the attention and MLP branches within each block.
|
|
postnorm (`bool`, *optional*, defaults to `False`): |
|
|
Apply norm after the attention/feedforward layers rather than before, as |
|
|
introduced in the Swin Transformer paper (Liu et al.).
|
|
lnorm_config (`JinaLNormConfig`, *optional*, defaults to `JinaLNormConfig()`): |
|
|
The inner layernorm configuration. |
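
Example (an illustrative sketch; nested sub-configs can be passed as plain
dicts and are resolved in ``__post_init__``):

```python
block_cfg = JinaTransformerBlockConfig(
    attn_config={'n_heads': 16, 'n_kv_heads': 4},
    ffn_config={'activation_type': ActivationType.silu, 'gated_activation': True},
    lnorm_config={'type': LayerNormType.rms},
)
```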
|
|
""" |
|
|
|
|
|
attn_config: Union[None, Dict[str, Any], JinaAttentionConfig] = None |
|
|
attn_lscale_init: Optional[float] = None |
|
|
ffn_config: Union[None, Dict[str, Any], JinaFFNConfig] = None |
|
|
ffn_lscale_init: Optional[float] = None |
|
|
residual_dropout: float = 0.0 |
|
|
residual_response_dropout: float = 0.0 |
|
|
residual_path_dropout: float = 0.0 |
|
|
postnorm: bool = False |
|
|
lnorm_config: Union[None, Dict[str, Any], JinaLNormConfig] = None |
|
|
|
|
|
def __post_init__(self): |
|
|
self.attn_config = _resolve_subconfig(self.attn_config, JinaAttentionConfig) |
|
|
self.ffn_config = _resolve_subconfig(self.ffn_config, JinaFFNConfig) |
|
|
self.lnorm_config = _resolve_subconfig(self.lnorm_config, JinaLNormConfig) |
|
|
|
|
|
|
|
|
@dataclass |
|
|
class JinaVLConnectorConfig: |
|
|
"""Vision Language Connector configuration. |
|
|
|
|
|
Args: |
|
|
pooling_type (`ImagePooling2DType`, *optional*, defaults to |
|
|
`ImagePooling2DType.attention`): |
|
|
The type of 2D pooling to use for image features. |
|
|
padding_embed_type (`ImagePaddingEmbedType`, *optional*, defaults to `None`): |
|
|
The type of image padding embedding to use. |
|
|
projector_type (`ImageProjectionType`, *optional*, defaults to |
|
|
`ImageProjectionType.mlp`):
|
|
The type of image projector to use. |
|
|
attn_pooling_config (`JinaAttentionConfig`, *optional*, defaults to |
|
|
`JinaAttentionConfig()`): |
|
|
The attention pooling configuration. |
|
|
mlp_projector_config (`JinaFFNConfig`, *optional*, defaults to |
|
|
`JinaFFNConfig()`): |
|
|
The MLP projector configuration. |
|
|
pooling_h (`int`, *optional*, defaults to 2): |
|
|
The height of the pooling grid. |
|
|
pooling_w (`int`, *optional*, defaults to 2): |
|
|
The width of the pooling grid. |
|
|
spatial_merge_size (`int`, *optional*, defaults to 2): |
|
|
The spatial merge size for image features. |
|
|
projector_dropout (`float`, *optional*, defaults to 0.0): |
|
|
The dropout probability for the image projector. |
|
|
feature_dropout (`float`, *optional*, defaults to 0.0): |
|
|
The dropout probability for the image features. |
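
Example (an illustrative sketch; the pooling grid and projector choices below
are arbitrary):

```python
connector_cfg = JinaVLConnectorConfig(
    pooling_type=ImagePooling2DType.attention,
    projector_type=ImageProjectionType.mlp,
    pooling_h=2,
    pooling_w=2,
    attn_pooling_config={'n_heads': 8},
)
```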
|
|
""" |
|
|
|
|
|
padding_embed_type: Optional[ImagePaddingEmbedType] = None |
|
|
pooling_type: ImagePooling2DType = ImagePooling2DType.attention |
|
|
projector_type: ImageProjectionType = ImageProjectionType.mlp |
|
|
attn_pooling_config: Optional[JinaAttentionConfig] = None |
|
|
mlp_projector_config: Optional[JinaFFNConfig] = None |
|
|
pooling_h: int = 2 |
|
|
pooling_w: int = 2 |
|
|
spatial_merge_size: int = 2 |
|
|
projector_dropout: float = 0.0 |
|
|
feature_dropout: float = 0.0 |
|
|
|
|
|
def __post_init__(self): |
|
|
self.attn_pooling_config = _resolve_subconfig( |
|
|
self.attn_pooling_config, JinaAttentionConfig |
|
|
) |
|
|
self.mlp_projector_config = _resolve_subconfig( |
|
|
self.mlp_projector_config, JinaFFNConfig |
|
|
) |
|
|
|
|
|
|
|
|
class PretrainedConfigWithDataclasses(PretrainedConfig): |
|
|
"""PretrainedConfig base class with dataclass support.""" |
|
|
|
|
|
def to_dict(self) -> Dict[str, Any]: |
|
|
return { |
|
|
key: asdict(value) if is_dataclass(value) else value |
|
|
for key, value in super().to_dict().items() |
|
|
} |
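
# For example (illustrative): a config whose ``block_config`` attribute holds a
# dataclass instance serializes it as a nested plain dict, e.g.
#
#   JinaVLMVisionConfig().to_dict()['block_config']  -> {'attn_config': {...}, ...}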
|
|
|
|
|
|
|
|
class JinaVLMVisionConfig(PretrainedConfigWithDataclasses): |
|
|
"""JinaVLM Vision Model configuration. |
|
|
|
|
|
Args: |
|
|
block_config (`JinaTransformerBlockConfig`, *optional*, defaults to |
|
|
`JinaTransformerBlockConfig()`):
|
|
The transformer block configuration to use within the ViT. |
|
|
vl_connector_config (`JinaVLConnectorConfig`, *optional*, defaults to |
|
|
`JinaVLConnectorConfig()`): |
|
|
The VLC (Vision Language Connector) configuration. |
|
|
n_layers (`int`, *optional*, defaults to 12): |
|
|
The number of layers/blocks. |
|
|
hidden_size (`int`, *optional*, defaults to 768): |
|
|
The hidden size of the model. |
|
|
input_size (`Tuple[int, int]`, *optional*, defaults to `None`): |
|
|
The input image size as (height, width). If not set, the model can |
|
|
process variable size images. |
|
|
patch_size (`int`, *optional*, defaults to 14): |
|
|
The patch size to use. |
|
|
n_channels (`int`, *optional*, defaults to 3): |
|
|
The number of input channels. |
|
|
linear_patch_embedding (`bool`, *optional*, defaults to `True`): |
|
|
Use a faster linear layer for patch embedding rather than a convolutional |
|
|
layer. Requires a fixed input size. |
|
|
patch_embedding_bias (`bool`, *optional*, defaults to `True`): |
|
|
Add bias to the patch embedding layer. |
|
|
patch_dropout (`float`, *optional*, defaults to 0.0): |
|
|
The dropout probability to use right after the patch embedding layer. |
|
|
pre_lnorm (`bool`, *optional*, defaults to `False`): |
|
|
Apply layer normalization to the features before feeding them into each |
|
|
transformer block. |
|
|
post_lnorm (`bool`, *optional*, defaults to `False`): |
|
|
Apply layer normalization to the features after feeding them into each |
|
|
transformer block. |
|
|
use_absolute_positional_embeddings (`bool`, *optional*, defaults to `True`): |
|
|
Use absolute positional embeddings rather than RoPE. |
|
|
use_cls_token (`bool`, *optional*, defaults to `True`): |
|
|
Use a class token. |
|
|
positional_interpolation (`str`, *optional*, defaults to `'bicubic'`): |
|
|
The interpolation mode to use when interpolating the positional embeddings. |
|
|
vit_layers (`Tuple[int, ...]`, *optional*, defaults to `(-1,)`): |
|
|
The layers of the ViT to use for image features. |
|
|
output_size (`int`, *optional*, defaults to 768): |
|
|
The output size of the vision model, after the connector. |
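
Example (an illustrative sketch; the values describe a small ViT and are not a
released checkpoint configuration):

```python
vision_config = JinaVLMVisionConfig(
    n_layers=12,
    hidden_size=768,
    patch_size=14,
    vit_layers=(-2, -1),
    output_size=768,
    block_config={'attn_config': {'n_heads': 12}},
)
```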
|
|
""" |
|
|
|
|
|
model_type = 'jvlm' |
|
|
base_config_key = 'vision_config' |
|
|
|
|
|
def __init__( |
|
|
self, |
|
|
block_config: Optional[JinaTransformerBlockConfig] = None, |
|
|
vl_connector_config: Optional[JinaVLConnectorConfig] = None, |
|
|
n_layers: int = 12, |
|
|
hidden_size: int = 768, |
|
|
input_size: Optional[Tuple[int, int]] = None, |
|
|
patch_size: int = 14, |
|
|
n_channels: int = 3, |
|
|
linear_patch_embedding: bool = True, |
|
|
patch_embedding_bias: bool = True, |
|
|
patch_dropout: float = 0.0, |
|
|
pre_lnorm: bool = False, |
|
|
post_lnorm: bool = False, |
|
|
use_absolute_positional_embeddings: bool = True, |
|
|
use_cls_token: bool = True, |
|
|
positional_interpolation: str = 'bicubic', |
|
|
vit_layers: Tuple[int, ...] = (-1,), |
|
|
output_size: int = 768, |
|
|
**kwargs: Any, |
|
|
): |
|
|
self.block_config = _resolve_subconfig(block_config, JinaTransformerBlockConfig) |
|
|
self.vl_connector_config = _resolve_subconfig( |
|
|
vl_connector_config, JinaVLConnectorConfig |
|
|
) |
|
|
super().__init__(**kwargs) |
|
|
|
|
|
self.n_layers = n_layers |
|
|
self.hidden_size = hidden_size |
|
|
self.input_size = input_size |
|
|
self.patch_size = patch_size |
|
|
self.n_channels = n_channels |
|
|
self.linear_patch_embedding = linear_patch_embedding |
|
|
self.patch_embedding_bias = patch_embedding_bias |
|
|
self.patch_dropout = patch_dropout |
|
|
self.pre_lnorm = pre_lnorm |
|
|
self.post_lnorm = post_lnorm |
|
|
self.use_absolute_positional_embeddings = use_absolute_positional_embeddings |
|
|
self.use_cls_token = use_cls_token |
|
|
self.positional_interpolation = positional_interpolation |
|
|
self.vit_layers = vit_layers |
|
|
self.output_size = output_size |
|
|
|
|
|
|
|
|
class JinaVLMTextConfig(PretrainedConfigWithDataclasses): |
|
|
"""JinaVLM Text Model configuration. |
|
|
|
|
|
Args: |
|
|
block_config (`JinaTransformerBlockConfig`, *optional*, defaults to |
|
|
`JinaTransformerBlockConfig()`): |
|
|
Decoder LM transformer block configuration. |
|
|
n_layers (`int`, *optional*, defaults to 12): |
|
|
The number of layers/blocks. |
|
|
hidden_size (`int`, *optional*, defaults to 768): |
|
|
The hidden size of the model. |
|
|
embedding_dropout (`float`, *optional*, defaults to 0.0): |
|
|
The dropout probability for embeddings. |
|
|
max_sequence_length (`int`, *optional*, defaults to 1024): |
|
|
The maximum input sequence length supported by the model. |
|
|
max_position_embeddings (`int`, *optional*, defaults to `None`): |
|
|
Max positional embeddings to use in RoPE cache. |
|
|
vocab_size (`int`, *optional*, defaults to 50257): |
|
|
Vocabulary size of the model. |
|
|
embedding_size (`int`, *optional*, defaults to 50304): |
|
|
The number of embeddings, i.e. the number of tokens. If set to ``None`` it |
|
|
will default to ``vocab_size``. If ``vocab_size`` is not a multiple of 128, |
|
|
setting this to the next multiple of 128 that's greater than ``vocab_size`` |
|
|
can improve throughput substantially. |
|
|
additional_vocab_size (`int`, *optional*, defaults to `None`): |
|
|
New tokens to add to the embeddings as part of the vision/language |
|
|
connector. |
|
|
normalize_input_embeds (`bool`, *optional*, defaults to `False`): |
|
|
Normalize input embeddings (both for text and images) before feeding them into the transformer blocks.
|
|
rope (`bool`, *optional*, defaults to `False`): |
|
|
Use rotary positional embeddings (RoPE). |
|
|
rope_theta (`float`, *optional*, defaults to 1000000.0): |
|
|
The base period of the RoPE embeddings. |
|
|
partial_rotary_factor (`float`, *optional*, defaults to 1.0): |
|
|
The fraction of hidden dimensions to apply RoPE to. For example, a value of |
|
|
0.5 will apply RoPE to half of the hidden dimensions, leaving the other half |
|
|
unmodified. |
|
|
rope_scaling (`Dict`, *optional*): |
|
|
Dictionary containing the scaling configuration for the RoPE embeddings. |
|
|
NOTE: if you apply a new rope type and you expect the model to work on |
|
|
longer `max_position_embeddings`, we recommend updating this value
|
|
accordingly. |
|
|
Expected contents: |
|
|
`rope_type` (`str`): |
|
|
The sub-variant of RoPE to use. Can be one of ['default', 'linear', |
|
|
'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the |
|
|
original RoPE implementation. |
|
|
`factor` (`float`, *optional*): |
|
|
Used with all rope types except 'default'. The scaling factor to |
|
|
apply to the RoPE embeddings. In most scaling types, a `factor` of |
|
|
x will enable the model to handle sequences of length x * |
|
|
original maximum pre-trained length. |
|
|
`original_max_position_embeddings` (`int`, *optional*): |
|
|
Used with 'dynamic', 'longrope' and 'llama3'. The original |
|
|
max position embeddings used during pretraining. |
|
|
`attention_factor` (`float`, *optional*): |
|
|
Used with 'yarn' and 'longrope'. The scaling factor to be applied |
|
|
on the attention computation. If unspecified, it defaults to the value
|
|
recommended by the implementation, using the `factor` field to infer |
|
|
the suggested value. |
|
|
`beta_fast` (`float`, *optional*): |
|
|
Only used with 'yarn'. Parameter to set the boundary for |
|
|
extrapolation (only) in the linear ramp function. If unspecified, |
|
|
it defaults to 32. |
|
|
`beta_slow` (`float`, *optional*): |
|
|
Only used with 'yarn'. Parameter to set the boundary for |
|
|
interpolation (only) in the linear ramp function. If unspecified, |
|
|
it defaults to 1. |
|
|
`short_factor` (`list[float]`, *optional*): |
|
|
Only used with 'longrope'. The scaling factor to be applied to |
|
|
short contexts (<`original_max_position_embeddings`). Must be a |
|
|
list of numbers with the same length as the hidden size divided by |
|
|
the number of attention heads divided by 2. |
|
|
`long_factor` (`list[float]`, *optional*): |
|
|
Only used with 'longrope'. The scaling factor to be applied to |
|
|
long contexts (>`original_max_position_embeddings`). Must be a list
|
|
of numbers with the same length as the hidden size divided by the |
|
|
number of attention heads divided by 2. |
|
|
`low_freq_factor` (`float`, *optional*): |
|
|
Only used with 'llama3'. Scaling factor applied to low frequency |
|
|
components of the RoPE. |
|
|
`high_freq_factor` (`float`, *optional*): |
|
|
Only used with 'llama3'. Scaling factor applied to high frequency |
|
|
components of the RoPE. |
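
Example (an illustrative sketch; the `rope_scaling` keys follow the schema
documented above and the values are arbitrary):

```python
text_config = JinaVLMTextConfig(
    n_layers=12,
    hidden_size=768,
    rope=True,
    rope_theta=1000000.0,
    rope_scaling={'rope_type': 'yarn', 'factor': 4.0},
)
```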
|
|
""" |
|
|
|
|
|
model_type = 'jvlm' |
|
|
base_config_key = 'text_config' |
|
|
|
|
|
def __init__( |
|
|
self, |
|
|
block_config: Optional[JinaTransformerBlockConfig] = None, |
|
|
n_layers: int = 12, |
|
|
hidden_size: int = 768, |
|
|
embedding_dropout: float = 0.0, |
|
|
max_sequence_length: int = 1024, |
|
|
max_position_embeddings: Optional[int] = None, |
|
|
vocab_size: int = 50257, |
|
|
embedding_size: Optional[int] = 50304, |
|
|
additional_vocab_size: Optional[int] = None, |
|
|
normalize_input_embeds: bool = False, |
|
|
rope: bool = False, |
|
|
partial_rotary_factor: float = 1.0, |
|
|
rope_theta: float = 1000000.0, |
|
|
rope_scaling: Optional[Dict[str, Any]] = None, |
|
|
**kwargs: Any, |
|
|
): |
|
|
self.block_config = _resolve_subconfig(block_config, JinaTransformerBlockConfig) |
|
|
|
|
|
super().__init__(**kwargs) |
|
|
self.n_layers = n_layers |
|
|
self.hidden_size = hidden_size |
|
|
self.embedding_dropout = embedding_dropout |
|
|
self.max_sequence_length = max_sequence_length |
|
|
self.max_position_embeddings = max_position_embeddings |
|
|
self.vocab_size = vocab_size |
|
|
self.embedding_size = embedding_size |
|
|
self.additional_vocab_size = additional_vocab_size |
|
|
self.normalize_input_embeds = normalize_input_embeds |
|
|
self.rope = rope |
|
|
self.partial_rotary_factor = partial_rotary_factor |
|
|
self.rope_theta = rope_theta |
|
|
self.rope_scaling = rope_scaling |
|
|
|
|
|
|
|
|
class JinaVLMConfig(PretrainedConfig): |
|
|
"""JinaVLM configuration. |
|
|
|
|
|
Args: |
|
|
text_config (`JinaVLMTextConfig`, *optional*, defaults to |
|
|
`JinaVLMTextConfig()`): |
|
|
The text model configuration. |
|
|
vision_config (`JinaVLMVisionConfig`, *optional*, defaults to |
|
|
`JinaVLMVisionConfig()`): |
|
|
The vision model configuration. |
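tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie the input token embeddings and the output (LM head) weights.

Example (an illustrative sketch; sub-configs may be passed as instances or as
plain dicts, and `text_config.hidden_size` must equal
`vision_config.output_size`):

```python
config = JinaVLMConfig(
    text_config={'n_layers': 12, 'hidden_size': 768},
    vision_config={'n_layers': 12, 'output_size': 768},
)
```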
|
|
""" |
|
|
|
|
|
model_type = 'jvlm' |
|
|
sub_configs = { |
|
|
'vision_config': JinaVLMVisionConfig, 'text_config': JinaVLMTextConfig |
|
|
} |
|
|
|
|
|
def __init__( |
|
|
self, |
|
|
text_config: Optional[JinaVLMTextConfig] = None, |
|
|
vision_config: Optional[JinaVLMVisionConfig] = None, |
|
|
tie_word_embeddings: bool = False, |
|
|
**kwargs, |
|
|
): |
|
|
self.text_config = _resolve_subconfig( |
|
|
text_config, self.sub_configs['text_config'] |
|
|
) |
|
|
self.vision_config = _resolve_subconfig( |
|
|
vision_config, self.sub_configs['vision_config'] |
|
|
) |
|
|
if self.text_config.hidden_size != self.vision_config.output_size: |
|
|
raise ValueError( |
|
|
f'Text hidden size ({self.text_config.hidden_size}) and ' |
|
|
f'vision output size ({self.vision_config.output_size}) must be ' |
|
|
'the same for JinaVL' |
|
|
) |
|
|
super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings) |
|
|
|
|
|
|
|
|
JinaVLMConfig.register_for_auto_class() |
|
|
|