
    i                       d dl Z d dlmZ d dlmZmZ d dlZd dlmZ ddl	m
Z ddlmZmZmZ ddlmZmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZD ddlEmFZF ddlGmHZHmIZImJZJmKZK  e/j                  eM      ZN G d de8e      ZO G d de7      ZP G d de8e      ZQ G d de      ZR G d d e>      ZS G d! d"e;      ZT G d# d$e?      ZU G d% d&e:      ZV G d' d(e:      ZWdId)eXd*efd+ZY G d, d-eI      ZZ G d. d/eI      Z[ G d0 d1eJ      Z\ G d2 d3eH      Z] G d4 d5e<      Z^ G d6 d7e@      Z_e- G d8 d9e=             Z` G d: d;e`      Za G d< d=e`      Zb G d> d?e`      Zce- G d@ dAe`             Zd G dB dCe`e      Zee- G dD dEe`             Zfe- G dF dGe`             Zgg dHZhy)J    N)Callable)AnyOptional   )initialization)DynamicCacheEncoderDecoderCacheStaticCache)PreTrainedConfiglayer_type_validation)GenerationConfigGenerationMixinGenerationMode)create_bidirectional_mask)FlashAttentionKwargs)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPoolingSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSRopeParameters)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)OutputRecordercheck_model_inputs   )	AutoModel)Gemma3ConfigGemma3TextConfig)Gemma3Attention	Gemma3MLPGemma3MultiModalProjectorGemma3PreTrainedModelGemma3RMSNormGemma3RotaryEmbeddingGemma3TextScaledWordEmbeddingapply_rotary_pos_embcreate_causal_mask!create_sliding_window_causal_maskeager_attention_forward)SiglipVisionConfig)T5GemmaClassificationHeadT5GemmaEncoderLayerT5GemmaLMHeadbidirectional_mask_functionc            2       H   e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedz  dedz  dedz  dedz  dedz  d	edz  d
edz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  de	e   dz  dedz  dedz  de
eee
f   z  dz  f.dZy)T5Gemma2TextConfiga  
    This is the configuration class to store the configuration of a [`T5Gemma2TextModel`]. It is used to instantiate the encoder's
    text model portion of the T5Gemma2 Model according to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to that of the T5Gemma2Text-7B.
    e.g. [google/t5gemma2_text-7b](https://huggingface.co/google/t5gemma2_text-7b)
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the T5Gemma2Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`T5Gemma2TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In T5Gemma2Text, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
    t5gemma2_textN
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dimhidden_activationmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_ideos_token_idbos_token_idattention_biasattention_dropoutquery_pre_attn_scalarsliding_windowlayer_typesfinal_logit_softcappingattn_logit_softcappingrope_parametersc                    || _         || _        || _        || _        |	| _        || _        || _        || _        || _        || _	        || _
        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        |j-                  dd      | _        | j*                  Et1        | j                        D cg c]!  }t3        |dz   | j.                  z        rdnd# c}| _        t5        | j*                  | j                         || _        t9        j:                  di | y c c}w Nsliding_window_pattern      sliding_attentionfull_attention rH   rJ   rI   r<   rD   r=   r>   r?   r@   rB   rA   rE   rF   rG   rK   rL   rC   rM   rN   rP   rQ   rO   get_sliding_window_patternrangeboolr   rR   r   __init__selfr<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   kwargsis                             w/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/t5gemma2/modular_t5gemma2.pyr`   zT5Gemma2TextConfig.__init__   N   6 )(($'>$&!2!2#6  #6 !2(",!2!2%:",'>$&<#& (.zz2JA'N$# t556  (,QUd6R6R,R'S#Yii D 	d..0F0FG.!!+F+    &E i@  i 	  i $              gelu_pytorch_tanhi   {Gz?gư>Tr   rW   r%   F        rl   i   NNNN__name__
__module____qualname____doc__
model_typeintstrfloatr_   listr   dictr`   rZ       re   r:   r:   K   s   BH !J ")"&(,(**+*+"(;.5*.#'!%#$#$#$&+*-,/%)(,04/3MQ1=,$J=, 4Z=, :	=,
 :=, !4Z=, !4Z=, *=, :=, "%t=, !4<=, Dj=, $;=, Dj=, Dj=,  Dj!=," t#=,$ !4<%=,&  #Tz'=,( d
)=,* #Y%+=,, "'-=,. !&/=,0 ($sN/B*CCdJ1=,r{   r:   c                       e Zd ZdZeedZy)T5Gemma2EncoderConfigt5gemma2_encoder)text_configvision_configN)rq   rr   rs   ru   r:   r4   sub_configsrZ   r{   re   r}   r}      s    #J *+Kr{   r}   c            2       H   e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedz  dedz  dedz  dedz  dedz  d	edz  d
edz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  de	e   dz  dedz  dedz  de
eee
f   z  dz  f.dZy)T5Gemma2DecoderConfiga
  
    This is the configuration class to store the configuration of a [`T5Gemma2DecoderModel`]. It is used to instantiate the decoder
    text model portion of the T5Gemma2 Model according to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to that of the T5Gemma2Decoder-7B.
    e.g. [google/t5gemma2_text-7b](https://huggingface.co/google/t5gemma2_text-7b)
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the T5Gemma2Decoder model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`T5Gemma2DecoderModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In T5Gemma2Decoder, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
    t5gemma2_decoderNr<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   c                    || _         || _        || _        || _        |	| _        || _        || _        || _        || _        || _	        || _
        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        |j-                  dd      | _        | j*                  Et1        | j                        D cg c]!  }t3        |dz   | j.                  z        rdnd# c}| _        t5        | j*                  | j                         || _        t9        j:                  di | y c c}w rT   r[   ra   s                             re   r`   zT5Gemma2DecoderConfig.__init__"  rf   rg   rh   rp   rZ   r{   re   r   r      s   BH $J ")"&(,(**+*+"(;.5*.#'!%#$#$#$&+*-,/%)(,04/3MQ1=,$J=, 4Z=, :	=,
 :=, !4Z=, !4Z=, *=, :=, "%t=, !4<=, Dj=, $;=, Dj=, Dj=,  Dj!=," t#=,$ !4<%=,&  #Tz'=,( d
)=,* #Y%+=,, "'-=,. !&/=,0 ($sN/B*CCdJ1=,r{   r   c                        e Zd ZdZdZdgZeedZdddZ		 	 	 	 	 	 	 	 	 dd	ee
eef   z  dz  d
ee
eef   z  dz  dedededededededz  f fdZ xZS )T5Gemma2ConfigaV  
    This is the configuration class to store the configuration of a [`T5Gemma2Model`]. It is used to instantiate an T5Gemma2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to a hypothetical balanced Gemma3 encoder-decoder model.
    e.g. [google/t5gemma-2-270m-270m](https://huggingface.co/google/t5gemma-2-270m-270m)
    Configuration objects inherit from [PreTrainedConfig] and can be used to control the model outputs. Read the
    documentation from [PreTrainedConfig] for more information.

    Args:
        encoder (`Union[T5Gemma2EncoderConfig, dict]`, optional, *optional*):
            Configuration for the encoder.
        decoder (`Union[T5Gemma2DecoderConfig, dict]`, optional, *optional*):
            Configuration for the decoder.
        is_encoder_decoder (bool, optional, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            The ratio for all dropout layers (following T5).
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for attention.
        classifier_dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier (following T5).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        image_token_index (`int`, *optional*, defaults to 256001):
            The image token index to encode the image prompt. Defaults to 256001, which is right after the eoi_token_index.
            Note this is different from Gemma 3.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    ```python
    >>> from transformers import T5Gemma2Config, T5Gemma2Model
    >>> t5gemma2_config = T5Gemma2Config.from_pretrained("google/t5gemma-270m-270m")
    >>> model = T5Gemma2Model(t5gemma2_config)
    ```
    t5gemma2past_key_values)encoderdecoderimage_token_indexeoi_token_index)image_token_ideoi_token_idNr   r   is_encoder_decoderdropout_raterL   classifier_dropout_raterE   tie_word_embeddingsc
                 ^   t        |t              rt        di |}nI| t               }t        j	                  d       n't        |t              st        t        |       d      t        |t              rt        di |}nI| t               }t        j	                  d       n't        |t              st        t        |       d      |j                  j                  |j                  k7  r0t        d|j                  j                   d|j                   d      |st        d      |j                  j                  |j                  k7  r0t        d|j                  j                   d|j                   d      ||j                  _        ||j                  _        ||j                  _        ||_        || _        ||_        ||_        || _        d	D ]  }||
vst#        ||      |
|<    || _        || _        |j(                  | _        || _        |	| _        t-        | \  dd
|i|
 y )NzDencoder is None, using default T5Gemma2EncoderConfig encoder config.z is not supported.zDdecoder is None, using default T5Gemma2DecoderConfig decoder config.zBImbalanced encoder-decoder is not supported in T5Gemma2: encoder (z) vs decoder (z).z4T5Gemma2Model only support encoder-decoder modeling.zRImbalanced encoder-decoder vocabulary size is not supported in T5Gemma2: encoder ()rJ   rH   rI   r<   r   rZ   )
isinstancerz   r}   loggerinfo
ValueErrortyper   r   r=   r<   r   rL   r   r   r   r   getattrr   rE   r   r   superr`   )rb   r   r   r   r   rL   r   rE   r   r   rc   special_token_key	__class__s               re   r`   zT5Gemma2Config.__init__  s+    gt$+6g6G_+-GKK^_g'<= DM?2D!EFFgt$+6g6G_+-GKK^_g'<= DM?2D!EFF**g.A.AA#//;;<N7K^K^J__ac 
 "STT))W-?-??#//::;>'J\J\I]]_a  ,8(0A-2C/$5!  ,$5!!_ 	P .,3G=N,O()	P (?$!2&66!2#6 I,>I&Ir{   )	NNTro   ro   ro   rn   i T)rq   rr   rs   rt   ru   keys_to_ignore_at_inferencer}   r   r   attribute_maprz   rw   r   r_   rx   rv   r`   __classcell__r   s   @re   r   r   b  s    "H J#4"5 )(K .)M BFAE#'!#&),#'!(+/DJ&c3h7$>DJ 'c3h7$>DJ !	DJ
 DJ !DJ "'DJ !DJ DJ "D[DJ DJr{   r   c                       e Zd Zy)T5Gemma2RMSNormNrq   rr   rs   rZ   r{   re   r   r         r{   r   c                   *     e Zd Zdef fdZd Z xZS )T5Gemma2MLPconfigc                 l    t         |   |       t        j                  |j                        | _        y N)r   r`   nnDropoutr   dropoutrb   r   r   s     re   r`   zT5Gemma2MLP.__init__  s&     zz&"5"56r{   c                     | j                  | j                  |            | j                  |      z  }| j                  |      }| j	                  |      }|S r   )act_fn	gate_projup_projr   	down_proj)rb   xhidden_statesr   s       re   forwardzT5Gemma2MLP.forward  sH    DNN1$56aH]3NN=1	r{   )rq   rr   rs   r:   r`   r   r   r   s   @re   r   r     s    71 7r{   r   c                   |     e Zd Zddef fdZe	 	 	 	 ddedz  ded   dedz  dedz  de	d	e
f   f
 fd
       Z xZS )T5Gemma2RotaryEmbeddingNr   c                 &    t         |   ||       y r   r   r`   )rb   r   devicer   s      re   r`   z T5Gemma2RotaryEmbedding.__init__  s    (r{   r   ztorch.deviceseq_len
layer_typereturnztorch.Tensorc                 (    t         |   | |||      S r   )r   compute_default_rope_parameters)r   r   r   r   r   s       re   r   z7T5Gemma2RotaryEmbedding.compute_default_rope_parameters  s     w6vvwPZ[[r{   r   )NNNN)rq   rr   rs   r:   r`   staticmethodr   rv   rw   tuplerx   r   r   r   s   @re   r   r     s    )1 ) ,0+/"!%	\"T)\(\ t\ $J	\
 
~u$	%\ \r{   r   c                   (     e Zd Zdedef fdZ xZS )T5Gemma2SelfAttentionr   	layer_idxc                 4    t         |   ||       d| _        y NFr   r`   	is_causalrb   r   r   r   s      re   r`   zT5Gemma2SelfAttention.__init__      +r{   )rq   rr   rs   r:   rv   r`   r   r   s   @re   r   r     s    1 c  r{   r   c                   N    e Zd ZdZdedef fdZ	 	 ddej                  de	ej                  ej                  f   dej                  dz  d	ej                  d
e
dz  dej                  dz  dee   de	ej                  ej                  dz  e	ej                     dz  f   fdZ xZS )T5Gemma2MergedAttentionz6Merged self-attention and cross-attention for decoder.r   r   c                 4    t         |   ||       d| _        y r   r   r   s      re   r`   z T5Gemma2MergedAttention.__init__  r   r{   Nr   position_embeddingsmerged_attention_maskencoder_hidden_statesr   cache_positionrc   r   c                    |j                   d d }g |d| j                  }	|j                   d d }
g |
d| j                  }| j                  |      j                  |	      j	                  dd      }| j                  |      j                  |	      j	                  dd      }| j                  |      j                  |	      j	                  dd      }| j                  |      }| j                  |      }|\  }}t        ||||      \  }}|d|||d}|j                  }|j                  ||| j                  |      \  }}|j                  j                  | j                        }|j                  }|s| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      }|j                  ||| j                        \  }}d|j                  | j                  <   nFj                   | j                     j"                  }|j                   | j                     j$                  }|}|
d   }t'        j(                  ||gd      }t'        j(                  ||gd      }t+        j,                  | j.                  j0                  t2              } || ||||f| j4                  r| j6                  nd| j8                  d|\  }} |j:                  g |d j=                         }| j?                  |      }||d	d | f   }|d	| d f   }nd
\  }}|||fS )NrW   r%   )sincosr   Tdimro   )r   scaling.NN) shaperB   q_projview	transposek_projv_projq_normk_normr0   self_attention_cacheupdater   
is_updatedr\   cross_attention_cachelayerskeysvaluestorchcatr   get_interfacer   _attn_implementationr3   trainingrL   r   reshape
contiguouso_proj)rb   r   r   r   r   r   r   rc   input_shapehidden_shapecross_input_shapecross_hidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsr   r   r   cross_key_statescross_value_statescross_key_sizeattention_interfaceattn_outputattn_weightsself_attn_weightscross_attn_weightss                                re   r   zT5Gemma2MergedAttention.forward  s    $))#2.88b8$--8177<D0D"DdmmD {{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j& $'snUL#2#G#G ';'B'BL$..,($J
 )3377GJ$3$I$I!"*#{{+@AFFGYZddefhij!%-B!C!H!HI[!\!f!fghjk!l#{{+;<*7L7S7S$&8$..84 "4 >B**4>>:4;;DNNKPP!6!=!=dnn!M!T!T $*1-YY
,<=1E
yy,0B!CK(?(M(MKK,,.E)
 %8!	%
 /3mmD**LL	%
 	%
!\ *k));;;;FFHkk+. # ,S2BN?2B-B C!-cN?3C.C!D4>11-/AAAr{   r   )rq   rr   rs   rt   r:   rv   r`   r   Tensorr   r	   
LongTensorr   r   r   r   r   s   @re   r   r     s    @1 c  7;26YB ||YB #5<<#=>	YB
  %||d2YB  %||YB -t3YB ((4/YB -.YB 
u||U\\D0%2E2LL	MYBr{   r   rN   r   c           
      T     dt         dt         dt         dt         dt        f
 fd}|S )zL
    This creates uni/bidirectional attention mask with sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c                 t    	r
d}}n
dz   dz  
dz  dz   }}||z
  }|dk\  ||k  z  }|dk  | |k  z  }||z  S )Nr   rW   r%   rZ   )r  r  r  r  left_window_sizeright_window_sizedist	left_mask
right_maskr   rN   s            re   
inner_maskz0sliding_window_mask_function.<locals>.inner_maskg  sp    2@!/4BQ4F13L~bcNcfgNg/v~QY4*:#:;	QhD5+<#<=
:%%r{   )rv   r_   )rN   r   r  s   `` re   sliding_window_mask_functionr  b  s3    
	&c 	&S 	& 	&c 	&d 	& r{   c                       e Zd Zy)T5Gemma2EncoderLayerNr   rZ   r{   re   r  r  u  r   r{   r  c                   0    e Zd ZdZdef fdZ	 	 	 	 	 	 ddej                  deej                  ej                  f   dej                  dz  dej                  dz  d	e
dz  d
edz  dej                  dz  dej                  dz  dej                  fdZ xZS )T5Gemma2DecoderLayerzFDecoder sub-layer: merged attention instead of vanilla self-attention.r   c                 J    t         |   ||       t        ||      | _        y )N)r   r   )r   r`   r   	self_attnr   s      re   r`   zT5Gemma2DecoderLayer.__init__|  s&    + 1
r{   Nr   r   r   position_idsr   rG   r   r   r   c	                 F   |}
| j                  |      } | j                  d||||||||d|	\  }}}| j                  |      }|
| j                  |      z   }|}
| j	                  |      }| j                  |      }| j                  |      }|
| j                  |      z   }|S )N)r   r   r   r  r   rG   r   r   rZ   )pre_self_attn_layernormr  post_self_attn_layernormr   pre_feedforward_layernormmlppost_feedforward_layernorm)rb   r   r   r   r  r   rG   r   r   rc   residual_s               re   r   zT5Gemma2DecoderLayer.forward  s     !44]C,dnn 

' 3"7%+)"7

 

q! 55mD 4<<#>> 66}E/77F 4<<#>>r{   )NNNFNN)rq   rr   rs   rt   rv   r`   r   r  r   r  r	   r_   FloatTensorr   r   r   s   @re   r  r  y  s    P
# 
 6:046:!&2659"||" #5<<#=>"  %||d2	"
 &&-" -t3" $;" ((4/"  %||d2" 
		"r{   r  c                       e Zd Zy)T5Gemma2LMHeadNr   rZ   r{   re   r"  r"    r   r{   r"  c                       e Zd Zy)T5Gemma2ClassificationHeadNr   rZ   r{   re   r$  r$    r   r{   r$  c                   $     e Zd Zdef fdZ xZS )T5Gemma2MultiModalProjectorr   c                 $    t         |   |       y r   r   r   s     re   r`   z$T5Gemma2MultiModalProjector.__init__  s     r{   )rq   rr   rs   r}   r`   r   r   s   @re   r&  r&    s    !4 ! !r{   r&  c                   b     e Zd ZdZ	 	 d
dededededef
 fdZdej                  f fd	Z	 xZ
S )T5Gemma2TextScaledWordEmbeddingzCT5Gemma2 Embedding: override to add eoi token embedding separately.num_embeddingsembedding_dimpadding_idxembed_scaler   c                     t         |   ||||       || _        t        j                  t        j                  | j                              | _        y r   )	r   r`   r   r   	Parameterr   zerosr+  eoi_embedding)rb   r*  r+  r,  r-  r   r   s         re   r`   z(T5Gemma2TextScaledWordEmbedding.__init__  s@     	[Q.\\%++d6H6H*IJr{   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  }| j                  j                  |j
                        ||| j                  k(  <   |S r   )r   r   r-  toweightdtyper1  r   )rb   r2  input_embeddingsr   s      re   r   z'T5Gemma2TextScaledWordEmbedding.forward  sf     7?958H8H8K8KDKKL]L]8^^>B>P>P>S>STdTjTj>kd&:&::;r{   )g      ?  )rq   rr   rs   rt   rv   rx   r`   r   r  r   r   r   s   @re   r)  r)    s^    M !&
K
K 
K 	
K
 
K 
K     r{   r)  c                       e Zd ZU eed<   dZdZdZdZg dZ	e
eg eedd       eedd       eed	d
      gdZd Zd Zy)T5Gemma2PreTrainedModelr   modelTF)r  r  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadrW   r  )index
layer_namer%   
cross_attn)r   
attentionsc                    t        j                  | |       t        |t              r t	        j
                  |j                         y t        |t              rJt	        j
                  |j                         t	        j                  |j                  |j                         y t        |t              r|j                  j                  j                  d   dz  }t	        j                   |j                  j                  d| j"                  j$                  |z         t'        |j                  d      rA|j                  j(                  *t	        j
                  |j                  j(                         y y y d|j*                  j,                  v r t	        j
                  |j                         y t        |t.              r|j0                  D ]  }|j2                  }|j4                  |   dk7  rt6        |j4                  |      } ||j"                  |      \  }}t	        j8                  t;        || d	      |       t	        j8                  t;        || d
      |        y y )Nr   g      ro   )meanstdbiasRMSNormdefault)r   	_inv_freq_original_inv_freq)r   _init_weightsr   r&  initzeros_mm_input_projection_weightr)  r1  	constant_r-  scalar_embed_scaler$  out_projr5  r   normal_r   rE   hasattrrF  r   rq   r   rO   r   	rope_typer   copy_r   )rb   modulescaler   rope_init_fncurr_inv_freqr  s          re   rK  z%T5Gemma2PreTrainedModel._init_weights  s   %%dF3f9:KK99: ?@KK,,-NN6--v/H/HI :;OO**003t;ELL//ct{{?\?\_d?dev/FOO4H4H4TFOO001 5U/ &**333KK& 78$00 ^
%EE##J/9<#6v7G7G
7S#TL#/*#U q

76j\+CDmT

76j\9K+LM}]^ 9r{   c                 <   | j                   j                  }|j                  }|j                  }|t	        d      |j                  |j                        }|dddf   j                         |dddf<   ||d<   |t	        d      |j                  |dk(  |       |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .r   rW   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	r   r   rJ   rH   r   	new_zerosr   clonemasked_fill_)rb   r2  decoder_configdecoder_start_token_idrH   shifted_input_idss         re   %prepare_decoder_input_ids_from_labelsz=T5Gemma2PreTrainedModel.prepare_decoder_input_ids_from_labels   s     ,,!/!<!<%22!)YZZ &//	@%.sCRCx%8%>%>%@#qr'"$:&!XYY 	&&'8D'@,O  r{   N)rq   rr   rs   r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_flex_attn_no_split_modulesr  r  r#   r   r   _can_record_outputsrK  ra  rZ   r{   re   r:  r:    st    &*# ! /0DE0kR2!T2!U
^0!r{   r:  c                       e Zd ZU eed<   eedZ	 ddedef fdZ	e
e	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dee   defd              Z xZS )T5Gemma2TextEncoderr   )rB  r   r   c           	      ^   t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  | j                  |j                  dz  |      | _        t        |j                  |j                        | _
        d| _        t        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t        j$                  |j&                        | _        t+        |      | _        | j/                          y c c}w Ng      ?)r-  r   )epsF)r   r`   rH   r,  r<   r)  r=   embed_tokensr   rF   normgradient_checkpointingr   
ModuleListr^   r?   r  r   r   r   r   r   
rotary_emb	post_initrb   r   r   r   r   s       re   r`   zT5Gemma2TextEncoder.__init__#  s    
 	 !.. ++;**C/+
 $F$6$6F<O<OP	&+#mmFKFLdLdFef!&)4f
 zz&"5"561&9 	 g   D*Nr2  attention_maskr  inputs_embedstoken_type_idsrc   r   c           
      0   |d u |d uz  rt        d      |j                  dd        || j                  |      }|>t        j                  d|j
                  d   |j                        j                  d      }t        |x}t              sJ| j                  ||d}t        di |t        di |dt        | j                  j                  d	      id
}|}	i }
| j                  j                  D ]  }| j                  |	||      |
|<    | j!                  |	      }	| j"                  d | j                  j$                   D ](  } ||	|
|j&                     ||j&                     |fi |}	* | j)                  |	      }	| j!                  |	      }	t+        |	      S )N:You must specify exactly one of input_ids or inputs_embedsr   r   rW   r   )r   input_embedsrv  and_mask_functionF)r   rY   rX   last_hidden_staterZ   )r   poprn  r   aranger   r   	unsqueezer   rz   r   r   r  rN   rO   rr  r   r   r?   attention_typero  r   )rb   r2  rv  r  rw  rx  rc   self_attn_mask_mappingmask_kwargsr   r   r   layer_modules                re   r   zT5Gemma2TextEncoder.forward?  s    -t";<YZZ 	

$d+  --i8M <<=+>+>q+A-J^J^_iijklLNB0DI++ -"0K #<"Jk"J%> &!&&B4;;C]C]in&o&&" & !++11 	gJ.2oom\[e.f
+	g ]3 KK(G$++*G*GH 	L(#L$?$?@&|'B'BC	
 M	 		-0]3+
 	
r{   r8  )NNNNN)rq   rr   rs   r:   rb  r   r  rh  rv   r`   r$   r   r   r  r  r   r   r   r   r   r   r   s   @re   rj  rj    s    +-  '" 8  .2.20426.2<
##d*<
 t+<
 &&-	<

 ((4/<
 t+<
 +,<
 
<
  <
r{   rj  c                       e Zd ZU eed<   	 ddedef fdZd Zd Ze	e
dej                  dee   deez  fd	              Zd
ej$                  dz  dej&                  dz  dej&                  fdZee
	 	 	 	 	 	 dd
ej$                  dz  dej                  dz  dej$                  dz  dej&                  dz  dej&                  dz  dej                  dz  dee   defd              Z xZS )T5Gemma2Encoderr   r   c                     t         |   |       t        j                  |j                  |      | _        t        j                  |j                        | _	        t        |      | _        | j                          y )N)r   r   )r   r`   rj  _from_configr   
text_modelr&   from_configr   vision_towerr&  multi_modal_projectorrs  )rb   r   r   r   s      re   r`   zT5Gemma2Encoder.__init__  sb    
 	 -::6;M;M_n:o%119M9MN%@%H" 	r{   c                 6    | j                   j                         S r   )r  get_input_embeddingsrb   s    re   r  z$T5Gemma2Encoder.get_input_embeddings  s    3355r{   c                 8    | j                   j                  |      S r   )r  set_input_embeddingsrb   new_embeddingss     re   r  z$T5Gemma2Encoder.set_input_embeddings  s    33NCCr{   pixel_valuesrc   r   c                 x     | j                   d|dd|}|j                  }| j                  |      }||_        |S )NT)r  return_dictrZ   )r  r  r  pooler_output)rb   r  rc   vision_outputsr  image_featuress         re   get_image_featuresz"T5Gemma2Encoder.get_image_features  sM     +**aRVaZ`a*<<334EF'5$r{   r2  Nrw  r  c                 D   | j                   j                  }|f|t        d      | | j                         t	        j
                  |t        j                  |j                              k(  }|j                  d      }n||k(  }|j                         }|j                  d      j                  |      j                  |j                        }|j                  d   |j                  d   z  }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        z9Either `input_ids` or `inputs_embeds` has to be provided.)r6  r   r   r   rW   z6Image features and image tokens do not match: tokens: z, features )r   r   r   r  r   tensorlongr   allsumr  	expand_asr4  r   r"   numel)rb   r2  rw  r  r   special_image_maskn_image_tokensn_image_featuress           re   get_image_placeholder_maskz*T5Gemma2Encoder.get_image_placeholder_mask  s"    33$ !\]]!.2M$2K2K2M^5::mFZFZ[3 " "4!7!7!;!*n!<+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL,-3359M9M9OOD^DTT_`p_qr	
 "!r{   rv  r  rx  c                 ~   |d u |d uz  rt        d      || j                  j                  |      }|i| j                  |d      j                  }|j                  |j                  |j                        }| j                  |||      }	|j                  |	|      } | j                  d|||d|}
t        |
      S )Nrz  T)r  )rw  r  )rw  rv  r  r  rZ   )r   r  rn  r  r  r4  r   r6  r  masked_scatterr   )rb   r2  rv  r  rw  r  rx  rc   r  
image_maskr   s              re   r   zT5Gemma2Encoder.forward  s     -t";<YZZ  OO88CM#!44\t4TbbN+..}/C/C]EXEXYN88~ 9 J *88^TM' 
')%
 	
 +
 	
r{   r  )NNNNNN)rq   rr   rs   r}   rb  rv   r`   r  r  r    r   r   r  r   r   r   r   r  r  r   r  r$   r   r   r   r   s   @re   r  r    ss   !!
  '% 6D 
!LL
4:;M4N
	+	+
  
"##d*" ((4/" ))	"<  .2.2042615.2$
##d*$
 t+$
 &&-	$

 ((4/$
 ''$.$
 t+$
 +,$
 
$
  $
r{   r  c                       e Zd ZU eed<    eed       eed      edZddede	f fdZ
ee	 	 	 	 	 	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dedz  dej"                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  dee   defd              Z xZS )T5Gemma2Decoderr   rW   )r?  r%   )rB  cross_attentionsr   r   c           	      ^   t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  |j                  |j                  dz  |      | _        t        |j                  |j                        | _
        d| _        t        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t        j$                  |j&                        | _        t+        |      | _        | j/                          y c c}w rl  )r   r`   rH   r,  r<   r)  r=   rn  r   rF   ro  rp  r   rq  r^   r?   r  r   r   r   r   r   rr  rs  rt  s       re   r`   zT5Gemma2Decoder.__init__  s     !.. ++;**C/+
 $F$6$6F<O<OP	&+#mmFKFLdLdFef!&)4f
 zz&"5"561&9	 gru  Nr2  rv  r  r   rw  rG   r   r   encoder_attention_maskrc   r   c
                    |d u |d uz  rt        d      |t        d      || j                  |      }| j                  s,|r*|(t        t	        | j
                        t	                     }|F||j                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      }t        |x}t              s>| j
                  |||||j                  nd |d}d |d	<   t        di |t!        di |d
}t        |	x}t              s-| j
                  ||	|d d d}dt        di |dt#        |	      ii}t        j$                  |d   |d   gd      t        j$                  |d   |d   gd      d
}|}i }| j
                  j&                  D ]  }| j)                  |||      ||<    | j+                  |      }| j,                  d | j
                  j.                   D ],  } ||||j0                     ||j0                     |||||fi |
}. | j3                  |      }| j+                  |      }t5        ||      S )Nrz  z0`encoder_hidden_states` must be given in decoderr  r   rW   r{  )r   r|  rv  r   r   r  c                  L    t        j                  dt         j                        S )NT)r6  )r   r  r_   )argss    re   <lambda>z)T5Gemma2Decoder.forward.<locals>.<lambda>8  s    U\\$V[V`V`=a r{   r}  r~  rY   or_mask_functionr   r   rX   )r  r   rZ   )r   rn  r   r	   r   r   get_seq_lengthr   r  r   r   r  r   rz   r   r1   r2   r8   r   rO   rr  r   r   r?   r  ro  r   )rb   r2  rv  r  r   rw  rG   r   r   r  rc   past_seen_tokensr  r  cross_attn_mask_mappingmerged_attn_mask_mappingr   r   r   r  s                       re   r   zT5Gemma2Decoder.forward
  s    -t";<YZZ (OPP  --i8M}}/F1,dkk2RT`TbcO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6LNB0DI++ -"0"0KZKf?#G#Glp ,K 0bK+,"4"C{"C%F%U%U&"
 5KK1TR++ 5"8"0#' $K !"4 #!#%@AW%X#'# $ii'(89;RSc;dekm "''(;<>UVf>ghnp"	$
  & !++11 	gJ.2oom\[e.f
+	g ]3 KK(G$++*G*GH 	L(#L$?$?@()D)DE%
 
M	 		-0]38++
 	
r{   r  )	NNNNNNNNN)rq   rr   rs   r   rb  r#   r   r  rh  rv   r`   r$   r   r   r  r  r	   r   r_   r   r   r   r   r   r   s   @re   r  r    sF   !!$%<AF*+B!L-4 s ,  .2.2046:26!%26596:h
##d*h
 t+h
 &&-	h

 -t3h
 ((4/h
 $;h
 ((4/h
  %||d2h
 !&t 3h
 +,h
 
3h
  h
r{   r  c            !           e Zd ZdddZdef fdZd Zd Zd Zd	 Z	e
e	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  ded
z  ded
z  dej$                  d
z  dej$                  d
z  ded
z  dej                  d
z  dee   defd              Z xZS )T5Gemma2Modelz&encoder.text_model.embed_tokens.weightz-encoder.text_model.embed_tokens.eoi_embedding)zdecoder.embed_tokens.weightz"decoder.embed_tokens.eoi_embeddingr   c                     t         |   |       t        |j                  |j                        | _        t        |j                  |j                        | _        | j                          y r   )r   r`   r  r   r   r  r   rs  r   s     re   r`   zT5Gemma2Model.__init__~  sL      'v~~v7M7MN&v~~v7M7MNr{   c                     | j                   S r   )r   r  s    re   get_encoderzT5Gemma2Model.get_encoder      ||r{   c                     | j                   S r   r   r  s    re   get_decoderzT5Gemma2Model.get_decoder  r  r{   c                 6    | j                   j                         S r   )r   r  r  s    re   r  z"T5Gemma2Model.get_input_embeddings  s    ||0022r{   c                 8    | j                   j                  |      S r   )r   r  r  s     re   r  z"T5Gemma2Model.set_input_embeddings  s    ||00@@r{   Nr2  r  rv  r  decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr   rw  decoder_inputs_embedsrG   r   rc   r   c                 P   | | j                   d||||
|dd|}|j                  } | j                  d|||||	||||dd
|}t        |j                  |j                  |j
                  |j                  |j                  |j                  |j
                  |j                        S )aX  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        T)r2  rv  r  rw  r  r  )
r2  rv  r  rw  r   r   r  rG   r   r  )r  r   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentionsrZ   )r   r  r   r   r   r   rB  r  )rb   r2  r  rv  r  r  r  r  r  r   rw  r  rG   r   rc   r   decoder_outputss                    re   r   zT5Gemma2Model.forward  s    8 "*dll #-)+)  O !0 A A '$,, 
'1-/+"7#1)
 
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r{   )NNNNNNNNNNNNN)rq   rr   rs   _tied_weights_keysr   r`   r  r  r  r  r    r   r   r  r   
BoolTensorr   r	   r  r_   r   r   r   r   r   r   s   @re   r  r  w  s    (P.]
~ 3A  .215370459:>8<266:-159!%26#?
 ##d*?
 ''$.	?

 ))D0?
 &&-?
 !++d2?
 !& 0 04 7?
 $..5?
 )4/?
 -t3?
 ||d*?
  %||d2?
  $;!?
" ((4/#?
$ +,%?
& 
'?
  ?
r{   r  c            &           e Zd ZddiZddiZddgdgfiZdef fdZd	 Zd
 Z	d Z
d Zd Zd Zeedej"                  dee   deez  fd              Zed        Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d*dej2                  dz  dej4                  dz  dej4                  dz  dej2                  dz  dej2                  dz  dej6                  dz  dej2                  dz  dedz  dedz  dej4                  dz  dej4                  dz  dej2                  dz  d edz  d!ej2                  dz  d"eej"                  z  dee   deej4                     e z  f"d#              Z!d$e"d%e#d&e$d'ed(edef fd)Z% xZ&S )+ T5Gemma2ForConditionalGenerationzlm_head.out_proj.weightz,model.encoder.text_model.embed_tokens.weightzlm_head.out_projcolwise_gather_outputr   logitsr   c                    t         |   |       t        |      | _        |j                  j
                  | _        t        |j                  j                  | j
                        | _        d| _	        | j                          y )NForMaskedLM)r   r`   r  r;  r   r<   r"  r=   lm_head	loss_typers  r   s     re   r`   z)T5Gemma2ForConditionalGeneration.__init__  sZ     "6*
 ..33%fnn&@&@$//R&r{   c                 &    || j                   _        y r   r  rQ  r  s     re   set_output_embeddingsz6T5Gemma2ForConditionalGeneration.set_output_embeddings  s     .r{   c                 .    | j                   j                  S r   r  r  s    re   get_output_embeddingsz6T5Gemma2ForConditionalGeneration.get_output_embeddings  s    ||$$$r{   c                 6    | j                   j                         S r   r;  r  r  s    re   r  z5T5Gemma2ForConditionalGeneration.get_input_embeddings      zz..00r{   c                 :    | j                   j                  |       y r   r;  r  rb   values     re   r  z5T5Gemma2ForConditionalGeneration.set_input_embeddings      

''.r{   c                 6    | j                   j                         S r   )r;  r  r  s    re   r  z,T5Gemma2ForConditionalGeneration.get_encoder      zz%%''r{   c                 6    | j                   j                         S r   )r;  r  r  s    re   r  z,T5Gemma2ForConditionalGeneration.get_decoder  r  r{   r  rc   r   c                 D     | j                         j                  |fi |S r   )r  r  )rb   r  rc   s      re   r  z3T5Gemma2ForConditionalGeneration.get_image_features  s%    
 5t!44\LVLLr{   c                 6    | j                         j                  S r   )r  r  r  s    re   r  z-T5Gemma2ForConditionalGeneration.vision_tower  s    !...r{   Nr2  rv  r  r  r  r  r  r   rw  r  labelsrG   r   logits_to_keepc                    |||| j                  |      } | j                  d|||||||||	|
|||d|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }| j                  j                  }|j                  3||j                  z  }t        j                  |      }||j                  z  }d}| | j                  ||| j                  fi |}t        |||j                  |j                   |j"                  |j$                  |j&                  |j(                  |j*                  	      S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)r2  r  rv  r  r  r  r  r  r   rw  r  rG   r   )	lossr  r   r  r  r  r  r   r  rZ   )ra  r;  r  r   rv   slicer  r   r   rP   r   tanhloss_functionr<   r   r   r  r  r  r  r   r  )rb   r2  r  rv  r  r  r  r  r  r   rw  r  r  rG   r   r  rc   r  r   slice_indicesr  r^  r  s                          re   r   z(T5Gemma2ForConditionalGeneration.forward  su   D "3";@U@] $ J J6 R.8djj /
%)%/#9!5++'"7)/
 /
" (998B>SV8W~ot4]kmA}a,?@A,,11=nDDDFZZ'FnDDDF%4%%ffdooPPD+;;"1"G"G.AA,==&5&O&O"1"G"G.AA

 
	
r{   generation_configmodel_kwargsgeneration_mode
batch_sizemax_cache_lengthc           	      P   t         |   |||||       |j                  du ry|j                  }|d}nd|j                  v }t	        j
                  | j                  j                  d            }|`|`	||d}	|j                  d      }
|
t        |
t              st        d      t        |
j                        d	kD  r|
j                  j                  d	      ryt!        |
j"                        }|t$        k(  r|d
   d	   j&                  d   |	d<    |di |	|
_        n=t        t)        di | j                  j                  d      |dt)                     |d<   t+        | d      r=| j,                  0t        | j,                  t              st        d      |d   | _        yyy)zMOverride cache preparation to support T5Gemma2-specific EncoderDecoder Cache.FN	offloadedTr  )r   
offloadingr   zaThe `past_key_values` in `model_kwargs` must be of type `EncoderDecoderCache` for T5Gemma2 model.r   r  rW   max_cache_len_cachezLThe internal cache must be of type `EncoderDecoderCache` for T5Gemma2 model.rZ   )r   _prepare_cache_for_generationrG   cache_implementationcopydeepcopyr   get_text_configrN   rO   r\   r   r	   r   lenr   r   r   r
   r   r   rS  r  )rb   r  r  r  r  r  r  offload_cachecross_attn_configcross_attn_cache_kwargsr   cross_attn_clsr   s               re   r   z>T5Gemma2ForConditionalGeneration._prepare_cache_for_generationX  s    	-	
 &&%/0EE'!M'+<+Q+QQM !MM$++*E*Ed*E*ST ,) ('#

 '**+<=&o/BC w 
 ?--.27Q7Q7U7UVW7X!/"G"GHN,;GHY;Z[\;];c;cde;f'84B4]E\4]O1 /B "&++"="=d"="K&3 /L*+ 4"t{{'>dkk+>? !opp&'89DK	 (?"r{   )NNNNNNNNNNNNNNr   )'rq   rr   rs   r  _tp_plan_pp_planr   r`   r  r  r  r  r  r  r    r   r   r  r   r   r   r   r  propertyr  r  r   r  r   r	   r_   rv   r   r   r   rz   r   r   r   r   s   @re   r  r    s   !#Q #$;<H"o%6
$CDH~ /%1/(( M!LLM4:;M4NM	+	+M  M
 / /  .215370459:>8<266:26:>*.!%26-.'O
 ##d*O
 ''$.	O

 ))D0O
 &&-O
 !++d2O
 !& 0 04 7O
 $..5O
 )4/O
 -t3O
 ((4/O
  %0047O
    4'!O
" $;#O
$ ((4/%O
& ell*'O
( +,)O
* 
u  	!O	3+O
  O
bI:+I: I: (	I:
 I: I: 
I: I:r{   r  c                       e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  de	j                  dz  dee   defd              Z xZS )!T5Gemma2ForSequenceClassificationr   c                 "   t         |   |       |j                  | _        |j                  j                  | _        t        |      | _        t        |dd      }t        | j                  | j                  |      | _	        | j                          y Nr   g?r   r`   
num_labelsr   r=   r  r;  r   r$  scorers  rb   r   classifier_dropoutr   s      re   r`   z*T5Gemma2ForSequenceClassification.__init__  sp      ++!>>55"6*
$V-FL/0@0@$//Sef
r{   c                 6    | j                   j                         S r   r  r  s    re   r  z6T5Gemma2ForSequenceClassification.get_input_embeddings  r  r{   c                 :    | j                   j                  |       y r   r  r  s     re   r  z6T5Gemma2ForSequenceClassification.set_input_embeddings  r  r{   Nr2  r  rv  r  r  r  r  r  rw  r  r  rc   r   c                 v   |	|
#t        d| j                  j                   d      |t        d      || j	                  |      } | j
                  |f||||||||	|
dd
|}|j                  }|j                  }|j                  }| j                  |      }|j                  d   }|| j                  j                  k7  j                  |j                  t        j                         }t        j"                  |j                  d   |j                  t        j                   	      }||z  j%                  d      }t        j&                  ||j                  d   d
z
        }|t        j"                  ||j                        |f   }d}|| j)                  |||| j                        }t+        ||||      S )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for .You have to specify input_idsF
r  rv  r  r  r  r  r  rw  r  rG   r   r   )r   r6  rW   )maxr{  )r  r  pooled_logitsr   r  r  r   rB  )NotImplementedErrorr   rq   r   ra  r;  r  r  r  r  r   r   rH   r4  r   r   int32r  argmaxclampr  r   )rb   r2  r  rv  r  r  r  r  r  rw  r  r  rc   outputsr  r   rB  r  r  non_pad_masktoken_indiceslast_non_pad_tokenr  r  s                           re   r   z)T5Gemma2ForSequenceClassification.forward  s   4 $(=(I%J4>>KbKbJccde  <==$ $ J J9 U&0djj'
%)%/#9!5+'"7'
 '
 $5555//
-.__Q'
)T[[-E-EEII&--Y^YdYde%6%<%<R%@^c^i^ij+l:BB2F"[[);ARAXAXY[A\_`A`au||Jv}}MOaab%%VFR_hlhshs%tD' '!	
 	
r{   NNNNNNNNNNN)rq   rr   rs   r   r`   r  r  r    r   r   r  r   r  r   r   r   r   r   r   r   s   @re   r  r    s\   	~ 	1/  .215.204596:8<2626:>*.J
##d*J
 ''$.J
 t+	J

 &&-J
 !++d2J
 !&t 3J
 $..5J
 )4/J
 ((4/J
  %0047J
   4'J
 +,J
 
"J
  J
r{   r  c                       e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  de	j                  dz  dee   defd              Z xZS )T5Gemma2ForTokenClassificationr   c                 "   t         |   |       |j                  | _        |j                  j                  | _        t        |      | _        t        |dd      }t        | j                  | j                  |      | _	        | j                          y r  r  r  s      re   r`   z'T5Gemma2ForTokenClassification.__init__  sp      ++!>>55"6*
$V-FL/0@0@$//Sef
r{   c                 6    | j                   j                         S r   r  r  s    re   r  z3T5Gemma2ForTokenClassification.get_input_embeddings  r  r{   c                 :    | j                   j                  |       y r   r  r  s     re   r  z3T5Gemma2ForTokenClassification.set_input_embeddings  r  r{   Nr2  r  rv  r  r  r  r  r  rw  r  r  rc   r   c                    |	|
#t        d| j                  j                   d      |t        d      || j	                  |      } | j
                  |f||||||||	|
dd
|}|j                  }|j                  }|j                  }| j                  |      }d}|| j                  ||| j                        }t        ||||      S )r  Nr  r  r  Fr  r   )r!  r   rq   r   ra  r;  r  r  r  r  r  r   r   )rb   r2  r  rv  r  r  r  r  r  rw  r  r  rc   r%  r  r   rB  r  r  s                      re   r   z&T5Gemma2ForTokenClassification.forward  s   4 $(=(I%J4>>KbKbJccde  <==$ $ J J9 U&0djj'
%)%/#9!5+'"7'
 '
 $5555//
-.%%ffdkkBD$'!	
 	
r{   r)  )rq   rr   rs   r   r`   r  r  r    r   r   r  r   r  r   r   r   r   r   r   r   s   @re   r+  r+    s\   
~ 
1/  .215.204596:8<2626:>*.@
##d*@
 ''$.@
 t+	@

 &&-@
 !++d2@
 !&t 3@
 $..5@
 )4/@
 ((4/@
  %0047@
   4'@
 +,@
 
@
  @
r{   r+  )
r   r:   r}   r   r  r  r  r:  r  r+  )T)ir  collections.abcr   typingr   r   r   torch.nnr    r   rL  cache_utilsr   r	   r
   configuration_utilsr   r   
generationr   r   r   masking_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r    r!   r"   utils.genericr#   r$   autor&   gemma3.configuration_gemma3r'   r(   gemma3.modeling_gemma3r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   siglipr4   t5gemma.modeling_t5gemmar5   r6   r7   r8   
get_loggerrq   r   r:   r}   r   r   r   r   r   r   r   rv   r  r  r  r"  r$  r&  r)  r:  rj  r  r  r  r  r  r+  __all__rZ   r{   re   <module>rF     sH    $     & I I J K K 6 B   G F &  @  H    (  
		H	%D,)+; D,NL D,,.> D,NvJ% vJr	m 		) 	\3 \O `Bo `BF  &	. 	.. .b	] 		!: 	!"; !
 &C  * L!3 L! L!^a
1 a
Hi
- i
XH
- H
V \
+ \
 \
~J:'> J:Z ^
(? ^
 ^
B U
%< U
 U
pr{   