
    i                     4   d dl mZ d dlmZmZmZ d dlZd dlmZ ddl	m
Z ddlmZmZ ddlmZmZ ddlmZmZmZ dd	lmZmZ dd
lmZmZmZ ddlmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z> ddl?m@Z@  e*j                  eB      ZC G d de.e      ZD G d de      ZE G d de=      ZF G d de:      ZG G d dej                        ZI G d d e2      ZJ G d! d"e5      ZK G d# d$e6      ZL G d% d&e0      ZM G d' d(e      ZNdZO G d) d*e4      ZPd+eQd,eeQeQeQeQgeRf   fd-ZS G d. d/e3      ZT G d0 d1e1      ZU G d2 d3ej                        ZW	 	 	 	 dHd4ed5ej                  d6ej                  dz  d7ej                  d8edz  d9ej                  dz  d:ej                  dz  d;ej                  dz  d<eRd=eRdz  d,eZfd>Z[ G d? d@e<      Z\ G dA dBe;      Z] G dC dDeP      Z^ G dE dFeeP      Z_g dGZ`y)I    )Callable)AnyLiteralOptionalN   )initialization)CacheDynamicCache)PreTrainedConfiglayer_type_validation)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPooling SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSRopeParametersdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)maybe_autocast   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)PaliGemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPasttoken_type_ids_mask_function)SiglipVisionConfigc            4       r   e Zd ZdZdZdddZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#dedz  dedz  d	edz  d
edz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  de	dz  dedz  dedz  dedz  de	dz  dedz  dedz  dedz  de
e   dz  dedz  dedz  deed   ef   dz  de	dz  d e	dz  f2d!Zd$d"Zy)%Gemma3TextConfigay  
    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma3Text-7B.
    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma3TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In Gemma3Text, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_parameters (`dict`, *optional*):
            Dictionary mapping attention patterns (`"full_attention"`, `"sliding_attention"`) to `RopeParameters`.
            Each value should be a dictionary containing `rope_type` and optional scaling parameters.
        use_bidirectional_attention (`bool`, *optional*, defaults to `False`):
            If True, the model will attend to all text tokens instead of using a causal mask. This does not change
            behavior for vision tokens.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    ```python
    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
    >>> configuration = Gemma3TextConfig()
    >>> # Initializing a model from the gemma3_text-7b style configuration
    >>> model = Gemma3TextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    gemma3_textg    .Ag     @)globallocalN
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dimhidden_activationmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_ideos_token_idbos_token_idattention_biasattention_dropoutquery_pre_attn_scalarsliding_windowlayer_typesfinal_logit_softcappingattn_logit_softcappingrope_parametersfull_attentionsliding_attentionuse_bidirectional_attentiontie_word_embeddingsc                    || _         || _        || _        || _        || _        |	| _        || _        || _        || _        || _	        || _
        || _        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        |r| j&                  dz  dz   | _        |j1                  dd      | _        | j,                  Et5        | j                        D cg c]!  }t7        |dz   | j2                  z        rdnd# c}| _        t9        | j,                  | j                         || _        t=        j>                  di | y c c}w )Nr       sliding_window_pattern   rO   rN    ) rB   rD   rC   rQ   r6   r>   r7   r8   r9   r:   r<   r;   r?   r@   rA   rE   rF   r=   rG   rH   rJ   rK   rI   rP   get_sliding_window_patternrangeboolr   rL   r   __init__)selfr6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rP   rQ   kwargsis                               s/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/gemma3/modular_gemma3.pyr[   zGemma3TextConfig.__init__   sy   : )((#6 $'>$&!2!2#6  #6 !2(",!2!2%:",'>$&<#&+F(&#'#6#6!#;q"@D (.zz2JA'N$# t556  (,QUd6R6R,R'S#Yii D 	d..0F0FG.!!+F+ s   <&E'c                    |j                  dd       }ddiddid}| j                  | j                  n|| _        || j                  d   j                  |       | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d| j
                  d                | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d	| j
                  d
                | j                          | j                  |       |S )Nrope_scaling	rope_typedefault)rO   rN   rN   
rope_thetar4   rO   rope_local_base_freqr5   )ignore_keys)poprL   updaterW   
setdefaultdefault_thetastandardize_rope_paramsvalidate_rope)r\   ignore_keys_at_rope_validationr]   ra   default_rope_paramss        r_   convert_rope_params_to_dictz,Gemma3TextConfig.convert_rope_params_to_dict   s[   zz.$7
 #.y!9*I6
 8<7K7K7Wt33]p#  !1299,G ##$45=6A95MD  !12-.99&**\43E3Eh3OP	
 ##$78@9Di8PD  !4501<<&**%;T=O=OPW=XY	

 	$$&'EF    )i@  i 	  i $              gelu_pytorch_tanhi   {Gz?ư>Tr   rS   r    F        rt   i   NNNNFTN)__name__
__module____qualname____doc__
model_typerj   intstrfloatrZ   listdictr   r   r[   ro   rV   rp   r_   r2   r2   =   s   Pd J*X>M ")"&(,(**+*+"(;.5*.#'!%#$#$#$&+*-,/%)(,04/3gk38+/5D,$JD, 4ZD, :	D,
 :D, !4ZD, !4ZD, *D, :D, "%tD, !4<D, DjD, $;D, DjD, DjD,  Dj!D," t#D,$ !4<%D,&  #Tz'D,( d
)D,* #Y%+D,, "'-D,. !&/D,0 g&KLn\]`dd1D,2 &*D[3D,4 "D[5D,Lrp   r2   c                        e Zd ZdZdZddddZeedZ	 	 	 	 	 	 	 	 dd	ee	e
ef   z  dz  d
ee	e
ef   z  dz  dedz  dedz  dedz  dedz  dedz  dedz  f fdZ xZS )Gemma3Configa	  
    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the PaliGemma-2B.

    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
            The config object of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom vision config or dict.
        mm_tokens_per_image (`int`, *optional*, defaults to 256):
            The number of tokens per image embedding.
        boi_token_index (`int`, *optional*, defaults to 255999):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 256000):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 262144):
            The image token index to encode the image prompt.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(vision_config, text_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3TextConfig(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemma3image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configvision_configNr   r   mm_tokens_per_imager?   rQ   c	                    | t               }t        j                  d       nt        |t              rt        di |}t        |t              rt        di |}n!|t               }t        j                  d       || _        || _        || _        || _	        || _
        || _        || _        || _        t        
| <  di |	 y )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config.rV   )r2   loggerinfo
isinstancer   r0   r   r   r   r   r   r   r?   rQ   superr[   )r\   r   r   r   r   r   r   r?   rQ   r]   	__class__s             r_   r[   zGemma3Config.__init__5  s     *,KKKZ[T**9[9KmT*.??M".0MKK`a&*#6 ..!2!2#6 "6"rp   )NNrt   i i  i   rv   T)rz   r{   r|   r}   r~   attribute_mapr2   r0   sub_configsr   r   r   r   r   rZ   r[   __classcell__r   s   @r_   r   r      s    /b J-))M (+K AEDH*-&-&-(/*.+/!#%S#X6=!# *DcN:TA!# !4Z	!#
 t!# t!# :!# !4<!# "D[!# !#rp   r   c                       e Zd Zy)Gemma3ModelOutputWithPastNrz   r{   r|   rV   rp   r_   r   r   Y      rp   r   c                       e Zd Zy)Gemma3CausalLMOutputWithPastNr   rV   rp   r_   r   r   ]  r   rp   r   c            	       Z     e Zd ZdZd	dedededef fdZdej                  f fdZ	 xZ
S )
Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scalec                     t         |   |||       || _        | j                  dt	        j
                  |      d       y )Nr   F
persistent)r   r[   scalar_embed_scaleregister_buffertorchtensor)r\   r   r   r   r   r   s        r_   r[   z&Gemma3TextScaledWordEmbedding.__init__f  s;    D"-]ELL,ERWXrp   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  S ry   )r   forwardr   toweightdtype)r\   r   r   s     r_   r   z%Gemma3TextScaledWordEmbedding.forwardk  s2    wy)D,<,<,?,?@Q@Q,RRRrp   )      ?)rz   r{   r|   r}   r   r   r[   r   Tensorr   r   r   s   @r_   r   r   a  sG    Ys Y3 YS Y_d Y
S S Srp   r   c                   $     e Zd Zdef fdZ xZS )	Gemma3MLPconfigc                 $    t         |   |       y ry   r   r[   r\   r   r   s     r_   r[   zGemma3MLP.__init__p  s     rp   )rz   r{   r|   r2   r[   r   r   s   @r_   r   r   o  s    !/ ! !rp   r   c                   *     e Zd Zddedef fdZ xZS )Gemma3RMSNormdimepsc                 (    t         |   ||       y )Nr   r   r   )r\   r   r   r   s      r_   r[   zGemma3RMSNorm.__init__u  s    Sc*rp   )rw   )rz   r{   r|   r   r   r[   r   r   s   @r_   r   r   t  s    +C +e + +rp   r   c                       e Zd ZddefdZe	 	 	 	 ddedz  ded   dedz  dedz  de	d	e
f   f
d
       Z ej                         edd              Zy)Gemma3RotaryEmbeddingNr   c                    t         j                  j                          |j                  | _        |j                  | _        || _        t        t        |j                              | _	        i | _
        | j                  D ]  }| j                  j                  |   }||d   | j                  |<   | j                  }| j                  |   dk7  rt        | j                  |      } || j                  ||      \  }}| j                  | d|d       | j                  | d|j                         d       t!        | | d|        y )	Nrb   rc   
layer_type	_inv_freqFr   _original_inv_freq_attention_scaling)nnModuler[   r>   max_seq_len_cachedoriginal_max_seq_lenr   r   setrI   rb   rL   compute_default_rope_parametersr   r   clonesetattr)r\   r   devicer   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalings           r_   r[   zGemma3RotaryEmbedding.__init__z  s<   
		"("@"@$*$B$B!F$6$6 78** 	UJ++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@fak4l1M1  J<y!9=UZ [  J</A!BMDWDWDYfk lDZL(:;=ST	Urp   r   ztorch.deviceseq_lenr   returnztorch.Tensorc                     | j                   |   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        rd   r<   Nr   r   r    r   r   r   )	rL   getattrr7   r:   r   arangeint64r   r   )r   r   r   r   baser   attention_factorinv_freqs           r_   r   z5Gemma3RotaryEmbedding.compute_default_rope_parameters  s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))rp   c                 N   t        | | d      }t        | | d      }|d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d	      5  |j                         |j                         z  j                  dd
      }	t        j                  |	|	fd      }
|
j                         |z  }|
j                         |z  }d d d        j	                  |j                        j	                  |j                        fS # 1 sw Y   AxY w)Nr   r   r   rS   mpscpuF)device_typeenabledr    r   r   )r   r   expandshaper   r   r   typer   r   	transposer   catcossinr   )r\   xposition_idsr   r   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   s                r_   r   zGemma3RotaryEmbedding.forward  sl    4J<y!9:#DZL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	0&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')//C'')//C		0 vvAGGv$cff177f&;;;	0 	0s   *A1FF$)NNNNNNry   )rz   r{   r|   r2   r[   staticmethodr   r   r   tupler   r   r   no_gradr   r   rV   rp   r_   r   r   y  s    U/ U. *.+/"!%	!* 4'!*(!* t!* $J	!*
 
~u$	%!* !*F U]]_<  <rp   r   c                       e Zd Zdedef fdZ	 	 	 	 ddej                  dej                  dej                  dz  dedz  d	ej                  dz  d
e
e   deej                  ej                  dz  eej                     dz  f   fdZ xZS )Gemma3Attentionr   	layer_idxc                 b   t         |   ||       | j                  dk(  r|j                  nd | _        | j                  dk(  | _        | j
                  j                   | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )NrO   r   )r   r[   r   rH   
is_slidingr   rP   	is_causalr   r<   r@   q_normk_normr\   r   r   r   s      r_   r[   zGemma3Attention.__init__  s    +7;J]7]f33cg//-@@![[DDD#V=P=PQ#V=P=PQrp   Nhidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionr]   r   c                 r   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }| j                  |	      }	| j                  |
      }
|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t               } || |	|
||f| j"                  r| j$                  nd| j&                  | j(                  d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )Nr   rS   r    )r   r   r  rx   )dropoutscalingrH   )r   r<   q_projviewr   k_projv_projr   r  r)   rh   r   r   get_interfacer   _attn_implementationr*   trainingrF   r
  rH   reshape
contiguouso_proj)r\   r  r  r  r  r  r]   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightss                     r_   r   zGemma3Attention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((rp   r   )rz   r{   r|   r2   r   r[   r   r   r	   
LongTensorr   r   r   r   r   r   s   @r_   r   r     s    R/ RC R -1.2(,26-)||-) #\\-) t+	-)
 -) ((4/-) +,-) 
u||U\\D0%2E2LL	M-)rp   r   c                   4    e Zd Zdedef fdZ	 	 	 	 	 ddej                  dej                  dej                  dz  dej                  dz  d	e	dz  d
ej                  dz  de
e   deej                  eej                  ej                  f   dz  f   fdZ xZS )Gemma3DecoderLayerr   r   c                    t         |           || _        |j                  | _        || _        |j
                  |   | _        t        ||      | _        t        |      | _
        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        y )N)r   r   r   )r   r[   r   r7   r   rI   attention_typer   	self_attnr   mlpr   r@   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr  s      r_   r[   zGemma3DecoderLayer.__init__  s    !--"$00;()LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'rp   Nr  r  r  r   r  r  r]   r   c           
         |}| j                  |      } | j                  d||||||d|\  }}	| j                  |      }||z   }|}| j                  |      }| j	                  |      }| j                  |      }||z   }|S )N)r  r  r  r   r  r  rV   )r&  r$  r'  r(  r%  r)  )
r\   r  r  r  r   r  r  r]   residual_s
             r_   r   zGemma3DecoderLayer.forward  s     !,,];)4>> 
' 3)%+)
 
q 55mD =0 66}E/77F =0rp   )NNNNN)rz   r{   r|   r2   r   r[   r   r   r  r	   r   r   r   FloatTensorr   r   r   s   @r_   r   r     s    c/ cC c  -1.204(,26 ||  #\\  t+	 
 &&-    ((4/  +,  
u  %(9(95;L;L(L"MPT"TT	U rp   r   c                   J    e Zd ZdZdZg dZ ej                         d        Zy)Gemma3PreTrainedModelmodel)imagetext)r   SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadc                    t        j                  | |       t        |t              r t	        j
                  |j                         y d|j                  j                  v r t	        j
                  |j                         y t        |t              r+t	        j                  |j                  |j                         y t        |t              r|j                  D ]  }|j                   }|j"                  |   dk7  rt$        |j"                  |      } ||j&                  |      \  }}t	        j(                  t+        || d      |       t	        j(                  t+        || d      |        y y )NRMSNormrc   r   r   r   )r   _init_weightsr   Gemma3MultiModalProjectorinitzeros_mm_input_projection_weightr   rz   r   r   	constant_r   r   r   rI   r   rb   r   r   copy_r   )r\   moduler   r   r   r,  s         r_   r8  z#Gemma3PreTrainedModel._init_weightsA  s    %%dF3f78KK99:&**333KK& =>NN6--v/H/HI 56$00 ^
%EE##J/9<#6v7G7G
7S#TL#/*#U q

76j\+CDmT

76j\9K+LM}]^ 7rp   N)	rz   r{   r|   base_model_prefixinput_modalities_no_split_modulesr   r   r8  rV   rp   r_   r/  r/  7  s4    ( U]]_^ ^rp   r/  rH   r   c           
      P     dt         dt         dt         dt         dt        f
 fd}|S )zA
    Enables a bidirectional mask within the sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c                 &    t        ||z
        k  S )zA token can attend to any other token if their absolute distance is within
        the (exclusive) sliding window size (distance < sliding_window).)abs)rD  rE  rF  rG  rH   s       r_   
inner_maskz1_bidirectional_window_overlay.<locals>.inner_maskZ  s     56>"^33rp   )r   rZ   )rH   rJ  s   ` r_   _bidirectional_window_overlayrK  U  s3    
4c 4S 4 4c 4d 4
 rp   c                       e Zd ZU eed<   dZdef fdZ	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  d	ej                  dz  d
edz  dej                  dz  dee   defdZ xZS )Gemma3TextModelr   r2  c                     t         |   |       t        |j                  |j                  | j
                  | j                  j                  dz        | _        y )N      ?)r   )r   r[   r   r6   r7   r   r   embed_tokensr   s     r_   r[   zGemma3TextModel.__init__f  sM      :v1143C3CQUQ\Q\QhQhjmQm
rp   Nr   r  r   r  inputs_embedsrA   r  r]   r   c           
         |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        |x}
t              sx| j                  |||||d}|j                         }| j                  j                  r(d |d<   t        | j                  j                        |d<   t!        di |t#        di |d	}
|}i }| j                  j$                  D ]  }| j'                  |||      ||<    | j(                  d | j                  j*                   D ]+  } ||f|
|j,                     ||j,                     |||d
|}- | j/                  |      }t1        ||      S )N:You must specify exactly one of input_ids or inputs_embeds)r   r   rS   r   r   input_embedsr  r  r  r   c                  L    t        j                  dt         j                        S )NTr   )r   r   rZ   )argss    r_   <lambda>z)Gemma3TextModel.forward.<locals>.<lambda>  s    TY^YcYc@d rp   or_mask_functionrM   )r  r  r   r  r  )last_hidden_stater  rV   )
ValueErrorrQ  r
   r   get_seq_lengthr   r   r   r   	unsqueezer   r   copyrP   rK  rH   r   r   rI   
rotary_emblayersr9   r#  normr   )r\   r   r  r   r  rR  rA   r  r]   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargsr  r  r   decoder_layers                    r_   r   zGemma3TextModel.forwardn  s    -t";<YZZ  --i8M0*$++>O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L ?-F ++ -"0"0#2 ,K #."2"2"4{{662d./:WX\XcXcXrXr:s#$67 #5"C{"C%F%]I\%]# & ++11 	gJ.2oom\[e.f
+	g "[[)H4;;+H+HI 		M)2=3O3OP$78T8T$U) /- M		 		-0&++
 	
rp   )NNNNNNN)rz   r{   r|   r2   __annotations__rA  r[   r   r  r   r	   r-  rZ   r   r   r   r   r   r   s   @r_   rM  rM  b  s     
/ 
 .2.204(,26!%26J
##d*J
 t+J
 &&-	J

 J
 ((4/J
 $;J
 ((4/J
 +,J
 
!J
rp   rM  c                   0     e Zd ZU eed<   def fdZ xZS )Gemma3ForCausalLMr   c                 D    t         |   |       t        |      | _        y ry   )r   r[   rM  r0  r   s     r_   r[   zGemma3ForCausalLM.__init__  s     $V,
rp   )rz   r{   r|   r2   ri  r[   r   r   s   @r_   rk  rk    s    -/ - -rp   rk  c                   D     e Zd Zdef fdZdej                  fdZ xZS )r9  r   c                    t         |           t        j                  t	        j
                  |j                  j                  |j                  j                              | _	        t        |j                  j                  |j                  j                        | _        t        |j                  j                  |j                  j                  z        | _        t        |j"                  dz        | _        | j                   | j$                  z  | _        t        j(                  | j&                  | j&                        | _        y )Nr"  rP  )kernel_sizestride)r   r[   r   	Parameterr   zerosr   r7   r   r<  r   layer_norm_epsmm_soft_emb_normr   
image_size
patch_sizepatches_per_imager   tokens_per_sidero  	AvgPool2davg_poolr   s     r_   r[   z"Gemma3MultiModalProjector.__init__  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[rp   vision_outputsc                    |j                   \  }}}|j                  dd      }|j                  ||| j                  | j                        }|j	                         }| j                  |      }|j                  d      }|j                  dd      }| j                  |      }t        j                  || j                        }|j                  |      S )NrS   r    )r   r   r  rw  r  rz  flattenrt  r   matmulr<  type_as)	r\   r{  
batch_sizer,  r7   reshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            r_   r   z!Gemma3MultiModalProjector.forward  s    %3%9%9"
A{"0":":1a"@"9"A"AT%;%;T=S=S#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EtGfGf#g '//??rp   )	rz   r{   r|   r   r[   r   r   r   r   r   s   @r_   r9  r9    s#    \| \ @ell @rp   r9  r   rW  r  r  r  r   token_type_idspixel_valuesis_trainingis_first_iterationc
                    |r|t        d      | j                         |||||d}|	|	n|du xs |j                   xs |du}	||	r|dk(  j                  |j                        }t
        j                  j                  |dd      ddddf   }|| z  }t        j                  |j                         d	      dz
  }t        j                  ||d      }t        |j                  |j                        |      |d
<   t        di |S )a  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Gemma3 uses a bidirectional mask for images.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz;`token_type_ids` is required as a model input when trainingrV  rS   )rS   r   r   )valuer   r   r[  rV   )r]  get_text_configis_initializedr   r   r   
functionalpadr   cumsumr   wherer/   r   )r   rW  r  r  r  r   r  r  r  r  r]   rf  is_imageis_previous_imagenew_image_startimage_group_idss                   r_   create_causal_mask_mappingr    s4   & ~-VWW ((*$((*$K ) 	%g_-K-K)Kg|cgOg 
 !&8 #a'++N,A,ABMM--ha-HCRCP"&7%77,,':':'<!DqH++hD*Fn334o+
&' %3{33rp   c                       e Zd ZdZdef fdZe ed      dej                  de
e   deez  fd	              Zee	 	 	 	 	 	 	 	 	 	 ddej                  d
z  dej                  d
z  dej                   d
z  dej                  d
z  ded
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  ded
z  de
e   deez  fd              Z xZS )Gemma3ModelFr   c                 (    t         |   |       | `y ry   )r   r[   text_config_dtyper   s     r_   r[   zGemma3Model.__init__#  s     "rp   zOProjects the last hidden state from the vision model into language model space.)custom_intror  r]   r   c                 t     | j                   d|dd|}|j                  }| j                  |      |_        |S )NT)r  return_dictrV   )vision_towerr\  multi_modal_projectorpooler_output)r\   r  r]   r{  r\  s        r_   get_image_featureszGemma3Model.get_image_features'  sH    
 +**aRVaZ`a*<<'+'A'ABS'T$rp   Nr   r  r   r  r  r  rR  labelsrA   	lm_kwargsc                    |d u |d uz  rt        d      |R| j                  j                  | j                  k\  r/|| j                  j                  k(  }|j	                         }d||<   n|}| | j                         |      }|F||j                         nd}t        j                  |||j                  d   z   |j                        }|i| j                  |d      j                  }|j                  |j                  |j                        }| j                  |||      }|j!                  ||      }t#        |x}t$              s(t'        | j                  |||||||| j(                  	      } | j*                  d|||||
d|d	|}t-        |j.                  |j0                  |j2                  |j4                  |
      S d 
      S )NrT  r   rS   rU  T)r  )rR  image_features)r  )r  r   r  rR  rA   r  r  )r\  r  r  
attentionsimage_hidden_statesrV   )r]  r   r   r6   r   get_input_embeddingsr^  r   r   r   r   r  r  r   r   get_placeholder_maskmasked_scatterr   r   r  r  language_modelr   r\  r  r  r  )r\   r   r  r  r   r  r  r  rR  r  rA   r  special_image_maskllm_input_idsrd  r  re  outputss                     r_   r   zGemma3Model.forward2  s     -t";<YZZ  T[[%?%?4??%R!*dkk.H.H!H%OO-M01M,-%M 7D557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 #!44\t4TbbN+..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M ?-F"< MM
# &$%% 	
.%+')	
 	
 )%77#33!//))2>2J
 	

 QU
 	
rp   )
NNNNNNNNNN)rz   r{   r|   accepts_loss_kwargsr   r[   r   r   r   r-  r   r   r   r   r  r  r   r	   rZ   r   r   r   r   s   @r_   r  r    s   #| # !rs!--9?@R9S	+	+ t   .215.204(,262626*.!%J
##d*J
 ''$.J
 t+	J

 &&-J
 J
 ((4/J
 ((4/J
 ((4/J
   4'J
 $;J
 ./J
 
*	*J
  J
rp   r  c                       e Zd ZdZee	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  de
dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dedz  deej                  z  dee   deez  fd              Z	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Gemma3ForConditionalGenerationFNr   r  r  r   r  r  r  rR  r  rA   logits_to_keepr  r   c                     | j                   d||||||||
|	|d
|}|d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }d}|	O|j                         }|dddddf   }|	dddf   }||dd|j                  d    df   j                  |j                        }||j                  |j                        dk7     j                         }||j                  |j                        dk7     j                         }n |j                         }|j                         }t        j                         }|j                  d| j                  j                  j                        }|j                  d      j                  |j                        } |||      }t!        |||j"                  |j$                  |j&                  |j(                        S )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        )
r   r  r  r  r   r  rR  rA   r  r  r   N.r   rS   )losslogitsr  r  r  r  rV   )r0  r   r   slicelm_headr   r   r   r   r  r   CrossEntropyLossr  r   r   r6   r   r  r  r  r  )r\   r   r  r  r   r  r  r  rR  r  rA   r  r  r  r  slice_indicesr  r  shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                           r_   r   z&Gemma3ForConditionalGeneration.forward  s   | $** 
%))%+')
 
  
8B>SV8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5D+#33!//)) ' ; ;
 	
rp   c                 N    t        |   |f||||||	|
||d	|}|s|	s||d<   |S )N)	r  rR  r  r   r  rA   r  r  r  r  )r   prepare_inputs_for_generation)r\   r   r  rR  r  r   r  r  r  rA   r  r  r  r]   model_inputsr   s                  r_   r  z<Gemma3ForConditionalGeneration.prepare_inputs_for_generation  sX    " w<
+')%)))1
 
$ Y+7L(rp   )NNNNNNNNNNr   )NNNNNNNTNNF)rz   r{   r|   r  r   r   r   r  r-  r   r	   rZ   r   r   r   r   r   r   r  r   r   s   @r_   r  r    sx      .215.204(,262626*.!%-.l
##d*l
 ''$.l
 t+	l

 &&-l
 l
 ((4/l
 ((4/l
 ((4/l
   4'l
 $;l
 ell*l
 ./l
 
-	-l
  l
b  & &rp   r  c                   Z    e Zd ZddddZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 dd	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  de	j                  dz  dedz  dee   defd              Z xZS )Gemma3ForSequenceClassificationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projector)z^language_model.modelz^vision_towerz^multi_modal_projectorc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  j                  | j                  d      | _	        | j                          y )NF)bias)r   r[   
num_labelsr  r0  r   Linearr   r7   score	post_initr   s     r_   r[   z(Gemma3ForSequenceClassification.__init__&  sZ      ++ (
YYv11==tUZ[
 	rp   c                 6    | j                   j                         S ry   )r0  r  )r\   s    r_   r  z4Gemma3ForSequenceClassification.get_input_embeddings/  s    zz..00rp   c                 :    | j                   j                  |       y ry   )r0  set_input_embeddings)r\   r  s     r_   r  z4Gemma3ForSequenceClassification.set_input_embeddings2  s    

''.rp   Nr   r  r  r   r  rR  r  r  rA   r]   r   c
                     | j                   |f|||||||	d|
}|j                  }| j                  |      }||j                  d   }n|j                  d   }| j                  j
                  j                  |dk7  rt        d      | j                  j
                  j                  d}n||| j                  j
                  j                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                   j"                   d       |t        j                  ||j                  	      |f   }d}|| j%                  |||| j                  
      }t'        |||j(                  |j*                  |j,                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r  r  r   r  rR  r  rA   Nr   rS   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rU  )r  r  pooled_logitsr   )r  r  r  r  r  )r0  r\  r  r   r   r   rB   r]  r   r   r   int32r   argmaxr   warning_oncer   rz   loss_functionr   r  r  r  )r\   r   r  r  r   r  rR  r  r  rA   r]   transformer_outputsr  r  r  last_non_pad_tokennon_pad_masktoken_indicesr  r  s                       r_   r   z'Gemma3ForSequenceClassification.forward5  s   , )djj

)%%+')

 

 ,==M* "+J&,,Q/J;;""//7J!O\]];;""//7!#"%)@)@)M)MMQQRXR_R_afalalmL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD/ /??-;;*55
 	
rp   )	NNNNNNNNN)rz   r{   r|   _checkpoint_conversion_mappingr[   r  r  r   r   r   r  r-  r   r	   rZ   r   r   r   r   r   r   s   @r_   r  r    s.   !7-"?&"1/  .215.204(,2626*.!%C
##d*C
 ''$.C
 t+	C

 &&-C
 C
 ((4/C
 ((4/C
   4'C
 $;C
 +,C
 
*C
  C
rp   r  c                        e Zd ZU dZeed<   dZy)#Gemma3TextForSequenceClassificationz
    Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
    It uses the generic sequence classification implementation for efficiency and consistency.
    r   rN  N)rz   r{   r|   r}   r2   ri  rA  rV   rp   r_   r  r  }  s    
  rp   r  )	r   r2   r/  rM  rk  r  r  r  r  )NNFN)acollections.abcr   typingr   r   r   r   torch.nnr    r   r:  cache_utilsr	   r
   configuration_utilsr   r   masking_utilsr   r   r   modeling_layersr   r   modeling_outputsr   r   r   modeling_rope_utilsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   gemma2.configuration_gemma2r!   gemma2.modeling_gemma2r"   r#   r$   r%   r&   r'   r(   r)   r*   paligemma.modeling_paligemmar+   r,   r-   r.   r/   siglipr0   
get_loggerrz   r   r2   r   r   r   	Embeddingr   r   r   r   r   r   GEMMA3_START_DOCSTRINGr/  r   rZ   rK  rM  rk  r   r9  r   r-  r   r  r  r  r  r  __all__rV   rp   r_   <module>r     sx   % ) )   & . J m m [ u u 
 G & R R + 6
 
 
  ( 
		H	%x|%5 xv^## ^#B	 < 		#B 	SBLL S!	 !
+M +
L<1 L<`7)o 7)t.3 .b  ^1 ^<
# 
(CcSVCWY]C]:^ 
V
k V
r-) -!@		 !@V +/-1&*5454,,54 LL4'54 LL	54
 T\54 ,,%54 LL4'54 ##d*54 54 t54 
54p_
. _
D[%F [|[
&; [
|!*JLa !
rp   