
    im                    D   d dl Z d dlmZmZ d dlmZ d dlmZ d dlZd dl	m
Z
 d dlm
c mZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z: ee. G d de!                    Z;e e.d       G d de                     Z<e e.d       G d de,                    Z= G d  d!e
j|                        Z? G d" d#e
j|                        Z@ G d$ d%e
j|                        ZA G d& d'e
j|                        ZB G d( d)e
j|                        ZC G d* d+e
j|                        ZD G d, d-e
j|                        ZE G d. d/e
j|                        ZF G d0 d1e
j|                        ZG G d2 d3e
j|                        ZH G d4 d5e(      ZI G d6 d7e
j                        ZK G d8 d9e
j|                        ZL G d: d;e
j|                        ZM G d< d=e
j|                        ZNd> ZOd?ej                  d@eQdAej                  fdBZR	 	 	 dhdCe
j|                  dDej                  dEej                  dFej                  dGej                  dz  dHeSdIeSdz  dJeSdz  dAeTej                  ej                  f   fdKZUdidLej                  dMej                  dNej                  dOeQfdPZV eeV       G dQ dRe
j|                               ZW G dS dTe      ZXe. G dU dVe(             ZY G dW dXe
j|                        ZZ e.dY       G dZ d[eY             Z[ e.d\       G d] d^eYe             Z\ G d_ d`e
j|                        Z] e.da       G db dceY             Z^ e.dd       G de dfeYe             Z_g dgZ`y)j    N)CallableSequence)	dataclass)Optional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernelized_func)create_causal_mask!create_sliding_window_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)check_model_inputsmaybe_autocast   )	AutoModel   )Gemma3nAudioConfigGemma3nConfigGemma3nTextConfigGemma3nVisionConfigc                   :    e Zd ZU dZdZej                  dz  ed<   y)Gemma3nAudioEncoderModelOutputzz
    audio_mel_mask (`torch.FloatTensor`, *optional*):
        A torch.BoolTensor of shape `(batch_size, num_frames)`
    Naudio_mel_mask)__name__
__module____qualname____doc__r)   torch
BoolTensor__annotations__     v/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/gemma3n/modeling_gemma3n.pyr(   r(   5   s    
 /3NE$$t+2r2   r(   zL
    Base class for Gemma3n outputs, with hidden states and attentions.
    custom_introc                   b    e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   y)Gemma3nModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nimage_hidden_statesaudio_hidden_states)	r*   r+   r,   r-   r8   r.   FloatTensorr0   r9   r1   r2   r3   r7   r7   @   s5     59**T1848**T18r2   r7   zS
    Base class for Gemma3n causal language model (or autoregressive) outputs.
    c                   "   e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   dZej                  dz  ed	<   y)
Gemma3nCausalLMOutputWithPastaF  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr8   r9   )r*   r+   r,   r-   r=   r.   r:   r0   r>   r?   r
   r@   tuplerA   r8   r9   r1   r2   r3   r<   r<   Z   s    $ &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T1848**T18r2   r<   c                   r     e Zd Zd
dededef fdZd Zdej                  dej                  fdZ
d	 Z xZS )Gemma3nRMSNormdimeps
with_scalec                     t         |           || _        || _        | j                  r.t	        j
                  t        j                  |            | _        y | j                  dt        j                  d      d       y )Nweight      ?F
persistent)super__init__rF   rG   nn	Parameterr.   onesrI   register_buffertensor)selfrE   rF   rG   	__class__s       r3   rN   zGemma3nRMSNorm.__init__~   sY    $??,,uzz#7DK  5<<+< Or2   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S )Nr    T)keepdim)r.   sqrtpowmeanrF   )rT   xs     r3   _normzGemma3nRMSNorm._norm   s4    5::aeeAhmmBm=HIIIr2   r\   returnc                     | j                  |j                               | j                  j                         z  }|j                  |      S N)r]   floatrI   type_as)rT   r\   outputs      r3   forwardzGemma3nRMSNorm.forward   s9     AGGI&):):)<<~~a  r2   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)rB   rI   shaperF   rT   s    r3   
extra_reprzGemma3nRMSNorm.extra_repr   s'    ))*+6$((<<r2   )gư>T)r*   r+   r,   intra   boolrN   r]   r.   Tensorrd   rh   __classcell__rU   s   @r3   rD   rD   }   sG    PC Pe P PJ! !%,, !=r2   rD   c                       e Zd Zdef fdZdej                  dej                  dej                  fdZdej                  de	d	e	d
e	de	de	de	dej                  fdZ
dej                  dej                  dej                  fdZ xZS )%Gemma3nAudioRelativePositionEmbeddingconfigc                 R   t         |           || _        | j                  j                  | _        | j                  j
                  | _        | j                  | j                  z  | _        t        d| j                  j                  dz
        | _
        | j                  j                  | _        t        j                  | j                  | j                  | j                  z  d      | _        d}d}| j                  dz  }t!        j"                  t%        |      t%        |      z        t        |dz
  d      z  }|t'        j(                  t'        j*                  |      | z        z  }| j-                  d|j%                         j/                  d      j/                  d      d	       y )
Nr   r"   FbiasrJ        @r    inv_timescalesrK   )rM   rN   rp   conf_num_attention_heads	num_headshidden_sizechannelshead_dimmaxconf_attention_context_leftmax_backwardconf_attention_context_rightmax_forwardrO   Linearpos_projmathlogra   r.   exparangerR   	unsqueeze)rT   rp   min_timescalemax_timescalenum_timescaleslog_timescale_incrementru   rU   s          r3   rN   z.Gemma3nAudioRelativePositionEmbedding.__init__   sL   ==//74;;#J#JQ#NO;;CC		$--$--1OV[\!+"&((5+?%BV+V"WZ]^lop^prsZt"t&5<<3OSjRj3j)kk  ",,Q/99!< 	 	
r2   positiondtyper^   c                 P   |j                         j                  d      }|| j                  j                  |j                  t
        j                        z  }t        j                  t        j                  |      t        j                  |      gd      }|j                  |      S )NrW   devicer   rE   )ra   r   ru   tor   r.   float32catsincostype)rT   r   r   scaled_timetiming_signals        r3   _get_timing_signal_1d_posz?Gemma3nAudioRelativePositionEmbedding._get_timing_signal_1d_pos   s}    >>#--b1!4!4!7!7xV[VcVc!7!dd		599[#9599[;Q"RXZ[!!%((r2   term_bd_before_shift
batch_sizerw   num_query_blocksquery_block_sizekey_context_sizemax_span_plus_1c                     |dz   |z
  }d|f}	t         j                  j                  ||	      }
|
j                  |||||dz   z  f      }|ddddddd||z  f   }|j                  |||||f      }|S )aZ  Performs the relative shift.

        Args:
          term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
            (B), num_heads (N), num_query_blocks (U), query_block_size (W),
            key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).

        Returns:
          Tensor of shape [B, N, U, W, C].
        r"   r   N)rO   
functionalpadreshape)rT   r   r   rw   r   r   r   r   pad_amount_last_dimpadding_tupleterm_bd_paddedterm_bd_reshapedterm_bd_slicedterm_bd_shifteds                 r3   _relative_shiftz5Gemma3nAudioRelativePositionEmbedding._relative_shift   s    4  0!3F /0**+?O
 *11  $4q$89	
 *!Q3X5EHX5X3X*XY )00   
 r2   querieskeysc           	      R   |j                   \  }}}}}|j                   \  }}}	}}t        j                  | j                  | j                   dz
  d|j
                        j                  d      }
|
j                   d   }| j                  |
|j                        }| j                  |      }|j                  d|| j                  | j                        j                  d      }|j                  ddddd      }|j                  ddddd      }t        j                  ||      }|j                  ddddd      }|j                  ddd      }|j                  ||||z  |      }t        j                  ||      }|j                  |||||      }| j!                  ||||||	|      }||z   S )	Nr"   rW   r   r   r   r   r       )rf   r.   r   r}   r   r   r   r   r   r   r   rw   rz   squeezepermutematmulr   )rT   r   r   r   r   r   rw   rz   _r   pos_indicesr   sin_emb_timing_signalprojected_sin_embsin_emb	queries_pkeys_p_tterm_ac
q_permuted
s_permuted
q_reshapedterm_bd_unshifed_matmulterm_bd_unshifedr   s                           r3   rd   z-Gemma3nAudioRelativePositionEmbedding.forward   s    OVmmK
$&6	8'+zz$11 ll4#4#4t7G7G6G!6KRX_XfXfgqq
 &++A. $ > >w}} !? !

 !MM*?@#++APTP]P]^ff
 OOAq!Q2	<<1aA.,,y(3 __Q1a3
 __Q1-
  ''
I?ORb?bdlm

 #(,,z:"F 3::
 ..
 ((r2   )r*   r+   r,   r#   rN   r.   rk   r   r   ri   r   rd   rl   rm   s   @r3   ro   ro      s    
1 
.)%,, )u{{ )W\WcWc );#ll; ; 	;
 ; ; ; ; 
;zL)u|| L)5<< L)ELL L)r2   ro   c                   *    e Zd Zdef fdZd Zdej                  dededej                  fdZ	d	ej                  dej                  fd
Z
d	ej                  dej                  fdZd	ej                  dej                  dej                  fdZ xZS )Gemma3nAudioAttentionrp   c                    t         |           || _        | j                  j                  | _        | j                  j
                  | _        | j
                  | j                  z  | _        | j                  j                  | _        | j                  j                  | _
        t        d| j                  j                  dz
        | _        | j                  j                  | _        | j                  | j                  z   | j                  z   | _        t#        |      | _        t'        j(                  t+        j,                  | j                  f            | _        t'        j0                  | j
                  | j                  | j                  z  d      | _        t'        j0                  | j
                  | j                  | j                  z  d      | _        t'        j0                  | j
                  | j                  | j                  z  d      | _        | j                  dz  }dt*        j&                  j8                  j;                  t+        j<                  d            z  }| j?                  d||z  jA                         jC                         d	       | jE                         }| j?                  d
|d	       | j?                  dt+        j<                  | j                        jG                         d	       y )Nr   r"   Frr         rJ           q_scalerK   local_causal_valid_masksoftcap)$rM   rN   rp   rv   rw   rx   rz   conf_attention_chunk_size
chunk_sizer~   max_future_horizonr{   r|   max_past_horizonconf_attention_logit_capattention_logits_soft_capcontext_sizero   relative_position_embeddingrO   rP   r.   zerosper_dim_scaler   q_projk_projv_projr   softplusrS   rR   clonedetachcreate_local_causal_valid_maskra   )rT   rp   r   r_softplus_0r   rU   s        r3   rN   zGemma3nAudioAttention.__init__C  s)   ==;;22((DNN:++??"&++"J"J #At{{'N'NQR'R S)-)M)M& OOd.C.CCdF]F]]+PQW+X(\\%++t}}6F*GHii 0 0$..4==2PW\]ii 0 0$..4==2PW\]ii 0 0$..4==2PW\]--%UXX0099%,,s:KLLY<)?(F(F(H(O(O(Q^cd"&"E"E"G68O\abLL778>>@ 	 	
r2   c                    t        j                  t        j                  | j                  | j                  ft         j
                        d      j                  }t        j                  t        j                  | j                  | j                  ft         j
                        | j                  | j                  z         }t        j                  | j                  | j                  ft         j
                        }||z  |z  }|S )Nr   r   )diagonal)	r.   trilrQ   r   r   rj   Tr   r   )rT   lower_causal_maskupper_causal_maskr   s       r3   r   z4Gemma3nAudioAttention.create_local_causal_valid_maske  s    !JJJJ))4??;5::N
 ! 	 "JJJJ):):;5::N**T-D-DD
 #(**doot?P?P-QY^YcYc"d"9<M"MPa"a&&r2   r\   pad_left	pad_rightr^   c                     |j                   ^}}}|j                  ||g|      }|j                  ||g|      }t        j                  |||gd      }|S )Nr"   r   )rf   	new_zerosr.   r   )	rT   r\   r   r   batchr   
tail_shapeleftrights	            r3   	_pad_dim1zGemma3nAudioAttention._pad_dim1r  s^     !q:{{E89j9:UI;
;<IItQ&A.r2   r@   c                 (   |j                   }|dd \  }}|| j                  z   dz
  | j                  z  }|| j                  z  |z
  x}dkD  r| j                  |d|      }||| j                  f|dd z   }|j                  |      j	                         }|S )aE  Turns a sequence to non overlapping blocks.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, block_size, ...], with necessary
            paddings,
            where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
        Nr    r"   r   )rf   r   r   r   
contiguous)rT   r@   rf   bt
num_blockspadding_lenpermute_dimss           r3   _convert_to_blockz'Gemma3nAudioAttention._convert_to_blocky  s     ##Ray1$//)A-$//A
%7!;;Kq@ NN=![IM:t7%)C%--l;FFHr2   c                 \   | j                   }| j                  | j                  z   dz
  }| j                  |||      }| j                  }| j                  }|j                  d||      }|j                  dkD  r'|j                  dkD  rt        j                  |dd      }|j                         S )a  Extracts temporal context for every block.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, context_size, ...], with necessary
            paddings,
            where context_size = block_size + left_context + right_context,
            and output[:, i, ...] are x[:, start-left_context:end+right_context,
            ...],
            start = i * block_size, end = (i + 1) * block_size.
        r"   )	dimensionsizestepr    r   rW   )sourcedestination)
r   r   r   r   r   unfoldndimr.   movedimr   )rT   r@   r   r   	frame_len
frame_step
x_unfoldeds          r3   _extract_block_contextz,Gemma3nAudioAttention._extract_block_context  s     (( ++doo=A	}h	J%%	__
 #))AIJ)W
 !joo&9 z"!LJ$$&&r2   maskc                 	   g |j                   d d | j                  | j                  }| j                  |      j	                  |      j                         }| j                  |      j	                  |      j                         }| j                  |      j	                  |      j                         }t        j                  j                  j                  | j                        }ddd| j                  f}|j                  |      }	|| j                  z  |	z  }|j                   d d \  }
}| j                  |      }| j!                  |      }| j!                  |      }|j                   d   }| }| j!                  |      }|j"                  dk(  rI|j                   d   |j                   d   z  | j$                  k(  r|j	                  |
|| j$                        }|j                   |
|| j$                  fk7  r,t'        d|j                    d|
 d| d| j$                   d		      |j)                  d      j)                  d
      }| j*                  j)                  d      j)                  d      j)                  d      }t        j,                  ||j/                  |j0                              }| j3                  ||      }| j4                  j/                  |j0                        }||z  }t        j6                  |      }||z  }t        j8                  ||t        j:                  |j<                        j>                        }t        j                  j                  jA                  |dt        jB                        j/                  |j<                        }|j                   \  }}}}}|j                   d   }|jE                  ddddd      j	                  d||      }|jE                  ddddd      j	                  d||      }t        jF                  ||      } | j	                  |||||      jE                  ddddd      }!|!j	                  |
|| jH                  z  | j                  | j                  f      }!|!d d d |f   }!|!S )NrW   r"   r    r   r   z%Shape of extracted_valid_mask_blocks z	 is not (z, z) after potential reshape.r   rE   r   r   )%rf   rw   rz   r   r   r   r   r   r.   rO   r   r   r   viewr   r   r   r   r   
ValueErrorr   r   logical_andr   r   r   r   tanhwherefinfor   minsoftmaxr   r   bmmr   )"rT   r@   r   	qkv_shapequery_states
key_statesvalue_statesper_dim_scale_spbroadcast_shapeper_dim_scale_sp_broadcastr   q_timequery_blocks
key_blocksvalue_blocksr   original_valid_maskextracted_valid_mask_blockscondition_from_input_validitycondition_from_causalityfinal_condition_for_wherer>   softcap_valprobabilitiesb_dimn_dimu_dimw_dimc_dimh_dimprob_bunv_bun
result_bmmcontext_vectorss"                                     r3   rd   zGemma3nAudioAttention.forward  sT   Nm))#2.NNN	{{=199)DOOQ[[/77	BMMO
{{=199)DOOQ 88..778J8JKaDMM2%5%:%:?%K"#dll25OO)//3
F--l;00<
22<@'--a0  $e '+&A&ABU&V# (,,1+11!47R7X7XYZ7[[_c_p_pp*E*M*M,d.?.?+' ',,1
 

 /556i
| L$%R(9(9'::TV  )D(M(Ma(P(Z(Z[](^% $(#?#?#I#I!#L#V#VWX#Y#c#cde#f 
 %*$5$5)$''(E(L(LM%
! 11,
K lloofmm4+%F#+% 6FLL@Y@]@]^++33F%--3X[[bnbtbt[u -:,?,?)ueUE""2& ((Aq!Q7??E5Q$$Q1a3;;BuMYYx/
$,,UE5%OWWXY[\^_abdef)11 4??2	
 *!WfW*5r2   )r*   r+   r,   r#   rN   r   r.   rk   ri   r   r   r   r/   rd   rl   rm   s   @r3   r   r   B  s     
1  
D'5<< 3 3 5<< u||  ,.'ELL .'U\\ .'`dU\\ d9I9I dell dr2   r   c                   r     e Zd ZdZ	 d	dedee   def fdZdej                  dej                  fdZ
 xZS )
Gemma3nAudioCumulativeGroupNorma  Applies Group Normalization cumulatively over the time dimension.

    This layer normalizes the input by calculating the mean and variance
    cumulatively over the time dimension (dim 1). The statistics are computed
    over all feature dimensions (specified by `feature_dims` and `num_channels`)
    for elements marked as valid by the optional `mask`.

    If a `mask` is provided (True for valid, False for invalid/padded),
    invalid time steps do not contribute to the statistics calculation, and
    their corresponding output values are zeroed out.

    Scale and bias, if enabled, are applied per-channel (last dimension).
    This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
    and `cumulative=True`.
    num_channelsfeature_dimsrF   c           	         t         |           || _        t        |      | _        || _        t        j                  t        j                  |            | _
        t        t        ddt        | j                        z   dz               | _        y )Nr    r"   )rM   rN   r*  rB   r+  rF   rO   rP   r.   rQ   rI   rangelenreduction_axes)rT   r*  r+  rF   rU   s       r3   rN   z(Gemma3nAudioCumulativeGroupNorm.__init__7  sr     	(!,/ ll5::l#;< $E!QT5F5F1G-G!-K$LMr2   r@   r^   c                    | j                   | j                  fz   }|j                  dd |k7  rt        d|j                  dd  d|       |j                  }t
        j                  }|j                  |      }t        j                  ||      }t        j                  || j                  d      }t        j                  |d	      }t        j                  || j                  d      }	t        j                  |	d	      }
t        j                  |
d
      }||z  }||z
  j                  d      }t        j                  || j                  d      }t        j                  |d	      }||z  }||z
  t        j                  || j                  z         z  }| j                   j                  |      }dg|j#                         dz
  z  | j                  gz   }||j%                  |      z  }||z  }|j                  |      S )zApplies cumulative group norm, optionally using a mask.

        Args:
          hidden_states: Input tensor, shape [B, T, *feature_dims, C].

        Returns:
          Normalized tensor with the same shape as x.
        r    NzInput tensor shape suffix z> does not match expected suffix (feature_dims + num_channels) r   TrE   rX   r"   r   rJ   )r	  )r+  r*  rf   r  r   r.   r   r   	ones_likesumr/  cumsumclamprZ   rsqrtrF   rI   rE   r  )rT   r@   expected_input_suffixinput_dtype
calc_dtypex_calc	mask_calcsum_values_at_tcum_sum_valueselements_in_group_at_tcum_count_elementssafe_cum_count_elementscum_meansquared_diff_from_meansum_sq_diff_at_tcum_sum_sq_diffcum_variancenormalized_xscalescale_view_shapefinal_outputs                        r3   rd   z'Gemma3nAudioCumulativeGroupNorm.forwardI  s    !% 1 1T5F5F4H Hqr"&;;,]-@-@-D,E F99N8OQ 
 $))]]
!!*- OOF*=	  ))F0C0CTRo1= "'9$:M:MW[!\"\\*@aH"'++.@c"J "$;;
 #)8"3!8!8!; 99%;ATAT^bc  ,,'7Q? ')@@ )U[[9P-QQ z*3-"3"3"5"9:d>O>O=PP#ejj1A&BB $i/{++r2   )gMbP?)r*   r+   r,   r-   ri   r   ra   rN   r.   rk   rd   rl   rm   s   @r3   r)  r)  &  sT    ( 	NN smN 	N$G,U\\ G,ell G,r2   r)  c                   ~     e Zd ZdZ	 d
dedededeeeeef   f fdZdej                  dej                  fd	Z
 xZS )Gemma3nAudioSSCPConvBlockzA single convolution block for the SubSampleConvProjection.

    This block consists of a 2D convolution, followed by CumulativeGroupNorm,
    and a ReLU activation. It handles manual padding for the convolution.
    rp   idxinput_freq_dimmanual_paddingc                 J   t         |           || _        || _        |dk(  rdn| j                  j                  |dz
     }| j                  j                  |   }| j                  j
                  |   \  }}| j                  j                  |   \  }	}
t        j                  ||||f|	|
fdd      | _	        || j                  d   z   | j                  d   z   }||z
  |
z  dz   }t        ||f| j                  j                        | _        t        j                         | _        y )Nr   r"   )r   r   F)in_channelsout_channelskernel_sizestridepaddingrs   )r*  r+  rF   )rM   rN   rp   rN  sscp_conv_channel_sizesscp_conv_kernel_sizesscp_conv_stride_sizerO   Conv2dconvr)  sscp_conv_group_norm_epsnormReLU
activation)rT   rp   rL  rM  rN  rP  rQ  kernel_hkernel_wstride_hstride_wf_in_padded
f_out_convrU   s                r3   rN   z"Gemma3nAudioSSCPConvBlock.__init__  s%    	, !8a)K)KCRSG)T{{99#>![[>>sC(![[>>sC(II#% h'

	 %t':':1'==@S@STU@VV!H,9A=
3%$44
	 '')r2   audio_encodingsr^   c                    t        j                  || j                  dd      j                  | j                  j
                  j                        }| j	                  |      }|j                  dddd      j                         }| j                  |      }|j                  dddd      j                         }| j                  |      S )Nconstantr   )modevaluer   r    r   r"   )Fr   rN  r   rY  rI   r   r   r   r[  r]  )rT   rd  audio_encodings_paddedaudio_encodings_conv
x_for_normx_normedaudio_encodings_normeds          r3   rd   z!Gemma3nAudioSSCPConvBlock.forward  s     "#8K8KR\dg!h!k!kII"""

  $yy)?@ *11!Q1=HHJ
99Z(!)!1!1!Q1!=!H!H!J566r2   ))r   r   r   r   )r*   r+   r,   r-   r#   ri   rB   rN   r.   rk   rd   rl   rm   s   @r3   rK  rK    sc     5A)$")$ )$ 	)$
 c3S01)$V7u|| 7 7r2   rK  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )#Gemma3nAudioSubSampleConvProjectionrp   c                 p   t         |           || _        |j                  }g }g }t	        d      D ]n  }|j
                  |   \  }}|j                  |   \  }}	d}
|dz
  }d}d}|||
|f}|j                  |       ||z   |z   }||z
  |	z  dz   }|j                  |       |}p t        d|j                  ||d         | _	        t        d|d   ||d         | _
        |j                  d   }|d   }||z  | _        t        j                  | j                  | j                  j                  d      | _        y )Nr    r   r"   )rL  rM  rp   rN  rW   Frr   )rM   rN   rp   input_feat_sizer-  rV  rW  appendrK  conv_0conv_1rU  input_proj_in_featuresrO   r   rx   input_proj_linear)rT   rp   current_f_for_block_inputcalculated_block_paddingcalculated_f_out_dimsir^  r_  r`  ra  	pad_t_toppad_t_bottom
pad_f_leftpad_f_rightmanual_padding_tuplerb  f_out_after_convfinal_c_outfinal_f_outrU   s                      r3   rN   z,Gemma3nAudioSubSampleConvProjection.__init__  s   $*$:$:!#%  "q 	9A!'!=!=a!@Hh!'!=!=a!@Hh I#a<L JK 	$  %++,@A 4j@;NK +h 68CaG!(()9:(8%=	9@ 0!113A6	
 0033A6	
 33B7+B/&1K&?#!#4+F+FH_H_fk!lr2   rd  r^   c                     |j                  d      }| j                  |      }| j                  |      }|j                  \  }}}}|j	                  dddd      j                         }|j                  ||||z        }	| j                  |	      }
|
S )Nr"   r   r    r   )r   rt  ru  rf   r   r   r  rw  )rT   rd  audio_encodings_reshapedr\   r   c_outt_outf_out
x_permutedoutput_flattenedrc   s              r3   rd   z+Gemma3nAudioSubSampleConvProjection.forward  s     $3#<#<Q#? KK01KKN!"5%YYq!Q*557
%??1eUU]C''(89r2   	r*   r+   r,   r#   rN   r.   rk   rd   rl   rm   s   @r3   rp  rp    s.    7m1 7mru||  r2   rp  c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZ	S )Gemma3nAudioConformerAttentionrp   c                    t         |           || _        | j                  j                  | _        | j                  dt        j                  | j                  j                        d       t        | j                  j                        | _
        t        |      | _        t        j                  | j                  | j                  j                  d      | _        t        | j                  j                        | _        y )Ngradient_clippingFrK   rr   )rM   rN   rp   rx   post_in_featuresrR   r.   rS   r  rD   pre_attn_normr   attnrO   r   post	post_normrT   rp   rU   s     r3   rN   z'Gemma3nAudioConformerAttention.__init__"  s     $ 7 70%,,t{{?\?\2]jop+DKK,C,CD)&1	IId33T[[5L5LSXY	'(?(?@r2   rd  r)   r^   c                    |}t        j                  || j                   | j                        }| j                  |      }| j	                  ||      }|j
                  \  }}}}	|j                  ||||	z        }
| j                  |
      }t        j                  || j                   | j                        }|| j                  |      z   S r`   )	r.   r5  r  r  r  rf   r   r  r  )rT   rd  r)   audio_encodings_input_to_attnaudio_encodings_normaudio_encodings_attn_outr   r   rw   rz   r  s              r3   rd   z&Gemma3nAudioConformerAttention.forward,  s    (7%++o8N8N7NPTPfPfg#11/B#'99-A>#R  %=$B$B!1i#;#C#CAq)V^J^#_ ))$<=++o8N8N7NPTPfPfg,t~~o/NNNr2   
r*   r+   r,   r#   rN   r.   rk   r/   rd   rl   rm   s   @r3   r  r  !  sA    A1 AOu|| OUEUEU OZ_ZfZf Or2   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS ) Gemma3nAudioConformerFeedForwardrp   c                    t         |           || _        | j                  dt	        j
                  | j                  j                        d       t        | j                  j                        | _	        t        j                  | j                  j                  | j                  j                  dz  d      | _        t        j                  | j                  j                  dz  | j                  j                  d      | _        t        | j                  j                        | _        | j                  j                  | _        y )Nr  FrK   r   rr   )rM   rN   rp   rR   r.   rS   r  rD   rx   pre_layer_normrO   r   ffw_layer_1ffw_layer_2post_layer_normconf_residual_weightpost_layer_scaler  s     r3   rN   z)Gemma3nAudioConformerFeedForward.__init__>  s    0%,,t{{?\?\2]jop,T[[-D-DE99T[[%<%<dkk>U>UXY>Y`ef99T[[%<%<q%@$++BYBY`ef-dkk.E.EF $ @ @r2   rd  r^   c                    |}t        j                  || j                   | j                        }| j                  |      }| j	                  |      }t
        j                  j                  |      }| j                  |      }t        j                  || j                   | j                        }| j                  |      }||| j                  z  z   S r`   )r.   r5  r  r  r  rO   r   silur  r  r  )rT   rd  residuals      r3   rd   z(Gemma3nAudioConformerFeedForward.forwardJ  s    "++o8N8N7NPTPfPfg--o>(,(8(8(I--,,_=(,(8(8(I++o8N8N7NPTPfPfg..??T-B-BBCCr2   r  rm   s   @r3   r  r  =  s0    
A1 
A	Du|| 	D 	Dr2   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS ) Gemma3nAudioConformerLightConv1drp   c           	         t         |           || _        t        | j                  j                  | j                  j
                        | _        t        j                  | j                  j                  | j                  j                  dz  d      | _	        t        j                  | j                  j                  | j                  j                  | j                  j                  dd| j                  j                  d      | _        | j                  dt        j                  | j                  j                         d	       t        | j                  j                  | j                  j
                        | _        t        j                  | j                  j                  | j                  j                  d      | _        | j                  j                  dz
  | _        y )
NrF   r    Frr   r"   r   )rP  rQ  rR  rS  rT  groupsrs   r  rK   )rM   rN   rp   rD   rx   rms_norm_epsr  rO   r   linear_startConv1dconf_conv_kernel_sizedepthwise_conv1drR   r.   rS   r  	conv_norm
linear_endcausal_paddingr  s     r3   rN   z)Gemma3nAudioConformerLightConv1d.__init__W  sD   ,T[[-D-D$++JbJbcIIdkk&=&=t{{?V?VYZ?Zafg "		//0099;;**!
 	0%,,t{{?\?\2]jop'(?(?T[[E]E]^))DKK$;$;T[[=T=T[`a"kk??!Cr2   rd  r^   c                 :   |}| j                  |      }| j                  |      }t        j                  j                  j                  |d      }|j                  ddd      }t        j                  || j                  df      }| j                  |      }|j                  ddd      }t        j                  || j                   | j                        }| j                  |      }t        j                  j                  |      }| j                  |      }||z   }|S )NrW   r   r   r    r"   )r  r  r.   rO   r   glur   ri  r   r  r  r5  r  r  r  r  )rT   rd  audio_encodings_residualaudio_encodings_permutedaudio_encodings_permuted_paddedrc   s         r3   rd   z(Gemma3nAudioConformerLightConv1d.forwardl  s   #2 --o>++O<((--11/r1J#2#:#:1a#C *+%%0H4K^K^`aJb*c'//0OP)11!Q:++o8N8N7NPTPfPfg..9--,,_=///: #;;r2   r  rm   s   @r3   r  r  V  s-    D1 D*u||  r2   r  c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZ	S )Gemma3nAudioConformerBlockrp   c                    t         |           || _        t        | j                        | _        t        | j                        | _        t        | j                        | _        t        | j                        | _	        | j                  dt        j                  | j                  j                        d       t        | j                  j                        | _        y )Nr  FrK   )rM   rN   rp   r  ffw_layer_startr  	attentionr  lconv1dffw_layer_endrR   r.   rS   r  rD   rx   r[  r  s     r3   rN   z#Gemma3nAudioConformerBlock.__init__  s    ?L7D7D=dkkJ0%,,t{{?\?\2]jop"4;;#:#:;	r2   rd  r)   r^   c                 j   | j                  |      }| j                  ||      }| }||j                  d      j                  |j                        z  }| j                  |      }| j                  |      }t        j                  || j                   | j                        }| j                  |      }|S )NrW   )r  r  r   r   r   r  r  r.   r5  r  r[  )rT   rd  r)   validity_mask_for_lconvaudio_encodings_for_lconv_inputrc   s         r3   rd   z"Gemma3nAudioConformerBlock.forward  s    ..?...I#1/*9<S<]<]^`<a<d<d!!=
 +
' ,,'FG,,_=++o8N8N7NPTPfPfg?+r2   r  rm   s   @r3   r  r    s;    	<1 	<u|| UEUEU Z_ZfZf r2   r  c            
            e Zd ZU dZeed<   dZdZdef fdZe	de
j                  de
j                  dee   deez  fd	       Z xZS )
Gemma3nAudioEncoderzx
    An audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.
    rp   	audio_melaudioc                    t         |   |       || _        t        |      | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _
        | j                          y c c}w r`   )rM   rN   rp   rp  subsample_conv_projectionrO   
ModuleListr-  conf_num_hidden_layersr  	conformer	post_init)rT   rp   r   rU   s      r3   rN   zGemma3nAudioEncoder.__init__  se     )LV)T&9>v?\?\9]^A'/^
 	 _s   A=r)   kwargsr^   c                 >   | j                  |      }|j                  d   }d}t        t        | j                  j
                              D ]!  }|| j                  j
                  |   d   z  }# t        j                  ||j                        |z  }t        j                  ||j                  d   dz
        }|j                  dkD  r>|j                  dk(  r/|j                  d      j                  |j                  d   d      }n`|j                  |j                  k(  rG|j                  d   dk(  r5|j                  d   dk7  r#||j                  d   k(  r|j                  d      }t        j                  |d|      }	| j                  D ]  }
 |
||	      } | j                  j                  dkD  r@|dddd| j                  j                  f   }|	dddd| j                  j                  f   }	|j!                  |	j                  d      d      }t#        ||	      S )	a  Encodes a batch of MELs.

        Args:
            audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
              mel_bins].

        Returns:
            audio_encodings: a torch.Tensor of shape
                `[batch_size, self.config.audio_soft_tokens_per_image,
                self.config.audio_config.hidden_size]`
            audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
        r"   r   r   )r{   rW   Nr   )last_hidden_stater)   )r  rf   r-  r.  rp   rW  r.   r   r   r5  r   r   expandgatherr  conf_reduction_factormasked_fillr(   )rT   r  r)   r  rd  t_subtime_stride_productstride_pair_idxindicescurrent_maskblocks              r3   rd   zGemma3nAudioEncoder.forward  s     88C  %%a($S)J)J%KL 	YO4;;#D#D_#UVW#XX	Y ,,u^-B-BCFYY++g>+?+?+BQ+FG "w||q'8''*11.2F2Fq2I2NG7<</$$Q'1,a A%q)) ''*G||NAw?^^ 	CE#O\BO	C ;;,,q0-a1UDKK4U4U1U.UVO'+Odkk.O.O+O(OPL)55l6L6LR6PRUV--'
 	
r2   )r*   r+   r,   r-   r#   r0   main_input_nameinput_modalitiesrN   r   r.   rk   r/   r   r   rB   r(   rd   rl   rm   s   @r3   r  r    sr     !O1  8
8
7<7G7G8
SYZlSm8
	/	/8
 8
r2   r  c            	       Z     e Zd ZdZd	dedededef fdZdej                  f fdZ	 xZ
S )
Gemma3nTextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scalec                     t         |   |||       || _        | j                  dt	        j
                  |      d       y )Nr  FrK   )rM   rN   scalar_embed_scalerR   r.   rS   )rT   r  r  r  r  rU   s        r3   rN   z'Gemma3nTextScaledWordEmbedding.__init__  s;    D"-]ELL,ERWXr2   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  S r`   )rM   rd   r  r   rI   r   )rT   r  rU   s     r3   rd   z&Gemma3nTextScaledWordEmbedding.forward  s2    wy)D,<,<,?,?@Q@Q,RRRr2   )rJ   )r*   r+   r,   r-   ri   ra   rN   r.   rk   rd   rl   rm   s   @r3   r  r    sG    Ys Y3 YS Y_d Y
S S Sr2   r  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )Gemma3nTextLaurelBlockz Learned Augmented Residual Layerrp   c                    t         |           || _        t        j                  | j                  j
                  | j                  j                  d      | _        t        j                  | j                  j                  | j                  j
                  d      | _        t        | j                  j
                  | j                  j                        | _        y )NFrr   r  )rM   rN   rp   rO   r   rx   laurel_ranklinear_leftlinear_rightrD   r  post_laurel_normr  s     r3   rN   zGemma3nTextLaurelBlock.__init__  s    99T[[%<%<dkk>U>U\abIIdkk&=&=t{{?V?V]bc .t{{/F/FDKKLdLd er2   r@   r^   c                 r    | j                  |      }| j                  |      }| j                  |      }||z   S r`   )r  r  r  )rT   r@   laurel_hidden_statesnormed_laurel_hidden_statess       r3   rd   zGemma3nTextLaurelBlock.forward  sC    -1-=-=m-L-1->->?S-T&*&;&;<P&Q#:::r2   )
r*   r+   r,   r-   r%   rN   r.   rk   rd   rl   rm   s   @r3   r  r    s0    *f0 f;U\\ ;ell ;r2   r  c                        e Zd Zd	dedef fdZdej                  dej                  fdZdej                  dej                  fdZ	 xZ
S )
Gemma3nTextMLPrp   	layer_idxc                    t         |           || _        |j                  | _        |j                  |   | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        |j                  |   | _        y NFrr   )rM   rN   rp   rx   intermediate_sizerO   r   	gate_projup_proj	down_projr	   hidden_activationact_fnactivation_sparsity_patternactivation_sparsityrT   rp   r  rU   s      r3   rN   zGemma3nTextMLP.__init__  s    !--!'!9!9)!D4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556#)#E#Ei#P r2   r@   r^   c                     | j                  |      }| j                  dkD  r| j                  |      }| j                  |      }| j	                  |      }| j                  ||z        }|S )Nr   )r  r  _gaussian_topkr  r  r  )rT   r@   r  activationsr  r  s         r3   rd   zGemma3nTextMLP.forward  sc    NN=1	##c)++I6Ikk),,,}-NN;#89	r2   inputsc                    t        j                  | j                  t         j                  |j                        }t         j
                  j                  j                  dd      }|j                  |      }|j                  |j                        }t        j                  |dd      }t        j                  |ddd      }|||z  z   }t        j                  j                  ||z
        S )	Nr   r   r   r"   rW   Tr1  F)rE   rX   unbiased)r.   rS   r  r   r   distributionsnormalNormalicdfr   r   r[   stdrO   r   relu)rT   r  target_sparsity_tensornormal_diststd_multiplierinputs_mean
inputs_stdcutoff_xs           r3   r  zGemma3nTextMLP._gaussian_topk!  s    !&d.F.Femmdjdqdq!r ))00771='2'7'78N'O',,V\\:jjR>YYv2teL
n!<<}}!!&8"344r2   )r   )r*   r+   r,   r%   ri   rN   r.   rk   rd   r  rl   rm   s   @r3   r  r    sP    	Q0 	QS 	QU\\ ell 5U\\ 5ell 5r2   r  c                   X    e Zd ZdZdef fdZdej                  dej                  fdZdej                  dej                  fdZ	d	ej                  d
ej                  dej                  fdZ
dej                  dej                  fdZdej                  dej                  fdZ xZS )Gemma3nTextAltUpa  Alternating Updates (AltUp)

    The AltUp module wraps transformer layers. The `predict` step modifies the
    input to the transformer layer, and the `correct` step propagates the output
    of the transformer layer to the sparsely updated dimensions.

    See more in the research paper:

    https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
    rp   c                 F   t         |           || _        t        j                  t        j                  | j                  j                              | _        t        j                  | j                  j                  | j                  j                  d      | _        t        j                  | j                  j                  | j                  j                  dz  d      | _        t        j                  | j                  j                  | j                  j                  d      | _        t        | j                  j                  | j                  j                        | _        | j#                  dt        j$                  | j                  j                  dz        d       y )NFrr   r    r  router_input_scale      rK   )rM   rN   rp   rO   rP   r.   r   rx   correct_output_scaler   altup_num_inputscorrection_coefsprediction_coefsmodality_routerrD   r  router_normrR   rS   r  s     r3   rN   zGemma3nTextAltUp.__init__>  s   $&LLT[[=T=T1U$V! "		$++*F*FHdHdkp q "		$++*F*FHdHdfgHgns t!yy)@)@$++B^B^ejk)$++*A*At{{G_G_`15<<@W@WY]@]3^kpqr2   r\   r^   c                     | j                  |      | j                  z  }| j                  |      }t        j                  |j                               j                  |      S r`   )r  r  r  r.   r  ra   rb   )rT   r\   router_inputsrouteds       r3   compute_router_modalitiesz*Gemma3nTextAltUp.compute_router_modalitiesH  sM    ((+d.E.EE%%m4zz&,,.)11!44r2   r@   c                    | j                  || j                  j                           }| j                  ro| j                  j                  Y| j
                  j                  j                  j                  | j                  j                   | j                  j                          | j                  |      j                  g |j                  dd | j                  j                  | j                  j                   j                  dddd      }t        j                  |j                  dddd      |      }|j                  dddd      }||z  }|j                         j!                  |      S )a  Predicts the output of a layer using a trainable map.

        Args:
            hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
        NrW   r   r"   r   r    )r  rp   altup_active_idxtrainingaltup_coef_clipr  rI   dataclamp_r   rf   r  r   r.   r   r   rb   )rT   r@   
modalities	all_coefspredictionss        r3   predictzGemma3nTextAltUp.predictM  s@    33M$++B^B^4_`
==T[[88D!!((--44dkk6Q6Q5QSWS^S^SnSnoD!!*-Wi &&s+i-1[[-I-IiKO;;KgKgiWQ1a  	 ll=#8#8Aq!#DiP!))!Q15}$%%'//>>r2   r(  	activatedc                    | j                  |      }||| j                  j                     z
  }|j                  | j                  j                  ddd      }| j
                  ro| j                  j                  Y| j                  j                  j                  j                  | j                  j                   | j                  j                         | j                  |      dz   }|j                  ddd      j                  d      }t        j                  ||      }||z  }|j                         j!                  |      S )a_  Corrects the predictions relative to the

        Args:
            predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
            activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
                predictions relative to the activated input embeddings.
        r"   rJ   r    r   rW   )r  rp   r!  repeatr  r"  r#  r  rI   r$  r%  r   r   r.   mulr   rb   )rT   r(  r*  r&  
innovationr'  	correcteds          r3   correctzGemma3nTextAltUp.correcti  s	    33I>
T[[-I-I!JJ
&&t{{'C'CQ1M
==T[[88D!!((--44dkk6Q6Q5QSWS^S^SnSno
 #'"7"7
"Cc"I	%%aA.88<	IIj)4	[ 	##%--i88r2   r/  c                 p    |j                  | j                        | j                  z  j                  |      S )a	  
        This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
        (which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
        `scale_corrected_output`
        )rb   r  rT   r/  s     r3   rd   zGemma3nTextAltUp.forward  s2     !!$";";<t?X?XXaabkllr2   c                 $    | j                  |      S )zMScales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size].)rd   r2  s     r3   scale_corrected_outputz'Gemma3nTextAltUp.scale_corrected_output  s    ||I&&r2   )r*   r+   r,   r-   r%   rN   r.   rk   r  r)  r0  rd   r4  rl   rm   s   @r3   r  r  2  s    	r0 r55<< 5ELL 5
?U\\ ?ell ?895<< 9ELL 9U\\ 9:m m%,, m' ' 'r2   r  c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrW   r    r   )rf   r.   r   )r\   x1x2s      r3   rotate_halfr8    sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r2   r@   n_repr^   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r"   N)rf   r  r   )r@   r9  r   num_key_value_headsslenrz   s         r3   	repeat_kvr=    so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr2   modulequerykeyrh  attention_maskdropoutscalingr   c                    || j                   dz  }t        || j                        }	t        || j                        }
t        j                  ||	j                  dd            |z  }|||z  }t        j                  |      }||z  }|#|d d d d d d d |	j                  d   f   }||z   }t        j                  j                  |dt        j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||
      }|j                  dd      j!                         }||fS )	Nr   r    r   r  rW   r  )pr"  r"   )rz   r=  num_key_value_groupsr.   r   	transposer  rf   rO   r   r
  r   r   r   rB  r"  r   )r>  r?  r@  rh  rA  rB  rC  r   r  r  r  attn_weightscausal_maskattn_outputs                 r3   eager_attention_forwardrK    sA    //4'3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL#g-zz,/#g-!$Q1.D
0@0@0D.D%DE#k1 ==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r2   r\   r   r   unsqueeze_dimc                 n    |j                  |      }|j                  |      }| |z  t        |       |z  z   S )a\  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        x (`torch.Tensor`): The tensor to embed.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r   r8  )r\   r   r   rL  s       r3   apply_rotary_pos_embrN    s8    " --
&C
--
&CGA,--r2   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  dej                  dej                  dz  d	e	dz  d
ej                  dz  dee   deej                  ej                  dz  eej                     dz  f   fdZ xZS )Gemma3nTextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrp   r  c                    t         |           t        |d      r|j                  |   nd | _        || _        || _        t        |d|j                  |j                  z        | _
        |j                  |j                  z  | _        d| _        | j
                  j                  | _        d| _        t!        j"                  |j                  |j                  | j                  z  |j$                        | _        t!        j"                  |j                  |j                  | j                  z  |j$                        | _        t!        j"                  |j                  |j                  | j                  z  |j$                        | _        t!        j"                  |j                  | j                  z  |j                  |j$                        | _        | j                  dk(  r|j.                  nd | _        | j                  dk(  | _        t3        |j                  |j4                        | _        t3        |j                  |j4                        | _        t3        |j                  |j4                  d	      | _        | j
                  j<                  | j
                  j>                  z
  }||cxk\  xr d
kD  nc | _         |j                  d | }| j@                  r@tC        |      dz
  |d d d   jE                  |j                  |         z
  | _#        d| _$        y d | _#        |tC        |      dz
  |d d d   jE                  |j                  |         z
  k(  | _$        y )Nlayer_typesrz   rJ   Trr   sliding_attention)rE   rF   F)rE   rF   rG   r   r"   rW   )%rM   rN   hasattrrR  
layer_typerp   r  getattrrx   num_attention_headsrz   r;  rF  rC  attention_dropout	is_causalrO   r   attention_biasr   r   r   o_projsliding_window
is_slidingrD   r  q_normk_normv_normnum_hidden_layersnum_kv_shared_layersis_kv_shared_layerr.  indexkv_shared_layer_indexstore_full_length_kv)rT   rp   r  first_kv_shared_layer_idxprev_layersrU   s        r3   rN   zGemma3nTextAttention.__init__  s   ;B6=;Y&,,Y7_c"
F4F4F&JdJd4de$*$>$>&B\B\$\!!%!>!>ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 8<J]7]f33cg//-@@$f>Q>QR$f>Q>QR$f>Q>Q^cd$(KK$A$ADKKDdDd$d!"+/H"L1"L(()C*CD""),[)9A)=DbD@Q@W@WX^XjXjktXu@v)vD&(-D%)-D&(1S5E5IKX\Z\X\L]LcLc""9-M 6 )D%r2   Nr@   position_embeddingsrA  r?   cache_positionr  r^   c                 2   |j                   d d }g |d| j                  j                  }|\  }	}
| j                  |      j	                  |      }| j                  |      }t        ||	|
d      }|j                  dd      }| j                  rU|S|j                  | j                     \  }}|j                  |j                        }|j                  |j                        }n| j                  |      j	                  |      }| j                  |      }t        ||	|
d      }|j                  dd      }| j                  |      j	                  |      }| j!                  |      }|j                  dd      }|x|
|	|| j"                  d}| j                  s!|j%                  ||| j&                  |      \  }}| j(                  r.t+        |d      si |_	        ||f|j                  | j&                  <   t-        j.                  | j                  j0                  t2              } || ||||f| j4                  r| j6                  nd| j8                  | j"                  d|\  }} |j:                  g |d j=                         }| j?                  |      }||fS )	NrW   r    )rL  r"   )r   r   rj  r\  shared_layersr   )rB  rC  r\  ) rf   rp   rz   r   r  r^  rN  rG  rc  rl  re  r   r   r   r_  r   r`  r\  updater  rf  rT  r   get_interface_attn_implementationrK  r"  rX  rC  r   r   r[  )rT   r@   ri  rA  r?   rj  r  input_shapehidden_shaper   r   r  r  r  cache_kwargsattention_interfacerJ  rH  s                     r3   rd   zGemma3nTextAttention.forward  s    $))#2.??b?$++*>*>?&S{{=166|D{{<0+L#sRST#--a3 ""'B'6'D'DTE_E_'`$J#|':':;J'??<+>+>?L]388FJZ0J-j#sRSTJ#--a3J;;}5::<HL;;|4L'11!Q7L& "0"&"5"5	L **+:+A+Adnnl,(
L ((@46O1@JL@X--dnn=(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((r2   NNNN)r*   r+   r,   r-   r%   ri   rN   r.   rk   r
   
LongTensorr   r   rB   rd   rl   rm   s   @r3   rP  rP    s    G*0 *S *^ -1.2(,26E)||E) #\\E) t+	E)
 E) ((4/E) +,E) 
u||U\\D0%2E2LL	ME)r2   rP  c                   N    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  dej                  dej                  dej                  dz  d	ej                  dz  d
e	dz  dej                  dz  de
e   deej                  eej                  ej                  f   dz  f   fdZ xZS )Gemma3nTextDecoderLayerrp   r  c                    t         |           || _        |j                  | _        || _        |j
                  |   | _        t        ||      | _        t        ||      | _
        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        |j"                  | _        t$        |j&                     | _        t+        |      | _        t/        |      | _        t3        j4                  | j                  | j"                  d      | _        t3        j4                  | j"                  | j                  d      | _        t        | j                  |j                        | _        y )N)r  r  Frr   )rM   rN   rp   rx   r  rR  attention_typerP  	self_attnr  mlprD   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormhidden_size_per_layer_inputr	   r  r  r  altupr  laurelrO   r   per_layer_input_gateper_layer_projectionpost_per_layer_input_normr  s      r3   rN   z Gemma3nTextDecoderLayer.__init__X  s]   !--"$00;-fi@!&I>-d.>.>FDWDWX(6t7G7GVM`M`(a%)78H8HfNaNa)b&*89I9IvObOb*c'+1+M+M(V556%f-
,V4$&IId.>.>@`@`gl$m!$&IId.N.NPTP`P`gl$m!)78H8HfNaNa)b&r2   Nr@   ri  per_layer_inputrA  position_idsr?   rj  r  r^   c           
      t   | j                   j                  |      }	|	| j                  j                     }
| j	                  |
      }| j                  |      } | j                  d||||||d|\  }}| j                  |      }|
|z   }||z   t        j                  d      z  }| j                  |      }| j                  |      }| j                  |      }||z   }| j                   j                  |	|      }|| j                  j                     j                         }| j                  j                  r| j                   j!                  |      }| j#                  |      }| j%                  |      }t'        j(                  ||      }| j+                  |      }| j-                  |      }|dd xxx |z  ccc |S )N)r@   rA  r  ri  r?   rj  r    r"   r1   )r  r)  rp   r!  r|  r  rz  r}  r   rY   r~  r{  r  r0  r   altup_correct_scaler4  r  r  r.   multiplyr  r  )rT   r@   ri  r  rA  r  r?   rj  r  r(  active_predictionactive_prediction_normedlaurel_outputr  r   
attn_gatedattn_laurel	attn_normattn_ffwattn_ffw_normattn_ffw_laurel_gatedcorrected_predictionsfirst_predictions                          r3   rd   zGemma3nTextDecoderLayer.forwardn  s    jj((7'(D(DE#'#7#78I#J $<= $.. 
2)% 3+)
 
a ,,T2&-
!M1TYYq\A22;?	88I&77A +m ; $

 2 2;@U V01M1MNTTV;;**#zz@@AQR  445EF;;'78 >>*:OL  445EF99:JKab!%55!$$r2   )NNNNNN)r*   r+   r,   r%   ri   rN   r.   rk   ru  r
   r   r   rB   r:   rd   rl   rm   s   @r3   rw  rw  W  s    c0 cS c2 -1(,.204(,263%||3% #\\3% 	3%
 t+3% &&-3% 3% ((4/3% +,3% 
u||U5#4#4e6G6G#GH4OO	P3%r2   rw  c                        e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZdZ ej&                          fd       Z xZS )	Gemma3nPreTrainedModelrp   modelTrw  r?   )r@   rA   )imagetextr  c                 F   t         |   |       t        |t              r!t	        j
                  |j                         nt        |t              rt	        j                  |j                         |j                  dz  }dt        j                  j                  j                  t        j                  d            z  }t	        j                   |j"                  ||z         t	        j$                  |j&                  |j(                         t	        j                   |j*                  |j-                                nt        |t.              r,t	        j$                  |j0                  |j2                         nit        |t4              rXt	        j                  |j6                         t	        j$                  |j8                  | j:                  j<                  dz         nt        |t>              rd\  }}|j@                  dz  }tC        jD                  tG        |      tG        |      z        tI        |dz
  d      z  }|t        jJ                  t        jL                  |      | z        z  }t	        j                   |jN                  |jG                         jQ                  d      jQ                  d             n&t        |tR              rdt	        j$                  |jT                  | j<                  dz         t	        j$                  |jV                  dtC        jX                  d	      z         nt        |tZ              r|j\                  D ]  }	|j^                  }
|j`                  |	   d
k7  rtb        |j`                  |	      }
 |
|j:                  |	      \  }}t	        j                   te        ||	 d      |       t	        j                   te        ||	 d      |        tg        |d      r5t	        j$                  |jh                  | j:                  jh                         y y )Nr   rJ   r   r  )rJ   rt   r    r"   r          @defaultrU  	_inv_freq_original_inv_freqr  )5rM   _init_weights
isinstancer)  initones_rI   r   zeros_r   rz   r.   rO   r   r   rS   copy_r   	constant_r   r   r   r   r  r  r  r  r  r  rp   rx   ro   ry   r   r   ra   r{   r   r   ru   r   Gemma3nTextModelper_layer_projection_scaleper_layer_input_scalerY   Gemma3nRotaryEmbeddingrR  compute_default_rope_parameters	rope_typer   rV  rT  r  )rT   r>  r   r   r   r   r   r   ru   rU  rope_init_fncurr_inv_freqr   rU   s                r3   r  z$Gemma3nPreTrainedModel._init_weights  s   f%f=>JJv}}% 56KK,,-oot+G!4!4!=!=ell3>O!PPLJJv~~w'=>NN6>>6+K+KLJJv55v7\7\7^_ >?NN6--v/H/HI 01KK334NN644dkk6M6Mt6ST EF+5(M=#__1N&*hhu]/CeMFZ/Z&[^a"A_ '# +UYYu||N7SWnVn7n-ooNJJv,,n.B.B.D.N.Nq.Q.[.[\].^_ 01NN6<<d>N>NPT>TUNN677TYYs^9KL 67$00 ^
%EE##J/9<#6v7G7G
7S#TL#/*#U q

76j\+CDmT

76j\9K+LM}]^ 6./NN633T[[5R5RS 0r2   )r*   r+   r,   r$   r0   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendrw  rP  _can_record_outputsr  r.   no_gradr  rl   rm   s   @r3   r  r    sv    &*#23#4"5N!"&0* 2U]]_%T %Tr2   r  c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 	 ddedz  de	d   de
dz  dedz  d	ed
ef   f
d       Z ej                         edd              Z xZS )r  inv_freqNrp   c                 v   t         |           |j                  | _        |j                  | _        || _        t        t        |j                              | _        i | _	        | j                  D ]  }| j
                  j                  |   }||d   | j                  |<   | j                  }| j                  |   dk7  rt        | j                  |      } || j
                  ||      \  }}| j                  | d|d       | j                  | d|j                         d       t        | | d|        y )	Nr  r  r  r  FrK   r  _attention_scaling)rM   rN   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrp   listsetrR  r  rope_parametersr  r   rR   r   setattr)	rT   rp   r   rU  rope_paramsr  r  curr_attention_scalingrU   s	           r3   rN   zGemma3nRotaryEmbedding.__init__  s8   "("@"@$*$B$B!F$6$6 78** 	UJ++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@fak4l1M1  J<y!9=UZ [  J</A!BMDWDWDYfk lDZL(:;=ST	Ur2   r   ztorch.deviceseq_lenrU  r^   ztorch.Tensorc                     | j                   |   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetarz   NrJ   r   r    r   r   )	r  rV  rx   rW  r.   r   int64r   ra   )rp   r   r  rU  baserE   attention_factorr  s           r3   r  z6Gemma3nRotaryEmbedding.compute_default_rope_parameters  s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r2   c                 N   t        | | d      }t        | | d      }|d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d	      5  |j                         |j                         z  j                  dd
      }	t        j                  |	|	fd      }
|
j                         |z  }|
j                         |z  }d d d        j	                  |j                        j	                  |j                        fS # 1 sw Y   AxY w)Nr  r  r   rW   r"   mpscpuF)device_typeenabledr    r   r   )rV  ra   r  rf   r   r   r  r   strr   rG  r.   r   r   r   r   )rT   r\   r  rU  r  attention_scalinginv_freq_expandedposition_ids_expandedr  freqsembr   r   s                r3   rd   zGemma3nRotaryEmbedding.forward  sl    4J<y!9:#DZL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	0&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')//C'')//C		0 vvAGGv$cff177f&;;;	0 	0s   *A1FF$NNrt  r`   )r*   r+   r,   r.   rk   r0   r%   rN   staticmethodr   ri   r  rB   ra   r  r  r   rd   rl   rm   s   @r3   r  r    s    llU0 U. +/+/"!%	!*!D(!*(!* t!* $J	!*
 
~u$	%!* !*F U]]_<  <r2   r  zBThe base Gemma 3n language model without a language modeling head.c                       e Zd ZU eed<   dZdef fdZ ed      e	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  dedz  de	j                  dz  dedz  de	j                  dz  dee   defd              Zde	j                  de	j                  fdZ	 dde	j                  de	j                  dz  de	j                  fdZ xZS )r  rp   )r  c           
      |   t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  | j                  | j                  j                  dz        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                         | _        t%        |      | _        d| _        |j                  | _        |j*                  | _        t        |j,                  |j                  |j*                  z  | j                  |j*                  dz        | _        t        j0                  | j                  |j                  |j*                  z  d      | _        t        |j*                  |j                         | _        t        j                  t        d| j                  j6                        D cg c].  }t        j0                  | j                  | j                  d      0 c}      | _        t        j                  t        d| j                  j6                        D cg c].  }t        j0                  | j                  | j                  d      0 c}      | _        | j=                  dt?        j@                  | j                  dz        d	       | j=                  d
t?        jB                  t?        j@                  d            d	       | jE                          y c c}w c c}w c c}w )N      ?)r  r  Frr   r"   r  r   rK   r  r  )#rM   rN   pad_token_idr  
vocab_sizer  rx   rp   embed_tokensrO   r  r-  ra  rw  layersrD   r  r[  r  
rotary_embgradient_checkpointingr  vocab_size_per_layer_inputembed_tokens_per_layerr   per_layer_model_projectionper_layer_projection_normr  altup_projectionsaltup_unembed_projectionsrR   r.   rS   r6  r  )rT   rp   r  r   rU   s       r3   rN   zGemma3nTextModel.__init__6  s    !.. ++ ;v1143C3CQUQ\Q\QhQhjmQm
 mmINvOgOgIhiI$VY7i
 #6#5#56;N;NO	08&+#!--+1+M+M(&D--$$v'I'II::C?	'
# +-))$$v'I'II+
' *88Z8Z`f`s`s)t&!#PUVWY]YdYdYuYuPvw1RYYt'')9)9Fw"
 *,PUVWY]YdYdYuYuPvw1RYYt'')9)9Fw*
& 	95<<HXHXZ^H^;_lqr4ekk%,,sBS6Tafg 	K j4 x xs   "L/3L413L9F)tie_last_hidden_statesNr  per_layer_inputsrA  r  r?   inputs_embeds	use_cacherj  r  r^   c	           	      $   |du |duz  rt        d      |"| j                  |      }| j                  |      }| j                  ||      }|r|t	        | j
                        }|E||j                         nd}
t        j                  |j                  d   |j                        |
z   }||j                  d      }t        |x}t              s*| j
                  |||||d}t        di |t        di |d}|}t        j                   |d	z  d
d      dz  }t        j"                  d      }|g}t%        d| j
                  j&                        D ]  } | j(                  |dz
     |      }|j+                  |j,                  |j                        }t        j                   |d	z  d
d      }t        j.                  t        j0                  ||j+                  |j                                    }||z  |z  }|j3                  |        t        j4                  |d      }i }| j
                  j6                  D ]  }| j9                  |||      ||<    | j:                  d| j
                  j<                   D ]G  }||j>                     }|dddd|j@                  ddf   } ||||j>                     |f||||d|	}I t        j                   |d   d	z  d
d      dz  }|d   g}t%        d| j
                  j&                        D ]  } | jB                  |dz
     ||         }|j+                  |j,                  |j                        }t        j                   |d	z  d
d      }t        j.                  t        j0                  ||j+                  |j                                    }||z  |z  }|j3                  |        t        j4                  |      }t        j                   |d      }| jE                  |      }tG        ||      S )z
        per_layer_inputs (torch.Tensor, *optional*, defaults to None):
            Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
        N:You must specify exactly one of input_ids or inputs_embedsrp   r   r"   r   )rp   input_embedsrA  rj  r?   r  )full_attentionrS  r    rW   Tr1  r  gh㈵>r  r   )rA  r  r?   rj  )r  r?   r1   )$r  r  get_per_layer_inputsproject_per_layer_inputsr   rp   get_seq_lengthr.   r   rf   r   r   r  dictr   r   r[   rS   r-  r  r  r   r   rY   maximumrs  stackrR  r  r  ra  ry  r  r  r[  r   )rT   r  r  rA  r  r?   r  r  rj  r  past_seen_tokenscausal_mask_mappingmask_kwargshidden_states_0target_magnitudeepsilon_tensortemp_hidden_statesr{  
altup_projcurrent_hidden_statenew_magnituder@   ri  rU  decoder_layerrI  r  altup_unemb_projs                               r3   rd   zGemma3nTextModel.forwardg  s3   $ -t";<YZZ  --i8M#88C88HXY0*$++>O!CRC^==?de"\\-*=*=a*@I]I]^aqqN)33A6L ?-F ++ -"0"0#2 ,K #5"C{"C%F%U%U# ( !::oq&8b$OSVVd+-.q$++667 	<A6//A6GJ#-==7L7LUeUlUl=#m !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:;	< $6A> ++11 	gJ.2oom\[e.f
+	g "[[)H4;;+H+HI 	M-m.J.JKK.q!]5L5La/OPO)#M$@$@A	  +) /-	 	M		  !::mA&6!&;TRVYY+A./q$++667 	<A-RT-K-KAPQE-RS`abSc-d#3#6#6_=R=R[k[r[r#6#s !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:;	< $67

=a8		-0&++
 	
r2   c                      | j                  |      j                  g |j                  | j                  j                  | j
                   S r`   )r  r   rf   rp   ra  r  )rT   r  s     r3   r  z%Gemma3nTextModel.get_per_layer_inputs  sP    =t**95== 
__
KK))
 ,,
 	
r2   c                    | j                  |      }|| j                  j                  |j                  |j                        z  } |j
                  g |j                  d d | j                  j                  | j                   }| j                  |      }||S |j                  |j                  k7  r |dd | j                  j                  d d f   }||z   | j                  j                  |j                  |j                        z  S )Nr  rW   .)r  r  r   r   r   r   rf   rp   ra  r  r  r  )rT   r  r  r  s       r3   r  z)Gemma3nTextModel.project_per_layer_inputs  s.   
 .2-L-L]-[ ? ? B B%%.B.I.I !C !
 	
  <3;;  
  "% 
KK)) 
 ,, 

  $==>RS#''%%)9)?)??/5Tt{{7T7T5TVW0WX$'774;U;U;X;X%%.B.I.I <Y <
 
 	
r2   )NNNNNNNNr`   )r*   r+   r,   r%   r0   r  rN   r   r   r.   ru  rk   r
   r:   rj   r   r   r   rd   r  r  rl   rm   s   @r3   r  r  1  sg    /0 /b u5 .204.204(,26!%26l
##d*l
  ,,-l
 t+	l

 &&-l
 l
 ((4/l
 $;l
 ((4/l
 +,l
 
!l
  6l
\
e.>.> 
5<< 
 15
||
  ,,-
 
	
r2   r  z?The base Gemma 3n language model with a language modeling head.c                   |    e Zd ZU ddiZddiZddgdgfiZeed<   dd	iZdef fd
Z	e
e	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                   dz  dej                  dz  dedz  dej                  dz  deej                  z  dee   defd              Z xZS )Gemma3nForCausalLMlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr@   r>   rp   zmodel.language_modelr  c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y r  )
rM   rN   r  r  r  rO   r   rx   r
  r  r  s     r3   rN   zGemma3nForCausalLM.__init__  sU     %f-
 ++yy!3!3V5F5FUS 	r2   Nr  rA  r  r?   r  labelsr  rj  logits_to_keepr  r^   c
                     | j                   d|||||||d|
}|j                  }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }| j                  j                  G|| j                  j                  z  }t        j                  |      }|| j                  j                  z  }d}| | j                  ||| j                  fi |
}t        |||j                  |j                  |j                        S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3nForCausalLM

        >>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```)r  rA  r  r?   r  r  rj  N)r=   r>   r?   r@   rA   r1   )r  r  r  ri   slicer
  rp   final_logit_softcappingr.   r  loss_functionr  r   r?   r@   rA   )rT   r  rA  r  r?   r  r  r  rj  r  r  outputsr@   slice_indicesr>   r=   s                   r3   rd   zGemma3nForCausalLM.forward  s   B ,64:: 	,
)%+')	,
 	,
  118B>SV8W~ot4]kmA}a,?@A;;..:dkkAAAFZZ'FdkkAAAF%4%%ffdooPPD%#33!//))
 	
r2   )	NNNNNNNNr   )r*   r+   r,   _tied_weights_keys_tp_plan_pp_planr%   r0   _checkpoint_conversion_mappingrN   r   r   r.   ru  rk   r
   r:   rj   ri   r   r   r   rd   rl   rm   s   @r3   r  r    sH   *,GH23H_-z:;H&<g%F"0   .2.204(,26*.!%26-.=
##d*=
 t+=
 &&-	=

 =
 ((4/=
   4'=
 $;=
 ((4/=
 ell*=
 +,=
 
 =
  =
r2   r  c                        e Zd ZdZdeez  def fdZ	 	 d
dej                  dz  dej                  dz  dej                  fd	Z xZS )Gemma3nMultimodalEmbedderzQEmbeds token ids or soft tokens for multimodal content into language model space.multimodal_configtext_configc                 r   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        j                  | j                  | j                        | _        t        | j                  | j
                        | _        t        | j                  | j
                        | _        t        j                  | j                  | j                  d      | _        t        | j                  | j
                  d      | _        y )Nr  Frr   )rF   rG   )rM   rN   rx   multimodal_hidden_sizer  rF   vocab_offsetr  text_hidden_sizerO   	Embedding	embeddingrD   hard_embedding_normsoft_embedding_normr   embedding_projectionembedding_post_projection_norm)rT   r  r  rU   s      r3   rN   z"Gemma3nMultimodalEmbedder.__init__P  s    
 	&7&C&C#$11-::+66 + 7 7doot7R7RS#1$2M2MSWS[S[#\ #1$2M2MSWS[S[#\ $&IId.I.I4K`K`gl$m!.<T=R=RX\X`X`mr.s+r2   Nr  r  r^   c                     |du |duz  rt        d      || j                  |      }n/| j                  || j                  z
        }| j	                  |      }| j                  |      }| j                  |      S )a  Embeds token ids or soft tokens for multimodal content into language model space.

        Args:
            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
                `[vocab_offset, vocab_offset + vocab_size)`.
            inputs_embeds: A torch.Tensor containing the soft tokens to embed.

        Returns:
            A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
        Nr  )r  r$  r"  r  r#  r%  r&  )rT   r  r  emb_normhard_embemb_norm_projs         r3   rd   z!Gemma3nMultimodalEmbedder.forwardc  s     -t";<YZZ$//>H~~i$2C2C&CDH//9H11(;22=AAr2   r  )r*   r+   r,   r-   r#   r&   r%   rN   r.   ru  rk   rd   rl   rm   s   @r3   r  r  M  sk    [t-0CCt 't* .2-1B##d*B ||d*B 
	Br2   r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c            "           e Zd Zi ZdZdef fdZd Zd Ze	 e
d      dej                  d	ee   d
eez  fd              Z	 	 	 	 d!dej$                  dz  dej                  dz  dej                  dz  dej                  dz  fdZe		 	 	 	 	 	 	 	 	 	 	 	 	 	 d"dej$                  dz  dej                  dz  dej                  dz  dej(                  dz  dej(                  dz  dej$                  dz  dedz  dej$                  dz  dej$                  dz  dej                  dz  dej$                  dz  dedz  dedz  dedz  dee   d
ef d       Ze	 e
d      dej(                  dej(                  d	ee   d
eez  fd               Z xZS )#Gemma3nModelFrp   c                 $   t         |   |       t        j                  |j                        | _        |j                  j                  | _        t        j                  |j                        }|| _        |j                  j                  | _	        t        j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        | j                          y )Nr  )rM   rN   r!   from_configvision_configvision_towerr  r  language_modelr  audio_configaudio_towerr  embed_visionembed_audior  )rT   rp   r1  rU   s      r3   rN   zGemma3nModel.__init__  s     %119M9MN ,,77"..f6H6HI,*0*<*<*W*W'$001D1DE5f6J6JFL^L^_4V5H5H&J\J\]r2   c                 6    | j                   j                         S r`   )r1  get_input_embeddingsrg   s    r3   r7  z!Gemma3nModel.get_input_embeddings  s    ""7799r2   c                 :    | j                   j                  |       y r`   )r1  set_input_embeddingsrT   rh  s     r3   r9  z!Gemma3nModel.set_input_embeddings  s    007r2   zOProjects the last hidden state from the vision model into language model space.r4   pixel_valuesr  r^   c                     | j                   d	|ddd|}|j                  }|j                  |j                  d   | j                  j
                  j                  | j                  j                        j                  ddd      }|| j                  j
                  j                  dz  z  }| j                  |      |_
        |S )
NFT)r;  
do_poolingreturn_dictr   r    r"   r  r  r1   )r0  r  r   rf   rp   r/  rx   vision_soft_tokens_per_imager   r4  pooler_output)rT   r;  r  vision_outputsr  s        r3   get_image_featureszGemma3nModel.get_image_features  s     +**sQVdhslrs*<< .55##A&KK%%11KK44
 '!Q
	 	 	T[[66BBCGG'+'8'8GX'8'Y$r2   Nr  r  image_featuresaudio_featuresc           	         || | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  j                  d      }n2|| j                  j                  k(  }|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }|Qt        ||   j                         |j                         k(  d| d|j                  d   |j                  d   z          |j                         }|j                  d      j                  |      j                  |j                        }|Qt        ||   j                         |j                         k(  d| d|j                  d   |j                  d   z          ||fS )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r  rW   z6Image features and image tokens do not match, tokens: z, features: r   r"   z6Audio features and audio tokens do not match, tokens: )r7  r.   rS   rp   image_token_idlongr   allaudio_token_idr3  r   	expand_asr   r   numelrf   )	rT   r  r  rD  rE  special_image_maskspecial_audio_maskn_image_tokensn_audio_tokenss	            r3   get_placeholder_maskz!Gemma3nModel.get_placeholder_mask  sb    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;.4,,.LL!;!;5::VcVjVjk c"g  "+dkk.H.H!H!*dkk.H.H!H+//1/99"=GGVYYZgZnZno%"01779^=Q=Q=SSHHXXdeseyeyz{e|  @N  @T  @T  UV  @W  fW  eX  Y
 ,//1/99"=GGVYYZgZnZno%"01779^=Q=Q=SSHHXXdeseyeyz{e|  @N  @T  @T  UV  @W  fW  eX  Y
 "#555r2   input_featuresrA  input_features_maskr  r?   token_type_idsrj  r  r  output_attentionsoutput_hidden_states	lm_kwargsc                 	   |du |
duz  rt        d      ||n| j                  j                  }||n| j                  j                  }|} | j	                         |      }
t        j                  |dk\  || j                  k        }t        j                  ||t        j                  |            }| j                  j                  |      }t        j                  || j                  j                  k\  || j                  j                  k        }| j                  j                  | j                  j                  z   dz
  }t        j                  |||      j!                  |
j"                        }| j                  |      }|j!                  |
j"                  |
j$                        }|j'                  d      j)                  |
      }t        j                  |||
      }
|| j                  j                  k\  }| j                  j                  | j                  j                  z   dz
  }t        j                  |||      j!                  |
j"                        }| j                  |      }|j!                  |
j"                  |
j$                        }|j'                  d      j)                  |
      }t        j                  |||
      }
nd}|l| j+                  |d      j,                  }|j!                  |
j"                  |
j$                        }| j/                  ||
|	      \  }}|
j1                  ||      }
|K|H| j3                  || d      } | j,                  } | j4                  }t        j6                  | j                  dz
  ggt
        j8                  | j"                  
      }!| j                  |!      }"t        j                  |j'                  d      |"|       } | j:                  \  }#}$}%| j                  j<                  |$z
  }&|"j?                  |#|&|%      }'t        j@                  | |'fd      } | j!                  |
j"                  |
j$                        } | j/                  ||
|       \  }}(|
j1                  |(|       }
 | j                  dd|||||
|||d|	d|})tC        |)jD                  |r|)jF                  nd|)jH                  |)jJ                  |nd|       S d      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration

        >>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```
        Nr  r   r"   )r  rW   T)r>  )r  rD  r  r   )r  rE  )r  r  rA  r  r?   r  r  rU  rV  r>  rj  )r  r?   r@   rA   r8   r9   r1   )&r  rp   rU  rV  r7  r.   r  r  r  
zeros_liker1  r  r4  r  r5  r  r   r   r   r   rK  rC  rA  rQ  masked_scatterget_audio_featuresr)   rS   rH  rf   audio_soft_tokens_per_imager  r   r7   r  r?   r@   rA   )*rT   r  r;  rR  rA  rS  r  r?   rT  rj  r  r  r  rU  rV  rW  per_layer_inputs_maskper_layer_inputs_tokensr  vision_maskdummy_vision_token_idvision_input_idsvision_embedsexpanded_vision_mask
audio_maskdummy_audio_token_idaudio_input_idsaudio_embedsexpanded_audio_maskrD  rM  r   rE  audio_padding_toksaudio_padding_embsaudio_batch_sizeaudio_seq_lenaudio_embed_dimextra_padding_tokensextra_padding_featuresrN  r  s*                                             r3   rd   zGemma3nModel.forward  s   b -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	  7D557	BM %*$5$5i1niRVRqRqFq$r!&+kk2GTYTdTdenTo&p##22GGH_`  ++T..;;;YIYIYIfIf=fK %)$5$5$B$BTEVEVEaEa$ade$e!${{;	CXY\\]j]q]qr --8H-IM),,]-A-A=CVCVWM#.#8#8#<#F#F}#U !KK(<m][M #d&6&6&C&CCJ#'#3#3#@#@4CSCSC^C^#^ab#b #kk*iAUVYYZgZnZnoO++o+FL'??=+?+?ATATUL","6"6r":"D"D]"S!KK(;\=YM# #!44\t4TbbN+..}/C/C]EXEXYN$($=$=~ %> %! *889K^\M %*=*I!44^FYEYgk4lN+99N'66J "'!0C/D.EUZZ`n`u`u!v!%!1!1<N!1!O"[[)=)=b)ACUWefN?M?S?S<m_#';;#J#J]#Z %7%>%>?OQegv%w""YY8N'OUVWN+..}/C/C]EXEXYN$($=$=~ %> %!A! *889K^\M%$%% 
-)%+'/!5)
 
 *%777@G33d!//))2>2JPT2@2L
 	
 SW
 	
r2   zPProjects the last hidden state from the audio encoder into language model space.c                 x     | j                   ||fddi|}| j                  |j                        }||_        |S )a0  
        input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
            The tensors corresponding to the input audio.
        input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
            The attention mask for the input audio.
        r>  Tr?  )r3  r5  r  rA  )rT   rR  rS  r  audio_outputsrg  s         r3   r[  zGemma3nModel.get_audio_featuresq  sV     9I8H8H/9
=A9
EK9
 ''m6U6U'V&2#r2   rt  )NNNNNNNNNNNNNN)r*   r+   r,   r  accepts_loss_kwargsr$   rN   r7  r9  r   r   r.   r:   r   r   rB   r   rC  ru  rQ  rk   r
   rj   r<   rd   r(   r[  rl   rm   s   @r3   r,  r,    s    &("} :8 !rs'' +, 
+	+	 t , .2263737*6##d**6 ((4/*6 ))D0	*6
 ))D0*6X  .21537.23704(,262626*.!%)-,0O
##d*O
 ''$.O
 ))D0	O

 t+O
 #\\D0O
 &&-O
 O
 ((4/O
 ((4/O
 ((4/O
   4'O
 $;O
  $;O
 #TkO
  ./!O
" 
'#O
 O
b !st #\\ +,	
 
/	/ u r2   r,  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c            %       R    e Zd Zi ZddiZdef fdZd Zd Ze	de
j                  dee   fd	       Zee		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde
j                   d
z  de
j                  d
z  de
j                  d
z  de
j"                  d
z  de
j"                  d
z  de
j                   d
z  ded
z  de
j                   d
z  de
j                   d
z  de
j                  d
z  de
j                   d
z  ded
z  ded
z  ded
z  dee
j"                  z  dee   def"d              Z	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Gemma3nForConditionalGenerationr	  z(model.language_model.embed_tokens.weightrp   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y r  )rM   rN   r,  r  rO   r   r  rx   r  r
  r  r  s     r3   rN   z(Gemma3nForConditionalGeneration.__init__  sS     !&)
yy!3!3!?!?ASASA^A^ejkr2   c                 6    | j                   j                         S r`   )r  r7  rg   s    r3   r7  z4Gemma3nForConditionalGeneration.get_input_embeddings  s    zz..00r2   c                 :    | j                   j                  |       y r`   )r  r9  r:  s     r3   r9  z4Gemma3nForConditionalGeneration.set_input_embeddings  s    

''.r2   r;  r  c                 <     | j                   j                  |fi |S r`   )r  rC  )rT   r;  r  s      r3   rC  z2Gemma3nForConditionalGeneration.get_image_features  s    ,tzz,,\DVDDr2   Nr  rR  rA  rS  r  r?   rT  rj  r  r  r  rU  rV  r  rW  r^   c                    ||n| j                   j                  }||n| j                   j                  } | j                  d	|||||||||	|
||||dd|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }| j                   j                         j                  x}||z  }t        j                  |      }||z  }d}|O|j                         }|dddddf   }|dddf   }||dd|j                  d    df   j                  |j                         }||j                  |j                         dk7     j#                         }||j                  |j                         dk7     j#                         }n |j#                         }|j#                         }t%        j&                         }|j)                  d| j                   j*                  j,                        }|j)                  d      j                  |j                         } |||      }t/        |||j0                  |j2                  |j4                  |j6                  |j8                        S )
a  
        input_features_mask (torch.Tensor, *optional*, defaults to None):
            The attention mask for the input audio.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in
            `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenizer=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        NT)r  r;  rR  rA  rS  r  r?   rT  rj  r  r  r  rU  rV  r>  .rW   r"   r   )r=   r>   r?   r@   rA   r8   r9   r1   )rp   rU  rV  r  r  r  ri   r  r
  get_text_configr  r.   r  ra   rf   r   r   r   rO   CrossEntropyLossr  r  r  r<   r?   r@   rA   r8   r9   )rT   r  r;  rR  rA  rS  r  r?   rT  rj  r  r  r  rU  rV  r  rW  r  r@   r  r>   r  r=   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                                r3   rd   z'Gemma3nForConditionalGeneration.forward  s   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 $** 
%)) 3%+))'/!5
  !
&  118B>SV8W~ot4]kmA}a,?@A'+{{'B'B'D'\'\\#i55FZZ'F55F\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5D,#33!//)) ' ; ; ' ; ;
 	
r2   c                 b    t        |   |f||||||||
|d	|}|s|s||d<   ||d<   |	|d<   |S )N)	r?   r  rA  r  rj  r  r  rT  is_first_iterationr;  rR  rS  )rM   prepare_inputs_for_generation)rT   r  r?   r  rj  r  r;  rR  rA  rS  rT  r  r  r  r  r  model_inputsrU   s                    r3   r  z=Gemma3nForConditionalGeneration.prepare_inputs_for_generation(	  sn    & w<
+')%)))1
 
" Y+7L(-;L)*2EL./r2   )NNNNNNNNNNNNNNr   )NNNNNNNNNTNNF)r*   r+   r,   r  r  r$   rN   r7  r9  r   r.   r:   r   r   rC  r   ru  rk   r
   rj   ri   r<   rd   r  rl   rm   s   @r3   rt  rt    s"    &("*,VW} 1/ Eu/@/@ EFSeLf E E  .21537.23704(,262626*.!%)-,0-.!B
##d*B
 ''$.B
 ))D0	B

 t+B
 #\\D0B
 &&-B
 B
 ((4/B
 ((4/B
 ((4/B
   4'B
 $;B
  $;B
 #TkB
  ell*!B
" ./#B
$ 
'%B
  B
N   ) )r2   rt  )r  r  rt  r,  r  r  )r   NN)r"   )ar   collections.abcr   r   dataclassesr   typingr   r.   torch.nnrO   torch.nn.functionalr   ri   r   r  r   r	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   autor!   configuration_gemma3nr#   r$   r%   r&   r(   r7   r<   ModulerD   ro   r   r)  rK  rp  r  r  r  r  r  r!  r  r  r  r  r8  rk   ri   r=  ra   rB   rK  rN  rP  rw  r  r  r  r  r  r,  rt  __all__r1   r2   r3   <module>r     s  *  . !      & ! . ) / R 9 k k K F &  @  l l 3%? 3  3 
9!8 9 9( 
9K 9 9:=RYY =6g)BII g)TaBII aHj,bii j,ZB7		 B7JF")) FRORYY O8Dryy D2(ryy (V 6M
/ M
`SR\\ S;RYY ;$#5RYY #5L^'ryy ^'B(	UU\\ 	U# 	U%,, 	U$    %II %<< % 
 % <<	 %
 LL4' %  % T\ % T\ % 5<<%& %F.ELL .u|| .%,, ._b ., )*t)299 t) +t)nJ%8 J%Z 8T_ 8T 8TvN<RYY N<b abE
- E
 cE
P ^_O
/ O
 `O
d/B		 /Bd @) @@F C&<o CCLr2   