
    i                       d dl Z d dlmZ d dlmZ d dlZd dlmZ ddlm	Z
 ddlmZ ddlmZmZmZmZ ddlmZmZmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8m9Z9 ddl:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@  G d dej                        ZB G d dej                        ZC G d dej                        ZDd ZE ed      dQd       ZFd ej                  d!eHd"ej                  fd#ZI	 	 	 dRd$ej                  d%ej                  d&ej                  d'ej                  d(ej                  dz  d)eJd*eJdz  d+eJdz  d"eKej                  ej                  f   fd,ZL eeF       G d- d.ej                               ZM eeF       G d/ d0ej                               ZN G d1 d2e       ZO G d3 d4e       ZP G d5 d6ej                        ZQ G d7 d8ej                        ZR G d9 d:ej                        ZS G d; d<ej                        ZUe3 G d= d>e.             ZVdSd?eHd"efd@ZW G dA dBeV      ZX G dC dDeV      ZYd(ej                  dz  d"efdEZZ G dF dGeV      Z[e3 G dH dIeV             Z\ G dJ dKeVe      Z]e3 G dL dMeV             Z^e3 G dN dOeV             Z_g dPZ`y)T    N)Callable)Optional   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCacheStaticCache)GenerationConfigGenerationMixinGenerationMode)use_kernel_func_from_hubuse_kernelized_func)create_bidirectional_maskcreate_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPoolingSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)OutputRecordercheck_model_inputsmaybe_autocast   )	AutoModel   )T5Gemma2ConfigT5Gemma2DecoderConfigT5Gemma2EncoderConfigT5Gemma2TextConfigc                   <     e Zd Zddedef fdZd Zd Zd Z xZ	S )T5Gemma2RMSNormdimepsc                     t         |           || _        t        j                  t        j                  |            | _        y N)super__init__r3   nn	Parametertorchzerosweight)selfr2   r3   	__class__s      x/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/t5gemma2/modeling_t5gemma2.pyr7   zT5Gemma2RMSNorm.__init__7   s.    ll5;;s#34    c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S )Nr)   T)keepdim)r:   rsqrtpowmeanr3   )r=   xs     r?   _normzT5Gemma2RMSNorm._norm<   s4    5;;quuQx}}R}>IJJJr@   c                     | j                  |j                               }|d| j                  j                         z   z  }|j                  |      S )N      ?)rH   floatr<   type_as)r=   rG   outputs      r?   forwardzT5Gemma2RMSNorm.forward?   sC    AGGI& 3!2!2!445~~a  r@   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler<   shaper3   r=   s    r?   
extra_reprzT5Gemma2RMSNorm.extra_reprF   s'    ))*+6$((<<r@   )gư>)
__name__
__module____qualname__intrK   r7   rH   rN   rS   __classcell__r>   s   @r?   r1   r1   6   s&    5C 5e 5
K!=r@   r1   c                   *     e Zd Zdef fdZd Z xZS )T5Gemma2MLPconfigc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        t        j                  |j                        | _        y )NFbias)r6   r7   r\   hidden_sizeintermediate_sizer8   Linear	gate_projup_proj	down_projr   hidden_activationact_fnDropoutdropout_ratedropoutr=   r\   r>   s     r?   r7   zT5Gemma2MLP.__init__K   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556zz&"5"56r@   c                     | j                  | j                  |            | j                  |      z  }| j                  |      }| j	                  |      }|S r5   )rg   rc   rd   rj   re   )r=   rG   hidden_statesre   s       r?   rN   zT5Gemma2MLP.forwardV   sH    DNN1$56aH]3NN=1	r@   )rT   rU   rV   r/   r7   rN   rX   rY   s   @r?   r[   r[   J   s    	71 	7r@   r[   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 	 ddedz  de	d   de
dz  dedz  d	ed
ef   f
d       Z ej                         edd              Z xZS )T5Gemma2RotaryEmbeddinginv_freqNr\   c                 v   t         |           |j                  | _        |j                  | _        || _        t        t        |j                              | _        i | _	        | j                  D ]  }| j
                  j                  |   }||d   | j                  |<   | j                  }| j                  |   dk7  rt        | j                  |      } || j
                  ||      \  }}| j                  | d|d       | j                  | d|j                         d       t        | | d|        y )	N	rope_typedefault
layer_type	_inv_freqF
persistent_original_inv_freq_attention_scaling)r6   r7   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr\   listsetlayer_typesrr   rope_parameterscompute_default_rope_parametersr   register_bufferclonesetattr)	r=   r\   deviceru   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalingr>   s	           r?   r7   z T5Gemma2RotaryEmbedding.__init__`   s8   "("@"@$*$B$B!F$6$6 78** 	UJ++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@fak4l1M1  J<y!9=UZ [  J</A!BMDWDWDYfk lDZL(:;=ST	Ur@   r   ztorch.deviceseq_lenru   returnztorch.Tensorc                     | j                   |   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNrJ   r   r)   dtyper   r   )	r   getattrr`   num_attention_headsr:   arangeint64torK   )r\   r   r   ru   baser2   attention_factorrp   s           r?   r   z7T5Gemma2RotaryEmbedding.compute_default_rope_parametersw   s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r@   c                 N   t        | | d      }t        | | d      }|d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d	      5  |j                         |j                         z  j                  dd
      }	t        j                  |	|	fd      }
|
j                         |z  }|
j                         |z  }d d d        j	                  |j                        j	                  |j                        fS # 1 sw Y   AxY w)Nrv   rz   r   rB   r+   mpscpuF)device_typeenabledr)   r2   r   )r   rK   expandrQ   r   r   
isinstancetypestrr(   	transposer:   catcossinr   )r=   rG   position_idsru   rp   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   s                r?   rN   zT5Gemma2RotaryEmbedding.forward   sl    4J<y!9:#DZL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	0&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')//C'')//C		0 vvAGGv$cff177f&;;;	0 	0s   *A1FF$r5   NNNN)rT   rU   rV   r:   Tensor__annotations__r/   r7   staticmethodr   rW   r   rP   rK   r   no_gradr   rN   rX   rY   s   @r?   ro   ro   ]   s    llU1 U. ,0+/"!%	!*"T)!*(!* t!* $J	!*
 
~u$	%!* !*F U]]_<  <r@   ro   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrB   r)   r   )rQ   r:   r   )rG   x1x2s      r?   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r@   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embeds          r?   apply_rotary_pos_embr      sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr@   rm   n_repr   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r+   N)rQ   r   reshape)rm   r   batchnum_key_value_headsslenr   s         r?   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr@   modulequerykeyvalueattention_maskrj   scalingsoftcapc                    || j                   dz  }t        || j                        }	t        || j                        }
t        j                  ||	j                  dd            |z  }|||z  }t        j                  |      }||z  }|#|d d d d d d d |	j                  d   f   }||z   }t        j                  j                  |dt        j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||
      }|j                  dd      j!                         }||fS )	N      r)   r   rB   )r2   r   )ptrainingr+   )r   r   num_key_value_groupsr:   matmulr   tanhrQ   r8   
functionalsoftmaxfloat32r   r   rj   r   
contiguous)r   r   r   r   r   rj   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                 r?   eager_attention_forwardr      sA    //4'3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL#g-zz,/#g-!$Q1.D
0@0@0D.D%DE#k1 ==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r@   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  dej                  dej                  dz  d	e	dz  d
ej                  dz  dee   deej                  ej                  dz  eej                     dz  f   fdZ xZS )T5Gemma2SelfAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr\   	layer_idxc                    t         |           t        |d      r|j                  |   nd | _        || _        || _        t        |d|j                  |j                  z        | _
        |j                  |j                  z  | _        |j                  dz  | _        | j
                  j                  | _        d| _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  | j                  z  |j                  |j&                        | _        | j
                  j0                  | _        | j                  dk(  r|j2                  nd | _        | j                  dk(  | _        t7        |j                  |j8                        | _        t7        |j                  |j8                        | _        y Nr   r   r   Fr^   sliding_attention)r2   r3   r6   r7   hasattrr   ru   r\   r   r   r`   r   r   r   r   query_pre_attn_scalarr   attention_dropout	is_causalr8   rb   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_window
is_slidingr1   rms_norm_epsq_normk_normr=   r\   r   r>   s      r?   r7   zT5Gemma2SelfAttention.__init__     ;B6=;Y&,,Y7_c"
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7;J]7]f33cg//-@@%&//v?R?RS%&//v?R?RSr@   Nrm   position_embeddingsr   past_key_valuescache_positionr   r   c                 r   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }| j                  |	      }	| j                  |
      }
|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t               } || |	|
||f| j"                  r| j$                  nd| j&                  | j(                  d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )NrB   r+   r)   r   r   r           )rj   r   r   )rQ   r   r   viewr   r   r   r   r   r   updater   r   get_interfacer\   _attn_implementationr   r   r   r   r   r   r   r   )r=   rm   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r?   rN   zT5Gemma2SelfAttention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((r@   r   )rT   rU   rV   __doc__r/   rW   r7   r:   r   r   
LongTensorr!   r"   rP   rN   rX   rY   s   @r?   r   r      s    GT1 Tc TB -1.2(,26-)||-) #\\-) t+	-)
 -) ((4/-) +,-) 
u||U\\D0%2E2LL	M-)r@   r   c                   N    e Zd ZdZdedef fdZ	 	 ddej                  de	ej                  ej                  f   dej                  dz  d	ej                  d
e
dz  dej                  dz  dee   de	ej                  ej                  dz  e	ej                     dz  f   fdZ xZS )T5Gemma2MergedAttentionz6Merged self-attention and cross-attention for decoder.r\   r   c                    t         |           t        |d      r|j                  |   nd | _        || _        || _        t        |d|j                  |j                  z        | _
        |j                  |j                  z  | _        |j                  dz  | _        | j
                  j                  | _        d| _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  | j                  z  |j                  |j&                        | _        | j
                  j0                  | _        | j                  dk(  r|j2                  nd | _        | j                  dk(  | _        t7        |j                  |j8                        | _        t7        |j                  |j8                        | _        y r   r   r   s      r?   r7   z T5Gemma2MergedAttention.__init__T  r   r@   Nrm   r   merged_attention_maskencoder_hidden_statesr   r   r   r   c                    |j                   d d }g |d| j                  }	|j                   d d }
g |
d| j                  }| j                  |      j                  |	      j	                  dd      }| j                  |      j                  |	      j	                  dd      }| j                  |      j                  |	      j	                  dd      }| j                  |      }| j                  |      }|\  }}t        ||||      \  }}|d|||d}|j                  }|j                  ||| j                  |      \  }}|j                  j                  | j                        }|j                  }|s| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      }|j                  ||| j                        \  }}d|j                  | j                  <   nFj                   | j                     j"                  }|j                   | j                     j$                  }|}|
d   }t'        j(                  ||gd      }t'        j(                  ||gd      }t+        j,                  | j.                  j0                  t2              } || ||||f| j4                  r| j6                  nd| j8                  d|\  }} |j:                  g |d j=                         }| j?                  |      }||d	d | f   }|d	| d f   }nd
\  }}|||fS )NrB   r+   r)   r   Tr   r   )rj   r   .NN) rQ   r   r   r   r   r   r   r   r   r   self_attention_cacher   r   
is_updatedgetcross_attention_cachelayerskeysvaluesr:   r   r   r  r\   r  r   r   r   r   r   r   r   )r=   rm   r   r  r  r   r   r   r  r  cross_input_shapecross_hidden_shaper  r   r   r   r   r  r  r  r  cross_key_statescross_value_statescross_key_sizer  r   r   self_attn_weightscross_attn_weightss                                r?   rN   zT5Gemma2MergedAttention.forwardr  s    $))#2.88b8$--8177<D0D"DdmmD {{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j& $'snUL#2#G#G ';'B'BL$..,($J
 )3377GJ$3$I$I!"*#{{+@AFFGYZddefhij!%-B!C!H!HI[!\!f!fghjk!l#{{+;<*7L7S7S$&8$..84 "4 >B**4>>:4;;DNNKPP!6!=!=dnn!M!T!T $*1-YY
,<=1E
yy,0B!CK(?(M(MKK,,.E)
 %8!	%
 /3mmD**LL	%
 	%
!\ *k));;;;FFHkk+. # ,S2BN?2B-B C!-cN?3C.C!D4>11-/AAAr@   r  )rT   rU   rV   r  r/   rW   r7   r:   r   rP   r
   r	  r!   r   rN   rX   rY   s   @r?   r  r  P  s    @T1 Tc TN 7;26YB ||YB #5<<#=>	YB
  %||d2YB  %||YB -t3YB ((4/YB -.YB 
u||U\\D0%2E2LL	MYBr@   r  c                        e Zd ZdZdef fdZ	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  dej                  dz  d	eej                  f   f
d
Z xZS )T5Gemma2EncoderLayerzEncoder sub-layer.r   c                 D   t         |           |j                  | _        || _        || _        |j
                  |   | _        t        ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t#        j$                  |j&                        | _        y N)r\   r   r3   )r6   r7   r`   r\   r   r   attention_typer   	self_attnr1   r   pre_self_attn_layernormpost_self_attn_layernormr[   mlppre_feedforward_layernormpost_feedforward_layernormr8   rh   ri   rj   r   s      r?   r7   zT5Gemma2EncoderLayer.__init__  s    !--"$00;.
 (7v7I7IvObOb'c$(78J8JPVPcPc(d%v&)89K9KQWQdQd)e&*9&:L:LRXReRe*f'zz&"5"56r@   Nrm   r   r   r   r   c           	      >   |}| j                  |      } | j                  d||||d d|\  }}| j                  |      }|| j                  |      z   }|}| j	                  |      }| j                  |      }| j                  |      }|| j                  |      z   }|S )N)rm   r   r   r   r    r&  r%  r'  rj   r)  r(  r*  )r=   rm   r   r   r   r   residual_s           r?   rN   zT5Gemma2EncoderLayer.forward  s     !44]C)4>> 
' 3)% 
 
q 55mD 4<<#>> 66}E/77F 4<<#>>r@   )NNN)rT   rU   rV   r  rW   r7   r:   r   rP   r	  FloatTensorrN   rX   rY   s   @r?   r   r     s    7# 7. IM.204|| #5<<#=>E t+	
 &&- 
u  !	"r@   r   c                   0    e Zd ZdZdef fdZ	 	 	 	 	 	 ddej                  deej                  ej                  f   dej                  dz  dej                  dz  d	e
dz  d
edz  dej                  dz  dej                  dz  dej                  fdZ xZS )T5Gemma2DecoderLayerzFDecoder sub-layer: merged attention instead of vanilla self-attention.r   c                 D   t         |           |j                  | _        || _        || _        |j
                  |   | _        t        ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t#        j$                  |j&                        | _        y r"  )r6   r7   r`   r\   r   r   r$  r  r%  r1   r   r&  r'  r[   r(  r)  r*  r8   rh   ri   rj   r   s      r?   r7   zT5Gemma2DecoderLayer.__init__  s    !--"$00; 1
 (7v7I7IvObOb'c$(78J8JPVPcPc(d%v&)89K9KQWQdQd)e&*9&:L:LRXReRe*f'zz&"5"56r@   Nrm   r   r  r   r   	use_cacher   r  r   c	                 F   |}
| j                  |      } | j                  d||||||||d|	\  }}}| j                  |      }|
| j                  |      z   }|}
| j	                  |      }| j                  |      }| j                  |      }|
| j                  |      z   }|S )N)rm   r   r  r   r   r4  r   r  r,  r-  )r=   rm   r   r  r   r   r4  r   r  r   r.  r/  s               r?   rN   zT5Gemma2DecoderLayer.forward  s     !44]C,dnn 

' 3"7%+)"7

 

q! 55mD 4<<#>> 66}E/77F 4<<#>>r@   )NNNFNN)rT   rU   rV   r  rW   r7   r:   r   rP   r	  r
   boolr0  rN   rX   rY   s   @r?   r2  r2    s    P7# 72 6:046:!&2659"||" #5<<#=>"  %||d2	"
 &&-" -t3" $;" ((4/"  %||d2" 
		"r@   r2  c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ	 xZ
S )
T5Gemma2LMHeadz.Head for language modeling (generation) tasks.r`   
vocab_sizer_   c                 \    t         |           t        j                  |||      | _        y )Nr^   )r6   r7   r8   rb   out_proj)r=   r`   r9  r_   r>   s       r?   r7   zT5Gemma2LMHead.__init__B  s"    		+zEr@   rm   r   c                 (    | j                  |      }|S r5   )r;  )r=   rm   logitss      r?   rN   zT5Gemma2LMHead.forwardF  s    }-r@   )F)rT   rU   rV   r  rW   r6  r7   r:   r   rN   rX   rY   s   @r?   r8  r8  ?  s?    8FC FS F FU\\ ell r@   r8  c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ	 xZ
S )
T5Gemma2ClassificationHeadz-Head for sentence-level classification tasks.r`   
num_labelsclassifier_dropout_ratec                     t         |           t        j                  |      | _        t        j
                  ||      | _        y )N)r   )r6   r7   r8   rh   rj   rb   r;  )r=   r`   r@  rA  r>   s       r?   r7   z#T5Gemma2ClassificationHead.__init__N  s1    zz$;<		+z:r@   rm   r   c                 J    | j                  |      }| j                  |      }|S r5   )rj   r;  )r=   rm   s     r?   rN   z"T5Gemma2ClassificationHead.forwardS  s$    ]3m4r@   )r   rT   rU   rV   r  rW   rK   r7   r:   r   rN   rX   rY   s   @r?   r?  r?  K  s<    7;C ;S ;SX ;
U\\ ell r@   r?  c                   D     e Zd Zdef fdZdej                  fdZ xZS )T5Gemma2MultiModalProjectorr\   c                    t         |           t        j                  t	        j
                  |j                  j                  |j                  j                              | _	        t        |j                  j                  |j                  j                        | _        t        |j                  j                  |j                  j                  z        | _        t        |j"                  dz        | _        | j                   | j$                  z  | _        t        j(                  | j&                  | j&                        | _        y )Nr#        ?)kernel_sizestride)r6   r7   r8   r9   r:   r;   vision_configr`   text_configmm_input_projection_weightr1   layer_norm_epsmm_soft_emb_normrW   
image_size
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_siderI  	AvgPool2davg_poolrk   s     r?   r7   z$T5Gemma2MultiModalProjector.__init__Z  s    *,,,KK,,88&:L:L:X:XY+
' !0  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[r@   vision_outputsc                    |j                   \  }}}|j                  dd      }|j                  ||| j                  | j                        }|j	                         }| j                  |      }|j                  d      }|j                  dd      }| j                  |      }t        j                  || j                        }|j                  |      S )Nr+   r)   )rQ   r   r   rR  r   rV  flattenrO  r:   r   rM  rL   )	r=   rW  
batch_sizer/  r`   reshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            r?   rN   z#T5Gemma2MultiModalProjector.forwardj  s    %3%9%9"
A{"0":":1a"@"9"A"AT%;%;T=S=S#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EtGfGf#g '//??r@   )	rT   rU   rV   r.   r7   r:   r   rN   rX   rY   s   @r?   rF  rF  Y  s$    \4 \ @ell @r@   rF  c                   b     e Zd ZdZ	 	 d
dededededef
 fdZdej                  f fd	Z	 xZ
S )T5Gemma2TextScaledWordEmbeddingzCT5Gemma2 Embedding: override to add eoi token embedding separately.num_embeddingsembedding_dimpadding_idxembed_scaleeoi_token_indexc                     t         |   |||       || _        | j                  dt	        j
                  |      d       || _        t        j                  t	        j                  | j                              | _        y )Nrd  Frw   )r6   r7   scalar_embed_scaler   r:   tensorre  r8   r9   r;   rb  eoi_embedding)r=   ra  rb  rc  rd  re  r>   s         r?   r7   z(T5Gemma2TextScaledWordEmbedding.__init__  se     	D"-]ELL,ERWX.\\%++d6H6H*IJr@   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  }| j                  j                  |j
                        ||| j                  k(  <   |S r5   )r6   rN   rd  r   r<   r   ri  re  )r=   rj  input_embeddingsr>   s      r?   rN   z'T5Gemma2TextScaledWordEmbedding.forward  sf     7?958H8H8K8KDKKL]L]8^^>B>P>P>S>STdTjTj>kd&:&::;r@   )rJ     rD  rY   s   @r?   r`  r`  }  s^    M !&KK K 	K
 K K     r@   r`  c                        e Zd ZU eed<   dZdZg dZdgZdZ	dZ
dZdZdZeeg eedd	       eedd	       eed
d	      gdZdZ ej,                          fd       Zd Z xZS )T5Gemma2PreTrainedModelr\   modelT)r   r2  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadr   Fr+   r%  )index
layer_namer)   
cross_attn)rm   
attentions)imagetextc                    t         |   |       t        |t              r t	        j
                  |j                         y t        |t              rJt	        j
                  |j                         t	        j                  |j                  |j                         y t        |t              r|j                  j                  j                  d   dz  }t	        j                   |j                  j                  d| j"                  j$                  |z         t'        |j                  d      rA|j                  j(                  *t	        j
                  |j                  j(                         y y y d|j*                  j,                  v r t	        j
                  |j                         y t        |t.              r|j0                  D ]  }|j2                  }|j4                  |   dk7  rt6        |j4                  |      } ||j"                  |      \  }}t	        j8                  t;        || d	      |       t	        j8                  t;        || d
      |        y y )Nr   r   r   )rF   stdr_   RMSNormrs   rt   rv   ry   )r6   _init_weightsr   rF  initzeros_rM  r`  ri  	constant_rd  rg  r?  r;  r<   rQ   normal_r\   initializer_ranger   r_   r>   rT   ro   r   r   rr   r   copy_r   )r=   r   scaleru   r   r   r/  r>   s          r?   r}  z%T5Gemma2PreTrainedModel._init_weights  s   f%f9:KK99: ?@KK,,-NN6--v/H/HI :;OO**003t;ELL//ct{{?\?\_d?dev/FOO4H4H4TFOO001 5U/ &**333KK& 78$00 ^
%EE##J/9<#6v7G7G
7S#TL#/*#U q

76j\+CDmT

76j\9K+LM}]^ 9r@   c                 <   | j                   j                  }|j                  }|j                  }|t	        d      |j                  |j                        }|dddf   j                         |dddf<   ||d<   |t	        d      |j                  |dk(  |       |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .rB   r+   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	r\   decoderbos_token_idpad_token_id
ValueError	new_zerosrQ   r   masked_fill_)r=   rj  decoder_configdecoder_start_token_idr  shifted_input_idss         r?   %prepare_decoder_input_ids_from_labelsz=T5Gemma2PreTrainedModel.prepare_decoder_input_ids_from_labels  s     ,,!/!<!<%22!)YZZ &//	@%.sCRCx%8%>%>%@#qr'"$:&!XYY 	&&'8D'@,O  r@   )rT   rU   rV   r,   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r2  r&   r   r  _can_record_outputsinput_modalitiesr:   r   r}  r  rX   rY   s   @r?   ro  ro    s    &*# $5"5 !N!"&.0DE0kR2!T2!U
 )U]]_^ ^0!r@   ro  r   c           
      T     dt         dt         dt         dt         dt        f
 fd}|S )zL
    This creates uni/bidirectional attention mask with sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c                 t    	r
d}}n
dz   dz  
dz  dz   }}||z
  }|dk\  ||k  z  }|dk  | |k  z  }||z  S )Nr   r+   r)   r,  )r  r  r  r  left_window_sizeright_window_sizedist	left_mask
right_maskr   r   s            r?   
inner_maskz0sliding_window_mask_function.<locals>.inner_mask  sp    2@!/4BQ4F13L~bcNcfgNg/v~QY4*:#:;	QhD5+<#<=
:%%r@   rW   r6  )r   r   r  s   `` r?   sliding_window_mask_functionr    s3    
	&c 	&S 	& 	&c 	&d 	& r@   c                       e Zd ZU eed<   eedZ	 ddedef fdZ	e
e	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dee   defd              Z xZS )T5Gemma2TextEncoderr\   )rw  rm   re  c           	      ^   t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  | j                  |j                  dz  |      | _        t        |j                  |j                        | _
        d| _        t        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t        j$                  |j&                        | _        t+        |      | _        | j/                          y c c}w NrH  )rd  re  r#  F)r6   r7   r  rc  r9  r`  r`   embed_tokensr1   r   normgradient_checkpointingr8   
ModuleListrangenum_hidden_layersr   r  rh   ri   rj   ro   
rotary_emb	post_initr=   r\   re  r   r>   s       r?   r7   zT5Gemma2TextEncoder.__init__  s    
 	 !.. ++;**C/+
 $F$6$6F<O<OP	&+#mmFKFLdLdFef!&)4f
 zz&"5"561&9 	 g   D*Nrj  r   r   inputs_embedstoken_type_idsr   r   c           
      0   |d u |d uz  rt        d      |j                  dd        || j                  |      }|>t        j                  d|j
                  d   |j                        j                  d      }t        |x}t              sJ| j                  ||d}t        di |t        di |dt        | j                  j                  d	      id
}|}	i }
| j                  j                  D ]  }| j                  |	||      |
|<    | j!                  |	      }	| j"                  d | j                  j$                   D ](  } ||	|
|j&                     ||j&                     |fi |}	* | j)                  |	      }	| j!                  |	      }	t+        |	      S )N:You must specify exactly one of input_ids or inputs_embedsr   r   r+   r   )r\   input_embedsr   and_mask_functionF)r   full_attentionr   last_hidden_stater,  )r  popr  r:   r   rQ   r   r   r   dictr\   r   r  r   r   r  rj   r  r  r$  r  r   )r=   rj  r   r   r  r  r   self_attn_mask_mappingmask_kwargsrm   r   ru   layer_modules                r?   rN   zT5Gemma2TextEncoder.forward!  s    -t";<YZZ 	

$d+  --i8M <<=+>+>q+A-J^J^_iijklLNB0DI++ -"0K #<"Jk"J%> &!&&B4;;C]C]in&o&&" & !++11 	gJ.2oom\[e.f
+	g ]3 KK(G$++*G*GH 	L(#L$?$?@&|'B'BC	
 M	 		-0]3+
 	
r@   rm  )NNNNN)rT   rU   rV   r/   r   r   r   r  rW   r7   r'   r#   r:   r	  r   r0  r!   r"   r   rN   rX   rY   s   @r?   r  r    s    +-  '" 8  .2.20426.2<
##d*<
 t+<
 &&-	<

 ((4/<
 t+<
 +,<
 
<
  <
r@   r  c                       e Zd ZU eed<   	 ddedef fdZd Zd Ze	e
dej                  dee   deez  fd	              Zd
ej$                  dz  dej&                  dz  dej&                  fdZee
	 	 	 	 	 	 dd
ej$                  dz  dej                  dz  dej$                  dz  dej&                  dz  dej&                  dz  dej                  dz  dee   defd              Z xZS )T5Gemma2Encoderr\   re  c                     t         |   |       t        j                  |j                  |      | _        t        j                  |j                        | _	        t        |      | _        | j                          y )N)re  r\   )r6   r7   r  _from_configrL  
text_modelr*   from_configrK  vision_towerrF  multi_modal_projectorr  )r=   r\   re  r>   s      r?   r7   zT5Gemma2Encoder.__init__e  sb    
 	 -::6;M;M_n:o%119M9MN%@%H" 	r@   c                 6    | j                   j                         S r5   )r  get_input_embeddingsrR   s    r?   r  z$T5Gemma2Encoder.get_input_embeddingss  s    3355r@   c                 8    | j                   j                  |      S r5   )r  set_input_embeddingsr=   new_embeddingss     r?   r  z$T5Gemma2Encoder.set_input_embeddingsv  s    33NCCr@   pixel_valuesr   r   c                 x     | j                   d|dd|}|j                  }| j                  |      }||_        |S )NT)r  return_dictr,  )r  r  r  pooler_output)r=   r  r   rW  r  image_featuress         r?   get_image_featuresz"T5Gemma2Encoder.get_image_featuresy  sM     +**aRVaZ`a*<<334EF'5$r@   rj  Nr  r  c                 D   | j                   j                  }|f|t        d      | | j                         t	        j
                  |t        j                  |j                              k(  }|j                  d      }n||k(  }|j                         }|j                  d      j                  |      j                  |j                        }|j                  d   |j                  d   z  }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        z9Either `input_ids` or `inputs_embeds` has to be provided.)r   r   rB   r   r+   z6Image features and image tokens do not match: tokens: z, features )r\   image_token_idr  r  r:   rh  longr   allsumr   	expand_asr   rQ   r%   numel)r=   rj  r  r  r  special_image_maskn_image_tokensn_image_featuress           r?   get_image_placeholder_maskz*T5Gemma2Encoder.get_image_placeholder_mask  s"    33$ !\]]!.2M$2K2K2M^5::mFZFZ[3 " "4!7!7!;!*n!<+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL,-3359M9M9OOD^DTT_`p_qr	
 "!r@   r   r   r  c                 ~   |d u |d uz  rt        d      || j                  j                  |      }|i| j                  |d      j                  }|j                  |j                  |j                        }| j                  |||      }	|j                  |	|      } | j                  d|||d|}
t        |
      S )Nr  T)r  )r  r  )r  r   r   r  r,  )r  r  r  r  r  r   r   r   r  masked_scatterr   )r=   rj  r   r   r  r  r  r   r  
image_maskrm   s              r?   rN   zT5Gemma2Encoder.forward  s     -t";<YZZ  OO88CM#!44\t4TbbN+..}/C/C]EXEXYN88~ 9 J *88^TM' 
')%
 	
 +
 	
r@   r  )NNNNNN)rT   rU   rV   r.   r   rW   r7   r  r  r$   r#   r:   r   r!   r"   rP   r   r  r	  r0  r  r'   r   rN   rX   rY   s   @r?   r  r  b  ss   !!
  '% 6D 
!LL
4:;M4N
	+	+
  
"##d*" ((4/" ))	"<  .2.2042615.2$
##d*$
 t+$
 &&-	$

 ((4/$
 ''$.$
 t+$
 +,$
 
$
  $
r@   r  c           
      P     dt         dt         dt         dt         dt        f
 fd}|S )z4
    This creates bidirectional attention mask.
    r  r  r  r  r   c                     %t        j                  dt         j                        S | |f   j                  t         j                        S )Nr,  r   )r:   onesr6  r   )r  r  r  r  r   s       r?   r  z/bidirectional_mask_function.<locals>.inner_mask  s=    !::b

33i/033EJJ??r@   r  )r   r  s   ` r?   bidirectional_mask_functionr    s9    
@c @S @ @c @d @
 r@   c                       e Zd ZU eed<    eed       eed      edZddede	f fdZ
ee	 	 	 	 	 	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dedz  dej"                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  dee   defd              Z xZS )T5Gemma2Decoderr\   r+   )rt  r)   )rw  cross_attentionsrm   re  c           	      ^   t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  |j                  |j                  dz  |      | _        t        |j                  |j                        | _
        d| _        t        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t        j$                  |j&                        | _        t+        |      | _        | j/                          y c c}w r  )r6   r7   r  rc  r9  r`  r`   r  r1   r   r  r  r8   r  r  r  r2  r  rh   ri   rj   ro   r  r  r  s       r?   r7   zT5Gemma2Decoder.__init__  s     !.. ++;**C/+
 $F$6$6F<O<OP	&+#mmFKFLdLdFef!&)4f
 zz&"5"561&9	 gr  Nrj  r   r   r   r  r4  r   r  encoder_attention_maskr   r   c
                    |d u |d uz  rt        d      |t        d      || j                  |      }| j                  s,|r*|(t        t	        | j
                        t	                     }|F||j                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      }t        |x}t              s>| j
                  |||||j                  nd |d}d |d	<   t        di |t!        di |d
}t        |	x}t              s-| j
                  ||	|d d d}dt        di |dt#        |	      ii}t        j$                  |d   |d   gd      t        j$                  |d   |d   gd      d
}|}i }| j
                  j&                  D ]  }| j)                  |||      ||<    | j+                  |      }| j,                  d | j
                  j.                   D ],  } ||||j0                     ||j0                     |||||fi |
}. | j3                  |      }| j+                  |      }t5        ||      S )Nr  z0`encoder_hidden_states` must be given in decoderr  r   r+   r  )r\   r  r   r   r   r   c                  L    t        j                  dt         j                        S )NTr   )r:   rh  r6  )argss    r?   <lambda>z)T5Gemma2Decoder.forward.<locals>.<lambda>'  s    U\\$V[V`V`=a r@   r  r  r  or_mask_functionrB   r   r   )r  r   r,  )r  r  r   r
   r	   r\   get_seq_lengthr:   r   rQ   r   r   r   r  r  r   r   r  r   r   r  rj   r  r  r$  r  r   )r=   rj  r   r   r   r  r4  r   r  r  r   past_seen_tokensr  r  cross_attn_mask_mappingmerged_attn_mask_mappingrm   r   ru   r  s                       r?   rN   zT5Gemma2Decoder.forward  s    -t";<YZZ (OPP  --i8M}}/F1,dkk2RT`TbcO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6LNB0DI++ -"0"0KZKf?#G#Glp ,K 0bK+,"4"C{"C%F%U%U&"
 5KK1TR++ 5"8"0#' $K !"4 #!#%@AW%X#'# $ii'(89;RSc;dekm "''(;<>UVf>ghnp"	$
  & !++11 	gJ.2oom\[e.f
+	g ]3 KK(G$++*G*GH 	L(#L$?$?@()D)DE%
 
M	 		-0]38++
 	
r@   r  )	NNNNNNNNN)rT   rU   rV   r-   r   r&   r  r2  r  rW   r7   r'   r#   r:   r	  r   r
   r0  r6  r!   r"   r   rN   rX   rY   s   @r?   r  r    sF   !!$%<AF*+B!L-4 s ,  .2.2046:26!%26596:h
##d*h
 t+h
 &&-	h

 -t3h
 ((4/h
 $;h
 ((4/h
  %||d2h
 !&t 3h
 +,h
 
3h
  h
r@   r  c            !           e Zd ZdddZdef fdZd Zd Zd Zd	 Z	e
e	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  ded
z  ded
z  dej$                  d
z  dej$                  d
z  ded
z  dej                  d
z  dee   defd              Z xZS )T5Gemma2Modelz&encoder.text_model.embed_tokens.weightz-encoder.text_model.embed_tokens.eoi_embedding)zdecoder.embed_tokens.weightz"decoder.embed_tokens.eoi_embeddingr\   c                     t         |   |       t        |j                  |j                        | _        t        |j                  |j                        | _        | j                          y r5   )r6   r7   r  encoderre  r  r  r  rk   s     r?   r7   zT5Gemma2Model.__init__m  sL      'v~~v7M7MN&v~~v7M7MNr@   c                     | j                   S r5   )r   rR   s    r?   get_encoderzT5Gemma2Model.get_encoderv      ||r@   c                     | j                   S r5   r  rR   s    r?   get_decoderzT5Gemma2Model.get_decodery  r  r@   c                 6    | j                   j                         S r5   )r   r  rR   s    r?   r  z"T5Gemma2Model.get_input_embeddings|  s    ||0022r@   c                 8    | j                   j                  |      S r5   )r   r  r  s     r?   r  z"T5Gemma2Model.set_input_embeddings  s    ||00@@r@   Nrj  r  r   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr   r  decoder_inputs_embedsr4  r   r   r   c                 P   | | j                   d||||
|dd|}|j                  } | j                  d|||||	||||dd
|}t        |j                  |j                  |j
                  |j                  |j                  |j                  |j
                  |j                        S )aX  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        T)rj  r   r   r  r  r  )
rj  r   r   r  r   r  r  r4  r   r  )r  r   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentionsr,  )r   r  r  r   r   rm   rw  r  )r=   rj  r  r   r   r	  r
  r  r  r   r  r  r4  r   r   r  decoder_outputss                    r?   rN   zT5Gemma2Model.forward  s    8 "*dll #-)+)  O !0 A A '$,, 
'1-/+"7#1)
 
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r@   )NNNNNNNNNNNNN)rT   rU   rV   _tied_weights_keysr,   r7   r  r  r  r  r$   r#   r:   r	  r0  
BoolTensorr   r
   r   r6  r!   r"   r   rN   rX   rY   s   @r?   r  r  f  s    (P.]
~ 3A  .215370459:>8<266:-159!%26#?
 ##d*?
 ''$.	?

 ))D0?
 &&-?
 !++d2?
 !& 0 04 7?
 $..5?
 )4/?
 -t3?
 ||d*?
  %||d2?
  $;!?
" ((4/#?
$ +,%?
& 
'?
  ?
r@   r  c            &           e Zd ZddiZddiZddgdgfiZdef fdZd	 Zd
 Z	d Z
d Zd Zd Zeedej"                  dee   deez  fd              Zed        Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d*dej2                  dz  dej4                  dz  dej4                  dz  dej2                  dz  dej2                  dz  dej6                  dz  dej2                  dz  dedz  dedz  dej4                  dz  dej4                  dz  dej2                  dz  d edz  d!ej2                  dz  d"eej"                  z  dee   deej4                     e z  f"d#              Z!d$e"d%e#d&e$d'ed(edef fd)Z% xZ&S )+ T5Gemma2ForConditionalGenerationzlm_head.out_proj.weightz,model.encoder.text_model.embed_tokens.weightzlm_head.out_projcolwise_gather_outputrm   r=  r\   c                    t         |   |       t        |      | _        |j                  j
                  | _        t        |j                  j                  | j
                        | _        d| _	        | j                          y )NForMaskedLM)r6   r7   r  rp  r  r9  r8  r`   lm_head	loss_typer  rk   s     r?   r7   z)T5Gemma2ForConditionalGeneration.__init__  sZ     "6*
 ..33%fnn&@&@$//R&r@   c                 &    || j                   _        y r5   r  r;  r  s     r?   set_output_embeddingsz6T5Gemma2ForConditionalGeneration.set_output_embeddings  s     .r@   c                 .    | j                   j                  S r5   r  rR   s    r?   get_output_embeddingsz6T5Gemma2ForConditionalGeneration.get_output_embeddings  s    ||$$$r@   c                 6    | j                   j                         S r5   rp  r  rR   s    r?   r  z5T5Gemma2ForConditionalGeneration.get_input_embeddings      zz..00r@   c                 :    | j                   j                  |       y r5   rp  r  r=   r   s     r?   r  z5T5Gemma2ForConditionalGeneration.set_input_embeddings      

''.r@   c                 6    | j                   j                         S r5   )rp  r  rR   s    r?   r  z,T5Gemma2ForConditionalGeneration.get_encoder      zz%%''r@   c                 6    | j                   j                         S r5   )rp  r  rR   s    r?   r  z,T5Gemma2ForConditionalGeneration.get_decoder  r*  r@   r  r   r   c                 D     | j                         j                  |fi |S r5   )r  r  )r=   r  r   s      r?   r  z3T5Gemma2ForConditionalGeneration.get_image_features  s%    
 5t!44\LVLLr@   c                 6    | j                         j                  S r5   )r  r  rR   s    r?   r  z-T5Gemma2ForConditionalGeneration.vision_tower  s    !...r@   Nrj  r   r   r	  r
  r  r  r   r  r  labelsr4  r   logits_to_keepc                    |||| j                  |      } | j                  d|||||||||	|
|||d|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }| j                  j                  }|j                  3||j                  z  }t        j                  |      }||j                  z  }d}| | j                  ||| j                  fi |}t        |||j                  |j                   |j"                  |j$                  |j&                  |j(                  |j*                  	      S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)rj  r  r   r   r	  r
  r  r  r   r  r  r4  r   )	lossr=  r   r  r  r  r  r  r  r,  )r  rp  r  r   rW   slicer  r\   r  final_logit_softcappingr:   r   loss_functionr9  r   r   r  r  r  r  r  r  )r=   rj  r  r   r   r	  r
  r  r  r   r  r  r.  r4  r   r/  r   r  rm   slice_indicesr=  r  r1  s                          r?   rN   z(T5Gemma2ForConditionalGeneration.forward  su   D "3";@U@] $ J J6 R.8djj /
%)%/#9!5++'"7)/
 /
" (998B>SV8W~ot4]kmA}a,?@A,,11=nDDDFZZ'FnDDDF%4%%ffdooPPD+;;"1"G"G.AA,==&5&O&O"1"G"G.AA

 
	
r@   generation_configmodel_kwargsgeneration_moderZ  max_cache_lengthc           	      P   t         |   |||||       |j                  du ry|j                  }|d}nd|j                  v }t	        j
                  | j                  j                  d            }|`|`	||d}	|j                  d      }
|
t        |
t              st        d      t        |
j                        d	kD  r|
j                  j                  d	      ryt!        |
j"                        }|t$        k(  r|d
   d	   j&                  d   |	d<    |di |	|
_        n=t        t)        di | j                  j                  d      |dt)                     |d<   t+        | d      r=| j,                  0t        | j,                  t              st        d      |d   | _        yyy)zMOverride cache preparation to support T5Gemma2-specific EncoderDecoder Cache.FN	offloadedTr  )r\   
offloadingr   zaThe `past_key_values` in `model_kwargs` must be of type `EncoderDecoderCache` for T5Gemma2 model.r   r  r+   max_cache_len_cachezLThe internal cache must be of type `EncoderDecoderCache` for T5Gemma2 model.r,  )r6   _prepare_cache_for_generationr4  cache_implementationcopydeepcopyr\   get_text_configr   r   r  r   r
   r  lenr  r   r  r   rQ   r	   r   r>  )r=   r6  r7  r8  rZ  r9  r@  offload_cachecross_attn_configcross_attn_cache_kwargsr   cross_attn_clsr>   s               r?   r?  z>T5Gemma2ForConditionalGeneration._prepare_cache_for_generationG  s    	-	
 &&%/0EE'!M'+<+Q+QQM !MM$++*E*Ed*E*ST ,) ('#

 '**+<=&o/BC w 
 ?--.27Q7Q7U7UVW7X!/"G"GHN,;GHY;Z[\;];c;cde;f'84B4]E\4]O1 /B "&++"="=d"="K&3 /L*+ 4"t{{'>dkk+>? !opp&'89DK	 (?"r@   )NNNNNNNNNNNNNNr   )'rT   rU   rV   r  _tp_plan_pp_planr,   r7   r  r!  r  r  r  r  r$   r#   r:   r   r!   r"   rP   r   r  propertyr  r	  r0  r  r   r
   r6  rW   r   rN   r   r  r   r?  rX   rY   s   @r?   r  r    s   !#Q #$;<H"o%6
$CDH~ /%1/(( M!LLM4:;M4NM	+	+M  M
 / /  .215370459:>8<266:26:>*.!%26-.'O
 ##d*O
 ''$.	O

 ))D0O
 &&-O
 !++d2O
 !& 0 04 7O
 $..5O
 )4/O
 -t3O
 ((4/O
  %0047O
    4'!O
" $;#O
$ ((4/%O
& ell*'O
( +,)O
* 
u  	!O	3+O
  O
bI:+I: I: (	I:
 I: I: 
I: I:r@   r  c                       e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  de	j                  dz  dee   defd              Z xZS )!T5Gemma2ForSequenceClassificationr\   c                 "   t         |   |       |j                  | _        |j                  j                  | _        t        |      | _        t        |dd      }t        | j                  | j                  |      | _	        | j                          y NrA  g?r6   r7   r@  r  r`   r  rp  r   r?  scorer  r=   r\   classifier_dropoutr>   s      r?   r7   z*T5Gemma2ForSequenceClassification.__init__  sp      ++!>>55"6*
$V-FL/0@0@$//Sef
r@   c                 6    | j                   j                         S r5   r#  rR   s    r?   r  z6T5Gemma2ForSequenceClassification.get_input_embeddings  r$  r@   c                 :    | j                   j                  |       y r5   r&  r'  s     r?   r  z6T5Gemma2ForSequenceClassification.set_input_embeddings  r(  r@   Nrj  r  r   r   r	  r
  r  r  r  r  r.  r   r   c                 v   |	|
#t        d| j                  j                   d      |t        d      || j	                  |      } | j
                  |f||||||||	|
dd
|}|j                  }|j                  }|j                  }| j                  |      }|j                  d   }|| j                  j                  k7  j                  |j                  t        j                         }t        j"                  |j                  d   |j                  t        j                   	      }||z  j%                  d      }t        j&                  ||j                  d   d
z
        }|t        j"                  ||j                        |f   }d}|| j)                  |||| j                        }t+        ||||      S )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for .You have to specify input_idsF
r  r   r   r	  r
  r  r  r  r  r4  r   rB   r   r+   )maxr  )r=  r.  pooled_logitsr\   r1  r=  rm   rw  )NotImplementedErrorr>   rT   r  r  rp  r  r  r  rQ  rQ   r\   r  r   r   r:   int32r   argmaxclampr4  r   )r=   rj  r  r   r   r	  r
  r  r  r  r  r.  r   outputsr  rm   rw  r=  rZ  non_pad_masktoken_indiceslast_non_pad_tokenr]  r1  s                           r?   rN   z)T5Gemma2ForSequenceClassification.forward  s   4 $(=(I%J4>>KbKbJccde  <==$ $ J J9 U&0djj'
%)%/#9!5+'"7'
 '
 $5555//
-.__Q'
)T[[-E-EEII&--Y^YdYde%6%<%<R%@^c^i^ij+l:BB2F"[[);ARAXAXY[A\_`A`au||Jv}}MOaab%%VFR_hlhshs%tD' '!	
 	
r@   NNNNNNNNNNN)rT   rU   rV   r,   r7   r  r  r$   r#   r:   r	  r0  r   r   r!   r"   r   rN   rX   rY   s   @r?   rM  rM    s\   	~ 	1/  .215.204596:8<2626:>*.J
##d*J
 ''$.J
 t+	J

 &&-J
 !++d2J
 !&t 3J
 $..5J
 )4/J
 ((4/J
  %0047J
   4'J
 +,J
 
"J
  J
r@   rM  c                       e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  de	j                  dz  dee   defd              Z xZS )T5Gemma2ForTokenClassificationr\   c                 "   t         |   |       |j                  | _        |j                  j                  | _        t        |      | _        t        |dd      }t        | j                  | j                  |      | _	        | j                          y rO  rP  rR  s      r?   r7   z'T5Gemma2ForTokenClassification.__init__  sp      ++!>>55"6*
$V-FL/0@0@$//Sef
r@   c                 6    | j                   j                         S r5   r#  rR   s    r?   r  z3T5Gemma2ForTokenClassification.get_input_embeddings  r$  r@   c                 :    | j                   j                  |       y r5   r&  r'  s     r?   r  z3T5Gemma2ForTokenClassification.set_input_embeddings  r(  r@   Nrj  r  r   r   r	  r
  r  r  r  r  r.  r   r   c                    |	|
#t        d| j                  j                   d      |t        d      || j	                  |      } | j
                  |f||||||||	|
dd
|}|j                  }|j                  }|j                  }| j                  |      }d}|| j                  ||| j                        }t        ||||      S )rW  NrX  rY  rZ  Fr[  r^  )r_  r>   rT   r  r  rp  r  r  r  rQ  r4  r\   r   )r=   rj  r  r   r   r	  r
  r  r  r  r  r.  r   rc  r  rm   rw  r=  r1  s                      r?   rN   z&T5Gemma2ForTokenClassification.forward	  s   4 $(=(I%J4>>KbKbJccde  <==$ $ J J9 U&0djj'
%)%/#9!5+'"7'
 '
 $5555//
-.%%ffdkkBD$'!	
 	
r@   rg  )rT   rU   rV   r,   r7   r  r  r$   r#   r:   r	  r0  r   r   r!   r"   r   rN   rX   rY   s   @r?   ri  ri    s\   
~ 
1/  .215.204596:8<2626:>*.@
##d*@
 ''$.@
 t+	@

 &&-@
 !++d2@
 !&t 3@
 $..5@
 )4/@
 ((4/@
  %0047@
   4'@
 +,@
 
@
  @
r@   ri  )r  r  r  ro  rM  ri  )r+   )r   NN)T)arA  collections.abcr   typingr   r:   torch.nnr8    r   r~  activationsr   cache_utilsr   r	   r
   r   
generationr   r   r   integrationsr   r   masking_utilsr   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r    processing_utilsr!   utilsr"   r#   r$   r%   utils.genericr&   r'   r(   autor*   configuration_t5gemma2r,   r-   r.   r/   Moduler1   r[   ro   r   r   r   rW   r   rK   rP   r   r   r  r   r2  r8  r?  rF  	Embeddingr`  ro  r  r  r  r  r  r  r  rM  ri  __all__r,  r@   r?   <module>r     s$  *  $    & ! P P K K I m m B 9   L F & a a O O  t t=bii =(")) &N<bii N<b( *+ ,2	UU\\ 	U# 	U%,, 	U$    %II %<< % 
 % <<	 %
 LL4' %  % T\ % T\ % 5<<%& %F )*N)BII N) +N)b )*zBbii zB +zBz15 1h:5 :z	RYY 	 !@")) !@H bll  . S!o S! S!l  &a
1 a
Hi
- i
X
t0C 
 
H
- H
V \
+ \
 \
~J:'> J:Z ^
(? ^
 ^
B U
%< U
 U
pr@   