
    i*?                       d Z ddlZddlZddlmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/  e(       rddl0m1Z1 ddl2m3Z3  e*jh                  e5      Z6 G d dejn                        Z8 G d dejn                        Z9 G d dejn                        Z: G d dejn                        Z; G d dejn                        Z< G d d ejn                        Z= G d! d"ejn                        Z> G d# d$e      Z? G d% d&ejn                        Z@e' G d' d(e#             ZA G d) d*eA      ZBe' G d+ d,eA             ZC e'd-.       G d/ d0eAe             ZDe' G d1 d2eA             ZE e'd3.       G d4 d5eA             ZFe' G d6 d7eA             ZGe' G d8 d9eA             ZHg d:ZIy);zPyTorch UMT5 model.    N)Union)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torchdynamo_compilingloggingtorch_compilable_check)is_flash_attention_requested   )
UMT5Config)	BlockMask)make_flex_block_causal_maskc                   &     e Zd Zd fd	Zd Z xZS )UMT5LayerNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)ze
        Construct a layernorm module in the UMT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      p/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/umt5/modeling_umt5.pyr)   zUMT5LayerNorm.__init__?   s1     	ll5::k#:; #    c                    |j                  t        j                        j                  d      j	                  dd      }|t        j
                  || j                  z         z  }| j                  j                  t        j                  t        j                  fv r%|j                  | j                  j                        }| j                  |z  S )N   T)keepdim)tor+   float32powmeanrsqrtr.   r-   dtypefloat16bfloat16)r/   hidden_statesvariances      r3   forwardzUMT5LayerNorm.forwardG   s     !##EMM266q9>>r4>P%Ht?T?T4T(UU ;; ??),,T[[->->?M{{]**r4   )gư>)__name__
__module____qualname__r)   rC   __classcell__r2   s   @r3   r&   r&   >   s    $+r4   r&   c                   *     e Zd Zdef fdZd Z xZS )UMT5DenseActDenseconfigc                 ^   t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _
        t        |j                     | _        y NFbias)r(   r)   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr
   dense_act_fnactr/   rK   r2   s     r3   r)   zUMT5DenseActDense.__init__Y   sn    ))FNNFKKeD))FKKeDzz&"5"56&--.r4   c                    | j                  |      }| j                  |      }| j                  |      }t        | j                  j
                  t        j                        r|j                  | j                  j
                  j                  k7  r`| j                  j
                  j                  t        j                  k7  r/|j                  | j                  j
                  j                        }| j	                  |      }|S N)rS   rY   rW   
isinstancerT   r-   r+   Tensorr>   int8r9   r/   rA   s     r3   rC   zUMT5DenseActDense.forward`   s    ./]3tww~~u||4##tww~~';';;$$

2),,TWW^^-A-ABM.r4   rD   rE   rF   r"   r)   rC   rG   rH   s   @r3   rJ   rJ   X   s    /z /r4   rJ   c                   *     e Zd Zdef fdZd Z xZS )UMT5DenseGatedActDenserK   c                    t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        |j                     | _        y rM   )r(   r)   r   rP   rQ   rR   wi_0wi_1rT   rU   rV   rW   r
   rX   rY   rZ   s     r3   r)   zUMT5DenseGatedActDense.__init__p   s    IIfnnfkkF	IIfnnfkkF	))FKKeDzz&"5"56&--.r4   c                 ,   | j                  | j                  |            }| j                  |      }||z  }| j                  |      }t	        | j
                  j                  t        j                        r|j                  | j
                  j                  j                  k7  r`| j
                  j                  j                  t        j                  k7  r/|j                  | j
                  j                  j                        }| j                  |      }|S r\   )rY   re   rf   rW   r]   rT   r-   r+   r^   r>   r_   r9   )r/   rA   hidden_geluhidden_linears       r3   rC   zUMT5DenseGatedActDense.forwardx   s    hhtyy78		-0#m3]3 tww~~u||4##tww~~';';;$$

2),,TWW^^-A-ABM.r4   ra   rH   s   @r3   rc   rc   o   s    /z /r4   rc   c                   *     e Zd Zdef fdZd Z xZS )UMT5LayerFFrK   c                    t         |           |j                  rt        |      | _        nt        |      | _        t        |j                  |j                        | _	        t        j                  |j                        | _        y )Nr1   )r(   r)   is_gated_actrc   DenseReluDenserJ   r&   rQ   layer_norm_epsilon
layer_normr   rU   rV   rW   rZ   s     r3   r)   zUMT5LayerFF.__init__   s_    "8"@D"3F";D'F<U<UVzz&"5"56r4   c                 r    | j                  |      }| j                  |      }|| j                  |      z   }|S r\   )rq   ro   rW   )r/   rA   forwarded_statess      r3   rC   zUMT5LayerFF.forward   s=    ??=9../?@%5E(FFr4   ra   rH   s   @r3   rk   rk      s    7z 7r4   rk   c                       e Zd ZdZddedz  f fdZdej                  dej                  fdZd Z	dd	Z
	 	 	 	 dd
ej                  dej                  dz  dedz  dej                  dz  dej                  dz  f
dZ xZS )UMT5Attentionz7
    T5's attention using relative_attention_bias.
    N	layer_idxc                    t         |           |j                  | _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | _
        |j                  | _        | j                  | j                  z  | _        || _        |9| j                  r-t        j!                  d| j"                  j$                   d       t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        | j                  r0t'        j2                  | j                  | j                        | _        y y )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrN   )r(   r)   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerQ   d_kvkey_value_proj_dim	num_headsn_headsrV   rW   	inner_dimrv   loggerwarning_oncer2   rD   r   rP   qkvo	Embeddingrelative_attention_bias)r/   rK   ry   rv   r2   s       r3   r)   zUMT5Attention.__init__   sd    +++F(.4.S.S+/5/U/U,~~"(++''**(?(??"*4>>+B+B*C D, , 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD( ,r4   
projectionreturnc                     |j                         d d | j                  | j                  fz   }|j                  |      j	                  dddd      }|S )Nr7   r   r6   r!   r   )sizer   r}   viewpermute)r/   r   new_projection_shapenew_projections       r3   _shapezUMT5Attention._shape   sQ    )0"5tG^G^8__#)=>FFq!QPQRr4   c                    d}| j                   }| j                  }| j                  sC|dz  }||dkD  j                  t        j
                        |z  z  }t	        j                  |      }n*t	        j                  |t	        j                  |             }|dz  }||k  }t	        j                  |j                         |z        t        j                  ||z        z  }|||z
  z  }||j                  t        j
                        z   }t	        j                  |t	        j                  ||dz
              }|t	        j                  |||      z  }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r6   r!   )rz   r{   rx   r9   r+   longabsmin
zeros_likelogfloatmath	full_likewhere)	r/   relative_positionrelative_bucketsnum_bucketsmax_distance	max_exactis_small	log_ratiorelative_position_if_larges	            r3   _relative_position_bucketz'UMT5Attention._relative_position_bucket   sA   * 99;;AK!2Q!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$y0 II/557)CDtxxP\_hPhGii	y!89	%.ejj1I%I"%*YY&8RT_bcTc(d&
" 	EKK2CE_``r4   c                    | | j                   j                  j                  }|.t        j                  |t        j
                  |      dddf   }n	|dddf   }t        j                  |t        j
                  |      dddf   }||z
  }| j                  |      }| j                  |      }	|	j                  g d      j                  d      }	|	S )z%Compute binned relative position biasN)r>   device)r6   r   r!   r   )	r   r-   r   r+   aranger   r   r   	unsqueeze)
r/   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr   relative_position_bucketvaluess
             r3   compute_biaszUMT5Attention.compute_bias   s    >1188??F!$||L

SYZ[\^b[bc-ag6,,zFSTXZ[T[\+.>>#'#A#ABS#T --.FG	*44Q7r4   rA   encoder_hidden_statespast_key_valuesattention_maskr   c                 h   |j                   d d \  }}|d u}| j                  |      }	|	j                  |d| j                  | j                        j                  dd      }	d}
|Qt        |t              rA|j                  j                  | j                        }
|r|j                  }n|j                  }n|}|r|n|}|rK|I|
rG|j                  | j                     j                  }|j                  | j                     j                  }n| j!                  |      }| j#                  |      }|j                  |d| j                  | j                        j                  dd      }|j                  |d| j                  | j                        j                  dd      }|T|s|nd }|j%                  ||| j                  d|i      \  }}|r)t        |t              rd|j                  | j                  <   t'        j(                  |	|j                  dd            }|||j+                         z   n|}|j                   d   }| j,                  s;t'        j.                  d| j                  ||f|j0                  |j2                  	      }n1| j5                  |||j0                  |
      }|d d d d | d d d f   }|#|d d d d d d d |j                   d   f   }||z   }|}||z  }t6        j8                  j;                  |j=                         d      j?                  |      }t6        j8                  jA                  || j@                  | jB                        }t'        j(                  ||      }|j                  dd      jE                         }|j                  ||d      }| jG                  |      }||fS )Nr6   r7   r!   Fr   Tr   )r   r>   )r   r   dim)ptraining)$shaper   r   r   r}   	transposer]   r   
is_updatedgetrv   cross_attention_cacheself_attention_cachelayerskeysr   r   r   updater+   matmulget_seq_lengthry   zerosr   r>   r   r   
functionalsoftmaxr   type_asrW   r   
contiguousr   )r/   rA   r   r   r   r   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_statesscoresreal_seq_lengthr   position_biascausal_maskposition_bias_maskedattn_weightsattn_outputs                          r3   rC   zUMT5Attention.forward  s    "/!4!4Ra!8
J 3$>vvm,#((RtG^G^_iijkmno 
&:oGZ+[(3377GJ!'6'L'L$'6'K'K$#2 2D.-/"=*-44T^^DIIJ/66t~~FMML/J66.1L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL*7It+?+F+Fdnn?OQ_>`,(
L &*_FY*ZAEO..t~~> lJ,@,@A,FG L[Kf*'E'E'GGlv%%b)
//!KKDLL*j9&--W]WcWcM !--FMMR` . M *!Qa*?@M%(Aq2HJ4D4DR4H2H)HIK)K7M,&& }},,V\\^,DLLVT}},,\T\\TXTaTa,bll<>!++Aq1<<>!&&z:rBff[)L((r4   )FN)NNNNNN)rD   rE   rF   __doc__intr)   r+   r^   r   r   r   r   rC   rG   rH   s   @r3   ru   ru      s    kSSWZ k6 %,, - ^$ 6:(,.2.2R)||R)  %||d2R) 	R)
 t+R) t+R)r4   ru   c                   :     e Zd Zddedz  f fdZ	 	 	 ddZ xZS )UMT5LayerSelfAttentionNrv   c                     t         |           t        |d|      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )NTry   rv   rm   )r(   r)   ru   SelfAttentionr&   rQ   rp   rq   r   rU   rV   rW   r/   rK   rv   r2   s      r3   r)   zUMT5LayerSelfAttention.__init__Y  sN    *6t_hi'F<U<UVzz&"5"56r4   c                     | j                  |      }| j                  ||||      }|| j                  |d         z   }|f|dd  z   }|S )Nr   r   r   r   r!   )rq   r   rW   )r/   rA   r   r   r   normed_hidden_statesattention_outputoutputss           r3   rC   zUMT5LayerSelfAttention.forward_  sj      $}=-- )+)	 . 
 &5Ea5H(II "%5ab%99r4   r\   )NNNrD   rE   rF   r   r)   rC   rG   rH   s   @r3   r   r   X  s#    7#* 7 r4   r   c                   <     e Zd Zddedz  f fdZ	 	 	 	 ddZ xZS )UMT5LayerCrossAttentionNrv   c                     t         |           t        |d|      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )NFr   rm   )r(   r)   ru   EncDecAttentionr&   rQ   rp   rq   r   rU   rV   rW   r   s      r3   r)   z UMT5LayerCrossAttention.__init__s  sO    ,VQVbkl'F<U<UVzz&"5"56r4   c                     | j                  |      }| j                  |||||      }|| j                  |d         z   }|f|dd  z   }	|	S )Nr   r   r   r   r   r!   )rq   r   rW   )
r/   rA   r   r   r   r   r   r   layer_outputr   s
             r3   rC   zUMT5LayerCrossAttention.forwardy  sl      $}=// "7)+) 0 
 %t||4DQ4G'HH/$4QR$88r4   r\   r   r   rH   s   @r3   r   r   r  s&    7#* 7 #r4   r   c                   B     e Zd Zddedz  f fdZ	 	 	 	 	 	 	 ddZ xZS )	UMT5BlockNrv   c                 n   t         |           |j                  | _        t        j                         | _        | j
                  j                  t        ||             | j                  r&| j
                  j                  t        ||             | j
                  j                  t        |             y )Nrv   )
r(   r)   rx   r   
ModuleListlayerappendr   r   rk   r   s      r3   r)   zUMT5Block.__init__  sz     ++]]_


09MN??JJ5f	RS

+f-.r4   c	                     | j                   d   ||||      \  }}	|j                  t        j                  k(  r}t        j                  |j                        j
                  }
t        j                  t        j                  |      j                         |
dz
  |
      }t        j                  || |      }d }| j                  xr |d u}|r | j                   d   |||||      \  }}|j                  t        j                  k(  r}t        j                  |j                        j
                  }
t        j                  t        j                  |      j                         |
dz
  |
      }t        j                  || |      } | j                   d   |      }|j                  t        j                  k(  r}t        j                  |j                        j
                  }
t        j                  t        j                  |      j                         |
dz
  |
      }t        j                  || |      }|f}|r||	|fz  }|S )Nr   r   i  )r   maxr!   r   r7   )r   r>   r+   r?   finfor   r   isinfanyclamprx   )r/   rA   r   r   encoder_attention_maskr   	use_cacheoutput_attentionsr   self_attn_weights	max_dtypeclamp_valuecross_attn_weightsdo_cross_attentionr   s                  r3   rC   zUMT5Block.forward  s    ,94::a=)+)	,
(( %--/M$7$78<<I++ekk-&@&D&D&F	TXHXZcdK!KKK<[YM "!__R1Fd1R0=

1&;5 /-1-M- ""emm3!KK(;(;<@@	#kk%++m*D*H*H*JIX\L\^gh %M|Q\ ] '

2}5 %--/M$7$78<<I++ekk-&@&D&D&F	TXHXZcdK!KKK<[YM ")+=>>Gr4   r\   )NNNNFFNr   rH   s   @r3   r   r     s/    /#* / "#7r4   r   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )UMT5ClassificationHeadz-Head for sentence-level classification tasks.rK   c                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        y )N)r   )r(   r)   r   rP   rQ   denserU   classifier_dropoutrW   
num_labelsout_projrZ   s     r3   r)   zUMT5ClassificationHead.__init__  sZ    YYv~~v~~>
zzF$=$=>		&..&2C2CDr4   rA   r   c                     | j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r\   )rW   r  r+   tanhr
  r`   s     r3   rC   zUMT5ClassificationHead.forward  sN    ]3

=1

=1]3m4r4   )
rD   rE   rF   r   r"   r)   r+   r^   rC   rG   rH   s   @r3   r  r    s/    7Ez EU\\ ell r4   r  c                   t    e Zd ZU eed<   dZdZdZdgZdgZ	e
d        Z ej                         d        Zd Zy	)
UMT5PreTrainedModelrK   transformerTr   rT   c                 v    t        j                  t              }t        j                  t              }|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r+   tensorr   r   )r/   r  
input_maskdummy_inputss       r3   r  z UMT5PreTrainedModel.dummy_inputs  s6    LL.	\\*-
!*"&0

 r4   c                 r   | j                   j                  }t        |t              r$t	        j
                  |j                  |dz         y	t        |t        t        t        t        f      rt	        j                  |j                  j                  d|dz         t        |d      rE| j                   j                  s/t	        j                  |j                  j                  d|dz         t        |d      rpt	        j                  |j                   j                  d|| j                   j"                  dz  z         t	        j$                  |j                   j&                         y	y	t        |t(              rft        |d      rYt	        j                  |j*                  j                  d|dz         t	        j$                  |j*                  j&                         y	y	t        |t,              r9t	        j                  |j.                  j                  d|| j                   j"                  dz  z         t        |j.                  d      r?|j.                  j&                  )t	        j$                  |j.                  j&                         t	        j                  |j0                  j                  d|| j                   j"                  dz  z         t        |j0                  d      rA|j0                  j&                  *t	        j$                  |j0                  j&                         y	y	y	t        |t2              r9t	        j                  |j4                  j                  d|| j                   j"                  dz  z         t        |j4                  d      r?|j4                  j&                  )t	        j$                  |j4                  j&                         t	        j                  |j6                  j                  d|| j                   j8                  dz  z         t        |j6                  d      rA|j6                  j&                  *t	        j$                  |j6                  j&                         y	y	y	t        |t:              rt	        j                  |j<                  j                  d|| j                   j"                  dz  z         t        |j<                  d      r?|j<                  j&                  )t	        j$                  |j<                  j&                         t	        j                  |j>                  j                  d|| j                   j"                  dz  z         t        |j>                  d      r?|j>                  j&                  )t	        j$                  |j>                  j&                         t	        j                  |j6                  j                  d|| j                   j8                  dz  z         t        |j6                  d      rA|j6                  j&                  *t	        j$                  |j6                  j&                         y	y	y	t        |t@              rP| j                   j"                  }| j                   jB                  }| j                   jD                  }t	        j                  |jF                  j                  d|||z  dz  z         t	        j                  |jH                  j                  d||dz  z         t	        j                  |jJ                  j                  d||dz  z         t	        j                  |jL                  j                  d|||z  dz  z         |jN                  r3t	        j                  |jP                  j                  d||dz  z         y	y	y	)
zInitialize the weights      ?        )r<   stdlm_head
qa_outputs      
classifierrO   N))rK   initializer_factorr]   r&   init	constant_r-   	UMT5ModelUMT5ForConditionalGenerationUMT5EncoderModelUMT5ForQuestionAnsweringnormal_sharedhasattrtie_word_embeddingsr  r  rQ   zeros_rO   UMT5ForTokenClassificationr  r  r  r
  rJ   rS   rT   rR   rc   re   rf   ru   r|   r~   r   r   r   r   ry   r   )r/   modulefactorrQ   r}   r   s         r3   _init_weightsz!UMT5PreTrainedModel._init_weights  s>    //fm,NN6==&3,7, (	
 LL--CVc\Jvy)$++2Q2QV^^22&3,Ov|,V..55CVPTP[P[PcPchlOlEmnF--223 -  :;v|,V..55CVc\RF--223 -  67LL,,3Ft{{GZGZ_cFc<dev||V,1B1B1NFLL--.LL//cv$++J]J]bfIf?ghv/FOO4H4H4TFOO001 5U/ 12 LL))DKKDWDW\`C`9abvyy&)fiinn.HFIINN+LL))DKKDTDTY]C]9^_vyy&)fiinn.HFIINN+ /I) 67LL++#6dkkFYFY^bEb;cdv{{F+0@0@0LFKK,,-LL++#6dkkFYFY^bEb;cdv{{F+0@0@0LFKK,,-LL))DKKDTDTY]C]9^_vyy&)fiinn.HFIINN+ /I). kk))G!%!1!1kk++GLLs7M_C_dhBh8ijLLs'4-8PQLLs'4-8PQLLs7M_C_dhBh8ij11V;;BBRX]dim\mRno 2 /r4   c                 8   | j                   j                  }| j                   j                  }|t        d      |j	                  |j
                        }|dd df   j                         |ddd f<   ||d<   |t        d      |j                  |dk(  |       |S )Nzself.model.config.decoder_start_token_id has to be defined. In UMT5 it is usually set to the pad_token_id. See UMT5 docs for more information..r7   r!   ).r   z1self.model.config.pad_token_id has to be defined.)rK   decoder_start_token_idpad_token_id
ValueError	new_zerosr   clonemasked_fill_)r/   r  r1  r2  shifted_input_idss        r3   _shift_rightz UMT5PreTrainedModel._shift_right>  s    !%!C!C{{//!)6 
 &//	@%.sCRCx%8%>%>%@#qr'"$:&!PQQ&&'8D'@,O  r4   N)rD   rE   rF   r"   __annotations__base_model_prefixsupports_gradient_checkpointing_can_compile_fullgraph_no_split_modules_keep_in_fp32_modulespropertyr  r+   no_gradr.  r8   r4   r3   r  r    sb    %&*#!$!F  U]]_@p @pD!r4   r  c                       e Zd Z fdZd Z	 	 	 	 	 	 	 	 	 	 	 ddZ	 ddeej                  df   dej                  dej                  de	d	e
f
d
Zedej                  dededej                  dej                  defd       Z xZS )	UMT5Stackc           	         t         |   |       t        j                  |j                  |j
                        | _        |j                  | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j
                  |j                        | _        t        j                   |j"                        | _        d| _        | j)                          y c c}w )Nr   rm   F)r(   r)   r   r   
vocab_sizerQ   embed_tokensrx   r   range
num_layersr   blockr&   rp   final_layer_normrU   rV   rW   gradient_checkpointing	post_init)r/   rK   ir2   s      r3   r)   zUMT5Stack.__init__U  s     LL):):FNNK ++]]ERXRcRcLd#eqIf$B#ef
 -fnn&B[B[ \zz&"5"56 ',# $fs   7C9c                     || _         y r\   )rF  r/   new_embeddingss     r3   set_input_embeddingszUMT5Stack.set_input_embeddingsa  s
    *r4   c                 d   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
|$|"| j
                  rdnd}t        d| d| d      |&|j                         }|j                  d|d         }n8||j                         d d }n"| j
                  rdnd}t        d| d| d	      | j                  r%| j                  r|rt        j                  d
       d}|(| j                  t        d      | j                  |      }|\  }}|du r| j
                  st        d|  d      | j
                  rf|rr|p| j                   j                  r5t        t!        | j                         t!        | j                               }n%t!        | j                         }n| j
                  sd }||j#                         nd}|%t%        j&                  |||z   |j(                        }|1t+               s'||z   }t%        j,                  |||j(                        }| j
                  r2| j/                  |||t1        |t              r|j2                  n||      }n]|Y|d d d d d d f   }|j5                  |j6                        }d|z
  t%        j8                  |j6                        j:                  z  }nd }| j
                  rO|M|j                         \  }}}||f}|!t%        j,                  ||j(                        }| j=                  |      }nd }|	rdnd }|rdnd }|r| j
                  rdnd }| j?                  |      }tA        | jB                        D ]D  \  }}|	r||fz   } |||||||||      }|d   }|s&||d   fz  }| j
                  s<||d   fz  }F | jE                  |      }| j?                  |      }|	r||fz   }|
stG        d |||||fD              S tI        |||||      S )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer7   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoder)rK   r   r   )r>   r  rA  )r   r   r   r   r   r!   r6   c              3   $   K   | ]  }|| 
 y wr\   rA  ).0r   s     r3   	<genexpr>z$UMT5Stack.forward.<locals>.<genexpr>  s      
 = 
s   )last_hidden_stater   rA   
attentionscross_attentions)%rK   r   r   output_hidden_statesuse_return_dictrx   r3  r   r   rK  r   r   r   rF  is_encoder_decoderr   r   r   r+   r   r   r   r,   _update_causal_maskr]   r   r9   r>   r   r   invert_attention_maskrW   	enumeraterI  rJ  tupler   ) r/   r  r   r   r   rU  r   r   r   r]  return_dictr   kwargserr_msg_prefixinput_shaper   r   past_key_values_lengthmask_seq_lengthr   encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsrA   rM  layer_modulelayer_outputss                                    r3   rC   zUMT5Stack.forwardd  s    "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>+/??ZN*>*:.HXXvw  "#..*K!r;r?;I&',,.s3K+/??ZN:>:J-XfWggtuvv&&4==##p "	   ( !_`` --i8M!,
J?? #LTFRg!hii ??_4;;11&9$DKK8,dkk:Z'O '3$++&FO #OETE`!?!?!Afg!"\\&(>(KTaThThN !*B*D4zAO"ZZ
OML`L`aN??22o/BC  44$!K '(D$)9:K%..}/B/B.CK,M<O<O0P0T0TTKK ??4@=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+"6BD0d%64??rPT]3(4 	@OA|#$58H$H!(%'F /#"3-	M *!,M =#3"55??(]1-=,??()	@, --m<]3   1]4D D 
 "#%"(
 
 
 9+++%1
 	
r4   r   r#   input_tensorr   r   r   c           	         t        | j                        r||dk(  j                         r|S y | j                  j                  dk(  r't	        |t
        j                        rt        |      }|S ||j                         nd}||j                  nd}| j                  j                  dk(  r(|s&|s$t        j                  |||| j                        ry |j                  }|j                  d   }	|r|j                         }
n1t	        |t
        j                        r|j                  d   n||	z   dz   }
| j!                  ||	|
|||j                  d   	      }| j                  j                  dk(  rQ|O|j"                  j$                  d
v r7|s5t        j&                  |      j(                  }t        j*                  ||      }|S )Nr  flex_attentionr   Fsdpa)rU  rh  is_trainingr!   r7   )sequence_lengthtarget_lengthr>   r   r   )cudaxpunpu)r    rK   r   _attn_implementationr]   r+   r^   r$   r   is_compileabler   _ignore_causal_mask_sdpar   r>   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typer   r   _unmask_unattended)r/   r   rt  r   r   r   past_seen_tokensusing_compilable_cacher>   ry  rz  r   	min_dtypes                r3   r`  zUMT5Stack._update_causal_mask  s    (4)~/D.I.I.K%%;;++/??.%,,7!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr4   ry  rz  r>   r   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuer>   r   r!   )diagonalrV  r7   r   )r   r+   r   r   fullr   triur   reshapeexpandr5  r   r9   masked_fill)r   ry  rz  r>   r   r   re  r   r  mask_lengthpadding_masks              r3   r  z?UMT5Stack._prepare_4d_causal_attention_mask_with_cache_positionK  s   > %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r4   )NNNNNNNNNNN)F)rD   rE   rF   r)   rQ  rC   r   r+   r^   r   boolr`  staticmethodr   r>   r  rG   rH   s   @r3   rC  rC  T  s    
+
 "#!`
R #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r4   rC  c                       e Zd ZU dZdZeed<   dddZ fdZd Z	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  deeej                        d	z  ded	z  dej$                  d	z  dej$                  d	z  ded	z  ded	z  ded	z  ded	z  dej                  d	z  deej                     ez  fd       Z xZS )r"  ao  
    Examples:

    ```python
    >>> from transformers import UMT5Model, AutoTokenizer

    >>> model = UMT5Model.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> noisy_text = "UN Offizier sagt, dass weiter <extra_id_0> werden muss in Syrien."
    >>> label = "<extra_id_0> verhandelt"
    >>> inputs = tokenizer(inputs, return_tensors="pt")
    >>> labels = tokenizer(label=label, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
    ```umt5rK   shared.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightc                    t         |   |       t        j                  |j                  |j
                        | _        t        j                  |      }d|_	        d|_
        t        |      | _        t        j                  |      }d|_	        |j                  |_        t        |      | _        | j!                          y NFT)r(   r)   r   r   rE  rQ   r'  copydeepcopyrx   r   rC  encodernum_decoder_layersrH  decoderrL  r/   rK   encoder_configdecoder_configr2   s       r3   r)   zUMT5Model.__init__  s     ll6#4#4fnnEv.$)!#(  0v.$(!$*$=$=! 0 	r4   c                     | j                   S r\   r'  r/   s    r3   get_input_embeddingszUMT5Model.get_input_embeddings      {{r4   c                 ~    || _         | j                  j                  |       | j                  j                  |       y r\   r'  r  rQ  r  rO  s     r3   rQ  zUMT5Model.set_input_embeddings  -    $)).9)).9r4   Nr  r   r  r  encoder_outputsr   rU  decoder_inputs_embedsr   r   r]  rd  r   r   c                 F   |	|	n| j                   j                  }	||n| j                   j                  }|| j                  ||||
||      }nI|rGt	        |t
              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }| j                  |||||||	|
|||      }|s||z   S t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                        S )	ah
  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5Model.from_pretrained("google/umt5-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for UMT5Model.
        >>> # This is not needed for torch's UMT5ForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  r   rU  r   r]  rd  r   r!   r6   rZ  rA   r[  r  r   rU  r   r   r   r   r   r]  rd  r   )rZ  r   decoder_hidden_statesdecoder_attentionsr\  encoder_last_hidden_stater   encoder_attentions)rK   r   r^  r  r]   r   lenr  r   rZ  r   rA   r[  r\  )r/   r  r   r  r  r  r   rU  r  r   r   r]  rd  r   re  rA   decoder_outputss                    r3   rC   zUMT5Model.forward  sR   F "+!6IDKK<Q<Q	%0%<k$++B]B] ""ll#-+"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/+"/#1/!5#) ' 
 "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r4   NNNNNNNNNNNNN)rD   rE   rF   r   
model_typer"   r9  _tied_weights_keysr)   r  rQ  r   r+   
LongTensorFloatTensor
BoolTensorrc  r   r^   r  r   rC   rG   rH   s   @r3   r"  r"    s   " J'6'6
$:
  .23759:>BF(,-159!%)-,0#'26s
##d*s
 ))D0s
 !++d2	s

 !& 0 04 7s
 uU%6%6784?s
 s
 ||d*s
  %||d2s
 $;s
  $;s
 #Tks
 D[s
 ((4/s
  
u  	!$6	6!s
 s
r4   r"  z<
    UMT5 Model with a `language modeling` head on top.
    )custom_introc            !           e Zd ZdZdZddddZ fdZd Zd Ze		 	 	 	 	 	 	 	 	 	 	 	 	 	 dd	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  deee
j                        dz  dedz  de
j                  dz  de
j                  dz  de
j                  dz  dedz  dedz  dedz  dedz  de
j                  dz  dee
j                     ez  fd       Zde
j                  fdZ xZS )r#  a  
    Examples:

    ```python
    >>> from transformers import UMT5ForConditionalGeneration, AutoTokenizer

    >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```r  r  )r  r  zlm_head.weightc                    t         |   |       |j                  | _        t	        j
                  |j                  |j                        | _        t        j                  |      }d|_
        d|_        t        |      | _        t        j                  |      }d|_
        |j                  |_        t        |      | _        t	        j"                  |j                  |j                  d      | _        | j'                          y )NFTrN   )r(   r)   rQ   	model_dimr   r   rE  r'  r  r  rx   r   rC  r  r  rH  r  rP   r  rL  r  s       r3   r)   z%UMT5ForConditionalGeneration.__init__M  s     ll6#4#4fnnEv.$)!#(  0v.$(!$*$=$=! 0yy1B1BO 	r4   c                     | j                   S r\   r  r  s    r3   r  z1UMT5ForConditionalGeneration.get_input_embeddingsc  r  r4   c                 ~    || _         | j                  j                  |       | j                  j                  |       y r\   r  rO  s     r3   rQ  z1UMT5ForConditionalGeneration.set_input_embeddingsg  r  r4   Nr  r   r  r  r  r   rU  r  labelsr   r   r]  rd  r   r   c                    |
|
n| j                   j                  }
||n| j                   j                  }|| j                  ||||||      }nI|rGt	        |t
              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }|	||| j                  |	      }| j                  |||||||
||||      }|d   }| j                   j                  r|| j                  dz  z  }| j                  |      }d}|	^t        d	
      }|	j                  |j                        }	 ||j                  d|j!                  d            |	j                  d            }|s|f|dd z   |z   }||f|z   S |S t#        |||j$                  |j&                  |j(                  |j*                  |j,                  |j&                  |j(                  	      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer("Studies have shown that <extra_id_0> good for you", return_tensors="pt").input_ids
        >>> outputs = model.generate(input_ids)
        >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
        ```Nr  r   r!   r6   r  r  r  r0  ignore_indexr7   	losslogitsr   r  r  r\  r  r   r  )rK   r   r^  r  r]   r   r  r8  r  r)  r  r  r   r9   r   r   r   r   r   rA   r[  r\  rZ  )r/   r  r   r  r  r  r   rU  r  r  r   r   r]  rd  r   re  rA   r  sequence_output	lm_logitsr  loss_fctoutputs                          r3   rC   z$UMT5ForConditionalGeneration.forwardl  s"   N "+!6IDKK<Q<Q	%0%<k$++B]B] ""ll#-+"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 ,,'1/+"/#1/!5#) ' 
 *!,;;** .1EFOLL1	'T:HYYy//0FINN2y~~b/ABFKKPROTD\OAB$77/IF)-)9TGf$EvE+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r4   c                 $    | j                  |      S r\   )r8  )r/   r  s     r3   %prepare_decoder_input_ids_from_labelszBUMT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels  s      ((r4   )NNNNNNNNNNNNNN)rD   rE   rF   r   r  r  r)   r  rQ  r   r+   r  r  r  rc  r^   r   r  r   rC   r  rG   rH   s   @r3   r#  r#  0  s     J'6'6),:
  .23759:>=A(,26:>*.!%)-,0#'26N
##d*N
 ))D0N
 !++d2	N

 !& 0 04 7N
 uU\\23d:N
 N
 ((4/N
  %0047N
   4'N
 $;N
  $;N
 #TkN
 D[N
 ((4/N
" 
u  	!O	3#N
 N
b)ELL )r4   r#  c                        e Zd ZdZdZddiZ fdZd Zd Ze		 	 	 	 	 	 dd	e
j                  dz  d
e
j                  dz  de
j                  dz  dedz  dedz  dedz  dee
j                     ez  fd       Z xZS )r$  a  
    Examples:

    ```python
    >>> from transformers import UMT5EncoderModel, AutoTokenizer

    >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
    >>> outputs = model(input_ids)
    >>> hidden_state = outputs.last_hidden_state
    ```r  r  r  c                    t         |   |       t        j                  |j                  |j
                        | _        t        j                  |      }d|_	        d|_
        t        |      | _        | j                          y )NF)r(   r)   r   r   rE  rQ   r'  r  r  r   r_  rC  r  rL  )r/   rK   r  r2   s      r3   r)   zUMT5EncoderModel.__init__  sb     ll6#4#4fnnEv.#( ,1) 0 	r4   c                     | j                   S r\   r  r  s    r3   r  z%UMT5EncoderModel.get_input_embeddings%  r  r4   c                 H    || _         | j                  j                  |       y r\   )r'  r  rQ  rO  s     r3   rQ  z%UMT5EncoderModel.set_input_embeddings)  s    $)).9r4   Nr  r   rU  r   r]  rd  r   c                 h    ||n| j                   j                  }| j                  ||||||      }|S )aQ  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```r  )rK   r^  r  )	r/   r  r   rU  r   r]  rd  re  r  s	            r3   rC   zUMT5EncoderModel.forward-  sH    F &1%<k$++B]B],,)'/!5# ' 
 r4   )NNNNNN)rD   rE   rF   r   r  r  r)   r  rQ  r   r+   r  r  r  rc  r   rC   rG   rH   s   @r3   r$  r$    s     J 	&
:  .23726)-,0#',##d*, ))D0, ((4/	,
  $;, #Tk, D[, 
u  	!O	3, ,r4   r$  z
    UMT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                   ~    e Zd ZdgZdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e
ej                     dz  d
ej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  deez  fd       Z xZS )UMT5ForSequenceClassificationFdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightrK   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r\   )r(   r)   r"  r  r  classification_headrL  rZ   s     r3   r)   z&UMT5ForSequenceClassification.__init__h  s6     $V,#9&#A  	r4   Nr  r   r  r  r  rU  r  r  r   r   r]  rd  r   c                    ||n| j                   j                  }|d}	|$|"t        d| j                  j                         | ||t        d      | j                  |      }| j                  ||||||||	|
||      }|d   }|j                  | j                   j                        j                  |j                        }t        t        j                  |j                  d            j!                         dk(  d       |j"                  \  }}}||ddf   j%                  |d	|      ddd	ddf   }| j'                  |      }d}||j                  |j                        }| j                   j(                  | j                   j*                  dk(  rd
| j                   _        nv| j                   j*                  dkD  rL|j,                  t        j.                  k(  s|j,                  t        j0                  k(  rd| j                   _        nd| j                   _        | j                   j(                  d
k(  rSt3               }| j                   j*                  dk(  r& ||j5                         |j5                               }n |||      }n| j                   j(                  dk(  rGt7               } ||j%                  d	| j                   j*                        |j%                  d	            }n,| j                   j(                  dk(  rt9               } |||      }|s|f|dd z   }||f|z   S |S t;        |||j<                  |j>                  |j@                  |jB                  |jD                  |jF                  |jH                  	      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NFz8Passing input embeddings is currently not supported for If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)
r   r  r  r  rU  r  r   r   r]  rd  r   r!   z7All examples must have the same number of <eos> tokens.r7   
regressionsingle_label_classificationmulti_label_classificationr  )%rK   r^  NotImplementedErrorr2   rD   r3  r8  r  eqeos_token_idr9   r   r   r+   unique_consecutivesumnumelr   r   r  problem_typer	  r>   r   r   r   squeezer   r   r   r   r  r  r\  r  r   r  )r/   r  r   r  r  r  rU  r  r  r   r   r]  rd  re  r   r  eos_maskr   rl  r0   sentence_representationr  r  r  r  s                            r3   rC   z%UMT5ForSequenceClassification.forwardp  sN   ` &1%<k$++B]B]I!:%J4>>KbKbJcd  $)>)F  U 
 !% 1 1) <"")/#9+'"7/!5# # 
 "!*<< 8 89<<_=S=ST$$X\\!_5;;=BE	
 &5%:%:"
A{"1(A+">"C"CJPRT_"`abdfhiai"j))*ABYYv}}-F{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#FF3D))-JJ+-B0F0F GUWY))-II,./Y,F)-)9TGf$EvE.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r4   )NNNNNNNNNNNN)rD   rE   rF   "_keys_to_ignore_on_load_unexpectedr"   r)   r   r+   r  r^   listr  r  rc  r   rC   rG   rH   s   @r3   r  r  ^  sU    +s)s&z   .2.259:>:>26:>*.!%)-,0#'A
##d*A
 t+A
 !++d2	A

 !& 0 04 7A
 e//047A
 ((4/A
  %0047A
   4'A
 $;A
  $;A
 #TkA
 D[A
 
0	0A
 A
r4   r  c                       e Zd ZdgZdef fdZe	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	dz  d
e	dz  de	dz  de
ej                     ez  fd       Z xZS )r+  r  rK   c                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r\   )r(   r)   r	  r$  r  r   rU   r  rW   rP   r0   r  rL  rZ   s     r3   r)   z#UMT5ForTokenClassification.__init__  sj      +++F3zz&";";<))F$6$68I8IJ 	r4   Nr  r   rU  r  r   r]  rd  r   c                    ||n| j                   j                  }| j                  ||||||      }	|	d   }
| j                  |
      }
| j	                  |
      }d}|<t               } ||j                  d| j                        |j                  d            }|s||	dd f}||f|z   S |S t        |||	j                  |	j                        S )aB  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        N)r   rU  r   r]  rd  r   r7   r6   )r  r  rA   r[  )rK   r^  r  rW   r  r   r   r	  r   rA   r[  )r/   r  r   rU  r  r   r]  rd  re  r   rA   r  r  r  r  s                  r3   rC   z"UMT5ForTokenClassification.forward  s    6 &1%<k$++B]B]"")'/!5# # 
  
]3/')HFKKDOO<fkk"oNDgam,F)-)9TGf$EvE$!//))	
 	
r4   )NNNNNNN)rD   rE   rF   r  r"   r)   r   r+   r^   r  rc  r   rC   rG   rH   s   @r3   r+  r+    s    *r)s&	z 	  *..2-1&*)-,0#'6
<<$&6
 t+6
 ||d*	6

 t#6
  $;6
 #Tk6
 D[6
 
u||	4	46
 6
r4   r+  c                       e Zd ZdddZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  deeej                        dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  deej                     ez  fd       Z xZS )r%  r  r  c                 $   t         |   |       |j                  | _        t	        j
                  |j                  |j                        | _        t        j                  |      }d|_
        d|_        t        |      | _        t        j                  |      }d|_
        |j                  |_        t        |      | _        |j"                  | _        t	        j$                  |j                  |j"                        | _        | j)                          y r  )r(   r)   rQ   r  r   r   rE  r'  r  r  rx   r   rC  r  r  rH  r  r	  rP   r  rL  r  s       r3   r)   z!UMT5ForQuestionAnswering.__init__G  s     ll6#4#4fnnEv.$)!#(  0v.$(!$*$=$=! 0 ++))FNNF4E4EF 	r4   c                     | j                   S r\   r  r  s    r3   r  z-UMT5ForQuestionAnswering.get_input_embeddings^  r  r4   c                 ~    || _         | j                  j                  |       | j                  j                  |       y r\   r  rO  s     r3   rQ  z-UMT5ForQuestionAnswering.set_input_embeddingsb  r  r4   Nr  r   r  r  r  start_positionsend_positionsrU  r  r   r   r]  rd  r   c                    ||n| j                   j                  }|
|
n| j                   j                  }
||d}
| |	|t        d      | j	                  |      }|
|
n| j                   j                  }
||n| j                   j                  }|| j                  ||||||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }| j                  |||	d|||
|||	
      }|d   }| j                  |      }|j                  dd
      \  }}|j                  d
      j                         }|j                  d
      j                         }d}||t        |j                               dkD  r*|j                  d
      j                  |j                         }t        |j                               dkD  r*|j                  d
      j                  |j                         }|j                  d      }|j#                  d|      }|j#                  d|      }t%        |      } |||      } |||      }||z   dz  }|s||f|dd z   |z   }||f|z   S |S t'        ||||j(                  |j*                  |j,                  |j.                  |j0                  |j*                  |j,                  
      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        NFr  r  r   r!   r6   r  )
r  r   rU  r   r   r   r   r   r]  rd  r7   r   r  )
r  start_logits
end_logitsr   r  r  r\  r  r   r  )rK   r^  r   r3  r8  r  r]   r   r  r  r  splitr  r   r   r9   r   r   r   r   r   rA   r[  r\  rZ  )r/   r  r   r  r  r  r  r  rU  r  r   r   r]  rd  re  rA   r  r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                              r3   rC   z UMT5ForQuestionAnswering.forwardg  s0   \ &1%<k$++B]B]!*!6IDKK<Q<Q	&=+DI
 $)>)F  U 
 !% 1 1) <!*!6IDKK<Q<Q	%0%<k$++B]B] ""ll#-+"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/ "/#1/!5# ' 
 *!,1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""="@"@ATAT"U=%%'(1, - 5 5b 9 < <Z=N=N O(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J//!"2EEWF/9/EZMF*Q6Q2%!+;;"1"?"?.99,==&5&G&G"1"?"?.99
 	
r4   r  )rD   rE   rF   r  r)   r  rQ  r   r+   r  r  r  rc  r^   r  r   rC   rG   rH   s   @r3   r%  r%  @  s    (7'6
.:
  .23759:>=A371526:>!%)-,0#'I
##d*I
 ))D0I
 !++d2	I

 !& 0 04 7I
 uU\\23d:I
 ))D0I
 ''$.I
 ((4/I
  %0047I
 $;I
  $;I
 #TkI
 D[I
  
u  	!$G	G!I
 I
r4   r%  )r$  r#  r%  r  r+  r"  r  )Jr   r  r   typingr   r+   r   torch.nnr   r   r   rT  r	   r   activationsr
   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   r   utils.genericr    configuration_umt5r"   !torch.nn.attention.flex_attentionr#   integrations.flex_attentionr$   
get_loggerrD   r   Moduler&   rJ   rc   rk   ru   r   r   r   r  r  rC  r"  r#  r$  r  r+  r%  __all__rA  r4   r3   <module>r     s         A A & ! C C ) > 9   .   : *  !;J			H	%+BII +4		 .RYY <")) $v)BII v)rRYY 4bii 8B* BLRYY $ j!/ j! j!Zm# m`	 h
# h
 h
V 
J)#6 J)
J)Z X* X Xv N
$7 N
N
b G
!4 G
 G
T p
2 p
 p
fr4   