
    iG                        d Z ddlmZ ddlmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5  e0jl                  e7      Z8 G d dejr                        Z:	 	 dUdejr                  dejv                  dejv                  dejv                  dejv                  dz  de<dz  de<de)e.   fd Z= G d! d"ejr                        Z> G d# d$ejr                        Z? G d% d&ejr                        Z@ G d' d(ejr                        ZA G d) d*ejr                        ZB G d+ d,ejr                        ZC G d- d.e      ZD G d/ d0ejr                        ZE G d1 d2ejr                        ZF G d3 d4ejr                        ZGe/ G d5 d6e'             ZHe e/d78       G d9 d:e-                    ZIe/ G d; d<eH             ZJ G d= d>ejr                        ZK G d? d@ejr                        ZL e/dA8       G dB dCeH             ZM e/dD8       G dE dFeH             ZN e/dG8       G dH dIeH             ZO e/dJ8       G dK dLeH             ZPe/ G dM dNeH             ZQe/ G dO dPeH             ZR e/dQ8       G dR dSeHe             ZSg dTZTy)VzPyTorch ELECTRA model.    )Callable)	dataclassN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FNget_activation)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentions)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplecheck_model_inputs   )ElectraConfigc                        e Zd ZdZ fdZ	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  ded	ej                  fd
Z
 xZS )ElectraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       | j#                  dt%        j*                  | j,                  j/                         t$        j0                        d       y )	N)padding_idxepsposition_idsr&   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandzerosr.   sizelongselfconfig	__class__s     v/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/electra/modeling_electra.pyr6   zElectraEmbeddings.__init__:   s   !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`"f&;&;AVAVWzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    N	input_idsr2   r.   inputs_embedspast_key_values_lengthreturnc                    ||j                         }n|j                         d d }|\  }}|| j                  d d |||z   f   }|t        | d      rT| j                  j	                  |j
                  d   d      }	t        j                  |	d|      }	|	j	                  ||      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  |      }||z   }| j                  |      }| j                  |      }|S )Nr0   r2   r   r&   )dimindex)r4   device)rJ   r.   hasattrr2   rH   shaperF   gatherrI   rK   rY   r;   r?   r=   r@   rD   )rM   rR   r2   r.   rS   rT   input_shape
batch_size
seq_lengthbuffered_token_type_idsr?   
embeddingsr=   s                rP   forwardzElectraEmbeddings.forwardL   sP     #..*K',,.s3K!,
J,,Q0FVlIl0l-lmL
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D"55
^^J/
\\*-
rQ   )NNNNr   )__name__
__module____qualname____doc__r6   rF   
LongTensorFloatTensorintTensorrb   __classcell__rO   s   @rP   r)   r)   7   s    Q
( .2260426&'(##d*( ((4/( &&-	(
 ((4/( !$( 
(rQ   r)   modulequerykeyvalueattention_maskscalingrD   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|#|d d d d d d d |j                  d   f   }||z   }t
        j                  j                  |d      }t
        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )	Nr0            r	   rW   )ptrainingr&   )rJ   rF   matmul	transposer[   r   
functionalsoftmaxrD   rz   
contiguous)
rm   rn   ro   rp   rq   rr   rD   rs   attn_weightsattn_outputs
             rP   eager_attention_forwardr   x   s     **R.D( <<s}}Q':;gEL!'1a399R=(@A#n4==((2(>L==((6??([L,,|U3K''1-88:K$$rQ   c                        e Zd Zd
 fd	Z	 	 	 ddej
                  dej                  dz  dedz  dej
                  dz  dee	   de
ej
                     fd	Z xZS )ElectraSelfAttentionNc                 @   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        | j                  dz  | _
        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                   |j"                        | _        |j&                  | _        || _        || _        y Nr   r9   zThe hidden size (z6) is not a multiple of the number of attention heads ()ru   )r5   r6   hidden_sizenum_attention_headsrZ   
ValueErrorrN   ri   attention_head_sizeall_head_sizerr   r   Linearrn   ro   rp   rB   attention_probs_dropout_probrD   
is_decoder	is_causal	layer_idxrM   rN   r   r   rO   s       rP   r6   zElectraSelfAttention.__init__   sP    : ::a?PVXhHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++""rQ   hidden_statesrq   past_key_valuescache_positionrs   rU   c                    |j                   d d }g |d| j                  } | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      }	 | j                  |      j                  | j	                  dd      }
|A|}t        |t              r|j                  }|j                  |	|
| j                  d|i      \  }	}
t        j                  | j                  j                  t               } || ||	|
|f| j"                  sdn| j$                  j&                  | j(                  d|\  }} |j*                  g |d j-                         }||fS )Nr0   r&   rv   r           rD   rr   )r[   r   rn   viewr|   ro   rp   
isinstancer   self_attention_cacheupdater   r   get_interfacerN   _attn_implementationr   rz   rD   ry   rr   reshaper   )rM   r   rq   r   r   rs   r]   hidden_shapequery_layer	key_layervalue_layercurrent_past_key_valuesattention_interfacer   r   s                  rP   rb   zElectraSelfAttention.forward   s    $))#2.CCbC$*B*BC 5djj/44lCMMaQRS0DHH]+00,?II!QO	4djj/44lCMMaQRS&&5#/+>?*9*N*N' &=%C%C!>2	&"I{ )@(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
!\ *k));;;;FFHL((rQ   FNNNNrc   rd   re   r6   rF   rj   rh   r   r   r!   tuplerb   rk   rl   s   @rP   r   r      s}    #6 48(,.2-)||-) ))D0-) 	-)
 t+-) +,-) 
u||	-)rQ   r   c                        e Zd Zd
 fd	Z	 	 	 ddej
                  dej                  dz  dej                  dz  dedz  dee	   de
ej
                     fd	Z xZS )ElectraCrossAttentionNc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        | j                  dz  | _
        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                   |j"                        | _        || _        || _        y r   )r5   r6   r   r   rZ   r   rN   ri   r   r   rr   r   r   rn   ro   rp   rB   r   rD   r   r   r   s       rP   r6   zElectraCrossAttention.__init__   sC    : ::a?PVXhHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF""rQ   r   encoder_hidden_statesrq   r   rs   rU   c                 V   |j                   d d \  }}|j                   d   }||d| j                  f}	||d| j                  f}
 | j                  |      j                  |	 j	                  dd      }|%|j
                  j                  | j                        nd}|]|r[|j                  j                  | j                     j                  }|j                  j                  | j                     j                  }n | j                  |      j                  |
 j	                  dd      } | j                  |      j                  |
 j	                  dd      }|C|j                  j                  ||| j                        \  }}d|j
                  | j                  <   t        j                   | j"                  j$                  t&              } || ||||f| j(                  sdn| j*                  j,                  | j.                  d|\  }}|j1                  ||d      j3                         }||fS )Nr0   r&   rv   FTr   r   )r[   r   rn   r   r|   
is_updatedgetr   cross_attention_cachelayerskeysvaluesro   rp   r   r   r   rN   r   r   rz   rD   ry   rr   r   r   )rM   r   r   rq   r   rs   bsztgt_lensrc_lenq_input_shapekv_input_shaper   r   r   r   r   r   r   s                     rP   rb   zElectraCrossAttention.forward   s    %**3B/W'--a0gr4+C+CDwD,D,DE 5djj/44mDNNqRSTGVGb_//33DNNChm
&:'==DDT^^TYYI)??FFt~~V]]K<!67<<nMWWXY[\]I@$**%:;@@.Q[[\]_`aK*)8)N)N)U)U{DNN*&	; >B**4>>:(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
!\ "))#w;FFHL((rQ   r   r   )rc   rd   re   r6   rF   rj   rh   r   r   r!   r   rb   rk   rl   s   @rP   r   r      s    #4 ;?376:2)||2)  %00472) ))D0	2)
 -t32) +,2) 
u||	2)rQ   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ElectraSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr,   )r5   r6   r   r   r   denser@   rA   rB   rC   rD   rL   s     rP   r6   zElectraSelfOutput.__init__/  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rQ   r   input_tensorrU   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S Nr   rD   r@   rM   r   r   s      rP   rb   zElectraSelfOutput.forward5  7    

=1]3}|'CDrQ   rc   rd   re   r6   rF   rj   rb   rk   rl   s   @rP   r   r   .  1    >U\\  RWR^R^ rQ   r   c                        e Zd Zd fd	Z	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dej
                  dz  d	ee	   d
e
ej
                     fdZ xZS )ElectraAttentionNc                     t         |           || _        |rt        nt        } ||||      | _        t        |      | _        y )Nr   r   )r5   r6   is_cross_attentionr   r   rM   r   output)rM   rN   r   r   r   attention_classrO   s         rP   r6   zElectraAttention.__init__>  s=    "43E/K_#Fi9U	'/rQ   r   rq   r   encoder_attention_maskr   r   rs   rU   c                     | j                   s|n|} | j                  |f||||d|\  }}	| j                  ||      }||	fS )N)r   rq   r   r   )r   rM   r   )
rM   r   rq   r   r   r   r   rs   attention_outputr   s
             rP   rb   zElectraAttention.forwardE  sg     04/F/FLb)2*
"7)+)*
 *
&,  ;;'7G--rQ   )FNFNNNNNr   rl   s   @rP   r   r   =  s    0 48:>;?(,.2.||. ))D0.  %0047	.
 !& 1 1D 8. . t+. +,. 
u||	.rQ   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ElectraIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r5   r6   r   r   r   intermediate_sizer   r   
hidden_actstrr   intermediate_act_fnrL   s     rP   r6   zElectraIntermediate.__init__^  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$rQ   r   rU   c                 J    | j                  |      }| j                  |      }|S r   )r   r   )rM   r   s     rP   rb   zElectraIntermediate.forwardf  s&    

=100?rQ   r   rl   s   @rP   r   r   ]  s#    9U\\ ell rQ   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ElectraOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r5   r6   r   r   r   r   r   r@   rA   rB   rC   rD   rL   s     rP   r6   zElectraOutput.__init__n  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rQ   r   r   rU   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      rP   rb   zElectraOutput.forwardt  r   rQ   r   rl   s   @rP   r   r   m  r   rQ   r   c                        e Zd Zd fd	Z	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dej
                  dz  d	ee	   d
e
ej
                     fdZd Z xZS )ElectraLayerNc                    t         |           |j                  | _        d| _        t	        ||j
                  |      | _        |j
                  | _        |j                  | _        | j                  r.| j
                  st        |  d      t	        |d|d      | _	        t        |      | _        t        |      | _        y )Nr&   r   z> should be used as a decoder model if cross attention is addedFT)r   r   r   )r5   r6   chunk_size_feed_forwardseq_len_dimr   r   	attentionadd_cross_attentionr   crossattentionr   intermediater   r   )rM   rN   r   rO   s      rP   r6   zElectraLayer.__init__}  s    '-'E'E$)&F<M<MYbc ++#)#=#= ##?? D6)g!hii"2##'	#D 07#F+rQ   r   rq   r   r   r   r   rs   rU   c                 "    | j                   ||f||d|\  }}	|}
| j                  r:|8t        | d      st        d|  d       | j                  |d ||fd|i|\  }}	|}
t        | j                  | j                  | j                  |
      }|S )N)r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r   r   rZ   r   r   r   feed_forward_chunkr   r   )rM   r   rq   r   r   r   r   rs   self_attention_output_r   cross_attention_outputlayer_outputs                rP   rb   zElectraLayer.forward  s     $24>>$
 ,)	$

 $
 q 1??4@4!12 =dV DD D 
 )<(;(;%%&	)
 !0) )%"A  60##T%A%A4CSCSUe
 rQ   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )rM   r   intermediate_outputr   s       rP   r   zElectraLayer.feed_forward_chunk  s,    "//0@A{{#68HIrQ   r   r   )rc   rd   re   r6   rF   rj   rh   r   r   r!   r   rb   r   rk   rl   s   @rP   r   r   |  s    ,, 48:>;?(,.2'||' ))D0'  %0047	'
 !& 1 1D 8' ' t+' +,' 
u||	'RrQ   r   c                       e Zd Z fdZ	 	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  d	ej
                  dz  d
e	e
   deej
                     ez  fdZ xZS )ElectraEncoderc           	          t         |           || _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w )N)r   )	r5   r6   rN   r   
ModuleListrangenum_hidden_layersr   layer)rM   rN   irO   s      rP   r6   zElectraEncoder.__init__  sF    ]]uU[UmUmOn#o!L1$E#op
#os   ANr   rq   r   r   r   	use_cacher   rs   rU   c                     t        | j                        D ]  \  }	}
 |
|||f|||d|} t        ||r|      S d       S )N)r   r   r   last_hidden_stater   )	enumerater   r   )rM   r   rq   r   r   r   r   r   rs   r   layer_modules              rP   rb   zElectraEncoder.forward  sq      )4 		OA|(% (> /- M		 9+/8O
 	
>B
 	
rQ   NNNNNN)rc   rd   re   r6   rF   rj   rh   r   boolr   r!   r   r   rb   rk   rl   s   @rP   r   r     s    q 48:>;?(,!%.2
||
 ))D0
  %0047	

 !& 1 1D 8
 
 $;
 t+
 +,
 
u||	H	H
rQ   r   c                   (     e Zd ZdZ fdZd Z xZS )ElectraDiscriminatorPredictionszEPrediction module for the discriminator, made up of two dense layers.c                    t         |           t        j                  |j                  |j                        | _        t        |j                        | _        t        j                  |j                  d      | _	        || _
        y Nr&   )r5   r6   r   r   r   r   r   r   
activationdense_predictionrN   rL   s     rP   r6   z(ElectraDiscriminatorPredictions.__init__  s^    YYv1163E3EF
():):; "		&*<*<a @rQ   c                     | j                  |      }| j                  |      }| j                  |      j                  d      }|S )Nr0   )r   r  r  squeeze)rM   discriminator_hidden_statesr   logitss       rP   rb   z'ElectraDiscriminatorPredictions.forward  s?    

#>?6&&}5==bArQ   rc   rd   re   rf   r6   rb   rk   rl   s   @rP   r   r     s    OrQ   r   c                   (     e Zd ZdZ fdZd Z xZS )ElectraGeneratorPredictionszAPrediction module for the generator, made up of two dense layers.c                     t         |           t        d      | _        t	        j
                  |j                  |j                        | _        t	        j                  |j                  |j                        | _
        y )Ngelur,   )r5   r6   r   r  r   r@   r9   rA   r   r   r   rL   s     rP   r6   z$ElectraGeneratorPredictions.__init__  sV    (0f&;&;AVAVWYYv1163H3HI
rQ   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r  r@   )rM   generator_hidden_statesr   s      rP   rb   z#ElectraGeneratorPredictions.forward  s3    

#:;6}5rQ   r	  rl   s   @rP   r  r    s    KJrQ   r  c                   F     e Zd ZeZdZdZdZdZdZ	dZ
eeedZ fdZ xZS )ElectraPreTrainedModelelectraT)r   
attentionscross_attentionsc                 6   t         |   |       t        |t              ryt	        j
                  |j                  t        j                  |j                  j                  d         j                  d             t	        j                  |j                         y y )Nr0   r/   )r5   _init_weightsr   r)   initcopy_r.   rF   rG   r[   rH   zeros_r2   )rM   rm   rO   s     rP   r  z$ElectraPreTrainedModel._init_weights  sm    f%f/0JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 1rQ   )rc   rd   re   r'   config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   r   _can_record_outputsr  rk   rl   s   @rP   r  r    sF     L!&*#N"&%*1/ /rQ   r  z3
    Output type of [`ElectraForPreTraining`].
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                     dz  ed<   dZe
ej                     dz  ed<   y)ElectraForPreTrainingOutputa+  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss of the ELECTRA objective.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Prediction scores of the head (scores for each token before SoftMax).
    Nlossr  r   r  )rc   rd   re   rf   r%  rF   rh   __annotations__r  r   r   r   rQ   rP   r$  r$    sg     &*D%

d
")'+FE$+59M5**+d2926Je''(4/6rQ   r$  c                       e Zd Z fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  de
ej                     dz  dedz  dej                  dz  dee   deej                     ez  fd              Zd Z xZS )ElectraModelc                 0   t         |   |       t        |      | _        |j                  |j
                  k7  r/t        j                  |j                  |j
                        | _        t        |      | _
        || _        d| _        | j                          y )NF)r5   r6   r)   ra   r9   r   r   r   embeddings_projectr   encoderrN   gradient_checkpointing	post_initrL   s     rP   r6   zElectraModel.__init__3  st     +F3  F$6$66&(ii0E0EvGYGY&ZD#%f-&+#rQ   c                 .    | j                   j                  S r   ra   r;   rM   s    rP   get_input_embeddingsz!ElectraModel.get_input_embeddings@  s    ...rQ   c                 &    || j                   _        y r   r0  )rM   rp   s     rP   set_input_embeddingsz!ElectraModel.set_input_embeddingsC  s    */'rQ   NrR   rq   r2   r.   rS   r   r   r   r   r   rs   rU   c           
      D   | j                   j                  r|	|	n| j                   j                  }	nd}	|	rd|b|| j                   j                  r4t	        t        | j                         t        | j                               nt        | j                         }|d u |d uz  rt        d      ||j                  }|j                  }n|j                  }|j                  d d }|d   }||j                         nd}|
t        j                  |||z   |      }
| j                  |||||      }t        | d	      r| j                  |      }| j                  |||||
|
      \  }} | j                   |f|||||	|d|}t#        |j$                  |j&                        S )NF)rN   z:You must specify exactly one of input_ids or inputs_embedsr0   r&   r   )rY   )rR   r.   r2   rS   rT   r+  )rq   r   embedding_outputr   r   r   )rq   r   r   r   r   r.   r   )rN   r   r   is_encoder_decoderr   r   r   rY   r[   get_seq_lengthrF   rG   ra   rZ   r+  _create_attention_masksr,  r   r   r   )rM   rR   rq   r2   r.   rS   r   r   r   r   r   rs   rY   r]   r_   rT   r6  encoder_outputss                     rP   rb   zElectraModel.forwardF  s     ;;!!%.%:	@U@UII0 )48V8V $L$DlZ^ZeZeFfg!5  -t";<YZZ %%F#//K"))F'--cr2K ^
ETE`!?!?!Afg!"\\*@BX[eBentuN??%)'#9 + 
 4-.#667GH151M1M)#9-"7)+ 2N 2
.. '$,,	
)"7#9+%	
 	
 9-??+;;
 	
rQ   c                     | j                   j                  rt        | j                   ||||      }nt        | j                   ||      }|t        | j                   |||      }||fS )N)rN   input_embedsrq   r   r   )rN   r<  rq   )rN   r<  rq   r   )rN   r   r   r   )rM   rq   r   r6  r   r   r   s          rP   r9  z$ElectraModel._create_attention_masks  sx     ;;!!/{{--- /N 7{{--N "-%>{{-5&;	&" 555rQ   )
NNNNNNNNNN)rc   rd   re   r6   r2  r4  r%   r"   rF   rj   listrh   r   r   r!   r   r   rb   r9  rk   rl   s   @rP   r)  r)  1  sI   /0  *..2.2,0-1596::>!%.2J
<<$&J
 t+J
 t+	J

 llT)J
 ||d*J
  %||d2J
 !&t 3J
 e//047J
 $;J
 t+J
 +,J
 
u||	A	AJ
  J
Z 6rQ   r)  c                   (     e Zd ZdZ fdZd Z xZS )ElectraClassificationHeadz-Head for sentence-level classification tasks.c                 z   t         |           t        j                  |j                  |j                        | _        |j                  |j                  n|j                  }t        d      | _	        t        j                  |      | _        t        j                  |j                  |j                        | _        y )Nr  )r5   r6   r   r   r   r   classifier_dropoutrC   r   r  rB   rD   
num_labelsout_projrM   rN   rA  rO   s      rP   r6   z"ElectraClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 )0zz"45		&"4"4f6G6GHrQ   c                     |d d dd d f   }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }|S )Nr   )rD   r   r  rC  )rM   featuresrs   xs       rP   rb   z!ElectraClassificationHead.forward  sZ    Q1WLLOJJqMOOALLOMM!rQ   r	  rl   s   @rP   r?  r?    s    7IrQ   r?  c                        e Zd ZdZdef fdZ	 d	dej                  dej                  dz  dej                  fdZ	 xZ
S )
ElectraSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`ElectraConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    rN   c                 f   t         |           t        |dd      | _        | j                  dk(  rt        t        j                         | _        t        |d      rq|j                  ret        |d      r(|j                  r|j                  dkD  r|j                  }n|j                  }t        j                  |j                  |      | _        t        |dd       }|rt        |      nt        j                         | _        t        j                         | _        t        |d      r3|j"                  dkD  r$t        j$                  |j"                        | _        t        j                         | _        t        |d	      r5|j(                  dkD  r%t        j$                  |j(                        | _        y y y )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)r5   r6   getattrrK  NotImplementedErrorr   IdentitysummaryrZ   rN  rO  rB  r   r   r   r  first_dropoutrQ  rB   last_dropoutrR  )rM   rN   num_classesactivation_stringrO   s       rP   r6   zElectraSequenceSummary.__init__  sU   #FNFC& &%{{}6-.63J3Jv78V=Z=Z_e_p_pst_t$//$0099V%7%7EDL#F,@$GIZN3D$E`b`k`k`m[[]6238T8TWX8X!#F,H,H!IDKKM612v7R7RUV7V "

6+F+F GD 8W2rQ   Nr   	cls_indexrU   c                    | j                   dk(  r|dddf   }n| j                   dk(  r|dddf   }n| j                   dk(  r|j                  d      }n| j                   d	k(  r|At        j                  |d
ddddf   |j                  d   dz
  t        j
                        }nX|j                  d      j                  d      }|j                  d|j                         dz
  z  |j                  d      fz         }|j                  d|      j                  d      }n| j                   dk(  rt        | j                        }| j                  |      }| j                  |      }| j!                  |      }|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        rL  Nr0   firstr   meanr&   rx   r[  .rw   r3   )r0   rM  )rK  r^  rF   	full_liker[   rK   	unsqueezerH   rW   rJ   r\   r  rT  rW  rV  r  rX  )rM   r   r[  r   s       rP   rb   zElectraSequenceSummary.forward  sn    &"1b5)F')"1a4(F&("''A'.F+- !OO!#rr1*-!''+a/**	 &//3==bA	%,,Uimmo6I-JmN`N`acNdMf-fg	"))"i8@@DF&(%%##F+f%(""6*rQ   r   )rc   rd   re   rf   r'   r6   rF   rh   rg   rb   rk   rl   s   @rP   rI  rI    sQ    2H} H< VZ)"..);@;K;Kd;R)			)rQ   rI  z
    ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee	   d
e
ej                     ez  fd              Z xZS ) ElectraForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |      | _        t        |      | _        | j                          y r   )	r5   r6   rB  rN   r)  r  r?  
classifierr.  rL   s     rP   r6   z)ElectraForSequenceClassification.__init__:  sH      ++#F+3F; 	rQ   NrR   rq   r2   r.   rS   labelsrs   rU   c           	          | j                   |f||||dd|}|d   }	| j                  |	      }
d}|| j                  j                  | j                  dk(  rd| j                  _        nl| j                  dkD  rL|j
                  t        j                  k(  s|j
                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt               }| j                  dk(  r& ||
j                         |j                               }n ||
|      }n| j                  j                  dk(  r=t               } ||
j                  d	| j                        |j                  d	            }n,| j                  j                  dk(  rt               } ||
|      }t        ||
|j                  |j                   
      S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Trq   r2   r.   rS   return_dictr   Nr&   
regressionsingle_label_classificationmulti_label_classificationr0   r%  r  r   r  )r  rd  rN   problem_typerB  r4   rF   rK   ri   r   r  r   r   r   r   r   r  )rM   rR   rq   r2   r.   rS   re  rs   r  sequence_outputr  r%  loss_fcts                rP   rb   z(ElectraForSequenceClassification.forwardD  s   $ '3dll'
))%''
 '
# 6a81{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'5CC2==	
 	
rQ   r   )rc   rd   re   r6   r$   r"   rF   rj   r   r!   r   r   rb   rk   rl   s   @rP   rb  rb  3  s      *..2.2,0-1&*9
<<$&9
 t+9
 t+	9

 llT)9
 ||d*9
 t#9
 +,9
 
u||	7	79
  9
rQ   rb  z
    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

    It is recommended to load the discriminator checkpoint into that model.
    c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee	   d
e
ej                     ez  fd              Z xZS )ElectraForPreTrainingc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )r5   r6   r)  r  r   discriminator_predictionsr.  rL   s     rP   r6   zElectraForPreTraining.__init__  s3     #F+)H)P&rQ   NrR   rq   r2   r.   rS   re  rs   rU   c           	          | j                   |f||||dd|}|d   }	| j                  |	      }
d}|t        j                         }|a|j	                  d|	j
                  d         dk(  }|
j	                  d|	j
                  d         |   }||   } |||j                               }n4 ||
j	                  d|	j
                  d         |j                               }t        ||
|j                  |j                        S )am  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see `input_ids` docstring)
            Indices should be in `[0, 1]`:

            - 0 indicates the token is an original token,
            - 1 indicates the token was replaced.

        Examples:

        ```python
        >>> from transformers import ElectraForPreTraining, AutoTokenizer
        >>> import torch

        >>> discriminator = ElectraForPreTraining.from_pretrained("google/electra-base-discriminator")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")

        >>> sentence = "The quick brown fox jumps over the lazy dog"
        >>> fake_sentence = "The quick brown fox fake over the lazy dog"

        >>> fake_tokens = tokenizer.tokenize(fake_sentence, add_special_tokens=True)
        >>> fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt")
        >>> discriminator_outputs = discriminator(fake_inputs)
        >>> predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)

        >>> fake_tokens
        ['[CLS]', 'the', 'quick', 'brown', 'fox', 'fake', 'over', 'the', 'lazy', 'dog', '[SEP]']

        >>> predictions.squeeze().tolist()
        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        ```Trg  r   Nr0   r&   rl  )
r  rs  r   r   r   r[   floatr$  r   r  )rM   rR   rq   r2   r.   rS   re  rs   r  discriminator_sequence_outputr  r%  ro  active_lossactive_logitsactive_labelss                   rP   rb   zElectraForPreTraining.forward  s/   V '3dll'
))%''
 '
# )DA(F%//0MN++-H),11"6S6Y6YZ[6\]abb &B0M0S0STU0V WXc d &{ 3}/B/B/DEB0M0S0STU0V WY_YeYeYgh*5CC2==	
 	
rQ   r   )rc   rd   re   r6   r$   r"   rF   rj   r   r!   r   r$  rb   rk   rl   s   @rP   rq  rq    s      *..2.2,0-1&*F
<<$&F
 t+F
 t+	F

 llT)F
 ||d*F
 t#F
 +,F
 
u||	:	:F
  F
rQ   rq  z
    Electra model with a language modeling head on top.

    Even though both the discriminator and generator may be loaded into this model, the generator is the only model of
    the two to have been trained for the masked language modeling task.
    c                   >    e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dee   dee	j                     ez  fd              Z xZS )ElectraForMaskedLMgenerator_lm_head.weight)electra.embeddings.word_embeddings.weightc                     t         |   |       t        |      | _        t	        |      | _        t        j                  |j                  |j                        | _
        | j                          y r   )r5   r6   r)  r  r  generator_predictionsr   r   r9   r8   generator_lm_headr.  rL   s     rP   r6   zElectraForMaskedLM.__init__  sR     #F+%@%H"!#6+@+@&BSBS!TrQ   c                     | j                   S r   r  r1  s    rP   get_output_embeddingsz(ElectraForMaskedLM.get_output_embeddings      %%%rQ   c                     || _         y r   r  )rM   r;   s     rP   set_output_embeddingsz(ElectraForMaskedLM.set_output_embeddings  s
    !0rQ   NrR   rq   r2   r.   rS   re  rs   rU   c           	      p    | j                   |f||||dd|}|d   }	| j                  |	      }
| j                  |
      }
d}|Pt        j                         } ||
j                  d| j                  j                        |j                  d            }t        ||
|j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Trg  r   Nr0   rl  )r  r  r  r   r   r   rN   r8   r   r   r  )rM   rR   rq   r2   r.   rS   re  rs   r  generator_sequence_outputprediction_scoresr%  ro  s                rP   rb   zElectraForMaskedLM.forward  s    $ #/$,,#
))%'#
 #
 %<A$>! 667PQ 223DE**,H-222t{{7M7MNPVP[P[\^P_`D$1??.99	
 	
rQ   r   )rc   rd   re   _tied_weights_keysr6   r  r  r$   r"   rF   rj   r   r!   r   r   rb   rk   rl   s   @rP   r{  r{    s     56ab&1  *..2.2,0-1&*)
<<$&)
 t+)
 t+	)

 llT))
 ||d*)
 t#)
 +,)
 
u||	~	-)
  )
rQ   r{  z
    Electra model with a token classification head on top.

    Both the discriminator and generator may be loaded into this model.
    c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee	   d
e
ej                     ez  fd              Z xZS )ElectraForTokenClassificationc                 `   t         |   |       |j                  | _        t        |      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y r   )r5   r6   rB  r)  r  rA  rC   r   rB   rD   r   r   rd  r.  rD  s      rP   r6   z&ElectraForTokenClassification.__init__.  s      ++#F+)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJrQ   NrR   rq   r2   r.   rS   re  rs   rU   c           	      H    | j                   |f||||dd|}|d   }	| j                  |	      }	| j                  |	      }
d}|<t               } ||
j	                  d| j
                        |j	                  d            }t        ||
|j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Trg  r   Nr0   rl  )	r  rD   rd  r   r   rB  r   r   r  )rM   rR   rq   r2   r.   rS   re  rs   r  rv  r  r%  ro  s                rP   rb   z%ElectraForTokenClassification.forward;  s      '3dll'
))%''
 '
# )DA(F%(,5R(S%!>?')HFKKDOO<fkk"oND$5CC2==	
 	
rQ   r   )rc   rd   re   r6   r$   r"   rF   rj   r   r!   r   r   rb   rk   rl   s   @rP   r  r  &  s      *..2.2,0-1&*&
<<$&&
 t+&
 t+	&

 llT)&
 ||d*&
 t#&
 +,&
 
u||	4	4&
  &
rQ   r  c                   R    e Zd ZeZdZ fdZee	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  dee   dee	j                     ez  fd              Z xZS )ElectraForQuestionAnsweringr  c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r   )
r5   r6   rB  r)  r  r   r   r   
qa_outputsr.  rL   s     rP   r6   z$ElectraForQuestionAnswering.__init__k  sS      ++#F+))F$6$68I8IJ 	rQ   NrR   rq   r2   r.   rS   start_positionsend_positionsrs   rU   c           	          | j                   |f||||dd|}	|	d   }
| j                  |
      }|j                  dd      \  }}|j                  d      j	                         }|j                  d      j	                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }t        ||||	j                  |	j                  	      S )
NTrg  r   r&   r0   rx   )ignore_indexrv   )r%  start_logits
end_logitsr   r  )r  r  splitr  r   lenrJ   clampr   r   r   r  )rM   rR   rq   r2   r.   rS   r  r  rs   r  rn  r  r  r  
total_lossignored_indexro  
start_lossend_losss                      rP   rb   z#ElectraForQuestionAnswering.forwardu  s    '3dll'
))%''
 '
# 6a81#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J+%!5CC2==
 	
rQ   )NNNNNNN)rc   rd   re   r'   r  r  r6   r$   r"   rF   rj   r   r!   r   r   rb   rk   rl   s   @rP   r  r  f  s     L!  *..2.2,0-1/3-13
<<$&3
 t+3
 t+	3

 llT)3
 ||d*3
 ,3
 ||d*3
 +,3
 
u||	;	;3
  3
rQ   r  c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee	   d
e
ej                     ez  fd              Z xZS )ElectraForMultipleChoicec                     t         |   |       t        |      | _        t	        |      | _        t        j                  |j                  d      | _	        | j                          y r  )r5   r6   r)  r  rI  sequence_summaryr   r   r   rd  r.  rL   s     rP   r6   z!ElectraForMultipleChoice.__init__  sM     #F+ 6v >))F$6$6: 	rQ   NrR   rq   r2   r.   rS   re  rs   rU   c           	         ||j                   d   n|j                   d   }|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|1|j                  d|j                  d      |j                  d            nd} | j                  |f||||dd|}	|	d   }
| j	                  |
      }| j                  |      }|j                  d|      }d}|t               } |||      }t        |||	j                  |	j                        S )	a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr&   r0   rw   Trg  r   rl  )
r[   r   rJ   r  r  rd  r   r   r   r  )rM   rR   rq   r2   r.   rS   re  rs   num_choicesr  rn  pooled_outputr  reshaped_logitsr%  ro  s                   rP   rb   z ElectraForMultipleChoice.forward  s   T -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 '3dll'
))%''
 '
# 6a8--o>/ ++b+6')HOV4D("5CC2==	
 	
rQ   r   )rc   rd   re   r6   r$   r"   rF   rj   r   r!   r   r   rb   rk   rl   s   @rP   r  r    s      *..2.2,0-1&*N
<<$&N
 t+N
 t+	N

 llT)N
 ||d*N
 t#N
 +,N
 
u||	8	8N
  N
rQ   r  zS
    ELECTRA Model with a `language modeling` head on top for CLM fine-tuning.
    c                        e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dedz  dedz  de	j                  dz  dee	j                  z  dee   dee	j                     ez  fd              Z xZS )ElectraForCausalLMr|  r}  c                 $   t         |   |       |j                  st        j	                  d       t        |      | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y )NzOIf you want to use `ElectraForCausalLM` as a standalone, add `is_decoder=True.`)r5   r6   r   loggerwarningr)  r  r  r  r   r   r9   r8   r  r.  rL   s     rP   r6   zElectraForCausalLM.__init__  sh       NNlm#F+%@%H"!#6+@+@&BSBS!TrQ   c                     | j                   S r   r  r1  s    rP   r  z(ElectraForCausalLM.get_output_embeddings   r  rQ   c                     || _         y r   r  )rM   new_embeddingss     rP   r  z(ElectraForCausalLM.set_output_embeddings#  s
    !/rQ   NrR   rq   r2   r.   rS   r   r   re  r   r   r   logits_to_keeprs   rU   c                    |d}
 | j                   |f|||||||	|
|dd
|}|j                  }t        |t              rt	        | d      n|}| j                  | j                  |dd|ddf               }d}|* | j                  d||| j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a3  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ElectraForCausalLM, ElectraConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-generator")
        >>> config = ElectraConfig.from_pretrained("google/electra-base-generator")
        >>> config.is_decoder = True
        >>> model = ElectraForCausalLM.from_pretrained("google/electra-base-generator", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NFT)
rq   r2   r.   rS   r   r   r   r   r   rh  )r  re  r8   )r%  r  r   r   r  r  r'  )r  r   r   ri   slicer  r  loss_functionrN   r8   r   r   r   r  r  )rM   rR   rq   r2   r.   rS   r   r   re  r   r   r   r  rs   outputsr   slice_indicesr  r%  s                      rP   rb   zElectraForCausalLM.forward&  s   P I=IT\\>
))%'"7#9+)>
 >
  118B>SV8W~ot4]k''(B(B=QRTacdQdCe(fg%4%%pVFt{{OeOepiopD0#33!//))$55
 	
rQ   )NNNNNNNNNNNr   )rc   rd   re   r  r6   r  r  r$   r"   rF   rj   r   r   ri   r   r!   r   r   rb   rk   rl   s   @rP   r  r    ss    56ab
&0  *..2.2,0-1596:&*(,!%.2-.H
<<$&H
 t+H
 t+	H

 llT)H
 ||d*H
  %||d2H
 !&t 3H
 t#H
 H
 $;H
 t+H
 ell*H
 +,H
 
u||	@	@H
  H
rQ   r  )	r  r{  r  rq  r  rb  r  r)  r  )Nr   )Urf   collections.abcr   dataclassesr   rF   r   torch.nnr   r   r    r
   r  activationsr   r   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr    r!   r"   r#   utils.genericr$   r%   configuration_electrar'   
get_loggerrc   r  Moduler)   rj   ru  r   r   r   r   r   r   r   r   r   r   r  r  r$  r)  r?  rI  rb  rq  r{  r  r  r  r  __all__r'  rQ   rP   <module>r     sH    $ !   A A & 1 C C ) J 9	 	 	 G & 6  B 0 
		H	%=		 =N !%II%<<% 
% <<	%
 LL4'% T\% % '(%<F)299 F)TJ)BII J)\		 .ryy .@"))  BII @- @H
RYY 
Dbii &")) $ /_ / /* 
7+ 7 7 C6) C6 C6L		 0`RYY `F F
'= F
F
R Q
2 Q
Q
h >
/ >
>
B 6
$: 6
6
r C
"8 C
 C
L [
5 [
 [
| 
_
/ _

_
D
rQ   