
    i+                        d Z ddlmZ ddlmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5  e0jl                  e7      Z8 G d dejr                        Z:	 	 d[dejr                  dejv                  dejv                  dejv                  dejv                  dz  de<dz  de<de)e.   fd Z= G d! d"ejr                        Z> G d# d$ejr                        Z? G d% d&ejr                        Z@ G d' d(ejr                        ZA G d) d*ejr                        ZB G d+ d,ejr                        ZC G d- d.e      ZD G d/ d0ejr                        ZE G d1 d2ejr                        ZF G d3 d4ejr                        ZG G d5 d6ejr                        ZH G d7 d8ejr                        ZI G d9 d:ejr                        ZJ G d; d<ejr                        ZKe/ G d= d>e'             ZLe e/d?@       G dA dBe-                    ZM e/dC@       G dD dEeL             ZN e/dF@       G dG dHeL             ZO e/dI@       G dJ dKeLe             ZPe/ G dL dMeL             ZQ e/dN@       G dO dPeL             ZR e/dQ@       G dR dSeL             ZSe/ G dT dUeL             ZTe/ G dV dWeL             ZUe/ G dX dYeL             ZVg dZZWy)\zPyTorch BERT model.    )Callable)	dataclassN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)	)BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplecheck_model_inputs   )
BertConfigc                        e Zd ZdZ fdZ	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  ded	ej                  fd
Z
 xZS )BertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       | j#                  dt%        j*                  | j,                  j/                         t$        j0                        d       y )	N)padding_idxepsposition_idsr&   F)
persistenttoken_type_ids)dtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandzerosr.   sizelongselfconfig	__class__s     p/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.pyr5   zBertEmbeddings.__init__7   s   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    N	input_idsr2   r.   inputs_embedspast_key_values_lengthreturnc                    ||j                         }n|j                         d d }|\  }}|| j                  d d |||z   f   }|t        | d      rT| j                  j	                  |j
                  d   d      }	t        j                  |	d|      }	|	j	                  ||      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  |      }||z   }| j                  |      }| j                  |      }|S )Nr0   r2   r   r&   )dimindex)r3   device)rI   r.   hasattrr2   rG   shaperE   gatherrH   rJ   rX   r:   r>   r<   r?   rC   )rL   rQ   r2   r.   rR   rS   input_shape
batch_size
seq_lengthbuffered_token_type_idsr>   
embeddingsr<   s                rO   forwardzBertEmbeddings.forwardG   sP     #..*K',,.s3K!,
J,,Q0FVlIl0l-lmL
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D"55
^^J/
\\*-
rP   )NNNNr   )__name__
__module____qualname____doc__r5   rE   
LongTensorFloatTensorintTensorra   __classcell__rN   s   @rO   r)   r)   4   s    Q
$ .2260426&'(##d*( ((4/( &&-	(
 ((4/( !$( 
(rP   r)   modulequerykeyvalueattention_maskscalingrC   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|#|d d d d d d d |j                  d   f   }||z   }t
        j                  j                  |d      }t
        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )	Nr0            r	   rV   )ptrainingr&   )rI   rE   matmul	transposerZ   r   
functionalsoftmaxrC   ry   
contiguous)
rl   rm   rn   ro   rp   rq   rC   rr   attn_weightsattn_outputs
             rO   eager_attention_forwardr   r   s     **R.D( <<s}}Q':;gEL!'1a399R=(@A#n4==((2(>L==((6??([L,,|U3K''1-88:K$$rP   c                        e Zd Zd
 fd	Z	 	 	 ddej
                  dej                  dz  dedz  dej
                  dz  dee	   de
ej
                     fd	Z xZS )BertSelfAttentionNc                 @   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        | j                  dz  | _
        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                   |j"                        | _        |j&                  | _        || _        || _        y Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()rt   )r4   r5   r8   num_attention_headsrY   
ValueErrorrM   rh   attention_head_sizeall_head_sizerq   r   Linearrm   rn   ro   rA   attention_probs_dropout_probrC   
is_decoder	is_causal	layer_idxrL   rM   r   r   rN   s       rO   r5   zBertSelfAttention.__init__   sP    : ::a?PVXhHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++""rP   hidden_statesrp   past_key_valuescache_positionrr   rT   c                    |j                   d d }g |d| j                  } | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      }	 | j                  |      j                  | j	                  dd      }
|A|}t        |t              r|j                  }|j                  |	|
| j                  d|i      \  }	}
t        j                  | j                  j                  t               } || ||	|
|f| j"                  sdn| j$                  j&                  | j(                  d|\  }} |j*                  g |d j-                         }||fS )Nr0   r&   ru   r           rC   rq   )rZ   r   rm   viewr{   rn   ro   
isinstancer   self_attention_cacheupdater   r   get_interfacerM   _attn_implementationr   ry   rC   rx   rq   reshaper~   )rL   r   rp   r   r   rr   r\   hidden_shapequery_layer	key_layervalue_layercurrent_past_key_valuesattention_interfacer   r   s                  rO   ra   zBertSelfAttention.forward   s    $))#2.CCbC$*B*BC 5djj/44lCMMaQRS0DHH]+00,?II!QO	4djj/44lCMMaQRS&&5#/+>?*9*N*N' &=%C%C!>2	&"I{ )@(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
!\ *k));;;;FFHL((rP   FNNNNrb   rc   rd   r5   rE   ri   rg   r   r   r!   tuplera   rj   rk   s   @rO   r   r      s}    #6 48(,.2-)||-) ))D0-) 	-)
 t+-) +,-) 
u||	-)rP   r   c                        e Zd Zd
 fd	Z	 	 	 ddej
                  dej                  dz  dej                  dz  dedz  dee	   de
ej
                     fd	Z xZS )BertCrossAttentionNc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        | j                  dz  | _
        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                   |j"                        | _        || _        || _        y r   )r4   r5   r8   r   rY   r   rM   rh   r   r   rq   r   r   rm   rn   ro   rA   r   rC   r   r   r   s       rO   r5   zBertCrossAttention.__init__   sC    : ::a?PVXhHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF""rP   r   encoder_hidden_statesrp   r   rr   rT   c                 V   |j                   d d \  }}|j                   d   }||d| j                  f}	||d| j                  f}
 | j                  |      j                  |	 j	                  dd      }|%|j
                  j                  | j                        nd}|]|r[|j                  j                  | j                     j                  }|j                  j                  | j                     j                  }n | j                  |      j                  |
 j	                  dd      } | j                  |      j                  |
 j	                  dd      }|C|j                  j                  ||| j                        \  }}d|j
                  | j                  <   t        j                   | j"                  j$                  t&              } || ||||f| j(                  sdn| j*                  j,                  | j.                  d|\  }}|j1                  ||d      j3                         }||fS )Nr0   r&   ru   FTr   r   )rZ   r   rm   r   r{   
is_updatedgetr   cross_attention_cachelayerskeysvaluesrn   ro   r   r   r   rM   r   r   ry   rC   rx   rq   r   r~   )rL   r   r   rp   r   rr   bsztgt_lensrc_lenq_input_shapekv_input_shaper   r   r   r   r   r   r   s                     rO   ra   zBertCrossAttention.forward   s    %**3B/W'--a0gr4+C+CDwD,D,DE 5djj/44mDNNqRSTGVGb_//33DNNChm
&:'==DDT^^TYYI)??FFt~~V]]K<!67<<nMWWXY[\]I@$**%:;@@.Q[[\]_`aK*)8)N)N)U)U{DNN*&	; >B**4>>:(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
!\ "))#w;FFHL((rP   r   r   )rb   rc   rd   r5   rE   ri   rg   r   r   r!   r   ra   rj   rk   s   @rO   r   r      s    #4 ;?376:2)||2)  %00472) ))D0	2)
 -t32) +,2) 
u||	2)rP   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )BertSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr,   )r4   r5   r   r   r8   denser?   r@   rA   rB   rC   rK   s     rO   r5   zBertSelfOutput.__init__&  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rP   r   input_tensorrT   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S Nr   rC   r?   rL   r   r   s      rO   ra   zBertSelfOutput.forward,  7    

=1]3}|'CDrP   rb   rc   rd   r5   rE   ri   ra   rj   rk   s   @rO   r   r   %  1    >U\\  RWR^R^ rP   r   c                        e Zd Zd fd	Z	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dej
                  dz  d	ee	   d
e
ej
                     fdZ xZS )BertAttentionNc                     t         |           || _        |rt        nt        } ||||      | _        t        |      | _        y )Nr   r   )r4   r5   is_cross_attentionr   r   rL   r   output)rL   rM   r   r   r   attention_classrN   s         rO   r5   zBertAttention.__init__4  s=    "40B,HY#Fi9U	$V,rP   r   rp   r   encoder_attention_maskr   r   rr   rT   c                     | j                   s|n|} | j                  |f||||d|\  }}	| j                  ||      }||	fS )N)r   rp   r   r   )r   rL   r   )
rL   r   rp   r   r   r   r   rr   attention_outputr   s
             rO   ra   zBertAttention.forward;  sg     04/F/FLb)2*
"7)+)*
 *
&,  ;;'7G--rP   )FNFNNNNNr   rk   s   @rO   r   r   3  s    - 48:>;?(,.2.||. ))D0.  %0047	.
 !& 1 1D 8. . t+. +,. 
u||	.rP   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )BertIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r4   r5   r   r   r8   intermediate_sizer   r   
hidden_actstrr   intermediate_act_fnrK   s     rO   r5   zBertIntermediate.__init__S  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$rP   r   rT   c                 J    | j                  |      }| j                  |      }|S r   )r   r   rL   r   s     rO   ra   zBertIntermediate.forward[  s&    

=100?rP   r   rk   s   @rO   r   r   R  s#    9U\\ ell rP   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )
BertOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r4   r5   r   r   r   r8   r   r?   r@   rA   rB   rC   rK   s     rO   r5   zBertOutput.__init__b  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rP   r   r   rT   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      rO   ra   zBertOutput.forwardh  r   rP   r   rk   s   @rO   r   r   a  r   rP   r   c                        e Zd Zd fd	Z	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dej
                  dz  d	ee	   d
e
ej
                     fdZd Z xZS )	BertLayerNc                    t         |           |j                  | _        d| _        t	        ||j
                  |      | _        |j
                  | _        |j                  | _        | j                  r.| j
                  st        |  d      t	        |d|d      | _	        t        |      | _        t        |      | _        y )Nr&   r   z> should be used as a decoder model if cross attention is addedFT)r   r   r   )r4   r5   chunk_size_feed_forwardseq_len_dimr   r   	attentionadd_cross_attentionr   crossattentionr   intermediater   r   )rL   rM   r   rN   s      rO   r5   zBertLayer.__init__p  s    '-'E'E$&v9J9JV_` ++#)#=#= ##?? D6)g!hii"/##'	#D -V4 (rP   r   rp   r   r   r   r   rr   rT   c                 "    | j                   ||f||d|\  }}	|}
| j                  r:|8t        | d      st        d|  d       | j                  |d ||fd|i|\  }}	|}
t        | j                  | j                  | j                  |
      }|S )N)r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r   r   rY   r   r   r   feed_forward_chunkr   r   )rL   r   rp   r   r   r   r   rr   self_attention_output_r   cross_attention_outputlayer_outputs                rO   ra   zBertLayer.forward  s     $24>>$
 ,)	$

 $
 q 1??4@4!12 =dV DD D 
 )<(;(;%%&	)
 !0) )%"A  60##T%A%A4CSCSUe
 rP   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )rL   r   intermediate_outputr   s       rO   r   zBertLayer.feed_forward_chunk  s,    "//0@A{{#68HIrP   r   r   )rb   rc   rd   r5   rE   ri   rg   r   r   r!   r   ra   r   rj   rk   s   @rO   r   r   o  s    ), 48:>;?(,.2'||' ))D0'  %0047	'
 !& 1 1D 8' ' t+' +,' 
u||	'RrP   r   c                       e Zd Z fdZ	 	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  d	ej
                  dz  d
e	e
   deej
                     ez  fdZ xZS )BertEncoderc           	          t         |           || _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w )N)r   )	r4   r5   rM   r   
ModuleListrangenum_hidden_layersr   layer)rL   rM   irN   s      rO   r5   zBertEncoder.__init__  sF    ]]ERXRjRjLk#lqIf$B#lm
#ls   ANr   rp   r   r   r   	use_cacher   rr   rT   c                     t        | j                        D ]  \  }	}
 |
|||f|||d|} t        ||r|      S d       S )N)r   r   r   )last_hidden_stater   )	enumerater   r   )rL   r   rp   r   r   r   r   r   rr   r   layer_modules              rO   ra   zBertEncoder.forward  sq      )4 		OA|(% (> /- M		 9+/8O
 	
>B
 	
rP   NNNNNN)rb   rc   rd   r5   rE   ri   rg   r   boolr   r!   r   r   ra   rj   rk   s   @rO   r   r     s    n 48:>;?(,!%.2
||
 ))D0
  %0047	

 !& 1 1D 8
 
 $;
 t+
 +,
 
u||	H	H
rP   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
BertPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r4   r5   r   r   r8   r   Tanh
activationrK   s     rO   r5   zBertPooler.__init__  s9    YYv1163E3EF
'')rP   r   rT   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r  )rL   r   first_token_tensorpooled_outputs       rO   ra   zBertPooler.forward  s6     +1a40

#566rP   r   rk   s   @rO   r   r     s#    $
U\\ ell rP   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )BertPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )r4   r5   r   r   r8   r   r   r   r   r   transform_act_fnr?   r@   rK   s     rO   r5   z$BertPredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STrP   r   rT   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r	  r?   r   s     rO   ra   z#BertPredictionHeadTransform.forward  s4    

=1--m<}5rP   r   rk   s   @rO   r  r    s$    UU\\ ell rP   r  c                   $     e Zd Z fdZd Z xZS )BertLMPredictionHeadc                    t         |           t        |      | _        t	        j
                  |j                  |j                  d      | _        t	        j                  t        j                  |j                              | _        y )NT)bias)r4   r5   r  	transformr   r   r8   r7   decoder	ParameterrE   rH   r  rK   s     rO   r5   zBertLMPredictionHead.__init__  s[    4V< yy!3!3V5F5FTRLLV->->!?@	rP   c                 J    | j                  |      }| j                  |      }|S r   )r  r  r   s     rO   ra   zBertLMPredictionHead.forward  s$    }5]3rP   rb   rc   rd   r5   ra   rj   rk   s   @rO   r  r    s    ArP   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )BertOnlyMLMHeadc                 B    t         |           t        |      | _        y r   )r4   r5   r  predictionsrK   s     rO   r5   zBertOnlyMLMHead.__init__  s    /7rP   sequence_outputrT   c                 (    | j                  |      }|S r   )r  )rL   r  prediction_scoress      rO   ra   zBertOnlyMLMHead.forward	  s     ,,_=  rP   r   rk   s   @rO   r  r    s#    8!u|| ! !rP   r  c                   $     e Zd Z fdZd Z xZS )BertOnlyNSPHeadc                 l    t         |           t        j                  |j                  d      | _        y Nru   )r4   r5   r   r   r8   seq_relationshiprK   s     rO   r5   zBertOnlyNSPHead.__init__  s'     "		&*<*<a @rP   c                 (    | j                  |      }|S r   )r  )rL   r  seq_relationship_scores      rO   ra   zBertOnlyNSPHead.forward  s    !%!6!6}!E%%rP   r  rk   s   @rO   r  r    s    A&rP   r  c                   $     e Zd Z fdZd Z xZS )BertPreTrainingHeadsc                     t         |           t        |      | _        t	        j
                  |j                  d      | _        y r  )r4   r5   r  r  r   r   r8   r  rK   s     rO   r5   zBertPreTrainingHeads.__init__  s4    /7 "		&*<*<a @rP   c                 N    | j                  |      }| j                  |      }||fS r   )r  r  )rL   r  r  r  r!  s        rO   ra   zBertPreTrainingHeads.forward  s0     ,,_=!%!6!6}!E "888rP   r  rk   s   @rO   r#  r#    s    A
9rP   r#  c                   n     e Zd ZeZdZdZdZdZdZ	dZ
eeedZ ej                           fd       Z xZS )BertPreTrainedModelbertT)r   
attentionscross_attentionsc                    t         |   |       t        |t              r t	        j
                  |j                         yt        |t              ryt	        j                  |j                  t        j                  |j                  j                  d         j                  d             t	        j
                  |j                         yy)zInitialize the weightsr0   r/   N)r4   _init_weightsr   r  initzeros_r  r)   copy_r.   rE   rF   rZ   rG   r2   )rL   rl   rN   s     rO   r,  z!BertPreTrainedModel._init_weights3  s     	f%f23KK$/JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 0rP   )rb   rc   rd   r'   config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   r   _can_record_outputsrE   no_gradr,  rj   rk   s   @rO   r'  r'  $  sX    L&*#N"&"'. U]]_/ /rP   r'  z0
    Output type of [`BertForPreTraining`].
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   y)BertForPreTrainingOutputa  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    Nlossprediction_logitsseq_relationship_logitsr   r)  )rb   rc   rd   re   r<  rE   rg   __annotations__r=  r>  r   r   r)   rP   rO   r;  r;  >  s~    	 &*D%

d
")26u((4/68<U..5<59M5**+d2926Je''(4/6rP   r;  a
  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    c                       e Zd ZddgZd fd	Zd Zd Zee	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dedz  dedz  de	j                  dz  dee   dee	j                     ez  fd              Zd Z xZS )	BertModelr)   r   c                     t         |   |       || _        d| _        t	        |      | _        t        |      | _        |rt        |      nd| _	        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        FN)r4   r5   rM   gradient_checkpointingr)   r`   r   encoderr   pooler	post_init)rL   rM   add_pooling_layerrN   s      rO   r5   zBertModel.__init__f  sU    
 	 &+#(0"6*,=j(4 	rP   c                 .    | j                   j                  S r   r`   r:   rL   s    rO   get_input_embeddingszBertModel.get_input_embeddingsw  s    ...rP   c                 &    || j                   _        y r   rJ  )rL   ro   s     rO   set_input_embeddingszBertModel.set_input_embeddingsz  s    */'rP   NrQ   rp   r2   r.   rR   r   r   r   r   r   rr   rT   c                 L   | j                   j                  r|	|	n| j                   j                  }	nd}	|	rd|b|| j                   j                  r4t	        t        | j                         t        | j                               nt        | j                         }|d u |d uz  rt        d      ||j                  }|j                  d   }n|j                  }|j                  d   }||j                         nd}|
t        j                  |||z   |      }
| j                  |||||      }| j                  |||||
|      \  }} | j                  |f|||||	|
|d	|}|j                  }| j                   | j!                  |      nd }t#        |||j$                  
      S )NF)rM   z:You must specify exactly one of input_ids or inputs_embedsr&   r   )rX   )rQ   r.   r2   rR   rS   )rp   r   embedding_outputr   r   r   )rp   r   r   r   r   r   r.   )r   pooler_outputr   )rM   r   r   is_encoder_decoderr   r   r   rX   rZ   get_seq_lengthrE   rF   r`   _create_attention_masksrE  r   rF  r   r   )rL   rQ   rp   r2   r.   rR   r   r   r   r   r   rr   rX   r^   rS   rP  encoder_outputsr  r  s                      rO   ra   zBertModel.forward}  s     ;;!!%.%:	@U@UII0 )48V8V $L$DlZ^ZeZeFfg!5  -t";<YZZ %%F"+J"))F&,,Q/JETE`!?!?!Afg!"\\*@BX[eBentuN??%)'#9 + 
 261M1M)#9-"7)+ 2N 2
.. '$,,

)"7#9+)%

 

 *;;8<8OO4UY;-'+;;
 	
rP   c                     | j                   j                  rt        | j                   ||||      }nt        | j                   ||      }|t        | j                   |||      }||fS )N)rM   input_embedsrp   r   r   )rM   rW  rp   )rM   rW  rp   r   )rM   r   r   r   )rL   rp   r   rP  r   r   r   s          rO   rT  z!BertModel._create_attention_masks  sx     ;;!!/{{--- /N 7{{--N "-%>{{-5&;	&" 555rP   )T)
NNNNNNNNNN)rb   rc   rd   _no_split_modulesr5   rL  rN  r%   r"   rE   ri   r   r   r   r!   r   r   ra   rT  rj   rk   s   @rO   rB  rB  W  sJ    *;7"/0  *..2.2,0-1596:(,!%.2K
<<$&K
 t+K
 t+	K

 llT)K
 ||d*K
  %||d2K
 !&t 3K
 K
 $;K
 t+K
 +,K
 
u||	K	KK
  K
Z 6rP   rB  z
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    c                   `    e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dee   dee	j                     ez  fd              Z xZS )BertForPreTraining&bert.embeddings.word_embeddings.weightcls.predictions.biaszcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )r4   r5   rB  r(  r#  clsrG  rK   s     rO   r5   zBertForPreTraining.__init__  s4     f%	'/ 	rP   c                 B    | j                   j                  j                  S r   r_  r  r  rK  s    rO   get_output_embeddingsz(BertForPreTraining.get_output_embeddings      xx##+++rP   c                     || j                   j                  _        |j                  | j                   j                  _        y r   r_  r  r  r  rL   new_embeddingss     rO   set_output_embeddingsz(BertForPreTraining.set_output_embeddings  ,    '5$$2$7$7!rP   NrQ   rp   r2   r.   rR   labelsnext_sentence_labelrr   rT   c           	          | j                   |f||||dd|}	|	dd \  }
}| j                  |
|      \  }}d}|u|st               } ||j                  d| j                  j
                        |j                  d            } ||j                  dd      |j                  d            }||z   }t        ||||	j                  |	j                        S )am  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
            pair (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        Trp   r2   r.   rR   return_dictNru   r0   )r<  r=  r>  r   r)  )	r(  r_  r   r   rM   r7   r;  r   r)  )rL   rQ   rp   r2   r.   rR   rj  rk  rr   outputsr  r  r  r!  
total_lossloss_fctmasked_lm_lossnext_sentence_losss                     rO   ra   zBertForPreTraining.forward  s   R $))
))%'
 
 *1!&48HH_m4\11
"5"A')H%&7&<&<RAWAW&XZ`ZeZefhZijN!)*@*E*Eb!*LNaNfNfgiNj!k'*<<J'/$:!//))
 	
rP   NNNNNNN)rb   rc   rd   _tied_weights_keysr5   rb  rh  r$   r"   rE   ri   r   r!   r   r;  ra   rj   rk   s   @rO   rZ  rZ    s    +S(>
,8  *..2.2,0-1&*37A
<<$&A
 t+A
 t+	A

 llT)A
 ||d*A
 t#A
 #\\D0A
 +,A
 
u||	7	7A
  A
rP   rZ  zP
    Bert Model with a `language modeling` head on top for CLM fine-tuning.
    c                        e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dedz  dedz  de	j                  dz  dee	j                  z  dee   dee	j                     ez  fd              Z xZS )BertLMHeadModelr[  r\  r]  c                     t         |   |       |j                  st        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzLIf you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`FrH  
r4   r5   r   loggerwarningrB  r(  r  r_  rG  rK   s     rO   r5   zBertLMHeadModel.__init__\  sL       NNijf>	"6* 	rP   c                 B    | j                   j                  j                  S r   ra  rK  s    rO   rb  z%BertLMHeadModel.get_output_embeddingsh  rc  rP   c                     || j                   j                  _        |j                  | j                   j                  _        y r   re  rf  s     rO   rh  z%BertLMHeadModel.set_output_embeddingsk  ri  rP   NrQ   rp   r2   r.   rR   r   r   rj  r   r   r   logits_to_keeprr   rT   c                    |d}
 | j                   |f|||||||	|
|dd
|}|j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }d}|* | j                  d||| j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
        NFT)
rp   r2   r.   rR   r   r   r   r   r   rn  )logitsrj  r7   )r<  r  r   r   r)  r*  r@  )r(  r   r   rh   slicer_  loss_functionrM   r7   r   r   r   r)  r*  )rL   rQ   rp   r2   r.   rR   r   r   rj  r   r   r   r  rr   ro  r   slice_indicesr  r<  s                      rO   ra   zBertLMHeadModel.forwardo  s    0 I@I		A
))%'"7#9+)A
 A
  118B>SV8W~ot4]k-=!(;<=%4%%pVFt{{OeOepiopD0#33!//))$55
 	
rP   )NNNNNNNNNNNr   )rb   rc   rd   ru  r5   rb  rh  r$   r"   rE   ri   r   r   rh   r   r!   r   r   ra   rj   rk   s   @rO   rw  rw  Q  sf    +S(>

,8  *..2.2,0-1596:&*(,!%.2-.8
<<$&8
 t+8
 t+	8

 llT)8
 ||d*8
  %||d28
 !&t 38
 t#8
 8
 $;8
 t+8
 ell*8
 +,8
 
u||	@	@8
  8
rP   rw  c                       e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dee   dee	j                     ez  fd              Z xZS )BertForMaskedLMr[  r\  r]  c                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzkIf you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fry  rz  rK   s     rO   r5   zBertForMaskedLM.__init__  sR     NN1
 f>	"6* 	rP   c                 B    | j                   j                  j                  S r   ra  rK  s    rO   rb  z%BertForMaskedLM.get_output_embeddings  rc  rP   c                     || j                   j                  _        |j                  | j                   j                  _        y r   re  rf  s     rO   rh  z%BertForMaskedLM.set_output_embeddings  ri  rP   NrQ   rp   r2   r.   rR   r   r   rj  rr   rT   c	                 >    | j                   |f||||||dd|	}
|
d   }| j                  |      }d}|Ft               } ||j                  d| j                  j
                        |j                  d            }t        |||
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        T)rp   r2   r.   rR   r   r   rn  r   Nr0   r<  r  r   r)  )	r(  r_  r   r   rM   r7   r   r   r)  )rL   rQ   rp   r2   r.   rR   r   r   rj  rr   ro  r  r  rr  rq  s                  rO   ra   zBertForMaskedLM.forward  s    ( $))

))%'"7#9

 

 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
rP   )NNNNNNNN)rb   rc   rd   ru  r5   rb  rh  r$   r"   rE   ri   r   r!   r   r   ra   rj   rk   s   @rO   r  r    s    +S(>
,8  *..2.2,0-1596:&*+
<<$&+
 t++
 t+	+

 llT)+
 ||d*+
  %||d2+
 !&t 3+
 t#+
 +,+
 
u||	~	-+
  +
rP   r  zT
    Bert Model with a `next sentence prediction (classification)` head on top.
    c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee	   d
e
ej                     ez  fd              Z xZS )BertForNextSentencePredictionc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )r4   r5   rB  r(  r  r_  rG  rK   s     rO   r5   z&BertForNextSentencePrediction.__init__  s4     f%	"6* 	rP   NrQ   rp   r2   r.   rR   rj  rr   rT   c           	          | j                   |f||||dd|}|d   }	| j                  |	      }
d}|2t               } ||
j                  dd      |j                  d            }t	        ||
|j
                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```
        Trm  r&   Nr0   ru   r  )r(  r_  r   r   r   r   r)  )rL   rQ   rp   r2   r.   rR   rj  rr   ro  r  seq_relationship_scoresrs  rq  s                rO   ra   z%BertForNextSentencePrediction.forward  s    N $))
))%'
 
  
"&((="9!')H!)*A*F*Fr1*Mv{{[]!_*#*!//))	
 	
rP   r   )rb   rc   rd   r5   r$   r"   rE   ri   r   r!   r   r   ra   rj   rk   s   @rO   r  r    s      *..2.2,0-1&*=
<<$&=
 t+=
 t+	=

 llT)=
 ||d*=
 t#=
 +,=
 
u||	:	:=
  =
rP   r  z
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee	   d
e
ej                     ez  fd              Z xZS )BertForSequenceClassificationc                 n   t         |   |       |j                  | _        || _        t	        |      | _        |j                  |j                  n|j                  }t        j                  |      | _
        t        j                  |j                  |j                        | _        | j                          y r   )r4   r5   
num_labelsrM   rB  r(  classifier_dropoutrB   r   rA   rC   r   r8   
classifierrG  rL   rM   r  rN   s      rO   r5   z&BertForSequenceClassification.__init__Q  s      ++f%	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rP   NrQ   rp   r2   r.   rR   rj  rr   rT   c           	          | j                   |f||||dd|}|d   }	| j                  |	      }	| j                  |	      }
d}|| j                  j                  | j
                  dk(  rd| j                  _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt               }| j
                  dk(  r& ||
j                         |j                               }n ||
|      }n| j                  j                  dk(  r=t               } ||
j                  d| j
                        |j                  d            }n,| j                  j                  dk(  rt               } ||
|      }t        ||
|j                   |j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Trm  r&   N
regressionsingle_label_classificationmulti_label_classificationr0   r  )r(  rC   r  rM   problem_typer  r3   rE   rJ   rh   r   squeezer   r   r   r   r   r)  )rL   rQ   rp   r2   r.   rR   rj  rr   ro  r  r  r<  rq  s                rO   ra   z%BertForSequenceClassification.forward`  s   $ $))
))%'
 
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
rP   r   )rb   rc   rd   r5   r$   r"   rE   ri   r   r!   r   r   ra   rj   rk   s   @rO   r  r  J  s      *..2.2,0-1&*;
<<$&;
 t+;
 t+	;

 llT);
 ||d*;
 t#;
 +,;
 
u||	7	7;
  ;
rP   r  c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee	   d
e
ej                     ez  fd              Z xZS )BertForMultipleChoicec                 *   t         |   |       t        |      | _        |j                  |j                  n|j
                  }t        j                  |      | _        t        j                  |j                  d      | _        | j                          y )Nr&   )r4   r5   rB  r(  r  rB   r   rA   rC   r   r8   r  rG  r  s      rO   r5   zBertForMultipleChoice.__init__  su     f%	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$6: 	rP   NrQ   rp   r2   r.   rR   rj  rr   rT   c           	         ||j                   d   n|j                   d   }|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|1|j                  d|j                  d      |j                  d            nd} | j                  |f||||dd|}	|	d   }
| j	                  |
      }
| j                  |
      }|j                  d|      }d}|t               } |||      }t        |||	j                  |	j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr&   r0   rv   Trm  r  )
rZ   r   rI   r(  rC   r  r   r   r   r)  )rL   rQ   rp   r2   r.   rR   rj  rr   num_choicesro  r  r  reshaped_logitsr<  rq  s                  rO   ra   zBertForMultipleChoice.forward  s   T -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 $))
))%'
 
  
]3/ ++b+6')HOV4D("!//))	
 	
rP   r   )rb   rc   rd   r5   r$   r"   rE   ri   r   r!   r   r   ra   rj   rk   s   @rO   r  r    s      *..2.2,0-1&*N
<<$&N
 t+N
 t+	N

 llT)N
 ||d*N
 t#N
 +,N
 
u||	8	8N
  N
rP   r  c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee	   d
e
ej                     ez  fd              Z xZS )BertForTokenClassificationc                 d   t         |   |       |j                  | _        t        |d      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y NFry  )r4   r5   r  rB  r(  r  rB   r   rA   rC   r   r8   r  rG  r  s      rO   r5   z#BertForTokenClassification.__init__  s      ++f>	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rP   NrQ   rp   r2   r.   rR   rj  rr   rT   c           	      H    | j                   |f||||dd|}|d   }	| j                  |	      }	| j                  |	      }
d}|<t               } ||
j	                  d| j
                        |j	                  d            }t        ||
|j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Trm  r   Nr0   r  )	r(  rC   r  r   r   r  r   r   r)  )rL   rQ   rp   r2   r.   rR   rj  rr   ro  r  r  r<  rq  s                rO   ra   z"BertForTokenClassification.forward  s      $))
))%'
 
 "!*,,71')HFKKDOO<fkk"oND$!//))	
 	
rP   r   )rb   rc   rd   r5   r$   r"   rE   ri   r   r!   r   r   ra   rj   rk   s   @rO   r  r    s      *..2.2,0-1&*'
<<$&'
 t+'
 t+	'

 llT)'
 ||d*'
 t#'
 +,'
 
u||	4	4'
  '
rP   r  c                   J    e Zd Z fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ee	   de
ej                     ez  fd              Z xZS )BertForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r  )
r4   r5   r  rB  r(  r   r   r8   
qa_outputsrG  rK   s     rO   r5   z!BertForQuestionAnswering.__init__@  sU      ++f>	))F$6$68I8IJ 	rP   NrQ   rp   r2   r.   rR   start_positionsend_positionsrr   rT   c           	          | j                   |f||||dd|}	|	d   }
| j                  |
      }|j                  dd      \  }}|j                  d      j	                         }|j                  d      j	                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }t        ||||	j                  |	j                  	      S )
NTrm  r   r&   r0   rw   )ignore_indexru   )r<  start_logits
end_logitsr   r)  )r(  r  splitr  r~   lenrI   clampr   r   r   r)  )rL   rQ   rp   r2   r.   rR   r  r  rr   ro  r  r  r  r  rp  ignored_indexrq  
start_lossend_losss                      rO   ra   z BertForQuestionAnswering.forwardJ  s    $))
))%'
 
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J+%!!//))
 	
rP   rt  )rb   rc   rd   r5   r$   r"   rE   ri   r   r!   r   r   ra   rj   rk   s   @rO   r  r  >  s      *..2.2,0-1/3-13
<<$&3
 t+3
 t+	3

 llT)3
 ||d*3
 ,3
 ||d*3
 +,3
 
u||	;	;3
  3
rP   r  )r  r  r  rZ  r  r  r  r   rw  rB  r'  )Nr   )Xre   collections.abcr   dataclassesr   rE   r   torch.nnr   r   r    r
   r-  activationsr   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr    r!   r"   r#   utils.genericr$   r%   configuration_bertr'   
get_loggerrb   r{  Moduler)   ri   floatr   r   r   r   r   r   r   r   r   r   r  r  r  r  r#  r'  r;  rB  rZ  rw  r  r  r  r  r  r  __all__r@  rP   rO   <module>r     s    $ !   A A & ! C C ) J 9
 
 
 G & 6 M M A * 
		H	%;RYY ;H !%II%<<% 
% <<	%
 LL4'% T\% % '(%:F)		 F)RJ) J)ZRYY .BII .>ryy  @* @F
")) 
D ")) "299  !bii !&bii &	9299 	9 // / /2 
7{ 7 7& 	I6# I6I6X Y
, Y
Y
x 
S
)? S

S
l I
) I
 I
X 
I
$7 I

I
X M
$7 M
M
` ^
/ ^
 ^
B 8
!4 8
 8
v @
2 @
 @
FrP   