
    iL                        d Z ddlmZ ddlZddlmZ ddlmZmZmZ ddl	m
Z ddlmZmZ dd	lmZmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2  e-jf                  e4      Z5 G d dejl                        Z7	 	 dMdejl                  dejp                  dejp                  dejp                  dejp                  dz  de9dz  de9de'e+   fdZ: G d  d!ejl                        Z; G d" d#ejl                        Z< G d$ d%ejl                        Z= G d& d'ejl                        Z> G d( d)ejl                        Z? G d* d+ejl                        Z@ G d, d-ejl                        ZA G d. d/e      ZB G d0 d1ejl                        ZC G d2 d3ejl                        ZDe, G d4 d5e%             ZE e,d67       G d8 d9eE             ZF e,d:7       G d; d<eEe             ZGe, G d= d>eE             ZH G d? d@ejl                        ZI e,dA7       G dB dCeE             ZJe, G dD dEeE             ZKe, G dF dGeE             ZL G dH dIejl                        ZMe, G dJ dKeE             ZNg dLZOy)NzPyTorch X-MOD model.    )CallableN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FNgelu)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)TransformersKwargsauto_docstringlogging)can_return_tuplecheck_model_inputs   )
XmodConfigc                        e Zd ZdZ fdZ	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  ded	ej                  fd
Z
ed        Zedd       Z xZS )XmodEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 T   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _
        t        j                  |j                        | _        | j                  dt!        j"                  |j$                        j'                  d      d       | j                  dt!        j(                  | j*                  j-                         t         j.                        d       |j                  | _        t        j                  |j$                  |j
                  | j0                        | _        y )	N)padding_idxepsposition_idsr$   F)
persistenttoken_type_ids)dtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangemax_position_embeddingsexpandzerosr,   sizelongr)   position_embeddingsselfconfig	__class__s     p/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/xmod/modeling_xmod.pyr3   zXmodEmbeddings.__init__5   s4   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
 "..#%<<**F,>,>DL\L\$
     N	input_idsr0   r,   inputs_embedspast_key_values_lengthreturnc                    |<|| j                  || j                  |      }n| j                  || j                        }||j                         }n|j                         d d }|\  }}|t	        | d      rT| j
                  j                  |j                  d   d      }	t        j                  |	d|      }	|	j                  ||      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j!                  |      }||z   }| j#                  |      }| j%                  |      }|S )Nr.   r0   r   r$   )dimindexr1   device)"create_position_ids_from_input_idsr)   &create_position_ids_from_inputs_embedsrF   hasattrr0   rD   shaperA   gatherrE   rG   r,   rW   r8   r:   rH   r;   r?   )rJ   rO   r0   r,   rP   rQ   input_shape
batch_size
seq_lengthbuffered_token_type_idsr:   
embeddingsrH   s                rM   forwardzXmodEmbeddings.forwardI   sn    $#FFt//1G   $JJ=Z^ZjZjk #..*K',,.s3K!,
J
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D"55
^^J/
\\*-
rN   c                     | j                         dd }|d   }t        j                  |dz   ||z   dz   t        j                  | j                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr.   r$   rV   r   )rF   rA   rB   rG   rW   	unsqueezerD   )rP   r)   r]   sequence_lengthr,   s        rM   rY   z5XmodEmbeddings.create_position_ids_from_inputs_embedsy   sp     $((*3B/%a.||!O_{:Q>ejjYfYmYm
 %%a(//<<rN   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r$   rT   )neintrA   cumsumtype_asrG   )rO   r)   rQ   maskincremental_indicess        rM   rX   z1XmodEmbeddings.create_position_ids_from_input_ids   sW     ||K(,,.$||Da8@@FI__cgg"'')K77rN   )NNNNr   )r   )__name__
__module____qualname____doc__r3   rA   
LongTensorFloatTensorri   Tensorrb   staticmethodrY   rX   __classcell__rL   s   @rM   r'   r'   2   s    Q
, .2260426&'.##d*. ((4/. &&-	.
 ((4/. !$. 
.` = =" 8 8rN   r'   modulequerykeyvalueattention_maskscalingr?   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|#|d d d d d d d |j                  d   f   }||z   }t
        j                  j                  |d      }t
        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )	Nr.            r   rg   )ptrainingr$   )rF   rA   matmul	transposer[   r   
functionalsoftmaxr?   r   
contiguous)
rx   ry   rz   r{   r|   r}   r?   r~   attn_weightsattn_outputs
             rM   eager_attention_forwardr      s     **R.D( <<s}}Q':;gEL!'1a399R=(@A#n4==((2(>L==((6??([L,,|U3K''1-88:K$$rN   c                        e Zd Zd
 fd	Z	 	 	 ddej
                  dej                  dz  dedz  dej
                  dz  dee	   de
ej
                     fd	Z xZS )XmodSelfAttentionNc                 @   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        | j                  dz  | _
        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                   |j"                        | _        |j&                  | _        || _        || _        y Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   )r2   r3   r6   num_attention_headsrZ   
ValueErrorrK   ri   attention_head_sizeall_head_sizer}   r   Linearry   rz   r{   r=   attention_probs_dropout_probr?   
is_decoder	is_causal	layer_idxrJ   rK   r   r   rL   s       rM   r3   zXmodSelfAttention.__init__   sP    : ::a?PVXhHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++""rN   hidden_statesr|   past_key_valuescache_positionr~   rR   c                    |j                   d d }g |d| j                  } | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      }	 | j                  |      j                  | j	                  dd      }
|A|}t        |t              r|j                  }|j                  |	|
| j                  d|i      \  }	}
t        j                  | j                  j                  t               } || ||	|
|f| j"                  sdn| j$                  j&                  | j(                  d|\  }} |j*                  g |d j-                         }||fS )Nr.   r$   r   r           r?   r}   )r[   r   ry   viewr   rz   r{   
isinstancer   self_attention_cacheupdater   r   get_interfacerK   _attn_implementationr   r   r?   r   r}   reshaper   )rJ   r   r|   r   r   r~   r]   hidden_shapequery_layer	key_layervalue_layercurrent_past_key_valuesattention_interfacer   r   s                  rM   rb   zXmodSelfAttention.forward   s    $))#2.CCbC$*B*BC 5djj/44lCMMaQRS0DHH]+00,?II!QO	4djj/44lCMMaQRS&&5#/+>?*9*N*N' &=%C%C!>2	&"I{ )@(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
!\ *k));;;;FFHL((rN   FNNNN)rn   ro   rp   r3   rA   rt   rs   r   r   r   tuplerb   rv   rw   s   @rM   r   r      s}    #6 48(,.2-)||-) ))D0-) 	-)
 t+-) +,-) 
u||	-)rN   r   c                        e Zd Zd
 fd	Z	 	 	 ddej
                  dej                  dz  dej                  dz  dedz  dee	   de
ej
                     fd	Z xZS )XmodCrossAttentionNc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        | j                  dz  | _
        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                   |j"                        | _        || _        || _        y r   )r2   r3   r6   r   rZ   r   rK   ri   r   r   r}   r   r   ry   rz   r{   r=   r   r?   r   r   r   s       rM   r3   zXmodCrossAttention.__init__  sC    : ::a?PVXhHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF""rN   r   encoder_hidden_statesr|   r   r~   rR   c                 V   |j                   d d \  }}|j                   d   }||d| j                  f}	||d| j                  f}
 | j                  |      j                  |	 j	                  dd      }|%|j
                  j                  | j                        nd}|]|r[|j                  j                  | j                     j                  }|j                  j                  | j                     j                  }n | j                  |      j                  |
 j	                  dd      } | j                  |      j                  |
 j	                  dd      }|C|j                  j                  ||| j                        \  }}d|j
                  | j                  <   t        j                   | j"                  j$                  t&              } || ||||f| j(                  sdn| j*                  j,                  | j.                  d|\  }}|j1                  ||d      j3                         }||fS )Nr.   r$   r   FTr   r   )r[   r   ry   r   r   
is_updatedgetr   cross_attention_cachelayerskeysvaluesrz   r{   r   r   r   rK   r   r   r   r?   r   r}   r   r   )rJ   r   r   r|   r   r~   bsztgt_lensrc_lenq_input_shapekv_input_shaper   r   r   r   r   r   r   s                     rM   rb   zXmodCrossAttention.forward  s    %**3B/W'--a0gr4+C+CDwD,D,DE 5djj/44mDNNqRSTGVGb_//33DNNChm
&:'==DDT^^TYYI)??FFt~~V]]K<!67<<nMWWXY[\]I@$**%:;@@.Q[[\]_`aK*)8)N)N)U)U{DNN*&	; >B**4>>:(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
!\ "))#w;FFHL((rN   r   r   )rn   ro   rp   r3   rA   rt   rs   r   r   r   r   rb   rv   rw   s   @rM   r   r     s    #4 ;?376:2)||2)  %00472) ))D0	2)
 -t32) +,2) 
u||	2)rN   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )XmodSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr*   )r2   r3   r   r   r6   denser;   r<   r=   r>   r?   rI   s     rM   r3   zXmodSelfOutput.__init__T  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rN   r   input_tensorrR   c                 T    | j                  |      }| j                  |      }||z   }|S N)r   r?   )rJ   r   r   s      rM   rb   zXmodSelfOutput.forwardZ  s.    

=1]3%4rN   rn   ro   rp   r3   rA   rt   rb   rv   rw   s   @rM   r   r   R  s1    >U\\  RWR^R^ rN   r   c                       e Zd Zd fd	Z	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  deeej                        dz  dej
                  dz  d	ee	   d
eej
                     fdZ
 xZS )XmodAttentionNc                     t         |           || _        |rt        nt        } ||||      | _        t        |      | _        |j                  | _        y )Nr   r   )	r2   r3   is_cross_attentionr   r   rJ   r   outputpre_norm)rJ   rK   r   r   r   attention_classrL   s         rM   r3   zXmodAttention.__init__b  sH    "40B,HY#Fi9U	$V,rN   r   r|   r   encoder_attention_maskr   r   r~   rR   c                 $   |}| j                   r| j                  j                  |      }| j                  s|n|} | j                  |f||||d|\  }	}
| j                  |	|      }	| j                   s| j                  j                  |	      }	|	|
fS )N)r   r|   r   r   )r   r   r;   r   rJ   )rJ   r   r|   r   r   r   r   r~   residualattention_outputr   s              rM   rb   zXmodAttention.forwardk  s     !== KK11-@M/3/F/FLb)2*
"7)+)*
 *
&,  ;;'7B}}#{{445EF--rN   )FNFNNNNN)rn   ro   rp   r3   rA   rt   rs   r   r   r   rb   rv   rw   s   @rM   r   r   a  s    ( 48:>;?BF.2.||. ))D0.  %0047	.
 !& 1 1D 8. uU%6%6784?. t+. +,. 
u||	.rN   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )XmodIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r2   r3   r   r   r6   intermediate_sizer   r   
hidden_actstrr
   intermediate_act_fnrI   s     rM   r3   zXmodIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$rN   r   rR   c                 J    | j                  |      }| j                  |      }|S r   )r   r   rJ   r   s     rM   rb   zXmodIntermediate.forward  s&    

=100?rN   r   rw   s   @rM   r   r     s#    9U\\ ell rN   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )XmodAdapterc                    t         |           |j                  |j                  z  | _        t        j                  |j                  | j                        | _        t        j                  | j                  |j                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r2   r3   r6   adapter_reduction_factorbottleneck_sizer   r   dense1dense2r   r   r   r
   adapter_act_fnrI   s     rM   r3   zXmodAdapter.__init__  s    %11V5T5TTii 2 2D4H4HIii 4 4f6H6HIf''-"():):";D"("3"3DrN   r   rR   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r   s     rM   rb   zXmodAdapter.forward  s4    M2++M:M2rN   r   rw   s   @rM   r   r     s#    4U\\ ell rN   r   c                        e Zd Z fdZdej
                  dej
                  dej
                  dej
                  fdZdej
                  dej
                  fdZ xZS )
XmodOutputc                    t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        |j                  | _	        t        j                  |j                        | _        |j                  r1t        j                  |j
                  |j                        | _        nd | _        |j                  | _        t        j                  i       | _        |j"                  D ]#  }t%        |      | j                   t'        |      <   % y r   )r2   r3   r   r   r   r6   r   r;   r<   ln_before_adapterr=   r>   r?   adapter_layer_normadapter_reuse_layer_norm
ModuleDictadapter_modules	languagesr   r   )rJ   rK   languagerL   s      rM   r3   zXmodOutput.__init__  s    YYv779K9KL
f&8&8f>S>ST!'!9!9zz&"<"<=$$&(ll63E3E6K`K`&aD#&*D#(.(G(G%!}}R0(( 	FH2=f2ED  X/	FrN   r   r   lang_idsrR   c                 x    | j                  |      }| j                  |      }||z   }| j                  ||      }|S r   )r   r?   lang_adapter)rJ   r   r   r   s       rM   rb   zXmodOutput.forward  s@    

=1]3%4))(MBrN   c                    | j                   s|}| j                  | j                  |      }n| j                  r| j                  |      }| j                   r|}t	        j
                  |      }t        | j                  j                               D ])  \  }}||k(  }||   } | j                  |   |      }	|	||<   + | j                  |      }|z  }|S r   )
r   r   r   r;   rA   
zeros_like	enumerater   r   r?   )
rJ   r   r   r   new_hidden_statesadapter_idxlang_key	lang_masklang_hidden_statesadapted_lang_hidden_statess
             rM   r   zXmodOutput.lang_adapter  s    %%$H"". 33MBM** NN=9M!!$H!,,];%.t/C/C/H/H/J%K 	F!K K/I!.y!9)G)=)=h)GHZ)[&+Ei(		F %67!rN   )	rn   ro   rp   r3   rA   rt   rb   r   rv   rw   s   @rM   r   r     s[    FU\\  Y^YeYe jojvjv U\\ %,, rN   r   c                   4    e Zd Zd fd	Z	 	 	 	 	 ddej
                  dej
                  dej                  dz  dej                  dz  dej                  dz  deeej                        dz  d	ej
                  dz  d
ee	   deej
                     fdZ
d Z xZS )	XmodLayerNc                    t         |           |j                  | _        d| _        t	        ||j
                  |      | _        |j
                  | _        |j                  | _        | j                  r.| j
                  st        |  d      t	        |d|d      | _	        t        |      | _        t        |      | _        |j                  | _        y )Nr$   r   z> should be used as a decoder model if cross attention is addedFT)r   r   r   )r2   r3   chunk_size_feed_forwardseq_len_dimr   r   	attentionadd_cross_attentionr   crossattentionr   intermediater   r   r   )rJ   rK   r   rL   s      rM   r3   zXmodLayer.__init__  s    '-'E'E$&v9J9JV_` ++#)#=#= ##?? D6)g!hii"/##'	#D -V4 (rN   r   r   r|   r   r   r   r   r~   rR   c                     | j                   ||f||d|\  }	}
|	}| j                  r:|8t        | d      st        d|  d       | j                  |d ||fd|i|\  }}
|}|}| j
                  r| j                  j                  |      }t        | j                  | j                  | j                  |      }| j                  |||      }| j
                  s| j                  j                  |      }|S )N)r   r   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )r  r   rZ   r   r  r   r   r;   r   feed_forward_chunkr  r  )rJ   r   r   r|   r   r   r   r   r~   self_attention_output_r   cross_attention_outputr   intermediate_outputlayer_outputs                   rM   rb   zXmodLayer.forward  s;    $24>>$
 ,)	$

 $
 q 1??4@4!12 =dV DD D 
 )<(;(; %&	)
 !0) )%"A  6#==#{{445EF7##((	
 {{#6(K}};;00>LrN   c                 $    | j                  |      S r   )r	  )rJ   r   s     rM   r  zXmodLayer.feed_forward_chunk$  s      !122rN   r   r   )rn   ro   rp   r3   rA   rt   rs   r   r   r   rb   r  rv   rw   s   @rM   r  r    s    (0 48:>;?BF.22||2 ,,2 ))D0	2
  %00472 !& 1 1D 82 uU%6%6784?2 t+2 +,2 
u||	2h3rN   r  c                   >    e Zd Z fdZ	 	 	 	 	 	 ddej
                  dej
                  dej                  dz  dej                  dz  dej                  dz  deeej                        dz  d	edz  d
ej
                  dz  de	e
   deej
                     ez  fdZ xZS )XmodEncoderc           	      b   t         |           || _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        |j                  | _
        | j                  r1t        j                  |j                  |j                        | _        y y c c}w )N)r   r*   )r2   r3   rK   r   
ModuleListrangenum_hidden_layersr  layerr   is_pre_normr;   r6   r<   )rJ   rK   irL   s      rM   r3   zXmodEncoder.__init__)  s    ]]ERXRjRjLk#lqIf$B#lm
!??\\&*<*<&BWBWXDN  $ms   B,Nr   r   r|   r   r   r   	use_cacher   r~   rR   c	           
          t        | j                        D ]  \  }
} ||||||||fi |	} | j                  r| j                  |      }t	        ||r|      S d       S )N)last_hidden_stater   )r   r  r  r;   r   )rJ   r   r   r|   r   r   r   r  r   r~   r  layer_modules               rM   rb   zXmodEncoder.forward1  s      )4 
	OA|(%&	 	M
	  NN=9M8+/8O
 	
>B
 	
rN   )NNNNNN)rn   ro   rp   r3   rA   rt   rs   r   boolr   r   r   rb   rv   rw   s   @rM   r  r  (  s    Y 48:>;?BF!%.2
||
 ,,
 ))D0	

  %0047
 !& 1 1D 8
 uU%6%6784?
 $;
 t+
 +,
 
u||	H	H
rN   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
XmodPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r2   r3   r   r   r6   r   Tanh
activationrI   s     rM   r3   zXmodPooler.__init__T  s9    YYv1163E3EF
'')rN   r   rR   c                 \    |d d df   }| j                  |      }| j                  |      }|S Nr   )r   r$  )rJ   r   first_token_tensorpooled_outputs       rM   rb   zXmodPooler.forwardY  s6     +1a40

#566rN   r   rw   s   @rM   r!  r!  S  s#    $
U\\ ell rN   r!  c                        e Zd ZeZdZdZg dZdZdZ	dZ
dZeeedZ ej"                          fd       ZdefdZd Z xZS )	XmodPreTrainedModelrobertaT)r'   r   r   )r   
attentionscross_attentionsc                    t         |   |       t        |t              r t	        j
                  |j                         yt        |t              ryt	        j                  |j                  t        j                  |j                  j                  d         j                  d             t	        j
                  |j                         yy)zInitialize the weightsr.   r-   N)r2   _init_weightsr   
XmodLMHeadinitzeros_biasr'   copy_r,   rA   rB   r[   rD   r0   )rJ   rx   rL   s     rM   r/  z!XmodPreTrainedModel._init_weightsr  s     	f%fj)KK$/JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 0rN   r   c           	          || j                   j                  vr0t        |  d| dt        | j                   j                               || j                   _        y)z
        Set the default language code for the model. This is used when the language is not specified in the input.

        Args:
            language (`str`): The language code, such as `"en_XX"` or `"de_DE"`.
        z does not have an adapter for z. Supported languages: N)rK   r   r   listdefault_language)rJ   r   s     rM   set_default_languagez(XmodPreTrainedModel.set_default_language|  s[     4;;000&6xj@WX\]a]h]h]r]rXsWtu  (0$rN   c                    t         j                  d       | j                  j                  j	                         D ]	  }d|_         t         j                  d       | j                  j                  j                  D ]x  }|j                  j                  0|j                  j                  j	                         D ]	  }d|_         |j                  j                  j	                         D ]	  }d|_         z y)z
        Freeze the embeddings and language adapters of the model. Usually, this is applied before the model is
        fine-tuned on a downstream task.
        zFreezing embeddingsFzFreezing adaptersN)loggerinfor+  ra   
parametersrequires_gradencoderr  r   r   r   )rJ   	parameterr  s      rM   'freeze_embeddings_and_language_adaptersz;XmodPreTrainedModel.freeze_embeddings_and_language_adapters  s    
 	)*00;;= 	,I&+I#	,'(\\))// 	0E||..:!&!@!@!K!K!M 4I.3I+4"\\99DDF 0	*/	'0		0rN   )rn   ro   rp   r%   config_classbase_model_prefixsupports_gradient_checkpointingno_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr  r   r   _can_record_outputsrA   no_gradr/  r   r8  r@  rv   rw   s   @rM   r*  r*  b  so    L!&*#TN"&"'. U]]_/ /0S 00rN   r*  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    )custom_introc                       e Zd Zd fd	Zd Zd Zee	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  deej                     dz  dedz  dej                  dz  dee   deej                     ez  fd              Zd Z xZS )	XmodModelc                     t         |   |       || _        d| _        t	        |      | _        t        |      | _        |rt        |      nd| _	        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        FN)r2   r3   rK   gradient_checkpointingr'   ra   r  r>  r!  pooler	post_init)rJ   rK   add_pooling_layerrL   s      rM   r3   zXmodModel.__init__  sU    
 	 &+#(0"6*,=j(4 	rN   c                 .    | j                   j                  S r   ra   r8   rJ   s    rM   get_input_embeddingszXmodModel.get_input_embeddings  s    ...rN   c                 &    || j                   _        y r   rT  )rJ   r{   s     rM   set_input_embeddingszXmodModel.set_input_embeddings  s    */'rN   NrO   r   r|   r0   r,   rP   r   r   r   r  r   r~   rR   c                    | j                   j                  r|
|
n| j                   j                  }
nd}
|
rd|	b|| j                   j                  r4t	        t        | j                         t        | j                               nt        | j                         }	|du |duz  rt        d      ||j                  }|j                  }n|j                  }|j                  dd }|\  }}||j                  n|j                  }|	|	j                         nd}|t        j                  |||z   |      }|| j                   j                  t        d      t        | j                  j                  d   j                   j"                  j%                               }|j'                  | j                   j                        }|t        j(                  ||      z  }| j+                  |||||	      }| j-                  ||||||	
      \  }} | j                  |f|||||	|
||d|}|d   }| j.                  | j/                  |      nd}t1        |||j2                        S )  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        NF)rK   z:You must specify exactly one of input_ids or inputs_embedsr.   r   )rW   zPInput language unknown. Please call `XmodPreTrainedModel.set_default_language()`)rO   r,   r0   rP   rQ   )r|   r   embedding_outputr   r   r   )r   r|   r   r   r   r  r   r,   )r  pooler_outputr   )rK   r   r  is_encoder_decoderr   r   r   rW   r[   get_seq_lengthrA   rB   r7  r6  r>  r  r   r   r   rU   onesra   _create_attention_masksrP  r   r   )rJ   rO   r   r|   r0   r,   rP   r   r   r   r  r   r~   rW   r]   r^   r_   rQ   adapter_languagesdefault_lang_idr[  encoder_outputssequence_outputr(  s                           rM   rb   zXmodModel.forward  s|   , ;;!!%.%:	@U@UII0 )48V8V $L$DlZ^ZeZeFfg!5  -t";<YZZ %%F#//K"))F'--cr2K!,
J%.%:!!@T@TETE`!?!?!Afg!"\\*@BX[eBentuN{{++3 !stt $T\\%7%7%:%A%A%Q%Q%V%V%X Y/55dkk6R6RSO&Jv)NNH??%)'#9 + 
 261M1M)#9-"7)+ 2N 2
.. '$,,
)"7#9+)%
 
 *!,8<8OO4UY;-'+;;
 	
rN   c                     | j                   j                  rt        | j                   ||||      }nt        | j                   ||      }|t        | j                   |||      }||fS )N)rK   input_embedsr|   r   r   )rK   rf  r|   )rK   rf  r|   r   )rK   r   r   r   )rJ   r|   r   r[  r   r   r   s          rM   r`  z!XmodModel._create_attention_masks#  sx     ;;!!/{{--- /N 7{{--N "-%>{{-5&;	&" 555rN   )T)NNNNNNNNNNN)rn   ro   rp   r3   rV  rX  r#   r    rA   rt   rr   r6  rs   r  r   r   r   r   rb   r`  rv   rw   s   @rM   rM  rM    sb   $/0  *.,0.2.2,0-1596::>!%.2\
<<$&\
 ""T)\
 t+	\

 t+\
 llT)\
 ||d*\
  %||d2\
 !&t 3\
 e//047\
 $;\
 t+\
 +,\
 
u||	K	K\
  \
~ 6rN   rM  zQ
    X-MOD Model with a `language modeling` head on top for CLM fine-tuning.
    c            "           e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  deee	j                        dz  dedz  de	j                  dz  dee	j                  z  dee   dee	j                     ez  fd              Z xZS )XmodForCausalLM)roberta.embeddings.word_embeddings.weightlm_head.biaszlm_head.decoder.weightzlm_head.decoder.biasc                     t         |   |       |j                  st        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzLIf you want to use `XmodLMHeadModel` as a standalone, add `is_decoder=True.`FrR  
r2   r3   r   r:  warningrM  r+  r0  lm_headrQ  rI   s     rM   r3   zXmodForCausalLM.__init__R  sL       NNij 5A!&) 	rN   c                 .    | j                   j                  S r   rp  decoderrU  s    rM   get_output_embeddingsz%XmodForCausalLM.get_output_embeddings_      ||###rN   c                 &    || j                   _        y r   rr  rJ   new_embeddingss     rM   set_output_embeddingsz%XmodForCausalLM.set_output_embeddingsc      -rN   NrO   r   r|   r0   r,   rP   r   r   labelsr   r  r   logits_to_keepr~   rR   c                    |	d} | j                   |f||||||||
||dd|}|j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }d}|	* | j                  d||	| j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )aS  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, XmodForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
        >>> config = AutoConfig.from_pretrained("facebook/xmod-base")
        >>> config.is_decoder = True
        >>> model = XmodForCausalLM.from_pretrained("facebook/xmod-base", config=config)
        >>> model.set_default_language("en_XX")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NFT)r   r|   r0   r,   rP   r   r   r   r  r   return_dict)logitsr{  r5   )lossr  r   r   r,  r-   )r+  r  r   ri   slicerp  loss_functionrK   r5   r   r   r   r,  r-  )rJ   rO   r   r|   r0   r,   rP   r   r   r{  r   r  r   r|  r~   outputsr   slice_indicesr  r  s                       rM   rb   zXmodForCausalLM.forwardf  s   Z I@LA
))%'"7#9+)A
 A
   118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD0#33!//))$55
 	
rN   )NNNNNNNNNNNNr   )rn   ro   rp   _tied_weights_keysr3   rt  ry  r"   r    rA   rr   rs   r   r  rt   ri   r   r   r   rb   rv   rw   s   @rM   rh  rh  F  s    #N .
$.  .2,037260426:>;?*.BF!%.2-.N
##d*N
 ""T)N
 ))D0	N

 ((4/N
 &&-N
 ((4/N
  %0047N
 !& 1 1D 8N
   4'N
 uU%6%6784?N
 $;N
 t+N
 ell*N
 +,N
  
u||	@	@!N
  N
rN   rh  c                       e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dee   dee	j                     ez  fd              Z xZS )XmodForMaskedLMri  rj  rk  c                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzkIf you want to use `XmodForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Frm  rn  rI   s     rM   r3   zXmodForMaskedLM.__init__  sR     NN1
 !5A!&) 	rN   c                 .    | j                   j                  S r   rr  rU  s    rM   rt  z%XmodForMaskedLM.get_output_embeddings  ru  rN   c                 &    || j                   _        y r   rr  rw  s     rM   ry  z%XmodForMaskedLM.set_output_embeddings  rz  rN   NrO   r   r|   r0   r,   rP   r   r   r{  r~   rR   c
                 @    | j                   |f|||||||dd|
}|d   }| j                  |      }d}|	Ft               } ||j                  d| j                  j
                        |	j                  d            }t        |||j                  |j                        S )a  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        T)r   r|   r0   r,   rP   r   r   r~  r   Nr.   r  r  r   r,  )	r+  rp  r   r   rK   r5   r   r   r,  )rJ   rO   r   r|   r0   r,   rP   r   r   r{  r~   r  rd  prediction_scoresmasked_lm_lossloss_fcts                   rM   rb   zXmodForMaskedLM.forward  s    0 $,,
))%'"7#9
 
 "!* LL9')H%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
rN   )	NNNNNNNNN)rn   ro   rp   r  r3   rt  ry  r"   r    rA   rr   rs   r   r   r   rt   r   rb   rv   rw   s   @rM   r  r    s?    #N . $.  .2,037260426:>;?*./
##d*/
 ""T)/
 ))D0	/

 ((4//
 &&-/
 ((4//
  %0047/
 !& 1 1D 8/
   4'/
 +,/
 
u||	~	-/
  /
rN   r  c                   (     e Zd ZdZ fdZd Z xZS )r0  z*Roberta Head for masked language modeling.c                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _
        t        j                  t        j                  |j                              | _        y r   )r2   r3   r   r   r6   r   r;   r<   
layer_normr5   rs  	ParameterrA   rE   r3  rI   s     rM   r3   zXmodLMHead.__init__  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	rN   c                     | j                  |      }t        |      }| j                  |      }| j                  |      }|S r   )r   r   r  rs  rJ   featuresr~   xs       rM   rb   zXmodLMHead.forward  s;    JJx GOOA LLOrN   rn   ro   rp   rq   r3   rb   rv   rw   s   @rM   r0  r0    s    4ArN   r0  z
    X-MOD Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   J    e Zd Z fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	e
   deej                     ez  fd              Z xZS )XmodForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |d      | _        t        |      | _        | j                          y NFrm  )	r2   r3   
num_labelsrK   rM  r+  XmodClassificationHead
classifierrQ  rI   s     rM   r3   z&XmodForSequenceClassification.__init__+  sJ      ++ 5A08 	rN   NrO   r   r|   r0   r,   rP   r{  r~   rR   c           
          | j                   |f|||||dd|}	|	d   }
| j                  |
      }d}|| j                  j                  | j                  dk(  rd| j                  _        nl| j                  dkD  rL|j
                  t        j                  k(  s|j
                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt               }| j                  dk(  r& ||j                         |j                               }n |||      }n| j                  j                  dk(  r=t               } ||j                  d	| j                        |j                  d	            }n,| j                  j                  dk(  rt               } |||      }t        |||	j                  |	j                   
      S )a  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Tr   r|   r0   r,   rP   r~  r   Nr$   
regressionsingle_label_classificationmulti_label_classificationr.   r  )r+  r  rK   problem_typer  r1   rA   rG   ri   r   squeezer   r   r   r   r   r,  rJ   rO   r   r|   r0   r,   rP   r{  r~   r  rd  r  r  r  s                 rM   rb   z%XmodForSequenceClassification.forward6  s   , $,,	
))%'	
 	
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
rN   NNNNNNN)rn   ro   rp   r3   r"   r    rA   rr   rs   r   r   r   rt   r   rb   rv   rw   s   @rM   r  r  #  s    	  .2,037260426*.=
##d*=
 ""T)=
 ))D0	=

 ((4/=
 &&-=
 ((4/=
   4'=
 +,=
 
u||	7	7=
  =
rN   r  c                   J    e Zd Z fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	e
   deej                     ez  fd              Z xZS )XmodForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y )Nr$   )r2   r3   rM  r+  r   r=   r>   r?   r   r6   r  rQ  rI   s     rM   r3   zXmodForMultipleChoice.__init__{  sV      (zz&"<"<=))F$6$6: 	rN   NrO   r   r0   r|   r{  r,   rP   r~   rR   c           
      Z   ||j                   d   n|j                   d   }	|!|j                  d|j                  d            nd}
|2|j                  |j                  d      |j                  d      z        nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|1|j                  d|j                  d      |j                  d            nd} | j                  |
f|||||dd|}|d   }| j                  |      }| j                  |      }|j                  d|	      }d}|t               } |||      }t        |||j                  |j                        S )	a|  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        lang_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr$   r.   r   r   T)r   r,   r0   r|   rP   r~  r  )r[   r   rF   repeatr+  r?   r  r   r   r   r,  )rJ   rO   r   r0   r|   r{  r,   rP   r~   num_choicesflat_input_idsflat_lang_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr  r(  r  reshaped_logitsr  r  s                         rM   rb   zXmodForMultipleChoice.forward  s   \ -6,Aiooa(}GZGZ[\G]CLCXINN2,>?^bRZRf	q(9INN1<M(MNlpLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 $,,	
"*..,	
 	
  
]3/ ++b+6')HOV4D("!//))	
 	
rN   r  )rn   ro   rp   r3   r"   r    rA   rr   rs   r   r   r   rt   r   rb   rv   rw   s   @rM   r  r  x  s      .2,02637*.0426S
##d*S
 ""T)S
 ((4/	S

 ))D0S
   4'S
 &&-S
 ((4/S
 +,S
 
u||	8	8S
  S
rN   r  c                   J    e Zd Z fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	e
   deej                     ez  fd              Z xZS )XmodForTokenClassificationc                 d   t         |   |       |j                  | _        t        |d      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y r  )r2   r3   r  rM  r+  classifier_dropoutr>   r   r=   r?   r   r6   r  rQ  rJ   rK   r  rL   s      rM   r3   z#XmodForTokenClassification.__init__  s      ++ 5A)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rN   NrO   r   r|   r0   r,   rP   r{  r~   rR   c           
      J    | j                   |f|||||dd|}	|	d   }
| j                  |
      }
| j                  |
      }d}|<t               } ||j	                  d| j
                        |j	                  d            }t        |||	j                  |	j                        S )a  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Tr  r   Nr.   r  )	r+  r?   r  r   r   r  r   r   r,  r  s                 rM   rb   z"XmodForTokenClassification.forward  s    ( $,,	
))%'	
 	
 "!*,,71')HFKKDOO<fkk"oND$!//))	
 	
rN   r  )rn   ro   rp   r3   r"   r    rA   rr   rs   r   r   r   rt   r   rb   rv   rw   s   @rM   r  r    s      .2,037260426*.,
##d*,
 ""T),
 ))D0	,

 ((4/,
 &&-,
 ((4/,
   4',
 +,,
 
u||	4	4,
  ,
rN   r  c                   (     e Zd ZdZ fdZd Z xZS )r  z-Head for sentence-level classification tasks.c                 Z   t         |           t        j                  |j                  |j                        | _        |j                  |j                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        y r   )r2   r3   r   r   r6   r   r  r>   r=   r?   r  out_projr  s      rM   r3   zXmodClassificationHead.__init__#  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrN   c                     |d d dd d f   }| j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r&  )r?   r   rA   tanhr  r  s       rM   rb   zXmodClassificationHead.forward,  sY    Q1WLLOJJqMJJqMLLOMM!rN   r  rw   s   @rM   r  r     s    7IrN   r  c                   j    e Zd Z fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  de	e
   deej                     ez  fd              Z xZS )XmodForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r  )
r2   r3   r  rM  r+  r   r   r6   
qa_outputsrQ  rI   s     rM   r3   z!XmodForQuestionAnswering.__init__9  sU      ++ 5A))F$6$68I8IJ 	rN   NrO   r   r|   r0   r,   rP   start_positionsend_positionsr~   rR   c	           
          | j                   |f|||||dd|	}
|
d   }| j                  |      }|j                  dd      \  }}|j                  d      j	                         }|j                  d      j	                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   d	z  }t        ||||
j                  |
j                  
      S )rZ  Tr  r   r$   r.   rg   N)ignore_indexr   )r  start_logits
end_logitsr   r,  )r+  r  splitr  r   lenrF   clampr   r   r   r,  )rJ   rO   r   r|   r0   r,   rP   r  r  r~   r  rd  r  r  r  
total_lossignored_indexr  
start_lossend_losss                       rM   rb   z XmodForQuestionAnswering.forwardC  s   & $,,	
))%'	
 	
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J+%!!//))
 	
rN   )NNNNNNNN)rn   ro   rp   r3   r"   r    rA   rr   rs   r   r   r   rt   r   rb   rv   rw   s   @rM   r  r  6  s     .2,0372604263715:
##d*:
 ""T):
 ))D0	:

 ((4/:
 &&-:
 ((4/:
 ))D0:
 ''$.:
 +,:
 
u||	;	;:
  :
rN   r  )rh  r  r  r  r  r  rM  r*  )Nr   )Prq   collections.abcr   rA   r   torch.nnr   r   r    r	   r1  activationsr
   r   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r    r!   utils.genericr"   r#   configuration_xmodr%   
get_loggerrn   r:  Moduler'   rt   floatr   r   r   r   r   r   r   r   r  r  r!  r*  rM  rh  r  r0  r  r  r  r  r  __all__r  rN   rM   <module>r     s    $   A A & ' C C ) J 9	 	 	 G & 6 @ @ A * 
		H	%g8RYY g8b !%II%<<% 
% <<	%
 LL4'% T\% % '(%<F)		 F)TJ) J)ZRYY &.BII &.Tryy ")) $, ,^J3* J3Z'
")) '
V  40/ 40 40n [6# [6[6| 
k
)? k

k
\ O
) O
 O
f , L
$7 L
L
^ a
/ a
 a
H >
!4 >
 >
DRYY , H
2 H
 H
V	rN   