
    i4                        d Z ddlZddlmZ ddlmZmZmZ ddlmZ	 ddl
mZmZmZ ddlmZmZ ddlmZmZmZmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8  e!jr                  e:      Z; G d de'      Z< G d de6      Z= G d de&      Z> G d de1      Z? G d de5      Z@ G d de3      ZA G d de(      ZBe  G d d e             ZC G d! d"e4      ZD G d# d$e-      ZE G d% d&e,      ZF G d' d(e2      ZG G d) d*e)      ZH G d+ d,e+      ZI G d- d.e/      ZJ G d/ d0e*      ZK G d1 d2e0      ZL G d3 d4e.      ZMg d5ZNy)6zPyTorch ERNIE model.    N)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)CacheDynamicCacheEncoderDecoderCache)create_bidirectional_maskcreate_causal_mask),BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuplecheck_model_inputs   )BertCrossAttentionBertEmbeddingsBertEncoderBertForMaskedLMBertForMultipleChoiceBertForNextSentencePredictionBertForPreTrainingBertForPreTrainingOutputBertForQuestionAnsweringBertForSequenceClassificationBertForTokenClassification	BertLayerBertLMHeadModelBertLMPredictionHead	BertModel
BertPoolerBertSelfAttention   )ErnieConfigc                        e Zd ZdZ fdZ	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ed
ej                  fdZ
 xZS )ErnieEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                     t         |   |       |j                  | _        |j                  r0t        j                  |j
                  |j                        | _        y y )N)super__init__use_task_idnn	Embeddingtask_type_vocab_sizehidden_sizetask_type_embeddings)selfconfig	__class__s     q/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/ernie/modular_ernie.pyr4   zErnieEmbeddings.__init__A   sL     !--(*V5P5PRXRdRd(eD%     N	input_idstoken_type_idstask_type_idsposition_idsinputs_embedspast_key_values_lengthreturnc                 |   ||j                         }n|j                         d d }|\  }}	|| j                  d d ||	|z   f   }|t        | d      rT| j                  j	                  |j
                  d   d      }
t        j                  |
d|      }
|
j	                  ||	      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }|j                  |j                        }||z   }| j                  |      }||z   }| j                  rR|:t        j                  |t        j                  | j                  j                        }| j!                  |      }||z  }| j#                  |      }| j%                  |      }|S )NrA   r   r.   )dimindex)dtypedevice)sizerC   hasattrrA   expandshapetorchgatherzeroslongrL   word_embeddingstoken_type_embeddingstoposition_embeddingsr5   r:   	LayerNormdropout)r;   r@   rA   rB   rC   rD   rE   input_shape
batch_size
seq_lengthbuffered_token_type_idsrV   
embeddingsrX   r:   s                  r>   forwardzErnieEmbeddings.forwardH   s     #..*K',,.s3K!,
J,,Q0FVlIl0l-lmL
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J &(()>)E)EF"%::
"66|D"55
 $ %KuzzRVRcRcRjRj k#'#<#<]#K ..J^^J/
\\*-
r?   )NNNNNr   )__name__
__module____qualname____doc__r4   rQ   
LongTensorFloatTensorintTensorr`   __classcell__r=   s   @r>   r1   r1   >   s    Qf .226150426&'3##d*3 ((4/3 ''$.	3
 &&-3 ((4/3 !$3 
3r?   r1   c                       e Zd Zy)ErnieSelfAttentionNra   rb   rc    r?   r>   rl   rl   ~       r?   rl   c                       e Zd Zy)ErnieCrossAttentionNrm   rn   r?   r>   rq   rq      ro   r?   rq   c                       e Zd Zy)
ErnieLayerNrm   rn   r?   r>   rs   rs      ro   r?   rs   c                       e Zd Zy)ErniePoolerNrm   rn   r?   r>   ru   ru      ro   r?   ru   c                       e Zd Zy)ErnieLMPredictionHeadNrm   rn   r?   r>   rw   rw      ro   r?   rw   c                       e Zd Zy)ErnieEncoderNrm   rn   r?   r>   ry   ry      ro   r?   ry   c                   n     e Zd ZeZdZdZdZdZdZ	dZ
eeedZ ej                           fd       Z xZS )ErniePreTrainedModelernieT)hidden_states
attentionscross_attentionsc                    t         |   |       t        |t              r t	        j
                  |j                         yt        |t              ryt	        j                  |j                  t        j                  |j                  j                  d         j                  d             t	        j
                  |j                         yy)zInitialize the weightsrH   )r.   rH   N)r3   _init_weights
isinstancerw   initzeros_biasr1   copy_rC   rQ   arangerP   rO   rA   )r;   moduler=   s     r>   r   z"ErniePreTrainedModel._init_weights   s     	f%f34KK$0JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 1r?   )ra   rb   rc   r/   config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendrs   rl   rq   _can_record_outputsrQ   no_gradr   ri   rj   s   @r>   r{   r{      sX    L&*#N"&#(/ U]]_/ /r?   r{   c                       e Zd ZdgZd fd	Zee	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  de	dz  de
dz  dej                  dz  dee   deej                     ez  fd              Zd Z xZS )
ErnieModelrs   c                     t         |   | |       || _        d| _        t	        |      | _        t        |      | _        |rt        |      nd | _	        | j                          y )NF)r3   r4   r<   gradient_checkpointingr1   r_   ry   encoderru   pooler	post_init)r;   r<   add_pooling_layerr=   s      r>   r4   zErnieModel.__init__   sU    v&&+#)&1#F+->k&)D 	r?   Nr@   attention_maskrA   rB   rC   rD   encoder_hidden_statesencoder_attention_maskpast_key_values	use_cachecache_positionkwargsrF   c                    | j                   j                  r|
|
n| j                   j                  }
nd}
| j                  r%| j                  r|
rt
        j                  d       d}
|
rd|	b|| j                   j                  r4t        t        | j                         t        | j                               nt        | j                         }	||t        d      |#| j                  ||       |j                         }n!||j                         dd }nt        d      |\  }}||j                  n|j                  }|	|	j                         nd}|t        j                   |||z   |	      }| j#                  ||||||
      }| j%                  ||||||	      \  }} | j&                  |f||||	|
||d|}|d   }| j(                  | j)                  |      nd}t+        |||j,                        S )  
        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
            config.task_type_vocab_size-1]
        NFzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...)r<   zDYou cannot specify both input_ids and inputs_embeds at the same timerH   z5You have to specify either input_ids or inputs_embedsr   )rL   )r@   rC   rA   rB   rD   rE   )r   r   embedding_outputr   r   r   )r   r   r   r   r   r   rC   )last_hidden_statepooler_outputr   )r<   
is_decoderr   r   trainingloggerwarning_onceis_encoder_decoderr
   r	   
ValueError%warn_if_padding_and_no_attention_maskrM   rL   get_seq_lengthrQ   r   r_   _create_attention_masksr   r   r   r   )r;   r@   r   rA   rB   rC   rD   r   r   r   r   r   r   r[   r\   r]   rL   rE   r   encoder_outputssequence_outputpooled_outputs                         r>   r`   zErnieModel.forward   s1   0 ;;!!%.%:	@U@UII&&4==##p "	0 )48V8V $L$DlZ^ZeZeFfg!5   ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@TETE`!?!?!Afg!"\\*@BX[eBentuN??%)''#9 + 
 261M1M)#9-"7)+ 2N 2
.. '$,,

)"7#9+)%

 

 *!,8<8OO4UY;-'+;;
 	
r?   c                     | j                   j                  rt        | j                   ||||      }nt        | j                   ||      }|t        | j                   |||      }||fS )N)r<   input_embedsr   r   r   )r<   r   r   )r<   r   r   r   )r<   r   r   r   )r;   r   r   r   r   r   r   s          r>   r   z"ErnieModel._create_attention_masks$  sx     ;;!!/{{--- /N 7{{--N "-%>{{-5&;	&" 555r?   )T)NNNNNNNNNNN)ra   rb   rc   _no_split_modulesr4   r   r   rQ   rh   r   boolr   r   tupler   r`   r   ri   rj   s   @r>   r   r      sR   %  *..2.2-1,0-1596:(,!%.2_
<<$&_
 t+_
 t+	_

 ||d*_
 llT)_
 ||d*_
  %||d2_
 !&t 3_
 _
 $;_
 t+_
 +,_
 
u||	K	K_
  _
D 6r?   r   c                       e Zd Zy)ErnieForPreTrainingOutputNrm   rn   r?   r>   r   r   G  ro   r?   r   c                   b   e Zd ZdddZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dee	   de
ej                     ez  fd              Zy)ErnieForPreTrainingcls.predictions.bias'ernie.embeddings.word_embeddings.weightzcls.predictions.decoder.biaszcls.predictions.decoder.weightNr@   r   rA   rB   rC   rD   labelsnext_sentence_labelr   rF   c	           
          | j                   |f|||||dd|	}
|
dd \  }}| j                  ||      \  }}d}|u|st               } ||j                  d| j                  j
                        |j                  d            } ||j                  dd      |j                  d            }||z   }t        ||||
j                  |
j                        S )a:  
        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
            config.task_type_vocab_size-1]
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
            pair (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ErnieForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
        >>> model = ErnieForPreTraining.from_pretrained("nghuyong/ernie-1.0-base-zh")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        Tr   rA   rB   rC   rD   return_dictNr   rH   )lossprediction_logitsseq_relationship_logitsr}   r~   )	r|   clsr   viewr<   
vocab_sizer   r}   r~   )r;   r@   r   rA   rB   rC   rD   r   r   r   outputsr   r   prediction_scoresseq_relationship_score
total_lossloss_fctmasked_lm_lossnext_sentence_losss                      r>   r`   zErnieForPreTraining.forwardQ  s   ^ $**	
))'%'	
 	
 *1!&48HH_m4\11
"5"A')H%&7&<&<RAWAW&XZ`ZeZefhZijN!)*@*E*Eb!*LNaNfNfgiNj!k'*<<J(/$:!//))
 	
r?   NNNNNNNN)ra   rb   rc   _tied_weights_keysr   r   rQ   rh   r   r   r   r   r`   rn   r?   r>   r   r   K  s   (>*S
  *..2.2-1,0-1&*37H
<<$&H
 t+H
 t+	H

 ||d*H
 llT)H
 ||d*H
 t#H
 #\\D0H
 +,H
 
u||	8	8H
  H
r?   r   c            "          e Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  deej                     dz  dedz  dej                  dz  de	ej                  z  de
e   deej                     ez  fd              Zy)ErnieForCausalLMNr@   r   rA   rB   rC   rD   r   r   r   r   r   r   logits_to_keepr   rF   c                    |	d} | j                   |f||||||||
||dd|}|j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }d}|	* | j                  d||	| j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
            config.task_type_vocab_size-1]
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
        NFT)r   rA   rB   rC   rD   r   r   r   r   r   r   )logitsr   r   )r   r   r   r}   r~   r   rn   )r|   r   r   rg   slicer   loss_functionr<   r   r   r   r}   r~   r   )r;   r@   r   rA   rB   rC   rD   r   r   r   r   r   r   r   r   r   r}   slice_indicesr   r   s                       r>   r`   zErnieForCausalLM.forward  s   < I@J

A
))'%'"7#9+)A
 A
   118B>SV8W~ot4]k-=!(;<=%4%%pVFt{{OeOepiopD0#33!//))$55
 	
r?   )NNNNNNNNNNNNr   )ra   rb   rc   r   r   rQ   rh   listr   rg   r   r   r   r   r`   rn   r?   r>   r   r     sc    *..2.2-1,0-1596:&*59!%.2-.?
<<$&?
 t+?
 t+	?

 ||d*?
 llT)?
 ||d*?
  %||d2?
 !&t 3?
 t#?
 ell+d2?
 $;?
 t+?
 ell*?
 +,?
  
u||	@	@!?
  ?
r?   r   c                      e Zd ZdddZee	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dee	   de
ej                     ez  fd              Zy)ErnieForMaskedLMr   r   r   Nr@   r   rA   rB   rC   rD   r   r   r   r   rF   c
                 @    | j                   |f|||||||dd|
}|d   }| j                  |      }d}|	Ft               } ||j                  d| j                  j
                        |	j                  d            }t        |||j                  |j                        S )as  
        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
            config.task_type_vocab_size-1]
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        T)r   rA   rB   rC   rD   r   r   r   r   NrH   r   r   r}   r~   )	r|   r   r   r   r<   r   r   r}   r~   )r;   r@   r   rA   rB   rC   rD   r   r   r   r   r   r   r   r   r   s                   r>   r`   zErnieForMaskedLM.forward  s    4 $**
))'%'"7#9
 
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
r?   )	NNNNNNNNN)ra   rb   rc   r   r   r   rQ   rh   r   r   r   r   r`   rn   r?   r>   r   r     s   (>*S
  *..2.2-1,0-1596:&*2
<<$&2
 t+2
 t+	2

 ||d*2
 llT)2
 ||d*2
  %||d22
 !&t 32
 t#2
 +,2
 
u||	~	-2
  2
r?   r   c                   8   e Zd Zee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee   d
e	ej                     e
z  fd              Zy)ErnieForNextSentencePredictionNr@   r   rA   rB   rC   rD   r   r   rF   c           
          | j                   |f|||||dd|}	|	d   }
| j                  |
      }d}|2t               } ||j                  dd      |j                  d            }t	        |||	j
                  |	j                        S )a  
        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
            config.task_type_vocab_size-1]
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ErnieForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
        >>> model = ErnieForNextSentencePrediction.from_pretrained("nghuyong/ernie-1.0-base-zh")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```
        Tr   r.   NrH   r   r   )r|   r   r   r   r   r}   r~   )r;   r@   r   rA   rB   rC   rD   r   r   r   r   seq_relationship_scoresr   r   s                 r>   r`   z&ErnieForNextSentencePrediction.forward!  s    Z $**	
))'%'	
 	
  
"&((="9!')H!)*A*F*Fr1*Mv{{[]!_*#*!//))	
 	
r?   NNNNNNN)ra   rb   rc   r   r   rQ   rh   r   r   r   r   r`   rn   r?   r>   r   r      s     *..2.2-1,0-1&*D
<<$&D
 t+D
 t+	D

 ||d*D
 llT)D
 ||d*D
 t#D
 +,D
 
u||	:	:D
  D
r?   r   c                   8   e Zd Zee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee   d
e	ej                     e
z  fd              Zy)ErnieForSequenceClassificationNr@   r   rA   rB   rC   rD   r   r   rF   c           
          | j                   |f|||||dd|}	|	d   }
| j                  |
      }
| j                  |
      }d}|| j                  j                  | j
                  dk(  rd| j                  _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                  j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                  j                  dk(  rt               } |||      }t        |||	j                   |	j"                  	      S )
a^  
        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
            config.task_type_vocab_size-1]
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Tr   r.   N
regressionsingle_label_classificationmulti_label_classificationrH   r   )r|   rZ   
classifierr<   problem_type
num_labelsrK   rQ   rT   rg   r   squeezer   r   r   r   r}   r~   )r;   r@   r   rA   rB   rC   rD   r   r   r   r   r   r   r   s                 r>   r`   z&ErnieForSequenceClassification.forwardk  s   0 $**	
))'%'	
 	
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r?   r   )ra   rb   rc   r   r   rQ   rh   r   r   r   r   r`   rn   r?   r>   r   r   j  s     *..2.2-1,0-1&*B
<<$&B
 t+B
 t+	B

 ||d*B
 llT)B
 ||d*B
 t#B
 +,B
 
u||	7	7B
  B
r?   r   c                   8   e Zd Zee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee   d
e	ej                     e
z  fd              Zy)ErnieForMultipleChoiceNr@   r   rA   rB   rC   rD   r   r   rF   c           
         ||j                   d   n|j                   d   }	|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|1|j                  d|j                  d      |j                  d            nd} | j                  |f|||||dd|}
|
d   }| j	                  |      }| j                  |      }|j                  d|	      }d}|t               } |||      }t        |||
j                  |
j                        S )a9	  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        task_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
            config.task_type_vocab_size-1]
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr.   rH   Tr   r   )
rP   r   rM   r|   rZ   r   r   r   r}   r~   )r;   r@   r   rA   rB   rC   rD   r   r   num_choicesr   r   r   reshaped_logitsr   r   s                   r>   r`   zErnieForMultipleChoice.forward  s   ` -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 $**	
))'%'	
 	
  
]3/ ++b+6')HOV4D("!//))	
 	
r?   r   )ra   rb   rc   r   r   rQ   rh   r   r   r   r   r`   rn   r?   r>   r   r     s     *..2.2-1,0-1&*U
<<$&U
 t+U
 t+	U

 ||d*U
 llT)U
 ||d*U
 t#U
 +,U
 
u||	8	8U
  U
r?   r   c                   8   e Zd Zee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee   d
e	ej                     e
z  fd              Zy)ErnieForTokenClassificationNr@   r   rA   rB   rC   rD   r   r   rF   c           
      J    | j                   |f|||||dd|}	|	d   }
| j                  |
      }
| j                  |
      }d}|<t               } ||j	                  d| j
                        |j	                  d            }t        |||	j                  |	j                        S )a  
        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
            config.task_type_vocab_size-1]
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Tr   r   NrH   r   )	r|   rZ   r   r   r   r   r   r}   r~   )r;   r@   r   rA   rB   rC   rD   r   r   r   r   r   r   r   s                 r>   r`   z#ErnieForTokenClassification.forward  s    , $**	
))'%'	
 	
 "!*,,71')HFKKDOO<fkk"oND$!//))	
 	
r?   r   )ra   rb   rc   r   r   rQ   rh   r   r   r   r   r`   rn   r?   r>   r   r     s     *..2.2-1,0-1&*.
<<$&.
 t+.
 t+	.

 ||d*.
 llT).
 ||d*.
 t#.
 +,.
 
u||	4	4.
  .
r?   r   c                   X   e Zd Zee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ee   de	ej                     e
z  fd              Zy)ErnieForQuestionAnsweringNr@   r   rA   rB   rC   rD   start_positionsend_positionsr   rF   c	           
          | j                   |f|||||dd|	}
|
d   }| j                  |      }|j                  dd      \  }}|j                  d      j	                         }|j                  d      j	                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   d	z  }t        ||||
j                  |
j                  
      S )r   Tr   r   r.   rH   )rI   N)ignore_indexr   )r   start_logits
end_logitsr}   r~   )r|   
qa_outputssplitr   
contiguouslenrM   clampr   r   r}   r~   )r;   r@   r   rA   rB   rC   rD   r   r   r   r   r   r   r   r   r   ignored_indexr   
start_lossend_losss                       r>   r`   z!ErnieForQuestionAnswering.forwardB  s   * $**	
))'%'	
 	
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J+%!!//))
 	
r?   r   )ra   rb   rc   r   r   rQ   rh   r   r   r   r   r`   rn   r?   r>   r   r   A  s     *..2.2-1,0-1/3-1<
<<$&<
 t+<
 t+	<

 ||d*<
 llT)<
 ||d*<
 ,<
 ||d*<
 +,<
 
u||	;	;<
  <
r?   r   )
r   r   r   r   r   r   r   r   r   r{   )Ord   rQ   torch.nnr6   r   r   r    r   r   cache_utilsr   r	   r
   masking_utilsr   r   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   r   bert.modeling_bertr   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   configuration_ernier/   
get_loggerra   r   r1   rl   rq   rs   ru   rw   ry   r{   r   r   r   r   r   r   r   r   r   r   __all__rn   r?   r>   <module>r     s      A A & C C J	 	 	 . & @ @ A    & - 
		H	%=n =@	* 		, 		 		* 		0 		; 	 /? / /2T6 T6n	 8 	P
, P
fB
 B
J:
 :
zG
%B G
TE
%B E
PX
2 X
v1
"< 1
h?
 8 ?
Dr?   