
    i                     6   d Z ddlmZ ddlmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z*  e%jV                  e,      Z- G d dej\                        Z/	 	 dAdej\                  dej`                  dej`                  dej`                  dej`                  dz  de1dz  de1dee#   fdZ2 G d dej\                        Z3 G d  d!ej\                        Z4 G d" d#ej\                        Z5 G d$ d%ej\                        Z6e$ G d& d'e             Z7e e$d()       G d* d+e"                    Z8e$ G d, d-e7             Z9 e$d.)       G d/ d0e7             Z: G d1 d2ej\                        Z; G d3 d4ej\                        Z<e$ G d5 d6e7             Z= e$d7)       G d8 d9e7             Z>e$ G d: d;e7             Z?e$ G d< d=e7             Z@e$ G d> d?e7             ZAg d@ZBy)BzPyTorch ALBERT model.    )Callable)	dataclassN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)create_bidirectional_mask)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplecheck_model_inputs   )AlbertConfigc                        e Zd ZdZdef fdZ	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  f
d
Z
 xZS )AlbertEmbeddingszQ
    Construct the embeddings from word, position and token_type embeddings.
    configc                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       | j#                  dt%        j*                  | j,                  j/                         t$        j0                        d       y )	N)padding_idxepsposition_idsr   F)
persistenttoken_type_ids)dtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandzerosr'   sizelongselfr"   	__class__s     t/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/albert/modeling_albert.pyr.   zAlbertEmbeddings.__init__5   s   !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`"f&;&;AVAVWzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    N	input_idsr+   r'   inputs_embedsreturnc                    ||j                         }n|j                         d d }|\  }}|| j                  d d d |f   }|t        | d      rT| j                  j	                  |j
                  d   d      }t        j                  |d|      }|j	                  ||      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }	||	z   }
| j                  |      }|
|z   }
| j                  |
      }
| j                  |
      }
|
S )Nr)   r+   r   r   )dimindex)r,   device)rB   r'   hasattrr+   r@   shaper>   gatherrA   rC   rO   r3   r7   r5   r8   r<   )rE   rI   r+   r'   rJ   input_shape
batch_size
seq_lengthbuffered_token_type_idsr7   
embeddingsr5   s               rG   forwardzAlbertEmbeddings.forwardF   sG     #..*K',,.s3K!,
J,,Q^<L
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D"55
^^J/
\\*-
rH   )NNNN)__name__
__module____qualname____doc__r   r.   r>   
LongTensorFloatTensorTensorrX   __classcell__rF   s   @rG   r!   r!   0   s    
| 
& .2260426'##d*' ((4/' &&-	'
 ((4/' 
'rH   r!   modulequerykeyvalueattention_maskscalingr<   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|#|d d d d d d d |j                  d   f   }||z   }t
        j                  j                  |d      }t
        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )	Nr)            r	   rM   )ptrainingr   )rB   r>   matmul	transposerQ   r   
functionalsoftmaxr<   ro   
contiguous)
rb   rc   rd   re   rf   rg   r<   rh   attn_weightsattn_outputs
             rG   eager_attention_forwardrw   q   s     **R.D( <<s}}Q':;gEL!'1a399R=(@A#n4==((2(>L==((6??([L,,|U3K''1-88:K$$rH   c                        e Zd Zdef fdZ	 d	dej                  dej                  dz  dee	   de
ej                  ej                  f   fdZ xZS )
AlbertAttentionr"   c                 $   t         |           |j                  |j                  z  dk7  r1t	        |d      s%t        d|j                   d|j                         || _        |j                  | _        |j                  | _        |j                  |j                  z  | _        | j                  | j                  z  | _        | j                  dz  | _	        t        j                  |j                        | _        t        j                  |j                        | _        t        j                   |j                  | j                        | _        t        j                   |j                  | j                        | _        t        j                   |j                  | j                        | _        t        j                   |j                  |j                        | _        t        j*                  |j                  |j,                        | _        d| _        y )Nr   r1   zThe hidden size (z6) is not a multiple of the number of attention heads (rj   r%   F)r-   r.   hidden_sizenum_attention_headsrP   
ValueErrorr"   attention_head_sizeall_head_sizerg   r   r:   attention_probs_dropout_probattention_dropoutr;   output_dropoutLinearrc   rd   re   denser8   r9   	is_causalrD   s     rG   r.   zAlbertAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 4457  #)#=#= !--#)#5#59S9S#S !558P8PP//5!#F,O,O!P jj)C)CDYYv1143E3EF
99V//1C1CDYYv1143E3EF
YYv1163E3EF
f&8&8f>S>STrH   Nhidden_statesrf   rh   rK   c                    |j                   d d }g |d| j                  } | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      }t        j                  | j                  j                  t              }	 |	| ||||f| j                  sdn| j                  j                  | j                  d|\  }
} |
j                   g |d j#                         }
| j%                  |
      }
| j'                  |
      }
| j)                  ||
z         }
|
|fS )Nr)   r   rk           )r<   rg   )rQ   r~   rc   viewrq   rd   re   r   get_interfacer"   _attn_implementationrw   ro   r   rn   rg   reshapert   r   r   r8   )rE   r   rf   rh   rS   hidden_shapequery_layer	key_layervalue_layerattention_interfacerv   ru   s               rG   rX   zAlbertAttention.forward   s}    $))#2.CCbC$*B*BC 5djj/44lCMMaQRS0DHH]+00,?II!QO	4djj/44lCMMaQRS(?(M(MKK,,.E)
 %8	%
  $}}C$2H2H2J2JLL	%
 	%
!\ *k));;;;FFHjj-))+6nn][%@AL((rH   NrY   rZ   r[   r   r.   r>   r_   r^   r   r   tuplerX   r`   ra   s   @rG   ry   ry      sf    | < 48")||") ))D0") +,	")
 
u||U\\)	*")rH   ry   c                        e Zd Zdef fdZ	 ddej                  dej                  dz  dee	   de
ej                  ej                  f   fdZd	ej                  dej                  fd
Z xZS )AlbertLayerr"   c                    t         |           || _        |j                  | _        d| _        t        j                  |j                  |j                        | _	        t        |      | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t         |j"                     | _        t        j&                  |j(                        | _        y )Nr   r%   )r-   r.   r"   chunk_size_feed_forwardseq_len_dimr   r8   r{   r9   full_layer_layer_normry   	attentionr   intermediate_sizeffn
ffn_outputr   
hidden_act
activationr:   r;   r<   rD   s     rG   r.   zAlbertLayer.__init__   s    '-'E'E$%'\\&2D2D&J_J_%`"(099V//1I1IJ))F$<$<f>P>PQ !2!23zz&"<"<=rH   Nr   rf   rh   rK   c                      | j                   ||fi |\  }}t        | j                  | j                  | j                  |      }| j                  ||z         }|S r   )r   r   ff_chunkr   r   r   )rE   r   rf   rh   attention_output_r   s          rG   rX   zAlbertLayer.forward   se     -dnn]NUfU!.MM((	

 22:@P3PQrH   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )rE   r   r   s      rG   r   zAlbertLayer.ff_chunk   s3    XX./
__Z0
__Z0
rH   r   )rY   rZ   r[   r   r.   r>   r_   r^   r   r   r   rX   r   r`   ra   s   @rG   r   r      s    >| >  48|| ))D0 +,	
 
u||U\\)	*" %,, rH   r   c                        e Zd Zdef fdZ	 d
dej                  dej                  dz  dee	   de
ej                  e
ej                     z  df   fd	Z xZS )AlbertLayerGroupr"   c                     t         |           t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        y c c}w r   )r-   r.   r   
ModuleListrangeinner_group_numr   albert_layersrE   r"   r   rF   s      rG   r.   zAlbertLayerGroup.__init__   s=    ]]vOeOeIf+gAK,?+gh+gs   ANr   rf   rh   rK   .c                 T    t        | j                        D ]  \  }} |||fi |} |S r   )	enumerater   )rE   r   rf   rh   layer_indexalbert_layers         rG   rX   zAlbertLayerGroup.forward   s<     *343E3E)F 	R%K(Q&QM	RrH   r   r   ra   s   @rG   r   r      sr    i| i 48|| ))D0 +,	
 
u||eELL1136	7rH   r   c            
       z     e Zd Zdef fdZ	 d	dej                  dej                  dz  dee	   de
ez  fdZ xZS )
AlbertTransformerr"   c                     t         |           || _        t        j                  |j
                  |j                        | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        y c c}w r   )r-   r.   r"   r   r   r1   r{   embedding_hidden_mapping_inr   r   num_hidden_groupsr   albert_layer_groupsr   s      rG   r.   zAlbertTransformer.__init__  sf    +-99V5J5JFL^L^+_(#%==TYZ`ZrZrTs1tq2B62J1t#u 1ts   ,BNr   rf   rh   rK   c                 $   | j                  |      }t        | j                  j                        D ]R  }t	        || j                  j                  | j                  j
                  z  z        } | j                  |   ||fi |}T t        |      S )N)last_hidden_state)r   r   r"   num_hidden_layersintr   r   r   )rE   r   rf   rh   i	group_idxs         rG   rX   zAlbertTransformer.forward  s     88Gt{{445 	AA!>!>A^A^!^_`I?D44Y? M		 ??rH   r   )rY   rZ   r[   r   r.   r>   r_   r^   r   r   r   r   rX   r`   ra   s   @rG   r   r     s`    v| v 48@||@ ))D0@ +,	@
 
5	 @rH   r   c                   \    e Zd ZeZdZdZdZdZdZ	e
edZ ej                         d        Zy)AlbertPreTrainedModelalbertT)r   
attentionsc                 f   t        |t        j                        rct        j                  |j
                  d| j                  j                         |j                   t        j                  |j                         yyt        |t        j                        rt        j                  |j
                  d| j                  j                         |j                  Et        |j
                  dd      s-t        j                  |j
                  |j                            yyyt        |t        j                        r?t        j                  |j                         t        j                  |j
                         yt        |t              r t        j                  |j                         yt        |t               ryt        j"                  |j$                  t'        j(                  |j$                  j*                  d         j-                  d             t        j                  |j.                         yy)zInitialize the weights.r   )meanstdN_is_hf_initializedFr)   r(   )
isinstancer   r   initnormal_weightr"   initializer_rangebiaszeros_r/   r$   getattrr8   ones_AlbertMLMHeadr!   copy_r'   r>   r?   rQ   r@   r+   )rE   rb   s     rG   _init_weightsz#AlbertPreTrainedModel._init_weights0  sa    fbii(LLSdkk6S6ST{{&FKK( '-LLSdkk6S6ST!!-gfmmMach6iFMM&*<*<=> 7j--KK$JJv}}%.KK$ 01JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 2rH   N)rY   rZ   r[   r   config_classbase_model_prefix_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   ry   _can_record_outputsr>   no_gradr    rH   rG   r   r   #  sN    L N"&$%
 U]]_/ /rH   r   z2
    Output type of [`AlbertForPreTraining`].
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   y)AlbertForPreTrainingOutputa  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    Nlossprediction_logits
sop_logitsr   r   )rY   rZ   r[   r\   r   r>   r^   __annotations__r   r   r   r   r   r   rH   rG   r   r   F  s}    	 &*D%

d
")26u((4/6+/J!!D(/59M5**+d2926Je''(4/6rH   r   c                   H    e Zd ZeZdZddedef fdZdej                  fdZ
dej                  ddfd	Zee	 	 	 	 	 dd
ej                  dz  dej                   dz  dej                  dz  dej                  dz  dej                   dz  dee   deez  fd              Z xZS )AlbertModelr   r"   add_pooling_layerc                 f   t         |   |       || _        t        |      | _        t        |      | _        |rIt        j                  |j                  |j                        | _
        t        j                         | _        nd| _
        d| _        |j                  | _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r-   r.   r"   r!   rW   r   encoderr   r   r{   poolerTanhpooler_activationr   attn_implementation	post_init)rE   r"   r   rF   s      rG   r.   zAlbertModel.__init__d  s    
 	 *62(0))F$6$68J8JKDK%'WWYD"DK%)D"#)#>#>  	rH   rK   c                 .    | j                   j                  S r   rW   r3   rE   s    rG   get_input_embeddingsz AlbertModel.get_input_embeddingsz  s    ...rH   re   Nc                 &    || j                   _        y r   r   )rE   re   s     rG   set_input_embeddingsz AlbertModel.set_input_embeddings}  s    */'rH   rI   rf   r+   r'   rJ   rh   c                 >   |d u |d uz  rt        d      | j                  ||||      }t        | j                  ||      } | j                  ||fd|i|}|d   }	| j
                  '| j                  | j                  |	d d df               nd }
t        |	|
      S )Nz:You must specify exactly one of input_ids or inputs_embeds)r'   r+   rJ   )r"   input_embedsrf   r'   r   )r   pooler_output)r}   rW   r   r"   r   r   r   r   )rE   rI   rf   r+   r'   rJ   rh   embedding_outputencoder_outputssequence_outputpooled_outputs              rG   rX   zAlbertModel.forward  s     -t";<YZZ??L_l + 
 3;;))
 '$,,
 &
 	
 *!,VZVaVaVm..t{{?1a4;P/QRsw)-'
 	
rH   )T)NNNNN)rY   rZ   r[   r   r   r   boolr.   r   r/   r   r   r   r   r>   r]   r^   r   r   r   r   rX   r`   ra   s   @rG   r   r   _  s    L |  ,/bll /0",, 04 0  .237260426$
##d*$
 ))D0$
 ((4/	$

 &&-$
 ((4/$
 +,$
 
$e	+$
  $
rH   r   z
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    c                       e Zd ZdddZdef fdZdej                  fdZdej                  dd	fd
Z	dej                  fdZee	 	 	 	 	 	 	 ddej                  d	z  dej                   d	z  dej                  d	z  dej                  d	z  dej                   d	z  dej                  d	z  dej                  d	z  dee   deez  fd              Z xZS )AlbertForPreTraining(albert.embeddings.word_embeddings.weightpredictions.biaszpredictions.decoder.weightzpredictions.decoder.biasr"   c                     t         |   |       t        |      | _        t	        |      | _        t        |      | _        | j                          y r   )	r-   r.   r   r   r   predictionsAlbertSOPHeadsop_classifierr   rD   s     rG   r.   zAlbertForPreTraining.__init__  sB     !&)(0+F3 	rH   rK   c                 .    | j                   j                  S r   r   decoderr   s    rG   get_output_embeddingsz*AlbertForPreTraining.get_output_embeddings      '''rH   new_embeddingsNc                 &    || j                   _        y r   r  rE   r  s     rG   set_output_embeddingsz*AlbertForPreTraining.set_output_embeddings  s    #1 rH   c                 B    | j                   j                  j                  S r   r   rW   r3   r   s    rG   r   z)AlbertForPreTraining.get_input_embeddings      {{%%555rH   rI   rf   r+   r'   rJ   labelssentence_order_labelrh   c           	          | j                   |f||||dd|}	|	dd \  }
}| j                  |
      }| j                  |      }d}|u|st               } ||j	                  d| j
                  j                        |j	                  d            } ||j	                  dd      |j	                  d            }||z   }t        ||||	j                  |	j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then
            sequence B), `1` indicates switched order (sequence B, then sequence A).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, AlbertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> sop_logits = outputs.sop_logits
        ```Trf   r+   r'   rJ   return_dictNrk   r)   )r   r   r   r   r   )
r   r   r  r   r   r"   r0   r   r   r   )rE   rI   rf   r+   r'   rJ   r  r  rh   outputsr   r   prediction_scores
sop_scores
total_lossloss_fctmasked_lm_losssentence_order_losss                     rG   rX   zAlbertForPreTraining.forward  s   N $++
))%'
 
 *1!& ,,_=((7

"6"B')H%&7&<&<RAWAW&XZ`ZeZefhZijN"*:??2q+ACWC\C\]_C`"a'*==J)/!!//))
 	
rH   NNNNNNN)rY   rZ   r[   _tied_weights_keysr   r.   r   r   r  r  r/   r   r   r   r>   r]   r^   r   r   r   r   rX   r`   ra   s   @rG   r   r     sE    'Q$6
| (ryy (2BII 2$ 26bll 6  .237260426*.8<A
##d*A
 ))D0A
 ((4/	A

 &&-A
 ((4/A
   4'A
 $..5A
 +,A
 
$e	+A
  A
rH   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )r   r"   c                    t         |           t        j                  |j                  |j
                        | _        t        j                  t        j                  |j                              | _
        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        |j                      | _        y )Nr%   )r-   r.   r   r8   r1   r9   	Parameterr>   rA   r0   r   r   r{   r   r  r   r   r   rD   s     rG   r.   zAlbertMLMHead.__init__  s    f&;&;AVAVWLLV->->!?@	YYv1163H3HI
yy!6!68I8IJ !2!23rH   r   rK   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|}|S r   )r   r   r8   r  )rE   r   r  s      rG   rX   zAlbertMLMHead.forward  sF    

=16}5]3)  rH   	rY   rZ   r[   r   r.   r>   r_   rX   r`   ra   s   @rG   r   r     s*    4| 4!U\\ !ell !rH   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )r  r"   c                     t         |           t        j                  |j                        | _        t        j                  |j                  |j                        | _	        y r   )
r-   r.   r   r:   classifier_dropout_probr<   r   r{   
num_labels
classifierrD   s     rG   r.   zAlbertSOPHead.__init__$  sB    zz&"@"@A))F$6$68I8IJrH   r   rK   c                 J    | j                  |      }| j                  |      }|S r   )r<   r&  )rE   r   dropout_pooled_outputlogitss       rG   rX   zAlbertSOPHead.forward*  s%     $] ;!67rH   r!  ra   s   @rG   r  r  #  s,    K| KU\\ ell rH   r  c                   ~    e Zd ZdddZ fdZdej                  fdZdej                  ddfd	Zdej                  fd
Z
ee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dee   deez  fd              Z xZS )AlbertForMaskedLMr   r   r   c                     t         |   |       t        |d      | _        t	        |      | _        | j                          y NF)r   )r-   r.   r   r   r   r   r   rD   s     rG   r.   zAlbertForMaskedLM.__init__7  s7     !&EB(0 	rH   rK   c                 .    | j                   j                  S r   r  r   s    rG   r  z'AlbertForMaskedLM.get_output_embeddings@  r  rH   r  Nc                 \    || j                   _        |j                  | j                   _        y r   )r   r  r   r
  s     rG   r  z'AlbertForMaskedLM.set_output_embeddingsC  s$    #1  . 3 3rH   c                 B    | j                   j                  j                  S r   r  r   s    rG   r   z&AlbertForMaskedLM.get_input_embeddingsG  r  rH   rI   rf   r+   r'   rJ   r  rh   c           
      :    | j                   d|||||dd|}|d   }	| j                  |	      }
d}|Ft               } ||
j                  d| j                  j
                        |j                  d            }t        ||
|j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlbertForMaskedLM

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")

        >>> # add mask_token
        >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # retrieve index of [MASK]
        >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
        >>> tokenizer.decode(predicted_token_id)
        'france'
        ```

        ```python
        >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
        >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
        >>> outputs = model(**inputs, labels=labels)
        >>> round(outputs.loss.item(), 2)
        0.81
        ```
        TrI   rf   r+   r'   rJ   r  r   Nr)   r   r)  r   r   r   )	r   r   r   r   r"   r0   r   r   r   )rE   rI   rf   r+   r'   rJ   r  rh   r  sequence_outputsr  r  r  s                rG   rX   zAlbertForMaskedLM.forwardJ  s    ^ $++ 
))%'
 
 #1: ,,-=>')H%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
rH   NNNNNN)rY   rZ   r[   r  r.   r   r   r  r  r/   r   r   r   r>   r]   r^   r   r   r   r   rX   r`   ra   s   @rG   r+  r+  0  s%    'Q$6
(ryy (4BII 4$ 46bll 6  .237260426*.D
##d*D
 ))D0D
 ((4/	D

 &&-D
 ((4/D
   4'D
 +,D
 
%	D
  D
rH   r+  z
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                       e Zd Zdef fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e
e   deez  fd              Z xZS )AlbertForSequenceClassificationr"   c                 N   t         |   |       |j                  | _        || _        t	        |      | _        t        j                  |j                        | _	        t        j                  |j                  | j                  j                        | _        | j                          y r   )r-   r.   r%  r"   r   r   r   r:   r$  r<   r   r{   r&  r   rD   s     rG   r.   z(AlbertForSequenceClassification.__init__  st      ++!&)zz&"@"@A))F$6$68N8NO 	rH   NrI   rf   r+   r'   rJ   r  rh   rK   c           
          | j                   d
|||||dd|}|d   }	| j                  |	      }	| j                  |	      }
d}|| j                  j                  | j
                  dk(  rd| j                  _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt               }| j
                  dk(  r& ||
j                         |j                               }n ||
|      }n| j                  j                  dk(  r=t               } ||
j                  d| j
                        |j                  d            }n,| j                  j                  dk(  rt               } ||
|      }t        ||
|j                   |j"                  	      S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Tr2  r   N
regressionsingle_label_classificationmulti_label_classificationr)   r3  r   )r   r<   r&  r"   problem_typer%  r,   r>   rC   r   r   squeezer   r   r   r   r   r   )rE   rI   rf   r+   r'   rJ   r  rh   r  r   r)  r   r  s                rG   rX   z'AlbertForSequenceClassification.forward  s   $ $++ 
))%'
 
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
rH   r5  )rY   rZ   r[   r   r.   r   r   r>   r]   r^   r   r   r   r   rX   r`   ra   s   @rG   r7  r7    s    
| 
  .237260426*.;
##d*;
 ))D0;
 ((4/	;

 &&-;
 ((4/;
   4';
 +,;
 
"E	);
  ;
rH   r7  c                       e Zd Zdef fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e
e   deez  fd              Z xZS )AlbertForTokenClassificationr"   c                 x   t         |   |       |j                  | _        t        |d      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  | j                  j                        | _        | j                          y r-  )r-   r.   r%  r   r   r$  r;   r   r:   r<   r   r{   r"   r&  r   )rE   r"   r$  rF   s      rG   r.   z%AlbertForTokenClassification.__init__  s      ++!&EB --9 **++ 	 
 zz"9:))F$6$68N8NO 	rH   NrI   rf   r+   r'   rJ   r  rh   rK   c           	      H    | j                   |f||||dd|}|d   }	| j                  |	      }	| j                  |	      }
d}|<t               } ||
j	                  d| j
                        |j	                  d            }t        ||
|j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Tr  r   Nr)   r3  )	r   r<   r&  r   r   r%  r   r   r   )rE   rI   rf   r+   r'   rJ   r  rh   r  r   r)  r   r  s                rG   rX   z$AlbertForTokenClassification.forward  s      $++
))%'
 
 "!*,,71')HFKKDOO<fkk"oND$!//))	
 	
rH   r5  )rY   rZ   r[   r   r.   r   r   r>   r]   r^   r   r   r   r   rX   r`   ra   s   @rG   r@  r@    s    |    .237260426*.'
##d*'
 ))D0'
 ((4/	'

 &&-'
 ((4/'
   4''
 +,'
 
	&'
  '
rH   r@  c                   6    e Zd Zdef fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  de
e   deez  fd              Z xZS )AlbertForQuestionAnsweringr"   c                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r-  )
r-   r.   r%  r   r   r   r   r{   
qa_outputsr   rD   s     rG   r.   z#AlbertForQuestionAnswering.__init__&  sU      ++!&EB))F$6$68I8IJ 	rH   NrI   rf   r+   r'   rJ   start_positionsend_positionsrh   rK   c           
          | j                   d
|||||dd|}	|	d   }
| j                  |
      }|j                  dd      \  }}|j                  d      j	                         }|j                  d      j	                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }t        ||||	j                  |	j                  	      S )NTr2  r   r   r)   rm   )ignore_indexrk   )r   start_logits
end_logitsr   r   r   )r   rF  splitr>  rt   lenrB   clampr   r   r   r   )rE   rI   rf   r+   r'   rJ   rG  rH  rh   r  r   r)  rK  rL  r  ignored_indexr  
start_lossend_losss                      rG   rX   z"AlbertForQuestionAnswering.forward0  s    $++ 
))%'
 
 "!*#?#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J+%!!//))
 	
rH   r  rY   rZ   r[   r   r.   r   r   r>   r]   r^   r   r   r   r   rX   r`   ra   s   @rG   rD  rD  $  s    |   .23726042637153
##d*3
 ))D03
 ((4/	3

 &&-3
 ((4/3
 ))D03
 ''$.3
 +,3
 
$e	+3
  3
rH   rD  c                       e Zd Zdef fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e
e   deez  fd              Z xZS )AlbertForMultipleChoicer"   c                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y )Nr   )r-   r.   r   r   r   r:   r$  r<   r   r{   r&  r   rD   s     rG   r.   z AlbertForMultipleChoice.__init__j  sV     !&)zz&"@"@A))F$6$6: 	rH   NrI   rf   r+   r'   rJ   r  rh   rK   c           	         ||j                   d   n|j                   d   }|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|1|j                  d|j                  d      |j                  d            nd} | j                  |f||||dd|}	|	d   }
| j	                  |
      }
| j                  |
      }|j                  d|      }d}|t               } |||      }t        |||	j                  |	j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
            *input_ids* above)
        Nr   r)   rl   Tr  r3  )
rQ   r   rB   r   r<   r&  r   r   r   r   )rE   rI   rf   r+   r'   rJ   r  rh   num_choicesr  r   r)  reshaped_logitsr   r  s                  rG   rX   zAlbertForMultipleChoice.forwardt  s   T -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	
 $++
))%'
 
  
]3#}= ++b+6')HOV4D("!//))	
 	
rH   r5  rS  ra   s   @rG   rU  rU  h  s    |   .237260426*.M
##d*M
 ))D0M
 ((4/	M

 &&-M
 ((4/M
   4'M
 +,M
 
$e	+M
  M
rH   rU  )r   r   r   r+  r7  r@  rD  rU  )Nr   )Cr\   collections.abcr   dataclassesr   r>   r   torch.nnr   r   r    r
   r   activationsr   masking_utilsr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   utils.genericr   r   configuration_albertr   
get_loggerrY   loggerModuler!   r_   floatrw   ry   r   r   r   r   r   r   r   r   r  r+  r7  r@  rD  rU  __all__r   rH   rG   <module>rl     s    $ !   A A & ! 6   G & N M A . 
		H	%=ryy =N !%II%<<% 
% <<	%
 LL4'% T\% % '(%:>)bii >)B#")) #Lryy "@		 @: /O / /D 
7 7 7& F
' F
 F
R \
0 \
\
~!BII !*
BII 
 _
- _
 _
D J
&; J
J
Z :
#8 :
 :
z @
!6 @
 @
F Z
3 Z
 Z
z	rH   