
    i                        d dl Z d dlmZ d dlmZ d dlZd dlmZ d dlmZm	Z	m
Z
 ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/  G d dej`                        Z1 G d dej`                        Z2 G d dej`                        Z3	 d?dej`                  dejh                  dejh                  dejh                  dejh                  dz  de5d e5fd!Z6d" Z7 ed#      d@d$       Z8 ee8       G d% d&ej`                               Z9 G d' d(e      Z:e) G d) d*e$             Z;e) G d+ d,e;             Z< G d- d.ej`                        Z= e)d/0       G d1 d2e;             Z> e)d30       G d4 d5e;             Z? e)d60       G d7 d8e;             Z@e) G d9 d:e;             ZA e)d;0       G d< d=e;             ZBg d>ZCy)A    N)Callable)Optional)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)use_kernel_func_from_hubuse_kernelized_func)create_bidirectional_mask(create_bidirectional_sliding_window_mask)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring)can_return_tuplecheck_model_inputsmaybe_autocast   )ModernBertConfigc                        e Zd ZdZdef fdZ	 d	dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )
ModernBertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    configc                 d   t         |           || _        t        j                  |j
                  |j                  |j                        | _        t        j                  |j                  |j                  |j                        | _        t        j                  |j                        | _        y )N)padding_idxepsbias)super__init__r%   r   	Embedding
vocab_sizehidden_sizepad_token_idtok_embeddings	LayerNormnorm_eps	norm_biasnormDropoutembedding_dropoutdropselfr%   	__class__s     |/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/modernbert/modeling_modernbert.pyr,   zModernBertEmbeddings.__init__8   sw     ll6+<+<f>P>P^d^q^qrLL!3!3vO_O_`	JJv778	    N	input_idsinputs_embedsreturnc                     |"| j                  | j                  |            }|S | j                  | j                  | j                  |                  }|S N)r8   r5   r1   )r:   r>   r?   hidden_statess       r<   forwardzModernBertEmbeddings.forward?   sS     $ IIdii&>?M  !IIdii0C0CI0N&OPMr=   NN)__name__
__module____qualname____doc__r"   r,   torch
LongTensorTensorrD   __classcell__r;   s   @r<   r$   r$   3   sR    9/ 9 _c))D0HMW[H[	r=   r$   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )ModernBertMLPa6  Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    r%   c                    t         |           || _        t        j                  |j
                  t        |j                        dz  |j                        | _	        t        |j                     | _        t        j                  |j                        | _        t        j                  |j                  |j
                  |j                        | _        y )N   r*   )r+   r,   r%   r   Linearr/   intintermediate_sizemlp_biasWir   hidden_activationactr6   mlp_dropoutr8   Wor9   s     r<   r,   zModernBertMLP.__init__P   s    ))F..F4L4L0MPQ0QX^XgXgh&223JJv112	))F44f6H6Hv_r=   rC   r@   c                     | j                  |      j                  dd      \  }}| j                  | j                  | j	                  |      |z              S )NrR   dim)rX   chunkr\   r8   rZ   )r:   rC   inputgates       r<   rD   zModernBertMLP.forwardX   sI    ggm,221"2=twwtyy%4!7899r=   )
rF   rG   rH   rI   r"   r,   rJ   rL   rD   rM   rN   s   @r<   rP   rP   I   s2    `/ `:U\\ :ell :r=   rP   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 	 ddedz  de	d   de
dz  dedz  d	ed
ef   f
d       Z ej                         edd              Z xZS )ModernBertRotaryEmbeddinginv_freqNr%   c                 v   t         |           |j                  | _        |j                  | _        || _        t        t        |j                              | _        i | _	        | j                  D ]  }| j
                  j                  |   }||d   | j                  |<   | j                  }| j                  |   dk7  rt        | j                  |      } || j
                  ||      \  }}| j                  | d|d       | j                  | d|j                         d       t        | | d|        y )	N	rope_typedefault
layer_type	_inv_freqF)
persistent_original_inv_freq_attention_scaling)r+   r,   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr%   listsetlayer_typesrh   rope_parameterscompute_default_rope_parametersr   register_bufferclonesetattr)	r:   r%   devicerk   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalingr;   s	           r<   r,   z"ModernBertRotaryEmbedding.__init__`   s8   "("@"@$*$B$B!F$6$6 78** 	UJ++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@fak4l1M1  J<y!9=UZ [  J</A!BMDWDWDYfk lDZL(:;=ST	Ur=   r{   ztorch.deviceseq_lenrk   r@   ztorch.Tensorc                     | j                   |   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   rR   dtyper{   r   )	rv   getattrr/   num_attention_headsrJ   arangeint64tofloat)r%   r{   r   rk   baser`   attention_factorrf   s           r<   rw   z9ModernBertRotaryEmbedding.compute_default_rope_parametersw   s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r=   c                 N   t        | | d      }t        | | d      }|d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d	      5  |j                         |j                         z  j                  dd
      }	t        j                  |	|	fd      }
|
j                         |z  }|
j                         |z  }d d d        j	                  |j                        j	                  |j                        fS # 1 sw Y   AxY w)Nrl   ro   r   r^   r!   mpscpuF)device_typeenabledrR   r_   r   )r   r   expandshaper   r{   
isinstancetypestrr    	transposerJ   catcossinr   )r:   xposition_idsrk   rf   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   s                r<   rD   z!ModernBertRotaryEmbedding.forward   sl    4J<y!9:#DZL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	0&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')//C'')//C		0 vvAGGv$cff177f&;;;	0 	0s   *A1FF$rB   NNNN)rF   rG   rH   rJ   rL   __annotations__r"   r,   staticmethodr   rU   r   tupler   rw   no_gradr   rD   rM   rN   s   @r<   re   re   ]   s    llU/ U. *.+/"!%	!* 4'!*(!* t!* $J	!*
 
~u$	%!* !*F U]]_<  <r=   re   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }|#|d d d d d d d |j                  d   f   }	||	z   }t        j
                  j                  |dt         j                        j                  |j                        }t        j
                  j                  ||| j                        }t        j                  ||      }
|
j                  dd      j                         }
|
|fS )NrR   r	   r^   )r`   r   )ptrainingr!   )rJ   matmulr   r   r   
functionalsoftmaxfloat32r   r   r   r   
contiguous)r   r   r   r   r   r   r   kwargsattn_weightscausal_maskattn_outputs              r<   eager_attention_forwardr      s     <<s}}Q':;gEL!$Q1o		"o%=>#k1==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r=   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr^   rR   r_   )r   rJ   r   )r   x1x2s      r<   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r=   rotary_pos_embc                 b   | j                   }|j                  |      }|j                  |      }| j                         |z  t        | j                               |z  z   }|j                         |z  t        |j                               |z  z   }|j	                  |      |j	                  |      fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r   	unsqueezer   r   r   )qkr   r   unsqueeze_dimoriginal_dtypeq_embedk_embeds           r<   apply_rotary_pos_embr      s    & WWN
--
&C
--
&Cwwy3;qwwy#9C#?@Gwwy3;qwwy#9C#?@G::n%wzz.'AAAr=   c                        e Zd ZdZddededz  f fdZ	 	 ddej                  de	ej                  ej                  f   dz  dej                  dz  d	e
e   d
e	ej                  ej                  dz  f   f
dZ xZS )ModernBertAttentiona  Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
    Nr%   	layer_idxc                 R   t         |           || _        || _        |j                  |j
                  z  dk7  r&t        d|j                   d|j
                   d      |j                  | _        |j                  | _        |j                  |j
                  z  | _	        t        j                  |j                  d| j                  z  |j
                  z  |j                        | _        |j                  |   dk(  r|j                  dz   | _        nd | _        d	| _        t        j                  |j                  |j                  |j                        | _        |j                  d
kD  r%t        j$                  |j                        | _        y t        j&                         | _        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r	   rS   sliding_attentionr!   F        )r+   r,   r%   r   r/   r   
ValueErrorattention_dropoutdeterministic_flash_attnr   r   rT   attention_biasWqkvru   sliding_window	is_causalr\   r6   Identityout_dropr:   r%   r   r;   s      r<   r,   zModernBertAttention.__init__   sz   " : ::a?#F$6$6#77mnt  oI  oI  nJ  JK  L  "(!9!9(.(G(G%**f.H.HHIIDMM 1F4N4N NU[UjUj
	 i(,?? #)"7"7!";D"&D))F..0B0BI^I^_@F@X@X[^@^

6#;#;<dfdododqr=   rC   position_embeddingsr   r   r@   c                    |j                   d d }| j                  |      } |j                  g |dd| j                   }|j	                  d      \  }}}	|j                  dd      }|j                  dd      }|	j                  dd      }	|\  }
}t        |||
|d      \  }}t        }| j                  j                  dk7  rt        | j                  j                     } || |||	|f| j                  r| j                  nd	| j                  d
z  | j                  | j                  d|\  }} |j                  g |d j!                         }| j#                  | j%                  |            }||fS )Nr^   r	   r_   r!   rR   )r   eagerr         )r   r   r   deterministic)r   r   viewr   unbindr   r   r   r%   _attn_implementationr   r   r   r   r   reshaper   r   r\   )r:   rC   r   r   r   input_shapeqkvquery_states
key_statesvalue_statesr   r   attention_interfacer   r   s                  r<   rD   zModernBertAttention.forward  s    $))#2.ii&chh::Q::DMM:141C.j,#--a3))!Q/
#--a3&S#7jRUWZjk#l j5;;++w6"9$++:Z:Z"[$7%
 /3mmD**MM4'..77%
 %
!\ *k));;;;FFHmmDGGK$89L((r=   rB   rE   )rF   rG   rH   rI   r"   rU   r,   rJ   rL   r   r   r   rD   rM   rN   s   @r<   r   r      s    r/ rC$J r@ IM.2	')||') #5<<#=>E') t+	')
 +,') 
u||U\\D00	1')r=   r   c                        e Zd Zddededz  f fdZ	 	 ddej                  dej                  dz  dej                  dz  dee	   d	ej                  f
d
Z
 xZS )ModernBertEncoderLayerNr%   r   c                    t         |           || _        || _        |dk(  rt	        j
                         | _        n;t	        j                  |j                  |j                  |j                        | _        t        ||      | _        t	        j                  |j                  |j                  |j                        | _        t        |      | _        |j                   |   | _        y )Nr   r(   )r%   r   )r+   r,   r%   r   r   r   	attn_normr2   r/   r3   r4   r   attnmlp_normrP   mlpru   attention_typer   s      r<   r,   zModernBertEncoderLayer.__init__:  s    ">[[]DN\\&*<*<&//X^XhXhiDN'vK	V%7%7V__SYScScd ($00;r=   rC   r   r   r   r@   c                      | j                   | j                  |      f||d|\  }}||z   }|| j                  | j                  |            z   }|S )N)r   r   )r   r   r   r   )r:   rC   r   r   r   r   _s          r<   rD   zModernBertEncoderLayer.forwardG  sg     #NN=)
 3)
 	
Q &3%}1M(NNr=   rB   rE   )rF   rG   rH   r"   rU   r,   rJ   rL   r   r   rD   rM   rN   s   @r<   r   r   9  sx    </ <C$J <  /337	|| t+ #\\D0	
 +, 
r=   r   c                        e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZeedZ ej                          dej$                  fd       Z	 dd	ed
z  dedef fdZ xZS )ModernBertPreTrainedModelr%   modelTr$   r   )rC   
attentionsr   c                    | j                   j                  ddt        j                  dt        ffd}| j                   j
                  | j                   j
                  t        j                  d| j                   j                  z        z  | j                   j
                  | j                   j                  dz  d}t        |t              r ||j                  |d          y t        |t              r- ||j                  |d	           ||j                  |d
          y t        |t               r- ||j"                  |d	           ||j                  |d
          y t        |t$              r ||j&                  |d
          y t        |t(              r ||j*                  |d
          y t        |t,        t.        t0        t2        f      r ||j4                  |d          y t        |t        j6                        rLt9        j:                  |j<                         |j>                   t9        j@                  |j>                         y y t        |tB              r|jD                  D ]  }|jF                  }|jH                  |   dk7  rtJ        |jH                  |      } ||j                   |      \  }}t9        jL                  tO        || d      |       t9        jL                  tO        || d      |        y y )Nr	   r   stdc                     t        j                  | j                  d| |z  |z         t        | t        j
                        r-| j                   t        j                  | j                         y y y )Nr   )meanr   ab)inittrunc_normal_weightr   r   rT   r*   zeros_)r   r   cutoff_factors     r<   init_weightz<ModernBertPreTrainedModel._init_weights.<locals>.init_weighto  sd     .3&#% &")),;;*KK, + -r=   g       @r   )inout	embedding	final_outr	  r  r  r
  ri   rj   rl   rn   )(r%   initializer_cutoff_factorr   Moduler   initializer_rangemathsqrtnum_hidden_layersr/   r   r$   r1   rP   rX   r\   r   r   ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassificationModernBertForMultipleChoice ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierr2   r  ones_r  r*   r  re   ru   rw   rh   r   copy_r   )	r:   r   r  stdsrk   r}   r~   r   r  s	           @r<   _init_weightsz'ModernBertPreTrainedModel._init_weightsi  sd   == M	-		 	- 	- ++//;;00499S4;;C`C`=`3aa6600$6	
 f23--tK/@A.		4:.		4;/ 34T$Z0		4;/ 89d5k2 56U43+0.	
 ))4+<=-JJv}}%{{&FKK( ' 9:$00 ^
%EE##J/9<#6v7G7G
7S#TL#/*#U q

76j\+CDmT

76j\9K+LM}]^ ;r=   attn_implementationNis_init_checkr@   c                     	 |dn|}t         |   ||      S # t        t        f$ r t         |   ||      cY S w xY w)zR
        Checks and dispatches to hhe requested attention implementation.
        flash_attention_2)r  r  )r+   %_check_and_adjust_attn_implementationr   ImportError)r:   r  r  requested_attn_implementationr;   s       r<   r"  z?ModernBertPreTrainedModel._check_and_adjust_attn_implementation  sh    	CVC^,?dw)7@$AQ^ A   K( 	7@$7} A  	s     ==)F)rF   rG   rH   r"   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsrJ   r   r   r  r  r   boolr"  rM   rN   s   @r<   r   r   Y  s    &*#/1IJN"& 0)
 U]]_:^BII :^ :^z FK#&:>B	 r=   r   c                        e Zd Zdef fdZd Zd Zee	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
ee   defd              Z xZS )ModernBertModelr%   c           	         t         |   |       || _        t        |      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _
        t        j                  |j                  |j                  |j                        | _        t!        |      | _        d| _        | j'                          y c c}w )Nr(   )r%   F)r+   r,   r%   r$   
embeddingsr   
ModuleListranger  r   layersr2   r/   r3   r4   
final_normre   
rotary_embgradient_checkpointing	post_initr   s      r<   r,   zModernBertModel.__init__  s     .v6mmHMfNfNfHgh9#FI6h
 ,,v'9'9vU[UeUef36B&+# is   Cc                 .    | j                   j                  S rB   r1  r1   r:   s    r<   get_input_embeddingsz$ModernBertModel.get_input_embeddings  s    ---r=   c                 &    || j                   _        y rB   r:  )r:   r   s     r<   set_input_embeddingsz$ModernBertModel.set_input_embeddings  s    ).&r=   Nr>   r   r   r?   r   r@   c                    |d u |d uz  rt        d      ||j                  d   n|j                  d   }||j                  n|j                  }|&t        j                  ||      j                  d      }| j                  ||      }t        |x}	t              s'| j                  ||d}
t        d
i |
t        d
i |
d}	i }| j                  j                  D ]  }| j                  |||      ||<    | j                  D ](  } ||f|	|j                     ||j                     d|}* | j!                  |      }t#        |	      S )Nz:You must specify exactly one of input_ids or inputs_embedsr!   r{   r   )r>   r?   )r%   input_embedsr   )full_attentionr   )r   r   )last_hidden_state )r   r   r{   rJ   r   r   r1  r   dictr%   r   r   ru   r6  r4  r   r5  r   )r:   r>   r   r   r?   r   r   r{   rC   attention_mask_mappingmask_kwargsr   rk   encoder_layers                 r<   rD   zModernBertModel.forward  sw    -t";<YZZ,9,E-%%a(9??[\K]%.%:!!@T@T <<?II!LL)=YNB0DI++ -"0K #<"Jk"J%M%\P[%\&"
 !++11 	gJ.2oom\[e.f
+	g "[[ 	M)5m6R6RS$78T8T$U 	M	 6??r=   r   )rF   rG   rH   r"   r,   r<  r>  r   r   rJ   rK   rL   r   r   r   rD   rM   rN   s   @r<   r/  r/    s    
/ 
./  .2.204-1,@##d*,@ t+,@ &&-	,@
 ||d*,@ +,,@ 
,@  ,@r=   r/  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )r  r%   c                 J   t         |           || _        t        j                  |j
                  |j
                  |j                        | _        t        |j                     | _
        t        j                  |j
                  |j                  |j                        | _        y )Nr(   )r+   r,   r%   r   rT   r/   classifier_biasr  r   classifier_activationrZ   r2   r3   r4   r5   r9   s     r<   r,   z!ModernBertPredictionHead.__init__  sq    YYv1163E3EvG]G]^
&667LL!3!3vO_O_`	r=   rC   r@   c                 `    | j                  | j                  | j                  |                  S rB   )r5   rZ   r  )r:   rC   s     r<   rD   z ModernBertPredictionHead.forward  s#    yy$**]";<==r=   )	rF   rG   rH   r"   r,   rJ   rL   rD   rM   rN   s   @r<   r  r    s-    a/ a>U\\ >ell >r=   r  zd
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    )custom_introc                   >    e Zd ZddiZdef fdZd Zdej                  fdZ	e
e	 	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dee   deej                     ez  fd              Z xZS )r  zdecoder.weightz&model.embeddings.tok_embeddings.weightr%   c                 t   t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                  |j                        | _        | j                  j                  | _        | j                  j                  | _        | j                          y )NrS   )r+   r,   r%   r/  r   r  headr   rT   r/   r.   decoder_biasr  sparse_predictionsparse_pred_ignore_indexr8  r9   s     r<   r,   zModernBertForMaskedLM.__init__  s     $V,
,V4	yy!3!3V5F5FVM`M`a!%!>!>(,(L(L% 	r=   c                     | j                   S rB   r  r;  s    r<   get_output_embeddingsz+ModernBertForMaskedLM.get_output_embeddings  s    ||r=   new_embeddingsc                     || _         y rB   rV  )r:   rX  s     r<   set_output_embeddingsz+ModernBertForMaskedLM.set_output_embeddings"  s	    %r=   Nr>   r   r   r?   labelsr   r@   c                     | j                   d||||d|}|d   }| j                  rK|I|j                  d      }|j                  |j                  d   d      }|| j                  k7  }	||	   }||	   }| j                  | j                  |            }
d }|* | j                  |
|fd| j                  j                  i|}t        ||
|j                  |j                        S )Nr>   r   r   r?   r   r^   r.   losslogitsrC   r   rD  )r   rS  r   r   rT  r  rQ  loss_functionr%   r.   r   rC   r   )r:   r>   r   r   r?   r[  r   outputsrC  mask_tokensr`  r_  s               r<   rD   zModernBertForMaskedLM.forward%  s	    $** 
)%'	

 
 $AJ!!f&8[[_F 1 6 6v||A K !D$A$AAK 1+ >K(Fdii(9:;%4%%ffbAWAWb[abD!//))	
 	
r=   NNNNN)rF   rG   rH   _tied_weights_keysr"   r,   rW  r   rT   rZ  r   r   rJ   rK   rL   r   r   r   r   rD   rM   rN   s   @r<   r  r  
  s     +,TU/ &BII &  .2.2,0-1&*'
##d*'
 t+'
 llT)	'

 ||d*'
 t#'
 +,'
 
u||	~	-'
  '
r=   r  z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c                       e Zd Zdef fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e
e   d
eej                     ez  fd              Z xZS )r  r%   c                 n   t         |   |       |j                  | _        || _        t	        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  |j                        | _        | j!                          y rB   )r+   r,   
num_labelsr%   r/  r   r  rQ  rJ   r   r6   classifier_dropoutr8   rT   r/   r  r8  r9   s     r<   r,   z,ModernBertForSequenceClassification.__init__W  s      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	r=   Nr>   r   r   r?   r[  r   r@   c                 d    | j                   d||||d|}|d   }| j                  j                  dk(  r
|dddf   }n| j                  j                  dk(  rw|=t        j                  |j
                  dd |j                  t        j                        }||j                  d      z  j                  d	
      |j                  d	d      z  }| j                  |      }	| j                  |	      }	| j                  |	      }
d}|| j                  j                  | j                  d	k(  rd| j                  _        nl| j                  d	kD  rL|j                  t        j                   k(  s|j                  t        j"                  k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt%               }| j                  d	k(  r& ||
j'                         |j'                               }n ||
|      }n| j                  j                  dk(  r=t)               } ||
j+                  d| j                        |j+                  d            }n,| j                  j                  dk(  rt-               } ||
|      }t/        ||
|j0                  |j2                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r]  r   clsNr   rR   r   r^   r!   r_   Tr`   keepdim
regressionsingle_label_classificationmulti_label_classificationr^  rD  )r   r%   classifier_poolingrJ   onesr   r{   r-  r   sumrQ  r8   r  problem_typerh  r   longrU   r   squeezer   r   r   r   rC   r   )r:   r>   r   r   r?   r[  r   rb  rC  pooled_outputr`  r_  loss_fcts                r<   rD   z+ModernBertForSequenceClassification.forwardd  se   " $** 
)%'	

 
 $AJ;;))U2 1!Q$ 7[[++v5%!&%++BQ/8I8P8PX]XbXb" "3^5M5Mb5Q!Q V V[\ V ]`n`r`rt as a ! 		"34		-0/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r=   rd  )rF   rG   rH   r"   r,   r   r   rJ   rK   rL   r   r   r   r   rD   rM   rN   s   @r<   r  r  Q  s    /   .2.2,0-1&*C
##d*C
 t+C
 llT)	C

 ||d*C
 t#C
 +,C
 
u||	7	7C
  C
r=   r  zv
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c                       e Zd Zdef fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e
e   d
eej                     ez  fd              Z xZS )r  r%   c                 `   t         |   |       |j                  | _        t        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y rB   r+   r,   rh  r/  r   r  rQ  rJ   r   r6   ri  r8   rT   r/   r  r8  r9   s     r<   r,   z)ModernBertForTokenClassification.__init__  s{      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	r=   Nr>   r   r   r?   r[  r   r@   c                 f    | j                   d||||d|}|d   }| j                  |      }| j                  |      }| j                  |      }	d}
|<t	               } ||	j                  d| j                        |j                  d            }
t        |
|	|j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        r]  r   Nr^   r^  rD  )
r   rQ  r8   r  r   r   rh  r   rC   r   )r:   r>   r   r   r?   r[  r   rb  rC  r`  r_  rx  s               r<   rD   z(ModernBertForTokenClassification.forward  s     $** 
)%'	

 
 $AJ II&78 II&78!23')HFKKDOO<fkk"oND$!//))	
 	
r=   rd  )rF   rG   rH   r"   r,   r   r   rJ   rK   rL   r   r   r   r   rD   rM   rN   s   @r<   r  r    s    
/ 
  .2.2,0-1&*$
##d*$
 t+$
 llT)	$

 ||d*$
 t#$
 +,$
 
u||	4	4$
  $
r=   r  c                       e Zd Zdef fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eej                     ez  fd              Z xZS )r  r%   c                 `   t         |   |       |j                  | _        t        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y rB   r{  r9   s     r<   r,   z'ModernBertForQuestionAnswering.__init__  sy      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJr=   Nr>   r   r   start_positionsend_positionsr   r@   c                     | j                   |f||d|}|d   }| j                  |      }| j                  |      }| j                  |      }	|	j	                  dd      \  }
}|
j                  d      j                         }
|j                  d      j                         }d }|| | j                  |
|||fi |}t        ||
||j                  |j                        S )N)r   r   r   r!   r^   r_   )r_  start_logits
end_logitsrC   r   )r   rQ  r8   r  splitrv  r   ra  r   rC   r   )r:   r>   r   r   r  r  r   rb  rC  r`  r  r  r_  s                r<   rD   z&ModernBertForQuestionAnswering.forward  s    $**
)%
 	
 $AJ II&78 II&78!23#)<<r<#: j#++B/::<''+668
&=+D%4%%lJQ^ibhiD+%!!//))
 	
r=   rd  )rF   rG   rH   r"   r,   r   r   rJ   rL   r   r   r   r   rD   rM   rN   s   @r<   r  r    s    	/ 	  *..2,0/3-1#
<<$&#
 t+#
 llT)	#

 ,#
 ||d*#
 +,#
 
u||	;	;#
  #
r=   r  z
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd Zdef fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e
e   d
eej                     ez  fd              Z xZS )r  r%   c                 8   t         |   |       || _        t        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  d      | _        | j                          y )Nr!   )r+   r,   r%   r/  r   r  rQ  rJ   r   r6   ri  r8   rT   r/   r  r8  r9   s     r<   r,   z$ModernBertForMultipleChoice.__init__"  so     $V,
,V4	HH$$V%>%>?	))F$6$6: 	r=   Nr>   r   r   r?   r[  r   r@   c                    ||j                   d   n|j                   d   }|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|1|j                  d|j                  d      |j                  d            nd} | j                  d||||d|}|d   }	| j                  j
                  dk(  rt        j                  |	j                   d   |	j                        }
|,|j                  d	      j                  |	j                        }n0t        j                  dt        j                  |	j                  
      }|	|
|f   }	nS| j                  j
                  dk(  r:|j                  dd      }|	|j                  d      z  j                  d	      |z  }	| j                  |	      }| j!                  |      }| j#                  |      }|j                  d|      }d}|t%        j&                         } |||      }t)        |||j*                  |j,                        S )a&  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
        Nr!   r^   r   r]  r   rk  r@  r_   )r   r{   r   Trl  r^  rD  )r   r   sizer   r%   rq  rJ   r   r{   argmaxr   tensorru  rs  r   rQ  r8   r  r   r   r   rC   r   )r:   r>   r   r   r?   r[  r   num_choicesrb  rC  	indices_0cls_masknum_non_pad_tokensrw  r`  reshaped_logitsr_  rx  s                     r<   rD   z#ModernBertForMultipleChoice.forward.  sn     -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 $** 
)%'	

 
 $AJ ;;))U2%6%<%<Q%?HYH`H`aI))00R08;;<M<T<TU !<<DUD\D\] 1)X2E F [[++v5!/!3!34!3!H!2^5M5Mb5Q!Q V V[\ V ]`r r		"34		-0/ ++b+6**,HOV4D("!//))	
 	
r=   rd  )rF   rG   rH   r"   r,   r   r   rJ   rK   rL   r   r   r   r   rD   rM   rN   s   @r<   r  r    s    
/ 
  .2.2,0-1&*C
##d*C
 t+C
 llT)	C

 ||d*C
 t#C
 +,C
 
u||	8	8C
  C
r=   r  )r/  r   r  r  r  r  r  )r   )r!   )Dr  collections.abcr   typingr   rJ   r   torch.nnr   r   r    r
   r  activationsr   integrationsr   r   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.genericr   r   r    configuration_modernbertr"   r  r$   rP   re   rL   r   r   r   r   r   r   r   r/  r  r  r  r  r  r  __all__rD  r=   r<   <module>r     sU  ,  $    A A & ! I ` 9  L F & 7 Q Q 6299 ,:BII :(N<		 N<p %II%<<% 
% <<	%
 LL4'% % %.( *+B ,B4 )*N)")) N) +N)b7 @ \ \ \~ A@/ A@ A@H	>ryy 	> 
?
5 ?

?
D 
S
*C S

S
l 
3
'@ 3

3
l 1
%> 1
 1
h 
R
"; R

R
jr=   