
    iH                        d dl Z d dlmZmZ d dlZd dlmZ d dlmZmZm	Z	 ddl
mZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4  e,jj                  e6      Z7 G d de      Z8 G d dejr                        Z: G d dejr                        Z; G d de3      Z< ed      d9d       Z= ee=       G d d ejr                               Z> G d! d"e      Z?e+ G d# d$e&             Z@e+ G d% d&e@             ZA G d' d(ejr                        ZB e+d)*       G d+ d,e@             ZC e+d-*       G d. d/e@             ZD e+d0*       G d1 d2e@             ZEe+ G d3 d4e@             ZF e+d5*       G d6 d7e@             ZGg d8ZHy):    N)LiteralOptional)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)PreTrainedConfiglayer_type_validation)use_kernel_func_from_hubuse_kernelized_func)create_bidirectional_mask(create_bidirectional_sliding_window_mask)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSRopeParameters)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuplecheck_model_inputs   )eager_attention_forward)Gemma3RotaryEmbeddingrotate_halfc            F       8    e Zd ZdZdZdgZdddZ fdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d2d	edz  d
edz  dedz  dedz  dedz  de	dz  dedz  de
dz  de
dz  de
dz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  de
dz  dee	   dz  deed   ef   dz  dedz  de
dz  d edz  d!e
dz  d"edz  d#ed$   d%e
dz  d&edz  d'e	dz  d(edz  d)edz  d*edz  d+edz  d,edz  fD fd-Zd3d.Z fd/Zed0        Zej*                  d1        Z xZS )4ModernBertConfigaO  
    This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate an ModernBert
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the ModernBERT-base.
    e.g. [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 50368):
            Vocabulary size of the ModernBert model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`ModernBertModel`]
        hidden_size (`int`, *optional*, defaults to 768):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 22):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer decoder.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu"`
            if not specified.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
            The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        norm_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the normalization layers.
        pad_token_id (`int`, *optional*, defaults to 50283):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 50282):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 50281):
            Beginning of stream token id.
        cls_token_id (`int`, *optional*, defaults to 50281):
            Classification token id.
        sep_token_id (`int`, *optional*, defaults to 50282):
            Separation token id.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        rope_parameters (`dict`, *optional*):
            Dictionary mapping attention patterns (`"full_attention"`, `"sliding_attention"`) to `RopeParameters`.
            Each value should be a dictionary containing `rope_type` and optional scaling parameters.
        local_attention (`int`, *optional*, defaults to 128):
            The window size for local attention.
        embedding_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the MLP layers.
        mlp_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the MLP layers.
        decoder_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the decoder layers.
        classifier_pooling (`str`, *optional*, defaults to `"cls"`):
            The pooling method for the classifier. Should be either `"cls"` or `"mean"`. In local attention layers, the
            CLS token doesn't attend to all tokens on long sequences.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        classifier_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the classifier.
        classifier_activation (`str`, *optional*, defaults to `"gelu"`):
            The activation function for the classifier.
        deterministic_flash_attn (`bool`, *optional*, defaults to `False`):
            Whether to use deterministic flash attention. If `False`, inference will be faster but not deterministic.
        sparse_prediction (`bool`, *optional*, defaults to `False`):
            Whether to use sparse prediction for the masked language model instead of returning the full dense logits.
        sparse_pred_ignore_index (`int`, *optional*, defaults to -100):
            The index to ignore for the sparse prediction.
        reference_compile (`bool`, *optional*):
            Whether to compile the layers of the model which were compiled during pretraining. If `None`, then parts of
            the model will be compiled if 1) `triton` is installed, 2) the model is not on MPS, 3) the model is not
            shared between devices, and 4) the model is not resized after initialization. If `True`, then the model may
            be faster in some scenarios. This argument is deprecated and will be removed in a future version.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    Examples:

    ```python
    >>> from transformers import ModernBertModel, ModernBertConfig

    >>> # Initializing a ModernBert style configuration
    >>> configuration = ModernBertConfig()

    >>> # Initializing a model from the modernbert-base style configuration
    >>> model = ModernBertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
modernbertpast_key_valuesg     Ag     @)globallocalc                 b    |dk(  r|t         j                  d       d }t        |   ||       y )Nreference_compilezThe `reference_compile` argument is deprecated and will be removed in `transformers v5.2.0`Use `torch.compile()` directly on the model instead.)loggerwarning_oncesuper__setattr__)selfnamevalue	__class__s      {/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/modernbert/modular_modernbert.pyr2   zModernBertConfig.__setattr__   s;    &&5+<G ED%(    N
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headshidden_activationmax_position_embeddingsinitializer_rangeinitializer_cutoff_factornorm_eps	norm_biaspad_token_ideos_token_idbos_token_idcls_token_idsep_token_idattention_biasattention_dropoutlayer_typesrope_parametersfull_attentionsliding_attentionlocal_attentionembedding_dropoutmlp_biasmlp_dropoutdecoder_biasclassifier_poolingclsmeanclassifier_dropoutclassifier_biasclassifier_activationdeterministic_flash_attnsparse_predictionsparse_pred_ignore_indexr.   tie_word_embeddingsc#                 b   || _         || _        || _        || _        || _        |"| _        || _        || _        || _        || _	        || _
        || _        || _        |	| _        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        | | _        |!| _        | j0                  dvrtA        d| j0                   d      || _!        |#jE                  dd      | _#        | jB                  BtI        | j                        D $cg c]  }$tK        |$| jF                  z        rdnd  c}$| _!        tM        | jB                  | j                         || _'        tQ        %|   di |# y c c}$w )	NrV   zQInvalid value for `classifier_pooling`, should be either "cls" or "mean", but is .global_attn_every_n_layersr	   rO   rN    )*rD   rF   rE   rG   rH   r_   r9   r?   r:   r;   r<   r=   r@   rA   rB   rC   rI   rJ   r>   rP   rQ   rR   rS   rT   rU   rY   rZ   r[   r\   r]   r^   r.   
ValueErrorrK   getrb   rangeboolr   rL   r1   __init__)&r3   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rP   rQ   rR   rS   rT   rU   rY   rZ   r[   r\   r]   r^   r.   r_   kwargsir6   s&                                        r7   rh   zModernBertConfig.__init__   s   L )((((#6 $'>$&!2!2#6 !2)B& ",!2!2.!2 &("4"4.%:"(@%!2(@%!2""/9cdhd{d{c||}~  ' +1**5QST*U'# t556  (,A0O0O,O'P#Vff D 	d..0F0FG."6" s   
#F,c                    |j                  dd       }ddiddid}| j                  | j                  n|| _        |<| j                  d   j                  |       | j                  d   j                  |       | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d| j
                  d	                | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d
| j
                  d                | j                          | j                  |       |S )Nrope_scaling	rope_typedefault)rO   rN   rN   rO   
rope_thetaglobal_rope_thetar+   local_rope_thetar,   )ignore_keys)poprL   updatere   
setdefaultdefault_thetastandardize_rope_paramsvalidate_rope)r3   ignore_keys_at_rope_validationri   rl   default_rope_paramss        r7   convert_rope_params_to_dictz,ModernBertConfig.convert_rope_params_to_dict   su   zz.$7
 #.y!9*I6
 8<7K7K7Wt33]p#  !1299,G  !45<<\J ##$45=6A95MD  !12-.99&**%8$:L:LX:VW	
 ##$78@9Di8PD  !4501<<&**%79K9KG9TU	

 	$$&'EFr8   c                 H    t         |          }|j                  dd        |S )Nr.   )r1   to_dictrs   )r3   outputr6   s     r7   r}   zModernBertConfig.to_dict  s#    "

&-r8   c                      | j                   dz  S )zKHalf-window size: `local_attention` is the total window, so we divide by 2.r#   rP   r3   s    r7   sliding_windowzModernBertConfig.sliding_window#  s     ##q((r8   c                     |dz  | _         y)z<Set sliding_window by updating local_attention to 2 * value.r#   Nr   r3   r5   s     r7   r   zModernBertConfig.sliding_window(  s      %qyr8   )"i  i   i        gelui    g{Gz?       @gh㈵>Fik  j  i  r   r   F        NN   r   Fr   TrW   r   Fr   FFiNTN)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencerv   r2   intstrfloatrg   listdictr   r   rh   r{   r}   propertyr   setter__classcell__r6   s   @r7   r(   r(   1   s   cJ J#4"5(8<M) "'"%(,(**,(..2*.25!%!&#(#(#(#(#(&+*-(,gk&)*- %$'$(5:+.',,205)./3)-+/GY#$JY# 4ZY# :	Y#
 :Y# !4ZY# :Y# "%tY# !4<Y# $)4<Y# $,Y# $;Y# DjY# DjY# DjY#  Dj!Y#" Dj#Y#$ t%Y#& !4<'Y#( #Y%)Y#* g&KLn\]`dd+Y#, t-Y#. !4</Y#0 +1Y#2 T\3Y#4 Tk5Y#6 $M27Y#8 "DL9Y#: ;Y#<  #Tz=Y#> #'+?Y#@  $;AY#B #&*CY#D  $;EY#F "D[GY#v>
 ) ) ) )r8   r(   c                        e Zd ZdZdef fdZ	 d	dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )
ModernBertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    configc                 d   t         |           || _        t        j                  |j
                  |j                  |j                        | _        t        j                  |j                  |j                  |j                        | _        t        j                  |j                        | _        y )N)padding_idxepsbias)r1   rh   r   r   	Embeddingr9   r:   rD   tok_embeddings	LayerNormrB   rC   normDropoutrQ   dropr3   r   r6   s     r7   rh   zModernBertEmbeddings.__init__3  sw     ll6+<+<f>P>P^d^q^qrLL!3!3vO_O_`	JJv778	r8   N	input_idsinputs_embedsreturnc                     |"| j                  | j                  |            }|S | j                  | j                  | j                  |                  }|S r   )r   r   r   )r3   r   r   hidden_statess       r7   forwardzModernBertEmbeddings.forward:  sS     $ IIdii&>?M  !IIdii0C0CI0N&OPMr8   NN)r   r   r   r   r(   rh   torch
LongTensorTensorr   r   r   s   @r7   r   r   .  sR    9/ 9 _c))D0HMW[H[	r8   r   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )ModernBertMLPa6  Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    r   c                    t         |           || _        t        j                  |j
                  t        |j                        dz  |j                        | _	        t        |j                     | _        t        j                  |j                        | _        t        j                  |j                  |j
                  |j                        | _        y )Nr#   r   )r1   rh   r   r   Linearr:   r   r;   rR   Wir   r>   actr   rS   r   Wor   s     r7   rh   zModernBertMLP.__init__K  s    ))F..F4L4L0MPQ0QX^XgXgh&223JJv112	))F44f6H6Hv_r8   r   r   c                     | j                  |      j                  dd      \  }}| j                  | j                  | j	                  |      |z              S )Nr#   dim)r   chunkr   r   r   )r3   r   inputgates       r7   r   zModernBertMLP.forwardS  sI    ggm,221"2=twwtyy%4!7899r8   )
r   r   r   r   r(   rh   r   r   r   r   r   s   @r7   r   r   D  s2    `/ `:U\\ :ell :r8   r   c                   |     e Zd Zddef fdZe	 	 	 	 ddedz  ded   dedz  dedz  de	d	e
f   f
 fd
       Z xZS )ModernBertRotaryEmbeddingNr   c                 &    t         |   ||       y r   )r1   rh   )r3   r   devicer6   s      r7   rh   z"ModernBertRotaryEmbedding.__init__Y  s    (r8   r   ztorch.deviceseq_len
layer_typer   ztorch.Tensorc                 (    t         |   | |||      S r   )r1   compute_default_rope_parameters)r   r   r   r   r6   s       r7   r   z9ModernBertRotaryEmbedding.compute_default_rope_parameters\  s     w6vvwPZ[[r8   r   NNNN)r   r   r   r(   rh   staticmethodr   r   r   tupler   r   r   r   s   @r7   r   r   X  s    )/ ) *.+/"!%	\ 4'\(\ t\ $J	\
 
~u$	%\ \r8   r   rotary_pos_embc                 b   | j                   }|j                  |      }|j                  |      }| j                         |z  t        | j                               |z  z   }|j                         |z  t        |j                               |z  z   }|j	                  |      |j	                  |      fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )dtype	unsqueezer   r&   to)qkcossinunsqueeze_dimoriginal_dtypeq_embedk_embeds           r7   apply_rotary_pos_embr   f  s    & WWN
--
&C
--
&Cwwy3;qwwy#9C#?@Gwwy3;qwwy#9C#?@G::n%wzz.'AAAr8   c                        e Zd ZdZddededz  f fdZ	 	 ddej                  de	ej                  ej                  f   dz  dej                  dz  d	e
e   d
e	ej                  ej                  dz  f   f
dZ xZS )ModernBertAttentiona  Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
    Nr   	layer_idxc                 R   t         |           || _        || _        |j                  |j
                  z  dk7  r&t        d|j                   d|j
                   d      |j                  | _        |j                  | _        |j                  |j
                  z  | _	        t        j                  |j                  d| j                  z  |j
                  z  |j                        | _        |j                  |   dk(  r|j                  dz   | _        nd | _        d	| _        t        j                  |j                  |j                  |j                        | _        |j                  d
kD  r%t        j$                  |j                        | _        y t        j&                         | _        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r	   r   rO      Fr   )r1   rh   r   r   r:   r=   rd   rJ   r\   head_dimr   r   rI   WqkvrK   r   	is_causalr   r   Identityout_dropr3   r   r   r6   s      r7   rh   zModernBertAttention.__init__  sz   " : ::a?#F$6$6#77mnt  oI  oI  nJ  JK  L  "(!9!9(.(G(G%**f.H.HHIIDMM 1F4N4N NU[UjUj
	 i(,?? #)"7"7!";D"&D))F..0B0BI^I^_@F@X@X[^@^

6#;#;<dfdododqr8   r   position_embeddingsattention_maskri   r   c                    |j                   d d }| j                  |      } |j                  g |dd| j                   }|j	                  d      \  }}}	|j                  dd      }|j                  dd      }|	j                  dd      }	|\  }
}t        |||
|d      \  }}t        }| j                  j                  dk7  rt        | j                  j                     } || |||	|f| j                  r| j                  nd	| j                  d
z  | j                  | j                  d|\  }} |j                  g |d j!                         }| j#                  | j%                  |            }||fS )Nr   r	   r   r   r#   )r   eagerr         )dropoutscalingr   deterministic)shaper   viewr   unbind	transposer   r$   r   _attn_implementationr   trainingrJ   r   r\   reshape
contiguousr   r   )r3   r   r   r   ri   input_shapeqkvquery_states
key_statesvalue_statesr   r   attention_interfaceattn_outputattn_weightss                  r7   r   zModernBertAttention.forward  s    $))#2.ii&chh::Q::DMM:141C.j,#--a3))!Q/
#--a3&S#7jRUWZjk#l j5;;++w6"9$++:Z:Z"[$7%
 /3mmD**MM4'..77%
 %
!\ *k));;;;FFHmmDGGK$89L((r8   r   r   )r   r   r   r   r(   r   rh   r   r   r   r   r   r   r   r   s   @r7   r   r     s    r/ rC$J r@ IM.2	')||') #5<<#=>E') t+	')
 +,') 
u||U\\D00	1')r8   r   c                        e Zd Zddededz  f fdZ	 	 ddej                  dej                  dz  dej                  dz  dee	   d	ej                  f
d
Z
 xZS )ModernBertEncoderLayerNr   r   c                    t         |           || _        || _        |dk(  rt	        j
                         | _        n;t	        j                  |j                  |j                  |j                        | _        t        ||      | _        t	        j                  |j                  |j                  |j                        | _        t        |      | _        |j                   |   | _        y )Nr   r   )r   r   )r1   rh   r   r   r   r   	attn_normr   r:   rB   rC   r   attnmlp_normr   mlprK   attention_typer   s      r7   rh   zModernBertEncoderLayer.__init__  s    ">[[]DN\\&*<*<&//X^XhXhiDN'vK	V%7%7V__SYScScd ($00;r8   r   r   r   ri   r   c                      | j                   | j                  |      f||d|\  }}||z   }|| j                  | j                  |            z   }|S )N)r   r   )r  r  r	  r  )r3   r   r   r   ri   r  _s          r7   r   zModernBertEncoderLayer.forward  sg     #NN=)
 3)
 	
Q &3%}1M(NNr8   r   r   )r   r   r   r(   r   rh   r   r   r   r   r   r   r   s   @r7   r  r    sx    </ <C$J <  /337	|| t+ #\\D0	
 +, 
r8   r  c                        e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZeedZ ej                          dej$                  fd       Z	 dd	ed
z  dedef fdZ xZS )ModernBertPreTrainedModelr   modelTr   r  )r   
attentionsmodulec                    | j                   j                  ddt        j                  dt        ffd}| j                   j
                  | j                   j
                  t        j                  d| j                   j                  z        z  | j                   j
                  | j                   j                  dz  d}t        |t              r ||j                  |d          y t        |t              r- ||j                  |d	           ||j                  |d
          y t        |t               r- ||j"                  |d	           ||j                  |d
          y t        |t$              r ||j&                  |d
          y t        |t(              r ||j*                  |d
          y t        |t,        t.        t0        t2        f      r ||j4                  |d          y t        |t        j6                        rLt9        j:                  |j<                         |j>                   t9        j@                  |j>                         y y t        |tB              r|jD                  D ]  }|jF                  }|jH                  |   dk7  rtJ        |jH                  |      } ||j                   |      \  }}t9        jL                  tO        || d      |       t9        jL                  tO        || d      |        y y )Nr	   r  stdc                     t        j                  | j                  d| |z  |z         t        | t        j
                        r-| j                   t        j                  | j                         y y y )Nr   )rX   r  ab)inittrunc_normal_weight
isinstancer   r   r   zeros_)r  r  cutoff_factors     r7   init_weightz<ModernBertPreTrainedModel._init_weights.<locals>.init_weight	  sd     .3&#% &")),;;*KK, + -r8   r   r   )inout	embedding	final_outr   r  r  r!  rn   )r   	_inv_freq_original_inv_freq)(r   rA   r   Moduler   r@   mathsqrtr<   r:   r  r   r   r   r   r   r   r   ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassificationModernBertForMultipleChoice ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierr   r  ones_r  r   r  r   rK   r   rm   r   copy_getattr)	r3   r  r  stdsr   rope_init_fncurr_inv_freqr  r  s	           @r7   _init_weightsz'ModernBertPreTrainedModel._init_weights  sd   == M	-		 	- 	- ++//;;00499S4;;C`C`=`3aa6600$6	
 f23--tK/@A.		4:.		4;/ 34T$Z0		4;/ 89d5k2 56U43+0.	
 ))4+<=-JJv}}%{{&FKK( ' 9:$00 ^
%EE##J/9<#6v7G7G
7S#TL#/*#U q

76j\+CDmT

76j\9K+LM}]^ ;r8   attn_implementationNis_init_checkr   c                     	 |dn|}t         |   ||      S # t        t        f$ r t         |   ||      cY S w xY w)zR
        Checks and dispatches to hhe requested attention implementation.
        flash_attention_2)r7  r8  )r1   %_check_and_adjust_attn_implementationrd   ImportError)r3   r7  r8  requested_attn_implementationr6   s       r7   r;  z?ModernBertPreTrainedModel._check_and_adjust_attn_implementation@  sh    	CVC^,?dw)7@$AQ^ A   K( 	7@$7} A  	s     ==)F)r   r   r   r(   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr  r   _can_record_outputsr   no_gradr   r$  r6  r   rg   r;  r   r   s   @r7   r  r    s    &*#/1IJN"& 0)
 U]]_:^BII :^ :^z FK#&:>B	 r8   r  c                        e Zd Zdef fdZd Zd Zee	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
ee   defd              Z xZS )ModernBertModelr   c           	         t         |   |       || _        t        |      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _
        t        j                  |j                  |j                  |j                        | _        t!        |      | _        d| _        | j'                          y c c}w )Nr   )r   F)r1   rh   r   r   
embeddingsr   
ModuleListrf   r<   r  layersr   r:   rB   rC   
final_normr   
rotary_embgradient_checkpointing	post_initr   s      r7   rh   zModernBertModel.__init__U  s     .v6mmHMfNfNfHgh9#FI6h
 ,,v'9'9vU[UeUef36B&+# is   Cc                 .    | j                   j                  S r   rK  r   r   s    r7   get_input_embeddingsz$ModernBertModel.get_input_embeddingsa  s    ---r8   c                 &    || j                   _        y r   rS  r   s     r7   set_input_embeddingsz$ModernBertModel.set_input_embeddingsd  s    ).&r8   Nr   r   position_idsr   ri   r   c                    |d u |d uz  rt        d      ||j                  d   n|j                  d   }||j                  n|j                  }|&t        j                  ||      j                  d      }| j                  ||      }t        |x}	t              s'| j                  ||d}
t        d
i |
t        d
i |
d}	i }| j                  j                  D ]  }| j                  |||      ||<    | j                  D ](  } ||f|	|j                     ||j                     d|}* | j!                  |      }t#        |	      S )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r   r   )r   input_embedsr   rM   )r   r   )last_hidden_staterc   )rd   r   r   r   aranger   rK  r  r   r   r   r   rK   rO  rM  r
  rN  r   )r3   r   r   rW  r   ri   r   r   r   attention_mask_mappingmask_kwargsr   r   encoder_layers                 r7   r   zModernBertModel.forwardg  sw    -t";<YZZ,9,E-%%a(9??[\K]%.%:!!@T@T <<?II!LL)=YNB0DI++ -"0K #<"Jk"J%M%\P[%\&"
 !++11 	gJ.2oom\[e.f
+	g "[[ 	M)5m6R6RS$78T8T$U 	M	 6??r8   r   )r   r   r   r(   rh   rT  rV  r"   r   r   r   r   r   r   r   r   r   r   s   @r7   rI  rI  S  s    
/ 
./  .2.204-1,@##d*,@ t+,@ &&-	,@
 ||d*,@ +,,@ 
,@  ,@r8   rI  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )r'  r   c                 J   t         |           || _        t        j                  |j
                  |j
                  |j                        | _        t        |j                     | _
        t        j                  |j
                  |j                  |j                        | _        y )Nr   )r1   rh   r   r   r   r:   rZ   r(  r   r[   r   r   rB   rC   r   r   s     r7   rh   z!ModernBertPredictionHead.__init__  sq    YYv1163E3EvG]G]^
&667LL!3!3vO_O_`	r8   r   r   c                 `    | j                  | j                  | j                  |                  S r   )r   r   r(  )r3   r   s     r7   r   z ModernBertPredictionHead.forward  s#    yy$**]";<==r8   )	r   r   r   r(   rh   r   r   r   r   r   s   @r7   r'  r'    s-    a/ a>U\\ >ell >r8   r'  zd
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    )custom_introc                   >    e Zd ZddiZdef fdZd Zdej                  fdZ	e
e	 	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dee   deej                     ez  fd              Z xZS )r)  zdecoder.weightz&model.embeddings.tok_embeddings.weightr   c                 t   t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                  |j                        | _        | j                  j                  | _        | j                  j                  | _        | j                          y )Nr   )r1   rh   r   rI  r  r'  headr   r   r:   r9   rT   r*  r]   r^   rQ  r   s     r7   rh   zModernBertForMaskedLM.__init__  s     $V,
,V4	yy!3!3V5F5FVM`M`a!%!>!>(,(L(L% 	r8   c                     | j                   S r   r*  r   s    r7   get_output_embeddingsz+ModernBertForMaskedLM.get_output_embeddings  s    ||r8   new_embeddingsc                     || _         y r   rh  )r3   rj  s     r7   set_output_embeddingsz+ModernBertForMaskedLM.set_output_embeddings  s	    %r8   Nr   r   rW  r   labelsri   r   c                     | j                   d||||d|}|d   }| j                  rK|I|j                  d      }|j                  |j                  d   d      }|| j                  k7  }	||	   }||	   }| j                  | j                  |            }
d }|* | j                  |
|fd| j                  j                  i|}t        ||
|j                  |j                        S )Nr   r   rW  r   r   r   r9   losslogitsr   r  rc   )r  r]   r   r   r^   r*  rf  loss_functionr   r9   r   r   r  )r3   r   r   rW  r   rm  ri   outputsr[  mask_tokensrr  rq  s               r7   r   zModernBertForMaskedLM.forward  s	    $** 
)%'	

 
 $AJ!!f&8[[_F 1 6 6v||A K !D$A$AAK 1+ >K(Fdii(9:;%4%%ffbAWAWb[abD!//))	
 	
r8   NNNNN)r   r   r   _tied_weights_keysr(   rh   ri  r   r   rl  r!   r   r   r   r   r   r   r   r   r   r   r   s   @r7   r)  r)    s     +,TU/ &BII &  .2.2,0-1&*'
##d*'
 t+'
 llT)	'

 ||d*'
 t#'
 +,'
 
u||	~	-'
  '
r8   r)  z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c                       e Zd Zdef fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e
e   d
eej                     ez  fd              Z xZS )r+  r   c                 n   t         |   |       |j                  | _        || _        t	        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  |j                        | _        | j!                          y r   )r1   rh   
num_labelsr   rI  r  r'  rf  r   r   r   rY   r   r   r:   r/  rQ  r   s     r7   rh   z,ModernBertForSequenceClassification.__init__  s      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	r8   Nr   r   rW  r   rm  ri   r   c                 d    | j                   d||||d|}|d   }| j                  j                  dk(  r
|dddf   }n| j                  j                  dk(  rw|=t        j                  |j
                  dd |j                  t        j                        }||j                  d      z  j                  d	
      |j                  d	d      z  }| j                  |      }	| j                  |	      }	| j                  |	      }
d}|| j                  j                  | j                  d	k(  rd| j                  _        nl| j                  d	kD  rL|j                  t        j                   k(  s|j                  t        j"                  k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt%               }| j                  d	k(  r& ||
j'                         |j'                               }n ||
|      }n| j                  j                  dk(  r=t)               } ||
j+                  d| j                        |j+                  d            }n,| j                  j                  dk(  rt-               } ||
|      }t/        ||
|j0                  |j2                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        ro  r   rW   NrX   r#   )r   r   r   r   r   Tr   keepdim
regressionsingle_label_classificationmulti_label_classificationrp  rc   )r  r   rU   r   onesr   r   rg   r   sumrf  r   r/  problem_typerz  r   longr   r   squeezer   r   r   r   r   r  )r3   r   r   rW  r   rm  ri   rt  r[  pooled_outputrr  rq  loss_fcts                r7   r   z+ModernBertForSequenceClassification.forward  se   " $** 
)%'	

 
 $AJ;;))U2 1!Q$ 7[[++v5%!&%++BQ/8I8P8PX]XbXb" "3^5M5Mb5Q!Q V V[\ V ]`n`r`rt as a ! 		"34		-0/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r8   rv  )r   r   r   r(   rh   r!   r   r   r   r   r   r   r   r   r   r   r   s   @r7   r+  r+    s    /   .2.2,0-1&*C
##d*C
 t+C
 llT)	C

 ||d*C
 t#C
 +,C
 
u||	7	7C
  C
r8   r+  zv
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c                       e Zd Zdef fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e
e   d
eej                     ez  fd              Z xZS )r-  r   c                 `   t         |   |       |j                  | _        t        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   r1   rh   rz  rI  r  r'  rf  r   r   r   rY   r   r   r:   r/  rQ  r   s     r7   rh   z)ModernBertForTokenClassification.__init__L  s{      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	r8   Nr   r   rW  r   rm  ri   r   c                 f    | j                   d||||d|}|d   }| j                  |      }| j                  |      }| j                  |      }	d}
|<t	               } ||	j                  d| j                        |j                  d            }
t        |
|	|j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        ro  r   Nr   rp  rc   )
r  rf  r   r/  r   r   rz  r   r   r  )r3   r   r   rW  r   rm  ri   rt  r[  rr  rq  r  s               r7   r   z(ModernBertForTokenClassification.forwardX  s     $** 
)%'	

 
 $AJ II&78 II&78!23')HFKKDOO<fkk"oND$!//))	
 	
r8   rv  )r   r   r   r(   rh   r!   r   r   r   r   r   r   r   r   r   r   r   s   @r7   r-  r-  F  s    
/ 
  .2.2,0-1&*$
##d*$
 t+$
 llT)	$

 ||d*$
 t#$
 +,$
 
u||	4	4$
  $
r8   r-  c                       e Zd Zdef fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eej                     ez  fd              Z xZS )r.  r   c                 `   t         |   |       |j                  | _        t        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   r  r   s     r7   rh   z'ModernBertForQuestionAnswering.__init__  sy      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJr8   Nr   r   rW  start_positionsend_positionsri   r   c                     | j                   |f||d|}|d   }| j                  |      }| j                  |      }| j                  |      }	|	j	                  dd      \  }
}|
j                  d      j                         }
|j                  d      j                         }d }|| | j                  |
|||fi |}t        ||
||j                  |j                        S )N)r   rW  r   r   r   r   )rq  start_logits
end_logitsr   r  )r  rf  r   r/  splitr  r   rs  r   r   r  )r3   r   r   rW  r  r  ri   rt  r[  rr  r  r  rq  s                r7   r   z&ModernBertForQuestionAnswering.forward  s    $**
)%
 	
 $AJ II&78 II&78!23#)<<r<#: j#++B/::<''+668
&=+D%4%%lJQ^ibhiD+%!!//))
 	
r8   rv  )r   r   r   r(   rh   r!   r   r   r   r   r   r   r   r   r   r   s   @r7   r.  r.    s    	/ 	  *..2,0/3-1#
<<$&#
 t+#
 llT)	#

 ,#
 ||d*#
 +,#
 
u||	;	;#
  #
r8   r.  z
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd Zdef fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e
e   d
eej                     ez  fd              Z xZS )r,  r   c                 8   t         |   |       || _        t        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  d      | _        | j                          y )Nr   )r1   rh   r   rI  r  r'  rf  r   r   r   rY   r   r   r:   r/  rQ  r   s     r7   rh   z$ModernBertForMultipleChoice.__init__  so     $V,
,V4	HH$$V%>%>?	))F$6$6: 	r8   Nr   r   rW  r   rm  ri   r   c                    ||j                   d   n|j                   d   }|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|1|j                  d|j                  d      |j                  d            nd} | j                  d||||d|}|d   }	| j                  j
                  dk(  rt        j                  |	j                   d   |	j                        }
|,|j                  d	      j                  |	j                        }n0t        j                  dt        j                  |	j                  
      }|	|
|f   }	nS| j                  j
                  dk(  r:|j                  dd      }|	|j                  d      z  j                  d	      |z  }	| j                  |	      }| j!                  |      }| j#                  |      }|j                  d|      }d}|t%        j&                         } |||      }t)        |||j*                  |j,                        S )a&  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
        Nr   r   ro  r   rW   rY  r   )r   r   rX   Tr|  rp  rc   )r   r   sizer  r   rU   r   r\  r   argmaxr   tensorr  r  r   rf  r   r/  r   r   r   r   r  )r3   r   r   rW  r   rm  ri   num_choicesrt  r[  	indices_0cls_masknum_non_pad_tokensr  rr  reshaped_logitsrq  r  s                     r7   r   z#ModernBertForMultipleChoice.forward  sn     -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 $** 
)%'	

 
 $AJ ;;))U2%6%<%<Q%?HYH`H`aI))00R08;;<M<T<TU !<<DUD\D\] 1)X2E F [[++v5!/!3!34!3!H!2^5M5Mb5Q!Q V V[\ V ]`r r		"34		-0/ ++b+6**,HOV4D("!//))	
 	
r8   rv  )r   r   r   r(   rh   r!   r   r   r   r   r   r   r   r   r   r   r   s   @r7   r,  r,    s    
/ 
  .2.2,0-1&*C
##d*C
 t+C
 llT)	C

 ||d*C
 t#C
 +,C
 
u||	8	8C
  C
r8   r,  )r(   rI  r  r)  r+  r-  r.  r,  )r   )Ir%  typingr   r   r   r   torch.nnr   r   r    r
   r  activationsr   configuration_utilsr   r   integrationsr   r   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r    utils.genericr!   r"   align.modeling_alignr$   gemma3.modeling_gemma3r%   r&   
get_loggerr   r/   r(   r$  r   r   r   r   r   r  r  rI  r'  r)  r+  r-  r.  r,  __all__rc   r8   r7   <module>r     s     $   A A & ! J I ` 9  G F & @ @ A : G 
		H	%z)' z)z299 ,:BII :(\ 5 \ *+B ,B4 )*N)")) N) +N)b7 @ \ \ \~ A@/ A@ A@H	>ryy 	> 
?
5 ?

?
D 
S
*C S

S
l 
3
'@ 3

3
l 1
%> 1
 1
h 
R
"; R

R
j	r8   