
    i^                        d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1  e"jd                  e3      Z4 G d de+      Z5 G d de,      Z6 G d dejn                        Z8 G d dejn                        Z9 G d d ejn                        Z: G d! d"ejn                        Z; G d# d$ejn                        Z< G d% d&e'      Z=e  G d' d(e*             Z>e  G d) d*e)             Z? G d+ d,e>e      Z@ G d- d.ee>      ZAg d/ZBy)0zPyTorch JetMoe model.    )CallableN)nn)
functional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask) GenericForSequenceClassification)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)OutputRecordercheck_model_inputs   )LlamaDecoderLayer)MixtralModelMixtralPreTrainedModelMixtralRMSNormMixtralRotaryEmbeddingapply_rotary_pos_embeager_attention_forwardload_balancing_loss_func   )JetMoeConfigc                       e Zd Zy)JetMoeRMSNormN__name__
__module____qualname__     s/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/jetmoe/modular_jetmoe.pyr%   r%   3       r+   r%   c                       e Zd Zy)JetMoeRotaryEmbeddingNr&   r*   r+   r,   r/   r/   7   r-   r+   r/   c                   6     e Zd Zdedededdf fdZd Z xZS )JetMoeParallelExpertsnum_experts
input_sizeoutput_sizereturnNc                     t         |           t        j                  t	        j
                  |||            | _        || _        || _        || _	        y)a  
        Initialize the JetMoeParallelExperts module.
        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
        used in vllm.

        Args:
            num_experts (int):
                Number of experts.
            input_size (int):
                Size of the input.
            output_size (int):
                Size of the output.
        N)
super__init__r   	Parametertorchemptyweightr2   r3   r4   )selfr2   r3   r4   	__class__s       r,   r8   zJetMoeParallelExperts.__init__<   sD    " 	ll5;;{K#TU&$&r+   c                     |j                  |d      }g }t        | j                        D ]7  }|j                  t	        j
                  ||   | j                  |                9 t        j                  |d      }|S )a  
        Forward pass of the JetMoeParallelExperts module.

        Args:
            inputs (Tensor):
                Input tensor.
            expert_size:
                Expert size information.

        Returns:
            Tensor: Output tensor.
        r   dim)	splitranger2   appendFlinearr<   r:   cat)r=   inputsexpert_size
input_listoutput_listiresultss          r,   forwardzJetMoeParallelExperts.forwardS   sq     \\+1\5
t''( 	HAqxx
1t{{1~FG	H))KQ/r+   r'   r(   r)   intr8   rN   __classcell__r>   s   @r,   r1   r1   ;   s)    'C 'S 's 't '.r+   r1   c                   2     e Zd Zdededef fdZd Z xZS )JetMoeTopKGatingr3   r2   top_kc                     t         |           || _        || _        || _        t        j                  ||d      | _        y)a  
        Initialize the top-k gating mechanism.

        Args:
            input_size (`int`):
                Size of the input.
            num_experts (`int`):
                Number of experts.
            top_k (`int`):
                Number of top experts to select.
        FbiasN)r7   r8   r2   r3   rU   r   Linearlayer)r=   r3   r2   rU   r>   s       r,   r8   zJetMoeTopKGating.__init__i   s:     	&$
YYz;UC
r+   c                    | j                  |      j                         }|j                  | j                  d      \  }}t	        j
                  |d      j                  |      }t	        j                  |j                  d      | j                  g|j                  |j                        }|j                  d|d      }|j                         j                  d      }|j                         }|j!                         }	|	j#                  d      \  }
}|j%                  | j                  d      }|j!                         }||   }|||||fS )Nr"   r@   r   dtypedevicetrunc)rounding_mode)rZ   floattopkrU   r:   softmaxtype_aszerossizer2   r]   r^   scatterlongsumtolistflattensortdiv)r=   hidden_stateslogitstop_k_logitstop_k_indicestop_k_gatesre   gatesrI   top_k_experts_index_sorted_expertsbatch_indexbatch_gatess                 r,   rN   zJetMoeTopKGating.forward}   s.   M*002&,kk$**!k&D#mmmLa8@@O a $"2"23;;L;LU`UgUg
 a2jjl&&q) "((* &--/"/"4"4Q"7*..tzz.Q "))+!"67#[+{FRRr+   rO   rR   s   @r,   rT   rT   h   s'    D3 DS D D(Sr+   rT   c                   .     e Zd ZdZdef fdZd Z xZS )	JetMoeMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                 @   t         |           |j                  | _        |j                  | _        t
        |j                     | _        t        j                  j                  t        j                  | j                              | _        t        |j                  | j                  | j                  dz        | _        t        |j                  | j                  | j                        | _        t#        | j                  |j                  |j$                        | _        y )Nr   r3   r2   rU   )r7   r8   hidden_sizer3   intermediate_sizer   activation_function
activationr:   r   r9   r;   rX   r1   num_local_expertsinput_linearoutput_linearrT   num_experts_per_tokrouterr=   r{   r>   s     r,   r8   zJetMoeMoE.__init__   s     ,,!33 !;!;<HH&&u{{4??'CD	1&2J2JDOO]a]m]mpq]qr263K3KTM]M]_c_n_no&00,,
r+   c                 8   |j                         \  }}}|j                  d|      }| j                  |      \  }}}}}	||   }
| j                  |
|      }|j	                  dd      }| j                  |d         |d   z  }| j                  ||      }||dddf   z  }t        j                  ||z  | j                  f|j                  |j                        }|j                  d||      }|j                  ||| j                        }|| j                  z   }|S )a  
        Forward pass of the mixture of experts layer.

        Args:
            layer_input (Tensor):
                Input tensor.

        Returns:
            Tensor:
                Output tensor.
            Tensor:
                Router logits.
        r   r@   r   r"   Nr\   )rf   reshaper   r   chunkr   r   r:   re   r3   r]   r^   	index_addviewrX   )r=   layer_inputbszlengthemb_sizeru   rw   rx   rI   router_logitsexpert_inputsrn   chunked_hidden_statesexpert_outputsre   layer_outputs                   r,   rN   zJetMoeMoE.forward   s%    !, 0 0 2VX!))"h7BF++kBZ?;[-#K0))-E - 3 3A2 3 >(=a(@ADYZ[D\\++M;G'+ag*>>S6\4??;>CWCW`n`u`uvq+~F#((fdooF#dii/r+   )r'   r(   r)   __doc__r#   r8   rN   rQ   rR   s   @r,   rz   rz      s    
| 
 r+   rz   c                   :     e Zd ZdZdef fdZd Zd Zd Z xZ	S )	JetMoeMoAz
    A Sparsely gated mixture of attention layer with pairs of query- and output-projections as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    r{   c                 h   t         |           |j                  | _        |j                  | _        |j                  |j                  z  | _        |j                  | _	        t        j                  j                  t        j                  | j
                              | _        t        | j                  | j
                  | j                        | _        t        | j                  | j                  | j
                        | _        t%        | j
                  | j                  | j                        | _        y )Nr}   )r7   r8   r   r2   r~   r3   kv_channelsnum_key_value_headsr   rU   r:   r   r9   r;   rX   r1   r   r   rT   r   r   s     r,   r8   zJetMoeMoA.__init__   s    !33 ,,!--0J0JJ//
HH&&u{{4??'CD	1$2B2BDOOUYUeUef243C3CTEUEUW[WfWfg&((**
r+   c                    |j                         \  }}}|j                  d|      }| j                  |      \  }}}}}	||||f}
||   }| j                  ||      }t	        j
                  ||z  | j                  z  | j                  f|j                  |j                        }|j                  d||      }|j                  ||| j                  d      }||	|
fS )z
        Map inputs to attention experts according to routing decision and compute query projection inside each experts.
        r   r\   r   )rf   r   r   r   r:   re   rU   r~   r]   r^   r   r   )r=   r   r   r   r   rv   rw   rx   rI   r   	topo_infor   r   re   r   s                  r,   mapzJetMoeMoA.map   s     !, 0 0 2VX!))"h7UYU`U`alUmRk;]);[Q	 $K0**=+F 6\DJJ&(8(89AUAU^l^s^s
 q*>O#((fdjj"E]I55r+   c                    |j                         \  }}}}|j                  d|      }|\  }}}	}
||   }| j                  ||
      }||	dddf   z  }t        j                  ||z  | j
                  f|j                  |j                        }|j                  d||      }|j                  ||| j
                        }|| j                  z   }|S )zu
        Compute output projection inside each attention experts and merge the outputs of different experts.
        r   Nr\   r   )rf   r   r   r:   re   r3   r]   r^   r   r   rX   )r=   r   r   r   r   kr~   rv   rw   rx   rI   r   r   re   r   s                  r,   reducezJetMoeMoA.reduce  s     '2&6&6&8#VQ!))"k:FOCk; $$89++M;G (+ag*>> S6\4??;>CWCW`n`u`uvq+~F#((fdooF#dii/r+   c                     t        d      )Nz-This module doesn't support call and forward.)NotImplementedError)r=   r   s     r,   rN   zJetMoeMoA.forward  s    !"QRRr+   )
r'   r(   r)   r   r#   r8   r   r   rN   rQ   rR   s   @r,   r   r      s$    
| 
$6.,Sr+   r   c                        e Zd ZdZddededz  f fdZ	 	 	 	 ddej                  dej                  dz  dej                  dz  d	e
dz  d
ej                  dz  deej                  ej                  dz  eej                     dz  f   fdZ xZS )JetMoeAttentionzH
    Multi-headed attention from 'Attention Is All You Need' paper.
    Nr{   	layer_idxc                 b   t         |           || _        || _        d| _        |-t
        j                  d| j                  j                   d       d| _	        |j                  | _        |j                  | _        |j                  |j                  z  | _        |j                  | _        |j                   | _        |j                  | _        | j$                  dz  | _        t)        |      | _        t,        j.                  j1                  |j2                  | j                  dz  d	      | _        y)
z
        Initialize the JetMoeAttention module.

        Args:
            config:
                Configuration object with model hyperparameters.
            layer_idx:
                Index of the layer in the model.
        TNzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.r"   g      r   FrW   )r7   r8   r{   r   	is_causalloggerwarning_oncer>   r'   num_key_value_groupsr   rU   attention_dropoutr   r   kv_projection_sizenum_attention_heads	num_headshead_dimscalingr   expertsr:   r   rY   r~   kv_projr=   r{   r   r>   s      r,   r8   zJetMoeAttention.__init__$  s    	" !8!8 9 :, , %&!//
!'!9!9"("4"4v7Q7Q"Q#)#=#= 33**}}d* (xxv'9'94;R;RUV;V]bcr+   rn   attention_maskposition_embeddingspast_key_valuescache_positionr5   c                    |j                   d d }g |d| j                  }| j                  j                  |      \  }	}
}| j	                  |      j                  dd      \  }}|	j                  |      j                  dd      }	|j                  |      j                  dd      }|j                  |      j                  dd      }|\  }}t        |	|||      \  }	}|'|||d}|j                  ||| j                  |      \  }}t        j                  | j                  j                  t              }|j!                  d| j"                  dd      }|j!                  d| j"                  dd      } || |	|||f| j$                  sdn| j&                  | j(                  d|\  }} |j                  g || j"                  d }| j                  j+                  ||      } |j                  g |d }|||
fS )Nr   r   r@   r"   )sincosr           )dropoutr   )shaper   r   r   r   r   r   	transposer   updater   r   get_interfacer{   _attn_implementationr    repeatrU   trainingr   r   r   )r=   rn   r   r   r   r   kwargsinput_shapehidden_shapequery_statesr   r   
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightss                       r,   rN   zJetMoeAttention.forwardE  s    $))#2.88b8$--8151A1A-1P.mY#'<<#>#D#DQB#D#O 
L#((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
  &&q$**a;
#**1djj!Q?$7	%
  $}}C$2H2HLL	%
 	%
!\ 'k&&DDTZZDDll))+yA&k&&88R8L-77r+   N)NNNN)r'   r(   r)   r   r#   rP   r8   r:   Tensor
LongTensorr	   tuplerN   rQ   rR   s   @r,   r   r     s    d| dd
 dH /37;(,2628||28 t+28 #--4	28
 28 ((4/28 
u||U\\D0%2E2LL	M28r+   r   c                   *    e Zd Zddededz  f fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
ej                  dz  deej                  ej                  f   dz  dee   dej                  fdZ xZS )JetMoeDecoderLayerNr{   r   c                     t         |   ||       t        |j                        | _        t        ||      | _        t        |j                        | _        t        |      | _	        | `
y r   )r7   r8   r%   r~   input_layernormr   self_attentionpost_attention_layernormrz   mlp	self_attnr   s      r,   r8   zJetMoeDecoderLayer.__init__{  sX    +,V-?-?@-fi@(5f6H6H(I%V$Nr+   rn   r   position_idsr   	use_cacher   r   r   r5   c                     |}	| j                  |      } | j                  d|||||||d|\  }}
}
|	|z   }|}	| j                  |      }| j                  |      }|	|z   }|S )N)rn   r   r   r   r   r   r   r*   )r   r   r   r   )r=   rn   r   r   r   r   r   r   r   residualru   s              r,   rN   zJetMoeDecoderLayer.forward  s     !,,];1d11 	
')%+) 3	
 	
q! !=0 !55mD/ =0r+   r   )NNNFNN)r'   r(   r)   r#   rP   r8   r:   r   r   r	   boolr   r   r   rN   rQ   rR   s   @r,   r   r   z  s    | d
  /304(,!&26HL|| t+ &&-	
  $; ((4/ #5<<#=>E +, 
r+   r   c                       e Zd ZU  eej
                  dd      e eed      dZe	e
d<   dZdZd	gZd
gZdZdZdZ ej&                         d        Zy)JetMoePreTrainedModelgater"   )
layer_nameindex)r   )r   rn   
attentionsr{   modelFr   r   Tc                 ,   t        j                  | |       t        |t              r7t	        j
                  |j                  d| j                  j                         yt        |t        t        z        r t	        j                  |j                         yy)zInitialize the weights.r   )meanstdN)r   _init_weights
isinstancer1   initnormal_r<   r{   initializer_ranger   rz   zeros_rX   )r=   modules     r,   r   z#JetMoePreTrainedModel._init_weights  sa     	%%dF3f34LLSdkk6S6ST	I 56KK$ 7r+   N)r'   r(   r)   r   r   rY   r   r   _can_record_outputsr#   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphr:   no_gradr   r*   r+   r,   r   r     sz     (		fAN+$_A>
 &+#-.#4"5N"U]]_% %r+   r   c                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  dej                  dz  d	edz  d
ej                  dz  dee   defd              Z xZS )JetMoeModelr{   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        |j                  | _        t        |j                  |j                         | _        y c c}w )N)eps)r7   r8   pad_token_idpadding_idx
vocab_sizer   	Embeddingr~   embed_tokens
ModuleListrC   num_hidden_layersr   layersr   r%   rms_norm_epsnormr   s      r,   r8   zJetMoeModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammDI&JbJbDcdy	2d
 %+$?$?!!&"4"4&:M:MN	 es   C*N	input_idsr   r   r   inputs_embedsr   r   r   r5   c                 D   |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }
|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f||
||||d|} | j                  |      }t        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embeds)r{   r   r"   )r^   )r{   input_embedsr   r   r   r   )r   r   r   r   r   r   )last_hidden_stater   )
ValueErrorr
   r{   r  get_seq_lengthr:   aranger   r^   	unsqueezer   
rotary_embr  r  r	  r   )r=   r
  r   r   r   r  r   r   r   past_seen_tokenscausal_maskrn   r   decoder_layers                 r,   rN   zJetMoeModel.forward  s`    -t";<YZZ0*$++>O  --i8M!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;&))+%
 & #oom\J![[)H4;;+H+HI 
	M)	$7* /#-)	 	M
	 		-0%++
 	
r+   )NNNNNNN)r'   r(   r)   r#   r8   r   r   r:   r   r   r	   FloatTensorr   r   r   r   rN   rQ   rR   s   @r,   r   r     s    
O| 
O  .2.204(,26!%26;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
 $;;
 ((4/;
 +,;
 
 ;
  ;
r+   r   c                   L    e Zd ZddiZ fdZee	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  d	ej                  dz  d
ej                  dz  dedz  dej                  dz  deej                  z  dedz  defd              Z xZS )JetMoeForCausalLMzlm_head.weightzmodel.embed_tokens.weightc                 ,   t         |   |       t        |      | _        |j                  | _        |j
                  | _        t        j                  |j                  |j                  d      | _	        |j                  | _
        | j                          y )NFrW   )r7   r8   r   r   r  aux_loss_coefr   rY   r~   lm_headtie_word_embeddings	post_initr   s     r,   r8   zJetMoeForCausalLM.__init__  sq      (
 ++#11yy!3!3V5F5FUS#)#=#=  	r+   Nr
  r   r   r   r  labelsr   r   logits_to_keepoutput_router_logitsr5   c                 L    | j                   d|||||||d|}|j                  }t        |	t              rt	        |	 d       n|	}| j                  |d d |d d f         }d }|* | j                  ||fd| j                  j                  i|}d }|
rYt        |j                  | j                  | j                  |      }|+|| j                  |j                  |j                        z  z  }t!        ||||j"                  |j$                  |j&                  |j                        S )N)r
  r   r   r   r  r   r   r  )lossaux_lossro   r   rn   r   r   r*   )r   r  r   rP   slicer  loss_functionr{   r  r!   r   r2   r   r  tor^   r   r   rn   r   )r=   r
  r   r   r   r  r  r   r   r   r!  r   outputsrn   slice_indicesro   r#  r$  s                     r,   rN   zJetMoeForCausalLM.forward  sQ     +5$** 	+
)%+')	+
 	+
  118B>SV8W~ot4]kmA}a,?@A%4%%  ;;11 	D /%%  ((	H !**X[[-EEE(#33!//))!//
 	
r+   )
NNNNNNNNr   F)r'   r(   r)   _tied_weights_keysr8   r   r   r:   r   r   r	   r  r   rP   r   rN   rQ   rR   s   @r,   r  r    s   *,GH	  .2.204(,26*.!%26-.,1:
##d*:
 t+:
 &&-	:

 :
 ((4/:
   4':
 $;:
 ((4/:
 ell*:
 #Tk:
 
#:
  :
r+   r  c                       e Zd Zy)JetMoeForSequenceClassificationNr&   r*   r+   r,   r,  r,  Z  s    r+   r,  )r  r   r   r,  )Cr   collections.abcr   r:   r   torch.nnr   rE    r   r   activationsr   cache_utilsr	   r
   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   llama.modeling_llamar   mixtral.modeling_mixtralr   r   r   r   r   r    r!   configuration_jetmoer#   
get_loggerr'   r   r%   r/   Moduler1   rT   rz   r   r   r   r   r   r  r,  __all__r*   r+   r,   <module>r@     sR    $   $ & ! . ) / R F & R R ? 4   / 
		H	%	N 		2 	*BII *Z.Sryy .Sb7		 7tIS		 ISXX8bii X8v(* (V %2 % %2 J
, J
 J
ZJ
- J
Z d&FH] c kr+   