
    iF                     \   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z*  ejV                  e,      Z- G d de$      Z. G d de       Z/ G d de(      Z0 G d dejb                        Z2 G d dejf                        Z4 G d  d!ejb                        Z5 G d" d#e"      Z6 G d$ d%e      Z7 G d& d'e      Z8e G d( d)e8             Z9 G d* d+e#e8e      Z:g d,Z;y)-zPyTorch AFMoE model.    )CallableN)nn   )initialization)CacheDynamicCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)GradientCheckpointingLayer)MoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)check_model_inputs   )GptOssRMSNorm)LlamaAttentionLlamaForCausalLMLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)Qwen2MoeMLP   )AfmoeConfigc                       e Zd Zy)AfmoeRotaryEmbeddingN__name__
__module____qualname__     q/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/afmoe/modular_afmoe.pyr    r    .       r&   r    c                       e Zd Zy)AfmoeRMSNormNr!   r%   r&   r'   r*   r*   2   r(   r&   r*   c                       e Zd Zy)AfmoeMLPNr!   r%   r&   r'   r,   r,   6   r(   r&   r,   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )AfmoeTokenChoiceRouterz
    Token-choice top-K router for MoE routing.

    This router assigns each token to the top-K experts based on sigmoid scores, matching the released checkpoints.
    c                     t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  |j                  |j
                  d      | _
        y NFbias)super__init__confignum_experts_per_toktop_knum_expertsroute_scaler   Linearhidden_sizegateselfr5   	__class__s     r'   r4   zAfmoeTokenChoiceRouter.__init__A   s^    //
!--!--IIf00&2D2D5Q	r&   hidden_statesexpert_biasc                    |j                   \  }}}|j                  d|      }t        j                  | j	                  |      j                  t        j                              }t        j                  ||z   | j                  d      \  }}|j                  d|      }|j                  dd      dz   }||z  }|| j                  z  }||fS )Nr   )kdim)rE   indexT)rE   keepdimg#B;)shapeviewtorchsigmoidr<   tofloat32topkr7   gathersumr9   )	r>   r@   rA   _
hidden_dimscoresselected_experts
top_scoresdenominators	            r'   forwardzAfmoeTokenChoiceRouter.forwardI   s    (..1j%**2z:tyy7::5==IJ#jj+)=QRS]]q0@]A
 nnTn:UB+-
$"2"22
+++r&   )	r"   r#   r$   __doc__r4   rJ   TensorrW   __classcell__r?   s   @r'   r.   r.   :   s)    R,U\\ , ,r&   r.   c                        e Zd ZdZdef fdZdej                  dej                  dej                  dej                  fdZ xZ	S )	AfmoeExpertsz
    Container holding the routed experts.

    This mirrors the Experts pattern used across other MoE models to ease checkpoint conversion.
    r5   c                     t         |           |j                  | _        |j                  | _        t        | j                        D ](  }| j                  t        ||j                               * y )N)intermediate_size)	r3   r4   r6   r7   r8   rangeappendr,   moe_intermediate_size)r>   r5   rQ   r?   s      r'   r4   zAfmoeExperts.__init__^   s^    //
!--t''( 	ZAKK6;W;WXY	Zr&   r@   rT   routing_weightsreturnc                 0   |j                   \  }}}|dk(  r|j                  |d|      S |j                  d|      }|j                   d   }t        j                  |j                   d   |j
                  t        j                        j                  |      }	|j                  d      }
|j                  d      }t        j                  |
d      }|	|   }	|
|   }
||   }|j                  d|	      }t        j                  |      }t        j                  |
d      \  }}d}t        |j                         |j                               D ]'  \  }}|dk(  r||z   }||| } | |   |      }|||| |}) |j                  t        j                         |j#                  d      z  j                  |j$                        }t        j                  |      }|	j#                  d      j'                  |      }|j)                  d||       |j                  |||      S )z
        Args:
            hidden_states: (batch, seq, hidden)
            selected_experts: (batch, seq, top_k)
            routing_weights: (batch, seq, top_k)
        r   rC   )devicedtypeT)stable)return_counts)rH   	new_zerosrI   rJ   arangerf   longrepeat_interleavereshapeargsortindex_select
zeros_likeunique_consecutiveziptolistrL   rM   	unsqueezerg   	expand_asscatter_add_)r>   r@   rT   rc   
batch_sizeseq_lenrR   hidden_states_flatr7   token_indicesexpert_indicessortingdispatched_tokensexpert_outputsunique_expertscountsstart	expert_idcountendexpert_inputexpert_outputweighted_outputs
aggregatedscatter_indicess                            r'   rW   zAfmoeExperts.forwarde   s    +8*=*='
GZa< **:q*EE*//J? &&r* $$Q'0D0DEJJ


E
" 	 *11"5)11"5--t<%g.'0)'2.;;A}M))*;<!&!9!9.X\!] #N$9$9$;V]]_ M 	Iuz%-C,U37L+DOL9M(5N5%E	 +--emm<?X?XY[?\\``anatatu%%&89
'11"5??@PQ?4DEz7J??r&   )
r"   r#   r$   rX   r   r4   rJ   rY   rW   rZ   r[   s   @r'   r]   r]   W   sR    Z{ Z-@"\\-@=B\\-@\a\h\h-@	-@r&   r]   c                   (     e Zd ZdZ fdZd Z xZS )AfmoeMoEz
    Mixture of Experts (MoE) module for AFMoE.

    This module implements a sparse MoE layer with both shared experts (always active) and
    routed experts (activated based on token-choice routing).
    c                 2   t         |           || _        t        |      | _        t        ||j                  |j                  z        | _        t        |      | _
        t        j                  t        j                  |j                        d      | _        y )NF)requires_grad)r3   r4   r5   r.   routerr,   rb   num_shared_expertsshared_expertsr]   expertsr   	ParameterrJ   zerosr8   rA   r=   s     r'   r4   zAfmoeMoE.__init__   sp    ,V4&vv/K/KfNgNg/gh#F+<<F4F4F(GW\]r&   c                    |j                   \  }}}|j                  d|      }| j                  || j                        \  }}|j                  ||| j                  j
                        }|j                  ||| j                  j
                        }| j                  |      j                  |||      }| j                  |||      }	||	z   S )NrC   )rH   rI   r   rA   r5   r6   r   r   )
r>   r@   rx   ry   rR   rz   rU   rT   shared_outputrouted_outputs
             r'   rW   zAfmoeMoE.forward   s    *7*=*='
GZ*//J? (,{{=$BRBR'S$
$__Z$++:Y:YZ
+00WdkkFeFef ++,>?DDZQXZde]4DjQ},,r&   )r"   r#   r$   rX   r4   rW   rZ   r[   s   @r'   r   r      s    ^-r&   r   c                       e Zd ZdZdedef fdZ	 	 ddej                  de	ej                  ej                  f   dej                  dz  d	e
dz  d
ej                  dz  dee   de	ej                  ej                  f   fdZ xZS )AfmoeAttentionaJ  
    Multi-headed attention module with optional sliding window and gating.

    This attention mechanism supports both full attention and sliding window attention,
    and includes Q/K normalization and gating of the output. It inherits from [`LlamaAttention`] to minimize the amount
    of custom logic we need to maintain.
    r5   	layer_idxc                    t         |   ||       |j                  |   dk(  | _        | j                  r|j                  nd | _        t        | j                  |j                        | _        t        | j                  |j                        | _	        t        j                  |j                  |j                  | j                  z  d      | _        y )Nsliding_attentionepsFr1   )r3   r4   layer_typesis_local_attentionsliding_windowr*   head_dimrms_norm_epsq_normk_normr   r:   r;   num_attention_heads	gate_projr>   r5   r   r?   s      r'   r4   zAfmoeAttention.__init__   s    + #)"4"4Y"?CV"V7;7N7Nf33TX"4==f6I6IJ"4==f6I6IJ6#5#5v7Q7QTXTaTa7ahmnr&   Nr@   position_embeddingsattention_maskpast_key_valuecache_positionkwargsrd   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      }	| j	                  |      j                  |      }
| j                  |      j                  |      }| j                  |      }| j                  |	      j                  dd      }	| j                  |
      j                  dd      }
|j                  dd      }| j                  r|\  }}t        |	|
||      \  }	}
|%d|i}|j                  |
|| j                  |      \  }
}t        j                  | j                   j"                  t$              } || |	|
|f|| j&                  sdn| j(                  | j*                  | j,                  d|\  }} |j                  g |d j/                         }|t1        j2                  |      z  }| j5                  |      }||fS )NrC   r   r   r   g        )r   dropoutscalingr   )rH   r   q_projrI   k_projv_projr   r   	transposer   r   r   updater   r   get_interfacer5   _attn_implementationr   trainingattention_dropoutr   r   
contiguousrJ   rK   o_proj)r>   r@   r   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesgate_statescossincache_kwargsattention_interfaceoutputattn_weightsattn_outputs                       r'   rW   zAfmoeAttention.forward   s    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|Dnn]3{{<0::1a@[[,66q!<
#--a3""*HC';L*VY[^'_$L*%,n=L'5'<'<ZW[WeWegs't$J(?(M(MKK,,.E)
  3	
 

 *#}}C$2H2HLL..
 
 
 
 .k.2.99;%--44kk&)L((r&   )NN)r"   r#   r$   rX   r   intr4   rJ   rY   tupler   
LongTensorr   r   rW   rZ   r[   s   @r'   r   r      s    	o{ 	os 	o  (,260)||0) #5<<#=>0) t+	0)
 0) ((4/0) +,0) 
u||U\\)	*0)r&   r   c                   &    e Zd ZdZdedef fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  d	e
dz  d
edz  dej                  dz  deej                  ej                  f   dz  dee   dej                  fdZ xZS )AfmoeDecoderLayerz
    AFMoE decoder layer with dual normalization.

    This layer applies self-attention followed by either a dense MLP or MoE block,
    with dual normalization (pre and post) around each component.
    r5   r   c                 P   t         |           |j                  | _        || _        t	        ||      | _        |j                  |   | _        t        |j                  |j                        | _
        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        ||j                  k\  | _        | j                  rt!        |      | _        y t%        |      | _        y )N)r5   r   r   )r3   r4   r;   r   r   	self_attnr   attention_typer*   r   input_layernormpost_attention_layernormpre_mlp_layernormpost_mlp_layernormnum_dense_layersmoe_enabledr   mlpr,   r   s      r'   r4   zAfmoeDecoderLayer.__init__  s    !--"'vK$00;  ,F,>,>FDWDWX(4V5G5GVM`M`(a% ".f.@.@fFYFY!Z".v/A/AvGZGZ"[ %(?(??'DH'DHr&   Nr@   r   position_idsr   	use_cacher   r   r   rd   c                    |}	| j                  |      } | j                  d|||||||d|\  }}
| j                  |      }|	|z   }|}	| j                  |      }| j	                  |      }| j                  |      }|	|z   }|S )N)r@   r   r   r   r   r   r   r%   )r   r   r   r   r   r   )r>   r@   r   r   r   r   r   r   r   residualrQ   s              r'   rW   zAfmoeDecoderLayer.forward  s     ! ,,];)4>> 	
')%)) 3	
 	
q 55mD =0 !..}=///> =0r&   )NNNNNN)r"   r#   r$   rX   r   r   r4   rJ   rY   r   r   boolr   r   r   FloatTensorrW   rZ   r[   s   @r'   r   r      s    ({ (s (4 /304'+!%26HL#||# t+# &&-	#
 # $;# ((4/# #5<<#=>E# +,# 
		#r&   r   c                   d     e Zd ZU dZeed<   dZdgZdgZe	e
dZg dZdZdZdZdZdZ fd	Z xZS )
AfmoePreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r5   modelr   past_key_values)r@   
attentions)r   r   r   r   r   r   normrA   Tc                     t         |   |       t        |t              r*t	        j
                  |j                  j                         yt        |t              r t	        j
                  |j                         yy)zInitialize the weightsN)
r3   _init_weights
isinstancer.   initzeros_r<   weightr   rA   )r>   moduler?   s     r'   r   z"AfmoePreTrainedModel._init_weights^  sR    f%f45KK**+)KK**+ *r&   )r"   r#   r$   rX   r   __annotations__base_model_prefix_no_split_modules_skip_keys_device_placementr   r   _can_record_outputs_keep_in_fp32_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendsupports_gradient_checkpointingr   rZ   r[   s   @r'   r   r   @  sg    
 ,-#4"5*$	 N"&&*#, ,r&   r   c                       e Zd ZdZdef fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
ej                  dz  dedz  dee   deez  fd              Z xZS )
AfmoeModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`AfmoeDecoderLayer`]

    Args:
        config: AfmoeConfig
    r5   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nr   r5   F)r3   r4   pad_token_idpadding_idx
vocab_sizer   	Embeddingr;   embed_tokens
ModuleListr`   num_hidden_layersr   layersr*   r   r   r    
rotary_embgradient_checkpointing	post_initr   s      r'   r4   zAfmoeModel.__init__p  s     !.. ++LL):):F<N<NPTP`P`ammCHIaIaCbcivy1c
 !!3!39L9LM	.f=&+# ds   DN	input_idsr   inputs_embedsr   r   r   r   r   rd   c                    |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        |x}
t              s)| j                  ||||d}t        di |t        di |d}
|}| j                  j                  r|| j                  j                  dz  z  }| j!                  ||      }| j"                  D ]  } ||f|
|j$                     |||||d	|}! | j'                  |      }t)        ||r|
      S d 
      S )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )rf   )r5   input_embedsr   r   r   )full_attentionr   g      ?)r   r   r   r   r   r   )last_hidden_stater   r%   )
ValueErrorr   r5   r   get_seq_lengthrJ   rk   rH   rf   ru   r   dictr
   r   mup_enabledr;   r  r  r   r   r   )r>   r  r   r  r   r   r   r   r   past_seen_tokenscausal_mask_mappingmask_kwargsr@   r   decoder_layers                  r'   rW   zAfmoeModel.forward  s    -t";<YZZ0*$++>O  --i8M!CRC^==?de"\\  =#6#6q#99$++N
 )33A6L ?-F++ -"0"0#2K #5"C{"C%F%U%U#
 & ;;"")T[[-D-Dc-IJM"oom\J![[ 
	M)	2=3O3OP).#-$7	 	M
	 		-0%+/8O
 	
>B
 	
r&   )NNNNNNN)r"   r#   r$   rX   r   r4   r   r   rJ   r   rY   r   r   r   r   r   r   r   rW   rZ   r[   s   @r'   r   r   g  s    {   .2.22604(,26!%D
##d*D
 t+D
 ((4/	D

 &&-D
 D
 ((4/D
 $;D
 +,D
 
'	'D
  D
r&   r   c                       e Zd Zd Zy)AfmoeForCausalLMc                     t         j                  | |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y r0   )
r   r4   r   r   r   r   r:   r;   lm_headr  )r>   r5   s     r'   r4   zAfmoeForCausalLM.__init__  sU    %%dF3'
 ++yy!3!3V5F5FUSr&   N)r"   r#   r$   r4   r%   r&   r'   r  r    s    r&   r  )r  r   r   )<rX   collections.abcr   rJ   r    r   r   cache_utilsr   r   
generationr	   masking_utilsr
   r   modeling_layersr   modeling_outputsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   gpt_oss.modeling_gpt_ossr   llama.modeling_llamar   r   r   r   r   qwen2_moe.modeling_qwen2_moer   configuration_afmoer   
get_loggerr"   loggerr    r*   r,   Moduler.   r   r]   r   r   r   r   r   r  __all__r%   r&   r'   <module>r*     s    $   & . ) R 9 6 F & @ @ / 4  7 , 
		H	%	/ 		= 		{ 	,RYY ,:;@2== ;@|-ryy ->D)^ D)NB2 BJ$,? $,N ]
% ]
 ]
@')= r&   