
    iW                     v   d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ	 ddl
mZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z*  G d de$      Z+ G d dejX                        Z- G d dejX                        Z. G d dejX                        Z/ G d d ejX                        Z0 G d! d"ejX                        Z1 G d# d$ejX                        Z2 G d% d&e      Z3 G d' d(e      Z4e G d) d*e4             Z5 G d+ d,e4e      Z6g d-Z7y).z"Modular components for DBRX model.    )Callable)AnyN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)check_model_inputs   )LlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)load_balancing_loss_func   )
DbrxConfigc                       e Zd Zy)DbrxRotaryEmbeddingN)__name__
__module____qualname__     o/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/dbrx/modular_dbrx.pyr   r   ,   s    r$   r   c                        e Zd ZdZ	 ddedz  f fdZ	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	ej                  dz  d
e
ej                  ej                  f   fdZ xZS )DbrxAttentionzYModular DBRX attention component that can be reused across different model architectures.N	layer_idxc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        |j                  | _	        || _
        |j                  }|j                  | _        |j                  | _        |j                  | _        | j                  | j                   z  | _        | j                  dz  | _        |j&                  | _        d| _        t+        j,                  | j                  | j                  d| j                   z  | j                  z  z   d      | _        t+        j,                  | j                  | j                  d      | _        y )Ng      Tr   Fbias)super__init__configd_modelhidden_sizen_heads	num_headshead_dimmax_seq_lenmax_position_embeddingsr(   attn_config
attn_pdropattention_dropoutclip_qkv
kv_n_headsnum_key_value_headsnum_key_value_groupsscaling
rope_theta	is_causalr   LinearWqkvout_proj)selfr.   r(   kwargsr6   	__class__s        r%   r-   zDbrxAttention.__init__3   s*    	!>>((DNN:'-'9'9$"((!,!7!7#,,#.#9#9 $(NNd6N6N$N!}}d*%00IId..T5M5M1MPTP]P]1]]di
	 		$"2"2D4D4D5Qr$   hidden_statesattention_maskposition_embeddingspast_key_valuescache_positionreturnc                    |j                   d d }g |d| j                  }| j                  |      }	| j                  | j                   nd }
|	j	                  |
| j                        }	|	j                  | j                  | j                  | j                  z  | j                  | j                  z  gd      \  }}}|j                  |      j                  dd      }|j                  |      j                  dd      }|j                  |      j                  dd      }|\  }}t        ||||      \  }}|'|||d}|j                  ||| j                  |      \  }}t        j                  | j                  j                   t"              } || ||||f| j$                  sdn| j&                  | j(                  d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )	N)minmaxr   dimr   )sincosrJ           )dropoutr=   )shaper3   rA   r9   clampsplitr0   r;   view	transposer   updater(   r   get_interfacer.   _attn_implementationr   trainingr8   r=   reshape
contiguousrB   )rC   rF   rG   rH   rI   rJ   rD   input_shapehidden_shape
qkv_statesmin_valquery_states
key_statesvalue_statesrS   rR   cache_kwargsattention_interfaceattn_outputattn_weightss                       r%   forwardzDbrxAttention.forwardO   s    $))#2.88b8$--8YY}-
$(MM$=4==.4%%'t}}%E
1;1A1A  ((4==8((4==8
  2B 2
.j, $((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHmmK0L((r$   NNNNN)r    r!   r"   __doc__intr-   torchTensor
LongTensorr	   tuplerl   __classcell__rE   s   @r%   r'   r'   0   s    c
 !%R :R> /37;(,266)||6) t+6) #--4	6)
 6) ((4/6) 
u||U\\)	*6)r$   r'   c            
            e Zd Z fdZdej
                  dej
                  dej
                  dej
                  dej
                  f
dZ xZS )DbrxExpertGLUc                    t         |           |j                  | _        |j                  | _        |j                  | _        t        j                  t        j                  | j                  | j                  z  | j                              | _	        t        j                  t        j                  | j                  | j                  z  | j                              | _
        t        j                  t        j                  | j                  | j                  z  | j                              | _        |j                  j                  dd      }t        |   | _        y )Nnamesilu)r,   r-   r0   ffn_hidden_sizemoe_num_expertsr   	Parameterrq   emptyw1v1w2
ffn_act_fngetr   activation_fn)rC   r.   act_fn_namerE   s      r%   r-   zDbrxExpertGLU.__init__   s    !--%55%55,,u{{4+?+?$BVBV+VX\XhXhij,,u{{4+?+?$BVBV+VX\XhXhij,,u{{4+?+?$BVBV+VX\XhXhij''++FF;#K0r$   x	expert_w1	expert_v1	expert_w2rK   c                     |j                  |      }|j                  |      }| j                  |      }||z  }|j                  |j                               }|S rm   )matmulr   t)	rC   r   r   r   r   	gate_projup_projintermediate_states	down_projs	            r%   rl   zDbrxExpertGLU.forward   sW     HHY'	((9%&&y1	''1'..y{{}=	r$   r    r!   r"   r-   rq   rr   rl   ru   rv   s   @r%   rx   rx      sK    1*/,,CH<<\a\h\h	r$   rx   c                        e Zd Z fdZdej
                  dej
                  dej
                  dej
                  fdZ xZS )DbrxExpertsc                     t         |           t        |      | _        |j                  | _        |j
                  | _        |j                  | _        y rm   )r,   r-   rx   mlpr0   r|   r}   num_expertsrC   r.   rE   s     r%   r-   zDbrxExperts.__init__   sD     (!--%55!11r$   rF   top_k_indextop_k_weightsrK   c                    |j                   d   }|j                  d| j                        }t        j                  ||j
                  |j                        }t        j                         5  t        j                  j                  j                  || j                        }|j                  ddd      }t        j                  |j                  d      d      j                         }d d d        d| j                  | j                   f}D ]  }	|	d   }	t        j                         5  t        j"                  |	         \  }
}d d d        | j$                  j&                  j)                  |      |	   }| j$                  j*                  j)                  |      |	   }| j$                  j,                  j)                  |      |	   }| j%                  |   |||      }|j)                  d| j                        ||
d f   z  }|j/                  d||       
 |j)                  |d| j                        }|S # 1 sw Y   OxY w# 1 sw Y   xY w)	Nr   rM   )dtypedevice)num_classesr   r   )rM   rP   )rV   r_   r|   rq   
zeros_liker   r   no_gradr   
functionalone_hotr   permutegreatersumnonzeror0   wherer   r   rY   r   r   
index_add_)rC   rF   r   r   
batch_sizenext_statesexpert_mask
expert_hitsplit_expert_shape
expert_idxidx	token_idxr   r   r   statess                   r%   rl   zDbrxExperts.forward   s    #((+
%--b$2F2FG&&}M<O<OXeXlXlm]]_ 	S((--55ktO_O_5`K%--aA6K{8'DaHPPRJ	S
 !$"6"68H8HI$ 		9J#AJ F!&[-D!EYF!!"45jAB!!"45jAB!!"45jABXXmI6BCF[[T%9%9:]9VY[_K_=``F""1i8		9 "&&z2t7K7KL%	S 	SF Fs   ,A=H6)I6I I	r   rv   s   @r%   r   r      sC    2|| \\ ||	
 
r$   r   c                        e Zd Z fdZdej
                  deej
                  ej
                  ej                  f   fdZ xZ	S )
DbrxRouterc                     t         |           |j                  | _        |j                  | _        t        j                  | j                  |j                  d      | _        y NFr*   )	r,   r-   r|   r0   moe_jitter_epsr   r@   r}   layerr   s     r%   r-   zDbrxRouter.__init__   sJ    !11$33YYt//1G1GeT
r$   rF   rK   c                    | j                   rN| j                  B|t        j                  |      j	                  d| j                  z
  d| j                  z         z  }|j                  d|j                  d         }| j                  |      }|S )Ng      ?rM   )r^   r   rq   
empty_likeuniform_rY   rV   r   )rC   rF   router_logitss      r%   rl   zDbrxRouter.forward   s    ==T00<U--m<EEd)))31D1D+D M &**2}/B/B2/FG

=1r$   )
r    r!   r"   r-   rq   rr   rt   rs   rl   ru   rv   s   @r%   r   r      s;    UU\\ eELL%,,X]XhXh<h6i r$   r   c                   ~     e Zd ZdZ fdZd Zdej                  deej                  ej                  f   fdZ	 xZ
S )DbrxFFNz0Modular DBRX MLP/FFN component with MoE support.c                     t         |           t        |j                        | _        t        |j                        | _        |j                  j                  | _        |j                  j                  | _	        y rm   )
r,   r-   r   
ffn_configrouterr   expertsmoe_normalize_expert_weights	moe_top_ktop_k)rC   r.   rD   rE   s      r%   r-   zDbrxFFN.__init__   sY     !2!23"6#4#45,2,=,=,Z,Z)&&00
r$   c                 $   t         j                  j                  j                  |d|j                        }t        j
                  || j                  d      \  }}| j                  &|t        j                  || j                  dd      z  }||fS )Nr   )rQ   r   rM   rP   T)prQ   keepdim)	rq   r   r   softmaxr   topkr   r   norm)rC   r   router_top_valuerouter_indicess       r%   route_tokens_to_expertszDbrxFFN.route_tokens_to_experts   s    ++33MqP]PcPc3d+0::mTZZUW+X(.,,8/%** D$E$E2W[3    //r$   rF   rK   c                 v    | j                  |      }| j                  |      \  }}| j                  |||      }|S rm   )r   r   r   )rC   rF   r   r   r   outputs         r%   rl   zDbrxFFN.forward   s<    M2%)%A%A-%P"{m[-Hr$   )r    r!   r"   ro   r-   r   rq   rr   rt   rl   ru   rv   s   @r%   r   r      s9    :10U\\ eELL%,,<V6W r$   r   c                        e Zd Zddededz  f fdZ	 	 	 ddej                  dej                  dej                  dz  de	dz  d	ej                  dz  d
e
deej                  ej                  f   fdZ xZS )DbrxNormAttentionNormNr.   r(   c                    t         |           || _        |j                  | _        t	        j
                  |j                  d      | _        t        ||      | _	        t	        j
                  |j                  d      | _
        y )NFr*   r.   r(   )r,   r-   r(   resid_pdropr   	LayerNormr/   norm_1r'   attnnorm_2rC   r.   r(   rE   s      r%   r-   zDbrxNormAttentionNorm.__init__   sc    "!--ll6>>>!
	 ll6>>>r$   rF   rH   rG   rI   rJ   rD   rK   c           	      f   |}| j                  |      j                  |j                        } | j                  d|||||d|\  }}t        j
                  j                  || j                  | j                        }||z   }|}| j                  |      j                  |j                        }||fS N)rF   rG   rH   rI   rJ   )r   r^   r#   )
r   tor   r   r   r   rU   r   r^   r   )	rC   rF   rH   rG   rI   rJ   rD   residual_states_s	            r%   rl   zDbrxNormAttentionNorm.forward   s     (M255m6I6IJ$499 
') 3+)
 
q --mt?O?OZ^ZgZg-h%7'M255m6I6IJ--r$   rm   )NNN)r    r!   r"   r   rp   r-   rq   rr   rs   r	   r   rt   rl   ru   rv   s   @r%   r   r      s    	?z 	?cDj 	? /3(,26.||. #--. t+	.
 . ((4/. . 
u||U\\)	*.r$   r   c                        e Zd Zdedef fdZ	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	ej                  dz  d
e
fdZ xZS )	DbrxBlockr.   r(   c                     t         |           |j                  | _        |j                  | _        || _        t        ||      | _        t        |      | _	        y )Nr   r.   )
r,   r-   r/   r0   r   r(   r   norm_attn_normr   ffnr   s      r%   r-   zDbrxBlock.__init__  sP    !>>!--"3
 &)r$   NrF   rG   rH   rI   rJ   rD   c           	           | j                   d|||||d|\  }}| j                  |      }t        j                  j	                  || j
                  | j                        }||z   }|S r   )r   r   r   r   rU   r   r^   )rC   rF   rG   rH   rI   rJ   rD   resid_statess           r%   rl   zDbrxBlock.forward*  s     ':d&9&9 '
') 3+)'
 '
#m /--mt?O?OZ^ZgZg-h$}4r$   rn   )r    r!   r"   r   rp   r-   rq   rr   rs   r	   r   rl   ru   rv   s   @r%   r   r     s    	*z 	*c 	* /37;(,26|| t+ #--4	
  ((4/ r$   r   c                        e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZ ej$                         dej(                  f fd	       Z xZS )
DbrxPreTrainedModelr.   transformerTr   rI   F)rF   
attentionsmodulec                 >   t         |   |       | j                  j                  }t	        |t
              rgt        j                  |j                  d|       t        j                  |j                  d|       t        j                  |j                  d|       y y )NrT   )meanstd)r,   _init_weightsr.   initializer_range
isinstancerx   initnormal_r   r   r   )rC   r   r   rE   s      r%   r   z!DbrxPreTrainedModel._init_weightsR  sj    f%kk++fm,LL#6LL#6LL#6 -r$   )r    r!   r"   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flex_attn_supports_attention_backend_supports_flash_attn_supports_sdpa_can_compile_fullgraphr   r'   _can_record_outputsrq   r   r   Moduler   ru   rv   s   @r%   r   r   B  sx    %&*#$#4"5"&N""#
 U]]_7BII 7 7r$   r   c                   L    e Zd ZdZdef fdZdej                  fdZdej                  fdZ	e
e	 	 	 	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dedz  dej                   dz  dedz  dej                  dz  dee   defd              Z xZS )	DbrxModela  Transformer decoder consisting of *config.num_hidden_layers*. Each layer is a [`DbrxBlock`] layer.

    Args:
        config ([`DbrxConfig`]): Model configuration class with all parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
    r.   c           	      ,   t         |   |       |j                  | _        |j                  | _        |j
                  | _        t        |      | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j"                  |j                  d      | _        d| _        | j)                          y c c}w r   )r,   r-   pad_token_idpadding_idx
vocab_size	emb_pdropr   
rotary_embr   	Embeddingr/   wte
ModuleListrangen_layersr   blocksr   norm_fgradient_checkpointing	post_initr   s      r%   r-   zDbrxModel.__init__f  s     !.. ++))-f5<< 1 16>>4CSCSTmmSXY_YhYhSi$jiYvy%A$jkll6>>>&+# 	 %ks   4DrK   c                     | j                   S rm   r
  rC   s    r%   get_input_embeddingszDbrxModel.get_input_embeddingst  s    xxr$   valuec                     || _         y rm   r  rC   r  s     r%   set_input_embeddingszDbrxModel.set_input_embeddingsw  s	    r$   N	input_idsrG   position_idsrI   inputs_embeds	use_cacherJ   rD   c                 D   |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }
|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f||
||||d|} | j                  |      }t        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r   )r.   input_embedsrG   rJ   rI   r  )rH   rG   r  rI   r  rJ   )last_hidden_staterI   )
ValueErrorr
   r.   r
  get_seq_lengthrq   arangerV   r   	unsqueezer   r  r  num_hidden_layersr  r   )rC   r  rG   r  rI   r  r  rJ   rD   past_seen_tokenscausal_maskrF   rH   decoder_layers                 r%   rl   zDbrxModel.forwardz  s^    -t";<YZZ0*$++>O  HHY/M!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;&))+%
 & #oom\J![[)H4;;+H+HI 
	M)	$7*) /#-	 	M
	 M2%++
 	
r$   )NNNNNNN)r    r!   r"   ro   r   r-   r   r	  r  r  r   r   rq   rs   rr   r	   FloatTensorboolr   r   r   rl   ru   rv   s   @r%   r  r  \  s   z bll ",,   .2.204(,26!%26;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
 $;;
 ((4/;
 +,;
 
 ;
  ;
r$   r  c                       e Zd ZddiZddiZddgdgfiZdef fdZd	ej                  fd
Z
dej                  fdZd	ej                  fdZdej                  fdZdefdZd	efdZee	 	 	 	 	 	 	 	 	 	 d dej*                  dz  dej,                  dz  dej*                  dz  dedz  dej0                  dz  dej*                  dz  dedz  dedz  dej*                  dz  deej,                  z  dee   d	efd              Z xZS )!DbrxForCausalLMzlm_head.weightztransformer.wte.weightlm_headcolwise_gather_outputrF   logitsr.   c                    t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        |j                  j                  | _        |j                  j                  | _        |j                  j                  | _        | j!                          y r   )r,   r-   r  r   r  r   r@   r0   r-  r   moe_loss_weightrouter_aux_loss_coefr}   r   r   num_experts_per_tokr  r   s     r%   r-   zDbrxForCausalLM.__init__  s     $V, ++yy!3!3V5F5FUS$*$5$5$E$E!!,,<<#)#4#4#>#> r$   rK   c                 6    | j                   j                         S rm   )r   r  r  s    r%   r  z$DbrxForCausalLM.get_input_embeddings  s    4466r$   r  c                 :    | j                   j                  |       y rm   )r   r  r  s     r%   r  z$DbrxForCausalLM.set_input_embeddings  s    --e4r$   c                     | j                   S rm   r-  r  s    r%   get_output_embeddingsz%DbrxForCausalLM.get_output_embeddings  s    ||r$   new_embeddingsc                     || _         y rm   r7  )rC   r9  s     r%   set_output_embeddingsz%DbrxForCausalLM.set_output_embeddings  s	    %r$   decoderc                     || _         y rm   r   )rC   r<  s     r%   set_decoderzDbrxForCausalLM.set_decoder  s
    "r$   c                     | j                   S rm   r>  r  s    r%   get_decoderzDbrxForCausalLM.get_decoder  s    r$   Nr  rG   r  rI   r  labelsr  output_router_logitsrJ   logits_to_keeprD   c                 l   ||n| j                   j                  } | j                  d||||||||	d|}|j                  }t	        |
t
              rt        |
 d      n|
}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |}d}|rYt        |j                  | j                  | j                  |      }|+|| j                  |j                  |j                         z  z  }t#        ||||j$                  |j&                  |j(                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, DbrxForCausalLM

        >> model = DbrxForCausalLM.from_pretrained("transformers-community/dbrx-instruct")
        >> tokenizer = AutoTokenizer.from_pretrained("transformers-community/dbrx-instruct")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```
        N)r  rG   r  rI   r  r  rC  rJ   )lossaux_lossr/  rI   rF   r   r   r#   )r.   rC  r   r   r   rp   slicer-  loss_functionr  r   r   r   r3  r2  r   r   r   rI   rF   r   )rC   r  rG   r  rI   r  rB  r  rC  rJ   rD  rD   outputsrF   slice_indicesr/  rF  rG  s                     r%   rl   zDbrxForCausalLM.forward  sZ   P %9$D $++JjJj 	
 +;$*:*: 
+
)%+'!5)
+
 
+
  118B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
r$   )
NNNNNNNNNr   ) r    r!   r"   _tied_weights_keys_tp_plan_pp_planr   r-   r   r	  r  r  r@   r8  r;  r  r?  rA  r   r   rq   rs   rr   r	   r)  r*  rp   r   r   r   rl   ru   rv   s   @r%   r,  r,    s   *,DE23H_-z:;Hz 7bll 75",, 5ryy &BII &#9 # Y    .2.204(,26*.!%,026-.R
##d*R
 t+R
 &&-	R

 R
 ((4/R
   4'R
 $;R
 #TkR
 ((4/R
 ell*R
 +,R
 
#R
  R
r$   r,  )r,  r  r   )8ro   collections.abcr   typingr   rq   r    r   r   activationsr   cache_utilsr	   r
   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   llama.modeling_llamar   r   r   mixtral.modeling_mixtralr   configuration_dbrxr   r   r   r'   rx   r   r   r   r   r   r   r  r,  __all__r#   r$   r%   <module>r`     s   ) $    & ! . ) / R F & I I / 
 @ *	. 	U)BII U)pBII 2$")) $N "bii 6'.BII '.T!* !H7/ 74 Z
# Z
 Z
zu
)? u
p Br$   