
    i6                         d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,  ejZ                  e.      Z/ G d de"      Z0 G d de      Z1 G d de      Z2 G d de%      Z3 G d de      Z4 G d de'      Z5 G d  d!e#      Z6 G d" d#e(      Z7 G d$ d%e&      Z8 G d& d'e$      Z9g d(Z:y))    )CallableN)nn   )initialization)Cache)create_causal_mask)BaseModelOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)check_model_inputs   )BambaConfig)
BambaMixerBambaRMSNormGated HybridMambaAttentionDynamicCache)Gemma2RotaryEmbedding)
GraniteFlashAttentionKwargsGraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedMoEGraniteMoeSharedPreTrainedModelapply_rotary_pos_embeager_attention_forward   )GraniteMoeHybridConfigc                       e Zd Zdedef fdZ	 	 	 ddej                  dej                  dz  dedz  dej                  dz  d	e
ej                  ej                  f   dz  d
ee   de
ej                  ej                  f   fdZ xZS )GraniteMoeHybridAttentionconfig	layer_idxc                 &    t         |   ||       y Nsuper__init__selfr%   r&   	__class__s      /home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr+   z"GraniteMoeHybridAttention.__init__2   s    +    Nhidden_statesattention_maskpast_key_valuescache_positionposition_embeddingskwargsreturnc                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }||\  }}t        |	|
||      \  }	}
|%d|i}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t              } || |	|
||f| j                  sdn| j                   | j"                  d|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )Nr!   r   r4   g        )dropoutscaling)shapehead_dimq_projview	transposek_projv_projr   updater&   r   get_interfacer%   _attn_implementationr    trainingattention_dropoutr;   reshape
contiguouso_proj)r-   r1   r2   r3   r4   r5   r6   input_shapehidden_shapequery_states
key_statesvalue_statescossincache_kwargsattention_interfaceattn_outputattn_weightss                     r/   forwardz!GraniteMoeHybridAttention.forward5   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST**HC';L*VY[^'_$L*&,n=L'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r0   )NNN)__name__
__module____qualname__r"   intr+   torchTensorr   
LongTensortupler   r   rV   __classcell__r.   s   @r/   r$   r$   1   s    ,5 ,# , )-26HL))||)) t+)) 	))
 ((4/)) #5<<#=>E)) +,)) 
u||U\\)	*))r0   r$   c                   (     e Zd Zdedef fdZ xZS )GraniteMoeHybridMambaLayerr%   r&   c                 8    t         |   t        |      |       y r(   )r*   r+   r   r,   s      r/   r+   z#GraniteMoeHybridMambaLayer.__init__b   s    V,i8r0   )rW   rX   rY   r"   rZ   r+   r_   r`   s   @r/   rb   rb   a   s    95 9# 9 9r0   rb   c                         e Zd Zd fd	Z xZS )GraniteMoeHybridRMSNormGatedc                 &    t         |   ||       y r(   r)   )r-   hidden_sizeepsr.   s      r/   r+   z%GraniteMoeHybridRMSNormGated.__init__g   s    c*r0   )gư>)rW   rX   rY   r+   r_   r`   s   @r/   re   re   f   s    + +r0   re   c                   $     e Zd Zdef fdZ xZS )GraniteMoeHybridMLPr%   c                 $    t         |   |       y r(   r)   r-   r%   r.   s     r/   r+   zGraniteMoeHybridMLP.__init__l   s     r0   )rW   rX   rY   r"   r+   r_   r`   s   @r/   rj   rj   k   s    !5 ! !r0   rj   c                       e Zd Zy)GraniteMoeHybridRotaryEmbeddingNrW   rX   rY    r0   r/   rn   rn   p       r0   rn   c                       e Zd Zy)GraniteMoeHybridMoENro   rp   r0   r/   rs   rs   t   rq   r0   rs   c                   N    e Zd Zdedef fdZe	 	 	 	 	 ddej                  dej                  dz  de	dz  de
dz  d	ej                  dz  d
eej                  ej                  f   dz  dee   deej                  eej                  ej                  f   dz  f   fd       Z xZS )GraniteMoeHybridDecoderLayerr%   r&   c                 `   t         |   ||       t        |      | _        d | _        d | _        |j                  |   dk(  rt        ||      | _        nt        ||      | _        |j                  |   | _	        |j                  dkD  rt        |      nd | _        t        |dd      dkD  | _        y )Nmambar   num_local_experts)r*   r+   rj   
shared_mlp	self_attnrw   layers_block_typerb   r$   
layer_typerx   rs   block_sparse_moegetattrhas_expertsr,   s      r/   r+   z%GraniteMoeHybridDecoderLayer.__init__y   s    +-f5
##I.'93FIFDJ6vyIDN 229= @F?W?WZ[?[ 3F ;ae #6+>BQFr0   Nr1   r2   r3   	use_cacher4   r5   r6   r7   c           
         |}| j                  |      }| j                   | j                  d||||d|}n | j                  d||||||d|\  }}	||| j                  z  z   }|}| j	                  |      }| j
                  r&| j                  |      }
|
| j                  |      z   }n| j                  |      }||| j                  z  z   }|S )N)r1   r4   cache_paramsr2   )r1   r2   r3   r   r4   r5   rp   )input_layernormrw   rz   residual_multiplierpost_attention_layernormr   r}   ry   )r-   r1   r2   r3   r   r4   r5   r6   residual_moe_hidden_statess              r/   rV   z$GraniteMoeHybridDecoderLayer.forward   s    !,,];::!&DJJ +-,-	
 M  .t~~  +- /#-$7   M1 !=43K3K#KK 55mD $ 5 5m D-0NNM OOM:M =43K3K#KKr0   )NNFNN)rW   rX   rY   r"   rZ   r+   r   r[   r\   r   boolr]   r^   r   r   FloatTensorrV   r_   r`   s   @r/   ru   ru   x   s    G5 G# G&  /3(,!&26HL+||+ t++ 	+
 $;+ ((4/+ #5<<#=>E+ 45+ 
u  %(9(95;L;L(L"MPT"TT	U+ +r0   ru   c                   \     e Zd ZU eed<   dgZdZ ej                          fd       Z	 xZ
S )GraniteMoeHybridPreTrainedModelr%   ru   Tc           
         t         |   |       t        |t              rt	        j
                  |j                         t	        j                  |j                  t        j                  t        j                  d|j                  dz                      t	        j
                  |j                         y t        |t              r t	        j
                  |j                         y y )Nr!   )r*   _init_weights
isinstancerb   initones_dt_biascopy_A_logr[   logarange	num_headsDre   weight)r-   moduler.   s     r/   r   z-GraniteMoeHybridPreTrainedModel._init_weights   s    f%f89JJv~~&JJv||UYYu||Av?O?ORS?S/T%UVJJvxx  <=JJv}}% >r0   )rW   rX   rY   r"   __annotations___no_split_modules_is_statefulr[   no_gradr   r_   r`   s   @r/   r   r      s1    ""78LU]]_& &r0   r   c                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  dej                  dz  d	edz  d
ej                  dz  dee   deez  fd              Zd Z xZS )GraniteMoeHybridModelr%   c           	      (   t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        |j                  | _        |j                  dk(  rt        |      | _        y d | _        y c c}w )Nrope)r*   r+   r   
ModuleListrangenum_hidden_layersru   layersembedding_multiplierposition_embedding_typern   
rotary_embr,   s      r/   r+   zGraniteMoeHybridModel.__init__   sz     mmNSTZTlTlNmn)&)<n
 %+$?$?!EKEcEcgmEm9&Asw os   BN	input_idsr2   position_idsr3   inputs_embedsr   r4   r6   r7   c           
         |d u |d uz  rt        d      || j                  |      }|| j                  z  }|F||j                         nd}	t	        j
                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  ||||      }
| j                  ||      }|}d }| j                  | j                  ||      }| j                  D ]$  }|j                  dk(  r|n|
} ||f|||||d|}& | j                  |      }|r|j                   sd|_        t#        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r!   devicerw   )r2   r3   r   r4   r5   T)last_hidden_stater3   )
ValueErrorembed_tokensr   get_seq_lengthr[   r   r<   r   	unsqueezer   r%   _update_mamba_maskr   r   r|   normhas_previous_stater
   )r-   r   r2   r   r3   r   r   r4   r6   past_seen_tokenscausal_mask
mamba_maskr1   r5   decoder_layer
layer_masks                   r/   rV   zGraniteMoeHybridModel.forward   s    -t";<YZZ  --i8M%(A(AA!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(KK
 ,,^^L
 &"??&"&//-"N![[ 	M'4'?'?7'JP[J)) /#-$7 M		 		-0?#E#E15O.%++
 	
r0   c                 R    |}|d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr!   )r[   all)r-   r2   r4   r   s       r/   r   z(GraniteMoeHybridModel._update_mamba_mask  s7     $
!q ^%?EIIn`aNaDbJr0   )NNNNNNN)rW   rX   rY   r"   r+   r   r   r[   r]   r\   r   r   r   r   r   r^   r	   rV   r   r_   r`   s   @r/   r   r      s    x5 x  .2.204(,26!%26@
##d*@
 t+@
 &&-	@

 @
 ((4/@
 $;@
 ((4/@
 45@
 
(	(@
  @
D	r0   r   c                   P     e Zd ZddiZdef fdZ fdZ	 	 	 	 	 	 	 d fd	Z xZS )GraniteMoeHybridForCausalLMzlm_head.weightzmodel.embed_tokens.weightr%   c                 d    t         |   |       t        |      | _        | j	                          y r(   )r*   r+   r   model	post_initrl   s     r/   r+   z$GraniteMoeHybridForCausalLM.__init__'  s&     *62
r0   c                 "    t        |   di |S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeHybridForCausalLM

        >>> model = GraniteMoeHybridForCausalLM.from_pretrained("ibm-granite/granite-4.0-h-tiny")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-4.0-h-tiny")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```rp   )r*   rV   )r-   super_kwargsr.   s     r/   rV   z#GraniteMoeHybridForCausalLM.forward-  s    . w...r0   c	                     |<|r:t        | j                  |j                  d   | j                  | j                        }t        |   |f|||||||d|	}
|
S )Nr   r   )r3   r2   r   r4   r   r   is_first_iteration)r   r%   r<   dtyper   r*   prepare_inputs_for_generation)r-   r   r3   r2   r   r4   r   r   r   r6   model_inputsr.   s              r/   r   z9GraniteMoeHybridForCausalLM.prepare_inputs_for_generationF  su     "y>Y__Q/DKKO w<

+)')%1

 

 r0   )NNNNNTF)	rW   rX   rY   _tied_weights_keysr"   r+   rV   r   r_   r`   s   @r/   r   r   $  sB    *,GH5 /8   r0   r   )r   r   r   );collections.abcr   r[   r    r   r   cache_utilsr   masking_utilsr   modeling_outputsr	   r
   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   bamba.configuration_bambar   bamba.modeling_bambar   r   r   gemma2.modeling_gemma2r   *granitemoeshared.modeling_granitemoesharedr   r   r   r   r   r   r   r   r   r    configuration_granitemoehybridr"   
get_loggerrW   loggerr$   rb   re   rj   rn   rs   ru   r   r   r   __all__rp   r0   r/   <module>r      s    %   &   / O 5 & @ @ / 3 b b :   C 
		H	%-) 9 -)`9 9
+#4 +
!- !
	&; 		- 	@#? @F&&E & V1 VrA"= AH fr0   