
    iZ                        d dl mZ d dlmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)  e$jT                  e+      Z, ed       G d dejZ                               Z. G d d      Z/d Z0 ed      dCd       Z1dejd                  de3dejd                  fdZ4	 dDdejZ                  d ejd                  d!ejd                  d"ejd                  d#ejd                  dz  d$e5d%e5d&ee!   fd'Z6 ee1       G d( d)ejZ                               Z7 G d* d+ejZ                        Z8 G d, d-ejZ                        Z9e G d. d/ejZ                               Z: G d0 d1ejZ                        Z; G d2 d3e      Z< G d4 d5e      Z= G d6 d7e      Z>e<e=d8Z?e" G d9 d:e>             Z@	 	 	 dEd;ejd                  eAejd                     z  dz  d<e3dz  d#ejd                  dz  dejd                  e3z  fd=ZBe" G d> d?e>e             ZC G d@ dAee>      ZDg dBZEy)F    )Callable)AnyN)nn   )initialization)ACT2FN)GenerationMixin)lazy_load_kerneluse_experts_implementationuse_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)OutputRecordercheck_model_inputs   )JambaConfigRMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )JambaRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        JambaRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      r/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/jamba/modeling_jamba.pyr$   zJambaRMSNorm.__init__8   s1     	ll5::k#:; #    c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor&   float32powmeanrsqrtr)   r(   )r*   hidden_statesinput_dtypevariances       r.   forwardzJambaRMSNorm.forward@   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r/   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler(   shaper)   r*   s    r.   
extra_reprzJambaRMSNorm.extra_reprG   s*    ))*+6$2G2G1HIIr/   )gư>)__name__
__module____qualname__r$   r=   rB   __classcell__r-   s   @r.   r!   r!   6   s    $;Jr/   r!   c                   4   e Zd ZdZdZej                  dfdZd Zd Z		 ddej                  dej                  d	ed
eeef   dz  deej                  ej                  f   f
dZdej"                  fdZdej                  d	edeeef   fdZdd	edz  defdZy) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNc           
         || _         |j                  | _        d| _        |j                  |j                  z  }|j
                  }|j                  }g | _        g | _        g | _	        t        |j                        D ]  }| j                  |   dk(  r]| xj                  t        j                  |||||      gz  c_        | xj                  t        j                  |||||      gz  c_        r| xj                  t        j                  g g|z  |      gz  c_        | xj                  t        j                  g g|z  |      gz  c_        | j                  j                  |        t        |j                        D 	cg c]  }	t        j                  g g|z  |       c}	| _        t        |j                        D 	cg c]  }	t        j                  g g|z  |       c}	| _        y c c}	w c c}	w )NFmambadevicer4   rM   )r4   layers_block_typehas_previous_statemamba_expandr+   mamba_d_statemamba_d_convconv_states
ssm_statestransformer_layersrangenum_hidden_layersr&   zerostensorappend	key_cachevalue_cache)
r*   config
batch_sizer4   rM   intermediate_sizessm_state_sizeconv_kernel_sizei_s
             r.   r$   z)HybridMambaAttentionDynamicCache.__init__[   s   
!'!9!9"'"//&2D2DD--!.."$v//0 	2A%%a(G3  KK
,=?OX^fkl%   KK
,=~V\dij$    U\\2$2CF%S$TT ELL"
1B6$R#SS''..q1	2 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts   ?"G+ "G0c                 ,    t        | j                        S N)lenr\   rA   s    r.   __len__z(HybridMambaAttentionDynamicCache.__len__u   s    4>>""r/   c                 >    | j                   |   | j                  |   fS rf   )r\   r]   r*   	layer_idxs     r.   __getitem__z,HybridMambaAttentionDynamicCache.__getitem__x   s!    ~~i($*:*:9*EEEr/   
key_statesvalue_statesrk   cache_kwargsreturnc                    | j                   |   j                  d   dk(  r|| j                   |<   || j                  |<   nft        j                  | j                   |   |gd      | j                   |<   t        j                  | j                  |   |gd      | j                  |<   | j                   |   | j                  |   fS )Nr2   r   r1   dim)r\   r@   r]   r&   cat)r*   rm   rn   rk   ro   s        r.   updatez'HybridMambaAttentionDynamicCache.update{   s     >>)$**2.!3(2DNN9%*6DY'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEr/   beam_idxc                    | j                         dkD  rvt        t        | j                              D ]S  }| j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   V yy)zDReorders the cache for beam search, given the selected beam indices.r   N)
get_seq_lengthrW   rg   r\   rM   index_selectr5   r]   rT   rU   )r*   rv   rk   rM   s       r.   reorder_cachez.HybridMambaAttentionDynamicCache.reorder_cache   s[    1$"3t~~#67 	m		299,0NN9,E,R,RSTV^VaVabhVi,jy))))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +)))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +3::-1__Y-G-T-TUVX`XcXcdjXk-l	*	m %r/   cache_positionc                 T    d}|j                   d   }| j                  |      |z   }||fS )zDReturn the length and offset of the cache, used to generate the maskr   )r@   rx   )r*   r{   rk   	kv_offsetquery_length	kv_lengths         r.   get_mask_sizesz/HybridMambaAttentionDynamicCache.get_mask_sizes   s7    	%++A.''	2\A	)##r/   c                     || j                   vr| j                   d   n|}t        | j                        |k  s| j                  |   j                  d   dk(  ry| j                  |   j                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   r2   )rV   rg   r\   r@   rj   s     r.   rx   z/HybridMambaAttentionDynamicCache.get_seq_length   sn     3<4CZCZ2ZD++A.`i	t~~)+t~~i/H/N/Nr/RVW/W~~i(..r22r/   rf   )r   )rC   rD   rE   __doc__is_compileabler&   float16r$   rh   rl   Tensorintdictstrr   r?   ru   
LongTensorrz   r   rx    r/   r.   rI   rI   K   s     N16t u4#F /3FLLF llF 	F
 38nt+F 
u||U\\)	*F"me&6&6 m$U\\ $c $eTWY\T\o $3d
 33 3r/   rI   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr2   r1   rr   )r@   r&   rt   )xx1x2s      r.   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r/   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkcossinunsqueeze_dimq_embedk_embeds          r.   apply_rotary_pos_embr      sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr/   r:   n_reprp   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r@   expandreshape)r:   r   batchnum_key_value_headsslenhead_dims         r.   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr/   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr1   r   r   r2   rs   r4   )ptrainingr   )r   num_key_value_groupsr&   matmul	transposer@   r   
functionalsoftmaxr6   r5   r4   r   r   
contiguous)r   r   r   r   r   r   r   r   rm   rn   attn_weightscausal_maskattn_outputs                r.   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r/   c                        e Zd ZdZdedef fdZ	 	 	 ddej                  dej                  dz  de	dz  d	ej                  dz  d
ee   deej                  ej                  dz  f   fdZ xZS )JambaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr^   rk   c                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j
                  d      | _        y )Nr   g      TFbias)r#   r$   r^   rk   getattrr+   num_attention_headsr   r   r   r   attention_dropout	is_causalr   Linearq_projk_projv_projo_proj)r*   r^   rk   r-   s      r.   r$   zJambaAttention.__init__   s,   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii : :T]] JFL^L^ejkr/   Nr:   r   past_key_valuesr{   r   rp   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
|#|j                  |	|
| j                  d|i      \  }	}
t        j                  | j                  j                  t              } || ||	|
|f| j                  sdn| j                  | j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )Nr2   r   r1   r{           )r   r   )r@   r   r   viewr   r   r   ru   rk   r   get_interfacer^   _attn_implementationr   r   r   r   r   r   r   )r*   r:   r   r   r{   r   input_shapehidden_shapequery_statesrm   rn   attention_interfacer   r   s                 r.   r=   zJambaAttention.forward  s|    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&'6'='=L$..;K^:\($J )@(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r/   NNN)rC   rD   rE   r   r   r   r$   r&   r   rI   r   r   r   r?   r=   rF   rG   s   @r.   r   r      s    Gl{ ls l" /3CG26%)||%) t+%) :D@	%)
 ((4/%) +,%) 
u||U\\D00	1%)r/   r   c                        e Zd ZdZdef fdZ	 	 ddej                  dedz  dej                  dz  fdZ
ddedz  dej                  dz  fd	Z	 	 ddedz  dej                  dz  fd
Z xZS )JambaMambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    r^   c           	         t         |           || _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  |j                  z  | _
        |j                  | _        |j                  | _        |j                  | _        t#        j$                  | j                  | j                  | j                  | j                  | j                  | j                  dz
        | _        |j(                  | _        t,        |j(                     | _        |j0                  | _        t#        j4                  | j                  | j                  dz  | j                         | _        t#        j4                  | j                  | j                  | j                  dz  z   d      | _        t#        j4                  | j                  | j                  d      | _        t=        j>                  d| j                  dz         d d d f   }|jA                  | j                  d      jC                         }t#        jD                  t=        jF                  |            | _$        t#        jD                  t=        jJ                  | j                              | _&        t#        j4                  | j                  | j                  | j                         | _'        tQ        | j                  |jR                        | _*        tQ        | j                  |jR                        | _+        tQ        | j                  |jR                        | _,        t[        d	      }t]        |d
d       a/t]        |dd       a0t[        d      }t]        |dd       a1t]        |dd       a2t]        |dd       a3ti        tb        tf        t`        t^        td        f      a5tj        stl        jo                  d       y y )Nr   )in_channelsout_channelsr   kernel_sizegroupspaddingr1   r   FTr2   r,   zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmselective_state_updatemamba_inner_fnselective_scan_fnap  The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config)8r#   r$   r^   rk   r+   rR   ra   rS   rb   rQ   r`   mamba_dt_ranktime_step_rankmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1d
hidden_act
activationr   actuse_mamba_kernelsuse_fast_kernelsr   in_projx_projdt_projr&   aranger   r   r%   logA_logr'   Dout_projr!   rms_norm_epsdt_layernormb_layernormc_layernormr
   r   r   r   r   r   r   allis_fast_path_availableloggerwarning_once)r*   r^   rk   Acausal_conv1d	mamba_ssmr-   s         r.   r$   zJambaMambaMixer.__init__3  s   "!--$22 & 3 3!'!4!4v7I7I!I$22#33..ii..//##--))))A-
 !++&++, & 8 8 yy!1!143I3IA3MTXTaTabii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!34T1W=HHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQUQ^Q^_()<)<&BUBUV'(;(;ATATU'(;(;ATATU )9&}6LdS"=2DdK %[1	!(4Ld!S ,<dC#I/BDI "%#%68HJ^`no"
 &^ &r/   Nr:   cache_paramsr   c                 4
   |j                   \  }}}|d uxrm |j                  xr_ |dk(  xrX |j                  | j                     j                   d   |j                  | j                     j                   d   cxk(  xr |k(  nc }| j                  |      j                  dd      }|j                  dd      \  }}	|||j                  d      z  }| j                  j                  j                  | j                  j                  j                  d      | j                  j                  j                  d            }
|ret        |j                  d      |j                  | j                     |
| j                  j                  | j                         }|j                  d      }n|dt"        j$                  j'                  || j(                  |j                   d   z
  df      }|j                  | j                     j+                  |       t-        ||
| j                  j                  | j                         }|||j                  d      z  }| j/                  |j                  dd            }t1        j2                  || j4                  | j6                  | j6                  gd      \  }}}| j9                  |      }| j;                  |      }| j=                  |      }| j>                  j                  j@                  }t1        jB                         5  t1        jD                  | j>                  j                  j@                        | j>                  j                  _         d d d        | j?                  |      j                  dd      }t1        jB                         5  || j>                  j                  _         d d d        t1        jF                  | jH                  jK                                }||jK                         nd }|r]tM        |j                  | j                     |d   |d   ||d d df   |d d df   | jN                  |	d   |d	
      j                  d      }n|tQ        ||||j                  dd      |j                  dd      | jN                  jK                         |	|dd

      \  }}|*|(|j                  | j                     j+                  |       | jS                  |j                  dd            }|S # 1 sw Y   xY w# 1 sw Y   WxY w)Nr   r   r1   rr   r2   )r   ).r   T)dt_softplus)delta_softplusreturn_last_state)*r@   rP   rT   rk   rU   r   r   chunkr   r   r(   r   sizer   squeezer   r   r   r   padrb   copy_r   r   r&   splitr   ra   r   r   r   r   datano_grad
zeros_likeexpr   floatr   r   r   r   )r*   r:   r  r   r_   seq_lenrd   use_precomputed_statesprojected_statesgateconv_weightsrT   ssm_parameters	time_stepBCtime_proj_biasdiscrete_time_stepr   scan_outputs	ssm_statecontextualized_statess                         r.   cuda_kernels_forwardz$JambaMambaMixer.cuda_kernels_forwardw  s    "/!4!4
GQ$ //1 ((8>>qA&&t~~6<<Q?	 	  <<6@@AF /44QA4>t%)N,D,DQ,GGM {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)((8  M *33B7M' mm//@U@UXeXkXklnXo@oqr?st((8>>{K,]L$++JZJZgkgvgvwM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a %%i0	QQ **//]]_ 	N%*%5%5dll6G6G6L6L%MDLL"	N!\\)4>>q!D]]_ 	4%3DLL"	4 YYtzz'')**3A3M--/SW!1''7f%"6*!Q$!Q$V  im  '8"Aq!Aq!#"&'#L) $)A''7==iH !%l.D.DQ.J K$$S	N 	N	4 	4s   AT T T
Tc           	      n   |j                   \  }}}|j                  }| j                  |      j                  dd      }|j	                  dd      \  }	}
||	|j                  d      z  }	t        |t              }|r8|j                  | j                     j                   d   |k(  r| j                  r(|j                  | j                     j                         }n|j                  | j                     }|j                  |	j                        }|j                  r|dk(  r|j                  | j                     j                   d   |k(  r|j                  | j                     }t!        j"                  |dd      }|	d d d d df   |d d d d df<   ||j                  | j                  <   t!        j$                  || j&                  j(                  d d dd d f   z  d      }	| j*                  r|	| j&                  j,                  z  }	| j/                  |	      j                  |      j                  d      }	nt0        j2                  j5                  |	| j6                  |	j                   d   z
  df      }||j                  | j                  <   | j/                  | j'                  |	      dd |f         }	n`t!        j8                  || j:                  | j<                  f|	j                  |      }| j/                  | j'                  |	      dd |f         }	||	|j                  d      z  }	| j?                  |	j                  dd            }t!        j@                  || jB                  | j<                  | j<                  gd      \  }}}| jE                  |      }| jG                  |      }| jI                  |      }| jK                  |      }t0        j2                  jM                  |      j                  dd      }t!        jN                  | jP                  jS                                }t!        jN                  |d d d d d d f   |d d d d d d d f   z        }|d d d d d d d f   |d d d d d d d f   jS                         z  }||	d d d d d d d f   jS                         z  }g }tU        |      D ]}  }|d d d d |d d f   |z  |d d d d |d d f   z   }t!        jV                  |j                  |      |d d |d d f   j                  d            }|jY                  |d d d d df           t!        jZ                  |d      }||	| j\                  d d d d f   z  z   }|| j/                  |
      z  }|r||j                  | j                  <   | j_                  |j                  dd            }|S )	Nr   r1   rr   r   r2   )shiftsdims.rL   )0r@   r4   r   r   r  r   
isinstancerI   rU   rk   r   cloner5   rM   rP   rT   r&   rollsumr   r(   r   r   r   r   r   r
  rb   rY   r`   ra   r   r  r   r   r   r   r   softplusr  r   r  rW   r   r[   stackr   r   )r*   input_statesr  r   r_   r  rd   r4   r  r:   r  	use_cacher  
conv_stater  r  r  r  r  r   
discrete_A
discrete_BdeltaB_ur  rc   scan_outputr  s                              r.   slow_forwardzJambaMambaMixer.slow_forward  s   !-!3!3
GQ""<<5??1E.44QA4>t%)N,D,DQ,GGM|-MN	00@FFqIZW}}(33DNNCIIK	(33DNNC	!]%9%9:I..7a< ,,T^^<BB1ES)55dnnE
"ZZ
2BG
'4Q1W'=
1a8$;E((8 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB O]]..!**]-@-@-DDaH
 <F((8 $])CC'M)R ST33T5H5HI$++5I !HHT[[%?XgX%NOM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a %%i0	QQ!\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DDw 	6A"1aA:.:XaAqj=QQI,,y||E':AaAgJ<P<PQS<TUKAq!G 45	6 kk,B7!]TVVD!TM5J%JK"TXXd^36?L##DNN3 !%k.C.CAq.I J$$r/   c                     | j                   rPt        r,d| j                  j                  j                  j
                  vrt        d      | j                  |||      S | j                  |||      S )NcudazsFast Mamba kernels are not available. Make sure to they are installed and that the mamba module is on a CUDA device)	r   r   r   r(   rM   type
ValueErrorr   r1  )r*   r:   r  r   s       r.   r=   zJambaMambaMixer.forward7  sl       )V4;;;M;M;T;T;Y;Y-Y  J  ,,]L.YY  nMMr/   )NN)rC   rD   rE   r   r   r$   r&   r   rI   r   r   r1  r=   rF   rG   s   @r.   r   r   +  s    B{ BN AE26	h%||h% 7=h% ((4/	h%VR%7WZ^7^ R%w|  xH  xH  KO  xO R%p AE26	N 7=N ((4/	Nr/   r   c                   $     e Zd Z fdZd Z xZS )JambaMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y NFr   )r#   r$   r^   r+   r`   r   r   	gate_projup_proj	down_projr   r   act_fnr*   r^   r-   s     r.   r$   zJambaMLP.__init__G  s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r/   c                     | j                  | j                  | j                  |            | j                  |      z        }|S rf   )r<  r=  r:  r;  )r*   r   r<  s      r.   r=   zJambaMLP.forwardQ  s6    NN4;;t~~a/@#ADLLQRO#ST	r/   )rC   rD   rE   r$   r=   rF   rG   s   @r.   r7  r7  F  s    0r/   r7  c                        e Zd ZdZdef fdZdej                  dej                  dej                  dej                  fdZ xZ	S )	JambaExpertsz2Collection of expert weights stored as 3D tensors.r^   c                    t         |           |j                  | _        |j                  | _        |j                  | _        t        j                  t        j                  | j                  d| j                  z  | j
                              | _        t        j                  t        j                  | j                  | j
                  | j                              | _        t        |j                     | _        y )Nr1   )r#   r$   num_local_expertsnum_expertsr+   
hidden_dimr`   intermediate_dimr   r%   r&   emptygate_up_projr<  r   r   r=  r>  s     r.   r$   zJambaExperts.__init__Z  s    !33 ,, & 8 8LLT5E5Eq4K`K`G`bfbqbq)rsekk$2B2BDOOUYUjUj&klV../r/   r:   top_k_indextop_k_weightsrp   c                 f   t        j                  |      }t        j                         5  t         j                  j                  j                  || j                        }|j                  ddd      }t        j                  |j                  d      d      j                         }d d d        D ]  }|d   }|| j                  k(  rt        j                  |         \  }}	||	   }
t        j                  j                  |
| j                  |         j                  dd      \  }}| j                  |      |z  }t        j                  j                  || j                   |         }|||	|d f   z  }|j#                  d|	|j%                  |j&                                |S # 1 sw Y   xY w)N)num_classesr1   r   r   )r2   r   rr   r2   )r&   r  r  r   r   one_hotrD  permutegreaterr'  nonzerowherelinearrH  r  r=  r<  
index_add_r5   r4   )r*   r:   rI  rJ  final_hidden_statesexpert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stater  upcurrent_hidden_statess                 r.   r=   zJambaExperts.forwardc  s    $..}=]]_ 	S((--55ktO_O_5`K%--aA6K{8'DaHPPRJ	S
 % 
	nJ#AJT---#(;;{:/F#G Iy))4M}}++M4;L;LZ;XY__`agi_jHD"$(KK$5$:!$&MM$8$89NPTP^P^_iPj$k!$9M)U^`dJd<e$e!**1i9N9Q9QReRkRk9lm
	n #"#	S 	Ss   A=F&&F0)
rC   rD   rE   r   r   r$   r&   r   r=   rF   rG   s   @r.   rA  rA  V  sM    <0{ 0#||# \\# ||	#
 
#r/   rA  c                   f     e Zd ZdZdef fdZd Zdej                  dej                  fdZ	 xZ
S )JambaSparseMoeBlocka  
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accommodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    r^   c                 ,   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        |      | _        y r9  )r#   r$   r+   rE  r`   ffn_dimrD  num_experts_per_toktop_kr   r   routerrA  expertsr>  s     r.   r$   zJambaSparseMoeBlock.__init__  sm     ,,//!--//
ii1A1AN#F+r/   c                     t         j                  j                  j                  |dt         j                        }t        j
                  || j                  d      \  }}||j                  |j                        fS )Nr2   r   rr   )	r&   r   r   r   r  topkrb  r5   r4   )r*   r:   router_logitsrouting_weightsrJ  rI  s         r.   route_tokens_to_expertsz+JambaSparseMoeBlock.route_tokens_to_experts  sb    ((--55mSXS^S^5_%*ZZQS%T"{M,,]-@-@AAAr/   r:   rp   c                     |j                   \  }}}|j                  d|      }| j                  |      }| j                  ||      \  }}| j	                  |||      }|j                  |||      }|S )Nr2   )r@   r   rc  ri  rd  r   )r*   r:   r_   sequence_lengthrE  rg  rI  rJ  s           r.   r=   zJambaSparseMoeBlock.forward  sx    2?2E2E/
OZ%**2z:M2%)%A%A-Q^%_"]]KO%--j/:Vr/   )rC   rD   rE   r   r   r$   ri  r&   r   r=   rF   rG   s   @r.   r^  r^  ~  s5    	,{ ,B
U\\ ell r/   r^  c                        e Zd Zdedef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
ej                  dz  dee   dej                  fdZ xZS )JambaAttentionDecoderLayerr^   rk   c                 R   t         |           |j                  r|j                  |   nd}t        ||      | _        |dkD  rt
        nt        } ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr   r   )r#   r$   layers_num_expertsr   	self_attnr^  r7  feed_forwardr!   r+   r   input_layernormpre_ff_layernormr*   r^   rk   rD  ffn_layer_classr-   s        r.   r$   z#JambaAttentionDecoderLayer.__init__  s    >D>W>Wf//	:]^'	:1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr/   Nr:   r   position_idsr   r+  r{   r   rp   c           
          |}| j                  |      } | j                  d||||||d|\  }}	||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r:   r   rv  r   r+  r{   r   )rr  rp  rs  rq  )
r*   r:   r   rv  r   r+  r{   r   residualrd   s
             r.   r=   z"JambaAttentionDecoderLayer.forward  s     !,,];)4>> 
')%+)
 
q !=0 --m<))-8 =0r/   )NNNFN)rC   rD   rE   r   r   r$   r&   r   r   rI   boolr   r   FloatTensorr=   rF   rG   s   @r.   rm  rm    s    Z{ Zs Z /304CG!&26|| t+ &&-	
 :D@ $; ((4/ +, 
		r/   rm  c                        e Zd Zdedef fdZ	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
e   d
ej                  fdZ xZS )JambaMambaDecoderLayerr^   rk   c                 T   t         |           |j                  r|j                  |   nd}t        ||      | _        |dkD  rt
        nt        } ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr   )r^   rk   r   )r#   r$   ro  r   rK   r^  r7  rq  r!   r+   r   rr  rs  rt  s        r.   r$   zJambaMambaDecoderLayer.__init__  s    >D>W>Wf//	:]^$FiH
1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr/   Nr:   r   rv  r   r   rp   c                     |}| j                  |      }| j                  |||      }||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r:   r  r   )rr  rK   rs  rq  )r*   r:   r   rv  r   r   rx  s          r.   r=   zJambaMambaDecoderLayer.forward  sv     !,,];

'() # 

 !=0 --m<))-8 =0r/   r   )rC   rD   rE   r   r   r$   r&   r   r   rI   r   r   rz  r=   rF   rG   s   @r.   r|  r|    s    Z{ Zs Z /304CG|| t+ &&-	
 :D@ +, 
		r/   r|  c                        e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZeege eej"                  d      d	Z ej(                          fd
       Z xZS )JambaPreTrainedModelr^   modelTrm  r|  r   rc  )
layer_name)r:   
attentionsrg  c                    t         |   |       t        |t              rt	        j
                  d|j                  dz         d d d f   }|j                  |j                  d      j                         }t        j                  |j                  t	        j                  |             t        j                  |j                         y t        |t               rmt        j"                  |j$                  d| j&                  j(                         t        j"                  |j*                  d| j&                  j(                         y y )Nr   r2   r   )r8   std)r#   _init_weightsr$  r   r&   r   ra   r   r`   r   initr  r   r   ones_r   rA  normal_rH  r^   initializer_ranger<  )r*   r   r   r-   s      r.   r  z"JambaPreTrainedModel._init_weights  s    f%fo.Q 5 5 9:47CA1126AACAJJv||UYYq\2JJvxx -LL,,3DKK<Y<YZLL))9V9VW .r/   )rC   rD   rE   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulrm  r|  r   r   r   r   _can_record_outputsr&   r  r  rF   rG   s   @r.   r  r    sw    &*#57OP"3NL46LM$'		hG U]]_	X 	Xr/   r  )	attentionrK   c                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  dej                  dz  d	edz  d
ej                  dz  dee   defd              Zd Z xZS )
JambaModelr^   c                     t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        g }t        |j                        D ]1  }t        |j                  |      }|j                   |||             3 t        j                  |      | _        t!        |j                  |j"                        | _        d| _        | j)                          y )N)rk   r   F)r#   r$   pad_token_idpadding_idx
vocab_sizer   	Embeddingr+   embed_tokensrW   rX   ALL_DECODER_LAYER_TYPESrO   r[   
ModuleListlayersr!   r   final_layernormgradient_checkpointing	post_init)r*   r^   decoder_layersrc   layer_classr-   s        r.   r$   zJambaModel.__init__  s     !.. ++LL):):F<N<NPTP`P`av//0 	DA1&2J2J12MNK!!+f"BC	D mmN3+F,>,>FDWDWX&+#r/   N	input_idsr   rv  r   inputs_embedsr+  r{   r   rp   c           
         |d u |d uz  rt        d      || j                  |      }|r<|:t        | j                  |j                  d   |j
                  |j                        }|F||j                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }
| j                  ||      }|}| j                  D ]%  }t        |t              r|n|
} ||f|||||d|}' | j!                  |      }|r|j"                  sd|_        t%        ||	      S )
Nz:You must specify exactly one of input_ids or inputs_embedsr   )r^   r_   r4   rM   r   rN   )r^   input_embedsr   r{   r   rv  )r   rv  r   r+  r{   T)last_hidden_stater   )r5  r  rI   r^   r@   r4   rM   rx   r&   r   r   r   _update_mamba_maskr  r$  r|  r  rP   r   )r*   r  r   rv  r   r  r+  r{   r   past_seen_tokensr   
mamba_maskr:   decoder_layer
layer_masks                  r.   r=   zJambaModel.forward  s    -t";<YZZ  --i8M0>{{(..q1#))$++	O !CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;&))+%
 ,,^^L
%![[ 	M'1-AW'X^iJ))) /#- M	 ,,];?#E#E15O.%++
 	
r/   c                 V    |}||d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        Nr   r   )r&   r   )r*   r   r{   r  s       r.   r  zJambaModel._update_mamba_maskd  s<     $
&>!+<q+@&599^q5H+IJr/   )NNNNNNN)rC   rD   rE   r   r$   r   r   r&   r   r   rI   rz  ry  r   r   r   r=   r  rF   rG   s   @r.   r  r    s    { $  .2.204CG26!%26A
##d*A
 t+A
 &&-	A

 :D@A
 ((4/A
 $;A
 ((4/A
 +,A
 
 A
  A
Fr/   r  gate_logitsrD  c                    | t        | t              syt        | t              rC| d   j                  }t        j                  | D cg c]  }|j                  |       c}d      }t        j                  j                  j                  d      }t        j                  ||d      \  }}	t        j                  j                  j                  |	|      }
|>t        j                  |
j                         d      }t        j                  |d      }n|j                  \  }}|j                  d   ||z  z  }|dddddddf   j                  |||||f      j                  d||      j                        }t        j                   |
j                         |z  d      t        j                   |d      z  }|ddddddf   j                  ||||f      j                  d|      j                  |      }t        j                   ||z  d      t        j                   |d      z  }t        j                   ||j#                  d      z        }||z  S c c}w )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   rr   r2   )r$  r?   rM   r&   rt   r5   r   r   r   rf  rM  r8   r  r@   r   r   r'  r   )r  rD  rb  r   compute_device
layer_gateconcatenated_gate_logitsrh  rd   selected_expertsrU  tokens_per_expertrouter_prob_per_expertr_   rk  rX   expert_attention_mask router_per_expert_attention_maskoverall_losss                      r.   load_balancing_loss_funcr  r  s9   : *[%"@+u%$Q..#(99^i-jPZjmmN.K-jpq#r hh))112JPR1SO**_eDA((%%--.>LK!JJ{'8'8':B "'O!C&4&:&:#
O4::1=*B^_ 4AtT12V&
OUKXYWR,R	 	 "IIk&7&7&9<Q&QWXY\a\e\e!q]
 
 4At+,V&
O[QRWR%R	 	) "'?=]+]cd!ehmhqhq,!i
 "
 99.1G1Q1QRS1TTUL+%%[ .ks   Ic                   t    e Zd ZddiZddiZddgdgfiZdef fdZee		 	 	 	 	 	 	 	 	 	 dd
e
j                  d	z  de
j                  d	z  de
j                  d	z  ded	z  de
j                  d	z  de
j                  d	z  ded	z  ded	z  de
j                  d	z  dee
j                  z  dee   defd              Z xZS )JambaForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr:   logitsr^   c                 N   t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        |j                  | _	        |j                  | _
        |j                  | _        | j                          y r9  )r#   r$   r  r  r  r   r   r+   r  router_aux_loss_coefrD  ra  r  r>  s     r.   r$   zJambaForCausalLM.__init__  s     '
 ++yy!3!3V5F5FUS$*$?$?!!--#)#=#=  	r/   Nr  r   rv  r   r  labelsr+  output_router_logitsr{   logits_to_keepr   rp   c                 l   ||n| j                   j                  } | j                  d||||||||	d|}|j                  }t	        |
t
              rt        |
 d      n|
}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |}d}|rYt        |j                  | j                  | j                  |      }|+|| j                  |j                  |j                         z  z  }t#        ||||j$                  |j&                  |j(                  |j                        S )aj  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, JambaForCausalLM

        >>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r  r   rv  r   r  r+  r  r{   )lossaux_lossr  r   r:   r  rg  r   )r^   r  r  r  r$  r   slicer  loss_functionr  r  rg  rD  ra  r  r5   rM   r   r   r:   r  )r*   r  r   rv  r   r  r  r+  r  r{   r  r   outputsr:   slice_indicesr  r  r  s                     r.   r=   zJambaForCausalLM.forward  sX   P %9$D $++JjJj 	
 +5$** 
+
)%+'!5)
+
 
+
  118B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
r/   )
NNNNNNNNNr   )rC   rD   rE   _tied_weights_keys_tp_plan_pp_planr   r$   r   r   r&   r   r   rI   rz  ry  r   r   r   r   r=   rF   rG   s   @r.   r  r    sV   *,GH23H_-z:;H
{ 
  .2.204CG26*.!%,026-.R
##d*R
 t+R
 &&-	R

 :D@R
 ((4/R
   4'R
 $;R
 #TkR
 ((4/R
 ell*R
 +,R
 
#R
  R
r/   r  c                       e Zd Zy)JambaForSequenceClassificationN)rC   rD   rE   r   r/   r.   r  r  -  s    r/   r  )r  r  r  r  )r   )r   )Nr1   N)Fcollections.abcr   typingr   r&   r    r   r  activationsr   
generationr	   integrationsr
   r   r   r   r   masking_utilsr   modeling_layersr   r   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   configuration_jambar   
get_loggerrC   r   Moduler!   rI   r   r   r   r   r   r  r   r   r   r7  rA  r^  rm  r|  r  r  r  r?   r  r  r  __all__r   r/   r.   <module>r     s  2 %    & ! )  0 [ Q F & R R ? , 
		H	% Y'J299 J (J(\3 \3~( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%4 )*6)RYY 6) +6)rXNbii XNvryy   $#299 $# $#N"")) "J%!; %P7 BX? X8 )CMcd  c% c cP #
*.	O&ell 33d:O&tO& LL4'	O&
 \\CO&d e
+_ e
 e
P	%EG[ 	 gr/   