
import math
from itertools import cycle
from typing import Any, Callable, Optional

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ... import initialization
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...integrations import use_kernel_func_from_hub
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, is_torchdynamo_compiling, logging
from ...utils.generic import is_flash_attention_requested, maybe_autocast
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
from .configuration_zamba2 import Zamba2Config


# Optional fused kernels; fall back to the pure-PyTorch paths when absent.
if is_mamba_ssm_available():
    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
else:
    selective_state_update, mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined = None, None, None

if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
    causal_conv1d_fn, causal_conv1d_update = None, None
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3m4Z4 ddl5m6Z6  e4       rd dl7m8Z8 d dl9m:Z:m;Z; nd\  Z8Z:Z; e3       r	d dl<m=Z=m>Z> nd\  Z>Z= e.j~                  e@      ZA G d dej                  j                        ZC G d  d!e	j                        ZD G d" d#      ZE G d$ d%e	j                        ZFd&ej                  d'eHd(ej                  fd)ZI	 dQd*e	j                  d+ej                  d,ej                  d-ej                  d.ej                  dz  d/eJd0eJfd1ZKd2 ZL ed3      dRd4       ZM G d5 d6e	j                        ZNd7ej                  d8eHfd9ZOd: ZPd; ZQ eRe8e=e>f      ZS G d< d=e	j                        ZT G d> d?e	j                        ZU G d@ dAe	j                        ZV G dB dCe      ZW G dD dEe      ZX G dF dGe(      ZYe, G dH dIeY             ZZ G dJ dKeYe      Z[ e,dLM       G dN dOeY             Z\g dPZ]y)S    N)Callable)cycle)AnyOptional)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)Cache)GenerationMixin)use_kernel_func_from_hub)AttentionMaskConverter)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringis_torchdynamo_compilinglogging)is_flash_attention_requestedmaybe_autocast)is_causal_conv1d_availableis_mamba_ssm_available   )Zamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_updateNNc                   (     e Zd Zd fd	ZddZ xZS )Zamba2RMSNormGatedc                     t         |           t        j                  t	        j
                  |            | _        || _        || _        y N)	super__init__r   	Parametertorchonesweightvariance_epsilon
group_size)selfhidden_sizer7   eps	__class__s       t/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/zamba2/modeling_zamba2.pyr1   zZamba2RMSNormGated.__init__?   s6    ll5::k#:; #$    c                 b   |j                   }|j                  t        j                        }|?|t        j
                  j                  |j                  t        j                              z  }|j                  ^ }}|| j                  z  } |j                  g ||| j                   }|j                  d      j                  dd      }|t        j                  || j                  z         z  } |j                  g ||| j                  z   }| j                  |j                  |      z  S N   T)keepdim)dtypetor3   float32r   
functionalsilushaper7   viewpowmeanrsqrtr6   r5   )	r8   hidden_statesgateinput_dtypeprefix_dimslast_dimgroup_counthidden_states_groupvariances	            r<   forwardzZamba2RMSNormGated.forwardE   s   #))%((7)BMM,>,>twwu}}?U,VVM!.!4!4h$//10m00\+\{\DOO\&**1-222t2D1EKK4K`K`@`4aa0+00]+]{T__?\]{{]--k:::r=   gư>r/   )__name__
__module____qualname__r1   rU   __classcell__r;   s   @r<   r-   r-   >   s    %;r=   r-   c                   ,     e Zd Zd fd	Zd Zd Z xZS )Zamba2RMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z<
        Zamba2RMSNorm is equivalent to T5LayerNorm
        N)r0   r1   r   r2   r3   r4   r5   r6   )r8   r9   r:   r;   s      r<   r1   zZamba2RMSNorm.__init__T   s1     	ll5::k#:; #r=   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S r?   )	rC   rD   r3   rE   rJ   rK   rL   r6   r5   )r8   rM   rO   rT   s       r<   rU   zZamba2RMSNorm.forward\   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r=   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler5   rH   r6   r8   s    r<   
extra_reprzZamba2RMSNorm.extra_reprc   s*    ))*+6$2G2G1HIIr=   rV   )rW   rX   rY   r1   rU   rc   rZ   r[   s   @r<   r]   r]   S   s    $;Jr=   r]   c                      e Zd ZdZdZej                  dfdededej                  de
dz  fdZd	 Z	 dd
ej                  dej                  dedee
ef   dz  deej                  ej                  f   f
dZdej$                  fdZddedz  defdZdedej                  dej$                  dej                  fdZd Zy)Zamba2HybridDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNconfig
batch_sizerC   devicec           	      .   || _         |j                  | _        d| _        t        |j                  |j
                  z        | _        |j                  | _        |j                  | _
        |j                  | _        g | _        i | _        i | _        i | _        i | _        i | _        t%        |j&                        D ]  }t)        j*                  || j                  d|j,                  z  |j                  z  z   | j                  ||      | j                   |<   t)        j*                  || j                  |j.                  | j                  ||      | j"                  |<   | j                  |   dk(  s| j                  j1                  |        t%        |j&                        D cg c]  }t)        j2                  g g|z  |       c}| _        t%        |j&                        D cg c]  }t)        j2                  g g|z  |       c}| _        y c c}w c c}w )NFr@   rh   rC   hybridrh   )rC   layers_block_typehas_previous_stateintmamba_expandr9   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headstransformer_layers_modules_parameters_buffersconv_states
ssm_statesrangenum_hidden_layersr3   zerosmamba_ngroupsmamba_headdimappendtensor	key_cachevalue_cache)r8   rf   rg   rC   rh   i_s          r<   r1   z!Zamba2HybridDynamicCache.__init__w   s    
!'!9!9"'!$V%8%86;M;M%M!N$22 & 3 3#11"$v//0 	2A"'++&&V-A-A)AFDXDX)XX%%#DQ "'D..0D0DdFYFYbhpu"DOOA %%a(H4''..q1	2 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts   !"H""Hc                 ,    t        | j                        S r/   )lenr   rb   s    r<   __len__z Zamba2HybridDynamicCache.__len__   s    4>>""r=   
key_statesvalue_states	layer_idxcache_kwargsreturnc                    | j                   |   j                  d   dk(  r|| j                   |<   || j                  |<   nft        j                  | j                   |   |gd      | j                   |<   t        j                  | j                  |   |gd      | j                  |<   | j                   |   | j                  |   fS )NrA   r   r@   dim)r   rH   r   r3   cat)r8   r   r   r   r   s        r<   updatezZamba2HybridDynamicCache.update   s     >>)$**2.!3(2DNN9%*6DY'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEr=   beam_idxc                    | j                         dkD  rvt        t        | j                              D ]S  }| j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   V yy)zDReorders the cache for beam search, given the selected beam indices.r   N)
get_seq_lengthr}   r   r   rh   index_selectrD   r   r{   r|   )r8   r   r   rh   s       r<   reorder_cachez&Zamba2HybridDynamicCache.reorder_cache   s[    1$"3t~~#67 	m		299,0NN9,E,R,RSTV^VaVabhVi,jy))))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +)))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +3::-1__Y-G-T-TUVX`XcXcdjXk-l	*	m %r=   c                     || j                   vr| j                   d   n|}t        | j                        |k  s | j                  |   j                         dk(  ry| j                  |   j                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )rw   r   r   numelrH   )r8   r   s     r<   r   z'Zamba2HybridDynamicCache.get_seq_length   sl     3<4CZCZ2ZD++A.`i	t~~)+t~~i/H/N/N/PTU/U~~i(..r22r=   new_conv_statecache_positionc                 T   | j                   |   }|j                  d| j                  dz
        }|j                  dd      }|j	                  |j
                        |d d d d |f<   | j                   |   j                          | j                   |xx   |z  cc<   | j                   |   S )Nr   r#   rA   shiftsdims)r{   clampru   rollrD   rh   zero_)r8   r   r   r   
conv_states        r<   update_conv_statez*Zamba2HybridDynamicCache.update_conv_state   s     %%i0
'--a1F1F1JK__BR_8
+9+<+<Z=N=N+O
1a'(#))+#z1#	**r=   c                 l    | j                   j                          | j                  j                          y r/   )r{   r   r|   rb   s    r<   resetzZamba2HybridDynamicCache.reset   s$     r=   r/   )r   )rW   rX   rY   __doc__is_compileabler3   float16r$   ro   rC   strr1   r   Tensordictr   ra   r   
LongTensorr   r   r   r    r=   r<   re   re   g   s     N KP--nru"u03u<AKKuadgkaku@# /3FLLF llF 	F
 38nt+F 
u||U\\)	*F"me&6&6 m3d
 33 3
+
+.3ll
+LQL\L\
+	
+ r=   re   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )Zamba2RotaryEmbeddinginv_freqNrf   c                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)r0   r1   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrf   rope_parametersr   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r8   rf   rh   rope_init_fnr   r;   s        r<   r1   zZamba2RotaryEmbedding.__init__   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUr=   rh   ztorch.deviceseq_lenr   ztorch.Tensorc                    | j                   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r@   rC   rj   )	r   getattrr9   num_attention_headsr3   arangeint64rD   float)rf   rh   r   baser   attention_factorr   s          r<   r   z5Zamba2RotaryEmbedding.compute_default_rope_parameters   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r=   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rA   r#   mpscpuF)device_typeenabledr@   r   r   )r   r   expandrH   rD   rh   
isinstancetyper   r    	transposer3   r   cosr   sinrC   )
r8   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r<   rU   zZamba2RotaryEmbedding.forward  sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$r/   r(   )rW   rX   rY   r3   r   __annotations__r$   r1   staticmethodr   ro   ra   r   r   no_gradr   rU   rZ   r[   s   @r<   r   r      s    llV| V  &*+/"*t#*(* t* 
~u$	%	* *: U]]_<  <r=   r   rM   n_repr   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r#   N)rH   r   reshape)rM   r   batchnum_key_value_headsslenr   s         r<   	repeat_kvr     so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr=   modulequerykeyvalueattention_maskscalingdropoutc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr@   r   r   rA   )r   rC   )ptrainingr#   )r   num_key_value_groupsr3   matmulr   rH   r   rF   softmaxrE   rD   rC   r   r   
contiguous)r   r   r   r   r   r   r   kwargsr   r   attn_weightscausal_maskattn_outputs                r<   eager_attention_forwardr     s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r=   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrA   r@   r   )rH   r3   r   )r   x1x2s      r<   rotate_halfr   9  sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r=   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embeds          r<   apply_rotary_pos_embr	  @  sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr=   c                   F    e Zd ZdZ	 	 	 ddededz  dedz  dedz  f fdZ	 	 	 ddej                  ded	ej                  dz  d
e	dz  de
ej                  ej                  f   dz  dee   de
ej                  ej                  dz  e
ej                     dz  f   fdZ xZS )Zamba2AttentionaZ  
    Multi-headed attention from 'Attention Is All You Need' paper.

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
    layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
    expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
    Nrf   r   num_fwd_mem_blocksblock_idc           	         t         |           || _        || _        |j                  | _        |j
                  | _        |j                  |j                  z  | _	        |j                  | _
        | j                  dz  dz  | _        d| _        |j                  | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j&                  d      | _        || _        |j,                  | _        || _        |j2                  rt        j4                  g       | _        t        j4                  g       | _        t        j4                  g       | _        t=        | j*                        D ]  }||j>                  z  |k(  r{t        j@                  t        j                  | j                  | j                  jB                  d      t        j                  | j                  jB                  | j                  d            }t        j@                  t        j                  | j                  | j                  jB                  d      t        j                  | j                  jB                  | j                  d            }t        j@                  t        j                  | j                  | j                  jB                  d      t        j                  | j                  jB                  | j                  d            }n<t        jD                         }t        jD                         }t        jD                         }| j6                  jG                  |       | j8                  jG                  |       | j:                  jG                  |       ! tI        | j.                        D 	
ci c]  \  }	}
|
|	
 c}
}	| _%        y c c}
}	w )Nr@   g      TFbias)&r0   r1   rf   r   attention_hidden_sizeattention_head_dimr   r   r   r   r   r   	is_causalattention_dropoutr   Linearq_projk_projv_projr9   o_projr  hybrid_layer_idslayer_block_mapr  use_shared_attention_adapter
ModuleListlinear_q_adapter_listlinear_k_adapter_listlinear_v_adapter_listr}   num_mem_blocks
Sequentialadapter_rankIdentityr   	enumerate	layer_dic)r8   rf   r   r  r  r   linear_q_adapterlinear_k_adapterlinear_v_adapterindexr   r;   s              r<   r1   zZamba2Attention.__init__j  sJ    	"%+%A%A"11$*$>$>&B\B\$\!'-'E'E$)d2!'!9!9ii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii : :T]] JFL^L^ejk"4%66 ..)+r):D&)+r):D&)+r):D&4223 Dv,,,8')}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($
 (*{{}$'){{}$'){{}$**112BC**112BC**112BC)D, <ETEYEY;Z[<5%%,[[s   Q6rM   r   past_key_valuesposition_embeddingsr   r   c                    |j                   d d }g |d| j                  }| j                  |      }	| j                  |      }
| j	                  |      }| j
                  j                  rW| j                  |   }|	 | j                  |   |      z   }	|
 | j                  |   |      z   }
| | j                  |   |      z   }|	j                  |      j                  dd      }	|
j                  |      j                  dd      }
|j                  |      j                  dd      }| j
                  j                  r|\  }}t        |	|
||      \  }	}
||j                  |
||      \  }
}t!        j"                  | j
                  j$                  t&              } || |	|
||f| j(                  sdn| j*                  | j,                  d|\  }} |j.                  g |d j1                         }| j3                  |      }||fS )NrA   r#   r@           )r   r   )rH   r   r  r  r  rf   r  r&  r  r  r   rI   r   use_mem_roper	  r   r   get_interface_attn_implementationr   r   r  r   r   r   r  )r8   rM   r   r   r+  r,  r   input_shapehidden_shapequery_statesr   r   adapter_layer_idxr   r   attention_interfacer   r   s                     r<   rU   zZamba2Attention.forward  s    $))#2.88b8$--8{{=1[[/
{{=1;;33 $y 9'*W$*D*DEV*WXe*ffL#&Sd&@&@AR&STa&bbJ'*W$*D*DEV*WXe*ffL#((6@@AF__\2<<QB
#((6@@AF;;##*HC';L*VY[^'_$L*&'6'='=j,Xa'b$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r=   r(   )rW   rX   rY   r   r$   ro   r1   r3   r   re   ra   r   r   rU   rZ   r[   s   @r<   r  r  Z  s    $ !%)-#6\6\ :6\  $J	6\
 *6\x /3;?HL1)||1) 1) t+	1)
 2D81) #5<<#=>E1) -.1) 
u||U\\D0%2E2LL	M1)r=   r  input_tensorpad_sizec                     t        | j                        dk(  r
ddddd|ddfnddd|ddf}t        j                  j                  j                  | |dd      S )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)moder   )r   rH   r3   r   rF   pad)r7  r8  	pad_shapes      r<   pad_tensor_by_sizer?    sf     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UUr=   c                    t        | |      } t        | j                        dk(  r.| j                  | j                  d   d|| j                  d         S | j                  | j                  d   d|| j                  d   | j                  d         S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r   r   rA   r@   )r?  r   rH   r   )r7  r8  
chunk_sizes      r<   reshape_into_chunksrB    s     &lH=L
<!###L$6$6q$92z<K]K]^_K`aa ##q!2z<3E3Ea3H,J\J\]^J_
 	
r=   c                 "   | j                  d      } | d   j                  g | j                         | } t        j                  t        j                  ||| j
                  t        j                        d      }| j                  | d      } t        j                  | d      }t        j                  t        j                  ||| j
                  t        j                        d      }|j                  | t        j                         }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    rA   .Nrj   diagonalr   r   r   )
sizer   r3   trilr4   rh   boolmasked_fillcumsuminf)r7  rA  masktensor_segsums       r<   segment_sumrO    s     ""2&J 2<	*11S<3D3D3FS
SL::ejjZ@S@S[`[e[efqstD++TE15LLL26M ::ejjZ@S@S[`[e[efqrsD!--teeiiZ@Mr=   c                        e Zd ZdZddededz  f fdZ	 	 ddej                  de	dz  dej                  dz  fd	Z
dde	dz  dej                  dz  fd
Z	 	 dde	dz  dej                  dz  fdZ xZS )Zamba2MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    Nrf   r   c           	         t         |           || _        |j                  | _        |j                  | _        |j                  | _        t        |j                  | j                  z        | _
        || _        |j                  | _        d| _        t        j                         | _        |j"                  | _        |j$                  | _        |j(                  | _        | j                  j,                  | _        |j0                  | _        |j2                  | _        |j4                  | _        |j6                  | _        | j                  d| j&                  z  | j
                  z  z   | _        t        j:                  | j8                  | j8                  d|j                  | j8                  |j                  dz
        | _        | j                  | j8                  z   | j.                  z   }t        j>                  | j                  ||j@                        | _!        t        jD                  tG        jH                  | j.                              | _%        tG        jL                  d| j.                  dz         }t        jD                  tG        jN                  |            | _(        tS        | j                  | j                  | j&                  z  d      | _*        t        jD                  tG        jH                  | j.                              | _+        t        j>                  | j                  | j                  |j@                        | _,        tZ        st\        j_                  d	       y y )
NrG   r@   Tr#   )in_channelsout_channelsr  kernel_sizegroupspaddingr  gh㈵>)r7   r:   a  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)0r0   r1   rf   r9   rr   rs   rt   ru   ro   rp   rq   r   use_conv_bias
activationr   SiLUactuse_mem_eff_pathr   n_groupsr   r   rv   	num_headsrA  time_step_limittime_step_mintime_step_maxconv_dimConv1dconv1dr  add_bias_linearin_projr2   r3   r4   dt_biasr   logA_logr-   normDout_projis_fast_path_availableloggerwarning_once)r8   rf   r   projection_sizeAr;   s        r<   r1   zZamba2MambaMixer.__init__  s{   !--$22 & 3 3!$V%8%84;K;K%K!L"#11 779 & 7 7,,,,22 ++%55#11#11..T]]1BTEXEX1XXii++==''!+
 004==@4>>Qyy''
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&""t/E/E/V\`
	 ejj89		$"8"8$:J:JQWQgQgh%> &r=   rM   cache_paramsr   c                    |j                   \  }}}| j                  | j                  z  }d| j                  z  d| j                  z  | j                  z  z   | j                  z   }|4|j
                  r'| j                  |j                  d            }	|	j                   d   |z
  dz  }
|
|
| j                  | j                  | j                  g}t        j                  |	|d      \  }}}}}t        ||j                  | j                     | j                  j                  j                  d      | j                  j                   | j"                        }t        j                  || j                  ||gd      \  }}}t        j$                  | j&                  j)                                }|d d d df   d d d d d f   j+                  d| j,                  | j                        j/                  t        j0                        }|d d d d d f   j+                  dd| j,                        }| j2                  d d d df   j+                  d| j,                        }| j4                  d d d df   j+                  d| j,                        }|j7                  || j                  |j                   d   | j                  z        }|j7                  || j                  |j                   d   | j                  z        }|j7                  || j                  | j,                        }t9        |j:                  | j                     ||||||d |d
      }|j7                  || j                  | j,                  z        }| j=                  ||      }| j?                  |      d d d df   }|S |Bt        j@                  |dk(        s*|jB                  }||d d d d d f   z  j/                  |      }| j                  |      }t        j$                  | j&                  j)                                }| jD                  i nd	| jD                  i}|t        j@                  |dk(        }nd}| jF                  r| jH                  r||rtK        || j                  j                  j                  d      | j                  j                   | j2                  |f| j4                  | jL                  d | j"                  | j<                  j                  | j<                  jN                  | j>                  j                  | j>                  j                   | j,                  | j                  d
dd|\  }}|S t        j                  || j                  | j                  | j                  gd      \  }}}|v|jQ                  dd      }tR        jT                  jW                  || jX                  |j                   d   z
  df      }|j                  | j                     j[                  |       t\        | j"                  dvrJ| j_                  | j                  |jQ                  dd            jQ                  dd      d d d |f         }nyt]        |jQ                  dd      | j                  j                  j                  d      | j                  j                   | j"                        jQ                  dd      d d d |f   }t        j                  || j                  ||gd      \  }}}|Bt        j@                  |dk(        s*|jB                  }||d d d d d f   z  j/                  |      }ta        |j7                  ||d| j,                        |||j7                  ||| j                  d      |j7                  ||| j                  d      f| jL                  | j4                  d d d| j2                  dd|\  }}|*|(|j:                  | j                     j[                  |       |j7                  ||d      }| j=                  ||      }| j?                  |      }|S )Nr@   r#   rA   r   .r   T)zrg  dt_softplusdt_limitF)rk  rA  seq_idxrY  rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   )rG   swish)r   r5   r  rY  )rA  rk  rt  rw  r  rg  ru  )1rH   r]  rs   rq   r^  rn   rf  squeezerb  r3   splitr*   r{   r   rd  r5   r  rY  expri  r   r   r   rD   rE   rg  rk  rI   r%   r|   rj  rl  allrC   r_  r\  r   r'   rA  r6   r   r   rF   r=  ru   copy_r)   r[  r&   )r8   rM   rr  r   rg   r   r   groups_time_state_sized_to_removein_projected_statesd_mlpsplit_projection_dimrN   hidden_states_B_CdtBCrq  rg  rk  hidden_states_reshapedoutrC   projected_statesdt_limit_kwargsinput_not_masked	ssm_state	time_stephidden_states_B_C_tr   
scan_outputs                                  r<   cuda_kernels_forwardz%Zamba2MambaMixer.cuda_kernels_forwardV  sv    "/!4!4
GQ!%1D1D!D$0001t}}3DtGZGZ3ZZ]a]k]kk #(G(G"&,,}/D/DQ/G"H(..r2[@QFE$)5$2H2H$--Y]YgYg#h 05<OQekm0n-Aq$)2 4!((8""**1-  ! #(++!'')?AWX#M1a
 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az4==!''!*2MNAz4==!''!*2MNA%2%7%7
DNNTXTaTa%b"2''7& M *..z4>>DMM;YZM IImT:M--.q$|<Cz 
u )%))Na<O2P%++!.1d
1K!K O OPU V#||M:4::++-..A$($8$8$@bzSWSgSgFhO)#(99^q-@#A #' $$<;OTd!A$KK&&..q1KK$$LL" ff# ##'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(,#"$ &%"YX 
m 6;[[$++T]]DNNK62'  +*;*E*Ea*K'!#!2!2+d.C.CFYF_F_`bFc.cef-g"J !,,T^^<BB:N#+tFW/W(,$5$?$?1$EFPPQRTUVWXZb[bZbWbc)% )9+55a;#{{1199!<![[--#'??	)
  i1oa'k)3% ',kk%++-CE[\'#q!
 "-eiiRS@S6T)//E%2^Aq$J5O%O$S$STY$ZM)B!&&z7BNFF:wrBFF:wrB*  $ff (, LL $* &*&Y (\-E ++DNN;AA)L)..z7BG"iiT:mmK0
r=   c                    |j                   \  }}}|j                  }|-|j                  r!| j                  |j	                  d            }n1|||d d d d d f   z  j                  |      }| j                  |      }|j                   d   d| j                  z  z
  d| j                  z  | j                  z  z
  | j                  z
  dz  }	|j                  |	|	| j                  | j                  | j                  gd      \  }}}
}}|_|j                  | j                     j                         }|j                  |j                        }|j                  r1|
j!                  d      }
|j"                  | j                     }t%        j&                  |dd      }|j(                  dk(  r|d d dd d f   n||d d d d df<   |j"                  | j                     j+                  |       t%        j,                  |j                  |j                        | j.                  j0                  d d dd d f   z  d      }| j2                  r|| j.                  j4                  z  }| j7                  |      j                  |      d d d df   }nj|j9                  dd      }t:        j<                  j?                  || j@                  |j                   d   z
  df      }|j"                  | j                     j+                  |       | j7                  | j/                  |      j9                  dd            d d d |d d f   }||j                  }||d d d d d f   z  j                  |      }nt%        jB                  || j                  | jD                  | j                  f|j                  |	      }| j7                  | j/                  |j9                  dd            dd |f   j9                  dd            }t%        j                  || j                  | j                  | j                  z  | j                  | j                  z  gd      \  }}}t%        jF                  | jH                  jK                                }|t|j                  rg|j(                  dk(  r
|d d d df   n|d d dd d f   d d d df   }|j9                  dd      jM                  ||j                   d   | jD                        }| jN                  d
   jM                  | jN                  j                   d   | jD                        }t$        j:                  j<                  jQ                  ||j                  |j                        z         }t%        jR                  || jT                        }|d   jM                  | j                  | jD                  | j                        j                  t$        jV                        }t%        jF                  |d
   |z        }|jY                  || j                  d      dd d d f   }|jM                  || j                  | j                  | j                  z  |j                   d         j[                         }|jY                  |d|j                   d         }|d
   |dd d d f   z  }|jY                  |d| jD                        }||d
   z  }|j                  | j                     j+                  |j                  | j                     |z  |z          |jY                  || j                  d      dd d d f   }|jM                  || j                  | j                  | j                  z  |j                   d         j[                         }|jY                  |d|j                   d         }|j                  | j                     j                  |j                        }|j]                  || j                  z  | jD                  | j                        }|j]                  || j                  z  | j                  d      }t%        j^                  ||      }|j]                  || j                  | jD                        }| j`                  d
   jM                  | j`                  j                   d   | jD                        }|||z  z   j                  |j                        }|jY                  |d      d d d df   }n4t:        j<                  jQ                  || jN                  z         }t%        jR                  || jT                        }|jY                  ||d| jD                        jK                         }|jY                  ||d| j                        jK                         }|jY                  ||d| j                        jK                         }|jc                  | j                  | j                  z  d| j                        }|jc                  | j                  | j                  z  d| j                        }| jd                  || jd                  z  z
  | jd                  z  }| j`                  d
   tg        ||      z  }||d
   z  }|j                  |j                        |z  }||||fD cg c]  }ti        ||| jd                         c}\  }}}}|jk                  dddd      }t%        jl                  |d      }t%        jF                  to        |            }|d d d d d d d d d d d f   |d d d d d d d d d d d f   z  } | j-                  d      }!|!d
   |jk                  ddddd      d
   z  }"|"j-                  d      }#|#d
   |d d d d d f   z  j-                  d      }$t%        jF                  |d d d d d d dd f   |z
        }%||%jk                  dddd      d
   z  }&|&jk                  ddddd      d
   |jk                  ddddd      dd d d f   z  j-                  d      jk                  ddddd      }'|.|j                  r"|j                  | j                     d d d df   }(nt%        jp                  |'d d d df         }(t%        jr                  |(|'gd      }'t%        jF                  to        t:        j<                  j?                  |d d d d d d df   d                  })|'jk                  ddddd      }*|)d   |*d d d d d df   z  j-                  d      }+|+jk                  ddddd      },|,d d d df   |,d d df   }}'t%        jF                  |      }-|dd d d f   |'d d d d d df   z  }.|-jk                  dddd      }/|.j-                  d      |/d
   z  }0|$|0z   }|jY                  |d| j                  | jD                        }||z   }|dkD  r|d d d |d d d d f   }|jY                  ||d      }|*|(|j                  | j                     j+                  |       | ju                  ||
      }1| jw                  |1j                  |            }2|2S c c}w )Nr#   rA   r@   r   r   r   r   .rj   rD  ).NNr   )r   output_sizer:  )r#   r   )<rH   rC   rn   rf  r  rD   rq   r]  rs   r^  r  rb  r|   r   r   rh   r  r{   r3   r   ndimr  sumrd  r5   rX  r  r[  r   r   rF   r=  ru   r   r   r  ri  r   r   rg  softplusr   r`  rE   r   r   rI   bmmrk  repeat_interleaverA  r?  rB  permuterK  rO  
zeros_liker   rj  rl  )3r8   input_statesrr  r   rg   r   r   rC   r  r  rN   rM   r  r  r   r  r  rq  rg  dAdBdBxr|   ssm_states_reshaped
C_reshapedyrk  r8  
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decay_contractionstatesprevious_statesdecay_chunkstates_permutedresult
new_statesstate_decay_outC_times_statesstate_decay_out_permutedY_offr  contextualized_statess3                                                      r<   torch_forwardzZamba2MambaMixer.torch_forward  sY   !-!3!3
GQ""#(G(G#||L,@,@,CD) ,~aDj/I IMMeT#||L9!''+a$2H2H.HH1t}}K\_c_r_rKrrtx  uC  uC  C  HI  I(8(>(>t55t~~V\^ )? )
%1dM2
 #$//?EEGI!]%9%9:I..~~a()55dnnE
"ZZ
2BG
ANASASWXAX}Q1W'=^k
1a8$((8>>zJ %		*--8H8O8O*PSWS^S^SeSefgijlmfmSn*ntv w%%!T[[%5%55M $ 7 : :5 A!T3, O - 7 7! <]]..!**]-@-@-DDaH
 ((8>>zJ $])C)M)MaPQ)R STUW_X_W_abTb c!-)//E%2^Aq$J5O%O$S$STY$ZMT^^T]]D<O<OP$++5I !HHT[[1H1HA1N%OPSU]V]U]P]%^%h%hijlm%noM#kk-$:P:PRVR_R_bfbubuRuw{  xE  xE  HL  H[  H[  x[  :\  bd  eq!YYtzz'')**#(G(G &(WW\AtSL!r!Q'{1dC<7PBa#**:rxx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!3!34B/"))$..$--I\I\]``glgtgt`uA2i=1,-B
 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PM}Y//C ##DNN399''7"<sB 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CCAGGLJ",//*t~~2Mt}}^b^q^q"r
T^^ ;T=P=PRSTJ		-z:Az4>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''T\\(9:BR!3!34B)11*gr4==Y__aM		*gD4G4GHNNPA		*gr43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'DOO*CCtVH	*-?x-XXJ *ByM9M](()B.A cpqrtuwxay%z\]&9!Xt&W%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCAFF !99XaArsl%;h%FGL"#l&:&:1aA&Fy&Q"Q)11!Q1a@K}OdOdefhiklnoqrOstwy}  @A  uA  PB  B  G  G  LM  G  N  V  V  WX  Z[  ]^  `a  cd  eF'L,K,K"."9"9$.."I!TSV,"W"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK$nnQ1a;O!/2_Q4QT_5UUZZ_`ZaF1aA6J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*gr2A$)A''7==iHii4(
 !%knnU.C D$$I &{s   yc                     t         rId| j                  j                  j                  j                  v rt               s| j                  |||      S | j                  |||      S )Ncuda)rm  rf  r5   rh   r   r   r  r  )r8   rM   rr  r   s       r<   rU   zZamba2MambaMixer.forward  sT     "f0C0C0J0J0O0O&OXpXr,,]L.YY!!-~NNr=   r/   r+   )rW   rX   rY   r   r$   ro   r1   r3   r   re   r  r  rU   rZ   r[   s   @r<   rQ  rQ    s    =| =d
 =D 9=.2	T||T /5T t+	Tn%8PSW8W %nsnznz  ~B  oB %J 9=.2		O /5	O t+		Or=   rQ  c                   8     e Zd Zddededz  f fdZddZ xZS )	Zamba2MLPNrf   r  c           	          t         	|           || _        |j                  | _        |j                  | _        || _        || _        t        j                  | j                  d| j                  z  |j                        | _
        t        j                  | j                  | j                  |j                        | _        t        |j                     | _        t        j                  g       | _        t#        | j
                        D ]  }||j$                  z  |k(  rt        j&                  t        j                  | j                  j                  | j                  j(                  d      t        j                  | j                  j(                  d| j                  z  d            }nt        j*                         }| j                   j-                  |        |j.                  }t1        |      D ci c]  \  }}||
 c}}| _        yc c}}w )aQ  
        This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
        is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
        r@   r  FN)r0   r1   rf   r9   rq   r  r  r   r  re  gate_up_proj	down_projr   
hidden_actact_fnr  gate_up_proj_adapter_listr}   r!  r"  r#  r$  r   r  r%  r&  )
r8   rf   r  r  r   gate_up_proj_adapterr  r*  r   r;   s
            r<   r1   zZamba2MLP.__init__  s   
 	!--!'!9!9"4 IId&6&6D<R<R8RY_YoYop4#9#94;K;KRXRhRhiV../)+r):&t../ 	HA6(((H4')}}IIdkk55t{{7O7OV[\IIdkk66D<R<R8RY^_($
 (*{{}$**112FG	H !11;D_;UV<5%%,VVs   3H
c                     | j                  |      }| j                  |   }| | j                  |   |      z   }t        j                  |dd      }| j                  |d         |d   z  }| j                  |      }|S )Nr@   rA   r   r   r#   )r  r&  r  r3   chunkr  r  )r8   hidden_stater   gate_up_stateoutputs        r<   rU   zZamba2MLP.forward  s    )),7NN9-	%(Q(F(Fy(QR^(__M1"={{=#34}Q7GG-r=   r+   r/   )rW   rX   rY   r$   ro   r1   rU   rZ   r[   s   @r<   r  r    s%    W| WPSVZPZ W<r=   r  c                   4    e Zd Zddededz  dedz  f fdZ	 	 	 	 ddej                  dej                  dedej                  dz  d	edz  d
e	dz  dej                  dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )Zamba2AttentionDecoderLayerNrf   r  r   c                 @   t         |           || _        t        |j                        }t        |d||      | _        t        |||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )NrA   )r   r  r  )r  r  r:   )r0   r1   r  r   r  r  	self_attnr  feed_forwardr]   r  rms_norm_epsinput_layernormr9   pre_ff_layernorm)r8   rf   r  r   num_gsr;   s        r<   r1   z$Zamba2AttentionDecoderLayer.__init__  s     V,,-(2RXckl%fRZ[,V-I-IvObObc -f.@.@fFYFY Zr=   rM   original_hidden_statesr   r+  output_attentionsr,  r   r   c           
          t        j                  ||gd      }| j                  |      } | j                  d||||||d|\  }}	| j	                  |      }| j                  ||      }|f}
|r|
|	fz  }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        rA   r   )rM   r   r   r+  r  r,  r   )r3   concatenater  r  r  r  )r8   rM   r  r   r   r+  r  r,  r   self_attn_weightsoutputss              r<   rU   z#Zamba2AttentionDecoderLayer.forward  s    > ))=:P*QWYZ,,];+94>> ,
')+/ 3,
 ,
(( --m<))-C ")++Gr=   r+   )NNFN)rW   rX   rY   r$   ro   r1   r3   r   re   rI  r   r   r   ra   FloatTensorrU   rZ   r[   s   @r<   r  r    s    [| [sTz [UX[_U_ [ /3;?).7;3||3 !&3 	3
 t+3 2D83  $;3 #--43 -.3 
u  %(9(95;L;L(L"MPT"TT	U3r=   r  c                       e Zd Zdedef fdZ	 	 	 	 	 	 	 	 	 	 ddej                  dej                  dz  dedz  dej                  dz  dej                  dz  d	edz  d
e	dz  de	dz  dej                  dz  dej                  dz  dej                  dz  deej                  eej                  ej                  f   dz  f   fdZ xZS )Zamba2MambaDecoderLayerrf   r   c                     t         |           t        ||      | _        t	        |j
                  |j                        | _        || _        y )N)rf   r   r  )	r0   r1   rQ  mambar]   r9   r  r  r   )r8   rf   r   r;   s      r<   r1   z Zamba2MambaDecoderLayer.__init__&  s>    %VyI
,V-?-?VEXEXY"r=   NrM   r  r   r   r+  r  	use_cacher   r   transformer_hidden_statesr   c                     |}|||z   n|}| j                  |      }| j                  |||      }d}||z   }|f}|r||fz  }|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        N)rM   rr  r   )r  r  )r8   rM   r  r   r   r   r+  r  r  r   r   r  r   residualr  r  s                   r<   rU   zZamba2MambaDecoderLayer.forward,  s    > !
 :S9^M55dq 	 ,,];

'() # 
 ! !=0 ")++G))Gr=   )
NNNNNFFNNN)rW   rX   rY   r$   ro   r1   r3   r   re   rI  r   ra   r  rU   rZ   r[   s   @r<   r  r  %  s,   #| # # 7; $.2+/;?).!&26049=;||; !&t 3; :	;
 t+; \\D(; 2D8;  $;; $;; ((4/; &&-; $)<<$#6; 
u  %(9(95;L;L(L"MPT"TT	U;r=   r  c                       e Zd Zdedej
                  def fdZ	 	 	 	 	 	 	 	 	 ddej                  dej                  dz  de
dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedz  dej                  dz  dej                  dz  deej                  eej                  ej                  f   dz  f   fdZ xZS )Zamba2HybridLayershared_transformerlinearr  c                 L    t         |           || _        || _        || _        y r/   )r0   r1   r  mamba_decoderr  )r8   r  r  r  r;   s       r<   r1   zZamba2HybridLayer.__init__k  s'     	""4r=   NrM   r  r   r   r   r+  r  r  r,  r   r   c           
          | j                  |||||||	|
      }|d   }|r|d   }| j                  |      }| j                  |||||||	      }|r|d   f|dd z   }|S )aY  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )r  r   r   r+  r  r,  r   r   r#   )r  r   r+  r  r  r,  r@   N)r  r  r  )r8   rM   r  r   r   r   r+  r  r  r,  r   layer_outputsr  r  s                 r<   rU   zZamba2HybridLayer.forwards  s    B //#9&+/ 3% 0 	
 %2!$4! -a 0$(KK0I$J!**&?)+/ 3 + 
 *1-/@AMRSRTDUUMr=   )	NNNNNFFNN)rW   rX   rY   r  r   r  r  r1   r3   r   ro   re   rI  r   ra   r  rU   rZ   r[   s   @r<   r  r  j  s0   5"=5GIyy5Yp5 7; $.2+/;?).!&7;04@||@ !&t 3@ :	@
 t+@ \\D(@ 2D8@  $;@ $;@ #--4@ &&-@ 
u  %(9(95;L;L(L"MPT"TT	U@r=   r  c                   v     e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZdZ ej                          fd       Z xZS )Zamba2PreTrainedModelrf   modelTr  r  r+  c                    t         |   |       t        |t              rt	        j
                  t	        j                  | j                  j                        t        j                  | j                  j                        t        j                  | j                  j                        z
  z  t        j                  | j                  j                        z         j                  | j                  j                        }|t	        j                  t	        j                  |              z   }t!        j"                  |j$                  |       t	        j&                  d|j(                  dz         }t!        j"                  |j*                  t	        j                  |             t!        j,                  |j.                         y y )N)minr#   )r0   _init_weightsr   rQ  r3   r  randrf   rv   mathrh  ra  r`  r   time_step_floorexpm1initr  rg  r   r^  ri  ones_rk  )r8   r   r  inv_dtrq  r;   s        r<   r  z#Zamba2PreTrainedModel._init_weights  s(   f%f./

4;;44588DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FJJv~~v.Q 0 01 45AJJv||UYYq\2JJvxx  0r=   )rW   rX   rY   r$   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_flex_attn_supports_sdpa_is_statefulr3   r   r  rZ   r[   s   @r<   r  r    sW    &*#68QR"3NLU]]_! !r=   r  c                   .    e Zd ZdZdef fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  d	ej                  dz  d
edz  dedz  dedz  dedz  dej                  dz  deez  fd       Zd Zd Z xZS )Zamba2Modelzh
    Model consisting of *config.num_hidden_layers* layers.

    Args:
        config: Zamba2Config
    rf   c                 L   t         |   |       || _        |j                  | _        |j
                  | _        t        j                  |j
                  |j                  | j                        | _	        |j                  | _
        | j                         | _        |j                  | _        t        |j                  |j                        | _        |j"                  r1|j$                  rt&        j)                  d       t+        |      | _        d| _        | j1                          y )Nr  ze`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`.F)r0   r1   rf   pad_token_idpadding_idx
vocab_sizer   	Embeddingr9   embed_tokensrm   
get_layerslayersr1  r]   r  final_layernormr/  use_long_contextrn  ro  r   
rotary_embgradient_checkpointing	post_initr8   rf   r;   s     r<   r1   zZamba2Model.__init__  s     !.. ++LL):):F<N<NPTP`P`a!'!9!9oo'$*$?$?!,V-?-?VEXEXY&&##{ 4F;DO&+# 	r=   N	input_idsr   r   r+  inputs_embedsr  r  output_hidden_statesreturn_dictr   r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|d u |d uz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|}t        j                  |      }|rO|M||j                  d   n|j                  d   }t        | j                   || j                  | j                         }|
R||j#                  | j$                        nd}t        j&                  |||j                  d   z   |j                         }
||
j)                  d      }| j+                  |||
      }| j-                  ||	      }|rd
nd }|rd
nd }t/        | j0                        D ]6  \  }}|r||fz  } |||||||||||
      }|d   }|s(|d   .||d   fz  }8 | j3                  |      }|r||fz  }||j4                  sd|_        t7        ||r|nd ||      }|	r|S |j9                         S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   rC   rh   r   r#   rl   )r   r   )r+  r  r  r,  r   T)last_hidden_stater+  rM   
attentions)rf   r  r  r  use_return_dict
ValueErrorr  r   rn  ro  r  r3   r   rH   re   rC   rh   r   first_transformer_layer_idr   r  _update_causal_maskr  r%  r  r  rn   r   to_tuple)r8   r  r   r   r+  r  r  r  r  r  r   r   rM   r  rg   past_seen_tokensr   r,  all_hidden_statesall_self_attnsr   layerr  r  s                           r<   rU   zZamba2Model.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<s  &&4==Yj I  --i8M%!&]!; 0/8/D+-J]J]^_J`J6t{{JVZV`V`imitituO! #.  ..9X9X.Y 
 #\\ "2]5H5H5K"KTaThThN )33A6L..~}n]"oom,oW"6BD0d )$++ 6 	:Iu#!m%55!!& /"3#$7)M *!,M  #/"}Q'7&99N-	:0 ,,];  -!11&/Q/Q15O.(+/8Od+%	
 %v;&//*;;r=   c                    t        | j                        r	|d|v r|S y |j                  |j                  }}t	        j
                  |      j                  }|j                  d   }||}n|d   dz   }t	        j                  ||f|||      }	|dk7  rt	        j                  |	d      }	|J|	t	        j                  ||      |j                  dd      kD  j                  t        j                        z  }	|	d d d d d d f   j                  |j                  d   ddd      }	|w|	j                         }	|j!                         dk(  rT|d d d d d d f   j                  dd|d      }
|	j#                  d      |
j#                  d      z  }|	j%                  ||       | j                  j&                  d	k(  r0|.|j                  j(                  d
v rt+        j,                  |	|      }	|	S )Nr.  r#   rA   )
fill_valuerC   rh   rE  rl   r   r@   sdpa)r  xpunpu)r   rf   rC   rh   r3   finfor  rH   fulltriur   r   rD   rI  r   r   r   eqmasked_fill_r1  r   r   _unmask_unattended)r8   r   r7  r   rC   rh   	min_dtypesequence_lengthtarget_lengthr   expanded_attn_maskpadding_masks               r<   r"  zZamba2Model._update_causal_mask]  s   '4)c^.C%%$**L,?,?vKK&**	&,,Q/!+M*2.2Mjj/=!Ai_dmsta**[1=K%ELLvFI_I_`bdeIffjjkpkukuvvK!$a"23::<;M;Ma;PRSUWY[\%%++-K!!#q(%3AtT14D%E%L%LRQRTceg%h"*~~c25G5J5J35OO((yA KK,,6*%%**.DD
 1CCKQZ[Kr=   c                     g }i | _         d| _        g }t        | j                        D ]O  \  }}t	        | j
                  |      }|dk(  rd| d}t        |t              r"t        |      | j
                  j                  k\  rDt        |t              rt        |      }t        |      }| j                   j                  ||i       n|j                  |       || j
                  j                  z  }t        | j
                  |      }	t        j                   | j
                  j"                  | j
                  j"                  d      }
|j                  t%        |	|
|             ?|j                  |       R t        j&                  |      S )	Nr   r  rk   zlayers.z.shared_transformer)r  Fr  )_tied_weights_keysr!  r%  rm   r  rf   r   listr   r!  r   nextr   r   r  r   r  r9   r  r  )r8   r  unique_hybrid_blockslayer_id
layer_typemamba_layerprefix_patterntarget_patternr  
attn_blocklinear_layers              r<   r  zZamba2Model.get_layers  sQ   "$*+'!$-d.D.D$E 	+ Hj1$++RKX%#*8*4G!H ##7>/0DKK4N4NN!"6=/45I/J,%)*>%?N++22NN3ST )//?#dkk&@&@@8xX
!yy)@)@$++BYBY`ef/
L+VWk*5	+6 }}V$$r=   
NNNNNNNNNN)rW   rX   rY   r   r$   r1   r   r3   r   r   re   r  rI  ra   r   rU   r"  r  rZ   r[   s   @r<   r  r    s   | .  .2.204;?26!%)-,0#'26f<##d*f< t+f< &&-	f<
 2D8f< ((4/f< $;f<  $;f< #Tkf< D[f< ((4/f< 
(	(f< f<P%N!%r=   r  c                       e Zd ZddiZdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	e
dz  d
ej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                  z  deez  fd       Z	 	 	 	 	 	 	 d fd	Z xZS )Zamba2ForCausalLMzlm_head.weightzmodel.embed_tokens.weightrf   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y NFr  )
r0   r1   r  r  r  r   r  r9   lm_headr  r  s     r<   r1   zZamba2ForCausalLM.__init__  sU      (
 ++yy!3!3V5F5FUS 	r=   Nr  r   r   r+  r  labelsr  r  r  r  r   logits_to_keepr   c                    ||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
| j	                  ||||||||	||

      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |}|
s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )al  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Zamba2ForCausalLM

        >>> model = Zamba2ForCausalLM.from_pretrained("Zyphra/Zamba2-7B-v1")
        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-7B-v1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)
r  r   r   r+  r  r  r  r  r   r  r   r#   losslogitsr+  rM   r  )rf   r  r  r  r  r   ro   slicerI  loss_functionr  r   r+  rM   r  )r8   r  r   r   r+  r  rJ  r  r  r  r  r   rK  r   r  rM   slice_indicesrO  rN  r  s                       r<   rU   zZamba2ForCausalLM.forward  sL   P 2C1N-TXT_T_TqTq %9$D $++JjJj 	 &1%<k$++B]B] **)%+'/!5)#  
  
8B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPDY,F'+'7D7V#CVC%#33!//))
 	
r=   c	                     |:t        | j                  |j                  d   | j                  | j                        }| j                  j
                  |	d<   t        |   |f|||||||d|	}
|
S )Nr   r  rK  )r+  r   r  r   r   r  is_first_iteration)re   rf   rH   rC   rh   num_logits_to_keepr0   prepare_inputs_for_generation)r8   r  r+  r   r  r   r   r  rT  r   model_inputsr;   s              r<   rV  z/Zamba2ForCausalLM.prepare_inputs_for_generation  s     "6Y__Q/tzz$++O $(;;#A#A w<

+)')%1

 

 r=   )NNNNNNNNNNNr   )NNNNNTF)rW   rX   rY   r9  r$   r1   r   r3   r   r   re   r  rI  ro   ra   r   rU   rV  rZ   r[   s   @r<   rF  rF    sj   *,GH|   .2.204;?26*.!%)-,0#'26-.O
##d*O
 t+O
 &&-	O

 2D8O
 ((4/O
   4'O
 $;O
  $;O
 #TkO
 D[O
 ((4/O
 ell*O
 
'	'O
 O
h     r=   rF  a  
    The Zamba2 Model with a sequence classification head on top (linear layer).

    [`Zamba2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  d	e
dz  d
e
dz  de
dz  de
dz  deez  fd       Z xZS )Zamba2ForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y rH  )
r0   r1   
num_labelsr  r  r   r  r9   scorer  r  s     r<   r1   z(Zamba2ForSequenceClassification.__init__9  sS      ++ (
YYv114??O
 	r=   Nr  r   r   r+  r  rJ  r  r  r  r  r   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }||j                  d   }n|j                  d   }| j                   j
                  |dk7  rt        d      | j                   j
                  d}n||| j                   j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d       |t        j                  ||j                  	      |f   }d}||j                  |j                        }| j                   j"                  | j$                  dk(  rd
| j                   _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                   _        nd| j                   _        | j                   j"                  d
k(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }n |||      }n| j                   j"                  dk(  r=t1               } ||j3                  d| j$                        |j3                  d            }n,| j                   j"                  dk(  rt5               } |||      }|
s|f|dd z   }||f|z   S |S t7        |||j8                  |j:                  |j<                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r+  r  r  r  r  r  r   r#   z=Cannot handle batch sizes > 1 if no padding token is defined.rA   rj   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rl   
regressionsingle_label_classificationmulti_label_classificationrM  )rf   r  r  r]  rH   r	  r   rD   rh   r3   int32r   argmaxrn  ro  r;   rW   problem_typer\  rC   longro   r
   r  r	   rI   r   r   r+  rM   r  )r8   r  r   r   r+  r  rJ  r  r  r  r  r   transformer_outputsrM   rO  rg   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsrN  loss_fctr  s                          r<   rU   z'Zamba2ForSequenceClassification.forwardB  s   * &1%<k$++B]B]"jj)%+'/!5# ) 

 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaabYYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r=   rD  )rW   rX   rY   r1   r   r3   r   r   r   r  rI  ra   r   rU   rZ   r[   s   @r<   rZ  rZ  *  s     .2.204(,26*.!%)-,0#'\
##d*\
 t+\
 &&-	\

 \
 ((4/\
   4'\
 $;\
  $;\
 #Tk\
 D[\
 
1	1\
 \
r=   rZ  )rF  rZ  r  r  )r.  )r#   )^r  collections.abcr   	itertoolsr   typingr   r   r3   r   torch.nnr   r	   r
    r   r  activationsr   cache_utilsr   
generationr   integrationsr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r    utils.import_utilsr!   r"   configuration_zamba2r$   +mamba_ssm.ops.triton.selective_state_updater%   !mamba_ssm.ops.triton.ssd_combinedr&   r'   causal_conv1dr)   r*   
get_loggerrW   rn  Moduler-   r]   re   r   r   ro   r   r   r   r   r	  r  r?  rB  rO  r  rm  rQ  r  r  r  r  r  r  rF  rZ  __all__r   r=   r<   <module>r     s  *  $      A A & !   ) 4 > B 9 q q K F & F F I T . RmmZjW57WDD-7**			H	%; ;*JBII J(h  h V><BII ><B	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % %4( *+ ,2y)bii y)~VU\\ VS V
(( 46FH\]^ iOryy iOX'		 'T=")) =@B8 BJI2 IX!O !< P%' P% P%h~- ~B g
&; g
g
T kr=   