
    i*                        d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-  e+       rddl.m/Z/m0Z0 ddl1m2Z2 nd\  Z2Z0Z/ e*       r	ddl3m4Z4m5Z5 nd\  Z5Z4 e6e2e0e4e5e/f      Z7 e&jp                  e9      Z: G d dejv                        Z<dejz                  de>d ejz                  fd!Z? G d" d#      Z@	 dCd$ejv                  d%ejz                  d&ejz                  d'ejz                  d(ejz                  dz  d)eAd*eAfd+ZB G d, d-ejv                        ZC G d. d/ejv                        ZD G d0 d1ejv                        ZE G d2 d3ejv                        ZF G d4 d5e      ZG G d6 d7e      ZHe% G d8 d9e!             ZIe% G d: d;eI             ZJ G d< d=eIe      ZK e%d>?       G d@ dAeI             ZLg dBZMy)DzPyTorch Zamba model.    N)Callable)Any)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)Cache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringlogging)is_flash_attention_requested)is_causal_conv1d_availableis_mamba_ssm_available   )ZambaConfig)mamba_inner_fnselective_scan_fn)selective_state_update)NNN)causal_conv1d_fncausal_conv1d_updateNNc                   ,     e Zd Zd fd	Zd Zd Z xZS )ZambaRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        ZambaRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      r/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/zamba/modeling_zamba.pyr(   zZambaRMSNorm.__init__B   s1     	ll5::k#:; #    c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor*   float32powmeanrsqrtr-   r,   )r.   hidden_statesinput_dtypevariances       r2   forwardzZambaRMSNorm.forwardJ   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r3   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler,   shaper-   r.   s    r2   
extra_reprzZambaRMSNorm.extra_reprQ   s*    ))*+6$2G2G1HIIr3   )gư>)__name__
__module____qualname__r(   rA   rF   __classcell__r1   s   @r2   r%   r%   A   s    $;Jr3   r%   r>   n_repreturnc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rD   expandreshape)r>   rL   batchnum_key_value_headsslenhead_dims         r2   	repeat_kvrU   V   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr3   c                       e Zd ZdZdZej                  dfdZd Z	 ddej                  dej                  de
d	eeef   dz  d
eej                  ej                  f   f
dZdej                   fdZdde
dz  d
e
fdZy)ZambaHybridDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNc           
      ,   || _         d| _        |j                  | _        d| _        |j                  |j
                  z  | _        |j                  | _        |j                  | _
        |j                  | _        g | _        g | _        g | _        i | _        i | _        i | _        t%        |j&                        D ]  }| xj                  t)        j*                  || j                  | j                  ||      gz  c_        || j                  | j                  | j                  z  | j                  f}| xj                  t)        j*                  |||      gz  c_        | j                  |   dk(  s| j                  j-                  |        t%        |j&                        D cg c]  }t)        j.                  g g|z  |       c}| _        t%        |j&                        D cg c]  }t)        j.                  g g|z  |       c}| _        y c c}w c c}w )NFdevicer8   hybridrZ   )r8   is_compileablelayers_block_typehas_previous_statemamba_expandr/   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headsconv_states
ssm_statestransformer_layers_modules_parameters_buffersrangenum_hidden_layersr*   zerosappendtensor	key_cachevalue_cache)r.   config
batch_sizer8   rZ   icache_shape_s           r2   r(   z ZambaHybridDynamicCache.__init__r   s   
#!'!9!9"'!'!4!4v7I7I!I$22 & 3 3#11"$v//0 	2AJ(>(>@U@U^dlqr!  ""&&$*<*<<##	K OOKe TUUO%%a(H4''..q1	2 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts    "H!"Hc                 ,    t        | j                        S N)lenrr   rE   s    r2   __len__zZambaHybridDynamicCache.__len__   s    4>>""r3   
key_statesvalue_states	layer_idxcache_kwargsrM   c                    | j                   |   j                  d   dk(  r|| j                   |<   || j                  |<   nft        j                  | j                   |   |gd      | j                   |<   t        j                  | j                  |   |gd      | j                  |<   | j                   |   | j                  |   fS )Nr6   r   r5   dim)rr   rD   rs   r*   cat)r.   r}   r~   r   r   s        r2   updatezZambaHybridDynamicCache.update   s     >>)$**2.!3(2DNN9%*6DY'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEr3   beam_idxc                    | j                         dkD  rvt        t        | j                              D ]S  }| j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   V yy)zDReorders the cache for beam search, given the selected beam indices.r   N)
get_seq_lengthrm   r{   rr   rZ   index_selectr9   rs   rg   rh   )r.   r   r   rZ   s       r2   reorder_cachez%ZambaHybridDynamicCache.reorder_cache   s[    1$"3t~~#67 	m		299,0NN9,E,R,RSTV^VaVabhVi,jy))))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +)))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +3::-1__Y-G-T-TUVX`XcXcdjXk-l	*	m %r3   c                     || j                   vr| j                   d   n|}t        | j                        |k  s| j                  |   j                  d   dk(  ry| j                  |   j                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   r6   )ri   r{   rr   rD   )r.   r   s     r2   r   z&ZambaHybridDynamicCache.get_seq_length   sn     3<4CZCZ2ZD++A.`i	t~~)+t~~i/H/N/Nr/RVW/W~~i(..r22r3   rz   )r   )rG   rH   rI   __doc__r]   r*   float16r(   r|   Tensorintdictstrr   rC   r   
LongTensorr   r    r3   r2   rW   rW   b   s     N16t u@# /3FLLF llF 	F
 38nt+F 
u||U\\)	*F$me&6&6 m3d
 33 3r3   rW   modulequerykeyvalueattention_maskscalingdropoutc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr5   r	   r   r6   )r   r8   )ptrainingr   )rU   num_key_value_groupsr*   matmul	transposerD   r   
functionalsoftmaxr:   r9   r8   r   r   
contiguous)r   r   r   r   r   r   r   kwargsr}   r~   attn_weightscausal_maskattn_outputs                r2   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r3   c                        e Zd ZdZdedef fdZ	 ddej                  dedej                  dz  de	dz  d	e
e   d
eej                  ej                  dz  eej                     dz  f   fdZ xZS )ZambaAttentionaA  
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    rt   r   c                 .   t         |           || _        || _        |j                  | _        |j
                  | _        |j                  |j                  z  | _	        |j                  | _
        | j                  dz  dz  | _        d| _        |j                  | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j&                  d      | _        y )Nr5         TFbias)r'   r(   rt   r   attention_hidden_sizeattention_head_dimrT   num_attention_headsrR   r   max_position_embeddingsr   	is_causalattention_dropoutr   Linearq_projk_projv_projr/   o_projr.   rt   r   r1   s      r2   r(   zZambaAttention.__init__   s9   "%+%A%A"11$*$>$>&B\B\$\!'-'E'E$)d2!'!9!9ii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii : :T]] JFL^L^ejkr3   Nr>   r   past_key_valuesr   rM   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
||j                  |	|
|      \  }	}
t        j                  | j                  j                  t              } || ||	|
|f| j                  sdn| j                  | j                  d|\  }} |j                   g |d j#                         }| j%                  |      }||fS )Nr6   r   r5           )r   r   )rD   rT   r   viewr   r   r   r   r   get_interfacert   _attn_implementationr   r   r   r   rP   r   r   )r.   r>   r   r   r   r   input_shapehidden_shapequery_statesr}   r~   attention_interfacer   r   s                 r2   rA   zZambaAttention.forward   sk    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&'6'='=j,Xa'b$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r3   rz   )rG   rH   rI   r   r   r   r(   r*   r   rW   r   r   rC   rA   rJ   rK   s   @r2   r   r      s    l{ ls l. ;?#)||#) #) t+	#)
 147#) -.#) 
u||U\\D0%2E2LL	M#)r3   r   c                   l     e Zd ZdZdef fdZ	 d	dej                  defdZ	d	defdZ
d	defdZ xZS )
ZambaMambaMixeruE  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways:
    - Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head
    undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of
    `self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`.
    rt   c           	         t         |           || _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  |j                  z  | _
        |j                  | _        |j                  | _        | j                  | j                  z  | _        |j                  | _        |j"                  | _        t'        j(                  | j                  | j                  | j                   | j                  | j                  | j                  dz
        | _        |j,                  | _        t0        |j,                     | _        |j4                  | _        t'        j8                  | j                  | j                  dz  | j$                        | _        t'        j<                  t?        j@                  | j                  | j                  | j                  dz  z   | j                              | _!        t'        j<                  t?        j@                  | j                  | j                  | j                        dz
  dz  | j                  dz  z        | _"        t'        j<                  t?        j@                  | j                  | j                              | _#        t?        jH                  d| j                  dz   t>        jJ                        d d d f   }|jM                  | j                  d      jO                         }t'        j<                  t?        jP                  |      jS                  | j                  | j                  d            | _*        t'        j<                  t?        jV                  | j                  | j                              | _,        t'        j8                  | j                  | j                  | j$                        | _-        t\        st^        ja                  d       y y )	Nr   )in_channelsout_channelsr   kernel_sizegroupspaddingr5   r   g      ?r8   r6   aq  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config)1r'   r(   rt   r   r/   rb   rc   rd   re   r`   ra   mamba_dt_ranktime_step_rankrf   mamba_head_dimmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1dhidden_mamba_act
activationr   actuse_mamba_kernelsuse_fast_kernelsr   in_projr)   r*   ro   x_proj_weightdt_proj_weightdt_proj_biasaranger:   rO   r   logrP   A_logr+   Dout_projis_fast_path_availableloggerwarning_once)r.   rt   r   Ar1   s       r2   r(   zZambaMambaMixer.__init__-  s   "!--$22 & 3 3!'!4!4v7I7I!I$22#11"448J8JJ#33..ii..//##--))))A-
 !11&112 & 8 8 yy!1!143I3IA3MTXTaTab  \\KK""##d&9&9A&==##
 !ll[[++T-@-@$BUBUVY\\!!3&'

 LLT5G5GI\I\)]^ LLD//!35==I$PQ'RHHT++R0;;=\\%))A,"6"6t7I7I4K^K^`b"cd
ejj););T=P=PQR		$"8"8$:J:JQUQ^Q^_%^ &r3   r>   cache_paramsc                    |j                   \  }}}|d uxr |j                  xr |dk(  }| j                  |      j                  dd      }|j	                  |dd|      j                  dd      \  }}	|j                  d      j                         }|	j                  d      }	|	j                  || j                  d|      j                  dd      }	| j                  j                  j	                  | j                  j                  j                  d      | j                  j                  j                  d            }
|ret        |j                  d      |j                  | j                     |
| j                  j                   | j"                        }|j%                  d      }n|,t'        j(                  |dk(        s||j%                  d      z  }|dt*        j,                  j/                  || j0                  |j                   d   z
  df      }|j                  | j                     j3                  |       t5        ||
| j                  j                   | j"                        }|,t'        j(                  |dk(        s||j%                  d      z  }|j                  d| j                  | j6                  |      j                  dd      }| j8                  d d d d d d d f   |z  j                  dd      }t'        j:                  || j<                  | j>                  | j>                  gd      \  }}}| j@                  d d d f   |j                  dd      z  }t'        jB                  | jD                  jG                                }| jH                  | jH                  jG                         nd }t'        jJ                  |d|f|jL                  |jN                        }|rtQ        | j                        D ]  }tS        |jT                  | j                     d d |f   ||d	df   ||d	df   ||   ||d d df   ||d d df   | jV                  |   |	|d	df   ||   d

      j%                  d      }t'        jX                  ||fd      } nAt'        jJ                  |d| j6                  | j>                  f|jL                  |jN                        }tQ        | j                        D ]  }t[        ||   ||   ||   ||   j                  dd      ||   j                  dd      | jV                  |   jG                         |	|   ||   d
d

      \  }}t'        jX                  ||fd      j                         }t'        jX                  ||j%                  d      fd      } |*|(|jT                  | j                     j3                  |       | j]                  |j                  dd            }|S )Nr   r5   r6   r   r   )r   r   rY   .T)dt_softplus)delta_softplusreturn_last_state)/rD   r_   r   r   r   chunksqueezer   rP   rf   r   r,   sizer"   rg   r   r   r   	unsqueezer*   allr   r   padre   copy_r!   r   r   splitr   rc   r   expr   floatr   emptyrZ   r8   rm   r    rh   r   r   r   r   )r.   r>   r   r   ru   seq_lenrx   use_precomputed_statesprojected_statesgateconv_weightsrg   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsnscan_outputs_	ssm_state
ssm_state_contextualized_statess                            r2   cuda_kernels_forwardz$ZambaMambaMixer.cuda_kernels_forwardj  s    "/!4!4
GQ!-T!9!nl>]>]!nbimnbn  <<6@@AF.33JAwOUUVW]^U_t%--a0;;=||A||J(:(:BHRRSTVWX {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)((8  M *33B7M)%))Na<O2P -0H0H0K K' mm//@U@UXeXkXklnXo@oqr?st((8>>{K,]L$++JZJZgkgvgvwM)%))Na<O2P -0H0H0K K
 &--b$2D2DdFYFY[bcmmnoqrs,,Qa];mKVVWY[]^++T00$2E2EtGZGZ[ac
	1a "00D9I<O<OPRTV<WWYYtzz'')** 7;6G6G6S**002Y]{{J7#;MDXDX`m`s`st!4--. O 6 ++DNN;AqDA!!S!),&q#qy1aDaAgJaAgJFF1ICO"1% $! )B-   %yy,)FANO  Q 3 3T5H5HI$++#))I
 4--. S,=!!$&q)aDaDNN1a(aDNN1a(FF1IOO%G"1%#'&*-)z  %yy,)FANYY[!IIy*2F2Fq2I&JPQR	S $)A''7==iH !%l.D.DQ.J K$$r3   c           
      R   |j                   \  }}}|j                  }| j                  |      j                  dd      }|j	                  |dd|      j                  dd      \  }	}
|	j                  d      j                         }	|
j                  d      }
|
j                  || j                  d|      j                  dd      }
t        |t              }|r|j                  | j                     j                   d   |k(  ri| j                  r(|j                  | j                     j                         }n|j                  | j                     }|j!                  |	j"                        }|j$                  r|dk(  r|j&                  | j                     j                   d   |k(  r|j&                  | j                     }t)        j*                  |dd      }|	d d d d df   |d d d d df<   ||j&                  | j                  <   t)        j,                  || j.                  j0                  d d dd d f   z  d      }	| j2                  r|	| j.                  j4                  z  }	| j7                  |	      j!                  |      j9                  d      }	nn|+|	|d d |	j                   d    d f   j9                  d      z  }	t:        j<                  j?                  |	| j@                  |	j                   d   z
  df      }||j&                  | j                  <   | j7                  | j/                  |	      dd |f         }	||	|d d |	j                   d    d f   j9                  d      z  }	nt)        jB                  || j                  | jD                  | jF                  f|	j"                  |      }||	|j9                  d      z  }	| j7                  | j/                  |	      dd |f         }	||	|j9                  d      z  }	|	j                  d| j                  | jD                  |      j                  dd      }	| jH                  d d d d d d d f   |	z  j                  dd	      }t)        jJ                  || jL                  | jF                  | jF                  gd      \  }}}| jN                  d d d f   |j                  dd	      z  | jP                  d d d d d d f   z   }t:        j<                  jS                  |      }t)        jT                  | jV                  jY                                }t)        jT                  |d d d d d d d d f   |d d d d d d d d d f   z        }|d d d d d d d d d f   |d d d d d d d d d f   jY                         z  }||	d d d d d d d d d f   jY                         z  }g }t[        |      D ]  }|d d d d d d |d d f   j                  dd      |z  |d d d d d d |d d f   j                  dd      z   }t)        j\                  |j                  dd      j!                  |      |d d d d |d d f   j9                  d            }|j_                  |d d d d d d df           t)        j`                  |d      }||	| jb                  d d d d d d f   z  z   }|| j7                  |
      z  }|r||j                  | j                  <   | je                  |j                  dd      j                  |d|      j                  dd            }|S )
Nr   r5   r6   r   r   )shiftsdims.rY   r   )3rD   r8   r   r   r   r   r   r   rP   rf   
isinstancerW   rh   r   r   cloner9   rZ   r_   rg   r*   rollsumr   r,   r   r   r   r   r   r   r   re   ro   r   rc   r   r   r   r   r   softplusr   r   r   rm   r   rp   stackr   r   )r.   input_statesr   r   ru   r   rx   r8   r   r>   r   	use_cacher  
conv_stater   r   r  r  r  r   
discrete_A
discrete_BdeltaB_ur  rv   scan_outputr
  s                              r2   slow_forwardzZambaMambaMixer.slow_forward  s   !-!3!3
GQ""<<5??1E.33JAwOUUVW]^U_t%--a0;;=||A||J(:(:BHRRSTVWX|-DE	00@FFqIZW}}(33DNNCIIK	(33DNNC	!]%9%9:I //qL ,,T^^<BB1ES)55dnnE
"ZZ
2BG
'4Q1W'=
1a8$;E((8 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB O!-$1N1}GZGZ[]G^F^F`C`4a4k4klm4n$nM]]..}t?T?TWdWjWjkmWn?npq>rs
;E((8 $])CC'M)R S!-$1N1}GZGZ[]G^F^F`C`4a4k4klm4n$nMT//1D1DdFYFYZ$++I
 ) -0H0H0K K HHT[[%?XgX%NOM) -0H0H0K K &--b$2D2DdFYFY[bcmmnoqrs,,Qa];mKVVWY[]^++T00$2E2EtGZGZ[ac
	1a #11!T':Y=P=PQSUW=XX\`\m\mtQ]
 
  ]]334FG YYtzz'')**YYqD!T1!458J1aQRTUW[K[8\\]
'1aD(89AaD!Q>N<O<U<U<WW
aAq$.> ? E E GGw 	9A"1aAq=1;;AqAIMPXYZ\]_`bcefYfPgPqPqrsuvPwwI,,y':':1a'@'C'CE'JAaQRTUWXjMLcLcdfLghKAq!QJ 78	9 kk,B7!]TVVAtQ<L5M%MN!DHHTN26?L##DNN3 !%!!!Q'//
BHRRSTVWX!
 %$r3   c                     | j                   rGt        r"d| j                  j                  j                  vrt        d      | j                  |||      S | j                  |||      S )NcudazFast Mamba kernels are not available. Make sure to they are installed and that the mamba module is on a CUDA device. lease run 'pip install causal-conv1d>=1.2.0' and 'pip install mamba-ssm', or set use_mamba_kernels=False in the model's config.)r   )r   r   r   rZ   type
ValueErrorr  r  )r.   r>   r   r   s       r2   rA   zZambaMambaMixer.forward(  sm      )V4;M;M;T;T;Y;Y-Y i 
 ,,]LYg,hh  ^ \\r3   r#   )rG   rH   rI   r   r   r(   r*   r   rW   r  r  rA   rJ   rK   s   @r2   r   r      sX    
;{ ;| im_%"\\_%9P_%B[%7N [%z	]3J 	]r3   r   c                   $     e Zd Z fdZd Z xZS )ZambaMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y NFr   )r'   r(   rt   r/   ra   r   r   	gate_projup_proj	down_projr   
hidden_actact_fnr.   rt   r1   s     r2   r(   zZambaMLP.__init__6  s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r3   c                     | j                  | j                  | j                  |            | j                  |      z        }|S rz   )r'  r)  r%  r&  )r.   xr'  s      r2   rA   zZambaMLP.forward@  s6    NN4;;t~~a/@#ADLLQRO#ST	r3   )rG   rH   rI   r(   rA   rJ   rK   s   @r2   r"  r"  5  s    0r3   r"  c                       e Zd Zddededz  f fdZ	 	 	 	 ddej                  dej                  dedej                  dz  dedz  d	e	dz  d
e	dz  de
e   deej                  eej                  ej                  f   dz  f   fdZ xZS )ZambaAttentionDecoderLayerNrt   r   c                     t         |           t        ||      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _        y )Nr0   )r'   r(   r   	self_attnr"  feed_forwardr%   r   rms_norm_epsinput_layernormr/   pre_ff_layernormr   s      r2   r(   z#ZambaAttentionDecoderLayer.__init__F  s_    '	:$V,+F,H,HfNaNab ,V-?-?VEXEX Yr3   r>   original_hidden_statesr   r   output_attentionsr  r   rM   c           
          t        j                  ||gd      }| j                  |      } | j                  d||||||d|\  }}	| j	                  |      }| j                  |      }|f}
|r|
|	fz  }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        r6   r   )r>   r   r   r   r7  r  r   )r*   concatenater4  r1  r5  r2  )r.   r>   r6  r   r   r   r7  r  r   self_attn_weightsoutputss              r2   rA   z"ZambaAttentionDecoderLayer.forwardN  s    > ))=:P*QWYZ,,];+94>> ,
')+/,
 ,
(( --m<))-8 ")++Gr3   rz   )NNFF)rG   rH   rI   r   r   r(   r*   r   rW   boolr   r   rC   FloatTensorrA   rJ   rK   s   @r2   r.  r.  E  s    Z{ ZsTz Z /3:>).!&3||3 !&3 	3
 t+3 1473  $;3 $;3 -.3 
u  %(9(95;L;L(L"MPT"TT	U3r3   r.  c                       e Zd Zdedef fdZ	 	 	 	 	 	 	 	 	 	 ddej                  dej                  dz  dedz  dej                  dz  dej                  dz  d	edz  d
e	dz  de	dz  dej                  dz  dej                  dz  dej                  dz  deej                  eej                  ej                  f   dz  f   fdZ xZS )ZambaMambaDecoderLayerrt   r   c                     t         |           t        ||      | _        t	        |j
                  |j                        | _        || _        y )N)rt   r   r0  )	r'   r(   r   mambar%   r/   r3  r4  r   r   s      r2   r(   zZambaMambaDecoderLayer.__init__  s>    $FiH
+F,>,>FDWDWX"r3   Nr>   r6  r   r   r   r7  r  cache_positionposition_idstransformer_hidden_statesrM   c                     |}|||z   n|}| j                  |      }| j                  |||      }d}||z   }|f}|r||fz  }|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        N)r>   r   r   )r4  rA  )r.   r>   r6  r   r   r   r   r7  r  rB  rC  rD  r   residualr:  r;  s                   r2   rA   zZambaMambaDecoderLayer.forward  s    > !
 :S9^M55dq 	 ,,];

'() # 
 ! !=0 ")++G))Gr3   )
NNNNNFFNNN)rG   rH   rI   r   r   r(   r*   r   rW   r<  r   rC   r=  rA   rJ   rK   s   @r2   r?  r?    s,   #{ #s # 7; $.2+/:>).!&26049=;||; !&t 3; :	;
 t+; \\D(; 147;  $;; $;; ((4/; &&-; $)<<$#6; 
u  %(9(95;L;L(L"MPT"TT	U;r3   r?  c                   l    e Zd Zdedej
                  def fdZ	 	 	 	 	 	 	 	 ddej                  dej                  dz  de
dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedz  dej                  dz  deej                  eej                  ej                  f   dz  f   fdZ xZS )ZambaHybridLayershared_transflinearrA  c                 L    t         |           || _        || _        || _        y rz   )r'   r(   rI  rJ  mamba_decoder)r.   rI  rJ  rA  r1   s       r2   r(   zZambaHybridLayer.__init__  s%    *"r3   Nr>   r6  r   r   r   r   r7  r  rB  rM   c
           
          | j                  ||||||||	      }
|
d   }|r|
d   }| j                  |      }| j                  |||||||	      }
|r|
d   f|
dd z   }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        )r6  r   r   r   r7  r  rB  r   r   )rD  r   r   r7  r  rB  r5   N)rI  rJ  rL  )r.   r>   r6  r   r   r   r   r7  r  rB  layer_outputsrD  r:  s                r2   rA   zZambaHybridLayer.forward  s    > **#9&+/) + 	
 %2!$4! -a 0$(KK0I$J!**&?)+/) + 
 *1-/@AMRSRTDUUMr3   )NNNNNFFN)rG   rH   rI   r.  r   r   r?  r(   r*   r   r   rW   r<  r   rC   r=  rA   rJ   rK   s   @r2   rH  rH    s   #&@ #")) #\r # 7; $.2+/:>).!&26>||> !&t 3> :	>
 t+> \\D(> 147>  $;> $;> ((4/> 
u  %(9(95;L;L(L"MPT"TT	U>r3   rH  c                   r     e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZ ej                          fd       Z xZS )	ZambaPreTrainedModelrt   modelTr.  r?  r   Fc                    | j                   j                  }t        |   |       t	        |t
              rt        j                  |j                  d|       | j                   j                  dz  }t        j                  |j                  | |       | j                   j                  | j                   j                  z  | j                   j                  z  }t        j                   t        j"                  | j                   j                  |      t%        j&                  | j                   j(                        t%        j&                  | j                   j*                        z
  z  t%        j&                  | j                   j*                        z         j-                  | j                   j.                        }|t        j&                  t        j0                  |              z   }t        j2                  |j4                  |       t        j6                  d|j8                  dz   t        j:                        d d d f   }|j=                  |j>                  d      jA                         }t        j2                  |jB                  t        j&                  |      jE                  |j                  |jF                  d             t        jH                  |jJ                         y y )Nr   )r<   stdr   )minr   r   r6   )&rt   initializer_ranger'   _init_weightsr  r   initnormal_r   r   uniform_r   r`   r/   rf   r*   r   randmathr   time_step_maxtime_step_minclamptime_step_floorexpm1r   r   r   rc   r:   rO   ra   r   r   rP   r   ones_r   )	r.   r   rS  dt_init_stdr   dtinv_dtr   r1   s	           r2   rV  z"ZambaPreTrainedModel._init_weights  s   kk++f%fo.LL--CSA++33T9KMM&//+{K![[558O8OOSWS^S^SlSllN

4;;44nE88DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FJJv**F3Q 5 5 9OPTVWPWXA1126AACAJJv||UYYq\%9%9&:N:NPVPePegi%jkJJvxx % /r3   )rG   rH   rI   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr*   no_gradrV  rJ   rK   s   @r2   rP  rP    sQ    &*#57OP"3 NLU]]_! !r3   rP  c                   (    e Zd ZdZdef fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  d	ej                  dz  d
edz  dedz  dedz  dedz  dej                  dz  deez  fd       Zd Z xZS )
ZambaModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ZambaDecoderLayer`]

    Args:
        config: ZambaConfig
    rt   c                 h   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        |j                  | _	        g }d | _
        t        | j                        D ]  \  }}t        ||      }|dk(  rt        j                  | j                  j                  | j                  j                  d      }|j                  t!        t#        |      ||             | j                  d| dd| di| _
        |j                  |        t        j$                  |      | _        |j(                  | _        t+        |j                  |j,                  	      | _        d| _        | j3                          y )
N)r   r[   Fr   z
layers.(?!z\.)\d+.shared_transfzlayers.z.shared_transfr0  )r'   r(   pad_token_idpadding_idx
vocab_sizer   	Embeddingr/   embed_tokensr^   _tied_weights_keys	enumerater?  r   rt   rp   rH  r.  
ModuleListlayersr   r%   r3  final_layernormgradient_checkpointing	post_init)r.   rt   ry  layer_id
layer_typerA  rJ  r1   s          r2   r(   zZambaModel.__init__?  sq    !.. ++LL):):F<N<NPTP`P`a!'!9!9"&$-d.D.D$E 
	% Hj*6XFEX%4;;#:#:DKK<S<SZ_`./I&/QSY[`ab**2%hZ/CDPXzYgFh/D+ e$
	% mmF+$*$?$?!+F,>,>FDWDWX&+#r3   N	input_idsr   rC  r   inputs_embedsr  r7  output_hidden_statesreturn_dictrB  rM   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|d u |d uz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|}t        j                  |      }|r|t        j                  d       |
.t        j                  |j                  d   |j                        }
||
j!                  d      }| j#                  |||
      }|rdnd }|rdnd }t%        | j&                        D ]5  \  }}|r||fz  } ||||||||||
		      }|d   }|s'|d   -||d   fz  }7 | j)                  |      }|r||fz  }|r|j*                  sd
|_        t-        ||r|nd ||      }|	r|S |j/                         S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fz{Zamba requires an initialized `ZambaHybridDynamicCache` to return a cache. None was provided, so no cache will be returned.r   r\   r   r   )r   r7  r  rB  T)last_hidden_stater   r>   
attentions)rt   r7  r  r  use_return_dictr   r{  r   r   r   ru  r*   r  r   rD   rZ   r   _update_causal_maskrw  ry  rz  r_   r   to_tuple)r.   r  r   rC  r   r  r  r7  r  r  rB  r   r>   r6  r   all_hidden_statesall_self_attnsr   layerrN  outputs                        r2   rA   zZambaModel.forward\  sI    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<s  &&4==Yj I  --i8M%!&]!; 0:
 !"\\-*=*=a*@I]I]^N)33A6L..~}n]"6BD0d )$++ 6 	:Iu#!m%55!!& /"3#-
M *!,M  #/"}Q'7&99N+	:. ,,];  -!11?#E#E15O.(+/8Od+%	
 %v;&//*;;r3   c                    t        | j                        r	|d|v r|S y |j                  |j                  }}t	        j
                  |      j                  }|j                  d   }||}n|d   dz   }t	        j                  ||f|||      }	|dk7  rt	        j                  |	d      }	|J|	t	        j                  ||      |j                  dd      kD  j                  t        j                        z  }	|	d d d d d d f   j                  |j                  d   ddd      }	|w|	j                         }	|j!                         dk(  rT|d d d d d d f   j                  dd|d      }
|	j#                  d      |
j#                  d      z  }|	j%                  ||       | j                  j&                  d	k(  r0|.|j                  j(                  d
v rt+        j,                  |	|      }	|	S )Nr   r   r6   )
fill_valuer8   rZ   )diagonalr\   r   r5   sdpa)r  xpunpu)r   rt   r8   rZ   r*   finforT  rD   fulltriur   rP   r9   r<  rO   r  r   eqmasked_fill_r   r  r   _unmask_unattended)r.   r   input_tensorrB  r8   rZ   	min_dtypesequence_lengthtarget_lengthr   expanded_attn_maskpadding_masks               r2   r  zZambaModel._update_causal_mask  s   '4)c^.C%%$**L,?,?vKK&**	&,,Q/!+M*2.2Mjj/=!Ai_dmsta**[1=K%ELLvFI_I_`bdeIffjjkpkukuvvK!$a"23::<;M;Ma;PRSUWY[\%%++-K!!#q(%3AtT14D%E%L%LRQRTceg%h"*~~c25G5J5J35OO((yA KK,,6*%%**.DD
 1CCKQZ[Kr3   
NNNNNNNNNN)rG   rH   rI   r   r   r(   r   r*   r   r   rW   r=  r<  rC   r   rA   r  rJ   rK   s   @r2   ro  ro  6  s   { :  .2.204:>26!%)-,0#'26_<##d*_< t+_< &&-	_<
 147_< ((4/_< $;_<  $;_< #Tk_< D[_< ((4/_< 
(	(_< _<B%r3   ro  c                       e Zd ZddiZdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	e
dz  d
ej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                  z  deez  fd       Z	 	 	 	 	 	 	 d fd	Z xZS )ZambaForCausalLMzlm_head.weightzmodel.embed_tokens.weightrt   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y r$  )
r'   r(   ro  rQ  rs  r   r   r/   lm_headr|  r*  s     r2   r(   zZambaForCausalLM.__init__  sU     '
 ++yy!3!3V5F5FUS 	r3   Nr  r   rC  r   r  labelsr  r7  r  r  rB  logits_to_keeprM   c                    ||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
| j	                  ||||||||	||

      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |}|
s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )ah  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ZambaForCausalLM

        >>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1")
        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)
r  r   rC  r   r  r  r7  r  rB  r  r   r   losslogitsr   r>   r  )rt   r7  r  r  rQ  r  r   slicer  loss_functionrs  r   r   r>   r  )r.   r  r   rC  r   r  r  r  r7  r  r  rB  r  r   r;  r>   slice_indicesr  r  r  s                       r2   rA   zZambaForCausalLM.forward  sL   P 2C1N-TXT_T_TqTq %9$D $++JjJj 	 &1%<k$++B]B] **)%+'/!5)#  
  
8B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPDY,F'+'7D7V#CVC%#33!//))
 	
r3   c	                     |:t        | j                  |j                  d   | j                  | j                        }| j                  j
                  |	d<   t        |   |f|||||||d|	}
|
S )Nr   )r8   rZ   r  )r   r   r  rB  rC  r  is_first_iteration)rW   rt   rD   r8   rZ   num_logits_to_keepr'   prepare_inputs_for_generation)r.   r  r   r   r  rB  rC  r  r  r   model_inputsr1   s              r2   r  z.ZambaForCausalLM.prepare_inputs_for_generationE  s     "5Y__Q/tzz$++O $(;;#A#A w<

+)')%1

 

 r3   )NNNNNNNNNNNr   )NNNNNTF)rG   rH   rI   rv  r   r(   r   r*   r   r   rW   r=  r<  r   rC   r   rA   r  rJ   rK   s   @r2   r  r    sj   *,GH{   .2.204:>26*.!%)-,0#'26-.O
##d*O
 t+O
 &&-	O

 147O
 ((4/O
   4'O
 $;O
  $;O
 #TkO
 D[O
 ((4/O
 ell*O
 
'	'O
 O
h     r3   r  a  
    The Zamba Model with a sequence classification head on top (linear layer).

    [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  d	e
dz  d
e
dz  de
dz  de
dz  deez  fd       Z xZS )ZambaForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y r$  )
r'   r(   
num_labelsro  rQ  r   r   r/   scorer|  r*  s     r2   r(   z'ZambaForSequenceClassification.__init__w  sS      ++'
YYv114??O
 	r3   Nr  r   rC  r   r  r  r  r7  r  r  rM   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }||j                  d   }n|j                  d   }| j                   j
                  |dk7  rt        d      | j                   j
                  d}n||| j                   j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d       |t        j                  ||j                  	      |f   }d}||j                  |j                        }| j                   j"                  | j$                  dk(  rd
| j                   _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                   _        nd| j                   _        | j                   j"                  d
k(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }n |||      }n| j                   j"                  dk(  r=t1               } ||j3                  d| j$                        |j3                  d            }n,| j                   j"                  dk(  rt5               } |||      }|
s|f|dd z   }||f|z   S |S t7        |||j8                  |j:                  |j<                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   rC  r   r  r  r7  r  r  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r6   rY   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r\   
regressionsingle_label_classificationmulti_label_classificationr  )rt   r  rQ  r  rD   rq  r   r9   rZ   r*   int32r   argmaxr   r   r1   rG   problem_typer  r8   longr   r   r   r   r   r   r   r   r>   r  )r.   r  r   rC  r   r  r  r  r7  r  r  r   transformer_outputsr>   r  ru   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctr  s                          r2   rA   z&ZambaForSequenceClassification.forward  s   * &1%<k$++B]B]"jj)%+'/!5# ) 

 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaabYYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r3   r  )rG   rH   rI   r(   r   r*   r   r   r   r=  r<  rC   r   rA   rJ   rK   s   @r2   r  r  h  s     .2.204(,26*.!%)-,0#'\
##d*\
 t+\
 &&-	\

 \
 ((4/\
   4'\
 $;\
  $;\
 #Tk\
 D[\
 
1	1\
 \
r3   r  )r  r  ro  rP  )r   )Nr   r[  collections.abcr   typingr   r*   r   torch.nnr   r   r    r
   rW  activationsr   cache_utilsr   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.genericr   utils.import_utilsr   r   configuration_zambar   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater    causal_conv1dr!   r"   r   r   
get_loggerrG   r   Moduler%   r   r   rU   rW   r   r   r   r   r"  r.  r?  rH  rP  ro  r  r  __all__r   r3   r2   <module>r     sD  &   $    A A & !   ) > B 9 q q F & , 9 T , XR@P=-~DD-7**.0@BVXfg 
 
		H	%J299 J*	UU\\ 	U# 	U%,, 	U[3 [3J %II%<<% 
% <<	%
 LL4'% % %4C)RYY C)LQ]bii Q]jryy  < <~B7 BJE1 EP !!? !! !!H l% l l`~+_ ~B g
%9 g
g
T gr3   