
    i                        d dl mZ d dlmZmZ d dlZd dlmc mZ	 d dlmZ ddl
mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z,  e)       r	d dl-m.Z.m/Z/ nd\  Z.Z/ ed       G d dej`                               Z1 G d dej`                        Z2 G d dej`                        Z3 G d d      Z4d Z5 ed       d=d!       Z6d"ejn                  d#e8d$ejn                  fd%Z9	 d>d&ej`                  d'ejn                  d(ejn                  d)ejn                  d*ejn                  dz  d+e:d,e:d-e e"   fd.Z; ee6       G d/ d0ej`                               Z<d1 Z=e.e/fZ> e?e>      Z@ G d2 d3ej`                        ZA G d4 d5e      ZBe# G d6 d7e             ZCe# G d8 d9eC             ZDe# G d: d;eCe             ZEg d<ZFy)?    )Callable)AnyOptionalN)nn   )Cache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)check_model_inputsmaybe_autocast)is_causal_conv1d_availableis_torchdynamo_compiling   )
Lfm2Config)causal_conv1d_fncausal_conv1d_updateNNRMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )Lfm2RMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z:
        Lfm2RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      p/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.pyr'   zLfm2RMSNorm.__init__2   s1     	ll5::k#:; #    c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor)   float32powmeanrsqrtr,   r+   )r-   hidden_statesinput_dtypevariances       r1   forwardzLfm2RMSNorm.forward:   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r2   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler+   shaper,   r-   s    r1   
extra_reprzLfm2RMSNorm.extra_reprA   s*    ))*+6$2G2G1HIIr2   )gư>)__name__
__module____qualname__r'   r@   rE   __classcell__r0   s   @r1   r$   r$   0   s    $;Jr2   r$   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )Lfm2RotaryEmbeddinginv_freqNconfigc                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultrM   F)
persistentoriginal_inv_freq)r&   r'   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrN   rope_parametersrP   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r-   rN   devicerope_init_fnrM   r0   s        r1   r'   zLfm2RotaryEmbedding.__init__H   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUr2   r\   ztorch.deviceseq_lenreturnztorch.Tensorc                    | j                   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r4   r7   r\   r7   )	rW   getattrr.   num_attention_headsr)   arangeint64r8   float)rN   r\   r^   basedimattention_factorrM   s          r1   rX   z3Lfm2RotaryEmbedding.compute_default_rope_parametersX   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r2   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r5   r   mpscpuF)device_typeenabledr4   rk   rc   )rM   ri   expandrC   r8   r\   
isinstancetypestrr   	transposer)   catcosrY   sinr7   )
r-   xposition_idsinv_freq_expandedposition_ids_expandedrp   freqsembry   rz   s
             r1   r@   zLfm2RotaryEmbedding.forwardv   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$NNNN)rF   rG   rH   r)   Tensor__annotations__r   r'   staticmethodr   intrB   ri   rX   no_gradr   r@   rI   rJ   s   @r1   rL   rL   E   s    llVz V  $(+/"*T!*(* t* 
~u$	%	* *: U]]_<  <r2   rL   c                   *     e Zd Zdef fdZd Z xZS )Lfm2MLPrN   c                    t         |           |j                  }|j                  rat	        d|z  dz        }|j
                  Dt	        |j
                  |z        }|j                  ||j                  z   dz
  |j                  z  z  }t        j                  |j                  |d      | _
        t        j                  |j                  |d      | _        t        j                  ||j                  d      | _        y )Nr4   r   r   Fbias)r&   r'   intermediate_sizeblock_auto_adjust_ff_dimr   block_ffn_dim_multiplierblock_multiple_ofr   Linearr.   w1w3w2)r-   rN   r   r0   s      r1   r'   zLfm2MLP.__init__   s    "44** #A(9$9A$= >..:$'(G(GJ[([$\!$*$<$<&)A)AAAE&JbJbb%! ))F..0AN))F..0AN))-v/A/ANr2   c                     | j                  t        j                  | j                  |            | j	                  |      z        S r   )r   Fsilur   r   )r-   r{   s     r1   r@   zLfm2MLP.forward   s/    wwqvvdggaj)DGGAJ677r2   )rF   rG   rH   r   r'   r@   rI   rJ   s   @r1   r   r      s    Oz O8r2   r   c                      e Zd ZdZdZdZdZdZej                  dfde
dedej                  dej                  ez  dz  fdZ	 dd	ej                   d
ej                   dedeeef   dz  deej                   ej                   f   f
dZdej*                  fdZddedz  defdZdej                   dedeeef   fdZdefdZdefdZd Zy)Lfm2HybridConvCachea  
    Attention and conv cache for Lfm2.

    It stores the Key and Value states as a list of tensors, one for each layer.
    Attention layer cache shape: `[batch_size, num_heads, seq_len, head_dim]`.
    Conv layer cache shape: `[batch_size, hidden_size, L_cache-1]`.
    NFrN   max_batch_sizer7   r\   c                    g | _         g | _        || _        |j                  | _        | j                  j	                  d      | _        |j                  | _        || _        g | _        |t        j                  |      nd }t        |j                        D ]  }t        j                  | j                  |j                  | j                  | j                  |      }| j                  j                  |       | j                   j                  t        j                   g              | j                  j                  t        j                   g               y )Nfull_attention)r7   r\   )	key_cachevalue_cacher   layer_typesindexfirst_attention_layerconv_L_cache_dtype
conv_cacher)   r\   rangenum_hidden_layerszerosr.   appendtensor)r-   rN   r   r7   r\   _
conv_states          r1   r'   zLfm2HybridConvCache.__init__   s    ,!--%)%5%5%;%;<L%M""//.0)/);f%v//0 
	6A##""!!kkJ OO"":.NN!!%,,r"23##ELL$45
	6r2   
key_statesvalue_states	layer_idxcache_kwargsr_   c                    | j                   |   j                         dk(  r|| j                   |<   || j                  |<   nft        j                  | j                   |   |gd      | j                   |<   t        j                  | j                  |   |gd      | j                  |<   | j                   |   | j                  |   fS )a  
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

        Return:
            A tuple containing the updated key and value states.
        r   rr   )r   numelr   r)   rx   )r-   r   r   r   r   s        r1   updatezLfm2HybridConvCache.update   s    0 >>)$**,1(2DNN9%*6DY'(-		4>>)3Lj2Y_a(bDNN9%*/))T5E5Ei5PR^4_eg*hDY'~~i($*:*:9*EEEr2   beam_idxc                    t        t        | j                              D ]:  }| j                  |   j                         r| j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                         s| j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   = y)zDReorders the cache for beam search, given the selected beam indices.r   N)	r   lenr   r   r\   index_selectr8   r   r   )r-   r   r   r\   s       r1   reorder_cachez!Lfm2HybridConvCache.reorder_cache   s#   s4>>23 		mI~~i(..0	299,0NN9,E,R,RSTV^VaVabhVi,jy))))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +y)//13::-1__Y-G-T-TUVX`XcXcdjXk-l	*		mr2   c                     | j                   |   dk7  r| j                  n|}t        | j                        |k  s | j                  |   j	                         dk(  ry| j                  |   j
                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   r   r   )r   r   r   r   r   rC   r-   r   s     r1   get_seq_lengthz"Lfm2HybridConvCache.get_seq_length   sm     372B2B92MQa2aD..gp	t~~)+t~~i/H/N/N/PTU/U~~i(..r22r2   cache_positionc                 V    d}|j                   d   }| j                         }||z   }||fS )aB  
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
        the given layer at `layer_idx`.
        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
        for each layer.
        r   )rC   r   )r-   r   r   full_mask_kv_offsetquery_lengthpast_seen_tokens	kv_lengths          r1   get_mask_sizesz"Lfm2HybridConvCache.get_mask_sizes   s@      %++A...0 #33	---r2   
max_lengthc                    |dk  r| j                         t        |      z
  }| j                         |k  ryt        t        | j                              D ]l  }| j                  |   j                         s!| j                  |   dd|ddf   | j                  |<   | j                  |   dd|ddf   | j                  |<   n y)z"Crop the cache to the given lengthr   N.)r   absr   r   r   r   r   )r-   r   idxs      r1   cropzLfm2HybridConvCache.crop
  s    >,,.Z@J J.T^^,- 	SC~~c"((*&*nnS&9#{
{A:M&Ns#(,(8(8(=c;J;PQ>Q(R  %	Sr2   c                 ,    t        | j                        S r   )r   r   rD   s    r1   __len__zLfm2HybridConvCache.__len__  s    4>>""r2   c                     t        t        | j                              D ]  }| j                  |   j                          ! y r   )r   r   r   zero_r   s     r1   resetzLfm2HybridConvCache.reset  s4    s4??34 	/IOOI&,,.	/r2   r   )r   )rF   rG   rH   __doc__r   is_compileabler   r   r)   r9   r   r   r7   r\   rv   r'   r   dictr   rB   r   
LongTensorr   r   r   r   r   r    r2   r1   r   r      sB    NNIK #]],066 6 {{	6
 s"T)6F /3FLLF llF 	F
 38nt+F 
u||U\\)	*FBme&6&6 m3d
 33 3.U\\ .c .eTWY\T\o .Ss S# #/r2   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr5   r4   rr   )rC   r)   rx   )r{   x1x2s      r1   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r2   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkry   rz   unsqueeze_dimq_embedk_embeds          r1   apply_rotary_pos_embr   '  sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr2   r=   n_repr_   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rC   rs   reshape)r=   r   batchnum_key_value_headsslenrb   s         r1   	repeat_kvr   A  so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr2   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr4   r   r   r5   )rk   r7   )ptrainingr   )r   num_key_value_groupsr)   matmulrw   rC   r   
functionalsoftmaxr9   r8   r7   r   r   
contiguous)r   r   r   r   r   r   r   r   r   r   attn_weightscausal_maskattn_outputs                r1   eager_attention_forwardr   M  s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r2   c                   
    e Zd ZdZdedef fdZ	 	 ddej                  de	ej                  ej                  f   dej                  dz  d	e
dz  d
ej                  dz  de	ej                  ej                  dz  f   fdZ xZS )Lfm2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrN   r   c                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        d| _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j
                  d      | _        t%        | j                  |j&                        | _        t%        | j                  |j&                        | _        y )Nrb   g      TFr   r/   )r&   r'   rN   r   re   r.   rf   rb   r   r   r   	is_causalr   r   q_projk_projv_projout_projr$   norm_epsq_layernormk_layernormr-   rN   r   r0   s      r1   r'   zLfm2Attention.__init__k  sL   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejk		&"<"<t}}"LfN`N`glm&t}}&//J&t}}&//Jr2   Nr=   position_embeddingsr   past_key_valuesr   r_   c                    |j                   d d }g |d| j                  }| j                   | j                  |      j                  |       j                  dd      }	| j                   | j                  |      j                  |       j                  dd      }
 | j                  |      j                  | j                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t               } || |	|
||fd| j"                  d|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )Nr5   r   r4   )rz   ry   r           )r   r   )rC   rb   r  r   viewrw   r  r   r  r   r   r   r   get_interfacerN   _attn_implementationr   r   r   r   r  )r-   r=   r  r   r  r   r   input_shapehidden_shapequery_statesr   r   ry   rz   r   attention_interfacer   r   outputs                      r1   r@   zLfm2Attention.forwardz  s    $))#2.88b8$--8''(GM(B(G(G(VWaabcefg%%&Edkk-&@&E&E|&TU__`acde
6t{{=166EOOPQSTU&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
 LL	%
 	%
!\ *k));;;;FFH{+|##r2   r!   )rF   rG   rH   r   r   r   r'   r)   r   rB   r   r   r@   rI   rJ   s   @r1   r   r   g  s    GKz Kc K( 7;26'$||'$ #5<<#=>'$ t+	'$
 -t3'$ ((4/'$ 
u||U\\D00	1'$r2   r   c                     |N|j                   d   dkD  r<|j                   d   dkD  r*| j                  }| |dddddf   z  j                  |      } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )rC   r7   r8   )r=   r   r7   s      r1   apply_mask_to_padding_statesr    sa    
 !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr2   c            
       r    e Zd Zdedef fdZ	 	 	 ddej                  dedz  dej                  dz  dej                  dz  fd	Z
	 	 	 ddej                  dedz  dej                  dz  dej                  dz  fd
Z	 	 	 ddej                  dedz  dej                  dz  dej                  dz  fdZ xZS )Lfm2ShortConvrN   r   c           	      2   t         |           || _        || _        |j                  | _        |j                  | _        t        j                  |j                  |j                  | j
                  |j                  | j                  | j
                  dz
        | _        t        j                  |j                  d|j                  z  | j                        | _        t        j                  |j                  |j                  | j                        | _        y )Nr   )in_channelsout_channelskernel_sizegroupsr   paddingr   r   )r&   r'   rN   r   r   L_cache	conv_biasr   r   Conv1dr.   convr   in_projr  r  s      r1   r'   zLfm2ShortConv.__init__  s    
 	"**$$	II**++%%LL1$
	 yy!3!3Q9K9K5KRVR[R[\		&"4"4f6H6HtyyYr2   Nr{   r  r   r   c                    t        ||      }| j                  |      j                  dd      }|j                  dd      \  }}}||z  }| j                  j
                  j                  | j                  j
                  j                  d      | j                  j
                  j                  d            }	|c|d   dkD  r[t        |j                  d      |j                  | j                     |	| j                  j                  d       }
|
j                  d      }
n|dt        j                  j!                  || j"                  |j$                  d   z
  df      }|j                  | j                     j'                  |       t)        ||	| j                  j                  d       }
||
z  }| j+                  |j                  dd      j-                               }|S )Nr5   r   r   rr   r   r4   )
activation)r  r!  rw   chunkr   r+   r  sizer    squeezer   r   r   r   r   r   padr  rC   copy_r   r  r   )r-   r{   r  r   r   BCxBCBxconv_weightsconv_outr   ys                r1   cuda_kernels_forwardz"Lfm2ShortConv.cuda_kernels_forward  s    )N;ll1o''B/))A2)&1aUyy'',,TYY-=-=-B-B1-EtyyGWGWG\G\]^G_`&>!+<q+@+

2**4>>:		H  ))"-H*]]..rDLL288B<4OQR3ST
**4>>:@@L'L$))..UYZHLMM!++b"-88:;r2   c                    |j                   d   }t        ||      }| j                  |      j                  dd      }|j	                  dd      \  }}}||z  }	|5|d   dkD  r,|j
                  | j                     }
|j                  d| j                  dz
        }|
j                  dd      }
|	j                  |
j                  |
j                        |
d d d d |f<   |j
                  | j                     j                  |
       t        j                  |
j                  |	j                        | j                   j"                  d d dd d f   z  d      }| j$                  r|| j                   j$                  z  }|j'                  d      }n~|dt(        j*                  j-                  |	| j                  |	j                   d   z
  df      }
|j
                  | j                     j                  |
       | j!                  |	      d	d |f   }||z  }|j                  dd      j/                         }| j1                  |      }|S )
Nr   r5   r   r   rr   r   )shiftsdimsrd   .)rC   r  r!  rw   r$  r   r   clampr  rollr8   r\   r7   r(  r)   sumr   r+   r   r   r   r   r'  r   r  )r-   r{   r  r   r   seqlenr)  r*  r+  r,  r   r.  r/  s                r1   slow_forwardzLfm2ShortConv.slow_forward  s    (N;ll1o''B/))A2)&1aU&>!+<q+@(33DNNCJ+11!T\\A5EFN#<J/1uuJ<M<MU_UeUeu/fJq!^+,&&t~~6<<ZHyyryy!9DII<L<LQPQSTW<U!U[]^HyyDIINN*))"-H*]]..rDLL288B<4OQR3ST
**4>>:@@Lyy}S'6'\2HLKKB**,MM!r2   r=   c                     t         r6d|j                  j                  v rt               s| j	                  ||||      S | j                  ||||      S )Ncuda)is_fast_path_availabler\   ru   r   r0  r8  )r-   r=   r  r   r   s        r1   r@   zLfm2ShortConv.forward  sP     "f0D0D0I0I&IRjRl,,]O^]kll  Q_``r2   r   )rF   rG   rH   r   r   r'   r)   r   r   r   r0  r8  r@   rI   rJ   s   @r1   r  r    s   ZZ Z2 7;26.2 <<  -t3  ((4/	 
 t+ J 7;26.2$<<$ -t3$ ((4/	$
 t+$R 7;26.2	a||	a -t3	a ((4/		a
 t+	ar2   r  c                       e Zd Zdedef fdZ	 	 	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  dej                  dz  d	e
dz  d
ej                  dz  dej                  fdZ xZS )Lfm2DecoderLayerrN   r   c                 f   t         |           |j                  |   dk(  | _        | j                  rt	        ||      | _        nt        ||      | _        t        |      | _	        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr   r   )r&   r'   r   is_attention_layerr   	self_attnr  r   r   feed_forwardr$   r.   r  operator_normffn_normr  s      r1   r'   zLfm2DecoderLayer.__init__   s    "("4"4Y"?CS"S""*69=DN%fi8DI#FO(););Q#F$6$6FOOLr2   Nr=   r  r   r|   r  r   r_   c           
         |}| j                   r, | j                  d| j                  |      |||||d|\  }}	n$| j                  | j                  |      |||      }||z   }|| j	                  | j                  |            z   }|S )N)r=   r  r   r|   r  r   )r=   r  r   r   r   )r?  r@  rB  r   rA  rC  )
r-   r=   r  r   r|   r  r   r   residualr   s
             r1   r@   zLfm2DecoderLayer.forward,  s     !""-t~~  "00?$7-) /-   M1 !II"00? /--	 & M &0%(9(9$--:V(WWr2   )NNNNN)rF   rG   rH   r   r   r'   r)   r   rB   r   r   r@   rI   rJ   s   @r1   r=  r=    s    
Mz 
Mc 
M IM.2046:26|| #5<<#=>E t+	
 &&- -t3 ((4/ 
r2   r=  c                   J    e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZy)	Lfm2PreTrainedModelrN   modelTr=  r  F)r=   
attentionsN)rF   rG   rH   r   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr=  r   _can_record_outputsr   r2   r1   rG  rG  N  sQ    &*#+,#4"5N""&)#r2   rG  c                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  dej                  dz  d	edz  d
ej                  dz  dee   defd              Z xZS )	Lfm2ModelrN   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |      | _        d| _        t#        |j                  |j$                        | _        | j)                          y c c}w )N)rN   Fr   )r&   r'   pad_token_idpadding_idx
vocab_sizer   	Embeddingr.   embed_tokens
ModuleListr   r   r=  layersrL   
rotary_embgradient_checkpointingr$   r  embedding_norm	post_initr  s      r1   r'   zLfm2Model.__init__b  s     !.. ++LL):):F<N<NPTP`P`ammBGH`H`BabYfi0b
 .V<&+#)&*<*<&//R 	 cs   DN	input_idsr   r|   r  inputs_embeds	use_cacher   r   r_   c           
         |d u |d uz  rt        d      || j                  |      }|r>|<|j                  d   }	t        | j                  |	| j
                  | j                        }|F||j                         nd}
t        j                  |
|
|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }|j                  d   dk7  r|nd }|}| j                  ||      }| j                  d | j                  j                   D ]!  }|j                  r|n|} ||f|||||d|}# | j!                  |      }t#        ||	      S )
Nz:You must specify exactly one of input_ids or inputs_embedsr   )rN   r   r7   r\   r   )r\   )rN   input_embedsr   r   r  r|   )r|   )r   r  r|   r  r   )last_hidden_stater  )
ValueErrorr[  rC   r   rN   r7   r\   r   r)   rg   r   r   r^  r]  r   r?  r`  r   )r-   rb  r   r|   r  rc  rd  r   r   
batch_sizer   r   linear_attentionr=   r  decoder_layer
layer_masks                    r1   r@   zLfm2Model.forwardr  s    -t";<YZZ  --i8M0&,,Q/J1{{:TZZX\XcXcO !CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;&))+%
 .;-@-@-Cq-H>d%"oom,oW "[[)H4;;+H+HI 
	M(5(H(HN^J))$7) /- M
	 ++M:&++
 	
r2   )NNNNNNN)rF   rG   rH   r   r'   r   r   r)   r   r   r   FloatTensorboolr   r   r   r@   rI   rJ   s   @r1   rU  rU  `  s    z    .2.2046:26!%26@
##d*@
 t+@
 &&-	@

 -t3@
 ((4/@
 $;@
 ((4/@
 +,@
 
!@
  @
r2   rU  c                   b    e Zd ZddiZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 dd	e	j                  dz  d
e	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  dee	j                  z  dee   defd              Z xZS )Lfm2ForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr=   logitsc                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y )NFr   )
r&   r'   rU  rH  rY  r   r   r.   rq  ra  )r-   rN   r0   s     r1   r'   zLfm2ForCausalLM.__init__  sU     v&
 ++yy!3!3V5F5FUS 	r2   Nrb  r   r|   r  rc  labelsrd  r   logits_to_keepr   r_   c
                 z    | j                   d|||||||d|
}|j                  }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }d}|* | j                  d||| j                  j                  d|
}t        |||j                  |j                  |j                        S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Lfm2ForCausalLM

        >>> model = Lfm2ForCausalLM.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)rb  r   r|   r  rc  rd  r   N)rs  ru  rY  )lossrs  r  r=   rI  r   )rH  rg  rt   r   slicerq  loss_functionrN   rY  r   r  r=   rI  )r-   rb  r   r|   r  rc  ru  rd  r   rv  r   outputsr=   slice_indicesrs  rx  s                   r1   r@   zLfm2ForCausalLM.forward  s    @ ,64:: 	,
)%+')	,
 	,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
r2   )	NNNNNNNNr   )rF   rG   rH   _tied_weights_keys_tp_plan_pp_planr'   r   r   r)   r   r   r   rm  rn  r   r   r   r   r@   rI   rJ   s   @r1   rp  rp    s/   *,GH23H_-z:;H  .2.204(,26*.!%26-.8
##d*8
 t+8
 &&-	8

 8
 ((4/8
   4'8
 $;8
 ((4/8
 ell*8
 +,8
 
 8
  8
r2   rp  )rp  rU  rG  )r   )r
  )Gcollections.abcr   typingr   r   r)   torch.nn.functionalr   r   r   cache_utilsr   
generationr	   integrationsr
   r   r   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.import_utilsr   r   configuration_lfm2r   causal_conv1dr   r    Moduler$   rL   r   r   r   r   r   r   r   ri   r   r   r  kernel_modulesallr;  r  r=  rG  rU  rp  __all__r   r2   r1   <module>r     s:  ( %         ) f f / 9 O K F & I I ? V * DD-7** Y'J")) J (J(><")) ><B8bii 8(C/ C/L( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%4 )*9$BII 9$ +9$x	 #$89^, haBII haV,1 ,^ /  " S
# S
 S
l H
)? H
 H
V Br2   