
    i                     0   d dl mZ d dlmZ d dlZd dlZd dlmZ d dl	m
Z
mZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5  G d dejl                        Z7 G d dejl                        Z8 G d dejl                        Z9dejt                  de;dejt                  fdZ<	 dGd ejl                  d!ejt                  d"ejt                  d#ejt                  d$ejt                  dz  d%e=d&e=d'e,e.   fd(Z>d) Z?dHd*Z@ ee@       G d+ d,ejl                               ZA G d- d.e      ZB G d/ d0e      ZCe/ G d1 d2e*             ZD G d3 d4eD      ZEe/ G d5 d6eD             ZF	 	 dId7eGe;e;f   d8e=d9e;d$ej                  dz  d:e;dej                  fd;ZJe/ G d< d=eD             ZKd>ejt                  d?e;d@e;fdAZL e/dBC       G dD dEeDe             ZMg dFZNy)J    )Callable)OptionalN)OutputRecordercheck_model_inputs   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)use_kernelized_func)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)is_flash_attention_requestedmaybe_autocast   )MoonshineConfigc                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineEncoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                        | _	        t        j                  |j                  |j                        | _
        y Nsuper__init__configr   activation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr+   
hidden_act	__class__s      z/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/moonshine/modeling_moonshine.pyr*   zMoonshineEncoderMLP.__init__6   s^    #J/99V//1I1IJ99V55v7I7IJ    hidden_statesreturnc                 l    | j                  |      }| j                  |      }| j                  |      }|S r'   )r1   r,   r2   )r4   r9   s     r7   forwardzMoonshineEncoderMLP.forward=   s4    /**=9/r8   __name__
__module____qualname__r*   torchTensorr<   __classcell__r6   s   @r7   r%   r%   5   s$    KU\\ ell r8   r%   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineDecoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                  dz        | _	        t        j                  |j                  |j                        | _
        y )N   r(   r3   s      r7   r*   zMoonshineDecoderMLP.__init__E   sc    #J/99V//1I1IA1MN99V55v7I7IJr8   r9   r:   c                     | j                  |      }|j                  dd      \  }}| j                  |      |z  }| j                  |      }|S )NrH   dim)r1   chunkr,   r2   )r4   r9   gates      r7   r<   zMoonshineDecoderMLP.forwardL   sS    /+11!1<t**40=@/r8   r=   rD   s   @r7   rF   rF   D   s$    KU\\ ell r8   rF   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )MoonshineRotaryEmbeddinginv_freqNr+   c                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultrQ   F)
persistentoriginal_inv_freq)r)   r*   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr+   rope_parametersrS   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r4   r+   devicerope_init_fnrQ   r6   s        r7   r*   z!MoonshineRotaryEmbedding.__init__W   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUr8   r_   ztorch.deviceseq_lenr:   ztorch.Tensorc                 n   | j                   d   }| j                   j                  dd      }t        | dd      xs | j                  | j                  z  }t        ||z        }d}d|t        j                  d|dt        j                        j                  |t        j                  	      |z  z  z  }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorg      ?head_dimNr   rH   dtyper_   rg   )rZ   getgetattrr/   num_attention_headsintrA   arangeint64tofloat)	r+   r_   ra   baserd   re   rL   attention_factorrQ   s	            r7   r[   z8MoonshineRotaryEmbedding.compute_default_rope_parametersg   s    & %%l3 & 6 6 : :;RTW X6:t4h8J8JfNhNh8h(223 U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r8   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rJ   r"   mpscpuF)device_typeenabledrH   rK   rf   )rQ   rp   expandshapero   r_   
isinstancetypestrr!   	transposerA   catcosr\   sinrg   )
r4   xposition_idsinv_freq_expandedposition_ids_expandedrv   freqsembr   r   s
             r7   r<   z MoonshineRotaryEmbedding.forward   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$r'   )NNN)r>   r?   r@   rA   rB   __annotations__r#   r*   staticmethodr   rl   tuplerp   r[   no_gradr   r<   rC   rD   s   @r7   rP   rP   T   s    llV V  )-+/"*$&*(* t* 
~u$	%	* *> U]]_<  <r8   rP   r9   n_repr:   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r"   N)ry   rx   reshape)r9   r   batchnum_key_value_headsslenre   s         r7   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr8   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )NrH   r   rJ   )rL   rg   )ptrainingr"   )r   num_key_value_groupsrA   matmulr}   ry   r-   
functionalsoftmaxfloat32ro   rg   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r7   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r8   c                 |    | ddddf   }| ddddf   }t        j                  | |fd      j                  d      S )	z*Rotates half the hidden dims of the input..r   NrH   r"   rJ   rK   r   )rA   stackflatten)r   x1x2s      r7   rotate_halfr      sJ    	
319B	
319B;;Ryb)11"55r8   c                    |j                  |      }|j                  |      }|dd|j                  d   dz  f   j                  dd      }|dd|j                  d   dz  f   j                  dd      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }	}||z  t        |      |z  z   }
||z  t        |      |z  z   }t	        j
                  |
|gd      }
t	        j
                  ||	gd      }|
|fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .NrJ   rH   rK   )	unsqueezery   repeat_interleaver   rA   r~   )qkr   r   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds               r7   apply_rotary_pos_embr      sD   $ --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr8   c                   l    e Zd ZdZdededededef
 fdZ	 	 	 	 	 dd	ej                  d
e
ej                  ej                  f   dz  dej                  dz  dedz  dej                  dz  dej                  dz  dee   de
ej                  ej                  dz  e
ej                     dz  f   fdZ xZS )MoonshineAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr+   	layer_idx	is_causalrk   r   c                 8   t         |           |j                  ||d       || _        || _        t        |d|j                  |j                  z        | _        |j                  |j                  z  | _
        | j                  dz  | _        |j                  | _        || _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  | j                  z  |j                  d      | _        | j                  j*                  C| j                  j*                  }|| j                  |z   dz
  |z  z  }|| j                  z
  | _        y d| _        y )N)rk   r   re   g      ࿩biasFr"   r   )r)   r*   updater+   r   rj   r/   rk   re   r   r   r   attention_dropoutr   r-   r.   attention_biasq_projk_projv_projo_projpad_head_dim_to_multiple_ofhead_dim_padding)	r4   r+   r   r   rk   r   target_multipletarget_head_dimr6   s	           r7   r*   zMoonshineAttention.__init__   s    	.AZmno"
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9"ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JFL^L^ejk ;;22>"kkEEO-$--/2QTU2UZi1ijO$3dmm$CD!$%D!r8   Nr9   position_embeddingsr   past_key_valuescache_positionkey_value_statesr   r:   c                 N   |j                   d d \  }}	| j                  |      j                  ||	| j                  j                  | j
                        j                  dd      }
|d u}|Y|j                  j                  | j                        }|r&d|j                  | j                  <   |j                  }n|j                  }||n|}|rK|rIrG|j                  | j                     j                  }|j                  | j                     j                  }n| j                  |      j                  |d| j                  j                  | j
                        j                  dd      }| j!                  |      j                  |d| j                  j                  | j
                        j                  dd      }|r%|#|j#                  ||| j                  d|i      \  }}|s?|\  }}t%        |
|||      \  }
}|'|||d}|j#                  ||| j                  |      \  }}t'        j(                  | j                  j*                  t,              }| j.                  xr |d u xr |	dkD  }| j0                  dkD  rt2        j4                  j6                  j9                  |
d| j0                  f      }
t2        j4                  j6                  j9                  |d| j0                  f      }t2        j4                  j6                  j9                  |d| j0                  f      } || |
|||f| j:                  sdn| j<                  | j>                  |d	|\  }}| j0                  dkD  r|d
d | j0                   f   }|jA                  ||	d      jC                         }| jE                  |      }||fS )NrJ   r"   rH   Tr   )r   r   r   r           )r   r   r   .)#ry   r   viewr+   r   re   r}   
is_updatedri   r   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   r   r   r   get_interface_attn_implementationr   r   r   rA   r-   r   padr   r   r   r   r   r   )r4   r9   r   r   r   r   r   r   bszq_lenquery_statesis_cross_attentionr   current_statesr   r   r   r   cache_kwargsattention_interfacer   r   r   s                          r7   r<   zMoonshineAttention.forward  sy    #(("-
U KK&++C8W8WY]YfYfgqqrsuvw 	 .T9&(3377GJ!=A**4>>:"1"G"G"1"F"F .>-I)}/j(//?DDJ*11$..AHHL N+c2t{{>>N1a  N+c2t{{>>N1a 
 "o&A+:+A+Adnn?OQ_>`,(
L "*HC';L*VY[^'_$L**'*3.Y+:+A+Adnnl,(
L )@(M(MKK,,.E)
 NNK~'=K%!)	  1$ 88..22<!TEZEZA[\L,,00aAVAV=WXJ 88..22<!TEZEZA[\L$7
%
  $}}C$2H2HLL
%
 
%
!\   1$%c+Cd.C.C-C+C&CDK!))#ub9DDFkk+.L((r8   )NNNNN)r>   r?   r@   __doc__r#   rl   boolr*   rA   rB   r   r	   
LongTensorr   r   r<   rC   rD   s   @r7   r   r      s   G#&#& #& 	#&
 !#& !#&P IM.2(,2604U)||U) #5<<#=>EU) t+	U)
 U) ((4/U)  ,,-U) -.U) 
u||U\\D0%2E2LL	MU)r8   r   c                   "    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
ej                  dz  deej                  ej                  f   dz  dee   dej                  fdZ xZS )MoonshineEncoderLayerr+   r   c                 d   t         |           |j                  | _        t        ||d|j                  |j
                        | _        t        ||j                        | _	        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NFr+   r   r   rk   r   r   )r)   r*   r/   r   encoder_num_attention_headsencoder_num_key_value_heads	self_attnr%   encoder_hidden_actmlpr-   	LayerNorminput_layernormpost_attention_layernormr4   r+   r   r6   s      r7   r*   zMoonshineEncoderLayer.__init__n  s    !--+ & B B & B B
 'vv/H/HI!||F,>,>UK(*V5G5Ge(T%r8   Nr9   r   r   r   	use_cacher   r   r   r:   c                     |}	| j                  |      } | j                  d|||||||d|\  }}
|	|z   }|}	| j                  |      }| j                  |      }|	|z   }|S )Nr9   r   r   r   r   r   r    )r   r   r   r   )r4   r9   r   r   r   r   r   r   r   residual_s              r7   r<   zMoonshineEncoderLayer.forward~  s     !,,];)4>> 	
')%+) 3	
 	
q !=0 !55mD/ =0r8   )NNNFNN)r>   r?   r@   r#   rl   r*   rA   rB   r   r	   r   r   r   r   r<   rC   rD   s   @r7   r   r   m  s    U U3 U& /304(,!&26HL|| t+ &&-	
  $; ((4/ #5<<#=>E +, 
r8   r   c                   
    e Zd Zddededz  f fdZ	 	 	 	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  de	dz  de
dz  dej                  dz  deej                  ej                  f   dz  deej                  ej                  f   dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )MoonshineDecoderLayerNr+   r   c                    t         |           |j                  | _        t        ||d|j                  |j
                        | _        t        ||d|j                  |j
                        | _        t        ||j                        | _
        t        j                  |j                  d      | _        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NTr   Fr   )r)   r*   r/   r   decoder_num_attention_headsdecoder_num_key_value_headsr   encoder_attnrF   decoder_hidden_actr   r-   r   r   r   final_layernormr   s      r7   r*   zMoonshineDecoderLayer.__init__  s    !--+ & B B & B B
 / & B B & B B
 'vv/H/HI!||F,>,>UK(*V5G5Ge(T%!||F,>,>UKr8   r9   r   encoder_hidden_statesencoder_attention_maskr   encoder_position_idsr   r   r   r   encoder_position_embeddingsr   r:   c                 (   |}| j                  |      } | j                  d||||||	|
d|\  }}||z   }|1|}| j                  |      }| j                  |||||      \  }}||z   }|}| j	                  |      }| j                  |      }||z   }|S )Nr   )r9   r   r   r   r   r   )r   r   r   r  r  r   )r4   r9   r   r  r  r   r  r   r   r   r   r  r   r   r   s                  r7   r<   zMoonshineDecoderLayer.forward  s     !,,];)4>> 	
')%+) 3	
 	
q !=0 ,$H 99-HM#00+!65 /#  1  M1 %}4M ,,];/ =0r8   r'   )
NNNNNNFNNN)r>   r?   r@   r#   rl   r*   rA   rB   r   r	   r   r   r   r   FloatTensorr<   rC   rD   s   @r7   r   r     sj   L L3: L6 /3596:048<(,!&26HLPT.||. t+.  %||d2	.
 !&t 3. &&-. $..5. . $;. ((4/. #5<<#=>E. &+5<<+E%F%M. +,. 
u  %(9(95;L;L(L"MPT"TT	U.r8   r   c                   \    e Zd ZU eed<   dZdZdZdZddgZ	dZ
dZdZdej                  fd	Zy
)MoonshinePreTrainedModelr+   modelinput_valuesaudioTr   r   input_lengthsc                 ~    t        |dz
  dz  dz         }t        |dz
  dz  dz         }t        |dz
  dz  dz         }|S )zH
        Computes the output length of the convolutional layers
           @   r"      r   rH   )rl   )r4   r  output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengths        r7    _get_feat_extract_output_lengthsz9MoonshinePreTrainedModel._get_feat_extract_output_lengths  sZ     "=3#6""<q"@A!#6#:a"?!"CD!#6#:a"?!"CD""r8   N)r>   r?   r@   r#   r   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphrA   r   r  r   r8   r7   r  r    sN    $O&*#02IJN!#e>N>N #r8   r  c                        e Zd ZdZdZeedZdef fdZ	de
j                  fdZde
j                  fd	Ze	 ddej                   dej"                  d
z  dee   deez  fd       Z xZS )MoonshineEncoderz
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    r  )
attentionsr9   r+   c           	      b   t         |   |       || _        |j                  }t	        j
                  d|ddd      | _        t	        j
                  |d|z  dd	      | _        t	        j
                  d|z  |dd	      | _        t	        j                  d|d
      | _
        t	        j                  t        |j                        D cg c]  }t        ||       c}      | _        t	        j                   |d      | _        t%        |      | _        d| _        | j+                          y c c}w )Nr"   r  r  F)kernel_sizestrider   rH   r  r   )r%  r&  gh㈵>)
num_groupsnum_channelsepsr   r+   )r)   r*   r+   r/   r-   Conv1dconv1conv2conv3	GroupNorm	groupnorm
ModuleListrangeencoder_num_hidden_layersr   r   r   
layer_normrP   
rotary_embgradient_checkpointing	post_init)r4   r+   	embed_dimidxr6   s       r7   r*   zMoonshineEncoder.__init__  s     &&	YYq)ReT
YYy!i-QqQ
YYq9}iQqQ
PTUmm;@AaAa;bcC"63/c
 ,,yu=2&A&+# ds   D,r:   c                     | j                   S r'   r,  r4   s    r7   get_input_embeddingsz%MoonshineEncoder.get_input_embeddings#  s    zzr8   r   c                     || _         y r'   r;  r4   r   s     r7   set_input_embeddingsz%MoonshineEncoder.set_input_embeddings&  s	    
r8   Nr   r   c                    |j                  d      }t        j                  j                  | j	                  |            }| j                  |      }t        j                  j                  | j                  |            }t        j                  j                  | j                  |            }|j                  ddd      }|| j                  |j                  d         }d}|ddd|f   dd|f   }t        | j                        r|dk(  j                         r|nd}nF| j                  j                  d	k(  rt!        ||j"                        }nt%        ||j"                        }t'        j(                  d|j                  d   |j*                  
      j                  d      }| j-                  ||      }| j.                  D ]  }	 |	|f|||d|} | j1                  |      }t3        |      S )a.  
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        r"   r   rH   NrJ     .r   sdpar_   r   )r   r   r   )last_hidden_state)r   r-   r   tanhr,  r0  gelur-  r.  permuter  ry   r    r+   anyr   r   rg   r   rA   rm   r_   r5  r   r4  r   )
r4   r  r   r   r9   mask_lendownsample_strider   r   encoder_layers
             r7   r<   zMoonshineEncoder.forward)  s   , $--a0**4::l+CD}5**4::m+DE**4::m+DE%--aA6 %<<^=Q=QRT=UVH *+C1D3D1D,DEc9H9nUN+DKK84Bc4I3N3N3PVZ11V;!D^UbUhUh!i!;NML_L_!`||A}':':1'=mFZFZ[eefgh"oom,oW![[ 	M)-)$7	
 M	 6&+
 	
r8   r'   )r>   r?   r@   r   r  r   r   _can_record_outputsr#   r*   r-   Moduler=  r@  r   rA   r
  rB   r   r   r   r   r<   rC   rD   s   @r7   r"  r"    s     %O(.
 $bii "))   /38
''8
 t+8
 +,	8

 
(	(8
 8
r8   r"  c                   x    e Zd ZdZ eedd      e eedd      dZdef fdZ	e
	 	 	 	 	 	 	 	 	 ddej                  d	z  d
ej                  d	z  dej                  d	z  ded	z  dej                  d	z  ded	z  dej                  d	z  dej                  d	z  dej                  d	z  dee   deez  fd       Z xZS )MoonshineDecoder	input_idsr"   r   )index
layer_namer  )r#  r9   cross_attentionsr+   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j                  d      | _        t!        |      | _        d| _        | j'                          y c c}w )NFr   r*  )r)   r*   pad_token_idpadding_idx
vocab_sizer-   	Embeddingr/   embed_tokensr1  r2  decoder_num_hidden_layersr   r   r   normrP   r5  r6  r7  )r4   r+   r9  r6   s      r7   r*   zMoonshineDecoder.__init__n  s     !.. ++LL):):F<N<NPTP`P`amm;@AaAa;bcC"63/c
 LL!3!3%@	2&A&+# 	 ds   DNr   r   r   inputs_embedsr   r   r  r  r   r:   c
                    |du |duz  rt        d      || j                  |      }|r6|4t        t        | j                        t        | j                              }|F||j                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }|}| j                  ||      }|	|j                  d	   }d
}|	ddd|f   dd|f   }	t        | j                        r|	dk(  j                         r|	nd}	nb| j                  j                  dk(  r%t!        |	|j"                  |j                  d	         }	n$t%        |	|j"                  |j                  d	         }	| j&                  D ]  } ||||f|	|||||d|
} | j)                  |      }t+        ||r|      S d      S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embedsr*  r   r"   rD  )r+   input_embedsr   r   r   r   rE  r   rB  .r   rC  )r  r   r   r   r   r   )rF  r   )
ValueErrorr[  r   r
   r+   get_seq_lengthrA   rm   ry   r_   r   r   r5  r    rJ  r   r   rg   r   r   r]  r   )r4   rR  r   r   r   r^  r   r   r  r  r   past_seen_tokensr   r9   r   rK  rL  decoder_layers                     r7   r<   zMoonshineDecoder.forward~  sD   0 -t";<YZZ  --i8M01,dkk2RT`hlhshsTtuO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;&))+%
 &"oom,oW!-,2226H *%;CATCTAT<T%UVY[d\d[dVd%e"+DKK8DZ^aDaCfCfCh)?nr&11V;)L*M,?,?ATATUWAX*& *D*M,?,?ATATUWAX*& "[[ 	M)% (>) /#-$7 M	 		-08+/8O
 	
>B
 	
r8   )	NNNNNNNNN)r>   r?   r@   r  r   r   r   rN  r#   r*   r   rA   r   rB   r	   r
  r   r   r   r   r   r<   rC   rD   s   @r7   rQ  rQ  e  sA   !O$%7q[Y.*+=QSab    .2.204(,26!%26:>6:W
##d*W
 t+W
 &&-	W

 W
 ((4/W
 $;W
 ((4/W
  %0047W
 !&t 3W
 +,W
 
(	(W
 W
r8   rQ  ry   	mask_probmask_length	min_masksc                    | \  }dk  rt        d      kD  rt        d d d      t        j                  j                  d      j	                         fd}|-|j                         j                  d      j                         nt        |      D cg c]  } c}}t        j                  |ft        	      }	g }
 |      }|d
k(  r|	S |D ]  } ||      }t        j                  j                  t        j                  |dz
  z
        |d      }t        |      d
k(  rdz
  }n|d
   }t        j                  |t        j                  ||z
  t        j                   	      |z  g      }|
j#                  |        t        j$                  |
      }
t        j&                  |
dddddf   ||f      }
|
j)                  ||z        }
t        j                        ddddf   }t        j&                  |||f      j)                  ||z        }|
|z   }
|
j+                         dz
  kD  rdz
  |
|
dz
  kD  <   t        j,                  |	|
dd       |	S c c}w )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r"   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr"   r   )rl   max)input_lengthnum_masked_spanepsilonrf  re  rg  sequence_lengths     r7   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr8   NrJ   rf   r   F)replace)ra  nprandomranditemdetachsumtolistr2  zerosr   choicerm   lenconcatenateonesint32appendarraybroadcast_tor   rk  put_along_axis)ry   re  rf  r   rg  
batch_sizerp  r   r  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrl  rm  spec_aug_mask_idxdummy_mask_idxoffsetsrn  ro  s    `` `            @@r7   _compute_mask_indicesr    s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                       e Zd Zdef fdZd Zd Zd Z	 ddej                  dej                  dz  fd	Zee	 	 	 	 	 	 	 	 	 	 dd
ej                  dz  dej                  dz  dej                  dz  dej                  dz  deeej                        dz  dedz  deej                     dz  deej                     dz  dedz  dej                  dz  dee   defd              Z xZS )MoonshineModelr+   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r'   )r)   r*   r"  encoderrQ  decoderr7  r4   r+   r6   s     r7   r*   zMoonshineModel.__init__R  s2     '/'/r8   c                 .    | j                   j                  S r'   r  r[  r<  s    r7   r=  z#MoonshineModel.get_input_embeddingsZ  s    ||(((r8   c                 &    || j                   _        y r'   r  r?  s     r7   r@  z#MoonshineModel.set_input_embeddings]  s    $)!r8   c                 8    | j                   j                          y)z
        Calling this function will disable the gradient computation for the Moonshine encoder so that its parameters will
        not be updated during training.
        N)r  _freeze_parametersr<  s    r7   freeze_encoderzMoonshineModel.freeze_encoder`  s    
 	'')r8   Ninput_featuresr   c                 2   t        | j                  dd      s|S |j                         \  }}}| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }|dddf   j                  d|d      }d||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  | j                  j                        }t        j                  ||j                  t        j                        }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTr   )re  rf  r   rg  rh   NrJ   )re  rf  rg  )rj   r+   sizemask_time_probr   r  mask_time_lengthmask_time_min_masksrA   tensorr_   r   rx   mask_feature_probmask_feature_lengthmask_feature_min_masks)r4   r  r   r  r/   ro  mask_time_indicesmask_feature_indicess           r7   _mask_input_featuresz#MoonshineModel._mask_input_featuresg  s[    t{{$8$?!! 4B3F3F3H0
K;;%%)dmm 5_-++44 KK88-++99! !&->~G\G\didndn o 1!T' : A A"kSU V01N,-;;((1,#8[)++77 KK;;++<<	$  $)<<0D^MbMbjojtjt#u 34N/0r8   r  decoder_input_idsdecoder_attention_maskencoder_outputsr   decoder_inputs_embedsdecoder_position_idsr   r   r   r:   c                 B   | | j                   |fd|i|} | j                  d||||j                  ||||	|
d	|}t        |j                  |j                  |j
                  |j                  |j                  |j                  |j
                  |j                        S )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        r   )	rR  r   r  r  r   r^  r   r   r   )rF  r   decoder_hidden_statesdecoder_attentionsrU  encoder_last_hidden_stater  encoder_attentionsr   )r  r  rF  r   r   r9   r#  rU  )r4   r  r   r  r  r  r   r  r  r   r   r   decoder_outputss                r7   r<   zMoonshineModel.forward  s    \ "/;t||L/rYg/rkq/rOEQT\\ F
'1#1"1"C"C+/-)F
 F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r8   r'   )
NNNNNNNNNN)r>   r?   r@   r#   r*   r=  r@  r  rA   r
  r   r  r   r   r   r   r   r   r   r   r<   rC   rD   s   @r7   r  r  P  s    )** 37)))) ((4/)V  262659:>BF6:AE?C!%26E
''$.E
 ((4/E
 !++d2	E

 !& 0 04 7E
 uU%6%6784?E
 -t3E
  %U%6%67$>E
 $E$4$45<E
 $;E
 ((4/E
 +,E
 
E
  E
r8   r  rR  rW  decoder_start_token_idc                     | j                  | j                        }| ddddf   j                         |ddddf<   ||dddf<   |t        d      |j	                  |dk(  |       |S )z1
    Shift input ids one token to the right.
    NrJ   r"   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosry   r^   ra  masked_fill_)rR  rW  r  shifted_input_idss       r7   shift_tokens_rightr    s}     "++IOO<(CRC0668ae4adLMM""#4#<lKr8   zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    )custom_introc                       e Zd ZddiZdef fdZd Zd Zdej                  fdZ
ee	 	 	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  deeej                        d	z  ded	z  deej                     d	z  deej                     d	z  ded	z  dej                  d	z  dej                  d	z  dee   defd              Z xZS )!MoonshineForConditionalGenerationzproj_out.weightz!model.decoder.embed_tokens.weightr+   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFr   )
r)   r*   r  r  r-   r.   r/   rY  proj_outr7  r  s     r7   r*   z*MoonshineForConditionalGeneration.__init__  sH     #F+
		&"4"4f6G6GeT 	r8   c                     | j                   S r'   r  r<  s    r7   get_output_embeddingsz7MoonshineForConditionalGeneration.get_output_embeddings  s    }}r8   c                     || _         y r'   r  )r4   new_embeddingss     r7   set_output_embeddingsz7MoonshineForConditionalGeneration.set_output_embeddings  s	    &r8   r:   c                 6    | j                   j                         S r'   )r  r=  r<  s    r7   r=  z6MoonshineForConditionalGeneration.get_input_embeddings  s    zz..00r8   Nr  r   r  r  r  r   r  r  r   r   labelsr   c                    |9|7|5t        || j                  j                  | j                  j                        } | j                  |f||||||||	|
d	|}| j                  |j                        }d}|(| j                  ||| j                  j                        }t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                   	      S )a0  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```N)	r   r  r  r  r   r  r  r   r   )logitsr  rY  )	lossr  r   r  r  rU  r  r  r  )r  r+   rW  r  r  r  rF  loss_functionrY  r   r   r  r  rU  r  r  r  )r4   r  r   r  r  r  r   r  r  r   r   r  r   outputsr  r  s                   r7   r<   z)MoonshineForConditionalGeneration.forward  s   f  (-B-J$6DKK44dkk6X6X%! '1djj'
)/+#9+"7!5)'
 '
 w889%%VFt{{OeOe%fD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r8   )NNNNNNNNNNN)r>   r?   r@   _tied_weights_keysr#   r*   r  r  r-   rO  r=  r   r   rA   r
  r   r   r   r   r   r   r   r<   rC   rD   s   @r7   r  r    s    ,-PQ '1bii 1  262659:>BF6:AE?C!%26*.T
''$.T
 ((4/T
 !++d2	T

 !& 0 04 7T
 uU%6%6784?T
 -t3T
  %U%6%67$>T
 $E$4$45<T
 $;T
 ((4/T
   4'T
 +,T
 
T
  T
r8   r  )r  r  r  )r   )r"   )Nr   )Ocollections.abcr   typingr   numpyrr  rA   torch.nnr-   transformers.utils.genericr   r   activationsr   cache_utilsr	   r
   r   
generationr   integrationsr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr    r!   configuration_moonshiner#   rO  r%   rF   rP   rB   rl   r   rp   r   r   r   r   r   r   r  r"  rQ  r   r   ndarrayr  r  r  r  __all__r   r8   r7   <module>r     s  * %     I ! C C ) / / g B 9  L F & I I I 4")) "))  @<ryy @<F	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%46%P )*}) }) +})@06 0fG6 GT # # #0_
/ _
D p
/ p
 p
n /3tc?tt t $$t+	t
 t ZZtn H
- H
 H
V%,, c [^   
j
(@/ j

j
Z ^r8   