
    ia\                     :   d dl Z d dlmZ d dlZd dlmZ d dlmZ d dlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZmZmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z*  G d de*e      Z+ G d de(      Z, G d de!      Z- G d de       Z. G d dej^                        Z0 G d de      Z1 G d  d!e      Z2 G d" d#e$      Z3 G d$ d%e#      Z4 G d& d'e&      Z5 ed()       G d* d+e5             Z6 G d, d-e%      Z7g d.Z8y)/    N)Callable)	Tokenizer)Unigram)nn   )create_bidirectional_mask)BaseModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TokenizersBackend)TransformersKwargsauto_docstringcan_return_tuple)check_model_inputs   )LlamaAttentionLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)ParakeetCTCConfigParakeetEncoderConfig)ParakeetEncoderBlock ParakeetEncoderConvolutionModuleParakeetForCTCParakeetPreTrainedModel)ParakeetProcessor)T5Tokenizerc                   d     e Zd Z	 	 	 	 	 	 	 d	 fd	Z	 	 	 d
deee   z  dededz  dedef
dZ xZ	S )LasrTokenizerNc                     t        	|   d|||||||d| t        t        | j                  dd            | _        y )N)	eos_token	unk_token	pad_token	extra_idsadditional_special_tokensvocab
vocab_filer   F)unk_idbyte_fallback )super__init__r   r   _vocab_scores
_tokenizer)
selfr"   r#   r$   r%   r&   r'   r(   kwargs	__class__s
            o/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/lasr/modular_lasr.pyr-   zLasrTokenizer.__init__+   s[     	 		
&?!		
 		
 $""#
    	token_idsskip_special_tokensclean_up_tokenization_spacesgroup_tokensreturnc                     t        |t              r|g}|r%t        j                  |      D cg c]  }|d   	 }}|D cg c]  }|| j                  k7  s| }}t        j                  | f|||d|S c c}w c c}w )Nr   )r5   r6   r7   )
isinstanceint	itertoolsgroupbypad_token_idr   _decode)r0   r5   r6   r7   r8   r1   token_grouptokens           r3   r@   zLasrTokenizer._decodeH   s     i%"I;D;L;LY;WXKQXIX )2PuUd>O>O5OUP	P ((
 3)E	

 
 	
 Y Qs   A4A9A9)z</s>z<unk>z<pad>d   NNN)FNT)
__name__
__module____qualname__r-   r<   listboolstrr@   __classcell__r2   s   @r3   r    r    *   sl     "&
@ %*48!
c?
 "
 '+Tk	

 
 

r4   r    c                       e Zd Zy)LasrProcessorNrD   rE   rF   r+   r4   r3   rM   rM   a   s    r4   rM   c                   \     e Zd ZdZddddddddd	d
ddddddddddddgddgddf fd	Z xZS )LasrEncoderConfiga  
    This is the configuration class to store the configuration of a [`LasrEncoder`]. It is used to instantiate a
    `LasrEncoder` model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
            hidden_size (`int`, *optional*, defaults to 512):
                Dimension of the layers and the hidden states.
            num_hidden_layers (`int`, *optional*, defaults to 17):
                Number of hidden layers in the Transformer encoder.
            num_attention_heads (`int`, *optional*, defaults to 8):
                Number of attention heads for each attention layer in the Transformer encoder.
            intermediate_size (`int`, *optional*, defaults to 2048):
                Dimension of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
            hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
                The non-linear activation function (function or string) in the encoder and pooler.
            attention_bias (`bool`, *optional*, defaults to `False`):
                Whether to use bias in the attention layers.
            convolution_bias (`bool`, *optional*, defaults to `False`):
                Whether to use bias in convolutions of the conformer's convolution module.
            conv_kernel_size (`int`, *optional*, defaults to 32):
                The kernel size of the convolution layers in the Conformer block.
            subsampling_conv_channels (`int`, *optional*, defaults to 256):
                The number of channels in the subsampling convolution layers.
            subsampling_conv_kernel_size (`int`, *optional*, defaults to 5):
                The kernel size of the subsampling convolution layers.
            subsampling_conv_stride (`int`, *optional*, defaults to 2):
                The stride of the subsampling convolution layers.
            num_mel_bins (`int`, *optional*, defaults to 128):
                Number of mel features.
            dropout (`float`, *optional*, defaults to 0.1):
                The dropout ratio for all fully connected layers in the embeddings, encoder, and pooler.
            dropout_positions (`float`, *optional*, defaults to 0.0):
                The dropout ratio for the positions in the input sequence.
            layerdrop (`float`, *optional*, defaults to 0.1):
                The dropout ratio for the layers in the encoder.
            activation_dropout (`float`, *optional*, defaults to 0.1):
                The dropout ratio for activations inside the fully connected layer.
            attention_dropout (`float`, *optional*, defaults to 0.1):
                The dropout ratio for the attention layers.
            max_position_embeddings (`int`, *optional*, defaults to 10000):
                The maximum sequence length that this model might ever be used with.
            initializer_range (`float`, *optional*, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
            layer_norm_eps (`float`, *optional*, defaults to 1e-06):
                The epsilon used by the layer normalization layers.
            feed_forward_residual_weights (`tuple[float, float]`, *optional*, defaults to `[1.5, 0.5]`):
                The residual weights for the feed forward layers.
            conv_residual_weights (`tuple[float, float]`, *optional*, defaults to `[2.0, 1.0]`):
                The residual weights for the convolution layers.
            batch_norm_momentum (`float`, *optional*, defaults to 0.01):
                The momentum for the batch normalization layers.
            rope_parameters (`RopeParameters`, *optional*):
                Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
                a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
                with longer `max_position_embeddings`.

    Example:
        ```python
        >>> from transformers import LasrEncoderModel, LasrEncoderConfig

        >>> # Initializing a `LasrEncoder` configuration
        >>> configuration = LasrEncoderConfig()

        >>> # Initializing a model from the configuration
        >>> model = LasrEncoderModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```

    This configuration class is based on the LasrEncoder architecture from Google Health AI. You can find more details
    and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO).
             i   siluF          r      g?        i'  g{Gz?gư>g      ?g      ?g       @g      ?g{Gz?Nc                     || _         || _        || _        || _        || _        t        |   di d|d|d|d|d|d|d|d|d	|	d
|d|
d|d|d|d|d|d|d|d|| | `| `y )Nhidden_sizenum_hidden_layersnum_attention_headsintermediate_size
hidden_actattention_biasconvolution_biasconv_kernel_sizesubsampling_conv_channelsnum_mel_binssubsampling_conv_kernel_sizesubsampling_conv_stridedropoutdropout_positions	layerdropactivation_dropoutattention_dropoutmax_position_embeddingsinitializer_ranger+   )	rope_parameterslayer_norm_epsfeed_forward_residual_weightsconv_residual_weightsbatch_norm_momentumr,   r-   subsampling_factorscale_input)r0   r[   r\   r]   r^   r_   r`   ra   rb   rc   re   rf   rd   rg   rh   ri   rj   rk   rl   rm   ro   rp   rq   rr   rn   r1   r2   s                             r3   r-   zLasrEncoderConfig.__init__   s   8  /,-J*%:"#6  	
#	
/	
 !4	
 0		

 "	
 *	
 .	
 .	
 '@	
 &	
 *F	
 %<	
 	
 0	
  	
   2!	
" 0#	
$ %<%	
& 0)	
. #r4   )rD   rE   rF   __doc__r-   rJ   rK   s   @r3   rP   rP   e   sf    K^ "%%& ! %'*Cj"Cj 3: :r4   rP   c                   J     e Zd ZdZ	 	 	 	 	 ddeez  f fdZed        Z xZ	S )LasrCTCConfiga  
    This is the configuration class to store the configuration of a [`LasrForCTC`]. It is used to instantiate a
    Lasr CTC model according to the specified arguments, defining the model architecture.
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.
    Args:
            vocab_size (`int`, *optional*, defaults to 512):
                Vocabulary size of the model.
            ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
                Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
                instance of [`LasrForCTC`].
            ctc_zero_infinity (`bool`, *optional*, defaults to `True`):
                Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
                occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
                of [`LasrForCTC`].
            encoder_config (`Union[dict, LasrEncoderConfig]`, *optional*):
                The config object or dictionary of the encoder.
            pad_token_id (`int`, *optional*, defaults to 0):
                Padding token id. Also used as blank token id.
    Example:
        ```python
        >>> from transformers import LasrForCTC, LasrCTCConfig
        >>> # Initializing a Lasr configuration
        >>> configuration = LasrCTCConfig()
        >>> # Initializing a model from the configuration
        >>> model = LasrForCTC(configuration)
        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```
    This configuration class is based on the Lasr CTC architecture from Google Health AI. You can find more details
    and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO).
    encoder_configc           	      0    t        |   d|||||d| y )N)
vocab_sizectc_loss_reductionctc_zero_infinityrx   r?   r+   )r,   r-   )r0   rz   r{   r|   rx   r?   r1   r2   s          r3   r-   zLasrCTCConfig.__init__  s0     	 	
!1/)%	
 	
r4   c                 4    | j                   j                  dz  S )Nr   )rx   rf   )r0   s    r3   inputs_to_logits_ratioz$LasrCTCConfig.inputs_to_logits_ratio$  s    ""::A==r4   )rQ   meanTNr   )
rD   rE   rF   ru   dictrP   r-   propertyr~   rJ   rK   s   @r3   rw   rw      sC    F !37

 00
$ > >r4   rw   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )LasrEncoderSubsamplingconfigc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j
                  |j                  |j                        | _
        t        j                  |j
                  |j                  |j                  |j                        | _        t        j                  |j                  |j
                        | _        t        j                         | _        y )N)kernel_sizestride)r,   r-   r   Linearrd   r[   dense_0Conv1dre   rf   conv_0rc   conv_1dense_1ReLUact_fn)r0   r   r2   s     r3   r-   zLasrEncoderSubsampling.__init__*  s    yy!4!4f6H6HIii;;11	
 ii,,;;11	
 yy!A!A6CUCUVggir4   input_featuresr9   c                 ,   | j                  | j                  |            }|j                  dd      }| j                  | j                  |            }| j                  | j	                  |            }|j                  dd      }| j                  |      S )N   r   )r   r   	transposer   r   r   )r0   r   hidden_statess      r3   forwardzLasrEncoderSubsampling.forward<  sz    DLL$@A%//15DKK$>?DKK$>?%//15||M**r4   )	rD   rE   rF   rP   r-   torchTensorr   rJ   rK   s   @r3   r   r   )  s+     0  $+ell +u|| +r4   r   c                       e Zd Zy)LasrEncoderRotaryEmbeddingNrN   r+   r4   r3   r   r   E  s    r4   r   c                        e Zd Zdedef fdZ	 	 ddej                  deej                  ej                  f   dz  dej                  dz  de	e
   d	eej                  ej                  f   f
d
Z xZS )LasrEncoderAttentionr   	layer_idxc                 4    t         |   ||       d| _        y )NF)r,   r-   	is_causalr0   r   r   r2   s      r3   r-   zLasrEncoderAttention.__init__I  s    +r4   Nr   position_embeddingsattention_maskr1   r9   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	|\  }
}t        |||
|      \  }}t        j                  | j                  j                  t              } || |||	|f| j                  sdn| j                  | j                  d|\  }} |j                   g |d j#                         }| j%                  |      }||fS )Nr   r   rY   )rg   scaling)shapehead_dimq_projviewr   k_projv_projr   r
   get_interfacer   _attn_implementationr   trainingrk   r   reshape
contiguouso_proj)r0   r   r   r   r1   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                  r3   r   zLasrEncoderAttention.forwardM  sk    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r4   NN)rD   rE   rF   rP   r<   r-   r   r   tupler   r   r   rJ   rK   s   @r3   r   r   H  s    0 S  IM.2	")||") #5<<#=>E") t+	")
 +,") 
u||U\\)	*")r4   r   c                   &     e Zd Zddef fdZ xZS )LasrEncoderConvolutionModuler   c                     t         |   ||       d| _        t        j                  |j
                  |j                        | _        y )Nsame)momentum)r,   r-   paddingr   BatchNorm1dr[   rr   norm)r0   r   module_configr2   s      r3   r-   z%LasrEncoderConvolutionModule.__init__s  s7    /NN6#5#5@Z@Z[	r4   N)rD   rE   rF   rP   r-   rJ   rK   s   @r3   r   r   r  s    \0 \ \r4   r   c                        e Zd Zdedef fdZ	 	 ddej                  dej                  dz  dej                  dz  dee	   d	ej                  f
d
Z
 xZS )LasrEncoderBlockr   r   c                 T   t         |   ||       |j                  | _        |j                  | _        t	        j
                  |j                  |j                  d      | _        t	        j
                  |j                  |j                  d      | _	        t	        j
                  |j                  |j                  d      | _
        t	        j
                  |j                  |j                  d      | _        t	        j
                  |j                  |j                  d      | _        y )NF)bias)r,   r-   rp   rq   r   	LayerNormr[   ro   norm_feed_forward1norm_self_att	norm_convnorm_feed_forward2norm_outr   s      r3   r-   zLasrEncoderBlock.__init__z  s    +-3-Q-Q*%+%A%A""$,,v/A/A6CXCX_d"e\\&*<*<f>S>SZ_`f&8&8&:O:OV[\"$,,v/A/A6CXCX_d"eV%7%79N9NUZ[r4   Nr   r   r   r1   r9   c                 0   |}| j                  | j                  |            }| j                  d   |z  | j                  d   |z  z   }| j                  |      } | j                  d|||d|\  }}||z   }| j                  | j                  |      |      }	| j                  d   |z  | j                  d   |	z  z   }|}| j                  | j                  |            }| j                  d   |z  | j                  d   |z  z   }| j                  |      }|S )Nr   r   )r   r   r   )r   r+   )feed_forward1r   rp   r   	self_attnconvr   rq   feed_forward2r   r   )
r0   r   r   r   r1   residualnormalized_hidden_statesr   _conv_outputs
             r3   r   zLasrEncoderBlock.forward  sJ    !**4+B+B=+QR..q1H<t?a?abc?dgt?tt 	 $(#5#5m#D ' 
2) 3
 	
Q &3ii} =ni]2215EHbHbcdHehsHss **4+B+B=+QR..q1H<t?a?abc?dgt?tt 	 m4r4   r   )rD   rE   rF   rP   r<   r-   r   r   r   r   r   rJ   rK   s   @r3   r   r   y  sw    
\0 
\S 
\ /337	!||! t+! #\\D0	!
 +,! 
!r4   r   c                   6    e Zd ZdZd Zdej                  fdZy)LasrPreTrainedModelFc                 .    t        j                  |       y r   )r   _init_weights)r0   modules     r3   r   z!LasrPreTrainedModel._init_weights  s    %%f-r4   input_lengthsc                     t        | j                  t              r| j                  j                  n| j                  }|j                  }|j
                  }d}t        |      D ]  }||z
  |z  dz   } |S )Nr   r   )r;   r   rw   rx   re   rf   range)r0   r   rx   r   r   
num_layersr   s          r3   _get_subsampling_output_lengthz2LasrPreTrainedModel._get_subsampling_output_length  st    7A$++}7]33cgcncn$AA77
z" 	HA*[8VCaGM	H r4   N)rD   rE   rF   _supports_flex_attnr   r   r   r   r+   r4   r3   r   r     s    .	ELL 	r4   r   zh
    The LasrEncoder model, based on the Conformer architecture](https://arxiv.org/abs/2005.08100).
    )custom_introc                        e Zd ZU eed<   dZdef fdZe e       e		 d
de
j                  de
j                  dz  dee   defd	                     Z xZS )LasrEncoderr   encoderc           	         t         |   |       d| _        |j                  | _        |j                  | _        |j
                  | _        t        |      | _        t        |      | _	        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                   |j"                  |j$                  d      | _        | j)                          y c c}w )NF)epsr   )r,   r-   gradient_checkpointingrg   rh   ri   r   
subsamplerr   
rotary_embr   
ModuleListr   r\   r   layersr   r[   ro   out_norm	post_initr   s      r3   r-   zLasrEncoder.__init__  s     &+#~~!'!9!9))084V<mmBGH`H`BabYfi0b
 V%7%7V=R=RY^_	 cs   C2Nr   r   r1   r9   c                 b   | j                  |      }| j                  |t        j                  |j                  d   |j
                        j                  d            \  }}t        j                  j                  || j                  | j                        }t        j                  j                  || j                  | j                        }t        j                  j                  || j                  | j                        }| | j                  ||j                  d         }t        | j                  ||      }| j                  D ]G  }d}| j                  r&t        j                   g       }	|	| j"                  k  rd}|r: ||f|||fd	|}I | j%                  |      }t'        |
      S )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, LasrEncoder
        >>> from datasets import load_dataset, Audio

        >>> model_id = TODO
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> encoder = ParakeetEncoder.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"])
        >>> encoder_outputs = encoder(**inputs)

        >>> print(encoder_outputs.last_hidden_state.shape)
        ```
        r   )devicer   )pr   )target_length)r   input_embedsr   FT)r   r   )last_hidden_state)r   r   r   aranger   r   	unsqueezer   
functionalrg   r   rh   _get_output_attention_maskr   r   r   randri   r   r	   )
r0   r   r   r1   r   r   r   encoder_layerto_dropdropout_probabilitys
             r3   r   zLasrEncoder.forward  s   < 7??5<<(;(;A(>}G[G[\ffghi
S --mt||VZVcVc-dmm##C4+A+ADMM#Zmm##C4+A+ADMM#Z%!<<^[h[n[nop[q<rN2;;&)
 "[[ 	MG}}&+jjn#&7"G -!!#1),c
! 	!	  m4??r4   r   )rD   rE   rF   rP   __annotations__base_model_prefixr-   r   r   r   r   r   r   r   r	   r   rJ   rK   s   @r3   r   r     s     !0 "  /3?@?@ t+?@ +,	?@
 
?@   ?@r4   r   c                        e Zd Z fdZ xZS )
LasrForCTCc                  8     t               j                  di | S )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, LasrForCTC
        >>> from datasets import load_dataset, Audio

        >>> model_id = TODO
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = LasrForCTC.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
        >>> predicted_ids = model.generate(**inputs)
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        >>> print(transcription)
        ```
        r+   )r,   generate)super_kwargsr2   s    r3   r  zLasrForCTC.generate  s    ,  uw/,//r4   )rD   rE   rF   r  rJ   rK   s   @r3   r  r    s    0 0r4   r  )r  r   r   rM   rP   rw   r    )9r=   collections.abcr   r   
tokenizersr   tokenizers.modelsr   r   masking_utilsr   modeling_outputsr	   modeling_utilsr
   r   processing_utilsr   tokenization_utils_tokenizersr   utilsr   r   r   utils.genericr   llama.modeling_llamar   r   r   r   parakeet.configuration_parakeetr   r   parakeet.modeling_parakeetr   r   r   r   parakeet.processing_parakeetr   t5.tokenization_t5r   r    rM   rP   rw   Moduler   r   r   r   r   r   r   r  __all__r+   r4   r3   <module>r     s    $    %  6 / F & > I I / v v V  = ,4
K!2 4
n	% 	H- HV6>% 6>r+RYY +8 <!5 ;')> ')T\#C \.+ .b1 & 
W@% W@
W@t0 04r4   