
    iF                         d dl mZ d dlmZ d dlmZ ddlmZ  ej                  e	      Z
 G d de      Z G d d	e      Zdd	gZy
)   )PreTrainedConfig)RopeParameters)logging   )
AutoConfigc            ,       D    e Zd ZdZdZdZdgZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedz  dedz  d	edz  d
edz  dedz  dedz  dedz  dedz  dedz  dedz  de	dz  dedz  de
dz  dedz  dedz  dedz  deeeef   z  dz  de
dz  de	dz  de
dz  dedz  f* fdZ xZS )CsmDepthDecoderConfiga|  
    This is the configuration class to store the configuration of a [`CsmDepthDecoderModel`]. It is used to instantiate an CSM depth decoder
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
    a similar configuration to that of the csm-1b.

    e.g. [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        num_codebooks (`int`, *optional*, defaults to 32):
            Number of codebooks used in the underlying codec model responsible for tokenizing the audio.
        backbone_hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations of the backbone model used with this depth decoder.
        vocab_size (`int`, *optional*, defaults to 2051):
            Vocabulary size of the CsmDepthDecoder model. Defines the number of different audio tokens that can be represented by each codebook.
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 4):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 33):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 2050):
            Padding token id.
        bos_token_id (`int`, *optional*):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*):
            End of stream token id.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        head_dim (`int`, *optional*):
            The attention head dimension. If None, it will default to hidden_size // num_attention_heads

    ```python
    >>> from transformers import CsmDepthDecoder, CsmDepthDecoderConfig

    >>> # Initializing a CsmDepthDecoder
    >>> configuration = CsmDepthDecoderConfig()
    >>> model = CsmDepthDecoderModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```csm_depth_decoder_modeldepth_decoder_configpast_key_values    ANnum_codebooksbackbone_hidden_size
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_idbos_token_ideos_token_idrope_parametersattention_biasattention_dropoutmlp_biashead_dimc                    |j                  dd      rt        d      || _        || _        || _        || _        || _        || _        |
| _        || _	        || _
        || _        || _        ||}|| _        |	| _        || _        || _        || _        || _        || _        || _        ||n| j                  | j                  z  | _        || _        t/        | `  di | y )Ntie_word_embeddingsFzE`tie_word_embeddings=True` is not supported for CsmDepthDecoderConfig )pop
ValueErrorr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r   super__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   kwargs	__class__s                          s/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/csm/configuration_csm.pyr)   zCsmDepthDecoderConfig.__init__j   s    2 ::+U3dee(((*$$8!'>$&!2!2#6  &"5#6 $!2(",!2 $,$8d>N>NRVRjRj>j."6"    )         i             r   silu!   {Gz?h㈵>TNNNNF        FN)__name__
__module____qualname____doc__
model_typebase_config_keykeys_to_ignore_at_inferencedefault_thetaintfloatboolr   dictstrr)   __classcell__r,   s   @r-   r	   r	      s   IV +J,O#4"5M %'+/!%"&(,()*+*+!'.0*.#'!%#'#'#'MQ&+*- %#-6#Tz6# "Dj6# $J	6#
 4Z6# :6# :6# !4Z6# !4Z6# $J6# "%t6# !4<6# Dj6# $;6# Dj6#  Dj!6#" Dj#6#$ ($sN/B*CCdJ%6#& t'6#( !4<)6#* ++6#, *-6# 6#r.   r	   c            :           e Zd ZdZdZdZdgZdZee	dZ
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d%dedz  d	edz  d
edz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  deeeef   z  dz  dedz  dedz  dedz  d edz  d!edz  d"edz  d#edz  f8 fd$Z xZS )&	CsmConfigaW  
    This is the configuration class to store the configuration of a [`CsmForConditionalGeneration`]. It is used to instantiate an CSM
    model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the csm-1b.

    e.g. [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        num_codebooks (`int`, *optional*, defaults to 32):
            Number of codebooks used in the underlying codec model responsible for tokenizing the audio.
        vocab_size (`int`, *optional*, defaults to 2051):
            Vocabulary size of the Csm model. Defines the number of different audio tokens that can be represented by each codebook.
        text_vocab_size (`int`, *optional*, defaults to 128256):
            Vocabulary size of the text input for the Csm model. Defines the number of different text tokens that can be represented.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations of the backbone model.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimension of the MLP representations of the backbone model.
        num_hidden_layers (`int`, *optional*, defaults to 16):
            Number of hidden layers in the backbone model Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the backbone model Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245).
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the backbone model Transformer decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 128002):
            Padding token id.
        codebook_pad_token_id (`int`, *optional*, defaults to 2050):
            Padding token id for codebook tokens.
        codebook_eos_token_id (`int`, *optional*, defaults to 0):
            End of stream token id for codebook tokens.
        bos_token_id (`int`, *optional*, defaults to 128000):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*):
            End of stream token id.
        audio_token_id (`int`, *optional*, defaults to 128002):
            Audio token id in the text input.
        audio_eos_token_id (`int`, *optional*, defaults to 128003):
            End of stream token id for audio in the text input.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        head_dim (`int`, *optional*):
            The attention head dimension. If None, it will default to hidden_size // num_attention_heads
        tie_codebooks_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie the codebook tokens embeddings of the backbone model to the codebook tokens embeddings of the depth decoder.
        depth_decoder_config (`CsmDepthDecoderConfig`, *optional*):
            Configuration for the depth decoder.
        codec_config (`PreTrainedConfig`, *optional*):
            Configuration for the codec.

    ```python
    >>> from transformers import CsmForConditionalGeneration, CsmConfig

    >>> # Initializing a CsmConfig
    >>> configuration = CsmConfig()

    >>> # Initializing a model
    >>> model = CsmForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```csm
csm_configr   r   )codec_configr   Nr   r   text_vocab_sizer   r   r   r   r   r   r   r   r   r   r   codebook_pad_token_idcodebook_eos_token_idr   r   audio_token_idaudio_eos_token_idr   r   r    r!   r"   tie_codebooks_embeddingsr   rM   c                    |j                  dd      rt        d      |%t               | _        t        j                  d       n8t        |t              rt        di || _        nt        |t              r|| _        |0t        j                  d      | _
        t        j                  d       nBt        |t              rt        j                  di || _
        nt        |t              r|| _
        || _        || _        || _        || _        || _        || _        || _        || _        |
| _        || _        || _        || _        || _        ||}|| _        |	| _        || _        || _        || _        || _        || _        || _         ||n| j*                  | j0                  z  | _!        || _"        || _#        || _$        || _%        d| _&        tO        |   di | y )Nr$   Fz9`tie_word_embeddings=True` is not supported for CsmConfigzAdepth_decoder_config is None, using default depth decoder config.mimiz9codec_config is None, using default audio encoder config.r%   ))r&   r'   r	   r   loggerinfo
isinstancerE   r   	for_modelrM   r   rN   r   rQ   rR   rO   rP   rS   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r   r   r   r   r$   r(   r)   )r*   r   r   rN   r   r   r   r   r   r   r   r   r   r   r   rO   rP   r   r   rQ   rR   r   r   r    r!   r"   rS   r   rM   r+   r,   s                                 r-   r)   zCsmConfig.__init__  s   @ ::+U3XYY'(=(?D%KK[\,d3(=(U@T(UD%,.CD(<D% * 4 4V <DKKSTd+ * 4 4 D| DD&67 ,D.*,"4%:"%:"(@%$'>$&!2!2#6  &"5#6 $!2(",!2 $,$8d>N>NRVRjRj>j.(((#( "6"r.   )r/   r1   i  r0   r2      r/   r4   r5   r0   r7   r8   T i      i  Nr[   i NFr9   FNTNN)r:   r;   r<   r=   r>   r?   r@   rA   r   r	   sub_configsrB   rF   rC   rD   r   rE   r)   rG   rH   s   @r-   rJ   rJ      sE   Wr J"O#4"5M" 5K %'!%&,"&(,(**,*+!'.2*.#'!%#),0,-#)#'%+)/MQ&+*- %#04,0$(;U#TzU# $JU# t	U#
 4ZU# :U# :U# !4ZU# !4ZU# $JU# "%tU# !4<U# DjU# $;U# DjU#   #Tz!U#"  #Tz#U#$ Dj%U#& Dj'U#( d
)U#*  $J+U#, ($sN/B*CCdJ-U#. t/U#0 !4<1U#2 +3U#4 *5U#6 #'+7U#8 #Tk9U#: Tk;U# U#r.   rJ   N)configuration_utilsr   modeling_rope_utilsr   utilsr   auto.configuration_autor   
get_loggerr:   rV   r	   rJ   __all__r%   r.   r-   <module>rd      sV     4 1  0 
		H	%G#, G#Tx#  x#x r.   