
    i-                        d dl Z d dl mZ ddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZ ddlmZmZmZmZ ddlmZ  G d de      Z G d de      Z G d de      Z  ed       G d de             Z! G d dejD                        Z# ed       G d de e             Z$g dZ%y)     N)nn   )ACT2FN)Cache)GenerationMixin)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)Unpack)TransformersKwargsauto_docstringcan_return_tuple)check_model_inputs   )	AutoModelAutoModelForCausalLM)Qwen2AudioAttentionQwen2AudioEncoderQwen2AudioEncoderLayerQwen2AudioPreTrainedModel   )VoxtralConfigc                       e Zd Zy)VoxtralAttentionN__name__
__module____qualname__     u/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/voxtral/modular_voxtral.pyr   r   (       r    r   c                       e Zd Zy)VoxtralEncoderLayerNr   r   r    r!   r$   r$   ,   r"   r    r$   c                        e Zd ZdZdZdZdZdZy)VoxtralPreTrainedModelTN)r   r   r   _supports_flex_attn_supports_cache_class_supports_attention_backend_can_compile_fullgraph_no_split_modulesr   r    r!   r&   r&   0   s      "&!r    r&   z:
    The Voxtral encoder, which is a Whisper encoder.
    custom_introc                   @    e Zd ZeedZe	 ddee   de	e
z  fd       Zy)VoxtralEncoder)
attentionshidden_statesNkwargsreturnc           	         | j                   j                  | j                  j                  d   z  | j                  j                  d   z  }|j
                  d   |k7  r"t        d| d|j
                  d    d| d      |j                  | j                  j                  j                  | j                  j                  j                        }t        j                  j                  | j                  |            }t        j                  j                  | j	                  |            }|j                  ddd	      }| j                  j                  }||z   j                  |j                        }t        j                  j!                  || j                   | j"                  
      }t%        | j&                        D ]  \  }}	 |	||      }
|
d   } | j)                  |      }t+        |      S )a  
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`)`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram are ignored.
        r   z7Voxtral expects the mel input features to be of length z, but found z-. Make sure to pad the input mel features to .)dtypedevicer   r   )ptraining)attention_mask)last_hidden_state)configmax_source_positionsconv1strideconv2shape
ValueErrortoweightr7   r8   r   
functionalgelupermuteembed_positionsdropoutr:   	enumeratelayers
layer_normr	   )selfinput_featuresr;   r2   expected_seq_lengthinputs_embeds	embed_posr1   idxencoder_layerlayer_outputss              r!   forwardzVoxtralEncoder.forwardD   s   & #kk>>ARARSTAUUX\XbXbXiXijkXll#'::IJ]I^^jkykk  AC  lD  kE  Er  sF  rG  GH  I  (**1B1B1H1HQUQ[Q[QbQbQiQi*j**4::n+EF**4::m+DE%--aA6((//	&266}7J7JK--mt||VZVcVc-d"+DKK"8 	-C)-M *!,M	- 6)+
 	
r    N)r   r   r   r   r$   _can_record_outputsr   r   r   tupler	   rV   r   r    r!   r/   r/   9   sK     ',
  ,
 +,	,

 
+	+,
 ,
r    r/   c                   *     e Zd Zdef fdZd Z xZS )VoxtralMultiModalProjectorr=   c                 f   t         |           t        j                  |j                  j
                  |j                  j                  d      | _        t        |j                     | _        t        j                  |j                  j                  |j                  j                  d      | _        y )NF)bias)super__init__r   Linearaudio_configintermediate_sizetext_confighidden_sizelinear_1r   projector_hidden_actactlinear_2rN   r=   	__class__s     r!   r_   z#VoxtralMultiModalProjector.__init__u   sz    		&"5"5"G"GI[I[IgIgnst&556		&"4"4"@"@&BTBTB`B`glmr    c                 l    | j                  |      }| j                  |      }| j                  |      }|S rW   )re   rg   rh   )rN   audio_featuresr1   s      r!   rV   z"VoxtralMultiModalProjector.forward{   s2    n5/m4r    )r   r   r   r   r_   rV   __classcell__rj   s   @r!   r[   r[   t   s    n} nr    r[   zs
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a LLama language model.
    c                       e Zd ZdgZ fdZd Zd Zd Zd Zd Z	d Z
e ed	
      dej                  dee   deez  fd              Zee	 	 	 	 	 	 	 	 	 	 ddej(                  dz  dej                  dz  dej*                  dz  dej(                  dz  dedz  dej                  dz  dej(                  dz  dedz  dej(                  dz  deej*                  z  dee   defd              Z fdZ xZS )VoxtralForConditionalGenerationrI   c                 *   t         |   |       |j                  j                  | _        t	        j
                  |j                        | _        t        j
                  |j                        | _	        t        |      | _        | j                          y rW   )r^   r_   rc   
vocab_sizer   from_configra   audio_towerr   language_modelr[   multi_modal_projector	post_initri   s     r!   r_   z(VoxtralForConditionalGeneration.__init__   sn      ,,77$001D1DE2>>v?Q?QR%?%G" 	r    c                 6    | j                   j                         S rW   )ru   get_input_embeddingsrN   s    r!   ry   z4VoxtralForConditionalGeneration.get_input_embeddings   s    ""7799r    c                 :    | j                   j                  |       y rW   )ru   set_input_embeddings)rN   values     r!   r|   z4VoxtralForConditionalGeneration.set_input_embeddings   s    007r    c                 6    | j                   j                         S rW   )ru   get_output_embeddingsrz   s    r!   r   z5VoxtralForConditionalGeneration.get_output_embeddings   s    ""88::r    c                 :    | j                   j                  |       y rW   )ru   set_output_embeddings)rN   new_embeddingss     r!   r   z5VoxtralForConditionalGeneration.set_output_embeddings   s    11.Ar    c                 :    | j                   j                  |       y rW   )ru   set_decoder)rN   decoders     r!   r   z+VoxtralForConditionalGeneration.set_decoder   s    ''0r    c                 6    | j                   j                         S rW   )ru   get_decoderrz   s    r!   r   z+VoxtralForConditionalGeneration.get_decoder   s    ""..00r    zThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r,   rO   r2   r3   c                      | j                   |fddi|}|j                  }|j                  d| j                  j                  j
                        }| j                  |      }||_        |S )aa  
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        return_dictTr5   )rt   r<   reshaper=   ra   rb   rv   pooler_output)rN   rO   r2   audio_outputsaudio_hidden_statesaudio_embedss         r!   get_audio_featuresz2VoxtralForConditionalGeneration.get_audio_features   sn     )((TTTVT+==199"dkk>V>V>h>hi112EF&2#r    N	input_idsr;   position_idspast_key_valuesrQ   labels	use_cachecache_positionlogits_to_keepc                    | | j                         |      }||| j                  |d      j                  }|| j                  j                  k(  j                  d      }|j                  |j                  |j                        |j                  |j                              } | j                  d|||||||	|
d|}|S )aj  
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```T)r   r5   )r;   r   r   rQ   r   r   r   r   r   )
ry   r   r   r=   audio_token_id	unsqueezemasked_scatterrD   r8   ru   )rN   r   rO   r;   r   r   rQ   r   r   r   r   r2   r   audio_token_maskoutputss                  r!   rV   z'VoxtralForConditionalGeneration.forward   s    b  7D557	BM%)*?22>t2TbbL !*T[[-G-G GRRSUV)88 ##M$8$89<??=K_K_;`M ,?4+>+> 
,
)%+'))
,
 
,
 r    c                     |j                  dd       }|j                  dd      }t        |   |i |}|s|j                  dd      s||d<   |S )NrO   is_first_iterationFr   T)popgetr^   prepare_inputs_for_generation)rN   argsr2   rO   r   model_inputsrj   s         r!   r   z=VoxtralForConditionalGeneration.prepare_inputs_for_generation  s]      $4d;#ZZ(<eDw<dMfMVZZT%B-;L)*r    )
NNNNNNNNNr   )r   r   r   _keep_in_fp32_modules_strictr_   ry   r|   r   r   r   r   r   r   torchFloatTensorr   r   rY   r	   r   
LongTensorTensorr   boolintr
   rV   r   rm   rn   s   @r!   rp   rp      s    %6#6 :8;B11  w#//;ABT;U	+	+ &  .237.204(,26*.!%26-.F##d*F ))D0F t+	F
 &&-F F ((4/F   4'F $;F ((4/F ell*F +,F 
 F  FP r    rp   )r&   r/   rp   )&r   r   activationsr   cache_utilsr   
generationr   modeling_outputsr   r	   r
   processing_utilsr   utilsr   r   r   utils.genericr   autor   r    qwen2_audio.modeling_qwen2_audior   r   r   r   configuration_voxtralr   r   r$   r&   r/   Moduler[   rp   __all__r   r    r!   <module>r      s       !   ) 
 ' I I / 2  1	* 		0 	6  
3
& 3

3
l  
L&<o L
L^ Zr    