import math
from collections.abc import Callable

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.generic import check_model_inputs
from ..auto import AutoModel, AutoModelForCausalLM
from .configuration_voxtral import VoxtralConfig, VoxtralEncoderConfig


logger = logging.get_logger(__name__)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float | None = None,
    dropout: float = 0.0,
    **kwargs,
):
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None and attention_mask.ndim == 4:
        attn_weights = attn_weights + attention_mask[:, :, :, : key.shape[-2]]

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class VoxtralAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        layer_idx: int | None = None,
        config: VoxtralConfig | None = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        if layer_idx is None and is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        output_attentions: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""
        bsz, tgt_len, _ = hidden_states.size()

        # the query projection is pre-scaled, so `scaling=1.0` is passed to the attention interface below
        query_states = self._shape(self.q_proj(hidden_states) * self.scaling, tgt_len, bsz)
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=1.0,
            output_attentions=output_attentions,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class VoxtralEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = VoxtralAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: bool = False,
    ) -> tuple[torch.Tensor, ...]:
        r"""
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
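
        For instance, an additive mask in this format that hides the last (padding) key position could be
        built as follows (illustrative only; the Voxtral encoder itself passes `attention_mask=None`):

        ```python
        >>> import torch
        >>> mask = torch.zeros(1, 1, 4, 4)
        >>> mask[..., -1] = torch.finfo(torch.float32).min
        ```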
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # clamp fp16 activations to avoid overflow
        if hidden_states.dtype == torch.float16:
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


@auto_docstring
class VoxtralPreTrainedModel(PreTrainedModel):
    config: VoxtralConfig
    base_model_prefix = "model"
    input_modalities = ("audio", "text")
    supports_gradient_checkpointing = True
    _no_split_modules = None
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_attention_backend = True
    _can_compile_fullgraph = True


@auto_docstring(
    custom_intro="""
    The Voxtral encoder, which is a Whisper encoder.
    """
)
class VoxtralEncoder(VoxtralPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`VoxtralEncoderLayer`].

    Args:
        config: VoxtralEncoderConfig
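
    A minimal standalone sketch (illustrative only; it assumes the default config, which mirrors the
    Whisper large-v3 encoder):

    ```python
    >>> import torch
    >>> from transformers import VoxtralEncoder, VoxtralEncoderConfig

    >>> config = VoxtralEncoderConfig()
    >>> encoder = VoxtralEncoder(config)
    >>> mels = torch.randn(1, config.num_mel_bins, 3000)  # 30s of audio as a log mel spectrogram
    >>> encoder(mels).last_hidden_state  # shape (1, 1500, config.d_model)
    ```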
    """

    config: VoxtralEncoderConfig
    main_input_name = "input_features"
    input_modalities = "audio"
    _no_split_modules = ["VoxtralEncoderLayer"]
    _can_record_outputs = {
        "attentions": VoxtralAttention,
        "hidden_states": VoxtralEncoderLayer,
    }

    def __init__(self, config: VoxtralEncoderConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.num_mel_bins = config.num_mel_bins
        self.max_source_positions = config.max_source_positions
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)

        self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
        self.embed_positions.requires_grad_(False)

        self.layers = nn.ModuleList([VoxtralEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layer_norm = nn.LayerNorm(config.d_model)
        self.avg_pooler = nn.AvgPool1d(2, stride=2)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def get_input_embeddings(self) -> nn.Module:
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        self.conv1 = value

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_features,
        attention_mask=None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        r"""
        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default, the silence in the input log mel spectrogram is ignored.
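
        A short illustrative sketch of producing `input_features` with a feature extractor (the checkpoint
        name is only an example; any Voxtral processor repo works the same way):

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("mistralai/Voxtral-Mini-3B-2507")
        >>> waveform = torch.zeros(16000 * 30).numpy()  # 30 seconds of 16 kHz audio (silence here)
        >>> inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
        >>> inputs.input_features  # shape (1, num_mel_bins, 3000) with the default extractor settings
        ```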
        """
        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
        if input_features.shape[-1] != expected_seq_length:
            raise ValueError(
                f"Voxtral expects the mel input features to be of length {expected_seq_length}, but found "
                f"{input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        input_features = input_features.to(dtype=self.conv1.weight.dtype, device=self.conv1.weight.device)
        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
        inputs_embeds = inputs_embeds.permute(0, 2, 1)

        embed_pos = self.embed_positions.weight
        hidden_states = (inputs_embeds + embed_pos).to(inputs_embeds.dtype)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        for idx, encoder_layer in enumerate(self.layers):
            layer_outputs = encoder_layer(hidden_states, None)
            hidden_states = layer_outputs[0]

        hidden_states = self.layer_norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
        )

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers and the output length of the audio encoder
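        (e.g. 3000 mel frames give (3000 - 1) // 2 + 1 = 1500 convolution outputs, which in turn give
        (1500 - 2) // 2 + 1 = 750 encoder output positions)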
        """
        input_lengths = (input_lengths - 1) // 2 + 1
        output_lengths = (input_lengths - 2) // 2 + 1
        return input_lengths, output_lengths


class VoxtralMultiModalProjector(nn.Module):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.linear_1 = nn.Linear(config.audio_config.intermediate_size, config.text_config.hidden_size, bias=False)
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=False)

    def forward(self, audio_features):
        hidden_states = self.linear_1(audio_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


@auto_docstring(
    custom_intro="""
    The Voxtral model, which consists of a Whisper encoder, a multi-modal projector and a Llama language model.
    """
)
class VoxtralForConditionalGeneration(VoxtralPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.text_config.vocab_size
        self.audio_tower = AutoModel.from_config(config.audio_config)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)
        self.multi_modal_projector = VoxtralMultiModalProjector(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    @can_return_tuple
    @auto_docstring(
        custom_intro="This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector."
    )
    def get_audio_features(
        self, input_features: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
    ) -> BaseModelOutputWithPooling:
        r"""
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        """
        audio_outputs = self.audio_tower(input_features, return_dict=True, **kwargs)
        audio_hidden_states = audio_outputs.last_hidden_state
        audio_hidden_states = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
        audio_embeds = self.multi_modal_projector(audio_hidden_states)
        audio_outputs.pooler_output = audio_embeds
        return audio_outputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        input_features: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```"""
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if input_features is not None:
            audio_embeds = self.get_audio_features(input_features, return_dict=True).pooler_output

            # replace the audio token placeholders with the projected audio embeddings
            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
            inputs_embeds = inputs_embeds.masked_scatter(
                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
            )

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        return outputs

    def prepare_inputs_for_generation(self, *args, **kwargs):
        # Overwritten -- `input_features` should only be passed on the first generation step
        input_features = kwargs.pop("input_features", None)
        is_first_iteration = kwargs.get("is_first_iteration", False)

        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)

        if is_first_iteration or not kwargs.get("use_cache", True):
            model_inputs["input_features"] = input_features

        return model_inputs


__all__ = ["VoxtralForConditionalGeneration", "VoxtralPreTrainedModel", "VoxtralEncoder"]