
    i]I                     Z   d dl mZ d dlZddlmZ ddlmZmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z* ddl+m,Z,m-Z-  e       r
d dl.Z.d dl.m/Z/  ej`                  e1      Z2 G d de$      Z3 G d de#      Z4 G d de&      Z5d.dZ6 G d de(      Z7 G d d e/jp                        Z9 G d! d"e      Z: G d# d$e!      Z; G d% d&e;      Z< G d' d(e       Z= ed)*       G d+ d,e             Z>g d-Z?y)/    )CallableN   )ACT2FN)
AudioInputmake_list_of_audio)Cache)BatchFeature)GradientCheckpointingLayer)BaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringis_torch_availablelogging)can_return_tuplecheck_model_inputs   )&AudioFlamingo3ForConditionalGeneration!AudioFlamingo3MultiModalProjectorAudioFlamingo3PreTrainedModel)AudioFlamingo3ProcessorAudioFlamingo3ProcessorKwargs)GlmRotaryEmbedding)LlamaAttentioneager_attention_forwardrotate_half   )GlmAsrConfigGlmAsrEncoderConfig)nnc                       e Zd Zy)GlmAsrProcessorKwargsN__name__
__module____qualname__     s/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/glmasr/modular_glmasr.pyr$   r$   0       r*   r$   c            	       z     e Zd ZdZ	 	 	 	 d
 fd	ZddZ	 ddeee   z  ez  deee   z  dz  de	e
   defd	Z xZS )GlmAsrProcessora  
    Constructs an GlmAsr processor which wraps an GlmAsr feature extractor and an GlmAsr
    tokenizer into a single processor.

    [`GlmAsrProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
    [`Qwen2TokenizerFast`]. See the [`~GlmAsrProcessor.__call__`] for more information.

    Args:
            feature_extractor ([`WhisperFeatureExtractor`]):
                The feature extractor is a required input.
            tokenizer ([`Qwen2TokenizerFast`]):
                The tokenizer is a required input.
            chat_template (`Optional[str]`, *optional*):
                The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
                template will be used.
            audio_token (`Optional[str]`, *optional*, defaults to `"<|pad|>`"):
                Special token used to represent audio inputs in the chat template.
            default_transcription_prompt (`str`, *optional*, defaults to `"Please transcribe this audio into text"`):
                Default prompt to use for transcription tasks when applying transcription requests.
            max_audio_len (`int`, *optional*, defaults to 655):
                Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
                655 gives approximately 8192 tokens, corresponding to the maximum sequence length of the text model.
    Nc                 0    t         |   ||||||       y )N)chat_templateaudio_tokendefault_transcription_promptmax_audio_len)super__init__)selffeature_extractor	tokenizerr0   r1   r2   r3   	__class__s          r+   r5   zGlmAsrProcessor.__init__L   s)     	'#)E' 	 	
r*   returnc                 d    d}dD ]  \  }}}|d|z  z   |dz
  z
  dz
  |z  dz   } ||z
  |z  dz   }|S )N   )r   r   r   )r   r   r   r   r   r)   )r6   audio_lengthsmerge_factorpaddingkernel_sizestride
num_tokenss          r+   _get_audio_token_lengthz'GlmAsrProcessor._get_audio_token_length^   sc    ,B 	`(G[&*Q[8K!OLqPU[[^__M	` $l2|CaG
r*   audiopromptkwargsc           	         t        |t              r|g}nt        |t        t        f      r |rt	        d |D              rt        |      }nst        t        |            }t               rU|D cg c]J  }t        |t        j                        r,|j                         j                         j                         n|L }}t        |      }|dk(  rt        d      || j                  g|z  }nt        |t              r|g|z  }nt        |t        t        f      r}t        |      |k7  rt        dt        |       d| d      g }|D ]L  }||j                  | j                         !t        |t              r|j                  |       Ct!        d       nt!        d      t#        ||      D 	
cg c](  \  }	}
d	t        |
t              rd
|
dnd
|
dd|	dgdg* }}	}
 | j$                  |fdddd|S c c}w c c}
}	w )a  
        Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

        Args:
            audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
                the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
            prompt (`str` or `list[str]`, *optional*):
                Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
                each sample uses `"Transcribe the input speech."`.
            **kwargs:
                Additional keyword arguments forwarded to [`~AudioFlamingo3Processor.apply_chat_template`] (for example
                `text_kwargs`, `audio_kwargs`, ...).

        Returns:
            [`BatchFeature`]: Processor outputs ready to be passed to [`AudioFlamingo3ForConditionalGeneration.generate`].

        c              3   <   K   | ]  }t        |t                y wN)
isinstancestr).0els     r+   	<genexpr>z>GlmAsrProcessor.apply_transcription_request.<locals>.<genexpr>   s     ?dXZ
2s@S?ds   r   z)`audio` must contain at least one sample.z	Received z prompt(s) for z$ audio sample(s); counts must match.z'Each prompt must be a string or `None`.z<`prompt` must be a string, a sequence of strings, or `None`.userrE   )typepath)rQ   rE   text)rQ   rS   )rolecontentT)tokenizeadd_generation_promptreturn_dict)rK   rL   listtupleallr   r   torchTensordetachcpunumpylen
ValueErrorr2   append	TypeErrorzipapply_chat_template)r6   rE   rF   rG   audio_itemsrN   
batch_sizepromptsitemprompt_text
audio_itemconversationss               r+   apply_transcription_requestz+GlmAsrProcessor.apply_transcription_requestf   s   2 eS!38'Ke}-%C?d^c?d<du+K1%89K!#kvwegJr5<<<Xryy{0668^``ww%
?HII>889JFG$h+Gu.6{j( F}OJ<Gkl  G O<NN4#D#DEc*NN4(#$MNNO Z[[ ,/w+D
 (Z # &j#6 ")*=&-
C!'=	 

 
 (t''
"&	

 
 	
S x4
s   -AG70-G<)Nz<|pad|>z&Please transcribe this audio into texti  )r>   torch.Tensorr:   ro   rJ   )r&   r'   r(   __doc__r5   rD   rL   rY   r   r   r$   r	   rn   __classcell__r9   s   @r+   r.   r.   3   su    8 %M
$ *.O
T#Y+O
 d3i$&O
 ./	O

 
O
r*   r.   c                       e Zd Zy)GlmAsrRotaryEmbeddingNr%   r)   r*   r+   rt   rt      r,   r*   rt   c                 h   |j                  |      }|j                  |      }|j                  d   }| dd |f   | d|d f   }}|dd |f   |d|d f   }
}	||z  t        |      |z  z   }|	|z  t        |	      |z  z   }t        j                  ||gd      }t        j                  ||
gd      }||fS )N.)dim)	unsqueezeshaper   r\   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r+   apply_rotary_pos_embr      s    
--
&C
--
&C2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr*   c                        e Zd Zdedef fdZ	 d
dej                  deej                  ej                  f   dz  de	e
   deej                  ej                  f   fd	Z xZS )GlmAsrAttentionconfig	layer_idxc                 $   t         |   ||       d| _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _
        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j
                  d      | _        y )NFT)bias)r4   r5   	is_causalr"   Linearhidden_sizenum_attention_headshead_dimq_projnum_key_value_headsk_projv_projo_projr6   r   r   r9   s      r+   r5   zGlmAsrAttention.__init__   s    +ii 2 2F4N4NQUQ^Q^4^eijii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^eijii : :T]] JFL^L^eijr*   Nhidden_statesposition_embeddingsrG   r:   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }|\  }	}
t        |||	|
      \  }}t        j                  | j                  j                  t              } || |||fd | j                  sdn| j                  | j                  d|\  }} |j                   g |d j#                         }| j%                  |      }||fS )Nrv   r   r   g        )attention_maskdropoutscaling)ry   r   r   view	transposer   r   r   r   get_interfacer   _attn_implementationr   trainingattention_dropoutr   reshape
contiguousr   )r6   r   r   rG   input_shapehidden_shapequery_states
key_statesvalue_statesr}   r~   attention_interfaceattn_outputattn_weightss                 r+   forwardzGlmAsrAttention.forward   sk    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j(?(M(MKK,,.E)
 %8		%

  #}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r*   rJ   r&   r'   r(   r    intr5   r\   r]   rZ   r   r   r   rq   rr   s   @r+   r   r      s~    k| k k IM!)||!) #5<<#=>E!) +,	!)
 
u||U\\)	*!)r*   r   c                   >     e Zd Z fdZdej
                  fdZ xZS )	GlmAsrMLPc                    t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        |j                     | _
        y rJ   )r4   r5   r"   r   r   intermediate_sizefc1fc2r   
hidden_actact_fnr6   r   r9   s     r+   r5   zGlmAsrMLP.__init__   s\    99V//1I1IJ99V55v7I7IJV../r*   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rJ   )r   r   r   )r6   r   s     r+   r   zGlmAsrMLP.forward  s2    /M2/r*   )r&   r'   r(   r5   r\   r]   r   rq   rr   s   @r+   r   r      s    0U\\ r*   r   c            	            e Zd Zdedef fdZ	 d
dej                  deej                  ej                  f   dz  de	e
   dej                  fd	Z xZS )GlmAsrEncoderLayerr   r   c                    t         |           |j                  | _        t        ||      | _        t        |      | _        t        j                  |j                        | _	        t        j                  |j                        | _
        y )N)r   r   )r4   r5   r   r   	self_attnr   mlpr"   	LayerNorminput_layernormpost_attention_layernormr   s      r+   r5   zGlmAsrEncoderLayer.__init__	  sd    !--()LV$!||F,>,>?(*V5G5G(H%r*   Nr   r   rG   r:   c                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r   r   r)   )r   r   r   r   )r6   r   r   rG   residual_s         r+   r   zGlmAsrEncoderLayer.forward  s     !,,];)4>> 
' 3
 
q
 !=0 !55mD/ =0r*   rJ   r   rr   s   @r+   r   r     sp    I| I I IM|| #5<<#=>E +,	
 
r*   r   c                       e Zd Zy)GlmAsrPreTrainedModelNr%   r)   r*   r+   r   r   +  r,   r*   r   c                   n     e Zd ZU eed<   dZdZdgZee	dZ
def fdZeedee   fd              Z xZS )	GlmAsrEncoderr   input_featuresrE   r   )r   
attentionsc           	         t         |   |       t        j                  |j                  |j
                  dd      | _        t        j                  |j
                  |j
                  ddd      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j
                        | _        t        |      | _        d| _        | j%                          y c c}w )Nr   r   )rA   r@   r   )rA   rB   r@   )r   F)r4   r5   r"   Conv1dnum_mel_binsr   conv1conv2
ModuleListrangenum_hidden_layersr   layersr   normrt   
rotary_embgradient_checkpointing	post_initr   s      r+   r5   zGlmAsrEncoder.__init__9  s     YYv22F4F4FTU_`a
YYv1163E3EST]^hij
mmDI&JbJbDcdy	2d
 LL!3!34	/v>&+# es   DrG   c                    t         j                  j                  | j                  |            }t         j                  j                  | j	                  |            }|j                  dd      }|}| j                  |t        j                  |j                  d   |j                        d d d f         }| j                  D ]  } ||fd|i|} | j                  |      }t        |      S )Nr   r   device)r   r   )last_hidden_state)r"   
functionalgelur   r   r   r   r\   arangery   r   r   r   r   )r6   r   rG   inputs_embedsr   r   encoder_layers          r+   r   zGlmAsrEncoder.forwardF  s     **4::n+EF**4::m+DE%//15%"oo]5H5H5KTaThTh(ijnpqjq(r . 
 "[[ 	lM)-kM`kdjkM	l 		-0)MJJr*   )r&   r'   r(   r!   __annotations__main_input_nameinput_modalities_no_split_modulesr   r   _can_record_outputsr5   r   r   r   r   r   rq   rr   s   @r+   r   r   /  sd    &O-.+%
2  K7I0J K  Kr*   r   c                   $     e Zd Zdef fdZ xZS )GlmAsrMultiModalProjectorr   c                 :   t         |           t        j                  |j                  j
                  |j                  j                  dz        | _        t        j                  |j                  j                  dz  |j                  j                        | _	        y )Nr   )
r4   r5   r"   r   audio_configr   text_configr   linear_1linear_2r   s     r+   r5   z"GlmAsrMultiModalProjector.__init__Z  sm    		&"5"5"G"GI[I[IgIgjkIkl		&"4"4"@"@1"DfFXFXFdFder*   )r&   r'   r(   r    r5   rq   rr   s   @r+   r   r   Y  s    f| f fr*   r   z~
    The GlmAsr model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Llama language model.
    custom_introc                       e Zd Ze ed      dej                  dej                  dee	   de
ez  fd              Z	 	 	 	 	 	 	 	 	 	 	 dd	ej                  dz  dej                  dz  dej                  dz  d
ej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  dedz  dej                  dz  deej                  z  dee	   def fdZ xZS )GlmAsrForConditionalGenerationzgCompute audio embeddings from log-mel input features using the audio encoder and multi-modal projector.r   r   input_features_maskrG   r:   c                 *    | j                   |fddi|}|j                  }|j                  |j                  d   d| j                  j
                  j                        }| j                  |      }|j                  d      }dD ]  \  }}	}
|d|z  z   |	dz
  z
  dz
  |
z  dz   } d}||z
  |z  dz   }t        j                  |j                  d   |j                  	      d d d f   |d d d f   k  }||j                  |j                           |_        |S )
NrX   Tr   rv   r=   r   r   r<   r   )audio_towerr   r   ry   r   r   r   multi_modal_projectorsumr\   r   r   topooler_output)r6   r   r   rG   audio_outputsaudio_hidden_statesaudio_embedsr>   r@   rA   rB   r?   post_lengths
valid_masks                 r+   get_audio_featuresz1GlmAsrForConditionalGeneration.get_audio_featuresf  s<    )((TTTVT+==199  #R)A)A)S)S
 112EF+//3,B 	`(G[&*Q[8K!OLqPU[[^__M	`%4EI\\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
&2:==ATAT3U&V#r*   N	input_idsr   r   past_key_valuesr   labels	use_cachecache_positionlogits_to_keepc                 6    t        |   d|||||||	|
|d	|S )a  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import GlmAsrForConditionalGeneration, AutoProcessor

        >>> model_id = "zai-org/GLM-ASR-Nano-2512"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = GlmAsrForConditionalGeneration.from_pretrained(model_id, dtype="auto", device_map="auto")
        >>> inputs = processor.apply_transcription_request("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3")

        >>> inputs = inputs.to(model.device, dtype=model.dtype)

        >>> outputs = model.generate(**inputs, do_sample=False, max_new_tokens=500)

        >>> decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True)
        >>> print(decoded_outputs)
        ```)	r  r   r   r  r   r  r  r  r  r)   )r4   r   )r6   r  r   r   r   r   r  r   r  r  r  r  rG   r9   s                r+   r   z&GlmAsrForConditionalGeneration.forward  sA    V w 
)%+'))
 
 	
r*   )NNNNNNNNNNr   )r&   r'   r(   r   r   r\   FloatTensorr]   r   r   rZ   r   r   
LongTensorr   boolr   r   r   rq   rr   s   @r+   r   r   `  s{    ~)) #\\ +,	
 
+	+ 4 .23737.204(,26*.!%26-.6
##d*6
 ))D06
 #\\D0	6

 t+6
 &&-6
 6
 ((4/6
   4'6
 $;6
 ((4/6
 ell*6
 +,6
 
 6
 6
r*   r   )r   r   r.   r   )Nr   )@collections.abcr   r`   npactivationsr   audio_utilsr   r   cache_utilsr   feature_extraction_utilsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   r   &audioflamingo3.modeling_audioflamingo3r   r   r   (audioflamingo3.processing_audioflamingo3r   r   glm.modeling_glmr   llama.modeling_llamar   r   r   configuration_glmasrr    r!   r\   r"   
get_loggerr&   loggerr$   r.   rt   r   r   Moduler   r   r   r   r   r   __all__r)   r*   r+   <module>r      s    %  ! 9   4 9 R 5 & T T A 
 n 1 W W C  
		H	% @9 ?B
- B
J 5. 4$*)n *)Z		  3  F @9 ?'K) 'KTf A f 
S
%K S

S
l jr*   