
    iX                     &   d dl mZ d dlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&  e       r
d dl'Z'd dl'm(Z(  G d de(jR                        Z*d Z+de'jX                  de-de'jX                  fdZ.	 d7de(jR                  de'jX                  de'jX                  de'jX                  d e'jX                  dz  d!e/d"e/d#ee   fd$Z0d8d%Z1 ee1       G d& d'e(jR                               Z2 G d( d)e(jR                        Z3 G d* d+e      Z4e G d, d-e             Z5 G d. d/e5      Z6 G d0 d1e(jR                        Z7 ed23       G d4 d5e5e	             Z8g d6Z9y)9    )Callable)Optional   )ACT2FN)Cache)GenerationMixin)use_kernelized_func)GradientCheckpointingLayer)BaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_torch_available)can_return_tuplecheck_model_inputsmaybe_autocast   )	AutoModelAutoModelForCausalLM   )GlmAsrConfigGlmAsrEncoderConfigN)nnc                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )GlmAsrRotaryEmbeddinginv_freqNconfigc                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultr!   F)
persistentoriginal_inv_freq)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr"   rope_parametersr$   compute_default_rope_parametersr   attention_scalingregister_bufferclone)selfr"   devicerope_init_fnr!   	__class__s        t/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/glmasr/modeling_glmasr.pyr)   zGlmAsrRotaryEmbedding.__init__/   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuU    r3   ztorch.deviceseq_lenreturnztorch.Tensorc                 n   | j                   d   }| j                   j                  dd      }t        | dd      xs | j                  | j                  z  }t        ||z        }d}d|t        j                  d|dt        j                        j                  |t        j                  	      |z  z  z  }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorg      ?head_dimNr   r   dtype)r3   r?   )r-   getgetattrhidden_sizenum_attention_headsinttorcharangeint64tofloat)	r"   r3   r8   baser<   r=   dimattention_factorr!   s	            r6   r.   z5GlmAsrRotaryEmbedding.compute_default_rope_parameters?   s    & %%l3 & 6 6 : :;RTW X6:t4h8J8JfNhNh8h(223 U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r7   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r   mpscpuF)device_typeenabledr   rK   r>   )r!   rI   expandshaperH   r3   
isinstancetypestrr   	transposerE   catcosr/   sinr?   )
r2   xposition_idsinv_freq_expandedposition_ids_expandedrQ   freqsembr[   r\   s
             r6   forwardzGlmAsrRotaryEmbedding.forward_   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$N)NNN)__name__
__module____qualname__rE   Tensor__annotations__r   r)   staticmethodr   rD   tuplerI   r.   no_gradr   rc   __classcell__r5   s   @r6   r    r    ,   s    llV| V  &*+/"*t#*(* t* 
~u$	%	* *> U]]_<  <r7   r    c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrN   r   rS   )rU   rE   rZ   )r]   x1x2s      r6   rotate_halfrr   o   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r7   hidden_statesn_repr9   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rU   rT   reshape)rs   rt   batchnum_key_value_headsslenr=   s         r6   	repeat_kvrz   v   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr7   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr   r   rN   )rK   r?   )ptrainingr   )rz   num_key_value_groupsrE   matmulrY   rU   r   
functionalsoftmaxfloat32rH   r?   r   r   
contiguous)r{   r|   r}   r~   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r6   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r7   c                 h   |j                  |      }|j                  |      }|j                  d   }| dd |f   | d|d f   }}|dd |f   |d|d f   }
}	||z  t        |      |z  z   }|	|z  t        |	      |z  z   }t        j                  ||gd      }t        j                  ||
gd      }||fS )NrN   .rS   )	unsqueezerU   rr   rE   rZ   )qkr[   r\   r^   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r6   apply_rotary_pos_embr      s    
--
&C
--
&C2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr7   c                        e Zd ZdZdedef fdZ	 ddej                  de	ej                  ej                  f   dz  de
e   d	e	ej                  ej                  f   fd
Z xZS )GlmAsrAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr"   	layer_idxc                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j
                  d      | _        y )Nr=   g      FT)bias)r(   r)   r"   r   rA   rB   rC   r=   rx   r   r   attention_dropout	is_causalr   Linearq_projk_projv_projo_projr2   r"   r   r5   s      r6   r)   zGlmAsrAttention.__init__   s,   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii 2 2F4N4NQUQ^Q^4^eijii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^eijii : :T]] JFL^L^eijr7   Nrs   position_embeddingsr   r9   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }|\  }	}
t        |||	|
      \  }}t        j                  | j                  j                  t              } || |||fd | j                  sdn| j                  | j                  d|\  }} |j                   g |d j#                         }| j%                  |      }||fS )NrN   r   r           )r   r   r   )rU   r=   r   viewrY   r   r   r   r   get_interfacer"   _attn_implementationr   r   r   r   rv   r   r   )r2   rs   r   r   input_shapehidden_shapequery_statesr   r   r[   r\   attention_interfacer   r   s                 r6   rc   zGlmAsrAttention.forward   sk    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j(?(M(MKK,,.E)
 %8		%

  #}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r7   rd   )re   rf   rg   __doc__r   rD   r)   rE   rh   rk   r   r   rc   rm   rn   s   @r6   r   r      s    Gk| k k" IM!)||!) #5<<#=>E!) +,	!)
 
u||U\\)	*!)r7   r   c                   >     e Zd Z fdZdej
                  fdZ xZS )	GlmAsrMLPc                    t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        |j                     | _
        y rd   )r(   r)   r   r   rB   intermediate_sizefc1fc2r   
hidden_actact_fnr2   r"   r5   s     r6   r)   zGlmAsrMLP.__init__   s\    99V//1I1IJ99V55v7I7IJV../r7   rs   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rd   )r   r   r   )r2   rs   s     r6   rc   zGlmAsrMLP.forward   s2    /M2/r7   )re   rf   rg   r)   rE   rh   rc   rm   rn   s   @r6   r   r      s    0U\\ r7   r   c            	            e Zd Zdedef fdZ	 d
dej                  deej                  ej                  f   dz  de	e
   dej                  fd	Z xZS )GlmAsrEncoderLayerr"   r   c                    t         |           |j                  | _        t        ||      | _        t        |      | _        t        j                  |j                        | _	        t        j                  |j                        | _
        y )N)r"   r   )r(   r)   rB   r   	self_attnr   mlpr   	LayerNorminput_layernormpost_attention_layernormr   s      r6   r)   zGlmAsrEncoderLayer.__init__   sd    !--()LV$!||F,>,>?(*V5G5G(H%r7   Nrs   r   r   r9   c                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)rs   r    )r   r   r   r   )r2   rs   r   r   residual_s         r6   rc   zGlmAsrEncoderLayer.forward   s     !,,];)4>> 
' 3
 
q
 !=0 !55mD/ =0r7   rd   )re   rf   rg   r   rD   r)   rE   rh   rk   r   r   rc   rm   rn   s   @r6   r   r      sp    I| I I IM|| #5<<#=>E +,	
 
r7   r   c                   6    e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZy)GlmAsrPreTrainedModelr"   model)audiotextTr   past_key_valuesN)re   rf   rg   r   ri   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpar   r7   r6   r   r     s4    (&*#*+"3Nr7   r   c                   n     e Zd ZU eed<   dZdZdgZee	dZ
def fdZeedee   fd              Z xZS )	GlmAsrEncoderr"   input_featuresr   r   )rs   
attentionsc           	         t         |   |       t        j                  |j                  |j
                  dd      | _        t        j                  |j
                  |j
                  ddd      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j
                        | _        t        |      | _        d| _        | j%                          y c c}w )Nr   r   )kernel_sizepaddingr   )r   strider   )r"   F)r(   r)   r   Conv1dnum_mel_binsrB   conv1conv2
ModuleListrangenum_hidden_layersr   layersr   normr    
rotary_embgradient_checkpointing	post_initr   s      r6   r)   zGlmAsrEncoder.__init__,  s     YYv22F4F4FTU_`a
YYv1163E3EST]^hij
mmDI&JbJbDcdy	2d
 LL!3!34	/v>&+# es   Dr   c                    t         j                  j                  | j                  |            }t         j                  j                  | j	                  |            }|j                  dd      }|}| j                  |t        j                  |j                  d   |j                        d d d f         }| j                  D ]  } ||fd|i|} | j                  |      }t        |      S )Nr   r   r3   )r^   r   )last_hidden_state)r   r   gelur   r   rY   r   rE   rF   rU   r3   r   r   r   )r2   r   r   inputs_embedsrs   r   encoder_layers          r6   rc   zGlmAsrEncoder.forward9  s     **4::n+EF**4::m+DE%//15%"oo]5H5H5KTaThTh(ijnpqjq(r . 
 "[[ 	lM)-kM`kdjkM	l 		-0)MJJr7   )re   rf   rg   r   ri   main_input_namer   r   r   r   _can_record_outputsr)   r   r   r   r   rc   rm   rn   s   @r6   r   r   "  sd    &O-.+%
2  K7I0J K  Kr7   r   c                   .     e Zd ZdZdef fdZd Z xZS )GlmAsrMultiModalProjectorz
    Audio adaptor (small MLP) that projects GlmAsrEncoder features
    to the LLM embedding space so they can replace `<sound>` tokens.
    r"   c                 j   t         |           t        j                  |j                  j
                  |j                  j                  dz        | _        t        |j                     | _        t        j                  |j                  j                  dz  |j                  j                        | _        y )Nr   )r(   r)   r   r   audio_configr   text_configrB   linear_1r   projector_hidden_actactlinear_2r   s     r6   r)   z"GlmAsrMultiModalProjector.__init__R  s    		&"5"5"G"GI[I[IgIgjkIkl&556		&"4"4"@"@1"DfFXFXFdFder7   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rd   )r   r   r   )r2   audio_featuresrs   s      r6   rc   z!GlmAsrMultiModalProjector.forwardX  s2    n5/m4r7   )re   rf   rg   r   r   r)   rc   rm   rn   s   @r6   r   r   L  s    
f| fr7   r   z~
    The GlmAsr model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Llama language model.
    custom_introc                   *    e Zd ZdZdZdZ fdZd Zd Zd Z	d Z
d Zd Ze ed	
      dej                   dej"                  dee   deez  fd              Zee	 	 	 	 	 	 	 	 	 	 	 ddej.                  dz  dej                   dz  dej"                  dz  dej"                  dz  dej.                  dz  dedz  dej                   dz  dej.                  dz  dedz  dej.                  dz  deej"                  z  dee   defd              Z fdZ xZS )GlmAsrForConditionalGenerationNc                 *   t         |   |       |j                  j                  | _        t	        j
                  |j                        | _        t        j
                  |j                        | _	        t        |      | _        | j                          y rd   )r(   r)   r   
vocab_sizer   from_configr   audio_towerr   language_modelr   multi_modal_projectorr   r   s     r6   r)   z'GlmAsrForConditionalGeneration.__init__i  sn      ,,77$001D1DE2>>v?Q?QR%>v%F" 	r7   c                 6    | j                   j                         S rd   )r
  get_input_embeddingsr2   s    r6   r  z3GlmAsrForConditionalGeneration.get_input_embeddingss  s    ""7799r7   c                 :    | j                   j                  |       y rd   )r
  set_input_embeddings)r2   r~   s     r6   r  z3GlmAsrForConditionalGeneration.set_input_embeddingsv  s    007r7   c                 6    | j                   j                         S rd   )r
  get_output_embeddingsr  s    r6   r  z4GlmAsrForConditionalGeneration.get_output_embeddingsy  s    ""88::r7   c                 :    | j                   j                  |       y rd   )r
  set_output_embeddings)r2   new_embeddingss     r6   r  z4GlmAsrForConditionalGeneration.set_output_embeddings|  s    11.Ar7   c                 :    | j                   j                  |       y rd   )r
  set_decoder)r2   decoders     r6   r  z*GlmAsrForConditionalGeneration.set_decoder  s    ''0r7   c                 6    | j                   j                         S rd   )r
  get_decoderr  s    r6   r  z*GlmAsrForConditionalGeneration.get_decoder  s    ""..00r7   zgCompute audio embeddings from log-mel input features using the audio encoder and multi-modal projector.r  r   input_features_maskr   r9   c                 *    | j                   |fddi|}|j                  }|j                  |j                  d   d| j                  j
                  j                        }| j                  |      }|j                  d      }dD ]  \  }}	}
|d|z  z   |	dz
  z
  dz
  |
z  dz   } d}||z
  |z  dz   }t        j                  |j                  d   |j                  	      d
d
d
f   |d
d
d
f   k  }||j                  |j                           |_        |S )a
  
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padded feature indices.
        return_dictTr   rN   ))r   r   r   )r   r   r   r   r      r   N)r	  r   rv   rU   r"   r   r   r  sumrE   rF   r3   rH   pooler_output)r2   r   r  r   audio_outputsaudio_hidden_statesaudio_embedsaudio_lengthsr   r   r   merge_factorpost_lengths
valid_masks                 r6   get_audio_featuresz1GlmAsrForConditionalGeneration.get_audio_features  s<   ( )((TTTVT+==199  #R)A)A)S)S
 112EF+//3,B 	`(G[&*Q[8K!OLqPU[[^__M	`%4EI\\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
&2:==ATAT3U&V#r7   	input_idsr   r^   r   r   labels	use_cachecache_positionlogits_to_keepc                    | | j                         |      }||| j                  ||d      j                  }|| j                  j                  k(  j                  d      }|j                  |j                  |j                        |j                  |j                              } | j                  d||||||	|
|d|}|S )a  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import GlmAsrForConditionalGeneration, AutoProcessor

        >>> model_id = "zai-org/GLM-ASR-Nano-2512"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = GlmAsrForConditionalGeneration.from_pretrained(model_id, dtype="auto", device_map="auto")
        >>> inputs = processor.apply_transcription_request("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3")

        >>> inputs = inputs.to(model.device, dtype=model.dtype)

        >>> outputs = model.generate(**inputs, do_sample=False, max_new_tokens=500)

        >>> decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True)
        >>> print(decoded_outputs)
        ```T)r  rN   )r   r   r^   r   r*  r+  r,  r-  r   )
r  r(  r   r"   audio_token_idr   masked_scatterrH   r3   r
  )r2   r)  r   r  r   r^   r   r   r*  r+  r,  r-  r   r#  audio_token_maskoutputss                   r6   rc   z&GlmAsrForConditionalGeneration.forward  s    \  7D557	BM%)*?22>CVdh2iwwL !*T[[-G-G GRRSUV)88 ##M$8$89<??=K_K_;`M +>$*=*= 
+
')%+))
+
 
+
 r7   c                     |j                  dd       }|j                  dd       }|j                  d      }t        |   |i |}||d   dk(  r|||d<   |||d<   |S )Nr   r  r,  r   )popr@   r(   prepare_inputs_for_generation)r2   argsr   r   r  r,  model_inputsr5   s          r6   r5  z<GlmAsrForConditionalGeneration.prepare_inputs_for_generation  s      $4d;$jj)>E$45w<dMfM%.*;q*@)1?-.".6I23r7   )NNNNNNNNNNr   )re   rf   rg   _keep_in_fp32_modules_strict_tp_plan_pp_planr)   r  r  r  r  r  r  r   r   rE   FloatTensorrh   r   r   rk   r   r(  
LongTensorr   boolrD   r   rc   r5  rm   rn   s   @r6   r  r  _  s    $( HH:8;B11 ~ ))  #\\  +,	 
 
+	+   D  .23737.204(,26*.!%26-.C##d*C ))D0C #\\D0	C
 t+C &&-C C ((4/C   4'C $;C ((4/C ell*C +,C 
 C  CJ r7   r  )r   r  r   )r   )Nr   ):collections.abcr   typingr   activationsr   cache_utilsr   
generationr   integrationsr	   modeling_layersr
   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   r   autor   r   configuration_glmasrr   r   rE   r   Moduler    rr   rh   rD   rz   rI   r   r   r   r   r   r   r   r   r  __all__r   r7   r6   <module>rO     s  * %  !   ) / 9 R K F & K K Q Q 2 C @<BII @<F(	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%4$ )*2)bii 2) +2)j		  3  F O  'K) 'KT		 & 
^%:O ^
^B Wr7   