
    i6                       d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z	ddl
mc mZ ddl	mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+  e&jX                  e-      Z.d Z/d Z0d Z1de	jd                  de	jd                  fdZ3e e$d       G d de"                    Z4e e$d       G d de"                    Z5ee$ G d d e"                    Z6 G d! d"ejn                        Z8 G d# d$ejn                        Z9 G d% d&ejn                        Z: G d' d(ejn                        Z; G d) d*ejn                        Z< G d+ d,ejn                        Z= G d- d.ejn                        Z> G d/ d0ejn                        Z? G d1 d2ejn                        Z@ G d3 d4e      ZA G d5 d6ejn                        ZB G d7 d8ejn                        ZC G d9 d:ejn                        ZD G d; d<ejn                        ZE	 dcd=ejn                  d>e	jd                  d?e	jd                  d@e	jd                  dAe	jd                  dz  dBeFdCeFfdDZG G dE dFejn                        ZH G dG dHejn                        ZI G dI dJejn                        ZJ G dK dLejn                        ZK G dM dNejn                        ZL G dO dPe      ZM G dQ dRejn                        ZN G dS dTejn                        ZOe$ G dU dVe             ZP G dW dXeP      ZQ e$dY       G dZ d[eP             ZRe$ G d\ d]eP             ZSe$ G d^ d_eP             ZTe$ G d` daeP             ZUg dbZVy)dzPyTorch CLAP model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forwardmeshgrid)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int   )ClapAudioConfig
ClapConfigClapTextConfigc                     | j                   \  }}}| dddddddf   j                  dd|d      }|j                  |||z  |      }|S )ae  
    Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN.

    Args:
        hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)):
            Input hidden states
        ratio (`int`):
            The ratio of the length of the output to the length of the input.
    Nr   )shaperepeatreshape)hidden_statesratio
batch_sizetime_lengthclasses_num	upsampleds         p/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/clap/modeling_clap.pyinterpolater(   -   sX     .;-@-@*ZkaD!m,33Aq%CI!!*kE.A;OI    c                     | j                   \  }}}}| j                  |||z  |||z  ||      } | j                  dddddd      j                         j                  d|||      }|S )aR  
    Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size,
    num_channels)`

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
            Input hidden states
        window_size (`int`):
            Window size
    r   r   r            r   viewpermute
contiguous)r!   window_sizer#   heightwidthnum_channelswindowss          r'   window_partitionr8   >   s}     /<.A.A+J|!&&Fk);8Lk[gM ##Aq!Q15@@BGGKYdfrsGNr)   c                     | j                   d   }| j                  d||z  ||z  |||      } | j                  dddddd      j                         j                  d|||      } | S )a  
    Merges windows to produce higher resolution features.
    Args:
        windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
            Input windows
        window_size (`int`):
            Window size
        height (`int`):
            Height of the resized audio
        width (`int`):
            Width of the resized audio
    r.   r   r   r   r+   r,   r-   r/   )r7   r3   r4   r5   r6   s        r'   window_reverser:   S   sn     ==$Lll2v4e{6JKYdfrsGooaAq!Q/::<AA"feUabGNr)   logitsreturnc                     t        j                  t        |       | j                        }t        j
                  j                  | |      S )Ndevice)torcharangelenr?   r   
functionalcross_entropy)r;   labelss     r'   contrastive_lossrF   h   s1    \\#f+fmm<F==&&vv66r)   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)ClapTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedslast_hidden_state.r!   
attentions)__name__
__module____qualname____doc__rJ   r@   FloatTensor__annotations__rK   r!   tuplerL    r)   r'   rI   rI   m   sr    
 -1K""T)026u((4/6:>M5**C/047>7;Je'',-4;r)   rI   zT
    ClapAudio model output to mimic the output of the original implementation.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)ClapAudioModelOutputz
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        The Audio embeddings obtained by applying the projection layer to the pooler_output.
    Naudio_embedsrK   .r!   rL   )rM   rN   rO   rP   rW   r@   rQ   rR   rK   r!   rS   rL   rT   r)   r'   rV   rV      sr    
 .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r)   rV   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)
ClapOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for audio-text similarity.
    logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
        The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
        The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapTextModel`].
    audio_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapAudioModel`].
    Nlosslogits_per_audiologits_per_textrJ   rW   text_model_outputaudio_model_outputr<   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r]   r^   N)getattrto_tuple).0kselfs     r'   	<genexpr>z&ClapOutput.to_tuple.<locals>.<genexpr>   s=      
  KKDGQXY]_`QaQjQjQll
s   -0)rS   keysre   s   `r'   rb   zClapOutput.to_tuple   s#     
YY[
 
 	
r)   )rM   rN   rO   rP   rZ   r@   rQ   rR   r[   r\   rJ   rW   r]   r   r^   rS   r   rb   rT   r)   r'   rY   rY      s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148185929
%* 
r)   rY   c                   *     e Zd ZdZd fd	Zd Z xZS )ClapDropPathz
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly
    refactored version of the `SwinDropPath` implementation.
    c                 0    t         |           || _        y N)super__init__	drop_prob)re   ro   	__class__s     r'   rn   zClapDropPath.__init__   s    "r)   c                 J   | j                   dk(  s| j                  s|S d| j                   z
  }|j                  d   fd|j                  dz
  z  z   }|t	        j
                  ||j                  |j                        z   }|j                          |j                  |      |z  }|S )N        r   r   )r   dtyper?   )
ro   trainingr   ndimr@   randrt   r?   floor_div)re   r!   	keep_probr   random_tensoroutputs         r'   forwardzClapDropPath.forward   s    >>S   &	$$Q')DM4F4F4J,KK!EJJuM<O<OXeXlXl$mm""9-=r)   rl   )rM   rN   rO   rP   rn   r}   __classcell__rp   s   @r'   rj   rj      s    
#r)   rj   c                   .     e Zd ZdZdef fdZd Z xZS )ClapAudioAFFBlockz
    ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement
    the 1D version.
    configc                    t         |           |j                  }|j                  }t	        ||z        }t        j                  t        j                  ||ddd      t        j                  |      t        j                  d      t        j                  ||ddd      t        j                  |            | _
        t        j                  t        j                  d      t        j                  ||ddd      t        j                  |      t        j                  d      t        j                  ||ddd      t        j                  |            | _        t        j                         | _        y )Nr   r   kernel_sizestridepaddingT)inplace)rm   rn   patch_embeds_hidden_sizeaff_block_rintr   
SequentialConv2dBatchNorm2dReLU	local_attAdaptiveAvgPool2d
global_attSigmoidsigmoid)re   r   channelsdownsize_ratiointer_channelsrp   s        r'   rn   zClapAudioAFFBlock.__init__   s   22++X78IIhAaQRSNN>*GGD!IInhAaQRSNN8$
 --  #IIhAaQRSNN>*GGD!IInhAaQRSNN8$
 zz|r)   c                     ||z   }| j                  |      | j                  |      z   }| j                  |      }d|z  |z  d|z  d|z
  z  z   }|S )Nr+   r   )r   r   r   )re   r!   residualattention_inputfused_layer_outputr|   s         r'   r}   zClapAudioAFFBlock.forward   sb    '(2!^^O<t?__!\\*<=]"%77!h,!N`J`:aar)   rM   rN   rO   rP   r   rn   r}   r~   r   s   @r'   r   r      s    
$ $0r)   r   c                   0     e Zd ZdZdef fdZddZ xZS )ClapAudioPatchEmbedz
    This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
    Transformer block.
    r   c                    t         |           t        |j                  t              r|j                  |j                  fn|j                  }t        |j
                  t              r|j
                  |j
                  fn|j
                  }t        |j                  t              r|j                  |j                  fn|j                  }|| _        || _        |d   |d   z  |d   |d   z  f| _        | j                  d   | j                  d   z  | _	        |j                  | _        |j                  | _        |d   |d   z
  dz  |d   |d   z
  dz  f}| j                  r|j                  dk(  rdnd}t        j                  |j                   |z  |j"                  |||      | _        |j&                  rt        j(                  |j"                        nt        j*                         | _        | j                  rZt/        |      | _        t        j                  |j                   |j"                  |d   |d   dz  f|d   |d   dz  f|      | _        y y )Nr   r   r+   channel_mapr,   r   r   )rm   rn   
isinstance	spec_sizer   
patch_sizepatch_strideimg_size	grid_sizenum_patchesflatten_patch_embedsflattenenable_fusionfusion_typer   r   patch_embed_input_channelsr   projenable_patch_layer_norm	LayerNormIdentitynormr   fusion_model
mel_conv2d)re   r   r   r   r   r   scale_factorrp   s          r'   rn   zClapAudioPatchEmbed.__init__   s+   ;EfFVFVX[;\F$$f&6&67bhbrbr6@ARARTW6XV 1 12^d^o^o 	 ;EVEXEXZ]:^V  &"5"56djdwdw 	 !("1+a8(1+VW:XY>>!,t~~a/@@22#11qMLO39JqMLYZO<[`a;ab!//f6H6HM6Yq`aII--<++"
	 FLEcEcBLL!@!@Aikititiv	 1& 9D ii11//']JqMA,=>$Qa1)<=DO r)   c                    | j                   r|d d ddd d d d f   }|j                  \  }}}}|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  |      }|j                  d      }t        |      dkD  r||dd d d d d f   j                         }	|	j                  \  }}}}|	j                  ||z  d||      }	| j                  |	      }	|	j                  \  }
}}}|	j                  |||||      }	|	j                  d      j                         j                  d	      }	|	j                  d      }t        j                  j                  j                  |	d||z
  fd
d      }	| j!                  ||   |	      ||<   |}nx|j                  \  }
}
}}|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  |      }| j                  r!|j                  d      j#                  dd      }| j%                  |      }|S )Nr   r   zInput audio size (*z) doesn't match model (z).r.   )r   r+   r   r   r,   r   constantr+   )r   r   r   
ValueErrorr   sizerB   r2   r0   r   r1   r   r@   r   rC   padr   	transposer   )re   r!   is_longer_idxglobal_hidden_statesr#   r6   r4   r5   output_widthlocal_hidden_states_featureslocal_widths                r'   r}   zClapAudioPatchEmbed.forward*  s   #0AaCA#>  7K6P6P3Jfeq))UdmmA6F-F (%8OPTP]P]^_P`Oaabcgcpcpqrcsbttvw  $(99-A#B /44R8L=!A%&3M12q!4K&L&W&W&Y#:M:S:S7
L&%&9&>&>zL?XZ[]cej&k#&*oo6I&J#-@-F-F*8VU&9&>&>z<Yacikp&q#&9&A&A/&R&]&]&_&g&ghi&j#166r:&+hh&9&9&=&='!\K-G)H*VW'# 7;6G6G(79L7$]3 1M"/"5"5Aq&%q))UdmmA6F-F (%8OPTP]P]^_P`Oaabcgcpcpqrcsbttvw  !IIm4M<<)11!4>>q!DM		-0r)   rl   r   r   s   @r'   r   r      s    
( (T/r)   r   c            
            e Zd Z fdZ	 	 d	dej
                  dej                  dz  dedz  deej
                     fdZ	d Z
 xZS )
ClapAudioSelfAttentionc                    t         |           ||z  dk7  rt        d| d| d      || _        t	        ||z        | _        | j                  | j
                  z  | _        t        |t        j                  j                        r|n||f| _        t        j                  t        j                  d| j                  d   z  dz
  d| j                  d   z  dz
  z  |            | _        | j#                  d| j%                                t        j&                  | j                  | j                  |j(                        | _        t        j&                  | j                  | j                  |j(                        | _        t        j&                  | j                  | j                  |j(                        | _        t        j0                  |j2                        | _        y )	Nr   The hidden size (6) is not a multiple of the number of attention heads ()r+   r   relative_position_indexbias)rm   rn   r   num_attention_headsr   attention_head_sizeall_head_sizer   collectionsabcIterabler3   r   	Parameterr@   zerosrelative_position_bias_tableregister_buffercreate_relative_position_indexLinearqkv_biasquerykeyvalueDropoutattention_probs_dropout_probdropoutre   r   dim	num_headsr3   rp   s        r'   rn   zClapAudioSelfAttention.__init__^  s   ?a#C5(^_h^iijk  $- #&sY#7 !558P8PP%k;??3K3KLKS^`kRl 	 -/LLKKT--a0014T=M=Ma=P9PST9TUW`a-
) 	68[8[8]^YYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr)   Nr!   attention_maskoutput_attentionsr<   c                    |j                   \  }}}||d| j                  f}| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
t        j                  ||	j	                  dd            }|t        j                  | j                        z  }| j                  | j                  j                  d         }|j                  | j                  d   | j                  d   z  | j                  d   | j                  d   z  d      }|j                  ddd      j                         }||j!                  d      z   }|r|j                   d   }|j                  ||z  || j"                  ||      }||j!                  d      j!                  d      z   }|j                  d| j"                  ||      }t$        j&                  j)                  |d      }| j+                  |      }t        j                  ||
      }|j                  dddd      j                         }|j-                         d d | j.                  fz   }|j                  |      }|r||f}|S |f}|S )Nr.   r   r+   r   r   r   )r   r   r   r0   r   r   r   r@   matmulmathsqrtr   r   r3   r1   r2   	unsqueezer   r   rC   softmaxr   r   r   )re   r!   r   r   r#   r   r6   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                     r'   r}   zClapAudioSelfAttention.forwardx  s    )6(;(;%
C"CT-E-EFjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<Y5H5HR5PQ+dii8P8P.QQ!%!B!B4C_C_CdCdegCh!i!7!<!<Q$"2"21"55t7G7G7JTM]M]^_M`7`bd"
 "8!?!?1a!H!S!S!U+.D.N.Nq.QQ%'--a0J/44j(*d6N6NPSUX   0.2J2J12M2W2WXY2ZZ/44R9Q9QSVX[\ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r)   c                 r   t        j                  | j                  d         }t        j                  | j                  d         }t        j                  t	        ||gd            }t        j
                  |d      }|d d d d d f   |d d d d d f   z
  }|j                  ddd      j                         }|d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d dfxx   d| j                  d   z  dz
  z  cc<   |j                  d      }|S )Nr   r   ij)indexingr+   r.   )	r@   rA   r3   stackr   r   r1   r2   sum)re   coords_hcoords_wcoordscoords_flattenrelative_coordsr   s          r'   r   z5ClapAudioSelfAttention.create_relative_position_index  s)   << 0 0 34<< 0 0 34Xx&:TJKvq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"9&&r)   NF)rM   rN   rO   rn   r@   TensorrQ   boolrS   r}   r   r~   r   s   @r'   r   r   ]  s^    G: 48).	1||1 ))D01  $;	1
 
u||	1f'r)   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapAudioSelfOutputc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rl   )rm   rn   r   r   denser   r   r   re   r   r   rp   s      r'   rn   zClapAudioSelfOutput.__init__  s6    YYsC(
zz&"E"EFr)   r!   input_tensorr<   c                 J    | j                  |      }| j                  |      }|S rl   r  r   re   r!   r	  s      r'   r}   zClapAudioSelfOutput.forward  s$    

=1]3r)   rM   rN   rO   rn   r@   r  r}   r~   r   s   @r'   r  r    s2    G
U\\  RWR^R^ r)   r  c            
            e Zd Z fdZ	 	 ddej
                  dej                  dz  dedz  deej
                     fdZ	 xZ
S )	ClapAudioAttentionc                 j    t         |           t        ||||      | _        t	        ||      | _        y rl   )rm   rn   r   re   r  r|   r   s        r'   rn   zClapAudioAttention.__init__  s.    *63	;O	)&#6r)   Nr!   r   r   r<   c                 h    | j                  |||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   re   r|   )re   r!   r   r   self_outputsattention_outputr   s          r'   r}   zClapAudioAttention.forward  sE     yy@QR;;|AF#%QR(88r)   r  rM   rN   rO   rn   r@   r  rQ   r  rS   r}   r~   r   s   @r'   r  r    sW    7 48).		||	 ))D0	  $;		
 
u||		r)   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapAudioIntermediatec                    t         |           t        j                  |t	        |j
                  |z              | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y rl   )rm   rn   r   r   r   	mlp_ratior  r   
hidden_actstrr	   intermediate_act_fnr  s      r'   rn   zClapAudioIntermediate.__init__  sa    YYsC(8(83(>$?@
f''-'-f.?.?'@D$'-'8'8D$r)   r!   r<   c                 J    | j                  |      }| j                  |      }|S rl   r  r  re   r!   s     r'   r}   zClapAudioIntermediate.forward  &    

=100?r)   r  r   s   @r'   r  r    #    9U\\ ell r)   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapAudioOutputc                     t         |           t        j                  t	        |j
                  |z        |      | _        t        j                  |j                        | _	        y rl   )
rm   rn   r   r   r   r  r  r   hidden_dropout_probr   r  s      r'   rn   zClapAudioOutput.__init__  sF    YYs6#3#3c#9:C@
zz&"<"<=r)   r!   r<   c                 J    | j                  |      }| j                  |      }|S rl   r  r  s     r'   r}   zClapAudioOutput.forward  s$    

=1]3r)   r  r   s   @r'   r#  r#    s#    >
U\\ ell r)   r#  c                        e Zd Zd fd	Zd Zd Zd Z	 	 ddej                  de	e
e
f   dedz  d	edz  d
e	ej                  ej                  f   f
dZ xZS )ClapAudioLayerc                    t         |           |j                  | _        || _        |j                  | _        || _        t        j                  ||j                        | _	        t        |||| j                        | _        |dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t!        ||      | _        t%        ||      | _        y )Neps)r3   rr   )rm   rn   chunk_size_feed_forward
shift_sizer3   input_resolutionr   r   layer_norm_epslayernorm_beforer  	attentionrj   r   	drop_pathlayernorm_afterr  intermediater#  r|   )re   r   r   r.  r   drop_path_rater-  rp   s          r'   rn   zClapAudioLayer.__init__  s    '-'E'E$$!-- 0 "Sf6K6K L+FCPTP`P`a9G#9Mn5SUS^S^S`!||CV5J5JK1&#>%fc2r)   c                    t        |      | j                  k  rgt        d      | _        t        j
                  j                         r(t	        j                   t	        j                  |            n
t        |      | _        y y Nr   )minr3   r   r-  r@   jit
is_tracingtensor)re   r.  s     r'   set_shift_and_window_sizez(ClapAudioLayer.set_shift_and_window_size  s\     D$4$44'lDO=BYY=Q=Q=S		%,,'789Y\]mYn  5r)   c           	         | j                   dkD  rht        j                  d||df||      }t        d| j                         t        | j                   | j                          t        | j                    d       f}t        d| j                         t        | j                   | j                          t        | j                    d       f}d}|D ]  }	|D ]  }
||d d |	|
d d f<   |dz  }  t        || j                        }|j                  d| j                  | j                  z        }|j                  d      |j                  d      z
  }|j                  |dk7  d      j                  |dk(  d      }|S d }|S )Nr   r   rs   r.   r+   g      Yrr   )	r-  r@   r   slicer3   r8   r0   r   masked_fill)re   r4   r5   rt   r?   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks                r'   get_attn_maskzClapAudioLayer.get_attn_mask  s   ??Q{{Avua#8fUHa$***+t'''$//)9:t&-M a$***+t'''$//)9:t&-L
 E - #/ K@EHQk1<=QJE
 ,Hd6F6FGL',,R1A1ADDTDT1TUL$..q1L4J4J14MMI!--i1nfEQQR[_`R`befI  Ir)   c                     | j                   || j                   z  z
  | j                   z  }| j                   || j                   z  z
  | j                   z  }ddd|d|f}t        j                  j                  ||      }||fS r7  )r3   r   rC   r   )re   r!   r4   r5   	pad_right
pad_bottom
pad_valuess          r'   	maybe_padzClapAudioLayer.maybe_pad+  s    %%0@0@(@@DDTDTT	&&$2B2B)BBdFVFVV
Ay!Z8
))-Dj((r)   r!   input_dimensionsr   Nalways_partitionr<   c                    |s| j                  |       n	 |\  }}|j                         \  }}}	|}
| j                  |      }|j                  ||||	      }| j	                  |||      \  }}|j
                  \  }}}}| j                  dkD  r1t        j                  || j                   | j                   fd      }n|}t        || j                        }|j                  d| j                  | j                  z  |	      }| j                  |||j                  |j                        }| j                  |||      }|d   }|j                  d| j                  | j                  |	      }t        || j                  ||      }| j                  dkD  r/t        j                  || j                  | j                  fd      }n|}|d   dkD  xs |d   dkD  }|r|d d d |d |d d f   j!                         }|j                  |||z  |	      }|
| j#                  |      z   }| j%                  |      }| j'                  |      }|| j)                  |      z   }|r	||d	   f}|S |f}|S )
Nr   )r   r+   )shiftsdimsr.   rs   )r   r   r-   r   )r<  r   r0  r0   rM  r   r-  r@   rollr8   r3   rH  rt   r?   r1  r:   r2   r2  r3  r4  r|   )re   r!   rN  r   rO  r4   r5   r#   r   r   shortcutrL  
height_pad	width_padshifted_hidden_stateshidden_states_windowsrG  attention_outputsr  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                           r'   r}   zClapAudioLayer.forward2  s     **+;<("/"4"4"6
Ax --m<%**:vuhO %)NN=&%$P!z&3&9&9#:y!??Q$)JJ}tFVY]YhYhXhEipv$w!$1! !11FHXHX Y 5 : :2t?O?ORVRbRb?bdl m&&	)<)<EZEaEa ' 
	 !NN+@)_pNq,Q/,11"d6F6FHXHXZbc():D<L<LjZcd ??Q %

?DOOUYUdUdCelr s /]Q&;*Q-!*;
 1!WfWfufa2G H S S U-22:v~xX 4>>2C#DD++M:((6$t{{<'@@@Q'8';< YeWfr)   )rr   r   FF)rM   rN   rO   rn   r<  rH  rM  r@   r  rS   r   r  r}   r~   r   s   @r'   r(  r(    sz    38) */(->||>  S/>  $;	>
 +> 
u||U\\)	*>r)   r(  c                        e Zd Z fdZ	 	 d	dej
                  deeef   dedz  dedz  deej
                     f
dZ	 xZ
S )
ClapAudioStagec                 h   t         	|           || _        || _        t	        j
                  t        |      D cg c]-  }t        ||||||   |dz  dk(  rdn|j                  dz        / c}      | _	        |& |||t        j                        | _        d| _        y d | _        d| _        y c c}w )Nr+   r   )r   r   r.  r   r5  r-  )r   
norm_layerF)rm   rn   r   r   r   
ModuleListranger(  r3   blocksr   
downsamplepointing)
re   r   r   r.  depthr   r2  rg  irp   s
            r'   rn   zClapAudioStage.__init__u  s    mm u
  !%5'#,Q<%&UaZqf6H6HA6M

 !()9sr||\DO  #DO'
s   2B/r!   rN  r   NrO  r<   c                    |\  }}t        | j                        D ]  \  }} |||||      }	|	d   } |}
| j                  )|dz   dz  |dz   dz  }}||||f}| j                  |
|      }n||||f}||
|f}|r|	dd  z  }|S )Nr   r   r+   )	enumeraterf  rg  )re   r!   rN  r   rO  r4   r5   rj  layer_moduler^  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledoutput_dimensionsstage_outputss                  r'   r}   zClapAudioStage.forward  s     )(5 	-OA|(8HJ[]mnM)!,M	-
 -:)??&5;aZA4EPQ	VWGW 1!'0BDU V OO,MO_`M!' >&(IK\]]12..Mr)   r_  )rM   rN   rO   rn   r@   r  rS   r   r  r}   r~   r   s   @r'   ra  ra  t  sb    < */(-||  S/  $;	
 + 
u||	r)   ra  c            	            e Zd ZdZej
                  fdee   dedej                  ddf fdZ	d Z
d	ej                  d
eeef   dej                  fdZ xZS )ClapAudioPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    r.  r   rc  r<   Nc                     t         |           || _        || _        t	        j
                  d|z  d|z  d      | _         |d|z        | _        y )Nr,   r+   Fr   )rm   rn   r.  r   r   r   	reductionr   )re   r.  r   rc  rp   s       r'   rn   zClapAudioPatchMerging.__init__  sI     01s7AG%@q3w'	r)   c                     |dz  dk(  xs |dz  dk(  }|r.ddd|dz  d|dz  f}t         j                  j                  ||      }|S )Nr+   r   r   )r   rC   r   )re   input_featurer4   r5   
should_padrL  s         r'   rM  zClapAudioPatchMerging.maybe_pad  sU    qjAo:519>
Q519a!<JMM--mZHMr)   rx  rN  c                    |\  }}|j                   \  }}}|j                  ||||      }| j                  |||      }|d d dd ddd dd d f   }|d d dd ddd dd d f   }	|d d dd ddd dd d f   }
|d d dd ddd dd d f   }t        j                  ||	|
|gd      }|j                  |dd|z        }| j                  |      }| j                  |      }|S )Nr   r+   r   r.   r,   )r   r0   rM  r@   catr   rv  )re   rx  rN  r4   r5   r#   r   r6   input_feature_0input_feature_1input_feature_2input_feature_3s               r'   r}   zClapAudioPatchMerging.forward  s   ((5(;(;%
C%**:vulS}feD'14a4Aq(89'14a4Aq(89'14a4Aq(89'14a4Aq(89		?O_Ve"fhjk%**:r1|;KL		-0}5r)   )rM   rN   rO   rP   r   r   rS   r   Modulern   rM  r@   r  r}   r~   r   s   @r'   rt  rt    sr    
 XZWcWc (s (# (299 (hl (U\\ U3PS8_ Y^YeYe r)   rt  c                        e Zd Z fdZd Z	 	 	 	 	 	 ddej                  dz  dedz  dedz  dedz  dedz  d	edz  d
ee	z  fdZ
 xZS )ClapAudioEncoderc                    t         |           t        |j                        | _        || _        t        |      | _        |j                  | _        | j                  j                  | _	        |j                  | _
        |j                  |j                  z  | _        t        |j                  d| j                  dz
  z  z        | _        t!        j"                  d|j$                  t'        |j                        d      D cg c]  }|j)                          }}| j                  j*                  }t-        | j                        D cg c]  }|d   d|z  z  |d   d|z  z  f c}| _        t1        j2                  t-        | j                        D cg c]  }t5        |t        |j                  d|z  z        | j.                  |   |j                  |   |j6                  |   |t'        |j                  d |       t'        |j                  d |dz           || j                  dz
  k  rt8        nd        c}      | _        d| _        t1        j>                  |j                        | _         t1        jB                  | j                        | _"        |j                  | _        t1        jF                  d      | _$        y c c}w c c}w c c}w )Nr+   r   r   cpur>   )r   r   r.  ri  r   r2  rg  F)%rm   rn   rB   depths
num_layersr   r   patch_embedr   r   r   num_mel_bins
freq_ratior   r   num_featuresr@   linspacer5  r   itemr   re  input_resolutionsr   rd  ra  r   rt  layersgradient_checkpointingr   
batch_normr   r   AdaptiveAvgPool1davgpool)re   r   xr5  r   rj  i_layerrp   s          r'   rn   zClapAudioEncoder.__init__  sW   fmm,.v6#11 ,,99)) **f.A.AA ? ?!Z[H[B\ \],1NN1f>S>SUXY_YfYfUgpu,vwq!&&(ww$$..	\abfbqbq\r!sWX9Q<AqD#99Q<AqD;Q"R!smm  %T__5  !F;;ajHI%)%;%;G%D --0$88A,Sx1H-ICPVP]P]^k`gjk`kPlLmn9@4??UVCV9V4]a
 ',#..)<)<=LL!2!23	mm++A.3 x "ts   J<KB#Kc                    |j                   \  }}}}t        | j                  | j                  z        }| j                  | j                  z  }||kD  s||kD  rt	        d      ||k  r%t
        j                  j                  |||fdd      }||k  r%t
        j                  j                  |||fdd      }|j                   \  }}}	}
|j                  ||| j                  z  |	| j                  z  |
      }|j                  dddd      j                         }|j                  |||
| j                  z  |	| j                  z        }|S )	z
        The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel
        should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
        z@the wav size should be less than or equal to the swin input sizebicubicT)modealign_cornersr   r   r   r+   )r   r   r   r  r   r   rC   r(   r    r1   r2   )re   normalized_input_featuresr   r$   freq_length
spec_widthspec_heightbatchr   timefreqs              r'   reshape_mel2imgz ClapAudioEncoder.reshape_mel2img  s`   
 *C)H)H&1k;$//9:
nn7#{['@_`` #(*(A(A)J+D9dh )B )% $(*(A(A)K+EIei )B )% '@&E&E#xt %>$E$E8doo-tt/F%
! %>$E$EaAq$Q$\$\$^!$=$E$E8TDOO3TT__5L%
! )(r)   N	is_longerr   output_hidden_states(output_hidden_states_before_downsamplingrO  return_dictr<   c                    |j                  dd      }| j                  |      }|j                  dd      }d }	| j                  r6|j                  |j                        }
t        j                  |
dk(        d   }	| j                  |      }|j                  d   }| j                  ||	      }|rdnd }|rdnd }|rdnd }| j                  d   }|rE|j                  \  }}} |j                  |g|| }|j                  dddd      }||fz  }||fz  }t        | j                        D ]  \  }}| j                  |   } |||||      }|d   }|d   }|d   }|d   |d   f}|rP|rN|j                  \  }}} |j                  |g|d   |d   f| }|j                  dddd      }||fz  }||fz  }nI|rG|sE|j                  \  }}} |j                  |g|| }|j                  dddd      }||fz  }||fz  }|s||dd  z  } | j                  |      }|j                  \  }}}|dt!        | j"                        dz
  z  z  | j$                  d   z  }|dt!        | j"                        dz
  z  z  | j$                  d   z  }|j                  ddd      j'                         j)                  ||||      }|j                  \  }}}}|| j*                  z  } |j)                  |||| z  | |      }|j                  ddddd      j'                         j)                  ||| d      }| j-                  t        j.                  |d            }!t        j.                  |!d      }!|st1        d	 ||!||fD              S t3        ||!||
      S )Nr   r   r   r+   rT   r   r.   r,   c              3   $   K   | ]  }|| 
 y wrl   rT   )rc   vs     r'   rf   z+ClapAudioEncoder.forward.<locals>.<genexpr>  s      	 = 	s   rK   pooler_outputr!   rL   )r   r  r   tor?   r@   wherer  r   r  r  r0   r1   rl  r  r   rB   r  r   r2   r    r  r  r   rS   r   )"re   input_featuresr  r   r  r  rO  r  r  is_longer_list_idxis_longer_listr!   
frames_numall_hidden_statesall_reshaped_hidden_statesall_self_attentionsrN  r#   r   hidden_sizereshaped_hidden_staterj  rm  r^  rn  rq  rK   
n_channels
freq_shapetemporal_shapen_frequenciesn_temp
c_freq_binlatent_outputs"                                     r'   r}   zClapAudioEncoder.forward/  sh    (11!Q7$(OON$C!$=$G$G1$M!!&\\.*?*?@N!&^q-@!A!!D,,-FG"((+
((8JK"6BD+?RT"$5b411!4)6)<)<&J;$6M$6$6z$bDT$bVa$b!$9$A$A!Q1$M!-!11&+@*BB&(5 	9OA|#55a8(8HJ[]mnM)!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
A{ )O(I(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*/D.FF*%.V-:-@-@*
A{(:(:(::(fHX(fZe(f%(=(E(EaAq(Q%!m%55!*/D.FF* #}QR'88#?	9B !IIm4$5$;$;!
AzA#dkk*:Q*>$?@DDUDUVWDXX
#c$++.>.B(CDHYHYZ[H\\ %%aA.99;CCJPZ\fhvw 	 9J8O8O5
Jv"doo5
-55
MZ$?V
 %%aAq!4??AII*V`blnpq 	 U]]3Da%HImQ7 	 &!.'		 	 	 */'4*	
 	
r)   )NFFFFT)rM   rN   rO   rn   r  r@   rQ   r  rS   rV   r}   r~   r   s   @r'   r  r    s    &/P")N /3).,1@E(-#'p
 $$t+p
  $;	p

 #Tkp
 37+p
 +p
 D[p
 
%	%p
r)   r  c                   0     e Zd Zdeez  f fdZd Z xZS )ClapProjectionLayerr   c                     t         |           || _        |j                  }|j                  }t        j                  ||      | _        t        |j                     | _
        t        j                  ||      | _        y rl   )rm   rn   r   r  projection_dimr   r   linear1r	   projection_hidden_act
activationlinear2)re   r   r  r  rp   s       r'   rn   zClapProjectionLayer.__init__  sa    ((..yyn= !=!=>yy@r)   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rl   )r  r  r  r  s     r'   r}   zClapProjectionLayer.forward  s2    ]36]3r)   )rM   rN   rO   r   r   rn   r}   r~   r   s   @r'   r  r    s    A? Ar)   r  c                        e Zd ZdZ fdZ	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  ded	ej                  fd
Z
ed        Zedd       Z xZS )ClapTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 T   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _
        t        j                  |j                        | _        | j                  dt!        j"                  |j$                        j'                  d      d       | j                  dt!        j(                  | j*                  j-                         t         j.                        d       |j                  | _        t        j                  |j$                  |j
                  | j0                        | _        y )	N)padding_idxr*  position_idsr   r.   T)
persistenttoken_type_ids)rt   )rm   rn   r   	Embedding
vocab_sizer  pad_token_idword_embeddingstype_vocab_sizetoken_type_embeddingsr   r/  r   r%  r   r   r@   rA   max_position_embeddingsexpandr   r  r   longr  position_embeddingsre   r   rp   s     r'   rn   zClapTextEmbeddings.__init__  s4   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXei 	 	
 	ekk$*;*;*@*@*B%**Ubf 	 	
 "..#%<<**F,>,>DL\L\$
 r)   N	input_idsr  r  inputs_embedspast_key_values_lengthr<   c                    |<|| j                  || j                  |      }n| j                  || j                        }||j                         }n|j                         d d }|\  }}|t	        | d      rT| j
                  j                  |j                  d   d      }	t        j                  |	d|      }	|	j                  ||      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j!                  |      }||z   }| j#                  |      }| j%                  |      }|S )Nr.   r  r   r   )r   indexrs   )"create_position_ids_from_input_idsr  &create_position_ids_from_inputs_embedsr   hasattrr  r  r   r@   gatherr   r  r  r?   r  r  r  r   r   )re   r  r  r  r  r  input_shaper#   
seq_lengthbuffered_token_type_idsr  
embeddingsr  s                r'   r}   zClapTextEmbeddings.forward  sn    $#FFt//1G   $JJ=Z^ZjZjk #..*K',,.s3K!,
J
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D"55
^^J/
\\*-
r)   c                     | j                         dd }|d   }t        j                  |dz   ||z   dz   t        j                  | j                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr.   r   rs   r   )r   r@   rA   r  r?   r   r  )r  r  r  sequence_lengthr  s        r'   r  z9ClapTextEmbeddings.create_position_ids_from_inputs_embeds  sp     $((*3B/%a.||!O_{:Q>ejjYfYmYm
 %%a(//<<r)   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r   r   )ner   r@   cumsumtype_asr  )r  r  r  maskincremental_indicess        r'   r  z5ClapTextEmbeddings.create_position_ids_from_input_ids  sW     ||K(,,.$||Da8@@FI__cgg"'')K77r)   )NNNNr   )r   )rM   rN   rO   rP   rn   r@   
LongTensorrQ   r   r  r}   staticmethodr  r  r~   r   s   @r'   r  r    s    Q
, .2260426&'.##d*. ((4/. &&-	.
 ((4/. !$. 
.` = =" 8 8r)   r  moduler   r   r   r   scalingr   c                    t        j                  ||j                  dd            |z  }|#|d d d d d d d |j                  d   f   }	||	z   }t        j
                  j                  |dt         j                        j                  |j                        }t        j
                  j                  ||| j                        }t        j                  ||      }
|
j                  dd      j                         }
|
|fS )Nr+   r   r   r.   )r   rt   )pru   r   )r@   r   r   r   r   rC   r   float32r  rt   r   ru   r2   )r  r   r   r   r   r  r   kwargsattn_weightscausal_maskattn_outputs              r'   eager_attention_forwardr     s     <<s}}Q':;gEL!$Q1o		"o%=>#k1==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r)   c            
            e Zd Z fdZ	 	 ddej
                  dej                  dz  dedz  deej
                     fdZ	 xZ
S )	ClapTextSelfAttentionc                 $   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        |j                   | _        | j                  dz  | _        y )Nr   embedding_sizer   r   r         )rm   rn   r  r   r  r   r   r   r   r   r   r   r   r   r   r   r   r   attention_dropoutr  r  s     r'   rn   zClapTextSelfAttention.__init__9  sC    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r)   Nr!   r   r   r<   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|f| j                  sdn| j                  | j                  d|\  }} |j                  g |d j!                         }|r||f}|S |f}|S )Nr.   r   r+   rr   )r   r  )r   r   r   r0   r   r   r   r   get_interfacer   _attn_implementationr  ru   r  r  r    r2   )re   r!   r   r   r  r  r   query_states
key_statesvalue_statesattention_interfacer  r  r   s                 r'   r}   zClapTextSelfAttention.forwardN  sT    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFH1B;- JUr)   r  r  r   s   @r'   r  r  8  sW    60 48).	|| ))D0  $;	 
u||	r)   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapTextSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr*  )rm   rn   r   r   r  r  r   r/  r   r%  r   r  s     r'   rn   zClapTextSelfOutput.__init__r  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r)   r!   r	  r<   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rl   r  r   r   r  s      r'   r}   zClapTextSelfOutput.forwardx  7    

=1]3}|'CDr)   r  r   s   @r'   r  r  q  1    >U\\  RWR^R^ r)   r  c            
            e Zd Z fdZ	 	 ddej
                  dej                  dz  dedz  deej
                     fdZ	 xZ
S )	ClapTextAttentionc                 b    t         |           t        |      | _        t	        |      | _        y rl   )rm   rn   r  re   r  r|   r  s     r'   rn   zClapTextAttention.__init__  s&    )&1	(0r)   Nr!   r   r   r<   c                 n     | j                   |f||d|}| j                  |d   |      }|f|dd  z   }|S N)r   r   r   r   r  )re   r!   r   r   r  r  r  r   s           r'   r}   zClapTextAttention.forward  s\     !tyy
)/
 	
  ;;|AF#%QR(88r)   r  r  r   s   @r'   r  r    sW    1 48).	|| ))D0  $;	 
u||	r)   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapTextIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y rl   )rm   rn   r   r   r  intermediate_sizer  r   r  r  r	   r  r  s     r'   rn   zClapTextIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r)   r!   r<   c                 J    | j                  |      }| j                  |      }|S rl   r  r  s     r'   r}   zClapTextIntermediate.forward  r   r)   r  r   s   @r'   r  r    r!  r)   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapTextOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r	  )rm   rn   r   r   r  r  r  r   r/  r   r%  r   r  s     r'   rn   zClapTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r)   r!   r	  r<   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rl   r  r  s      r'   r}   zClapTextOutput.forward  r  r)   r  r   s   @r'   r  r    r  r)   r  c            
            e Zd Z fdZ	 	 d	dej
                  dej                  dz  dedz  deej
                     fdZ	d Z
 xZS )
ClapTextLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y )Nr   )
rm   rn   r,  seq_len_dimr  r1  r  r4  r  r|   r  s     r'   rn   zClapTextLayer.__init__  sI    '-'E'E$*6208$V,r)   Nr!   r   r   r<   c                      | j                   |f||d|}|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S r  )r1  r   feed_forward_chunkr,  r  )	re   r!   r   r   r  self_attention_outputsr  r   r]  s	            r'   r}   zClapTextLayer.forward  s     "0"
)/"
 	"
 2!4(,0##T%A%A4CSCSUe
  /G+r)   c                 L    | j                  |      }| j                  ||      }|S rl   )r4  r|   )re   r  intermediate_outputr]  s       r'   r!  z ClapTextLayer.feed_forward_chunk  s,    "//0@A{{#68HIr)   r  )rM   rN   rO   rn   r@   r  rQ   r  rS   r}   r!  r~   r   s   @r'   r  r    s\    - 48).	|| ))D0  $;	 
u||	.r)   r  c                        e Zd Z fdZe	 	 	 	 d
dej                  dej                  dz  dedz  dedz  dedz  de	ej                     e
z  fd	       Z xZS )ClapTextEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r  )
rm   rn   r   r   rd  re  num_hidden_layersr  layerr  )re   r   rj  rp   s      r'   rn   zClapTextEncoder.__init__  sN    ]]5IaIaCb#caM&$9#cd
&+# $ds   A#Nr!   r   r   r  r  r<   c                     |rdnd }|rdnd }t        | j                        D ])  \  }	}
|r||fz   } |
|||fi |}|d   }|s!||d   fz   }+ |r||fz   }t        |||      S )NrT   r   r   )rK   r!   rL   )rl  r)  r   )re   r!   r   r   r  r  r  r  r  rj  rm  r^  s               r'   r}   zClapTextEncoder.forward  s     #7BD$5b4(4 	POA|#$58H$H!(! 	M *!,M &9]1=M<O&O#	P   1]4D D++*
 	
r)   )NFFT)rM   rN   rO   rn   r   r@   r  rQ   r  rS   r   r}   r~   r   s   @r'   r&  r&    s    ,  48).,1#'"
||"
 ))D0"
  $;	"

 #Tk"
 D["
 
u||		."
 "
r)   r&  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapTextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y rl   )rm   rn   r   r   r  r  Tanhr  r  s     r'   rn   zClapTextPooler.__init__  s9    YYv1163E3EF
'')r)   r!   r<   c                 \    |d d df   }| j                  |      }| j                  |      }|S r7  )r  r  )re   r!   first_token_tensorpooled_outputs       r'   r}   zClapTextPooler.forward  s6     +1a40

#566r)   r  r   s   @r'   r,  r,    s#    $
U\\ ell r)   r,  c                   l    e Zd ZU eed<   dZdZdZ ej                         de
j                  fd       Zy)ClapPreTrainedModelr   clap)audiotextFr  c                    | j                   j                  }t        |t              rt	        j
                  |j                  j                  d|dz         t	        j
                  |j                  j                  d|dz         t	        j                  |j                  t        j                  |j                  j                  d         j                  d             t	        j                  |j                          yt        |t"              rt	        j$                  |j&                  t)        j*                  | j                   j,                               t	        j$                  |j.                  t)        j*                  | j                   j,                               yt        |t0        j2                        r&t	        j
                  |j                  d|dz         yt        |t0        j4                  t0        j6                  f      rt	        j                  |j8                         t	        j:                  |j                         t=        |dd      ^t	        j                  |j>                         t	        j:                  |j@                         t	        j                  |jB                         yyt        |t0        jD                  t0        jF                  f      r| j                   jH                  dz  d	| j                   jJ                  z  dz  z  |z  }t	        j
                  |j                  |
       |j8                   t	        j                  |j8                         yyt        |tL              rNt	        j                  |jN                         t	        j                  |jP                  |jS                                yy)zInitialize the weightsrr   g{Gz?)meanstdr.   r  running_meanNr  r+   )r9  )*r   initializer_factorr   r  initnormal_r  weightr  copy_r  r@   rA   r   r  zeros_r  	ClapModel	constant_logit_scale_ar   loglogit_scale_init_valuelogit_scale_tr   r  r   r   r   ones_ra   r:  running_varnum_batches_trackedr   r   r  r(  r   r   r   r   )re   r  factorin_proj_stds       r'   _init_weightsz!ClapPreTrainedModel._init_weights#  sj    //f01LL33::&SW-XLL55<<3FUYMZJJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--.	*NN6//$++:\:\1]^NN6//$++:\:\1]^-LLSftmDr~~ >?KK$JJv}}%v~t4@F//0

6--.F667 A BII 67;;22D8a$++B_B_>_dh=hilrrKLLK8{{&FKK( ' 67KK;;<JJv55v7\7\7^_ 8r)   N)rM   rN   rO   r   rR   base_model_prefixinput_modalitiessupports_gradient_checkpointingr@   no_gradr   r  rL  rT   r)   r'   r3  r3    sB    (&+#U]]_`BII ` `r)   r3  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
e	 	 	 	 	 ddej                  dz  dej                  dz  d	edz  d
edz  dedz  deez  fd       Z xZS )ClapAudioModelr   r  r5  c                 d    t         |   |       t        |      | _        | j	                          y rl   )rm   rn   r  audio_encoder	post_initr  s     r'   rn   zClapAudioModel.__init__H  s'     -f5r)   r<   c                 B    | j                   j                  j                  S rl   )rT  r  r   rh   s    r'   get_input_embeddingsz#ClapAudioModel.get_input_embeddingsN  s    !!--222r)   Nr  r   r  r  c                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |||||      S )ad  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapAudioModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> inputs = processor(audio=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```r  r  r   r  r  )r   use_return_dictr   r  rT  )re   r  r  r   r  r  r  s          r'   r}   zClapAudioModel.forwardQ  sy    @ &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 !!)/!5# " 
 	
r)   NNNNN)rM   rN   rO   r   rR   main_input_namerN  rn   r   r  rW  r   r@   rQ   
BoolTensorr  rS   r   r}   r~   r   s   @r'   rR  rR  C  s    &O 3bii 3  48-1)-,0#'+
))D0+
 ##d*+
  $;	+

 #Tk+
 D[+
 
+	++
 +
r)   rR  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    c                   B    e Zd ZU eed<   dZd fd	Zd Zd Ze	e
	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dedz  dedz  dedz  deej                     ez  fd              Z xZS )ClapTextModelr   r6  c                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rm   rn   r   r  r  r&  encoderr,  poolerrU  )re   r   add_pooling_layerrp   s      r'   rn   zClapTextModel.__init__  sM    
 	 ,V4&v.0AnV,t 	r)   c                 .    | j                   j                  S rl   r  r  rh   s    r'   rW  z"ClapTextModel.get_input_embeddings  s    ...r)   c                 &    || j                   _        y rl   rf  re   r   s     r'   set_input_embeddingsz"ClapTextModel.set_input_embeddings  s    */'r)   Nr  r   r  r  r  r   r  r  r<   c	                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  ||f|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  ||      }|}n&t        j                  |
t        j                  |      }| j!                  ||
      }| j                  ||||      }| j#                  ||||d	      }|d
   }| j$                  | j%                  |      nd }t'        |||j(                  |j*                        S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer.   z5You have to specify either input_ids or inputs_embedsr>   r  rs   )r  r  r  r  T)r   r   r  r  r   r  )r   r   r  rZ  r   %warn_if_padding_and_no_attention_maskr   r?   r@   onesr  r  r  r  r   r  get_extended_attention_maskrb  rc  r   r!   rL   )re   r  r   r  r  r  r   r  r  r  r  r#   r  r?   r   buffered_token_type_ids_expandedextended_attention_maskembedding_outputencoder_outputssequence_outputr1  s                        r'   r}   zClapTextModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m??%)'	 + 
 ,,2/!5 ' 
 *!,8<8OO4UY)-')77&11	
 	
r)   )T)NNNNNNNN)rM   rN   rO   r   rR   rN  rn   rW  ri  r   r   r@   r  r  rS   r   r}   r~   r   s   @r'   r_  r_    s      /0  *..2.2,0-1)-,0#'C
<<$&C
 t+C
 t+	C

 llT)C
 ||d*C
  $;C
 #TkC
 D[C
 
u||	K	KC
  C
r)   r_  c                   <    e Zd ZU eed<   def fdZee	 	 ddej                  dej                  dz  dej                  dz  de
e   deez  f
d	              Zee	 	 dd
ej                  dej                  dz  dej                  dz  de
e   deez  f
d              Zee	 	 	 	 	 	 	 	 	 ddej                   dz  d
ej"                  dz  dej$                  dz  dej                  dz  dej                   dz  dedz  dedz  dedz  dedz  deez  fd              Z xZS )rA  r   c                 .   t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }t        j                  t        j                  t        j                  |j                                    | _        t        j                  t        j                  t        j                  |j                                    | _        |j$                  | _        t'        |      | _        t+        |      | _        t/        |      | _        t+        |      | _        | j5                          y )NzKconfig.text_config is expected to be of type ClapTextConfig but is of type .zMconfig.audio_config is expected to be of type ClapAudioConfig but is of type )rm   rn   r   text_configr   	TypeErrortypeaudio_configr   r   r   r@   r;  r   rD  rE  rC  rF  r  r_  
text_modelr  text_projectionrR  audio_modelaudio_projectionrU  )re   r   rv  ry  rp   s       r'   rn   zClapModel.__init__  s=    &,,n=++,-Q0 
 &--?,,-.a1 
 ((**\\%,,txx@]@]7^*_`\\%,,txx@]@]7^*_`$33'42;?),7 3L A 	r)   Nr  r   r  r  r<   c                      | j                   d|||dd|}| j                  |j                        }t        j                  |d      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r  r   r  r  r.   r   rT   )rz  r{  r  F	normalize)re   r  r   r  r  text_outputstext_featuress          r'   get_text_featureszClapModel.get_text_features  sa    . 4C4?? 4
)%	4

 4
 ,,\-G-GH%&[[B%G"r)   r  r  c                      | j                   d||dd|}| j                  |j                        }t        j                  |d      |_        |S )a  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
        >>> random_audio = torch.rand((16_000))

        >>> inputs = feature_extractor(random_audio, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     audio_features = model.get_audio_features(**inputs)
        ```T)r  r  r  r.   r   rT   )r|  r}  r  r  r  )re   r  r  r   r  audio_outputsaudio_featuress          r'   get_audio_featureszClapModel.get_audio_features7  s\    8 5ED4D4D 5
)YD5
TZ5
 ..}/J/JK&'kk.b&I#r)   return_lossr   r  r  c
           	      l   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	| j	                  ||||d      }| j                  |||||d      }|	s|d   n|j                  }| j                  |      }|	s|d   n|j                  }| j                  |      }||j                  ddd      z  }||j                  ddd      z  }| j                  j                         }| j                  j                         }t        j                  ||j                               |z  }t        j                  ||j                               |z  }d}|r,t!        |      }t!        |j                               }||z   d	z  }t#        |||||||
      S )a  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

        >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"]

        >>> inputs = processor(text=input_text, audio=audio_sample, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)
        >>> logits_per_audio = outputs.logits_per_audio  # this is the audio-text similarity score
        >>> probs = logits_per_audio.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        ```NTrY  r  r   r  r   r  r  r   r+   r.   )r  r   keepdimg       @)rZ   r[   r\   rJ   rW   r]   r^   )r   r   r  rZ  r|  rz  r  r}  r{  r   rF  exprC  r@   r   trF   rY   )re   r  r  r  r   r  r  r   r  r  r  r  r  rW   rJ   logit_scale_textlogit_scale_audior\   r[   rZ   caption_loss
audio_losss                         r'   r}   zClapModel.forward[  s   V 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B](()/!5 ) 
 )%/!5 ' 
 0;}Q'@[@[,,\:-8l1ol>X>X**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO  --113 ..224,,{LNN4DEHXX <<kmmoFIZZ+O<L)*:*<*<*>?J :-4D-+#%*,
 	
r)   )NN)	NNNNNNNNN)rM   rN   rO   r   rR   rn   r   r   r@   r  r   r   rS   r   r  r  r  rQ   r]  r  rY   r}   r~   r   s   @r'   rA  rA    s   z @  /3,0	<< t+ llT)	
 +, 
+	+  B  *..2	   <<$&  t+	 
 +,  
+	+    D  .237-1.204#')-,0#'^
##d*^
 ))D0^
 ##d*	^

 t+^
 &&-^
 D[^
  $;^
 #Tk^
 D[^
 
	^
  ^
r)   rA  c                       e Zd ZU eed<   dZdef fdZdej                  fdZ	d Z
ee	 	 	 	 	 	 ddej                  dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedz  deez  fd              Z xZS )ClapTextModelWithProjectionr   r`  c                     t         |   |       t        |      | _        t	        |      | _        | j                          y rl   )rm   rn   r_  rz  r  r{  rU  r  s     r'   rn   z$ClapTextModelWithProjection.__init__  s3     '/26:r)   r<   c                 B    | j                   j                  j                  S rl   rz  r  r  rh   s    r'   rW  z0ClapTextModelWithProjection.get_input_embeddings  s    ))999r)   c                 :    || j                   j                  _        y rl   r  rh  s     r'   ri  z0ClapTextModelWithProjection.set_input_embeddings  s    5:""2r)   Nr  r   r  r   r  r  c                    ||n| j                   j                  }| j                  |||||d      }|s|d   n|j                  }	| j	                  |	      }
t        |
|j                  |j                  |j                        S )a  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapTextModelWithProjection

        >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```Tr  r   )rJ   rK   r!   rL   )	r   rZ  rz  r  r{  rI   rK   r!   rL   )re   r  r   r  r   r  r  r  r  r1  rJ   s              r'   r}   z#ClapTextModelWithProjection.forward  s    4 &1%<k$++B]B])%/!5 ' 
 0;Q@Z@Z**=9"#*<<&44#..	
 	
r)   )NNNNNN)rM   rN   rO   r   rR   rN  rn   r   r  rW  ri  r   r   r@   r  r  rS   rI   r}   r~   r   s   @r'   r  r    s     ~ :bii :;  *..2,0)-,0#',
<<$&,
 t+,
 llT)	,

  $;,
 #Tk,
 D[,
 
$	$,
  ,
r)   r  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
ee	 	 	 	 	 ddej                  dz  dej                  dz  d	edz  d
edz  dedz  deez  fd              Z xZS )ClapAudioModelWithProjectionr   r  r5  c                     t         |   |       t        |      | _        t	        |      | _        | j                          y rl   )rm   rn   rR  r|  r  r}  rU  r  s     r'   rn   z%ClapAudioModelWithProjection.__init__  s4     )&1 3F ;r)   r<   c                 V    | j                   j                  j                  j                  S rl   )r|  rT  r  r   rh   s    r'   rW  z1ClapAudioModelWithProjection.get_input_embeddings  s     --99>>>r)   Nr  r   r  r  c                 l   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||||d      }|s|d   n|j
                  }| j                  |      }	t        |	|j                  |j                  |j                        S )au  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import ClapAudioModelWithProjection, ClapProcessor

        >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
        >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> inputs = processor(audio=audio_sample, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> audio_embeds = outputs.audio_embeds
        ```TrY  r   )rW   rK   rL   r!   )r   rZ  r   r  r|  r  r}  rV   rK   rL   r!   )
re   r  r  r   r  r  r  r  r1  rW   s
             r'   r}   z$ClapAudioModelWithProjection.forward  s    @ &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 (()/!5 ) 
 1<a(A\A\,,];#%+==$//'55	
 	
r)   r[  )rM   rN   rO   r   rR   r\  rN  rn   r   r  rW  r   r   r@   rQ   r]  r  rS   rV   r}   r~   r   s   @r'   r  r    s    &O ?bii ?  48-1)-,0#'5
))D05
 ##d*5
  $;	5

 #Tk5
 D[5
 
%	%5
  5
r)   r  )rA  r3  r_  r  rR  r  )rr   )WrP   r   r   collections.abcr   dataclassesr   typingr   r@   torch.nn.functionalr   rC   r   r   r<  activationsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   configuration_clapr   r   r   
get_loggerrM   loggerr(   r8   r:   r  rF   rI   rV   rY   r  rj   r   r   r   r  r  r  r#  r(  ra  rt  r  r  r  floatr  r  r  r  r  r  r  r&  r,  r3  rR  r_  rA  r  r  __all__rT   r)   r'   <module>r     s      $ !      & ! 9 
 G & @ j j K K 
		H	%"**7U\\ 7ell 7
 	<+ 	< 	< 
	<; 	< 	<  
  
   
H299 2%		 %P_")) _FZ'RYY Z'|
")) 
 &BII  	bii 	wRYY wv4/ 4p3BII 3l}
ryy }
@")) &g8 g8d %II%<<% 
% <<	%
 LL4'% % %05BII 5r 		 2299  RYY #. #N*
bii *
\RYY  #`/ #` #`L:
( :
z _
' _
_
D J
# J
 J
Z ?
"5 ?
 ?
D F
#6 F
 F
Rr)   