
    i@                        d dl Z d dlmZ d dlmZmZmZ d dlZd dl	Z	d dl
mZ d dlmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ d
dlmZ ddlmZ  G d dej6                        Z G d dej6                        Z G d dej6                        Z G d dej6                        Z G d de       Z!e G d de             Z"e G d de             Z# G d dej6                        Z$ G d dej6                        Z% G d  d!eee      Z&y)"    N)	dataclass)OptionalTupleUnion)weight_norm   )ConfigMixinregister_to_config)
BaseOutput)apply_forward_hook)randn_tensor   )
ModelMixin   )AutoencoderMixinc                   *     e Zd ZdZd fd	Zd Z xZS )Snake1dz;
    A 1-dimensional Snake activation function module.
    c                 0   t         |           t        j                  t	        j
                  d|d            | _        t        j                  t	        j
                  d|d            | _        d| j                  _        d| j                  _        || _	        y )Nr   T)
super__init__nn	Parametertorchzerosalphabetarequires_gradlogscale)self
hidden_dimr   	__class__s      {/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/diffusers/models/autoencoders/autoencoder_oobleck.pyr   zSnake1d.__init__$   sg    \\%++aQ"?@
LLQ
A!>?	#'

 "&		     c                    |j                   }| j                  s| j                  nt        j                  | j                        }| j                  s| j
                  nt        j                  | j
                        }|j                  |d   |d   d      }||dz   j                         t        j                  ||z        j                  d      z  z   }|j                  |      }|S )Nr   r   g&.>r   )
shaper   r   r   expr   reshape
reciprocalsinpow)r   hidden_statesr&   r   r   s        r"   forwardzSnake1d.forward-   s    ##"&--

UYYtzz5J $tyy599TYY3G%--eAha"E%(@(@(BUYYuWdOdEeEiEijkEl(ll%--e4r#   T__name__
__module____qualname____doc__r   r-   __classcell__r!   s   @r"   r   r      s    !	r#   r   c                   4     e Zd ZdZddedef fdZd Z xZS )OobleckResidualUnitza
    A residual unit composed of Snake1d and weight-normalized Conv1d layers with dilations.
    	dimensiondilationc           	         t         |           d|z  dz  }t        |      | _        t	        t        j                  ||d||            | _        t        |      | _        t	        t        j                  ||d            | _	        y )N   r      )kernel_sizer9   paddingr   )r=   )
r   r   r   snake1r   r   Conv1dconv1snake2conv2)r   r8   r9   padr!   s       r"   r   zOobleckResidualUnit.__init__>   sm    !a'i( 9iQYakn!op
i( 9iQ!OP
r#   c                     |}| j                  | j                  |            }| j                  | j                  |            }|j                  d   |j                  d   z
  dz  }|dkD  r
|d|| f   }||z   }|S )aq  
        Forward pass through the residual unit.

        Args:
            hidden_state (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
                Input tensor .

        Returns:
            output_tensor (`torch.Tensor` of shape `(batch_size, channels, time_steps)`)
                Input tensor after passing through the residual unit.
        r%   r   r   .)rA   r?   rC   rB   r&   )r   hidden_stateoutput_tensorr>   s       r"   r-   zOobleckResidualUnit.forwardG   s     %

4;;}#=>

4;;}#=>%%b)M,?,?,CCIQ;'WgX-=(=>L$}4r#   )   r   r0   r1   r2   r3   intr   r-   r4   r5   s   @r"   r7   r7   9   s#    Q# Qc Qr#   r7   c                   0     e Zd ZdZddef fdZd Z xZS )OobleckEncoderBlockz&Encoder block used in Oobleck encoder.stridec                 0   t         |           t        |d      | _        t        |d      | _        t        |d      | _        t        |      | _        t        t        j                  ||d|z  |t        j                  |dz                    | _        y )Nr   r9   r   	   r   r=   rM   r>   )r   r   r7   	res_unit1	res_unit2	res_unit3r   r?   r   r   r@   mathceilrA   r   	input_dim
output_dimrM   r!   s       r"   r   zOobleckEncoderBlock.__init__a   s{    ,YC,YC,YCi( IIiVF\`\e\eflopfp\qr

r#   c                     | j                  |      }| j                  |      }| j                  | j                  |            }| j	                  |      }|S N)rR   rS   r?   rT   rA   r   rF   s     r"   r-   zOobleckEncoderBlock.forwardl   sI    ~~l3~~l3{{4>>,#?@zz,/r#   r   rI   r5   s   @r"   rL   rL   ^   s    0	
c 	
r#   rL   c                   0     e Zd ZdZddef fdZd Z xZS )OobleckDecoderBlockz&Decoder block used in Oobleck decoder.rM   c                 0   t         |           t        |      | _        t	        t        j                  ||d|z  |t        j                  |dz                    | _	        t        |d      | _        t        |d      | _        t        |d      | _        y )Nr   rQ   r   rO   r   rP   )r   r   r   r?   r   r   ConvTranspose1drU   rV   conv_t1r7   rR   rS   rT   rW   s       r"   r   zOobleckDecoderBlock.__init__x   s    i("J		&1*-
 -Z!D,Z!D,Z!Dr#   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S r[   )r?   rb   rR   rS   rT   r\   s     r"   r-   zOobleckDecoderBlock.forward   sN    {{<0||L1~~l3~~l3~~l3r#   r]   rI   r5   s   @r"   r_   r_   u   s    0Ec E"r#   r_   c                       e Zd Zddej                  defdZddeej                     dej                  fdZ	ddd dej                  fd	Z
dej                  fd
Zy)#OobleckDiagonalGaussianDistribution
parametersdeterministicc                 F   || _         |j                  dd      \  | _        | _        t        j
                  j                  | j                        dz   | _        | j                  | j                  z  | _        t        j                  | j                        | _        || _        y )Nr   r   )dimg-C6?)rf   chunkmeanscaler   
functionalsoftplusstdvarr   loglogvarrg   )r   rf   rg   s      r"   r   z,OobleckDiagonalGaussianDistribution.__init__   sv    $ * 0 0 0 :	4:==))$**5<88dhh&ii)*r#   N	generatorreturnc                     t        | j                  j                  || j                  j                  | j                  j
                        }| j                  | j                  |z  z   }|S )N)rs   devicedtype)r   rk   r&   rf   rv   rw   ro   )r   rs   samplexs       r"   rx   z*OobleckDiagonalGaussianDistribution.sample   sR    IIOO??))//''	
 II6))r#   otherc                     | j                   rt        j                  dg      S |S| j                  | j                  z  | j                  z   | j
                  z
  dz
  j                  d      j                         S t        j                  | j                  |j                  z
  d      |j                  z  }| j                  |j                  z  }| j
                  |j
                  z
  }||z   |z   dz
  }|j                  d      j                         }|S )Ng        g      ?r   r   )rg   r   Tensorrk   rp   rr   sumr+   )r   rz   normalized_diff	var_ratiologvar_diffkls         r"   r   z&OobleckDiagonalGaussianDistribution.kl   s    <<&&}		DII-84;;FLQQRSTYY[["'))DII

,BA"F"R HHuyy0	"kkELL8$y0;>BVVAY^^%	r#   c                     | j                   S r[   )rk   )r   s    r"   modez(OobleckDiagonalGaussianDistribution.mode   s    yyr#   )Fr[   )r0   r1   r2   r   r|   boolr   r   	Generatorrx   r   r    r#   r"   re   re      sa    +5<< + +	 9 	U\\ 	=   ell r#   re   c                       e Zd ZU dZded<   y)AutoencoderOobleckOutputar  
    Output of AutoencoderOobleck encoding method.

    Args:
        latent_dist (`OobleckDiagonalGaussianDistribution`):
            Encoded outputs of `Encoder` represented as the mean and standard deviation of
            `OobleckDiagonalGaussianDistribution`. `OobleckDiagonalGaussianDistribution` allows for sampling latents
            from the distribution.
    re   latent_distN)r0   r1   r2   r3   __annotations__r   r#   r"   r   r      s     76r#   r   c                   0    e Zd ZU dZej
                  ed<   y)OobleckDecoderOutputz
    Output of decoding method.

    Args:
        sample (`torch.Tensor` of shape `(batch_size, audio_channels, sequence_length)`):
            The decoded output sample from the last layer of the model.
    rx   N)r0   r1   r2   r3   r   r|   r   r   r#   r"   r   r      s     LLr#   r   c                   (     e Zd ZdZ fdZd Z xZS )OobleckEncoderzOobleck Encoderc           
         t         	|           |}dg|z   }t        t        j                  ||dd            | _        g | _        t        |      D ]6  \  }}| xj                  t        |||   z  |||dz      z  |      gz  c_        8 t        j                  | j                        | _        ||d   z  }t        |      | _        t        t        j                  ||dd            | _        y )Nr   r<   r   r=   r>   rX   rY   rM   r%   )r   r   r   r   r@   rA   block	enumeraterL   
ModuleListr   r?   rC   )
r   encoder_hidden_sizeaudio_channelsdownsampling_ratioschannel_multiplesstridesstride_indexrM   d_modelr!   s
            r"   r   zOobleckEncoder.__init__   s    %C"33 !>;N\]gh!ij

$-g$6 	 L&JJ#14El4SS25F|VWGW5XX! J	 ]]4::.
%(9"(==g& 74GUV`a!bc
r#   c                     | j                  |      }| j                  D ]
  } ||      } | j                  |      }| j                  |      }|S r[   rA   r   r?   rC   )r   rF   modules      r"   r-   zOobleckEncoder.forward   sQ    zz,/jj 	0F!,/L	0 {{<0zz,/r#   r/   r5   s   @r"   r   r      s    d2	r#   r   c                   (     e Zd ZdZ fdZd Z xZS )OobleckDecoderzOobleck Decoderc           
         t         |           |}dg|z   }t        t        j                  |||d   z  dd            | _        g }t        |      D ]>  \  }}	|t        ||t        |      |z
     z  ||t        |      |z
  dz
     z  |	      gz  }@ t        j                  |      | _
        |}
t        |
      | _        t        t        j                  ||ddd            | _        y )	Nr   r%   r<   r   r   r   F)r=   r>   bias)r   r   r   r   r@   rA   r   r_   lenr   r   r   r?   rC   )r   channelsinput_channelsr   upsampling_ratiosr   r   r   r   rM   rY   r!   s              r"   r   zOobleckDecoder.__init__  s    #C"33 !>8FWXZF[;[ijtu!vw
 $-g$6 	 L&#&):3w<,;V)WW'*;CL<<WZ[<[*\\! E	 ]]5)

j) 8^QR\]di!jk
r#   c                     | j                  |      }| j                  D ]
  } ||      } | j                  |      }| j                  |      }|S r[   r   )r   rF   layers      r"   r-   zOobleckDecoder.forward  sQ    zz,/ZZ 	/E .L	/ {{<0zz,/r#   r/   r5   s   @r"   r   r      s    l2	r#   r   c                       e Zd ZdZdZdZedg dg dddddf fd		       Ze	 dd
e	j                  dedeeee   f   fd       Zdde	j                  dedeee	j                  f   fdZe	 dde	j&                  dedeee	j&                  f   fd       Z	 	 	 dde	j                  dededee	j,                     deee	j                  f   f
dZ xZS )AutoencoderOoblecka  
    An autoencoder for encoding waveforms into latents and decoding latent representations into waveforms. First
    introduced in Stable Audio.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        encoder_hidden_size (`int`, *optional*, defaults to 128):
            Intermediate representation dimension for the encoder.
        downsampling_ratios (`List[int]`, *optional*, defaults to `[2, 4, 4, 8, 8]`):
            Ratios for downsampling in the encoder. These are used in reverse order for upsampling in the decoder.
        channel_multiples (`List[int]`, *optional*, defaults to `[1, 2, 4, 8, 16]`):
            Multiples used to determine the hidden sizes of the hidden layers.
        decoder_channels (`int`, *optional*, defaults to 128):
            Intermediate representation dimension for the decoder.
        decoder_input_channels (`int`, *optional*, defaults to 64):
            Input dimension for the decoder. Corresponds to the latent dimension.
        audio_channels (`int`, *optional*, defaults to 2):
            Number of channels in the audio data. Either 1 for mono or 2 for stereo.
        sampling_rate (`int`, *optional*, defaults to 44100):
            The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
    F   )r      r      r   )r   r   r   r   rH   @   r   iD  c                 .   t         |           || _        || _        || _        |d d d   | _        t        t        j                  |            | _	        || _
        t        ||||      | _        t        |||| j
                  |      | _        d| _        y )Nr%   )r   r   r   r   )r   r   r   r   r   F)r   r   r   r   decoder_channelsr   rJ   npprod
hop_lengthsampling_rater   encoderr   decoderuse_slicing)	r   r   r   r   r   decoder_input_channelsr   r   r!   s	           r"   r   zAutoencoderOobleck.__init__C  s     	#6 #6  0!4TrT!:bgg&9:;*% 3) 3/	
 &%1)"44/
 !r#   ry   return_dictrt   c                 (   | j                   rU|j                  d   dkD  rC|j                  d      D cg c]  }| j                  |       }}t	        j
                  |      }n| j                  |      }t        |      }|s|fS t        |      S c c}w )a  
        Encode a batch of images into latents.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded images. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        r   r   )r   )r   r&   splitr   r   catre   r   )r   ry   r   x_sliceencoded_slicesh	posteriors          r"   encodezAutoencoderOobleck.encodeh  s      
QCD771:Ndll73NNN		.)AQA7:	<'I>> Os   Bzc                 F    | j                  |      }|s|fS t        |      S )Nrx   )r   r   )r   r   r   decs       r"   _decodezAutoencoderOobleck._decode  s$    ll1o6M#3//r#   c                 :   | j                   r_|j                  d   dkD  rM|j                  d      D cg c]  }| j                  |      j                   }}t        j                  |      }n| j                  |      j                  }|s|fS t        |      S c c}w )a  
        Decode a batch of images.

        Args:
            z (`torch.Tensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.vae.OobleckDecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.OobleckDecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.OobleckDecoderOutput`] is returned, otherwise a plain `tuple`
                is returned.

        r   r   r   )r   r&   r   r   rx   r   r   r   )r   r   r   rs   z_slicedecoded_slicesdecodeds          r"   decodezAutoencoderOobleck.decode  s    $ 
QJK''RS*Uwdll73::UNUii/Gll1o,,G:#733 Vs   "Brx   sample_posteriorrs   c                     |}| j                  |      j                  }|r|j                  |      }n|j                         }| j	                  |      j                  }|s|fS t        |      S )ah  
        Args:
            sample (`torch.Tensor`): Input sample.
            sample_posterior (`bool`, *optional*, defaults to `False`):
                Whether to sample from the posterior.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`OobleckDecoderOutput`] instead of a plain tuple.
        )rs   r   )r   r   rx   r   r   r   )	r   rx   r   r   rs   ry   r   r   r   s	            r"   r-   zAutoencoderOobleck.forward  sf     KKN..	  9 5A Akk!n##6M#3//r#   r.   )TN)FTN)r0   r1   r2   r3    _supports_gradient_checkpointing_supports_group_offloadingr
   r   r   r   r|   r   r   r   r   re   r   r   r   FloatTensorr   r   r   r-   r4   r5   s   @r"   r   r   '  sV   0 (-$!&  +*!"! "!H 37??,0?	'/R)SS	T? ?80 0D 0EJ^`e`l`lJlDm 0 HL4""4154	#U%6%66	74 4> "' /300 0 	0
 EOO,0 
#U\\1	20r#   r   )'rU   dataclassesr   typingr   r   r   numpyr   r   torch.nnr   torch.nn.utilsr   configuration_utilsr	   r
   utilsr   utils.accelerate_utilsr   utils.torch_utilsr   modeling_utilsr   vaer   Moduler   r7   rL   r_   objectre   r   r   r   r   r   r   r#   r"   <module>r      s     ! ) )    & B  8 - ' !bii 4"")) "J")) .")) <%& %P 7z 7 7 	: 	 	%RYY %P%RYY %P]0%5{ ]0r#   