
    ig                        d dl mZmZmZ d dlZd dlZd dlmZ d dl	mc m
Z d dlZddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ  ej>                  e       Z! G d dejD                        Z# G d dejD                        Z$ G d dejD                        Z% G d dejD                        Z& G d dejD                        Z' G d dejD                        Z( G d dejD                        Z) G d deee      Z*y)    )OptionalTupleUnionN   )ConfigMixinregister_to_config)FromOriginalModelMixin)logging)apply_forward_hook   )get_activation)AutoencoderKLOutput)
ModelMixin   )DecoderOutputDiagonalGaussianDistributionc            	       <     e Zd ZdZd	dedededdf fdZd Z xZS )
HunyuanImageResnetBlocka  
    Residual block with two convolutions and optional channel change.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        non_linearity (str, optional): Type of non-linearity to use. Default is "silu".
    in_channelsout_channelsnon_linearityreturnNc                    t         |           || _        || _        t	        |      | _        t        j                  d|dd      | _        t        j                  ||ddd      | _
        t        j                  d|dd      | _        t        j                  ||ddd      | _        ||k7  r t        j                  ||ddd      | _        y d | _        y )	N    ư>T
num_groupsnum_channelsepsaffiner   r   kernel_sizestridepaddingr   )super__init__r   r   r   nonlinearitynn	GroupNormnorm1Conv2dconv1norm2conv2conv_shortcut)selfr   r   r   	__class__s       /home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage.pyr&   z HunyuanImageResnetBlock.__init__.   s    &(*=9 \\Rkt\`a
YY{LaPQ[\]
\\RlPT]ab
YY|\qQR\]^
,&!#;RS\]gh!iD!%D    c                    |}| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }| j
                  | j                  |      }||z   S N)r*   r'   r,   r-   r.   r/   )r0   xresiduals      r2   forwardzHunyuanImageResnetBlock.forward>   s     JJqMa JJqMJJqMa JJqM)""1%A8|r3   )silu)	__name__
__module____qualname____doc__intstrr&   r8   __classcell__r1   s   @r2   r   r   $   s/    &C &s &3 &\` & r3   r   c                   .     e Zd ZdZdef fdZd Z xZS )HunyuanImageAttentionBlockz~
    Self-attention with a single head.

    Args:
        in_channels (int): The number of channels in the input tensor.
    r   c                 >   t         |           t        j                  d|dd      | _        t        j
                  ||d      | _        t        j
                  ||d      | _        t        j
                  ||d      | _        t        j
                  ||d      | _	        y )Nr   r   Tr   r   )
r%   r&   r(   r)   normr+   to_qto_kto_vproj)r0   r   r1   s     r2   r&   z#HunyuanImageAttentionBlock.__init__Y   sw     LLB[d[_`	IIk;:	IIk;:	IIk;:	IIk;:	r3   c                    |}| j                  |      }| j                  |      }| j                  |      }| j                  |      }|j                  \  }}}}	|j                  dddd      j                  |||	z  |      j                         }|j                  dddd      j                  |||	z  |      j                         }|j                  dddd      j                  |||	z  |      j                         }t        j                  |||      }|j                  |||	|      j                  dddd      }| j                  |      }||z   S )Nr   r   r   r   )rE   rF   rG   rH   shapepermutereshape
contiguousFscaled_dot_product_attentionrI   )
r0   r6   identityquerykeyvalue
batch_sizechannelsheightwidths
             r2   r8   z"HunyuanImageAttentionBlock.forwardc   s1   IIaL 		!iil		!.3kk+
HfeaAq)11*funhWbbdkk!Q1%--j&5.(S^^`aAq)11*funhWbbd **5#u=IIj&%:BB1aANIIaL8|r3   )r:   r;   r<   r=   r>   r&   r8   r@   rA   s   @r2   rC   rC   Q   s    ;C ;r3   rC   c                   d     e Zd ZdZdedef fdZdej                  dej                  fdZ xZ	S )HunyuanImageDownsamplez
    Downsampling block for spatial reduction.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
    r   r   c                     t         |           d}||z  dk7  rt        d||z         t        j                  |||z  ddd      | _        ||z  |z  | _        y )N   r   zout_channels % factor != 0: r   r   r!   )r%   r&   
ValueErrorr(   r+   conv
group_sizer0   r   r   factorr1   s       r2   r&   zHunyuanImageDownsample.__init__   si    & A%;L6<Q;RSTTIIk<6+AqYZdef	 ;.,>r3   r6   r   c                 H   | j                  |      }|j                  \  }}}}|j                  |||dz  d|dz  d      }|j                  dddddd      }|j                  |d|z  |dz  |dz        }|j                  \  }}}}|j                  |||dz  d|dz  d      }|j                  dddddd      }|j                  |d|z  |dz  |dz        }|j                  \  }}}}|j	                  ||j                  d   | j
                  ||      j                  d      }||z   S )Nr   r   r      r   r\   dim)r^   rK   rM   rL   viewr_   meanr0   r6   hBCHWshortcuts           r2   r8   zHunyuanImageDownsample.forward   s1   IIaLWW
1aIIaAFAqAvq1IIaAq!Q'IIaQQQ/WW
1a99Q161a1fa8##Aq!Q15##Aq1ua1fa1f=^^
1a==AGGAJAFKKPQKR8|r3   
r:   r;   r<   r=   r>   r&   torchTensorr8   r@   rA   s   @r2   rZ   rZ   {   s6    ?C ?s ? %,, r3   rZ   c                   d     e Zd ZdZdedef fdZdej                  dej                  fdZ xZ	S )HunyuanImageUpsamplez
    Upsampling block for spatial expansion.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
    r   r   c                     t         |           d}t        j                  |||z  ddd      | _        ||z  |z  | _        y )Nr\   r   r   r!   )r%   r&   r(   r+   r^   repeatsr`   s       r2   r&   zHunyuanImageUpsample.__init__   sC    IIk<&+@aXYcde	,;r3   r6   r   c                    | j                  |      }|j                  \  }}}}|j                  |dd|dz  ||      }|j                  dddddd      }|j                  ||dz  |dz  |dz        }|j	                  | j
                  d      }|j                  \  }}}}|j                  |dd|dz  ||      }|j                  dddddd      }|j                  ||dz  |dz  |dz        }||z   S )Nr   r\   r   r   r   rc   ru   re   )r^   rK   rM   rL   repeat_interleaveru   rh   s           r2   r8   zHunyuanImageUpsample.forward   s   IIaLWW
1aIIaAqAvq!,IIaAq!Q'IIaaQA.&&t||&C^^
1a##Aq!Q!VQ:##Aq!Q15##AqAvq1ua!e<8|r3   ro   rA   s   @r2   rs   rs      s6    <C <s < %,, r3   rs   c                   f     e Zd ZdZddedef fdZdej                  dej                  fdZ xZ	S )	HunyuanImageMidBlockz
    Middle block for HunyuanImageVAE encoder and decoder.

    Args:
        in_channels (int): Number of input channels.
        num_layers (int): Number of layers.
    r   
num_layersc                 6   t         |           t        ||      g}g }t        |      D ]8  }|j	                  t        |             |j	                  t        ||             : t        j                  |      | _        t        j                  |      | _	        y )Nr   r   )
r%   r&   r   rangeappendrC   r(   
ModuleListresnets
attentions)r0   r   r{   r   r   _r1   s         r2   r&   zHunyuanImageMidBlock.__init__   s    *{Q\]^
z" 	gA8EFNN2{Ydef	g }}W---
3r3   r6   r   c                      | j                   d   |      }t        | j                  | j                   dd        D ]  \  }} ||      } ||      } |S )Nr   r   )r   zipr   )r0   r6   attnresnets       r2   r8   zHunyuanImageMidBlock.forward   sX    DLLOAab1AB 	LD&QAq	A	 r3   )r   ro   rA   s   @r2   rz   rz      s6    4C 4S 4 %,, r3   rz   c                        e Zd ZdZ	 	 ddededeedf   dededed	ef fd
Zde	j                  de	j                  fdZ xZS )HunyuanImageEncoder2Da7  
    Encoder network that compresses input to latent representation.

    Args:
        in_channels (int): Number of input channels.
        z_channels (int): Number of latent channels.
        block_out_channels (list of int): Output channels for each block.
        num_res_blocks (int): Number of residual blocks per block.
        spatial_compression_ratio (int): Spatial downsampling factor.
        non_linearity (str): Type of non-linearity to use. Default is "silu".
        downsample_match_channel (bool): Whether to match channels during downsampling.
    r   
z_channelsblock_out_channels.num_res_blocksspatial_compression_ratior   downsample_match_channelc                    t         |           |d   d|z  z  dk7  rt        d|d    d|       || _        || _        || _        || _        || _        |d   d|z  z  | _        t        |      | _
        t        j                  ||d   ddd      | _        t        j                  g       | _        |d   }t!        t#        |            D ]  }	||	   }
t!        |      D ]*  }| j                  j%                  t'        ||
	             |
}, |	t)        j*                  |      k  sY|	t#        |      dz
  k7  sk|r||	dz      }
| j                  j%                  t-        ||
	             |
} t/        |d   d
      | _        t        j2                  d|d   dd      | _        t        j                  |d   d|z  ddd      | _        d| _        y )Nr   r   z]block_out_channels[-1 has to be divisible by 2 * out_channels, you have block_out_channels = z and out_channels = r   r   r!   r}   r   r{   r   r   Tr   F)r%   r&   r]   r   r   r   r   r   r_   r   r'   r(   r+   conv_inr   down_blocksr~   lenr   r   nplog2rZ   rz   	mid_blockr)   norm_outconv_outgradient_checkpointing)r0   r   r   r   r   r   r   r   block_in_channeliblock_out_channelr   r1   s               r2   r&   zHunyuanImageEncoder2D.__init__   s     	b!Q^49o  qC  DF  qG  pH  H\  ]g  \h  i  '$"4,)B&,R0Q^D*=9 yy.@.CQR[\fgh ==,-a0s-./ 	5A 21 5>* 5  ''+8HWhi $5 	5 277455!sCU?VYZ?Z:Z+(:1q5(A%  ''*7GVgh $5 !	5& .:LR:P]^_ ASTVAW]ajno		"4R"8!j.VW`aklm&+#r3   r6   r   c                 b   | j                  |      }| j                  D ]=  }t        j                         r| j                  r| j                  ||      }6 ||      }? t        j                         r)| j                  r| j                  | j                  |      }n| j                  |      }|j                  \  }}}}|j                  ||| j                  z  | j                  ||      j                  d      }| j                  |      }| j                  |      }| j                  |      }||z   S )Nr   rd   )r   r   rp   is_grad_enabledr   _gradient_checkpointing_funcr   rK   rf   r_   rg   r   r'   r   )r0   r6   
down_blockrj   rk   rl   rm   r7   s           r2   r8   zHunyuanImageEncoder2D.forward)  s
   LLO ** 	"J$$&4+F+F55j!DqM		"   "t'B'B11$..!DAq!A WW
1a66!Q$//14??AqINNSTNUMM!a MM!8|r3   )r9   T)r:   r;   r<   r=   r>   r   r?   boolr&   rp   rq   r8   r@   rA   s   @r2   r   r      s}    ( $)-:,:, :, "#s(O	:,
 :, $':, :, #':,x %,, r3   r   c                        e Zd ZdZ	 	 ddededeedf   dededed	ef fd
Zde	j                  de	j                  fdZ xZS )HunyuanImageDecoder2DaJ  
    Decoder network that reconstructs output from latent representation.

    Args:
    z_channels : int
        Number of latent channels.
    out_channels : int
        Number of output channels.
    block_out_channels : Tuple[int, ...]
        Output channels for each block.
    num_res_blocks : int
        Number of residual blocks per block.
    spatial_compression_ratio : int
        Spatial upsampling factor.
    upsample_match_channel : bool
        Whether to match channels during upsampling.
    non_linearity (str): Type of non-linearity to use. Default is "silu".
    r   r   r   .r   r   upsample_match_channelr   c                    t         |           |d   |z  dk7  rt        d|d    d|       || _        || _        || _        |d   |z  | _        || _        t        |      | _	        t        j                  ||d   ddd      | _        t        |d   d      | _        |d   }t        j                         | _        t#        t%        |            D ]  }	||	   }
t#        | j
                  dz         D ]*  }| j                   j'                  t)        ||
             |
}, |	t+        j,                  |      k  sf|	t%        |      dz
  k7  sx|r||	dz      }
| j                   j'                  t/        ||
             |
} t        j0                  d	|d
   dd      | _        t        j                  |d
   |ddd      | _        d| _        y )Nr   zXblock_out_channels[0] should be divisible by z_channels but has block_out_channels[0] = z and z_channels = r   r   r!   r   r}   r   r   r   Tr   F)r%   r&   r]   r   r   r   repeatr   r   r'   r(   r+   r   rz   r   r   	up_blocksr~   r   r   r   r   r   rs   r)   r   r   r   )r0   r   r   r   r   r   r   r   r   r   r   r   r1   s               r2   r&   zHunyuanImageDecoder2D.__init__W  s    	a :-2jk}~  lA  kB  BT  U_  T`  a  %"4,(+z9)B&*=9yy-?-BPQZ[efg .:LQ:O\]^ .a0s-./ 	5A 21 54..23 5%%+8HWhi $5 	5 277455!sCU?VYZ?Z:Z)(:1q5(A%%%&:;KM^&_`#4 	5 ASTVAW]ajno		"4R"8,TU^_ijk&+#r3   r6   r   c                    | j                  |      |j                  | j                  d      z   }t        j                         r)| j
                  r| j                  | j                  |      }n| j                  |      }| j                  D ]=  }t        j                         r| j
                  r| j                  ||      }6 ||      }? | j                  |      }| j                  |      }| j                  |      }|S )Nr   rw   )r   rx   r   rp   r   r   r   r   r   r   r'   r   )r0   r6   ri   up_blocks       r2   r8   zHunyuanImageDecoder2D.forward  s    LLOa11$++11MM  "t'B'B11$..!DAq!A 	 H$$&4+F+F55hBQK		 
 MM!a MM!r3   )Tr9   )r:   r;   r<   r=   r>   r   r   r?   r&   rp   rq   r8   r@   rA   s   @r2   r   r   C  s}    4 (,#1,1, 1, "#s(O	1,
 1, $'1, !%1, 1,f %,, r3   r   c                   $    e Zd ZdZdZe	 	 	 d)dedededeedf   d	ed
ededede	de	ddf fd       Z
	 	 d*dee   dee   ddfdZd+dZd+dZd+dZdej"                  fdZe	 d,dej"                  de	deeee   f   fd       Zd,dej"                  de	fdZed,dej"                  de	deeej"                  f   fd       Zdej"                  dej"                  d edej"                  fd!Zdej"                  dej"                  d edej"                  fd"Zdej"                  dej"                  fd#Zd,dej"                  de	deeej"                  f   fd$Z	 	 	 d-d%ej"                  d&e	de	d'eej>                     deeej"                  f   f
d(Z  xZ!S ).AutoencoderKLHunyuanImagez
    A VAE model for 2D images with spatial tiling support.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).
    FNr   r   latent_channelsr   .layers_per_blockr   sample_sizescaling_factorr   r   r   c           	          t         |           t        ||||||	      | _        t	        ||t        t        |            |||
      | _        d| _        d| _	        || _
        ||z  | _        d| _        y )N)r   r   r   r   r   r   )r   r   r   r   r   r   Fg      ?)r%   r&   r   encoderr   listreverseddecoderuse_slicing
use_tilingtile_sample_min_sizetile_latent_min_sizetile_overlap_factor)r0   r   r   r   r   r   r   r   r   r   r   r1   s              r2   r&   z"AutoencoderKLHunyuanImage.__init__  s     	,#&1+&?%=
 -&%#H-?$@A+&?#9
 ! %0!$/3L$L!#' r3   r   r   c                     d| _         |xs | j                  | _        |xs | j                  | _        | j                  | j                  j                  z  | _        y)a{  
        Enable spatial tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles
        to compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to
        allow processing larger images.

        Args:
            tile_sample_min_size (`int`, *optional*):
                The minimum size required for a sample to be separated into tiles across the spatial dimension.
            tile_overlap_factor (`float`, *optional*):
                The overlap factor required for a latent to be separated into tiles across the spatial dimension.
        TN)r   r   r   configr   r   )r0   r   r   s      r2   enable_tilingz'AutoencoderKLHunyuanImage.enable_tiling  sN      $8$UD<U<U!#6#R$:R:R $($=$=AfAf$f!r3   c                     d| _         y)z
        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
        decoding in one step.
        FN)r   r0   s    r2   disable_tilingz(AutoencoderKLHunyuanImage.disable_tiling  s    
  r3   c                     d| _         y)z
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        TNr   r   s    r2   enable_slicingz(AutoencoderKLHunyuanImage.enable_slicing  s    
  r3   c                     d| _         y)z
        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
        decoding in one step.
        FNr   r   s    r2   disable_slicingz)AutoencoderKLHunyuanImage.disable_slicing  s    
 !r3   r6   c                     |j                   \  }}}}| j                  r/|| j                  kD  s|| j                  kD  r| j                  |      S | j	                  |      }|S r5   )rK   r   r   tiled_encoder   )r0   r6   rU   r   rW   rX   encs          r2   _encodez!AutoencoderKLHunyuanImage._encode  sX    23''/
L&%??(A(A AVdNgNgEg$$Q''ll1o
r3   return_dictc                 (   | j                   rU|j                  d   dkD  rC|j                  d      D cg c]  }| j                  |       }}t	        j
                  |      }n| j                  |      }t        |      }|s|fS t        |      S c c}w )a  
        Encode a batch of images into latents.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded videos. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        r   r   )latent_dist)r   rK   splitr   rp   catr   r   )r0   r6   r   x_sliceencoded_slicesri   	posteriors          r2   encodez AutoencoderKLHunyuanImage.encode	  s      
QCD771:Ndll73NNN		.)AQA03	<"y99 Os   Bzc                     |j                   \  }}}}| j                  r1|| j                  kD  s|| j                  kD  r| j                  ||      S | j	                  |      }|s|fS t        |      S )Nr   sample)rK   r   r   tiled_decoder   r   )r0   r   r   rU   r   rW   rX   decs           r2   _decodez!AutoencoderKLHunyuanImage._decode$  sm    23''/
L&%??(A(A AVdNgNgEg$$QK$@@ll1o6MC((r3   c                 :   | j                   r_|j                  d   dkD  rM|j                  d      D cg c]  }| j                  |      j                   }}t        j                  |      }n| j                  |      j                  }|s|fS t        |      S c c}w )a  
        Decode a batch of images.

        Args:
            z (`torch.Tensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        r   r   r   )r   rK   r   r   r   rp   r   r   )r0   r   r   z_slicedecoded_slicesdecodeds         r2   decodez AutoencoderKLHunyuanImage.decode2  s     
QJK''RS*Uwdll73::UNUii/Gll1o,,G:G,, Vs   "Babblend_extentc                     t        |j                  d   |j                  d   |      }t        |      D ]A  }|d d d d | |z   d d f   d||z  z
  z  |d d d d |d d f   ||z  z  z   |d d d d |d d f<   C |S )Nr   minrK   r~   )r0   r   r   r   ys        r2   blend_vz!AutoencoderKLHunyuanImage.blend_vL  s    1772;\B|$ 	Aa\MA$5q89Q\AQ=QRUVWXZ[]^`aWaUbL V AaAqjM	 r3   c                     t        |j                  d   |j                  d   |      }t        |      D ]A  }|d d d d d d | |z   f   d||z  z
  z  |d d d d d d |f   ||z  z  z   |d d d d d d |f<   C |S )Nr   r   r   )r0   r   r   r   r6   s        r2   blend_hz!AutoencoderKLHunyuanImage.blend_hT  s    1772;\B|$ 	AaA}q'889Q\AQ=QRUVWXZ[]^`aWaUbL V AaAqjM	 r3   c                 z   |j                   \  }}}}}t        | j                  d| j                  z
  z        }t        | j                  | j                  z        }| j                  |z
  }g }t        d||      D ]w  }	g }
t        d||      D ]R  }|dddddd|	|	| j                  z   ||| j                  z   f   }| j                  |      }|
j                  |       T |j                  |
       y g }t        |      D ]  \  }	}
g }t        |
      D ]g  \  }}|	dkD  r| j                  ||	dz
     |   ||      }|dkD  r| j                  |
|dz
     ||      }|j                  |ddddddd|d|f          i |j                  t        j                  |d              t        j                  |d      }|S )a  
        Encode input using spatial tiling strategy.

        Args:
            x (`torch.Tensor`): Input tensor of shape (B, C, T, H, W).

        Returns:
            `torch.Tensor`:
                The latent representation of the encoded images.
        r   r   Nr   rd   r   )rK   r>   r   r   r   r~   r   r   	enumerater   r   rp   r   )r0   r6   r   rW   rX   overlap_sizer   	row_limitrowsr   rowjtileresult_rows
result_rowmomentss                   r2   r   z&AutoencoderKLHunyuanImage.tiled_encode\  s    "#1a444D<T<T8TUV444t7O7OOP--<	q&,/ 	AC1e\2 !Aq!a$*C*C&C"CQTMfMfIfEffg||D)

4 ! KK	 o 	>FAsJ$S> I4q5<<QUAlKDq5<<AE
D,GD!!$q!Q

JYJ'F"GHI uyy<=	> ))KR0r3   c           
         |j                   \  }}}}t        | j                  d| j                  z
  z        }t        | j                  | j                  z        }| j                  |z
  }g }	t        d||      D ]t  }
g }t        d||      D ]O  }|dddd|
|
| j                  z   ||| j                  z   f   }| j                  |      }|j                  |       Q |	j                  |       v g }t        |	      D ]  \  }
}g }t        |      D ]d  \  }}|
dkD  r| j                  |	|
dz
     |   ||      }|dkD  r| j                  ||dz
     ||      }|j                  |ddddd|d|f          f |j                  t        j                  |d              t        j                  |d      }|s|fS t        |      S )a  
        Decode latent using spatial tiling strategy.

        Args:
            z (`torch.Tensor`): Latent tensor of shape (B, C, H, W).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        r   r   Nr   rd   r   r   )rK   r>   r   r   r   r~   r   r   r   r   r   rp   r   r   )r0   r   r   r   rW   rX   r   r   r   r   r   r   r   r   r   r   r   r   s                     r2   r   z&AutoencoderKLHunyuanImage.tiled_decode  s     gg1fe444D<T<T8TUV444t7O7OOP--<	q&,/ 	AC1e\2 $Aq1t'@'@#@@!a$JcJcFcBccd,,t,

7#$ KK	 o 	>FAsJ$S> F4q5<<QUAlKDq5<<AE
D,GD!!$q!ZiZ)'C"DEF uyy<=	> ii,6MC((r3   r   sample_posterior	generatorc                     | j                  |      j                  }|r|j                  |      }n|j                         }| j	                  ||      }|S )z
        Args:
            sample (`torch.Tensor`): Input sample.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
        )r   r   )r   r   r   moder   )r0   r   r   r   r   r   r   r   s           r2   r8   z!AutoencoderKLHunyuanImage.forward  sQ     KK'33	  9 5A Akk!k5
r3   )NTT)NN)r   N)T)FTN)"r:   r;   r<   r=    _supports_gradient_checkpointingr   r>   r   floatr   r&   r   r   r   r   r   rp   rq   r   r   r   r   r   r   r   r   r   r   r   r   r   	Generatorr8   r@   rA   s   @r2   r   r     s    (-$  !%)-'+)()( )( 	)(
 "#s(O)( )( $')( )( )( #')( !%)( 
)( )(Z /3/3g&smg &e_g 
	g*  !	 	 37::,0:	"E*F$GG	H: :4) )D ) - -4 -5X]XdXdIdCe - -2 %,, c ell  %,, c ell &ell &u|| &P*)ell *) *)}^c^j^jOjIk *)` "' /3  	
 EOO, 
}ell*	+r3   r   )+typingr   r   r   numpyr   rp   torch.nnr(   torch.nn.functional
functionalrO   torch.utils.checkpointconfiguration_utilsr   r   loadersr	   utilsr
   utils.accelerate_utilsr   activationsr   modeling_outputsr   modeling_utilsr   vaer   r   
get_loggerr:   loggerModuler   rC   rZ   rs   rz   r   r   r    r3   r2   <module>r     s    * )       B -  8 ( 2 ' < 
		H	%*bii *Z' 'T!RYY !H299 @299 @aBII aHWBII Wth
K9O hr3   