
from typing import Optional, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint

from ...configuration_utils import ConfigMixin, register_to_config
from ...utils import logging
from ...utils.accelerate_utils import apply_forward_hook
from ..activations import get_activation
from ..modeling_outputs import AutoencoderKLOutput
from ..modeling_utils import ModelMixin
from .vae import DecoderOutput, DiagonalGaussianDistribution


logger = logging.get_logger(__name__)


class HunyuanVideo15CausalConv3d(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int, int]] = 3,
        stride: Union[int, Tuple[int, int, int]] = 1,
        padding: Union[int, Tuple[int, int, int]] = 1,
        dilation: Union[int, Tuple[int, int, int]] = 1,
        bias: bool = True,
        pad_mode: str = "replicate",
    ) -> None:
        super().__init__()

        kernel_size = (kernel_size, kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size

        self.pad_mode = pad_mode
        # Causal padding: width and height are padded symmetrically, time is padded only on the left
        # so a frame never sees information from future frames.
        self.time_causal_padding = (
            kernel_size[2] // 2,
            kernel_size[2] // 2,
            kernel_size[1] // 2,
            kernel_size[1] // 2,
            kernel_size[0] - 1,
            0,
        )

        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride, dilation=dilation, bias=bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = F.pad(hidden_states, self.time_causal_padding, mode=self.pad_mode)
        return self.conv(hidden_states)


class HunyuanVideo15RMS_norm(nn.Module):
    r"""
    A custom RMS normalization layer.

    Args:
        dim (int): The number of dimensions to normalize over.
        channel_first (bool, optional): Whether the input tensor has channels as the first dimension.
            Default is True.
        images (bool, optional): Whether the input represents image data. Default is True.
        bias (bool, optional): Whether to include a learnable bias term. Default is False.
    """

    def __init__(self, dim: int, channel_first: bool = True, images: bool = True, bias: bool = False) -> None:
        super().__init__()
        broadcastable_dims = (1, 1, 1) if not images else (1, 1)
        shape = (dim, *broadcastable_dims) if channel_first else (dim,)

        self.channel_first = channel_first
        self.scale = dim**0.5
        self.gamma = nn.Parameter(torch.ones(shape))
        self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0

    def forward(self, x):
        return F.normalize(x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma + self.bias


class HunyuanVideo15AttnBlock(nn.Module):
    def __init__(self, in_channels: int) -> None:
        super().__init__()
        self.in_channels = in_channels

        self.norm = HunyuanVideo15RMS_norm(in_channels, images=False)
        self.to_q = nn.Conv3d(in_channels, in_channels, kernel_size=1)
        self.to_k = nn.Conv3d(in_channels, in_channels, kernel_size=1)
        self.to_v = nn.Conv3d(in_channels, in_channels, kernel_size=1)
        self.proj_out = nn.Conv3d(in_channels, in_channels, kernel_size=1)

    @staticmethod
    def prepare_causal_attention_mask(
        n_frame: int, n_hw: int, dtype: torch.dtype, device: torch.device, batch_size: Optional[int] = None
    ) -> torch.Tensor:
        """Prepare a causal attention mask for 3D videos.

        Args:
            n_frame (int): Number of frames (temporal length).
            n_hw (int): Product of height and width.
            dtype: Desired mask dtype.
            device: Device for the mask.
            batch_size (int, optional): If set, expands for batch.

        Returns:
            torch.Tensor: Causal attention mask.
        """
        seq_len = n_frame * n_hw
        mask = torch.full((seq_len, seq_len), float("-inf"), dtype=dtype, device=device)
        for i in range(seq_len):
            i_frame = i // n_hw
            # every token may attend to all tokens up to and including its own frame
            mask[i, : (i_frame + 1) * n_hw] = 0
        if batch_size is not None:
            mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
        return mask

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        identity = x

        x = self.norm(x)
        query = self.to_q(x)
        key = self.to_k(x)
        value = self.to_v(x)

        batch_size, channels, frames, height, width = query.shape
        query = query.reshape(batch_size, channels, frames * height * width).permute(0, 2, 1).unsqueeze(1).contiguous()
        key = key.reshape(batch_size, channels, frames * height * width).permute(0, 2, 1).unsqueeze(1).contiguous()
        value = value.reshape(batch_size, channels, frames * height * width).permute(0, 2, 1).unsqueeze(1).contiguous()

        attention_mask = self.prepare_causal_attention_mask(
            frames, height * width, query.dtype, query.device, batch_size=batch_size
        )
        x = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask.unsqueeze(1))
        x = x.squeeze(1).reshape(batch_size, frames, height, width, channels).permute(0, 4, 1, 2, 3)
        x = self.proj_out(x)

        return x + identity


class HunyuanVideo15Upsample(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, add_temporal_upsample: bool = True) -> None:
        super().__init__()
        factor = 8 if add_temporal_upsample else 4

        self.conv = HunyuanVideo15CausalConv3d(in_channels, out_channels * factor, kernel_size=3)
        self.add_temporal_upsample = add_temporal_upsample
        self.repeats = out_channels * factor // in_channels

    @staticmethod
    def _dcae_upsample_rearrange(tensor: torch.Tensor, r1: int = 1, r2: int = 2, r3: int = 2) -> torch.Tensor:
        """
        Convert (b, r1*r2*r3*c, f, h, w) -> (b, c, r1*f, r2*h, r3*w)

        Args:
            tensor: Input tensor of shape (b, r1*r2*r3*c, f, h, w)
            r1: temporal upsampling factor
            r2: height upsampling factor
            r3: width upsampling factor
        """
        b, packed_c, f, h, w = tensor.shape
        c = packed_c // (r1 * r2 * r3)
        tensor = tensor.view(b, r1, r2, r3, c, f, h, w)
        tensor = tensor.permute(0, 4, 5, 1, 6, 2, 7, 3)
        return tensor.reshape(b, c, r1 * f, r2 * h, r3 * w)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden_states = self.conv(x)

        if self.add_temporal_upsample:
            # The first frame is only upsampled spatially so the sequence stays causal
            # (f frames become 1 + 2 * (f - 1) frames).
            h_first = hidden_states[:, :, :1, :, :]
            h_first = self._dcae_upsample_rearrange(h_first, r1=1, r2=2, r3=2)
            h_first = h_first[:, : h_first.shape[1] // 2]
            h_next = hidden_states[:, :, 1:, :, :]
            h_next = self._dcae_upsample_rearrange(h_next, r1=2, r2=2, r3=2)
            hidden_states = torch.cat([h_first, h_next], dim=2)

            # Channel-repeated pixel-shuffle of the input acts as a parameter-free shortcut branch.
            x_first = x[:, :, :1, :, :]
            x_first = self._dcae_upsample_rearrange(x_first, r1=1, r2=2, r3=2)
            x_first = x_first.repeat_interleave(self.repeats // 2, dim=1)
            x_next = x[:, :, 1:, :, :]
            x_next = self._dcae_upsample_rearrange(x_next, r1=2, r2=2, r3=2)
            x_next = x_next.repeat_interleave(self.repeats, dim=1)
            shortcut = torch.cat([x_first, x_next], dim=2)
        else:
            hidden_states = self._dcae_upsample_rearrange(hidden_states, r1=1, r2=2, r3=2)
            shortcut = self._dcae_upsample_rearrange(x.repeat_interleave(self.repeats, dim=1), r1=1, r2=2, r3=2)

        return hidden_states + shortcut


class HunyuanVideo15Downsample(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, add_temporal_downsample: bool = True) -> None:
        super().__init__()
        factor = 8 if add_temporal_downsample else 4

        self.conv = HunyuanVideo15CausalConv3d(in_channels, out_channels // factor, kernel_size=3)
        self.add_temporal_downsample = add_temporal_downsample
        self.group_size = in_channels * factor // out_channels

    @staticmethod
    def _dcae_downsample_rearrange(tensor: torch.Tensor, r1: int = 1, r2: int = 2, r3: int = 2) -> torch.Tensor:
        """
        Convert (b, c, r1*f, r2*h, r3*w) -> (b, r1*r2*r3*c, f, h, w)

        This packs spatial/temporal dimensions into channels (opposite of upsample)
        """
        b, c, packed_f, packed_h, packed_w = tensor.shape
        f, h, w = packed_f // r1, packed_h // r2, packed_w // r3
        tensor = tensor.view(b, c, f, r1, h, r2, w, r3)
        tensor = tensor.permute(0, 3, 5, 7, 1, 2, 4, 6)
        return tensor.reshape(b, r1 * r2 * r3 * c, f, h, w)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden_states = self.conv(x)

        if self.add_temporal_downsample:
            # The first frame is only downsampled spatially so the sequence stays causal
            # (1 + 2 * k frames become 1 + k frames).
            h_first = hidden_states[:, :, :1, :, :]
            h_first = self._dcae_downsample_rearrange(h_first, r1=1, r2=2, r3=2)
            h_first = torch.cat([h_first, h_first], dim=1)
            h_next = hidden_states[:, :, 1:, :, :]
            h_next = self._dcae_downsample_rearrange(h_next, r1=2, r2=2, r3=2)
            hidden_states = torch.cat([h_first, h_next], dim=2)

            # Channel-group averaging of the pixel-unshuffled input acts as a parameter-free shortcut branch.
            x_first = x[:, :, :1, :, :]
            x_first = self._dcae_downsample_rearrange(x_first, r1=1, r2=2, r3=2)
            batch_size, channels, frames, height, width = x_first.shape
            x_first = x_first.view(
                batch_size, channels // (self.group_size // 2), self.group_size // 2, frames, height, width
            ).mean(2)
            x_next = x[:, :, 1:, :, :]
            x_next = self._dcae_downsample_rearrange(x_next, r1=2, r2=2, r3=2)
            batch_size, channels, frames, height, width = x_next.shape
            x_next = x_next.view(
                batch_size, channels // self.group_size, self.group_size, frames, height, width
            ).mean(2)
            shortcut = torch.cat([x_first, x_next], dim=2)
        else:
            hidden_states = self._dcae_downsample_rearrange(hidden_states, r1=1, r2=2, r3=2)
            shortcut = self._dcae_downsample_rearrange(x, r1=1, r2=2, r3=2)
            batch_size, channels, frames, height, width = shortcut.shape
            shortcut = shortcut.view(
                batch_size, channels // self.group_size, self.group_size, frames, height, width
            ).mean(2)

        return hidden_states + shortcut


class HunyuanVideo15ResnetBlock(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: Optional[int] = None,
        non_linearity: str = "swish",
    ) -> None:
        super().__init__()
        out_channels = out_channels or in_channels

        self.nonlinearity = get_activation(non_linearity)

        self.norm1 = HunyuanVideo15RMS_norm(in_channels, images=False)
        self.conv1 = HunyuanVideo15CausalConv3d(in_channels, out_channels, kernel_size=3)
        self.norm2 = HunyuanVideo15RMS_norm(out_channels, images=False)
        self.conv2 = HunyuanVideo15CausalConv3d(out_channels, out_channels, kernel_size=3)

        self.conv_shortcut = None
        if in_channels != out_channels:
            self.conv_shortcut = nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        residual = hidden_states

        hidden_states = self.norm1(hidden_states)
        hidden_states = self.nonlinearity(hidden_states)
        hidden_states = self.conv1(hidden_states)

        hidden_states = self.norm2(hidden_states)
        hidden_states = self.nonlinearity(hidden_states)
        hidden_states = self.conv2(hidden_states)

        if self.conv_shortcut is not None:
            residual = self.conv_shortcut(residual)

        return hidden_states + residual


class HunyuanVideo15MidBlock(nn.Module):
    def __init__(
        self,
        in_channels: int,
        num_layers: int = 1,
        add_attention: bool = True,
    ) -> None:
        super().__init__()
        self.add_attention = add_attention

        # There is always at least one resnet; attention/resnet pairs are interleaved after it.
        resnets = [HunyuanVideo15ResnetBlock(in_channels=in_channels, out_channels=in_channels)]
        attentions = []
        for _ in range(num_layers):
            if self.add_attention:
                attentions.append(HunyuanVideo15AttnBlock(in_channels))
            else:
                attentions.append(None)
            resnets.append(HunyuanVideo15ResnetBlock(in_channels=in_channels, out_channels=in_channels))

        self.attentions = nn.ModuleList(attentions)
        self.resnets = nn.ModuleList(resnets)

        self.gradient_checkpointing = False

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.resnets[0](hidden_states)
        for attn, resnet in zip(self.attentions, self.resnets[1:]):
            if attn is not None:
                hidden_states = attn(hidden_states)
            hidden_states = resnet(hidden_states)
        return hidden_states


class HunyuanVideo15DownBlock3D(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_layers: int = 1,
        downsample_out_channels: Optional[int] = None,
        add_temporal_downsample: bool = True,
    ) -> None:
        super().__init__()

        resnets = []
        for i in range(num_layers):
            input_channels = in_channels if i == 0 else out_channels
            resnets.append(HunyuanVideo15ResnetBlock(in_channels=input_channels, out_channels=out_channels))
        self.resnets = nn.ModuleList(resnets)

        if downsample_out_channels is not None:
            self.downsamplers = nn.ModuleList(
                [
                    HunyuanVideo15Downsample(
                        out_channels,
                        out_channels=downsample_out_channels,
                        add_temporal_downsample=add_temporal_downsample,
                    )
                ]
            )
        else:
            self.downsamplers = None

        self.gradient_checkpointing = False

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for resnet in self.resnets:
            hidden_states = resnet(hidden_states)
        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)
        return hidden_states


class HunyuanVideo15UpBlock3D(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_layers: int = 1,
        upsample_out_channels: Optional[int] = None,
        add_temporal_upsample: bool = True,
    ) -> None:
        super().__init__()

        resnets = []
        for i in range(num_layers):
            input_channels = in_channels if i == 0 else out_channels
            resnets.append(HunyuanVideo15ResnetBlock(in_channels=input_channels, out_channels=out_channels))
        self.resnets = nn.ModuleList(resnets)

        if upsample_out_channels is not None:
            self.upsamplers = nn.ModuleList(
                [
                    HunyuanVideo15Upsample(
                        out_channels,
                        out_channels=upsample_out_channels,
                        add_temporal_upsample=add_temporal_upsample,
                    )
                ]
            )
        else:
            self.upsamplers = None

        self.gradient_checkpointing = False

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        if torch.is_grad_enabled() and self.gradient_checkpointing:
            for resnet in self.resnets:
                hidden_states = self._gradient_checkpointing_func(resnet, hidden_states)
        else:
            for resnet in self.resnets:
                hidden_states = resnet(hidden_states)

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                hidden_states = upsampler(hidden_states)

        return hidden_states


class HunyuanVideo15Encoder3D(nn.Module):
    r"""
    3D vae encoder for HunyuanImageRefiner.
    """

    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 64,
        # NOTE: the exact default channel widths could not be recovered from the damaged source; the
        # tuple below is an assumption consistent with the 16x spatial compression default.
        block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024, 1024),
        layers_per_block: int = 2,
        temporal_compression_ratio: int = 4,
        spatial_compression_ratio: int = 16,
        downsample_match_channel: bool = True,
    ) -> None:
        super().__init__()
        self.layers_per_block = layers_per_block
        self.group_size = block_out_channels[-1] // out_channels

        self.conv_in = HunyuanVideo15CausalConv3d(in_channels, block_out_channels[0], kernel_size=3)
        self.mid_block = None
        self.down_blocks = nn.ModuleList([])

        input_channel = block_out_channels[0]
        for i in range(len(block_out_channels)):
            add_spatial_downsample = i < np.log2(spatial_compression_ratio)
            output_channel = block_out_channels[i]
            if not add_spatial_downsample:
                down_block = HunyuanVideo15DownBlock3D(
                    in_channels=input_channel,
                    out_channels=output_channel,
                    num_layers=self.layers_per_block,
                    downsample_out_channels=None,
                    add_temporal_downsample=False,
                )
                input_channel = output_channel
            else:
                add_temporal_downsample = i >= np.log2(spatial_compression_ratio // temporal_compression_ratio)
                downsample_out_channels = block_out_channels[i + 1] if downsample_match_channel else output_channel
                down_block = HunyuanVideo15DownBlock3D(
                    in_channels=input_channel,
                    out_channels=output_channel,
                    num_layers=self.layers_per_block,
                    downsample_out_channels=downsample_out_channels,
                    add_temporal_downsample=add_temporal_downsample,
                )
                input_channel = downsample_out_channels
            self.down_blocks.append(down_block)

        self.mid_block = HunyuanVideo15MidBlock(block_out_channels[-1])

        self.norm_out = HunyuanVideo15RMS_norm(block_out_channels[-1], images=False)
        self.conv_act = nn.SiLU()
        self.conv_out = HunyuanVideo15CausalConv3d(block_out_channels[-1], out_channels, kernel_size=3)

        self.gradient_checkpointing = False

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.conv_in(hidden_states)

        if torch.is_grad_enabled() and self.gradient_checkpointing:
            for down_block in self.down_blocks:
                hidden_states = self._gradient_checkpointing_func(down_block, hidden_states)
            if self.mid_block is not None:
                hidden_states = self._gradient_checkpointing_func(self.mid_block, hidden_states)
        else:
            for down_block in self.down_blocks:
                hidden_states = down_block(hidden_states)
            hidden_states = self.mid_block(hidden_states)

        # Channel-group averaging provides a parameter-free shortcut around the output projection.
        batch_size, _, frame, height, width = hidden_states.shape
        short_cut = hidden_states.view(batch_size, -1, self.group_size, frame, height, width).mean(2)

        hidden_states = self.norm_out(hidden_states)
        hidden_states = self.conv_act(hidden_states)
        hidden_states = self.conv_out(hidden_states)
        hidden_states = hidden_states + short_cut

        return hidden_states


class HunyuanVideo15Decoder3D(nn.Module):
    r"""
    Causal decoder for 3D video-like data used for HunyuanImage-1.5 Refiner.
    """

    def __init__(
        self,
        in_channels: int = 32,
        out_channels: int = 3,
        # NOTE: the exact default channel widths could not be recovered from the damaged source; the
        # tuple below mirrors the encoder assumption in reverse order.
        block_out_channels: Tuple[int, ...] = (1024, 1024, 512, 256, 128),
        layers_per_block: int = 2,
        temporal_compression_ratio: int = 4,
        spatial_compression_ratio: int = 16,
        upsample_match_channel: bool = True,
    ) -> None:
        super().__init__()
        self.layers_per_block = layers_per_block
        self.in_channels = in_channels
        self.repeat = block_out_channels[0] // self.in_channels

        self.conv_in = HunyuanVideo15CausalConv3d(self.in_channels, block_out_channels[0], kernel_size=3)
        self.up_blocks = nn.ModuleList([])
        self.mid_block = HunyuanVideo15MidBlock(block_out_channels[0])

        input_channel = block_out_channels[0]
        for i in range(len(block_out_channels)):
            output_channel = block_out_channels[i]
            add_spatial_upsample = i < np.log2(spatial_compression_ratio)
            add_temporal_upsample = i < np.log2(temporal_compression_ratio)
            if add_spatial_upsample or add_temporal_upsample:
                upsample_out_channels = block_out_channels[i + 1] if upsample_match_channel else output_channel
                up_block = HunyuanVideo15UpBlock3D(
                    in_channels=input_channel,
                    out_channels=output_channel,
                    num_layers=self.layers_per_block + 1,
                    upsample_out_channels=upsample_out_channels,
                    add_temporal_upsample=add_temporal_upsample,
                )
                input_channel = upsample_out_channels
            else:
                up_block = HunyuanVideo15UpBlock3D(
                    in_channels=input_channel,
                    out_channels=output_channel,
                    num_layers=self.layers_per_block + 1,
                    upsample_out_channels=None,
                    add_temporal_upsample=False,
                )
                input_channel = output_channel
            self.up_blocks.append(up_block)

        self.norm_out = HunyuanVideo15RMS_norm(block_out_channels[-1], images=False)
        self.conv_act = nn.SiLU()
        self.conv_out = HunyuanVideo15CausalConv3d(block_out_channels[-1], out_channels, kernel_size=3)

        self.gradient_checkpointing = False

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Input projection plus a channel-repeated copy of the latent as a parameter-free shortcut.
        hidden_states = self.conv_in(hidden_states) + hidden_states.repeat_interleave(self.repeat, dim=1)

        if torch.is_grad_enabled() and self.gradient_checkpointing:
            hidden_states = self._gradient_checkpointing_func(self.mid_block, hidden_states)
            for up_block in self.up_blocks:
                hidden_states = self._gradient_checkpointing_func(up_block, hidden_states)
        else:
            hidden_states = self.mid_block(hidden_states)
            for up_block in self.up_blocks:
                hidden_states = up_block(hidden_states)

        hidden_states = self.norm_out(hidden_states)
        hidden_states = self.conv_act(hidden_states)
        hidden_states = self.conv_out(hidden_states)

        return hidden_states


class AutoencoderKLHunyuanVideo15(ModelMixin, ConfigMixin):
    r"""
    A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos. Used
    for HunyuanVideo-1.5.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
    for all models (such as downloading or saving).
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 3,
        latent_channels: int = 32,
        # NOTE: the default block widths and scaling factor below could not be fully recovered from the
        # damaged source; they are placeholders consistent with the surrounding architecture. Real
        # checkpoints provide their own values through the saved config.
        block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024, 1024),
        layers_per_block: int = 2,
        spatial_compression_ratio: int = 16,
        temporal_compression_ratio: int = 4,
        downsample_match_channel: bool = True,
        upsample_match_channel: bool = True,
        scaling_factor: float = 1.0,
    ) -> None:
        super().__init__()

        self.encoder = HunyuanVideo15Encoder3D(
            in_channels=in_channels,
            out_channels=latent_channels * 2,
            block_out_channels=block_out_channels,
            layers_per_block=layers_per_block,
            temporal_compression_ratio=temporal_compression_ratio,
            spatial_compression_ratio=spatial_compression_ratio,
            downsample_match_channel=downsample_match_channel,
        )

        self.decoder = HunyuanVideo15Decoder3D(
            in_channels=latent_channels,
            out_channels=out_channels,
            block_out_channels=list(reversed(block_out_channels)),
            layers_per_block=layers_per_block,
            temporal_compression_ratio=temporal_compression_ratio,
            spatial_compression_ratio=spatial_compression_ratio,
            upsample_match_channel=upsample_match_channel,
        )

        self.spatial_compression_ratio = spatial_compression_ratio
        self.temporal_compression_ratio = temporal_compression_ratio

        self.use_slicing = False
        self.use_tiling = False

        # NOTE: typical tiling defaults; the exact original values are not recoverable from this copy.
        self.tile_sample_min_height = 256
        self.tile_sample_min_width = 256
        self.tile_latent_min_height = self.tile_sample_min_height // spatial_compression_ratio
        self.tile_latent_min_width = self.tile_sample_min_width // spatial_compression_ratio
        self.tile_overlap_factor = 0.25

    def enable_tiling(
        self,
        tile_sample_min_height: Optional[int] = None,
        tile_sample_min_width: Optional[int] = None,
        tile_latent_min_height: Optional[int] = None,
        tile_latent_min_width: Optional[int] = None,
        tile_overlap_factor: Optional[float] = None,
    ) -> None:
        r"""
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to
        allow processing larger images.

        Args:
            tile_sample_min_height (`int`, *optional*):
                The minimum height required for a sample to be separated into tiles across the height dimension.
            tile_sample_min_width (`int`, *optional*):
                The minimum width required for a sample to be separated into tiles across the width dimension.
            tile_latent_min_height (`int`, *optional*):
                The minimum height required for a latent to be separated into tiles across the height dimension.
            tile_latent_min_width (`int`, *optional*):
                The minimum width required for a latent to be separated into tiles across the width dimension.
        """
        self.use_tiling = True
        self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
        self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
        self.tile_latent_min_height = tile_latent_min_height or self.tile_latent_min_height
        self.tile_latent_min_width = tile_latent_min_width or self.tile_latent_min_width
        self.tile_overlap_factor = tile_overlap_factor or self.tile_overlap_factor

    def disable_tiling(self) -> None:
        r"""
        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
        decoding in one step.
        """
        self.use_tiling = False

    def enable_slicing(self) -> None:
        r"""
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
        self.use_slicing = True

    def disable_slicing(self) -> None:
        r"""
        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
        decoding in one step.
        """
        self.use_slicing = False

    def _encode(self, x: torch.Tensor) -> torch.Tensor:
        _, _, _, height, width = x.shape

        if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
            return self.tiled_encode(x)

        x = self.encoder(x)
        return x

    @apply_forward_hook
    def encode(
        self, x: torch.Tensor, return_dict: bool = True
    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
        r"""
        Encode a batch of images into latents.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded videos. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        """
        if self.use_slicing and x.shape[0] > 1:
            encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
            h = torch.cat(encoded_slices)
        else:
            h = self._encode(x)

        posterior = DiagonalGaussianDistribution(h)

        if not return_dict:
            return (posterior,)
        return AutoencoderKLOutput(latent_dist=posterior)

    def _decode(self, z: torch.Tensor) -> torch.Tensor:
        _, _, _, height, width = z.shape

        if self.use_tiling and (width > self.tile_latent_min_width or height > self.tile_latent_min_height):
            return self.tiled_decode(z)

        dec = self.decoder(z)
        return dec

    @apply_forward_hook
    def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
        r"""
        Decode a batch of images.

        Args:
            z (`torch.Tensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        """
        if self.use_slicing and z.shape[0] > 1:
            decoded_slices = [self._decode(z_slice) for z_slice in z.split(1)]
            decoded = torch.cat(decoded_slices)
        else:
            decoded = self._decode(z)

        if not return_dict:
            return (decoded,)
        return DecoderOutput(sample=decoded)

    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
        for y in range(blend_extent):
            b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
                y / blend_extent
            )
        return b

    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        blend_extent = min(a.shape[4], b.shape[4], blend_extent)
        for x in range(blend_extent):
            b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
                x / blend_extent
            )
        return b

    def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        blend_extent = min(a.shape[2], b.shape[2], blend_extent)
        for x in range(blend_extent):
            b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (
                x / blend_extent
            )
        return b

    def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
        r"""Encode a batch of images using a tiled encoder.

        Args:
            x (`torch.Tensor`): Input batch of videos.

        Returns:
            `torch.Tensor`:
                The latent representation of the encoded videos.
        """
        _, _, _, height, width = x.shape
        overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor))
        overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor))
        blend_height = int(self.tile_latent_min_height * self.tile_overlap_factor)
        blend_width = int(self.tile_latent_min_width * self.tile_overlap_factor)
        row_limit_height = self.tile_latent_min_height - blend_height
        row_limit_width = self.tile_latent_min_width - blend_width

        # Split the video into overlapping spatial tiles and encode them separately.
        rows = []
        for i in range(0, height, overlap_height):
            row = []
            for j in range(0, width, overlap_width):
                tile = x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
                tile = self.encoder(tile)
                row.append(tile)
            rows.append(row)

        # Blend neighbouring tiles over their overlap and stitch them back together.
        result_rows = []
        for i, row in enumerate(rows):
            result_row = []
            for j, tile in enumerate(row):
                if i > 0:
                    tile = self.blend_v(rows[i - 1][j], tile, blend_height)
                if j > 0:
                    tile = self.blend_h(row[j - 1], tile, blend_width)
                result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
            result_rows.append(torch.cat(result_row, dim=4))

        moments = torch.cat(result_rows, dim=3)
        return moments

    def tiled_decode(self, z: torch.Tensor) -> torch.Tensor:
        r"""
        Decode a batch of images using a tiled decoder.

        Args:
            z (`torch.Tensor`): Input batch of latent vectors.

        Returns:
            `torch.Tensor`:
                The decoded videos.
        """
        _, _, _, height, width = z.shape
        overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor))
        overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor))
        blend_height = int(self.tile_sample_min_height * self.tile_overlap_factor)
        blend_width = int(self.tile_sample_min_width * self.tile_overlap_factor)
        row_limit_height = self.tile_sample_min_height - blend_height
        row_limit_width = self.tile_sample_min_width - blend_width

        # Split the latent into overlapping spatial tiles and decode them separately.
        rows = []
        for i in range(0, height, overlap_height):
            row = []
            for j in range(0, width, overlap_width):
                tile = z[:, :, :, i : i + self.tile_latent_min_height, j : j + self.tile_latent_min_width]
                decoded = self.decoder(tile)
                row.append(decoded)
            rows.append(row)

        # Blend neighbouring tiles over their overlap and stitch them back together.
        result_rows = []
        for i, row in enumerate(rows):
            result_row = []
            for j, tile in enumerate(row):
                if i > 0:
                    tile = self.blend_v(rows[i - 1][j], tile, blend_height)
                if j > 0:
                    tile = self.blend_h(row[j - 1], tile, blend_width)
                result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
            result_rows.append(torch.cat(result_row, dim=4))

        dec = torch.cat(result_rows, dim=3)
        return dec

    def forward(
        self,
        sample: torch.Tensor,
        sample_posterior: bool = False,
        return_dict: bool = True,
        generator: Optional[torch.Generator] = None,
    ) -> Union[DecoderOutput, torch.Tensor]:
        r"""
        Args:
            sample (`torch.Tensor`): Input sample.
            sample_posterior (`bool`, *optional*, defaults to `False`):
                Whether to sample from the posterior.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
        """
        x = sample
        posterior = self.encode(x).latent_dist
        if sample_posterior:
            z = posterior.sample(generator=generator)
        else:
            z = posterior.mode()
        dec = self.decode(z, return_dict=return_dict)
        return dec