
    i&                     ^   d dl Z d dlmZmZmZ d dlZd dlmZ d dlmc m	Z
 ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ  ej8                  e      Z G d dej>                        Z  G d dejB                        Z" G d dejB                        Z# G d dejB                        Z$ G d dejB                        Z% G d dejB                        Z& G d dejB                        Z' G d dejB                        Z( G d dejB                        Z) G d d eee      Z*y)!    N)OptionalTupleUnion   )ConfigMixinregister_to_config)logging)apply_forward_hook   )get_activation)AutoencoderKLOutput)
ModelMixin   )AutoencoderMixinDecoderOutputDiagonalGaussianDistributionc                        e Zd Z	 	 	 	 	 	 	 ddededeeeedf   f   deeeedf   f   deeeedf   f   deeeedf   f   ded	ed
ef fdZd Z	de
j                  de
j                  f fdZ xZS )EasyAnimateCausalConv3din_channelsout_channelskernel_size.stridepaddingdilationgroupsbiaspadding_modec
                    t        |t              r|n|fdz  }t        |      dk(  sJ d| d       t        |t              r|n|fdz  }t        |      dk(  sJ d| d       t        |t              r|n|fdz  }t        |      dk(  sJ d| d       |\  }
}}|\  | _        }}|\  }}}|
dz
  |z  }|It	        j
                  |dz
  |z  d|z
  z   dz        }t	        j
                  |dz
  |z  d|z
  z   dz        }nt        |t              r|x}}nt        sJ || _        t	        j
                  |
dz
  |z  d|z
  z   dz        | _	        d | _
        t        | 1  |||||df|||			       y )
Nr   z#Kernel size must be a 3-tuple, got z	 instead.zStride must be a 3-tuple, got z Dilation must be a 3-tuple, got r   r   r   )	r   r   r   r   r   r   r   r   r   )
isinstancetuplelent_stridemathceilintNotImplementedErrortemporal_paddingtemporal_padding_originprev_featuressuper__init__)selfr   r   r   r   r   r   r   r   r   t_ksh_ksw_ksh_stridew_stride
t_dilation
h_dilation
w_dilationt_padh_padw_pad	__class__s                        }/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/diffusers/models/autoencoders/autoencoder_kl_magvit.pyr+   z EasyAnimateCausalConv3d.__init__$   s    &0U%Ck+Z[I[;1$b(KK=Xa&bb$%fe46)a-6{aS#A&!SS)(E:8a8}!Y%EhZy#YY! 'dD,2)x-5*
J
 Z' ?IIqJ6!h,G1LMEIIqJ6!h,G1LME%##EE&&& !&'+yy4!8z2IQQY\2Z^_1_'`$! 	#%#u%% 	 
	
    c                     | ` d | _         y Nr)   r,   s    r9   _clear_conv_cachez)EasyAnimateCausalConv3d._clear_conv_cache^       !r:   hidden_statesreturnc           	      "   |j                   }| j                  t        j                  |dddd| j                  dfd      }|j                  |      }| j                          |d d d d | j                   d f   j                         | _        |j                  d      }g }d}|| j                  z   dz   |k  rat        | )  |d d d d ||| j                  z   dz   f         }|| j                  z  }|j                  |       || j                  z   dz   |k  rat        j                  |d      S | j                  dk(  r>t        j                  | j                  d d d d | j                  dz
   d f   |gd      }n#t        j                  | j                  |gd      }|j                  |      }| j                          |d d d d | j                   d f   j                         | _        |j                  d      }g }d}|| j                  z   dz   |k  rat        | )  |d d d d ||| j                  z   dz   f         }|| j                  z  }|j                  |       || j                  z   dz   |k  rat        j                  |d      S )Nr   	replicate)padmode)dtyper   r   dim)rG   r)   FrE   r'   tor?   clonesizer*   forwardr"   appendtorchconcat)r,   rA   rG   
num_framesoutputsioutr8   s          r9   rN   zEasyAnimateCausalConv3d.forwardb   s   ##%EE1a!6!6: M
 *,,5,9M ""$!.q!d6K6K5K5M/M!N!T!T!VD '++A.JGAd+++a/:=gomAq!a$BWBW>WZ[>[:[4[&\]T]]"s# d+++a/:= <<++ }}! %''10E0E0I.J.L(LM}]cd! !&d.@.@--PVW X),,5,9M ""$!.q!d6K6K5K5M/M!N!T!T!VD '++A.JGAd+++a/:=gomAq!a$BWBW>WZ[>[:[4[&\]T]]"s# d+++a/:= <<++r:   )r   r   r   r   r   Tzeros)__name__
__module____qualname__r%   r   r   boolstrr+   r?   rP   TensorrN   __classcell__r8   s   @r9   r   r   #   s    
 45.//001#8
8
 8
 3c3h/0	8

 c5c?*+8
 sE#s(O+,8
 U38_,-8
 8
 8
 8
t"/,U\\ /,ell /, /,r:   r   c                        e Zd Z	 	 	 	 	 	 ddedededededededef fd	Zd
ej                  dej                  fdZ
 xZS )EasyAnimateResidualBlock3Dr   r   non_linearitynorm_num_groupsnorm_epsspatial_group_normdropoutoutput_scale_factorc	                    t         	|           || _        t        j                  |||d      | _        t        |      | _        t        ||d      | _	        t        j                  |||d      | _
        t        j                  |      | _        t        ||d      | _        ||k7  r%t        j                  ||d      | _        || _        y t        j                          | _        || _        y )NT)
num_groupsnum_channelsepsaffiner   r   r   )r*   r+   rf   nn	GroupNormnorm1r   nonlinearityr   conv1norm2Dropoutre   conv2Conv3dshortcutIdentityrd   )
r,   r   r   ra   rb   rc   rd   re   rf   r8   s
            r9   r+   z#EasyAnimateResidualBlock3D.__init__   s     	#6  \\&$	

 +=9,[,TUV
\\_<]enrs
zz'*,\<UVW
,&IIk<QODM #5 KKMDM"4r:   rA   rB   c                 "   | j                  |      }| j                  ro|j                  d      }|j                  ddddd      j	                  dd      }| j                  |      }|j                  d|df      j                  ddddd      }n| j                  |      }| j                  |      }| j                  |      }| j                  ro|j                  d      }|j                  ddddd      j	                  dd      }| j                  |      }|j                  d|df      j                  ddddd      }n| j                  |      }| j                  |      }| j                  |      }| j                  |      }||z   | j                  z  S Nr   r   r   r      )rv   rd   rM   permuteflattenro   	unflattenrp   rq   rr   re   rt   rf   )r,   rA   rv   
batch_sizes       r9   rN   z"EasyAnimateResidualBlock3D.forward   s   ==/""&++A.J)11!Q1a@HHANM JJ}5M)33A
B7GHPP1aAM !JJ}5M))-8

=1""&++A.J)11!Q1a@HHANM JJ}5M)33A
B7GHPP1aAM !JJ}5M))-8]3

=1(D,D,DDDr:   )silu    ư>T              ?rW   rX   rY   r%   r[   floatrZ   r+   rP   r\   rN   r]   r^   s   @r9   r`   r`      s    
 $!#'%("5"5 "5 	"5
 "5 "5 !"5 "5 #"5HEU\\ Eell Er:   r`   c            	       j     e Zd Zd	dedededef fdZdej                  dej                  fdZ xZ	S )
EasyAnimateDownsampler3Dr   r   r   r   c                 L    t         |           t        ||||d      | _        y )Nr   )r   r   r   r   r   )r*   r+   r   conv)r,   r   r   r   r   r8   s        r9   r+   z!EasyAnimateDownsampler3D.__init__   s'    +#,K`fpq
	r:   rA   rB   c                 T    t        j                  |d      }| j                  |      }|S )N)r   r   r   r   )rJ   rE   r   r,   rA   s     r9   rN   z EasyAnimateDownsampler3D.forward   s&    m\:		-0r:   )r   r   r   r   )
rW   rX   rY   r%   r    r+   rP   r\   rN   r]   r^   s   @r9   r   r      s@    
C 
s 
 
Z_ 
U\\ ell r:   r   c                   z     e Zd Z	 	 	 ddededededef
 fdZd Zdej                  d	ej                  fd
Z	 xZ
S )EasyAnimateUpsampler3Dr   r   r   temporal_upsamplerd   c                 ~    t         |           |xs |}|| _        || _        t	        |||      | _        d | _        y )N)r   r   r   )r*   r+   r   rd   r   r   r)   )r,   r   r   r   r   rd   r8   s         r9   r+   zEasyAnimateUpsampler3D.__init__   sG     	#2{!2"4+#,K
	 "r:   c                     | ` d | _         y r<   r=   r>   s    r9   r?   z(EasyAnimateUpsampler3D._clear_conv_cache   r@   r:   rA   rB   c                     t        j                  |dd      }| j                  |      }| j                  r;| j                  	|| _        |S t        j                  |d| j
                  sdnd      }|S )Nr   r   r   nearest)scale_factorrF   )r   r   r   	trilinear)rJ   interpolater   r   r)   rd   r   s     r9   rN   zEasyAnimateUpsampler3D.forward   ss    m)R[\		-0!!!!)%2"  !"!!*,0,C,C!
 r:   )r   FT)rW   rX   rY   r%   rZ   r+   r?   rP   r\   rN   r]   r^   s   @r9   r   r      sc    
 "'#'"" " 	"
  " !"&"U\\ ell r:   r   c                        e Zd Z	 	 	 	 	 	 	 	 	 ddedededededededed	ed
edef fdZdej                  dej                  fdZ
 xZS )EasyAnimateDownBlock3Dr   r   
num_layersact_fnrb   rc   rd   re   rf   add_downsampleadd_temporal_downsamplec                    t         |           t        j                  g       | _        t        |      D ]7  }|dk(  r|n|}| j                  j                  t        ||||||||	             9 |
r%|r#t        ||dd      | _	        d| _
        d| _        y |
r%|s#t        ||dd      | _	        d| _
        d| _        y d | _	        d| _
        d| _        y )	Nr   r   r   ra   rb   rc   rd   re   rf   r   r   )r   r   r   r   r   )r*   r+   rm   
ModuleListconvsrangerO   r`   r   downsamplerspatial_downsample_factortemporal_downsample_factor)r,   r   r   r   r   rb   rc   rd   re   rf   r   r   rT   r8   s                r9   r+   zEasyAnimateDownBlock3D.__init__  s     	]]2&
z" 	A)*a+\KJJ* +!-"($3%'9#(;		 57l`ajstD-.D*./D+$;7l`ajstD-.D*./D+#D-.D*./D+r:   rA   rB   c                 r    | j                   D ]
  } ||      } | j                  | j                  |      }|S r<   )r   r   r,   rA   r   s      r9   rN   zEasyAnimateDownBlock3D.forward>  sA    JJ 	0D /M	0' ,,];Mr:   )	r   r   r   r   Tr   r   TTr   r^   s   @r9   r   r     s    
 !#'%(#(,+0+0 +0 	+0
 +0 +0 +0 !+0 +0 #+0 +0 "&+0ZU\\ ell r:   r   c                        e Zd Z	 	 	 	 	 	 	 	 	 ddedededededededed	ed
edef fdZdej                  dej                  fdZ
 xZS )EasyAnimateUpBlock3dr   r   r   r   rb   rc   rd   re   rf   add_upsampleadd_temporal_upsamplec                    t         |           t        j                  g       | _        t        |      D ]7  }|dk(  r|n|}| j                  j                  t        ||||||||	             9 |
rt        ||||      | _	        y d | _	        y )Nr   r   )r   rd   )
r*   r+   rm   r   r   r   rO   r`   r   	upsampler)r,   r   r   r   r   rb   rc   rd   re   rf   r   r   rT   r8   s                r9   r+   zEasyAnimateUpBlock3d.__init__G  s     	]]2&
z" 	A)*a+\KJJ* +!-"($3%'9#(;		 3"7#5	DN "DNr:   rA   rB   c                 r    | j                   D ]
  } ||      } | j                  | j                  |      }|S r<   )r   r   r   s      r9   rN   zEasyAnimateUpBlock3d.forwardq  s=    JJ 	0D /M	0>>% NN=9Mr:   )	r   r   r   r   Fr   r   TTr   r^   s   @r9   r   r   F  s    
 !#(%(!&*("(" (" 	("
 (" (" (" !(" (" #(" ("  $("TU\\ ell r:   r   c                        e Zd Z	 	 	 	 	 	 	 ddedededededededef fd	Zd
ej                  dej                  fdZ
 xZS )EasyAnimateMidBlock3dr   r   r   rb   rc   rd   re   rf   c	                    t         
|           ||nt        |dz  d      }t        j                  t        ||||||||      g      | _        t        |dz
        D ].  }	| j                  j                  t        ||||||||             0 y )Nrz   r   r   r   )	r*   r+   minrm   r   r`   r   r   rO   )r,   r   r   r   rb   rc   rd   re   rf   _r8   s             r9   r+   zEasyAnimateMidBlock3d.__init__z  s     	-<-H/cR]abRbdfNg]]* +!,"($3%'9#(;	

 zA~& 	AJJ* +!,"($3%'9#(;		r:   rA   rB   c                 h     | j                   d   |      }| j                   dd  D ]
  } ||      } |S )Nr   r   )r   )r,   rA   resnets      r9   rN   zEasyAnimateMidBlock3d.forward  s?    %

1m4jjn 	2F"=1M	2r:   )r   r   r   r   Tr   r   r   r^   s   @r9   r   r   y  s     !#'%(** * 	*
 * * !* * #*XU\\ ell r:   r   c                        e Zd ZdZdZdddg dddd	dd
f	dededeedf   deedf   dededededef fdZ	de
j                  de
j                  fdZ xZS )EasyAnimateEncoderzp
    Causal encoder for 3D video-like data used in [EasyAnimate](https://huggingface.co/papers/2405.18991).
    Tr      SpatialDownBlock3DSpatialTemporalDownBlock3Dr   r            r   r   r   r   Fr   r   down_block_types.block_out_channelslayers_per_blockrb   r   double_zrd   c
                    t         |           t        ||d   d      | _        t	        j
                  g       | _        |d   }
t        |      D ]|  \  }}|
}||   }
|t        |      dz
  k(  }|dk(  rt        ||
|||d|	| d	      }n)|d	k(  rt        ||
|||d|	| d
	      }nt        d|       | j                  j                  |       ~ t        |d   |||	|ddd      | _        |	| _        t	        j                  |d   |d      | _        t#        |      | _        |rd|z  n|}t        |d   |d      | _        d| _        y )Nr   r   rl   r   r   r   F)	r   r   r   r   rb   rc   rd   r   r   r   TUnknown up block type: r{   )r   r   r   rd   rb   rc   re   rf   ri   rh   rj   r   )r*   r+   r   conv_inrm   r   down_blocks	enumerater!   r   
ValueErrorrO   r   	mid_blockrd   rn   conv_norm_outr   conv_actconv_outgradient_checkpointing)r,   r   r   r   r   r   rb   r   r   rd   output_channelsrT   down_block_typeinput_channelsis_final_block
down_blockconv_out_channelsr8   s                    r9   r+   zEasyAnimateEncoder.__init__  s   " 	 /{<Nq<Q_`a ==,,Q/"+,<"= 	0A,N03O#&8"9A"==N"663 .!0/!$3!'9'5#5,1

 !$@@3 .!0/!$3!'9'5#5,0

 !#:?:K!LMM##J/=	0B /*2.'1+ !	
 #5\\+B/&

 'v. 19A,l/0B20FHYghi&+#r:   rA   rB   c                 >   | j                  |      }| j                  D ]=  }t        j                         r| j                  r| j                  ||      }6 ||      }? | j                  |      }| j                  ro|j                  d      }|j                  ddddd      j                  dd      }| j                  |      }|j                  d|df      j                  ddddd      }n| j                  |      }| j                  |      }| j                  |      }|S ry   )r   r   rP   is_grad_enabledr   _gradient_checkpointing_funcr   rd   rM   r|   r}   r   r~   r   r   )r,   rA   r   r   s       r9   rN   zEasyAnimateEncoder.forward  s   ]3** 	:J$$&4+F+F $ A A*m \ *= 9		: }5""&++A.J)11!Q1a@HHANM ..}=M)33A
B7GHPPQRTUWXZ[]^_M ..}=Mm4m4r:   rW   rX   rY   __doc__ _supports_gradient_checkpointingr%   r   r[   rZ   r+   rP   r\   rN   r]   r^   s   @r9   r   r     s     (,$ -
 /C !!#(R,R, R,  S/	R, "#s(OR, R, R, R, R, !R,hU\\ ell r:   r   c                        e Zd ZdZdZdddg dddd	d
fdededeedf   deedf   dedededef fdZ	de
j                  de
j                  fdZ xZS )EasyAnimateDecoderzp
    Causal decoder for 3D video-like data used in [EasyAnimate](https://huggingface.co/papers/2405.18991).
    Tr   r   SpatialUpBlock3DSpatialTemporalUpBlock3Dr   r   r   r   r   r   Fr   r   up_block_types.r   r   rb   r   rd   c	                    t         |           t        ||d   d      | _        t	        |d   |||ddd      | _        t        j                  g       | _        t        t        |            }	|	d   }
t        |      D ]  \  }}|
}|	|   }
|t        |      dz
  k(  }|dk(  rt        ||
|dz   ||d|| d	
	      }n,|dk(  rt        ||
|dz   ||d|| d
	      }nt        d|       | j                  j                  |        || _        t        j"                  |d   |d      | _        t'        |      | _        t        |d   |d      | _        d	| _        y )Nr{   r   rl   r   r   r   )r   r   r   rb   rc   re   rf   r   F)	r   r   r   r   rb   rc   rd   r   r   r   Tr   r   )r*   r+   r   r   r   r   rm   r   	up_blockslistreversedr   r!   r   r   rO   rd   rn   r   r   r   r   r   )r,   r   r   r   r   r   rb   r   rd   reversed_block_out_channelsr   rT   up_block_typer   r   up_blockr8   s                   r9   r+   zEasyAnimateDecoder.__init__(  s     	 /{<Nr<R`ab /*2.'+ !
 r*&*84F+G&H#5a8 ). 9  	,A},N9!<O#&8"9A"==N  22/ .!0/!3!$3!'9%3!3*/
 "<</ .!0/!3!$3!'9%3!3*.
 !#:=/!JKKNN!!(+A 	,F #5\\+A.&

 'v. 00B10E|abc&+#r:   rA   rB   c                    | j                  |      }t        j                         r)| j                  r| j	                  | j
                  |      }n| j                  |      }| j                  D ]=  }t        j                         r| j                  r| j	                  ||      }6 ||      }? | j                  ro|j                  d      }|j                  ddddd      j                  dd      }| j                  |      }|j                  d|df      j                  ddddd      }n| j                  |      }| j                  |      }| j                  |      }|S ry   )r   rP   r   r   r   r   r   rd   rM   r|   r}   r   r~   r   r   )r,   rA   r   r   s       r9   rN   zEasyAnimateDecoder.forward|  s<   ]3  "t'B'B ==dnnm\M NN=9M 	8H$$&4+F+F $ A A(M Z ( 7		8 ""&++A.J)11!Q1a@HHANM ..}=M)33A
B7GHPP1aAM !..}=Mm4m4r:   r   r^   s   @r9   r   r   !  s     (,$ +
 /C !!#(R,R, R, c3h	R, "#s(OR, R, R, R, !R,hU\\ ell r:   r   c                       e Zd ZdZdZedddg dg dg ddd	d
ddfdedededeedf   deedf   deedf   dededede	de
f fd       Zd Z	 	 	 	 	 	 d5dee   dee   dee   dee	   dee	   d ee	   d!dfd"Ze	 d6d#ej"                  d$e
d!eeee   f   fd%       Ze	 d6d#ej"                  d$e
d!eeee   f   fd&       Zd6d'ej"                  d$e
d!eeej"                  f   fd(Zed6d'ej"                  d$e
d!eeej"                  f   fd)       Zd*ej"                  d+ej"                  d,ed!ej"                  fd-Zd*ej"                  d+ej"                  d,ed!ej"                  fd.Zd6d#ej"                  d$e
d!efd/Zd6d'ej"                  d$e
d!eeej"                  f   fd0Z	 	 	 d7d1ej"                  d2e
d$e
d3eej<                     d!eeej"                  f   f
d4Z xZ S )8AutoencoderKLMagvitaq  
    A VAE model with KL loss for encoding images into latents and decoding latent representations into images. This
    model is used in [EasyAnimate](https://huggingface.co/papers/2405.18991).

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).
    Tr      r   r   r   r   r   r   g?r   latent_channelsr   r   .r   r   r   r   rb   scaling_factorrd   c                    t         |           t        ||||||	|d|	      | _        t	        ||||||	||      | _        t        j                  d|z  d|z  d      | _        t        j                  ||d      | _	        dt        |      dz
  z  | _        dt        |      dz
  z  | _        d| _        d| _        d| _        d| _        d| _        d| _        d	| _        d	| _        d| _        d
| _        d
| _        d| _        y )NT)	r   r   r   r   r   rb   r   r   rd   )r   r   r   r   r   rb   r   rd   r   r   rl   Frz   r   i  r   )r*   r+   r   encoderr   decoderrm   ru   
quant_convpost_quant_convr!   spatial_compression_ratiotemporal_compression_ratiouse_slicing
use_tilinguse_framewise_encodinguse_framewise_decodingnum_sample_frames_batch_sizenum_latent_frames_batch_sizetile_sample_min_heighttile_sample_min_widthtile_sample_min_num_framestile_sample_stride_heighttile_sample_stride_widthtile_sample_stride_num_frames)r,   r   r   r   r   r   r   r   r   rb   r   rd   r8   s               r9   r+   zAutoencoderKLMagvit.__init__  s1   2 	 *#(-1-+1

 *'%)1-+1	
 ))A$7_9LZ[\!yy/WXY)*s3E/F/J)K&*+4F0G!0K*L' !
   ',#&+# -.),-) '*#%("*+' *-&(+%-.*r:   c                     | j                         D ]F  \  }}t        |t              r|j                          t        |t              s7|j                          H y r<   )named_modulesr   r   r?   r   )r,   namemodules      r9   r?   z%AutoencoderKLMagvit._clear_conv_cache  sL     ..0 	+LD&&"9:((*&"89((*		+r:   Nr   r   r   r   r  r  rB   c                 *   d| _         d| _        d| _        |xs | j                  | _        |xs | j                  | _        |xs | j
                  | _        |xs | j                  | _        |xs | j                  | _        |xs | j                  | _        y)aX  
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.

        Args:
            tile_sample_min_height (`int`, *optional*):
                The minimum height required for a sample to be separated into tiles across the height dimension.
            tile_sample_min_width (`int`, *optional*):
                The minimum width required for a sample to be separated into tiles across the width dimension.
            tile_sample_stride_height (`int`, *optional*):
                The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
                no tiling artifacts produced across the height dimension.
            tile_sample_stride_width (`int`, *optional*):
                The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
                artifacts produced across the width dimension.
        TN)	r   r   r   r   r   r   r   r  r  )r,   r   r   r   r   r  r  s          r9   enable_tilingz!AutoencoderKLMagvit.enable_tiling  s    4 &*#&*#&<&[@[@[#%:%Xd>X>X"*D*gHgHg')B)ddFdFd&(@(aDDaDa%-J-pdNpNp*r:   xreturn_dictc           
      D   | j                   rK|j                  d   | j                  kD  s|j                  d   | j                  kD  r| j	                  ||      S | j                  |ddddddddddf         }|g}t        d|j                  d   | j                        D ]C  }| j                  |dddd||| j                  z   ddddf         }|j                  |       E t        j                  |d      }| j                  |      }| j                          |S )a  
        Encode a batch of images into latents.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded images. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        r{   r
  Nr   r   rH   )r   shaper   r   tiled_encoder   r   r   rO   rP   catr   r?   )r,   r	  r
  first_frameshrT   next_framesmomentss           r9   _encodezAutoencoderKLMagvit._encode(  s     ??d.I.I IQWWUW[[_[u[uMu$$QK$@@||AaBQB1n$56Nq!''!*d&G&GH 	"A,,qAq1t7X7X3X/XZ[]^)^'_`KHH[!	" IIaQ//!$ r:   c                 (   | j                   rU|j                  d   dkD  rC|j                  d      D cg c]  }| j                  |       }}t	        j
                  |      }n| j                  |      }t        |      }|s|fS t        |      S c c}w )a  
        Encode a batch of images into latents.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded videos. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        r   r   )latent_dist)r   r  splitr  rP   r  r   r   )r,   r	  r
  x_sliceencoded_slicesr  	posteriors          r9   encodezAutoencoderKLMagvit.encodeF  s      
QCD771:Ndll73NNN		.)AQA03	<"y99 Os   Bzc           
         |j                   \  }}}}}| j                  | j                  z  }| j                  | j                  z  }	| j                  r7|j                   d   |kD  s|j                   d   |	kD  r| j                  ||      S | j                  |      }| j                  |d d d d d dd d d d f         }
|
g}t        d|j                   d   | j                        D ]C  }| j                  |d d d d ||| j                  z   d d d d f         }|j                  |       E t        j                  |d      }|s|fS t        |      S )Nr{   r  r  r   r   rH   sample)r  r   r   r   r   tiled_decoder   r   r   r   rO   rP   r  r   )r,   r  r
  r   ri   rR   heightwidthtile_latent_min_heighttile_latent_min_widthr  decrT   r  s                 r9   _decodezAutoencoderKLMagvit._decodeb  sE   >?gg;
L*fe!%!<!<@^@^!^ $ : :d>\>\ \??.D DPRVkHk$$QK$@@  # ||AaBQB1n$56nq!''!*d&G&GH 	$A,,qAq1t7X7X3X/XZ[]^)^'_`KJJ{#	$ ii#6MC((r:   c                 Z   | j                   r_|j                  d   dkD  rM|j                  d      D cg c]  }| j                  |      j                   }}t        j                  |      }n| j                  |      j                  }| j                          |s|fS t        |      S c c}w )a  
        Decode a batch of images.

        Args:
            z (`torch.Tensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        r   r   r  )	r   r  r  r'  r   rP   r  r?   r   )r,   r  r
  z_slicedecoded_slicesdecodeds         r9   decodezAutoencoderKLMagvit.decode|  s     
QJK''RS*Uwdll73::UNUii/Gll1o,,G :G,, Vs   "B(abblend_extentc           	         t        |j                  d   |j                  d   |      }t        |      D ]J  }|d d d d d d | |z   d d f   d||z  z
  z  |d d d d d d |d d f   ||z  z  z   |d d d d d d |d d f<   L |S )Nr   r   r   r  r   )r,   r-  r.  r/  ys        r9   blend_vzAutoencoderKLMagvit.blend_v  s    1771:qwwqz<@|$ 	A Aq<-!*;Q!>?1q<GWCWX[\]^`acdfgij]j[kL \  AaAq!m	 r:   c                    t        |j                  d   |j                  d   |      }t        |      D ]J  }|d d d d d d d d | |z   f   d||z  z
  z  |d d d d d d d d |f   ||z  z  z   |d d d d d d d d |f<   L |S )Nrz   r   r1  )r,   r-  r.  r/  r	  s        r9   blend_hzAutoencoderKLMagvit.blend_h  s    1771:qwwqz<@|$ 	A Aq!l]Q->!>?1q<GWCWX[\]^`acdfgij]j[kL \  AaAq!m	 r:   c                    |j                   \  }}}}}|| j                  z  }|| j                  z  }	| j                  | j                  z  }
| j                  | j                  z  }| j                  | j                  z  }| j
                  | j                  z  }|
|z
  }||z
  }g }t        d|| j                        D ],  }g }t        d|| j
                        D ]  }|d d d d d d ||| j                  z   ||| j                  z   f   }| j                  |d d d d ddd d d d f         }|g}t        d|| j                        D ]C  }| j                  |d d d d ||| j                  z   d d d d f         }|j                  |       E t        j                  |d      }| j                  |      }| j                          |j                  |        |j                  |       / g }t        |      D ]  \  }}g }t        |      D ]g  \  }}|dkD  r| j                  ||dz
     |   ||      }|dkD  r| j!                  ||dz
     ||      }|j                  |d d d d d d d |d |	f          i |j                  t        j                  |d              t        j                  |d      d d d d d d d |d |	f   }|S )Nr   r   r   rH   rz   r   )r  r   r   r   r   r  r   r   r   rO   rP   r  r   r?   r   r3  r5  )r,   r	  r
  r   ri   rR   r"  r#  latent_heightlatent_widthr$  r%  tile_latent_stride_heighttile_latent_stride_widthblend_heightblend_widthrowsrT   rowjtiler  tile_hkr  result_rows
result_rowr  s                               r9   r  z AutoencoderKLMagvit.tiled_encode  s   >?gg;
L*fe$"@"@@ > >>!%!<!<@^@^!^ $ : :d>\>\ \$($B$BdFdFd$d!#'#@#@DDbDb#b -0II+.FF q&$"@"@A 	AC1eT%B%BC !D7777D6666	8  $||DAqsAq,AB&q*d.O.OP /A"&,,tAq!a$BcBc>c:cefhi4i/j"kKMM+./ yyQ/t,&&(

4 #!$ KK)	* o 
	=FAsJ$S> P4 q5<<QUAlKDq5<<AE
D+FD!!$q!Q'M"NOP uyy;<
	= ))KQ/1a-,0VWr:   c                    |j                   \  }}}}}|| j                  z  }|| j                  z  }	| j                  | j                  z  }
| j                  | j                  z  }| j                  | j                  z  }| j
                  | j                  z  }| j                  | j                  z
  }| j                  | j
                  z
  }g }t        d||      D ]  }g }t        d||      D ]  }|d d d d d d |||
z   |||z   f   }| j                  |      }| j                  |d d d d d dd d d d f         }|g}t        d|| j                        D ]C  }| j                  |d d d d ||| j                  z   d d d d f         }|j                  |       E t        j                  |d      }| j                          |j                  |        |j                  |        g }t        |      D ]  \  }}g }t        |      D ]{  \  }}|dkD  r| j                  ||dz
     |   ||      }|dkD  r| j!                  ||dz
     ||      }|j                  |d d d d d d d | j                  d | j
                  f          } |j                  t        j                  |d              t        j                  |d      d d d d d d d |d |	f   }|s|fS t#        |      S )Nr   r   r   rH   rz   r   r  )r  r   r   r   r   r  r   r   r   r   rO   rP   r  r?   r   r3  r5  r   )r,   r  r
  r   ri   rR   r"  r#  sample_heightsample_widthr$  r%  r9  r:  r;  r<  r=  rT   r>  r?  r@  r  tile_decrB  r  r+  rC  rD  r&  s                                r9   r!  z AutoencoderKLMagvit.tiled_decode  s   >?gg;
L*fe!?!??t===!%!<!<@^@^!^ $ : :d>\>\ \$($B$BdFdFd$d!#'#@#@DDbDb#b 22T5S5SS0043P3PP q&";< 	AC1e%=> $222111	3 ++D1  $||DArr1a,@A(>q*d.O.OP 1A"&,,tAq!a$BcBc>c:cefhi4i/j"kKOOK01  ))H!4&&(

7#+$, KK1	2 o 
	=FAsJ$S> t4 q5<<QUAlKDq5<<AE
D+FD!!$q!Q0P$2P2P0PRqTXTqTqRq'q"rst uyy;<
	= ii+Aq!^m^]l],RS6MC((r:   r   sample_posterior	generatorc                     |}| j                  |      j                  }|r|j                  |      }n|j                         }| j	                  |      j                  }|s|fS t        |      S )aa  
        Args:
            sample (`torch.Tensor`): Input sample.
            sample_posterior (`bool`, *optional*, defaults to `False`):
                Whether to sample from the posterior.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
        )rJ  r  )r  r  r   rF   r,  r   )	r,   r   rI  r
  rJ  r	  r  r  r&  s	            r9   rN   zAutoencoderKLMagvit.forward  sf     KKN..	  9 5A Akk!n##6MC((r:   )NNNNNN)T)FTN)!rW   rX   rY   r   r   r   r%   r   r[   r   rZ   r+   r?   r   r  r
   rP   r\   r   r   r   r  r  r   r'  r,  r3  r5  r  r!  	GeneratorrN   r]   r^   s   @r9   r   r     sZ    (,$ !.B-
+
 !"! &#'-T/T/ T/ 	T/
 "#s(OT/  S/T/ c3hT/$ %T/& 'T/( )T/* +T/, !-T/ T/l+ 15/34859489="q ("q  (}"q %-SM	"q
 $,E?"q #+5/"q (0"q 
"qH 37,0	"E*F$GG	H : 37::,0:	"E*F$GG	H: :6) )D )E-Y^YeYeJeDf )4 - -4 -5X]XdXdIdCe - -2 %,, c ell  %,, c ell 2ell 2 2I\ 2h;)ell ;) ;)}^c^j^jOjIk ;)@ "' /3)) ) 	)
 EOO,) 
}ell*	+)r:   r   )+r#   typingr   r   r   rP   torch.nnrm   torch.nn.functional
functionalrJ   configuration_utilsr   r   utilsr	   utils.accelerate_utilsr
   activationsr   modeling_outputsr   modeling_utilsr   vaer   r   r   
get_loggerrW   loggerru   r   Moduler`   r   r   r   r   r   r   r   r    r:   r9   <module>r\     s      ) )     B  8 ( 2 ' N N 
		H	%n,bii n,bCE CELryy %RYY %P3RYY 3l0299 0f1BII 1hq qhv vrW)*&6 W)r:   