
    i                        d dl Z d dlmZmZmZ d dlZd dlmZ ddlm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ  G d dej>                        Z  G d dej>                        Z! G d dej>                        Z" G d dej>                        Z# G d dej>                        Z$ G d dej>                        Z% G d deee	      Z&d Z'y)    N)OptionalTupleUnion   )ConfigMixinregister_to_config)apply_forward_hook   )	AttentionSpatialNorm)DecoderOutputDiagonalGaussianDistribution)Downsample2D)AutoencoderKLOutput)
ModelMixin)ResnetBlock2D)
Upsample2D   )AutoencoderMixinc                        e Zd ZdZ	 	 	 	 	 	 ddedee   dedededed	ed
df fdZe	de
j                  d
e
j                  fd       Zde
j                  ded
e
j                  fdZ xZS )AllegroTemporalConvLayera
  
    Temporal convolutional layer that can be used for video (sequence of images) input. Code adapted from:
    https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016
    Nin_dimout_dimdropoutnorm_num_groups	up_sampledown_samplestridereturnc                 L   t         |           |xs |}t        |dz
  dz        x}}	d}
|| _        || _        |rat        j                  t        j                  ||      t        j                         t        j                  ||d||fdd||	f            | _
        n|rct        j                  t        j                  ||      t        j                         t        j                  ||dz  d||fd||	f            | _
        n_t        j                  t        j                  ||      t        j                         t        j                  ||d||f|
||	f            | _
        t        j                  t        j                  ||      t        j                         t        j                  |      t        j                  ||d||f|
||	f            | _        t        j                  t        j                  ||      t        j                         t        j                  |      t        j                  ||d||f|
||f            | _        t        j                  t        j                  ||      t        j                         t        j                  ||d||f|
||f            | _        y )	Nr   g      ?r   r
   )r
   r   r   )r   paddingr!   r   )super__init__intr   r   nn
Sequential	GroupNormSiLUConv3dconv1Dropoutconv2conv3conv4)selfr   r   r   r   r   r   r   pad_hpad_wpad_t	__class__s              ~/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/diffusers/models/autoencoders/autoencoder_kl_allegro.pyr$   z!AllegroTemporalConvLayer.__init__(   s(    	#VVaZ3.//&"_f5			&'Avv+>y[\^cejZklDJ
 _f5			&'A+66/BQPUW\L]^DJ _f5			&'Avv+>PUW\H]^DJ
 ]]LL'2GGIJJwIIgv66':UESXDYZ	

 ]]LL'2GGIJJwIIgv66':UESXDYZ	

 ]]LL'2GGIIIgv66':UESXDYZ

    hidden_statesc                     t        j                  | d d d d ddf   | fd      } t        j                  | | d d d d dd f   fd      } | S )Nr   r   r
   )dim)torchcat)r7   s    r5   _pad_temporal_dimz*AllegroTemporalConvLayer._pad_temporal_dim_   sP    		=Aqs#;]"KQRS		=-1bc	2J"KQRSr6   
batch_sizec                 .   |j                  d|df      j                  ddddd      }| j                  r|d d d d d d df   }n3| j                  r%|j	                  dd|j
                  d   dz        }n|}| j                  s| j                  r| j                  |      }n"| j                  |      }| j                  |      }| j                  r6|j                  dd      j                  dddddd	      j                  dd      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }||z   }|j                  ddddd      j                  dd      }|S )
Nr   r:   r
   r   r      )r9   output_size)r
   r:      )	unflattenpermuter   r   repeat_interleaveshaper+   r=   flattenr-   r.   r/   )r0   r7   r>   identitys       r5   forwardz AllegroTemporalConvLayer.forwarde   s   %//J3CDLLQPQSTVWYZ[$Q3Q3Y/H^^$66qa]M`M`abMcfgMg6hH$Ht~~ JJ}5M 22=AM JJ}5M>>)33Aw?GG1aQRTUWXYaabcefgM..}=

=1..}=

=1..}=

=1 =0%--aAq!<DDQJr6   )N            FFr   )__name__
__module____qualname____doc__r%   r   floatboolr$   staticmethodr;   Tensorr=   rI   __classcell__r4   s   @r5   r   r   "   s     "&!!5
5
 #5
 	5

 5
 5
 5
 5
 
5
n  %,,  
U\\ s u|| r6   r   c                        e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddedededededededed	ed
edededef fdZdej                  dej                  fdZ
 xZS )AllegroDownBlock3Din_channelsout_channelsr   
num_layers
resnet_epsresnet_time_scale_shiftresnet_act_fnresnet_groupsresnet_pre_normoutput_scale_factorspatial_downsampletemporal_downsampledownsample_paddingc                    t         |           g }g }t        |      D ]M  }|dk(  r|n|}|j                  t	        ||d ||||||
|	
             |j                  t        ||d|             O t        j                  |      | _        t        j                  |      | _	        |rt        ||d|dd      | _
        || _        |r*t        j                  t        |d||d	      g      | _        y d | _        y )
Nr   
rX   rY   temb_channelsepsgroupsr   time_embedding_normnon_linearityr`   pre_norm皙?r   r   Tr   )r   r   r   r   op)use_convrY   r!   name)r#   r$   rangeappendr   r   r&   
ModuleListresnets
temp_convstemp_convs_downadd_temp_downsampler   downsamplers)r0   rX   rY   r   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rt   ru   ir4   s                    r5   r$   zAllegroDownBlock3D.__init__   s     	
z" 	A)*a+\KNN +!-"&"(#(?"/(;, (  $1	!	2 }}W---
3#;lCdhqr$D  $7  " $t,Xjqu!D !%Dr6   r7   r   c                    |j                   d   }|j                  ddddd      j                  dd      }t        | j                  | j
                        D ]  \  }} ||d       } |||      } | j                  r| j                  ||      }| j                  | j                  D ]
  } ||      } |j                  d|df      j                  ddddd      }|S 	Nr   r
   r   r   r@   )temb)r>   r:   )
rF   rD   rG   ziprt   ru   rw   rv   rx   rC   )r0   r7   r>   resnet	temp_convdownsamplers         r5   rI   zAllegroDownBlock3D.forward   s    "((+
%--aAq!<DDQJ!$T\\4??!C 	LFI"=t<M%m
KM	L ## 00:0VM(#00 ; +M :; &//J3CDLLQPQSTVWYZ[r6   )rJ   r   ư>defaultswishrK   T      ?TFr   rL   rM   rN   r%   rP   strrQ   r$   r;   rS   rI   rT   rU   s   @r5   rW   rW      s    
  '0$ $%(#'$)"#@%@% @% 	@%
 @% @% "%@% @% @% @% #@% !@% "@%  @%DU\\ ell r6   rW   c                        e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddedededededededed	ed
edededee   f fdZde	j                  de	j                  fdZ xZS )AllegroUpBlock3DrX   rY   r   rZ   r[   r\   r]   r^   r_   r`   spatial_upsampletemporal_upsamplerf   c                    t         |           g }g }t        |      D ]M  }|dk(  r|n|}|j                  t	        |||||||||
|	
             |j                  t        ||d|             O t        j                  |      | _        t        j                  |      | _	        || _
        |rt        ||d|dd      | _        |r(t        j                  t        |d|      g      | _        y d | _        y )	Nr   re   rl   rm   Tr   )r   r   r   r   )ro   rY   )r#   r$   rq   rr   r   r   r&   rs   rt   ru   add_temp_upsampletemp_conv_upr   
upsamplers)r0   rX   rY   r   rZ   r[   r\   r]   r^   r_   r`   r   r   rf   rt   ru   ry   input_channelsr4   s                     r5   r$   zAllegroUpBlock3D.__init__   s     	
z" 	A,-F[NNN .!-"/"(#(?"/(;, (  $1	#	4 }}W---
3!2 8lCbfop!D  mmZtbn-o,pqDO"DOr6   r7   r   c                    |j                   d   }|j                  ddddd      j                  dd      }t        | j                  | j
                        D ]  \  }} ||d       } |||      } | j                  r| j                  ||      }| j                  | j                  D ]
  } ||      } |j                  d|df      j                  ddddd      }|S r{   )
rF   rD   rG   r}   rt   ru   r   r   r   rC   )r0   r7   r>   r~   r   	upsamplers         r5   rI   zAllegroUpBlock3D.forward  s    "((+
%--aAq!<DDQJ!$T\\4??!C 	LFI"=t<M%m
KM	L !! --m
-SM??&!__ 9	 )- 89 &//J3CDLLQPQSTVWYZ[r6   )rJ   r   r   r   r   rK   Tr   TFN)rL   rM   rN   r%   rP   r   rQ   r   r$   r;   rS   rI   rT   rU   s   @r5   r   r      s    
  '0$ $%(!%"''+;#;# ;# 	;#
 ;# ;# "%;# ;# ;# ;# #;# ;#  ;#  };#zU\\ ell r6   r   c                        e Zd Z	 	 	 	 	 	 	 	 	 	 ddedededededededed	ed
ededef fdZdej                  dej                  fdZ
 xZS )AllegroMidBlock3DConvrX   rf   r   rZ   r[   r\   r]   r^   r_   add_attentionattention_head_dimr`   c                 N   t         |           t        ||||||||||	
      g}t        ||d|      g}g }||}t	        |      D ]  }|
r7|j                  t        |||z  ||||dk(  r|nd |dk(  r|nd dddd             n|j                  d        |j                  t        ||||||||||	
             |j                  t        ||d|              t        j                  |      | _	        t        j                  |      | _
        t        j                  |      | _        y )Nre   rl   rm   r   spatialT)
headsdim_headrescale_output_factorrg   r   spatial_norm_dimresidual_connectionbiasupcast_softmax_from_deprecated_attn_block)r#   r$   r   r   rq   rr   r   r&   rs   rt   ru   
attentions)r0   rX   rf   r   rZ   r[   r\   r]   r^   r_   r   r   r`   rt   ru   r   _r4   s                    r5   r$   zAllegroMidBlock3DConv.__init__1  ss    	 '(+$$;+$7(
 % -	

 
%!,z" *	A!!#)-??!3.A&9PT]9]cg:QU^:^dh,0!'+48  !!$'NN +!,"/"(#(?"/(;, ($1	G*	X }}W---
3--
3r6   r7   r   c                    |j                   d   }|j                  ddddd      j                  dd      } | j                  d   |d       } | j                  d   ||      }t        | j                  | j                  dd  | j                  dd        D ]"  \  }}} ||      } ||d       } |||      }$ |j                  d|df      j                  ddddd      }|S r{   )rF   rD   rG   rt   ru   r}   r   rC   )r0   r7   r>   attnr~   r   s         r5   rI   zAllegroMidBlock3DConv.forward  s    "((+
%--aAq!<DDQJ'QDA**=ZP'*4??DLL<Ldoo^_^`Na'b 	L#D&) /M"=t<M%m
KM	L
 &//J3CDLLQPQSTVWYZ[r6   )
rJ   r   r   r   r   rK   TTr   r   r   rU   s   @r5   r   r   0  s    
  '0$ $""#%([4[4 [4 	[4
 [4 [4 "%[4 [4 [4 [4 [4  [4 #[4zU\\ ell r6   r   c                        e Zd Zddddg dddddf	d	ed
edeedf   deedf   deedf   dedededef fdZdej                  dej                  fdZ
 xZS )AllegroEncoder3Dr   rW   rW   rW   rW            r   TTFFr
   rK   siluTrX   rY   down_block_types.block_out_channelstemporal_downsample_blockslayers_per_blockr   act_fndouble_zc
                 "   t         |           t        j                  ||d   ddd      | _        t        j
                  |d   |d   dd      | _        t        j                  g       | _        |d   }
t        |      D ]a  \  }}|
}||   }
|t        |      dz
  k(  }|dk(  rt        |||
| ||   d	d||
	      }nt        d      | j                  j                  |       c t        |d   d	|dd|d   |d       | _        t        j                   |d   |d	      | _        t        j$                         | _        |	rd|z  n|}t        j
                  |d   |d   dd      | _        t        j                  |d   |dd      | _        d| _        y )Nr   r   r   kernel_sizer   r!   r   r   r   r   r   r   )rX   rY   r   r!   rW   r   )	rZ   rX   rY   ra   rb   r[   rc   r]   r^   zCInvalid `down_block_type` encountered. Must be `AllegroDownBlock3D`r:   r   rX   r[   r]   r`   r\   r   r^   rf   num_channels
num_groupsrg   r
   r"   F)r#   r$   r&   Conv2dconv_inr*   temp_conv_inrs   down_blocks	enumeratelenrW   
ValueErrorrr   r   	mid_blockr(   conv_norm_outr)   conv_acttemp_conv_outconv_outgradient_checkpointing)r0   rX   rY   r   r   r   r   r   r   r   output_channelry   down_block_typeinput_channelis_final_block
down_blockconv_out_channelsr4   s                    r5   r$   zAllegroEncoder3D.__init__  s   " 	yyq!
 II*1-+A.!	
 ==, ,A."+,<"= 	0A*M/2N#&8"9A"==N"66// -!/+9'9(B1(E#'("("1

 !!fgg##J/)	0. /*2.  !$-1"5)	
  \\7I"7MZiost	08A,lYY'9"'=?QRT?UW`jst		"4R"8:KQXYZ&+#r6   sampler   c                    |j                   d   }|j                  ddddd      j                  dd      }| j                  |      }|j	                  d|df      j                  ddddd      }|}| j                  |      }||z   }t        j                         rL| j                  r@| j                  D ]  }| j                  ||      } | j                  | j                  |      }n*| j                  D ]
  } ||      } | j                  |      }|j                  ddddd      j                  dd      }| j                  |      }| j                  |      }|j	                  d|df      j                  ddddd      }|}| j                  |      }||z   }|j                  ddddd      j                  dd      }| j                  |      }|j	                  d|df      j                  ddddd      }|S Nr   r
   r   r   r@   r:   )rF   rD   rG   r   rC   r   r;   is_grad_enabledr   r   _gradient_checkpointing_funcr   r   r   r   r   )r0   r   r>   residualr   s        r5   rI   zAllegroEncoder3D.forward  s   \\!_
1aA.66q!<f%!!!j"%56>>q!Q1M""6*("  "t'B'B".. O
:::vNO 66t~~vNF #.. ,
#F+, ^^F+F 1aA.66q!<##F+v&!!!j"%56>>q!Q1M##F+("1aA.66q!<v&!!!j"%56>>q!Q1Mr6   rL   rM   rN   r%   r   r   rQ   r$   r;   rS   rI   rT   rU   s   @r5   r   r     s     -
 /C7Q !!Q,Q, Q,  S/	Q, "#s(OQ, %*$)$4Q, Q, Q, Q, Q,f(ell (u|| (r6   r   c                        e Zd Zdddg dddddd	f	d
ededeedf   deedf   deedf   dedededef fdZdej                  dej                  fdZ
 xZS )AllegroDecoder3Dr@   r   r   r   r   r   FTTFr   r
   rK   r   grouprX   rY   up_block_types.temporal_upsample_blocksr   r   r   r   	norm_typec
                    t         |           t        j                  ||d   ddd      | _        t        j
                  |d   |d   dd      | _        d | _        t        j                  g       | _	        |	dk(  r|nd }
t        |d   d	|d|	d
k(  rdn|	|d   ||
      | _        t        t        |            }|d   }t        |      D ]g  \  }}|}||   }|t        |      dz
  k(  }|dk(  rt        |dz   ||| ||   d	|||
|	
      }nt!        d      | j                  j#                  |       |}i |	dk(  rt%        |d   |
      | _        n t        j(                  |d   |d	      | _        t        j*                         | _        t        j
                  |d   |d   dd      | _        t        j                  |d   |dd      | _        d| _        y )Nr:   r   r   r   r   r   r"   r   r   r   r   r   r   r   )
rZ   rX   rY   r   r   r[   r]   r^   rf   r\   z?Invalid `UP_block_type` encountered. Must be `AllegroUpBlock3D`r   F)r#   r$   r&   r   r   r*   r   r   rs   	up_blocksr   listreversedr   r   r   r   rr   r   r   r(   r)   r   r   r   r   )r0   rX   rY   r   r   r   r   r   r   r   rf   reversed_block_out_channelsr   ry   up_block_typeprev_output_channelr   up_blockr4   s                     r5   r$   zAllegroDecoder3D.__init__  s   " 	yyr"
 II&8&<>PQS>TV_irsr*'0I'=4 /*2.  !1:g1EI91"5)'	
 '+84F+G&H#4Q7 ). 9 	1A}"08;N#&8"9A"==N 22+/!3 3!/)7%7&>q&A#"("1"/,5 !!bccNN!!(+"0/	14 	!!,-?-BM!RD!#;Ma;P]lrv!wD	YY'9!'<>PQR>SU^hqr		"4Q"7qRST&+#r6   r   r   c                 v   |j                   d   }|j                  ddddd      j                  dd      }| j                  |      }|j	                  d|df      j                  ddddd      }|}| j                  |      }||z   }t        t        | j                  j                                     j                  }t        j                         rL| j                  r@| j                  | j                  |      }| j                  D ]  }| j                  ||      } n;| j                  |      }|j!                  |      }| j                  D ]
  } ||      } |j                  ddddd      j                  dd      }| j#                  |      }| j%                  |      }|j	                  d|df      j                  ddddd      }|}| j'                  |      }||z   }|j                  ddddd      j                  dd      }| j)                  |      }|j	                  d|df      j                  ddddd      }|S r   )rF   rD   rG   r   rC   r   nextiterr   
parametersdtyper;   r   r   r   r   tor   r   r   r   )r0   r   r>   r   upscale_dtyper   s         r5   rI   zAllegroDecoder3D.forwardv  s'   \\!_
1aA.66q!<f%!!!j"%56>>q!Q1M""6*("T$..";";"=>?EE  "t'B'B66t~~vNF !NN M::8VLM
 ^^F+FYY}-F !NN *!&)* 1aA.66q!<##F+v&!!!j"%56>>q!Q1M##F+("1aA.66q!<v&!!!j"%56>>q!Q1Mr6   r   rU   s   @r5   r   r     s     +
 6P.B !! U,U, U, c3h	U, #(c	"2U, "#s(OU, U, U, U, U,n,ell ,u|| ,r6   r   c            "           e Zd ZdZdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#dededeedf   deedf   deedf   d	ee	df   d
ee	df   dedededede
dede
de	ddf  fd       Zdej                  dej                  fdZe	 d$dej                  de	deeee   f   fd       Zdej                  dej                  fdZed$dej                  de	deeej                  f   fd       Zdej                  dej                  fdZdej                  dej                  fdZ	 	 	 d%dej                  d e	de	d!eej4                     deeej                  f   f
d"Z xZS )&AutoencoderKLAllegroa!  
    A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos. Used in
    [Allegro](https://github.com/rhymes-ai/Allegro).

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        in_channels (int, defaults to `3`):
            Number of channels in the input image.
        out_channels (int, defaults to `3`):
            Number of channels in the output.
        down_block_types (`Tuple[str, ...]`, defaults to `("AllegroDownBlock3D", "AllegroDownBlock3D", "AllegroDownBlock3D", "AllegroDownBlock3D")`):
            Tuple of strings denoting which types of down blocks to use.
        up_block_types (`Tuple[str, ...]`, defaults to `("AllegroUpBlock3D", "AllegroUpBlock3D", "AllegroUpBlock3D", "AllegroUpBlock3D")`):
            Tuple of strings denoting which types of up blocks to use.
        block_out_channels (`Tuple[int, ...]`, defaults to `(128, 256, 512, 512)`):
            Tuple of integers denoting number of output channels in each block.
        temporal_downsample_blocks (`Tuple[bool, ...]`, defaults to `(True, True, False, False)`):
            Tuple of booleans denoting which blocks to enable temporal downsampling in.
        latent_channels (`int`, defaults to `4`):
            Number of channels in latents.
        layers_per_block (`int`, defaults to `2`):
            Number of resnet or attention or temporal convolution layers per down/up block.
        act_fn (`str`, defaults to `"silu"`):
            The activation function to use.
        norm_num_groups (`int`, defaults to `32`):
            Number of groups to use in normalization layers.
        temporal_compression_ratio (`int`, defaults to `4`):
            Ratio by which temporal dimension of samples are compressed.
        sample_size (`int`, defaults to `320`):
            Default latent size.
        scaling_factor (`float`, defaults to `0.13235`):
            The component-wise standard deviation of the trained latent space computed using the first batch of the
            training set. This is used to scale the latent space to have unit variance when training the diffusion
            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
            Synthesis with Latent Diffusion Models](https://huggingface.co/papers/2112.10752) paper.
        force_upcast (`bool`, default to `True`):
            If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
            can be fine-tuned / trained to a lower range without losing too much precision in which case `force_upcast`
            can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
    TrX   rY   r   .r   r   r   r   latent_channelsr   r   r   temporal_compression_ratiosample_sizescaling_factorforce_upcastr   Nc                    t         |           t        ||||||	|
|d	      | _        t	        ||||||	||
      | _        t        j                  d|z  d|z  d      | _        t        j                  ||d      | _	        d| _
        d| _        dt        |      dz
  z  | _        d| _        d| _        d	| _        d
}|||f| _        || j                  z
  || j                  z
  || j                   z
  f| _        y )NT)	rX   rY   r   r   r   r   r   r   r   )rX   rY   r   r   r   r   r   r   r
   r   F   x   P      )r#   r$   r   encoderr   decoderr&   r   
quant_convpost_quant_convuse_slicing
use_tilingr   spatial_compression_ratiotile_overlap_ttile_overlap_htile_overlap_wkernelr   )r0   rX   rY   r   r   r   r   r   r   r   r   r   r   r   r   r   sample_framesr4   s                    r5   r$   zAutoencoderKLAllegro.__init__  s   : 	'#(-'A1-+

 ('%)%=1-+	
 ))A$7_9LaP!yy/1M
 !)*s3E/F/J)K&! $k;?D///$---$---
r6   xc                 R    | j                   r| j                  |      S t        d      )Nz5Encoding without tiling has not been implemented yet.)r   tiled_encodeNotImplementedError)r0   r  s     r5   _encodezAutoencoderKLAllegro._encode  (     ??$$Q''!"YZZr6   return_dictc                 (   | j                   rU|j                  d   dkD  rC|j                  d      D cg c]  }| j                  |       }}t	        j
                  |      }n| j                  |      }t        |      }|s|fS t        |      S c c}w )a  
        Encode a batch of videos into latents.

        Args:
            x (`torch.Tensor`):
                Input batch of videos.
            return_dict (`bool`, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded videos. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        r   r   )latent_dist)r   rF   splitr	  r;   r<   r   r   )r0   r  r  x_sliceencoded_slicesh	posteriors          r5   encodezAutoencoderKLAllegro.encode'  s    " 
QCD771:Ndll73NNN		.)AQA03	<"y99 Os   Bzc                 R    | j                   r| j                  |      S t        d      )Nz5Decoding without tiling has not been implemented yet.)r   tiled_decoder  )r0   r  s     r5   _decodezAutoencoderKLAllegro._decodeD  r
  r6   c                    | j                   rU|j                  d   dkD  rC|j                  d      D cg c]  }| j                  |       }}t	        j
                  |      }n| j                  |      }|s|fS t        |      S c c}w )a  
        Decode a batch of videos.

        Args:
            z (`torch.Tensor`):
                Input batch of latent vectors.
            return_dict (`bool`, defaults to `True`):
                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        r   r   r   )r   rF   r  r  r;   r<   r   )r0   r  r  z_slicedecoded_slicesdecodeds         r5   decodezAutoencoderKLAllegro.decodeL  sv      
QCD771:Ndll73NNNii/Gll1oG:G,, Os   Bc                 

   d}| j                   }| j                  j                  }|j                  \  }}}}}	t	        j
                  || j                  d   z
  | j                  d   z        dz   }
t	        j
                  || j                  d   z
  | j                  d   z        dz   }t	        j
                  |	| j                  d   z
  | j                  d   z        dz   }d}|j                  |
|z  |z  d| j                  j                  z  | j                  d   |z  | j                  d   |z  | j                  d   |z  f      }|j                  ||| j                  d   | j                  d   | j                  d   f      }t        |
      D ]  }t        |      D ]u  }t        |      D ]c  }|| j                  d   z  || j                  d   z  | j                  d   z   }}|| j                  d   z  || j                  d   z  | j                  d   z   }}|| j                  d   z  || j                  d   z  | j                  d   z   }}|d d d d ||||||f   }||||z  <   ||z  |dz
  k(  s||
|z  |z  dz
  k(  r| j                  |      }||
|z  |z  dz
  k(  r ||z  |dz
  k7  r|d ||z  dz    ||||z  z
  d  n||||z
  dz   |dz    |j                  ||| j                  d   | j                  d   | j                  d   f      }|dz  }f x  |j                  |d| j                  j                  z  ||z  ||z  |	|z  f      }| j                  d   |z  | j                  d   |z  | j                  d   |z  f}| j                  d   |z  | j                  d   |z  | j                  d   |z  f}|d   |d   z
  |d   |d   z
  |d   |d   z
  f}t        |
      D ]  }||d   z  ||d   z  |d   z   }}t        |      D ]  }||d   z  ||d   z  |d   z   }}t        |      D ]u  }||d   z  ||d   z  |d   z   }}t        ||
|d   f|||d   f|||d   f|||z  |z  ||z  z   |z      j                  d            }|d d d d ||||||fxx   |z  cc<   w   |j                  ddddd      j                  dd      }| j!                  |      }|j#                  d|df      j                  ddddd      }|S Nr   r   r
   r   r@   r:   )r   configr   rF   mathfloorr  r   	new_zerosr   rq   r   _prepare_for_blend	unsqueezerD   rG   r   rC   )r0   r  local_batch_sizersrtr>   r   
num_framesheightwidthoutput_num_framesoutput_heightoutput_widthcountoutput_latentvae_batch_inputry   jkn_startn_endh_starth_endw_startw_end
video_cubelatentoutput_kerneloutput_strideoutput_overlaplatent_means                                  r5   r  z!AutoencoderKLAllegro.tiled_encodef  s   ++[[33>?gg;
L*fe JJ
T[[^(Ct{{ST~'UVYZZ

FT[[^$;t{{1~#MNQRRzz54;;q>#9T[[^"KLqP!M1L@DKK///A"$A"$A"$
 ++'7t{{ST~W[WbWbcdWegkgrgrstgu&vw() 	A=) |, A%&Q%7T[[^9KdkkZ[n9\UG%&Q%7T[[^9KdkkZ[n9\UG%&Q%7T[[^9KdkkZ[n9\UG!"1awu}#T!UJ@JOE,<$<=  004Dq4HH $5$E$TWX$XX!%o!> "%6%F%UXY%YY %(8 8<Lq<P PPVWuY^aqYqtuYuPvM%%:J2J*J*LMV\M%2B*BQ*FQRS*+++-|T[[^T[[YZ^]a]h]hij]kl+ QJE5	< T[[888*:JFVXLZ_ceZef
 A",dkk!n.BDKKPQNVXDXXA",dkk!n.BDKKPQNVXDXX!}Q//!}Q//!}Q//
 () 	]Aq!111}Q7G3G-XYJZ3ZUG=) 
]!"]1%5!5q=;K7Km\]N^7^|, ]A%&q)9%91}Q?O;OR_`aRb;bUG"4-~a/@AM>!+<=L.*;<%a-&7,&F\IY&Y\]&]^hhijk	#K 1awu}LMQ\\M]
]	] 1aA.66q!<(!!!j"%56>>q!Q1Mr6   c                 	   d}| j                   }| j                  j                  }| j                  d   |z  | j                  d   |z  | j                  d   |z  f}| j                  d   |z  | j                  d   |z  | j                  d   |z  f}|j
                  \  }}}	}
}|j                  ddddd      j                  dd      }| j                  |      }|j                  d|df      j                  ddddd      }t        j                  |	|d   z
  |d   z        dz   }t        j                  |
|d   z
  |d   z        dz   }t        j                  ||d   z
  |d   z        dz   }d}|j                  ||z  |z  | j                  j                  | j                  d   | j                  d   | j                  d   f      }|j                  |||d   |d   |d   f      }t        |      D ]  }t        |      D ]  }t        |      D ]  }||d   z  ||d   z  |d   z   }}||d   z  ||d   z  |d   z   }}||d   z  ||d   z  |d   z   }}|d d d d ||||||f   }||||z  <   ||z  |dz
  k(  s|||z  |z  dz
  k(  rl| j                  |      }|||z  |z  dz
  k(  r ||z  |dz
  k7  r|d ||z  dz    ||||z  z
  d  n||||z
  dz   |dz    |j                  |||d   |d   |d   f      }|dz  }   |j                  || j                  j                  |	|z  |
|z  ||z  f      }| j                  d   | j                  d   z
  | j                  d   | j                  d   z
  | j                  d   | j                  d   z
  f}t        |      D ]  }|| j                  d   z  || j                  d   z  | j                  d   z   }}t        |      D ]  }|| j                  d   z  || j                  d   z  | j                  d   z   }}t        |      D ]  }|| j                  d   z  || j                  d   z  | j                  d   z   }}t!        |||d   f|||d   f|||d   f|||z  |z  ||z  z   |z      j#                  d            }|d d d d ||||||fxx   |z  cc<      |j                  ddddd      j%                         }|S r  )r   r   r   r  r   rF   rD   rG   r   rC   r!  r"  r#  rY   rq   r   r$  r%  
contiguous) r0   r  r&  r'  r(  latent_kernellatent_strider>   r   r)  r*  r+  r,  r-  r.  r/  decoded_videosr1  ry   r2  r3  r4  r5  r6  r7  r8  r9  current_latentcurrent_videovideovideo_overlapout_video_blends                                    r5   r  z!AutoencoderKLAllegro.tiled_decode  s   ++[[33A",dkk!n.BDKKPQNVXDXXA",dkk!n.BDKKPQNVXDXX>?gg;
L*fe IIaAq!$,,Q2  #KKJ+,44Q1aC JJ
]15E(EWXIY'YZ]^^

F]1-=$=qAQ#QRUVVzz5=+;#;}Q?O"OPSTT!M1L@((AAA
 ++|]1-=}Q?OQ^_`Qab
 () 	A=) |, A%&q)9%91}Q?O;OR_`aRb;bUG%&q)9%91}Q?O;OR_`aRb;bUG%&q)9%91}Q?O;OR_`aRb;bUG%&q!WU]GEM7SX='X%YN@NOE,<$<=  004Dq4HH $5$E$TWX$XX(,_(E "%6%F%UXY%YY %(8 8<Lq<P PQ^ >%*:":Q">RN55;K3K+K+MN XeN53C+Ca+G%RS)T*+++-|]1=M}]^O_anopaqr+ QJE9	@ Z)A)A:PR?TZ]_T_afikaklmKKNT[[^+KKNT[[^+KKNT[[^+
 () 	`AQ/T[[^1CdkkRSn1TUG=) 
`!"T[[^!3QQ5G$++VW.5X|, `A%&Q%7T[[^9KdkkZ[n9\UG&8-}Q/?@M=+;<L-*:;&q='8<'G!lJZ'Z]^'^_iijkl	'O !Qwu}gemKLP__L`
`	` aAq!,779r6   r   sample_posterior	generatorc                     |}| j                  |      j                  }|r|j                  |      }n|j                         }| j	                  |      j                  }|s|fS t        |      S )a  
        Args:
            sample (`torch.Tensor`): Input sample.
            sample_posterior (`bool`, *optional*, defaults to `False`):
                Whether to sample from the posterior.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
            generator (`torch.Generator`, *optional*):
                PyTorch random number generator.
        )rK  r  )r  r  r   moder  r   )	r0   r   rJ  r  rK  r  r  r  decs	            r5   rI   zAutoencoderKLAllegro.forward  sf    " KKN..	  9 5A Akk!n##6MC((r6   )r   r   r   r   r   r   r   r@   r
   r   rK   r@   i@  gp=
ף?T)T)FTN)rL   rM   rN   rO    _supports_gradient_checkpointingr   r%   r   r   rQ   rP   r$   r;   rS   r	  r	   r   r   r   r  r  r   r  r  r  r   	GeneratorrI   rT   rU   s   @r5   r   r     s   +Z (,$ -
+
 /C7Q5O  !!,- $!5G
G
 G
  S/	G
 c3hG
  "#s(O!G
" %*$)$4#G
$ #(c	"2%G
& 'G
( )G
* +G
, -G
. %*/G
0 1G
2 3G
4 5G
6 
7G
 G
R[ [%,, [ 37::,0:	"E*F$GG	H: :8[ [%,, [ - -4 -5X]XdXdIdCe - -2Qell Qu|| QfWell Wu|| Wx "' /3)) ) 	)
 EOO,) 
}ell*	+)r6   r   c                    | \  }}}|\  }}}	|\  }
}}|dkD  r|dkD  rx|d d d d d|d d d d f   t        j                  d|      j                         j                  |j                        |z  j                  |dd      z  |d d d d d|d d d d f<   ||dz
  k  r}|d d d d | d d d d d f   dt        j                  d|      j                         j                  |j                        |z  z
  j                  |dd      z  |d d d d | d d d d d f<   |dkD  rw|d d d d d d d|	d d f   t        j                  d|	      j                         j                  |j                        |	z  j                  |	d      z  |d d d d d d d|	d d f<   ||dz
  k  r||d d d d d d |	 d d d f   dt        j                  d|	      j                         j                  |j                        |	z  z
  j                  |	d      z  |d d d d d d |	 d d d f<   |
dkD  rg|d d d d d d d d d|f   t        j                  d|      j                         j                  |j                        |z  z  |d d d d d d d d d|f<   |
|dz
  k  rl|d d d d d d d d | d f   dt        j                  d|      j                         j                  |j                        |z  z
  z  |d d d d d d d d | d f<   |S )Nr   r   )r;   arangerP   r   devicereshape)n_paramh_paramw_paramr  nn_max	overlap_nr  h_max	overlap_hww_max	overlap_ws                r5   r$  r$  1  s   !Aui!Aui!Aui1}q5)*1a9a+B)CQ	*00255ahh?)KgiA&*'AaAiKA%& uqy=)*1a)a+B)CELLI.44699!((CiOOgiA&*'AaYJKA%& 	1u%&q!Q)Q'>%?LLI&,,.11!((;iG
')Q
& !Q1Y;
!" 	519}%&q!Q
Q'>%?Q	*00255ahh?)KK
')Q
& !QI:;
!" 	1u%&q!Q1Y;'>%?LLI&,,.11!((;iG&
!Q1a	k
!" 	519}%&q!QI:;'>%?Q	*00255ahh?)KK&
!Q1yjk
!" Hr6   )(r!  typingr   r   r   r;   torch.nnr&   configuration_utilsr   r   utils.accelerate_utilsr	   attention_processorr   r   autoencoders.vaer   r   downsamplingr   modeling_outputsr   modeling_utilsr   r~   r   
upsamplingr   vaer   Moduler   rW   r   r   r   r   r   r$   r6   r5   <module>rm     s      ) )   B 8 8 J ' 2 ' " # !bryy bJT TnOryy OdlBII l^|ryy |~Dryy DNI):'7 I)Xr6   