
    i<                        d dl mZ d dlmZmZmZ d dlZd dlmc m	Z
 d dlmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% e G d de             Z& G d deee"e      Z'y)    )	dataclass)OptionalTupleUnionN)nn   )ConfigMixinregister_to_config)ConsistencyDecoderScheduler)
BaseOutput)apply_forward_hook)randn_tensor   )AttentionMixin)ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORSAttnAddedKVProcessorAttnProcessor)
ModelMixin)UNet2DModel   )AutoencoderMixinDecoderOutputDiagonalGaussianDistributionEncoderc                       e Zd ZU dZded<   y)ConsistencyDecoderVAEOutputa2  
    Output of encoding method.

    Args:
        latent_dist (`DiagonalGaussianDistribution`):
            Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
            `DiagonalGaussianDistribution` allows for sampling latents from the distribution.
    r   latent_distN)__name__
__module____qualname____doc____annotations__     /home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/diffusers/models/autoencoders/consistency_decoder_vae.pyr   r   &   s     0/r%   r   c            4           e Zd ZdZdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d/dedededede	edf   d	e
d
e	edf   dedededede
de	edf   de	edf   dededededededededede	edf   f0 fd       Zd Ze	 d0dej                  de
d eee	e   f   fd!       Ze	 	 	 d1d"ej                  d#eej*                     de
d$ed eee	ej                     f   f
d%       Zd&ej                  d'ej                  d(ed ej                  fd)Zd&ej                  d'ej                  d(ed ej                  fd*Zd0dej                  de
d eee	f   fd+Z	 	 	 d2d,ej                  d-e
de
d#eej*                     d eee	ej                     f   f
d.Z xZS )3ConsistencyDecoderVAEaP  
    The consistency decoder used with DALL-E 3.

    Examples:
        ```py
        >>> import torch
        >>> from diffusers import StableDiffusionPipeline, ConsistencyDecoderVAE

        >>> vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)
        >>> pipe = StableDiffusionPipeline.from_pretrained(
        ...     "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, torch_dtype=torch.float16
        ... ).to("cuda")

        >>> image = pipe("horse", generator=torch.manual_seed(0)).images[0]
        >>> image
        ```
    Fscaling_factorlatent_channelssample_sizeencoder_act_fnencoder_block_out_channels.encoder_double_zencoder_down_block_typesencoder_in_channelsencoder_layers_per_blockencoder_norm_num_groupsencoder_out_channelsdecoder_add_attentiondecoder_block_out_channelsdecoder_down_block_typesdecoder_downsample_paddingdecoder_in_channelsdecoder_layers_per_blockdecoder_norm_epsdecoder_norm_num_groupsdecoder_num_train_timestepsdecoder_out_channelsdecoder_resnet_time_scale_shiftdecoder_time_embedding_typedecoder_up_block_typesc                 |   t         |           t        ||||||	|
|      | _        t	        |||||||||||||      | _        t               | _        | j                  |       | j                  d       | j                  dt        j                  g d      d d d d d f   d       | j                  d	t        j                  g d
      d d d d d f   d       t        j                  d|z  d|z  d      | _        d| _        d| _        | j"                  j$                  | _        t)        | j"                  j$                  t*        t,        f      r| j"                  j$                  d   n| j"                  j$                  }t/        |dt1        | j"                  j2                        dz
  z  z        | _        d| _        y )N)act_fnblock_out_channelsdouble_zdown_block_typesin_channelslayers_per_blocknorm_num_groupsout_channels)add_attentionrC   rE   downsample_paddingrF   rG   norm_epsrH   num_train_timestepsrI   resnet_time_scale_shifttime_embedding_typeup_block_types)rC   F)force_upcastmeans)gg:?gyD?glL?gN3^)
persistentstds)g4?gn=?gr	^?gr` ?r   r   r   g      ?)super__init__r   encoderr   decoder_unetr   decoder_schedulerr
   register_buffertorchtensorr   Conv2d
quant_convuse_slicing
use_tilingconfigr+   tile_sample_min_size
isinstancelisttupleintlenrC   tile_latent_min_sizetile_overlap_factor)selfr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   	__class__s                            r&   rV   zConsistencyDecoderVAE.__init__I   s   V 	!9%5+53-	
 (/959+5%3 ;-$C ;1
 "=!>3MNU3LLIJ4QRTXZ^K^_ 	 	

 	ELL!OPQUWXZ^`dQderw 	 	
 ))A$7_9LaP  %)KK$;$;! $++11D%=A KK##A&(( 	
 %(qSA_A_=`cd=d7e(f$g!#' r%   c           	      j   t        d | j                  j                         D              rt               }nmt        d | j                  j                         D              rt	               }n8t        dt        t        | j                  j                                            | j                  |       y)ze
        Disables custom attention processors and sets the default attention implementation.
        c              3   @   K   | ]  }|j                   t        v   y wN)rk   r   .0procs     r&   	<genexpr>zCConsistencyDecoderVAE.set_default_attn_processor.<locals>.<genexpr>   s     i4t~~!>>i   c              3   @   K   | ]  }|j                   t        v   y wrn   )rk   r   ro   s     r&   rr   zCConsistencyDecoderVAE.set_default_attn_processor.<locals>.<genexpr>   s     h$#==hrs   zOCannot call `set_default_attn_processor` when attention processors are of type N)	allattn_processorsvaluesr   r   
ValueErrornextiterset_attn_processor)rj   	processors     r&   set_default_attn_processorz0ConsistencyDecoderVAE.set_default_attn_processor   s     i4K_K_KfKfKhii,.Ih$J^J^JeJeJghh%Iabfgklp  mA  mA  mH  mH  mJ  hK  cL  bM  N  		*r%   xreturn_dictreturnc                    | j                   rK|j                  d   | j                  kD  s|j                  d   | j                  kD  r| j                  ||      S | j                  rU|j                  d   dkD  rC|j                  d      D cg c]  }| j                  |       }}t        j                  |      }n| j                  |      }| j                  |      }t        |      }|s|fS t        |      S c c}w )al  
        Encode a batch of images into latents.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
                instead of a plain tuple.

        Returns:
                The latent representations of the encoded images. If `return_dict` is True, a
                [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned, otherwise a
                plain `tuple` is returned.
        )r   r   r   r   )r`   shaperb   tiled_encoder_   splitrW   r[   catr^   r   r   )rj   r~   r   x_sliceencoded_sliceshmoments	posteriors           r&   encodezConsistencyDecoderVAE.encode   s    $ ??d.G.G G177SU;Y]YrYrKr$$QK$@@
QCD771:Ndll73NNN		.)AQA//!$09	<*yAA Os   	C7z	generatornum_inference_stepsc                 :   || j                   j                  z  | j                  z
  | j                  z  }dt	        | j                   j
                        dz
  z  }t        j                  |d|      }|j                  \  }}}}	| j                  j                  || j                         | j                  j                  t        |d||	f||j                  |j                        z  }
| j                  j                  D ]  }t!        j"                  | j                  j%                  |
|      |gd      }| j'                  ||      j(                  d	d	d	dd	d	d	d	f   }| j                  j+                  |||
|      j,                  }|}
 |
}|s|fS t/        |
      S )a  
        Decodes the input latent vector `z` using the consistency decoder VAE model.

        Args:
            z (torch.Tensor): The input latent vector.
            generator (Optional[torch.Generator]): The random number generator. Default is None.
            return_dict (bool): Whether to return the output as a dictionary. Default is True.
            num_inference_steps (int): The number of inference steps. Default is 2.

        Returns:
            Union[DecoderOutput, Tuple[torch.Tensor]]: The decoded output.

        r   r   nearest)modescale_factor)devicer   )r   dtyper   dimNsample)ra   r)   rR   rT   rg   rC   Finterpolater   rY   set_timestepsr   init_noise_sigmar   r   	timestepsr[   concatscale_model_inputrX   r   stepprev_sampler   )rj   r   r   r   r   r   
batch_size_heightwidthx_ttmodel_inputmodel_outputr   x_0s                   r&   decodezConsistencyDecoderVAE.decode   sy   * +++djj8DIIES!?!?@1DEMM!),G'(ww$
Avu,,-@,U$$55FE*iqwwWXW_W_9
 
 ''11 	A,,(>(>(P(PQTVW(XZ['\bcdK,,[!<CCArr1aKPL0055lAsIVbbKC		 6MC((r%   abblend_extentc                     t        |j                  d   |j                  d   |      }t        |      D ]A  }|d d d d | |z   d d f   d||z  z
  z  |d d d d |d d f   ||z  z  z   |d d d d |d d f<   C |S )Nr   r   minr   range)rj   r   r   r   ys        r&   blend_vzConsistencyDecoderVAE.blend_v  s    1771:qwwqz<@|$ 	xAa\MA$5q89Q\AQ=QRUVWXZ[]^`aWaUbfgjvfvUwwAaAqjM	xr%   c                     t        |j                  d   |j                  d   |      }t        |      D ]A  }|d d d d d d | |z   f   d||z  z
  z  |d d d d d d |f   ||z  z  z   |d d d d d d |f<   C |S )Nr   r   r   )rj   r   r   r   r~   s        r&   blend_hzConsistencyDecoderVAE.blend_h  s    1771:qwwqz<@|$ 	xAaA}q'889Q\AQ=QRUVWXZ[]^`aWaUbfgjvfvUwwAaAqjM	xr%   c           
         t        | j                  d| j                  z
  z        }t        | j                  | j                  z        }| j                  |z
  }g }t	        d|j
                  d   |      D ]  }g }t	        d|j
                  d   |      D ]`  }	|dddd||| j                  z   |	|	| j                  z   f   }
| j                  |
      }
| j                  |
      }
|j                  |
       b |j                  |        g }t        |      D ]  \  }}g }t        |      D ]d  \  }	}
|dkD  r| j                  ||dz
     |	   |
|      }
|	dkD  r| j                  ||	dz
     |
|      }
|j                  |
ddddd|d|f          f |j                  t        j                  |d              t        j                  |d      }t        |      }|s|fS t        |      S )a  Encode a batch of images using a tiled encoder.

        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
        different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
        tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
        output, but they should be much less noticeable.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
                instead of a plain tuple.

        Returns:
            [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] or `tuple`:
                If return_dict is True, a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
                is returned, otherwise a plain `tuple` is returned.
        r   r   r   r   Nr   r   )rf   rb   ri   rh   r   r   rW   r^   append	enumerater   r   r[   r   r   r   )rj   r~   r   overlap_sizer   	row_limitrowsirowjtileresult_rows
result_rowr   r   s                  r&   r   z"ConsistencyDecoderVAE.tiled_encode  s   ( 444D<T<T8TUV444t7O7OOP--<	 q!''!*l3 	AC1aggaj,7 !Aq1t'@'@#@@!a$JcJcFcBccd||D)t,

4 	!
 KK	 o 
	=FAsJ$S> F4 q5<<QUAlKDq5<<AE
D,GD!!$q!ZiZ)'C"DEF uyy;<
	= ))KQ/09	<*yAAr%   r   sample_posteriorc                     |}| j                  |      j                  }|r|j                  |      }n|j                         }| j	                  ||      j                  }|s|fS t        |      S )a  
        Args:
            sample (`torch.Tensor`): Input sample.
            sample_posterior (`bool`, *optional*, defaults to `False`):
                Whether to sample from the posterior.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
            generator (`torch.Generator`, *optional*, defaults to `None`):
                Generator to use for sampling.

        Returns:
            [`DecoderOutput`] or `tuple`:
                If return_dict is True, a [`DecoderOutput`] is returned, otherwise a plain `tuple` is returned.
        )r   r   )r   r   r   r   r   r   )	rj   r   r   r   r   r~   r   r   decs	            r&   forwardzConsistencyDecoderVAE.forwardQ  sk    * KKN..	  9 5A Akk!yk1886MC((r%   )g{P?       silu)         r   T)DownEncoderBlock2Dr   r   r   r   r   r   r   F)i@  i     r   )ResnetDownsampleBlock2Dr   r   r   r      r   gh㈵>r   r      scale_shiftlearned)ResnetUpsampleBlock2Dr   r   r   )T)NTr   )FTN)r   r    r!   r"   _supports_group_offloadingr
   floatrf   strr   boolrV   r}   r   r[   Tensorr   r   r   r   r   	Generatorr   r   r   r   r   r   __classcell__)rk   s   @r&   r(   r(   4   sX   $ "' !( $6J!%5
 $%()')$%&+6L5
 +,#$()"'')+/$%/<+43
G^(^( ^( 	^(
 ^( %*#s(O^( ^( #(S/^( !^( #&^(  "%!^(" "#^($  $%^(& %*#s(O'^(( #(S/)^(4 %(5^(6 !7^(8 #&9^(:  ;^(< "%=^(> &)?^(@ "A^(B *-C^(D &)E^(F !&c3hG^( ^(B+ 37 B B,0 B	*E2N,OO	P B  BD  04 #$,)<<,) EOO,,) 	,)
 !,) 
}eELL11	2,) ,)^ %,, c ell  %,, c ell 5Bell 5B 5BOjlqOqIr 5Bt "' /3 ) )  ) 	 )
 EOO, ) 
}eELL11	2 )r%   r(   )(dataclassesr   typingr   r   r   r[   torch.nn.functionalr   
functionalr   configuration_utilsr	   r
   
schedulersr   utilsr   utils.accelerate_utilsr   utils.torch_utilsr   	attentionr   attention_processorr   r   r   r   modeling_utilsr   unets.unet_2dr   vaer   r   r   r   r   r(   r$   r%   r&   <module>r      sv    " ) )     B 5  8 - &  ( ' W W 
0* 
0 
0})J8H+ })r%   