
    i"`                        d dl Z d dlmZmZmZ d dlZd dlZd dlZ	d dl
Z
d dlmZmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZm Z  ddl!m"Z"m#Z#m$Z$ ddlm%Z%  e       rd dl&Z& e       rd dl'm(Z(  ejR                  e*      Z+d Z,d Z-d Z.dededee/ee/   f   de0de
jb                  f
dZ2	 d5dedededee
jb                     fdZ3	 d6de
jh                  dee
jj                     d e/fd!Z6	 d7d"e
jh                  d#ede
jj                  de
jb                  d$e
jn                  d%e0fd&Z8 G d' d(e      Z9 G d) d*e      Z: G d+ d,e      Z; G d- d.e      Z< G d/ d0e      Z= G d1 d2e      Z> G d3 d4e      Z?y)8    N)ListOptionalUnion)AutoTokenizerCLIPImageProcessorCLIPVisionModelUMT5EncoderModel   )
FrozenDict)ClassifierFreeGuidance)PipelineImageInput)AutoencoderKLWan)is_ftfy_availableis_torchvision_availablelogging)VideoProcessor   )ModularPipelineBlocksPipelineState)ComponentSpec
InputParamOutputParam   )WanModularPipeline)
transformsc                     t        j                  |       } t        j                  t        j                  |             } | j	                         S N)ftfyfix_texthtmlunescapestriptexts    r/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/diffusers/modular_pipelines/wan/encoders.pybasic_cleanr&   -   s3    ==D==t,-D::<    c                 T    t        j                  dd|       } | j                         } | S )Nz\s+ )resubr"   r#   s    r%   whitespace_cleanr,   3   s$    66&#t$D::<DKr'   c                 .    t        t        |             } | S r   )r,   r&   r#   s    r%   prompt_cleanr.   9   s    K-.DKr'   text_encoder	tokenizerpromptmax_sequence_lengthdevicec                    | j                   }t        |t              r|gn|}|D cg c]  }t        |       }} ||d|dddd      }|j                  |j
                  }	}|	j                  d      j                  d      j                         }
 | |j                  |      |	j                  |            j                  }|j                  ||      }t        ||
      D cg c]
  \  }}|d |  }}}t        j                  |D cg c]J  }t        j                  ||j                  ||j!                  d      z
  |j!                  d            g      L c}d      }|S c c}w c c}}w c c}w )	N
max_lengthTpt)paddingr5   
truncationadd_special_tokensreturn_attention_maskreturn_tensorsr   r   dim)dtyper3   )r>   
isinstancestrr.   	input_idsattention_maskgtsumlongtolast_hidden_stateziptorchstackcat	new_zerossize)r/   r0   r1   r2   r3   r>   utext_inputstext_input_idsmaskseq_lensprompt_embedsvs                r%   get_t5_prompt_embedsrU   >   sR    E#FC0fXfF'-.!l1o.F.&"K '00+2L2LDNwwqz~~!~$))+H !2!26!:DGGFOL^^M!$$5$@M'*=('CDtq!QrUDMDKKZghUVAq{{#6#BAFF1INO	PhnoM ) / Ehs   E"#E'AE-imageimage_processorimage_encoderc                 j     || d      j                  |      }  |di | ddi}|j                  d   S )Nr6   )imagesr;   output_hidden_statesT )rF   hidden_states)rV   rW   rX   r3   image_embedss        r%   encode_imager`   ^   s>     5>AA&IE D5DtDL%%b))r'   encoder_output	generatorsample_modec                     t        | d      r |dk(  r| j                  j                  |      S t        | d      r|dk(  r| j                  j                         S t        | d      r| j                  S t        d      )Nlatent_distsampleargmaxlatentsz3Could not access latents of provided encoder_output)hasattrre   rf   moderh   AttributeError)ra   rb   rc   s      r%   retrieve_latentsrl   j   st     ~}-+2I))00;;		/K84K))..00		+%%%RSSr'   video_tensorvaer>   latent_channelsc                    t        | t        j                        st        dt	        |        d      t        |t
              rCt        |      | j                  d   k7  r(t        dt        |       d| j                  d    d      | j                  ||      } t        |t
              rat        | j                  d         D cg c](  }t        |j                  | ||dz          ||   d	      * }}t        j                  |d
      }nt        |j                  |       d      }t        j                  |j                  j                        j!                  d|ddd      j                  |j"                  |j$                        }dt        j                  |j                  j&                        j!                  d|ddd      j                  |j"                  |j$                        z  }	||z
  |	z  }|S c c}w )Nz*Expected video_tensor to be a tensor, got .r   z/You have passed a list of generators of length z), but it is not same as number of images r3   r>   r   rg   )rb   rc   r<   )rc   g      ?)r?   rI   Tensor
ValueErrortypelistlenshaperF   rangerl   encoderK   tensorconfiglatents_meanviewr3   r>   latents_std)
rm   rn   rb   r3   r>   ro   ivideo_latentsr}   r   s
             r%   encode_vae_imager   w   s    lELL1Ed<FXEYYZ[\\)T"s9~9K9KA9N'N=c)n=MMv  xD  xJ  xJ  KL  xM  wN  NO  P
 	
  ??&?>L)T" <--a01
 SZZQQ(?@IVWLfno
 
 		-Q7(L)AxX 	SZZ,,-	a!Q	*	M  -"5"5	6 
 SZZ%;%;<AA!_VWYZ\]^aam11 K #\1[@M%
s   -G3c                      e Zd ZdZedefd       Zedee   fd       Z	edee
   fd       Zedee   fd       Zed        Ze	 	 	 	 dd	ed
eej$                     dedee   def
d       Z ej,                         dededefd       Zy)WanTextEncoderStepwanreturnc                      y)NzMText Encoder step that generate text_embeddings to guide the video generationr]   selfs    r%   descriptionzWanTextEncoderStep.description   s    ^r'   c           
      |    t        dt              t        dt              t        dt        t	        ddi      d      gS )Nr/   r0   guiderguidance_scaleg      @from_configr|   default_creation_method)r   r	   r   r   r   r   s    r%   expected_componentsz&WanTextEncoderStep.expected_components   sB     .*:;+}5&!#3S"9:(5		
 		
r'   c                 F    t        d      t        d      t        dd      gS )Nr1   negative_promptr2      )default)r   r   s    r%   inputszWanTextEncoderStep.inputs   s*     x (),c:
 	
r'   c                 v    t        dt        j                  dd      t        dt        j                  dd      gS )NrS   denoiser_input_fieldsz2text embeddings used to guide the image generation)	type_hintkwargs_typer   negative_prompt_embedsz;negative text embeddings used to guide the image generationr   rI   rs   r   s    r%   intermediate_outputsz'WanTextEncoderStep.intermediate_outputs   s@     ,,3P	 (,,3Y	
 	
r'   c                     | j                   Wt        | j                   t              s<t        | j                   t              s!t	        dt        | j                                y y y )Nz2`prompt` has to be of type `str` or `list` but is )r1   r?   r@   rv   rt   ru   )block_states    r%   check_inputszWanTextEncoderStep.check_inputs   s]    );--s3J{GYGY[_<`QRVWbWiWiRjQklmm =a3 *r'   Nr1   r3   prepare_unconditional_embedsr   r2   c                    |xs | j                   }t        |t              s|g}t        |      }t	        | j
                  | j                  |||      }|r|xs d}t        |t              r||gz  n|}|:t        |      t        |      ur$t        dt        |       dt        |       d      |t        |      k7  r!t        d| dt        |       d| d| d		      t	        | j
                  | j                  |||      }|fS )
aC  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            prepare_unconditional_embeds (`bool`):
                whether to use prepare unconditional embeddings or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            max_sequence_length (`int`, defaults to `512`):
                The maximum number of text tokens to be used for the generation process.
        )r/   r0   r1   r2   r3    z?`negative_prompt` should be the same type to `prompt`, but got z != rq   z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)_execution_devicer?   rv   rw   rU   r/   r0   r@   ru   	TypeErrorrt   )	
componentsr1   r3   r   r   r2   
batch_sizerS   r   s	            r%   encode_promptz WanTextEncoderStep.encode_prompt   sG   4 7:77&$'XF[
,#00 ** 3
 (-3O@J?\_@`jO+<<fuO!d6l$:O&OUVZ[jVkUl mV~Q(  s?33 )/)::J3K_J` ax/
| <33  &:'44$..&$7&" 444r'   r   statec                 @   | j                  |      }| j                  |       |j                  |_        | j	                  ||j
                  |j                  |j                  |j                  |j                        \  |_	        |_
        | j                  ||       ||fS )N)r   r1   r3   r   r   r2   )get_block_stater   r   r3   r   r1   requires_unconditional_embedsr   r2   rS   r   set_block_state)r   r   r   r   s       r%   __call__zWanTextEncoderStep.__call__  s     **51+&'99 !%%%%)3)Q)Q'77 + ? ?  
	
%. 	UK05  r'   )NTNr   )__name__
__module____qualname__
model_namepropertyr@   r   r   r   r   r   r   r   r   staticmethodr   r   rI   r3   boolintr   no_gradr   r   r   r]   r'   r%   r   r      s,   J_S _ _ 

T-%8 

 

 
Z( 
 
 
d;&7 
 
  n n  *.-1)-#&>5>5 &>5 '+	>5
 "#>5 !>5 >5@ U]]_!#5 !m !P] ! !r'   r   c                   r    e Zd ZdZedefd       Zedee   fd       Z	edee
   fd       ZdededefdZy	)
WanImageResizeStepr   r   c                      y)NzoImage Resize step that resize the image to the target area (height * width) while maintaining the aspect ratio.r]   r   s    r%   r   zWanImageResizeStep.description1       Ar'   c                     t        dt        j                  j                  d      t        dt        d      t        dt        d      gS )	NrV   Tr   requiredheighti  )r   r   widthi@  )r   PILImager   r   s    r%   r   zWanImageResizeStep.inputs5  s:     w#))//DIx3<w#s;
 	
r'   c                 N    t        dt        j                  j                        gS )Nresized_imager   r   r   r   r   s    r%   r   z'WanImageResizeStep.intermediate_outputs=  s      399??C
 	
r'   r   r   c                    | j                  |      }|j                  |j                  z  }|j                  }|j                  |j                  z  }|j                  |j
                  z  }t        t        j                  ||z              |z  |z  |_        t        t        j                  ||z              |z  |z  |_        |j                  |j                  |j                  f      |_
        | j                  ||       ||fS r   )r   r   r   rV   vae_scale_factor_spatialpatch_size_spatialroundnpsqrtresizer   r   )r   r   r   r   max_arearV   aspect_ratio	mod_values           r%   r   zWanImageResizeStep.__call__C  s    **51%%(9(99!!||ekk177*:W:WW	"2778l+B#CD	QT]]!"''(\*A"BCyPS\\$)LL+2C2C[EWEW1X$Y!UK05  r'   Nr   r   r   r   r   r@   r   r   r   r   r   r   r   r   r   r]   r'   r%   r   r   .  s    JAS A A 
Z( 
 
 
d;&7 
 

!#5 !m !P] !r'   r   c                   r    e Zd ZdZedefd       Zedee   fd       Z	edee
   fd       ZdededefdZy	)
WanImageCropResizeStepr   r   c                      y)NzdImage Resize step that resize the last_image to the same size of first frame image with center crop.r]   r   s    r%   r   z"WanImageCropResizeStep.descriptionU  s    ur'   c                     t        dt        j                  j                  dd      t        dt        j                  j                  dd      gS )Nr   TzThe resized first frame image)r   r   r   
last_imagezThe last frameimager   r   r   r   s    r%   r   zWanImageCropResizeStep.inputsY  sA     399??TWv |syy[pq	
 	
r'   c                 N    t        dt        j                  j                        gS )Nresized_last_imager   r   r   s    r%   r   z+WanImageCropResizeStep.intermediate_outputsb  s!     ,		H
 	
r'   r   r   c                    | j                  |      }|j                  j                  }|j                  j                  }|j                  }t        ||j                  z  ||j                  z        }t        |j                  |z        }t        |j                  |z        }||g}t        j                  j                  ||      }	|	|_
        | j                  ||       ||fS r   )r   r   r   r   r   maxr   r   
functionalcenter_cropr   r   )
r   r   r   r   r   r   rV   resize_ratiorM   r   s
             r%   r   zWanImageCropResizeStep.__call__h  s    **51**11))//&& 55;;.0EF ekkL01u||l23v"--99%F)6&UK05  r'   Nr   r]   r'   r%   r   r   R  s    JvS v v 
Z( 
 
 
d;&7 
 

!#5 !m !P] !r'   r   c                       e Zd ZdZedefd       Zedee   fd       Z	edee
   fd       Zedee   fd       Zdededefd	Zy
)WanImageEncoderStepr   r   c                      y)NzfImage Encoder step that generate image_embeds based on first frame image to guide the video generationr]   r   s    r%   r   zWanImageEncoderStep.description  s    wr'   c                 B    t        dt              t        dt              gS NrW   rX   r   r   r   r   s    r%   r   z'WanImageEncoderStep.expected_components  $     +-?@/?;
 	
r'   c                 P    t        dt        j                  j                  d      gS )Nr   Tr   r   r   s    r%   r   zWanImageEncoderStep.inputs  s"     #))//DQ
 	
r'   c                 <    t        dt        j                  d      gS Nr_   zThe image embeddingsr   r   r   r   s    r%   r   z(WanImageEncoderStep.intermediate_outputs       %,,Lbc
 	
r'   r   r   c                     | j                  |      }|j                  }|j                  }t        |j                  |j
                  ||      }||_        | j                  ||       ||fS N)rW   rX   rV   r3   )r   r   r   r`   rW   rX   r_   r   )r   r   r   r   r3   rV   r_   s          r%   r   zWanImageEncoderStep.__call__  so    **51--))#&66$22	
 $0 UK05  r'   Nr   r   r   r   r   r@   r   r   r   r   r   r   r   r   r   r   r   r]   r'   r%   r   r   }  s    JxS x x 
T-%8 
 
 
Z( 
 

 
d;&7 
 

!#5 !m !P] !r'   r   c                       e Zd ZdZedefd       Zedee   fd       Z	edee
   fd       Zedee   fd       Zdededefd	Zy
)!WanFirstLastFrameImageEncoderStepr   r   c                      y)NzpImage Encoder step that generate image_embeds based on first and last frame images to guide the video generationr]   r   s    r%   r   z-WanFirstLastFrameImageEncoderStep.description  s     Br'   c                 B    t        dt              t        dt              gS r   r   r   s    r%   r   z5WanFirstLastFrameImageEncoderStep.expected_components  r   r'   c                     t        dt        j                  j                  d      t        dt        j                  j                  d      gS )Nr   Tr   r   r   r   s    r%   r   z(WanFirstLastFrameImageEncoderStep.inputs  s8     #))//DQ+syyQUV
 	
r'   c                 <    t        dt        j                  d      gS r   r   r   s    r%   r   z6WanFirstLastFrameImageEncoderStep.intermediate_outputs  r   r'   r   r   c                     | j                  |      }|j                  }|j                  }|j                  }t	        |j
                  |j                  ||g|      }||_        | j                  ||       ||fS r   )	r   r   r   r   r`   rW   rX   r_   r   )r   r   r   r   r3   first_frame_imagelast_frame_imager_   s           r%   r   z*WanFirstLastFrameImageEncoderStep.__call__  s    **51--'55&99#&66$22$&67	
 $0 UK05  r'   Nr   r]   r'   r%   r   r     s    JBS B B 
T-%8 
 
 
Z( 
 
 
d;&7 
 

!#5 !m !P] !r'   r   c                       e Zd ZdZedefd       Zedee   fd       Z	edee
   fd       Zedee   fd       Zed        Zded	edefd
Zy)WanVaeImageEncoderStepr   r   c                      y)NzoVae Image Encoder step that generate condition_latents based on first frame image to guide the video generationr]   r   s    r%   r   z"WanVaeImageEncoderStep.description  r   r'   c           	      ^    t        dt              t        dt        t        ddi      d      gS Nrn   video_processorvae_scale_factor   r   r   r   r   r   r   r   s    r%   r   z*WanVaeImageEncoderStep.expected_components  8     %!12!!#5q"9:(5	
 	
r'   c                     t        dt        j                  j                  d      t        d      t        d      t        d      t        d      gS )Nr   Tr   r   r   
num_framesrb   r   r   s    r%   r   zWanVaeImageEncoderStep.inputs  sB     #))//DQx w|${#
 	
r'   c                 <    t        dt        j                  d      gS )Nfirst_frame_latentsz@video latent representation with the first frame image conditionr   r   r   s    r%   r   z+WanVaeImageEncoderStep.intermediate_outputs  s#     %,,^
 	
r'   c           	         |j                   |j                   | j                  z  dk7  s(|j                  O|j                  | j                  z  dk7  r3t        d| j                   d|j                    d|j                   d      |j                  U|j                  dk  s|j                  dz
  | j
                  z  dk7  r&t        d| j
                   d|j                   d      y y 	Nr   z-`height` and `width` have to be divisible by z	 but are z and rq   r   zQ`num_frames` has to be greater than 0, and (num_frames - 1) must be divisible by z
, but got r   r   r   rt   r  vae_scale_factor_temporalr   r   s     r%   r   z#WanVaeImageEncoderStep.check_inputs  O   *{/A/AJDgDg/gkl/l)k.?.?*BeBe.eij.j?
@c@c?ddmny  oA  oA  nB  BG  HS  HY  HY  GZ  Z[  \  !!-""Q&;+A+AA+EImIm*mqr*rcdn  eI  eI  dJ  JT  U`  Uk  Uk  Tl  lm  n  +s .r'   r   r   c           
      "   | j                  |      }| j                  ||       |j                  }|j                  }t        j
                  }|j                  xs |j                  }|j                  xs |j                  }|j                  xs |j                  }	|j                  j                  |||      j                  ||      }
|
j                         dk(  r|
j!                  d      }
t	        j"                  |
|
j%                  |
j&                  d   |
j&                  d   |	dz
  ||      gd      j                  ||      }t)        ||j*                  |j,                  |||j.                        |_        | j3                  ||       ||fS )	Nr   r   rr      r   r   r   r<   rm   rn   rb   r3   r>   ro   )r   r   r   r   rI   float32r   default_heightr   default_widthr  default_num_framesr   
preprocessrF   r=   	unsqueezerK   rL   rx   r   rn   rb   num_channels_latentsr  r   )r   r   r   r   rV   r3   r>   r   r   r  image_tensorrm   s               r%   r   zWanVaeImageEncoderStep.__call__  s   **51*k2))--##@z'@'@!!=Z%=%= ++Lz/L/L
!11<<U6Y^<_bb c 
 "'11!4Lyy&&|'9'9!'<l>P>PQR>SU_bcUcekmrs 
 "F%"
( 	 +;%!++&;;+
' 	UK05  r'   Nr   r   r   r   r   r@   r   r   r   r   r   r   r   r   r   r   r   r   r   r]   r'   r%   r   r     s    JAS A A 	
T-%8 	
 	
 
Z( 
 
 
d;&7 
 
  &!#5 &!m &!P] &!r'   r   c                       e Zd ZdZedefd       Zedee   fd       Z	edee
   fd       Zedee   fd       Zed        Zded	edefd
Zy)$WanFirstLastFrameVaeImageEncoderStepr   r   c                      y)NzyVae Image Encoder step that generate condition_latents based on first and last frame images to guide the video generationr]   r   s    r%   r   z0WanFirstLastFrameVaeImageEncoderStep.description9  s     Kr'   c           	      ^    t        dt              t        dt        t        ddi      d      gS r   r   r   s    r%   r   z8WanFirstLastFrameVaeImageEncoderStep.expected_components=  r   r'   c                     t        dt        j                  j                  d      t        dt        j                  j                  d      t        d      t        d      t        d      t        d      gS )	Nr   Tr   r   r   r   r  rb   r   r   s    r%   r   z+WanFirstLastFrameVaeImageEncoderStep.inputsI  sX     #))//DQ+syyQUVx w|${#
 	
r'   c                 <    t        dt        j                  d      gS )Nfirst_last_frame_latentszJvideo latent representation with the first and last frame images conditionr   r   r   s    r%   r   z9WanFirstLastFrameVaeImageEncoderStep.intermediate_outputsT  s#     *,,h
 	
r'   c           	         |j                   |j                   | j                  z  dk7  s(|j                  O|j                  | j                  z  dk7  r3t        d| j                   d|j                    d|j                   d      |j                  U|j                  dk  s|j                  dz
  | j
                  z  dk7  r&t        d| j
                   d|j                   d      y y r  r  r	  s     r%   r   z1WanFirstLastFrameVaeImageEncoderStep.check_inputs^  r
  r'   r   r   c           
         | j                  |      }| j                  ||       |j                  }|j                  }|j                  }t
        j                  }|j                  xs |j                  }|j                  xs |j                  }	|j                  xs |j                  }
|j                  j                  |||	      j                  ||      }|j!                  d      }|j                  j                  |||	      j                  ||      }|j!                  d      }t        j"                  ||j%                  |j&                  d   |j&                  d   |
dz
  ||	      |gd      j                  ||      }t)        ||j*                  |j,                  |||j.                        |_        | j3                  ||       ||fS )Nr  rr   r   r   r   r<   r  )r   r   r   r   r   rI   r  r   r  r   r  r  r  r   r  rF   r  rK   rL   rx   r   rn   rb   r  r  r   )r   r   r   r   r   r   r3   r>   r   r   r  first_image_tensorlast_image_tensorrm   s                 r%   r   z-WanFirstLastFrameVaeImageEncoderStep.__call__m  s   **51*k2'55&99--##@z'@'@!!=Z%=%= ++Lz/L/L
'77BBCT]ckpBqtt u 
 099!<&66AABR[ainAorr s 
 .77:yy"",,&,,Q/1C1I1I!1Lj[\n^dfk " 	
 "F%"
( 	 0@%!++&;;0
, 	UK05  r'   Nr  r]   r'   r%   r  r  6  s    JKS K K 	
T-%8 	
 	
 
Z( 
 
 
d;&7 
 
  .!#5 .!m .!P] .!r'   r  r   )Nrf   )   )@r    typingr   r   r   numpyr   r   regexr*   rI   transformersr   r   r   r	   configuration_utilsr   guidersr   rW   r   modelsr   utilsr   r   r   r   r   modular_pipeliner   r   modular_pipeline_utilsr   r   r   r   r   torchvisionr   
get_loggerr   loggerr&   r,   r.   r@   r   r3   rU   r`   rs   	Generatorrl   r>   r   r   r   r   r   r   r   r  r]   r'   r%   <module>r2     s    ( (  
   ] ] - - 1 & I I - C K K 0 & 
		H	%
" #tCy.! 	
 LLH &*	**'* #* U\\"	* ck
TLL
T-5eoo-F
T\_
T& %,,%	% % LL	%
 ;;% %PL!. L!^!!. !!H(!2 (!V)!/ )!X+!(= +!\\!2 \!~e!+@ e!r'   