
    i                        d dl Z d dlmZmZmZmZ d dlZd dlmZ d dl	mc m
Z ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ d
dlmZmZmZ  ee      Zg dZg dZ ej>                  ddg       ej>                  ddg      dZ  G d dejB                        Z" G d dej                  jF                        Z$ G d dejF                        Z% G d dejF                        Z& G d dejF                        Z' G d dejF                        Z( G d dejF                        Z) G d d ejF                        Z* G d! d"ejF                        Z+ G d# d$      Z, G d% d&      Z- G d' d(ejF                        Z. G d) d*ejF                        Z/ G d+ d,ejF                        Z0 G d- d.ejF                        Z1 G d/ d0ejF                        Z2 G d1 d2eee      Z3y)3    N)ListOptionalTupleUnion   )ConfigMixinregister_to_config)
get_logger)apply_forward_hook   )AutoencoderKLOutput)
ModelMixin   )AutoencoderMixinDecoderOutputIdentityDistribution(   g4t?g}Gug#әr?gչ?gF%.2Ǔ?g ?gu	$'y?g
ˆj?g&.*?gij'-?g	ف4?g癖?g1X?gj!?g	3,?g.$-Ĕ?g5	{?g;,T?g 
~?g=-7?gTjݣ$?g,l?gM53ݨ?gJF?g!!%?gnK?ggI?gg=c?g9m?g԰?gJHZ?go @?g7?goS?gJK?ge:Ɯ?gѴ@?gfTF?g #?g~[?gJn^慅?gCo?g<
~?gڪ?g?gϾ,̠?guס?g'#J9?g2`gJ?gJvSgAj?gsQ?gO?gQgQL?g
$
?g^_>?g=Y{7]?g%5?gdH_RƧ?g+>?gU?gC?g"~͋?gCfx	?g2gkgg[Yͅg窳9gޑ녆g8&g-!g
Tgc#듿gZgprg2ϙg9Qygeѡۘgq!mFzg{nHgY[N?g)WgS
h.`g=J$g[<箿gTgQgÙgzR gDԷ骿gW5_g˺V﫿gW'g(xg<gg$Tg}bs?gϹJIf?gʱicogaHl?gI.A?gi6xx2?g\?g؂9գ?gy@vjp?gBVP?gDEk?gƵ?gbEO3?g'".H?g$Pq[?gzة?g%dsY?g'#|?g]XP?gO(r?g~l?g9ݪҳ?gm,´?g%@?gMIcQd?ga\?gm`?g`?g	^K%?gcCbӳ?gWƳ?gw=$?g^A0t?g<Mg`}T?gஅgOgFۆg,Yg[ݐg4UAVgѢF:g	g$竁gXWgXbjgcbgc9gg`wg8@?g=C?g<q?g6*	?g!6X?gMܹQ?gt7?g@?g?ge؋?gZSv%?g˸-?gEi?gkn[?giUH?g ?g]ӶHg̸gE}q-ggzdgf{xgCg>gRLge,gnXzgޱg33'g{_g66gklFfg~I+g)݆t8g[mNg-RQg`g;P]g~U7,pg+2gkog(Jg
v'gR~Fg\T(rgKH`gV3(gX]g>(ſg_rH?gVW̤?gIG?gw?g2O?gK?gTA
}?g#l4?gaVL?g|__Ş?g+!b?g1?gc=2)?g8pN?g?g[H|gd*"TggAGg9[av&ggR|s.g
1.g㪿g-J?DgOYggno g'@Pg҂@gJZ᧧gC¨g##?g~"G!f?gGƝgᄕLFg:&GgA?#g0æg6,>"ئg)U g.3Ͼgՙg]p멿gt5l)Zg	bfէg3樿gAKi ʨg ?g,wn)tg;VEg;!	5g3CgIɛ{gƹguMؤgm2g{gҚ̀g,DEgUslڵggeY]g/`䣿(   g0$?g6D~?gxG ?gs"I?g+!~Q?g4uhU?g&|ⵁ?g$m?gB=j?g)n?y?gZT8@8x?gE>3?gl1?gz[#ac?g&߮?g{W?gWGG?g(p?g8!?gMN?gd?g |?g%r?gl??gEeT?g`?gǢ?gm??gAvA?g]@?J?gu!?gn|)?gw3/?g y?gv?gg~N?gHA]?g@=?gڶ%?g7?g/w?gyޕ?g%Uf?g ?g4x"&2?gp_8?g%ϳafB?g^.^M?gnˀf?g6??gr_!s?gЎ?gda?g9B66?g]o!U5?g}a!g?gWRա&|?gZua?g#?gȟ?gd\=?gYfQ?gNl ?g9B6?g@ _?g1	8 ?g776?gqI?g4?gL?g>`#?g=1_?g?g"?g9w`G?g<?g]f@b@?g^cf?g j?g[U}g?g<^?gݎ}?g)>i?gi^a:?g	j[AV?guU^p?g9{>?g퀈`^?gR4A?g;!	?g??g|)=?g&!a{?gT
O"?g	?gՇ?gk82?g6fR?gx*o?goW՜?g,g~5?g7b?g ?gn)?gC~
6?gHUC?ggB}D?g@!R?g5T?g)0W?gx?gs0 R?gHDz?gmc?g1!$?g&G*!`?g/Qv?g?gh0x^K?gʑA?g%6a?g_T?g?gEr"?gr",?g%Z?g2Rs@?g2?ge]//?g_?g,n?gٍ_?ge)w?ga:?g/@0?g8a?g{ ?gG?gܿ}.?g +?gfޢ?gX?gsG?g>?gbL?gV?gox'?ggS~?gڊ?g@#)?gao?g/?g(bS?g̲'?g6>?g8^)/?gjxb?gπ8W?gO4r?g`?g[?g%L/?g?g贗?g?gܽy?g?g~?gnm]?g
?gc: ?g(?g򸠥(?g%?g:T^"?gy3E?g?gD?gLՇ??g:B?g.S?g?g?gj5ȑ?g[F̌?gz,!?gǸ?gJ"?g+~?g}R7a?gO_?gO;>R?gP]?gߑ?gdֽ?g=I?g#ܞb?g3t8?g^b?g,?g̥)^?gԲ!|?g)j/n?gB_?gף.!8?gʇↆ?gR?g3!!b?g_ ?gLsD!vd?gd|?gmA?g>[?gA~8?gNE	~Da?gٰo?gK?g[B?gA`a?gc>A?gBӯ ?gM?g?gF]0$?g*ڠ;?g8m?g$k?g=mB?g]]>'w?gg>N?g1?gb?gIK"?gi?gW
?@?gY8Q#?gn?l-?gzX^2?gpp?gQ*v?g̀u?gc-?gA.?gZ2?g_JUEZ?g]$rw?gi}m?gbͽn?g_?gg7
?g>(޷?g F?g ?g(_?g8+&?gSa?g?g;f?      ?)haar	rearrangec                        e Zd Z	 	 	 	 	 	 	 ddededeeeeeef   f   deeeeeef   f   deeeeeef   f   dededd	f fd
Zdej                  dej                  f fdZ
 xZS )CosmosCausalConv3din_channelsout_channelskernel_sizedilationstridepaddingpad_modereturnNc                 D   t        |t              r|||fn|}t        |t              r|||fn|}t        |t              r|||fn|}|\  }}	}
|	dz  dk(  r|
dz  dk(  sJ t        |   |||||       || _        |d   |d   dz
  z  d|d   z
  z   | _        ||||f| _        y )Nr   r   )r   r   r   )
isinstanceintsuper__init__r   temporal_padspatial_pad)selfr   r   r   r   r   r   r   _height_kernel_sizewidth_kernel_size	__class__s              }/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/diffusers/models/autoencoders/autoencoder_kl_cosmos.pyr$   zCosmosCausalConv3d.__init__.   s     BLKY\A]{K=cn5?#5NHh1T\-7-D&&&)&3>00!A%*/@1/D/III 	 	
 !$QK;q>A+=>!fQi-P#Wgw?    hidden_statesc                    |d d d d d ddf   j                  dd| j                  dd      }t        j                  ||gd      }t	        j
                  |g | j                  dd| j                  d      }t        | %  |      S )Nr   .r   dimr           modevalue)
repeatr%   torchcatFpadr&   r   r#   forward)r'   r.   hidden_states_prevr+   s      r,   r;   zCosmosCausalConv3d.forwardK   s    *1a!S=9@@AtGXGXZ[]^_		#5}"E1Mm-Ft/?/?-F-FA-FT]]befw}--r-   )r   r   )r   r   r   r   r   r   r=   r   constant)__name__
__module____qualname__r"   r   r   strr$   r7   Tensorr;   __classcell__r+   s   @r,   r   r   -   s     8A5>3<"@@ @ 3c3m 445	@
 U3S=112@ c5c3//0@ @ @ 
@:.U\\ .ell . .r-   r   c                   b     e Zd Zddedef fdZdej                  dej                  fdZ xZS )CosmosCausalGroupNormr   
num_groupsc                 l    t         |           t        j                  ||dd      | _        || _        y )Ngư>T)rH   num_channelsepsaffine)r#   r$   nn	GroupNormnormrH   )r'   r   rH   r+   s      r,   r$   zCosmosCausalGroupNorm.__init__S   s3    LL!$	
	 %r-   r.   r   c                 &   | j                   dk(  rp|j                  d      }|j                  ddddd      j                  dd      }| j	                  |      }|j                  d|df      j                  ddddd      }|S | j	                  |      }|S )Nr   r   r   r      )rH   sizepermuteflattenrO   	unflatten)r'   r.   
batch_sizes      r,   r;   zCosmosCausalGroupNorm.forward]   s    ??a&++A.J)11!Q1a@HHANM IIm4M)33A
B7GHPP1aAM
  !IIm4Mr-   r   	r?   r@   rA   r"   r$   r7   rC   r;   rD   rE   s   @r,   rG   rG   R   s1    %C %S %
U\\ 
ell 
r-   rG   c                       e Zd Zddededdf fdZddej                  dedej                  fdZdej                  dej                  fd	Z	dej                  dej                  fd
Z
dej                  dej                  fdZ xZS )CosmosPatchEmbed3d
patch_sizepatch_methodr   Nc                    t         |           || _        || _        t        j                  |      j                         }t        j                  |j                  d         }| j                  d|d       | j                  d|d       y Nr   waveletsF)
persistent_aranger#   r$   r\   r]   	_WAVELETSgetcloner7   arangeshaperegister_bufferr'   r\   r]   r`   rg   r+   s        r,   r$   zCosmosPatchEmbed3d.__init__k   u    $(==.446hnnQ/0ZeDY5Ar-   r.   r4   c           
      j   |j                   }| j                  }|j                  d   }|j                  d   }|j                  d      j	                  ddd      j                  |dd      }|d| j                  z  z  j	                  ddd      j                  |dd      }	|	j                  |      }	|j                  |      }t        j                  |t        d|dz
        |dz
  |dz
  |dz
  |dz
  |dz
  f|      j                  |      }t        j                  ||j                  d      j                  d      |d	
      }
t        j                  ||	j                  d      j                  d      |d	
      }t        j                  |
|j                  d      j                  d      |d
      }t        j                  |
|	j                  d      j                  d      |d
      }t        j                  ||j                  d      j                  d      |d
      }t        j                  ||	j                  d      j                  d      |d
      }t        j                  ||j                  d      j                  d      |d
      }t        j                  ||	j                  d      j                  d      |d
      }t        j                  ||j                  d      j                  d      |d
      }t        j                  ||	j                  d      j                  d      |d
      }t        j                  ||j                  d      j                  d      |d
      }t        j                  ||	j                  d      j                  d      |d
      }t        j                  ||j                  d      j                  d      |d
      }t        j                  ||	j                  d      j                  d      |d
      }t        j                  ||||||||gd      }|r|dz  }|S )Nr   r   rR   dtyper   )r:   r4   r   rQ   r   r   r   groupsr   r   r   r   r   r   r   r0   ;f@)rn   r`   rh   flipreshaper6   rb   tor9   r:   maxconv3d	unsqueezer7   r8   )r'   r.   r4   rescalern   r`   nghlhhxlxhxllxlhxhlxhhxlllxllhxlhlxlhhxhllxhlhxhhlxhhhs                           r,   _dwtzCosmosPatchEmbed3d._dwtw   sd   ##==NN1"]]1%%aB/66q!Q?2$,,./88ArBII!QPQRUUUUUU m#aQ-QAqSTuVWZ[V[]^ab]b1cjnorr
 XXmR\\!_%>%>q%A!T]^XXmR\\!_%>%>q%A!T]^ hhr2<<?44Q7)Thhr2<<?44Q7)Thhr2<<?44Q7)Thhr2<<?44Q7)TxxR\\!_66q9!IVxxR\\!_66q9!IVxxR\\!_66q9!IVxxR\\!_66q9!IVxxR\\!_66q9!IVxxR\\!_66q9!IVxxR\\!_66q9!IVxxR\\!_66q9!IV		4tT4tT"RXYZ)F2Mr-   c                 Z   t        j                  |d|j                  d   dz
  gd      \  }}t        j                  |j	                  | j
                  d      |gd      }t        t        t        j                  | j
                                    D ]  }| j                  |d      } |S )Nr   r   r0   Tr{   )r7   splitrh   r8   repeat_interleaver\   ranger"   mathlog2r   )r'   r.   xixvr(   s        r,   _haarzCosmosPatchEmbed3d._haar   s    ]Q0C0CA0F0J,KQRSB		2#7#7Q#7#OQS"TZ[\s499T__567 	CA IImTIBM	Cr-   c           
         t        j                  |d|j                  d   dz
  gd      \  }}t        j                  |j	                  | j
                  d      |gd      }|j                  \  }}}}}| j
                  }	|j                  ||||	z  |	||	z  |	||	z  |	      }|j                  dddddddd	      j                  dd      j                         }|S )
Nr   r   r0   r   r         rQ      )
r7   r   rh   r8   r   r\   rv   rT   rU   
contiguous)
r'   r.   r   r   rW   rJ   
num_framesheightwidthps
             r,   _arrangezCosmosPatchEmbed3d._arrange   s    ]Q0C0CA0F0J,KQRSB		2#7#7Q#7#OQS"TZ[\>K>Q>Q;
L*feOO%--jAoq&A+q%ST*VW
 &--aAq!Q1EMMaQRS^^`r-   c                     | j                   dk(  r| j                  |      S | j                   dk(  r| j                  |      S t        d| j                          )Nr   r   zUnsupported patch method: )r]   r   r   
ValueErrorr'   r.   s     r,   r;   zCosmosPatchEmbed3d.forward   sV    &::m,,+-==//9$:K:K9LMNNr-   r   r   )reflectF)r?   r@   rA   r"   rB   r$   r7   rC   r   r   r   r;   rD   rE   s   @r,   r[   r[   j   s    
B3 
B# 
B4 
B$%,, $c $Y^YeYe $L5<< ELL ell u|| OU\\ Oell Or-   r[   c                       e Zd Zddedef fdZddej                  dedej                  fdZ	dej                  dej                  fdZ
dej                  dej                  fd	Zdej                  dej                  fd
Z xZS )CosmosUnpatcher3dr\   r]   c                    t         |           || _        || _        t        j                  |      j                         }t        j                  |j                  d         }| j                  d|d       | j                  d|d       y r_   rc   rj   s        r,   r$   zCosmosUnpatcher3d.__init__   rk   r-   r.   r{   r   c                 F   |j                   }|j                  }| j                  j                  |      }|j                  d   dz  }|j                  dg      j                  ddd      j                  |ddg      }|d| j                  j                  |      z  z  j                  ddd      j                  |dd      }|j                  |      }|j                  |      }t        j                  |dd      \  }	}
}}}}}}t        j                  |	|j                  d      j                  d      |d	
      }t        j                  |
|j                  d      j                  d      |d	
      |z   }t        j                  ||j                  d      j                  d      |d	
      }t        j                  ||j                  d      j                  d      |d	
      |z   }t        j                  ||j                  d      j                  d      |d	
      }t        j                  ||j                  d      j                  d      |d	
      |z   }t        j                  ||j                  d      j                  d      |d	
      }t        j                  ||j                  d      j                  d      |d	
      |z   }t        j                  ||j                  d      j                  d      |d
      }t        j                  ||j                  d      j                  d      |d
      |z   }t        j                  ||j                  d      j                  d      |d
      }t        j                  ||j                  d      j                  d      |d
      |z   }t        j                  ||j                  d      j                  d      |d
      }t        j                  ||j                  d      j                  d      |d
      |z   }|r|dz  }|S )Nr      r   rR   rm   r0   r   r   rs   rp   rQ   rr   ro   rt   )devicern   r`   rw   rh   ru   rv   r6   rb   r7   chunkr9   conv_transpose3drz   )r'   r.   r{   r   rn   hr}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s                          r,   _idwtzCosmosUnpatcher3d._idwt   s   %%##MMV$"a'VVQC[  Ar*111a)<B4<<??6223<<Q2FMMaQRTUVUUUUUU9>]TU[\9]6dD$dD$   r||A'@'@'CAV_`  r||A'@'@'CAV_`cff  r||A'@'@'CAV_`  r||A'@'@'CAV_`cff  r||A'@'@'CAV_`  r||A'@'@'CAV_`cff  r||A'@'@'CAV_`  r||A'@'@'CAV_`cff R\\!_%>%>q%A!T]^R\\!_%>%>q%A!T]^accR\\!_%>%>q%A!T]^R\\!_%>%>q%A!T]^acc **2r||A/H/H/KTU^ghr2<<?#<#<Q#?R[\_ll 	 )F2Mr-   c                     t        t        t        j                  | j                                    D ]  }| j                  |d      } |d d d d | j                  dz
  d df   }|S )NTr   r   .)r   r"   r   r   r\   r   )r'   r.   r(   s      r,   _ihaarzCosmosUnpatcher3d._ihaar   sa    s499T__567 	DA JJ}dJCM	D%aDOOa,?,A3&FGr-   c           
         | j                   }|j                  dd|||f      }|j                  dddddddd	      }|j                  dd      j                  d	d      j                  dd      }|d d d d |dz
  d d
f   }|S )Nr   rR   r   r   r   r   r   r   rQ   .)r\   rV   rT   rU   )r'   r.   r   s      r,   _irearrangezCosmosUnpatcher3d._irearrange   s    OO%//B1a=A%--aAq!Q1E%--a3;;AqAII!QO%aAEGS&89r-   c                     | j                   dk(  r| j                  |      S | j                   dk(  r| j                  |      S t        d| j                   z         )Nr   r   zUnknown patch method: )r]   r   r   r   r   s     r,   r;   zCosmosUnpatcher3d.forward   sU    &;;}--+-##M2258I8IIJJr-   r   )F)r?   r@   rA   r"   rB   r$   r7   rC   boolr   r   r   r;   rD   rE   s   @r,   r   r      s    
B3 
B# 
B)5<< )$ )5<< )VELL U\\  %,, KU\\ Kell Kr-   r   c                   d     e Zd Zdededdf fdZdej                  dej                  fdZ xZS )CosmosConvProjection3dr   r   r   Nc                 v    t         |           t        ||ddd      | _        t        ||ddd      | _        y )Nr   r   r   r   r   r   r   r   r   r   r   )r#   r$   r   conv_sconv_t)r'   r   r   r+   s      r,   r$   zCosmosConvProjection3d.__init__
  s<    (lPYbcmno(|QZcdnopr-   r.   c                 J    | j                  |      }| j                  |      }|S N)r   r   r   s     r,   r;   zCosmosConvProjection3d.forward  s$    M2M2r-   rY   rE   s   @r,   r   r   	  s<    qC qs qt qU\\ ell r-   r   c                   r     e Zd Z	 	 d
dededededdf
 fdZdej                  dej                  fd	Z xZ	S )CosmosResnetBlock3dr   r   dropoutrH   r   Nc                 R   t         |           |xs |}t        ||      | _        t	        ||      | _        t        ||      | _        t        j                  |      | _	        t	        ||      | _
        ||k7  rt        ||ddd      | _        y t        j                         | _        y )Nr   r   r   )r#   r$   rG   norm1r   conv1norm2rM   Dropoutr   conv2r   conv_shortcutIdentity)r'   r   r   r   rH   r+   s        r,   r$   zCosmosResnetBlock3d.__init__  s     	#2{*;
C
+KF
*<D
zz'*+L,G
,&!3K[\efpq!rD!#Dr-   r.   c                 0   |}| j                  |      }| j                  |      }t        j                  |      }| j	                  |      }| j                  |      }t        j                  |      }| j                  |      }| j                  |      }||z   S r   )r   r   r9   silur   r   r   r   )r'   r.   residuals      r,   r;   zCosmosResnetBlock3d.forward-  s     %%h/

=1}-

=1

=1}-]3

=1x''r-   )r2   r   
r?   r@   rA   r"   floatr$   r7   rC   r;   rD   rE   s   @r,   r   r     s[    
 // / 	/
 / 
/,(U\\ (ell (r-   r   c            	       n     e Zd Z	 	 d	dedededdf fdZdej                  dej                  fdZ xZ	S )
CosmosDownsample3dr   spatial_downsampletemporal_downsampler   Nc                 d   t         |           || _        || _        t	        j
                         | _        t	        j
                         | _        t	        j
                         | _        |rt        ||ddd      | _        |rt        ||ddd      | _        |s|rt        ||ddd      | _        y y )Nr   r   r   r   r   r   r   ro   r=   )
r#   r$   r   r   rM   r   r   r   conv3r   )r'   r   r   r   r+   s       r,   r$   zCosmosDownsample3d.__init__>  s     	"4#6 [[]
[[]
[[]
+[i	[\DJ +[i	[\DJ !4+[i	[\DJ "5r-   r.   c                    | j                   s| j                  s|S | j                   rId}t        j                  ||dd      }| j	                  |      }t        j
                  |dd      }||z   }| j                  rTt        j                  |d d d d d ddf   |gd	
      }| j                  |      }t        j
                  |dd      }||z   }| j                  |      }|S )N)r   r   r   r   r   r   r>   r   r3   r   )r   r   r   .r   r0   ro   )
r   r   r9   r:   r   
avg_pool3dr7   r8   r   r   )r'   r.   r:   conv_outpool_outs        r,   r;   zCosmosDownsample3d.forwardZ  s    &&t/G/G  ""$CEE-:QOMzz-0H||MyQZ[H$x/M##!II}Q2A2s]'C]&SYZ[Mzz-0H||MyQZ[H$x/M

=1r-   TT
r?   r@   rA   r"   r   r$   r7   rC   r;   rD   rE   s   @r,   r   r   =  sQ     $($(	 ! "	
 
8U\\ ell r-   r   c            	       n     e Zd Z	 	 d	dedededdf fdZdej                  dej                  fdZ xZ	S )
CosmosUpsample3dr   spatial_upsampletemporal_upsampler   Nc                 d   t         |           || _        || _        t	        j
                         | _        t	        j
                         | _        t	        j
                         | _        |rt        ||ddd      | _        |rt        ||ddd      | _        |s|rt        ||ddd      | _        y y )Nr   r=   r   r   r   r   )
r#   r$   r   r   rM   r   r   r   r   r   )r'   r   r   r   r+   s       r,   r$   zCosmosUpsample3d.__init__p  s     	 0!2[[]
[[]
[[]
+[i	[\DJ +[i	[\DJ 0+[i	[\DJ  1r-   r.   c                    | j                   s| j                  s|S | j                  rg|j                  d      }t        dd|dkD  z  z         }|j	                  t        |      d      }|d|dz
  d d d d d f   }| j                  |      |z   }| j                   r8|j	                  dd      j	                  dd      }| j                  |      |z   }| j                  |      }|S )Nr   r   r   r0   .r   rQ   )r   r   rS   r"   r   r   r   r   )r'   r.   r   time_factors       r,   r;   zCosmosUpsample3d.forward  s    $$T-C-C  !!&++A.JcC:>$::;K);;C<LRS;TM)#{Q/@!Q*FGM JJ}5EM  );;A1;EWWXY_`WaM JJ}5EM

=1r-   r   r   rE   s   @r,   r   r   o  sQ     "&"&	   	
 
8U\\ ell r-   r   c                        e Zd Z	 	 	 ddededededed   ddf fd	Zdd
ej                  de	ej                     dej                  fdZ
 xZS )CosmosCausalAttentionNnum_attention_headsattention_head_dimrH   r   	processor)"CosmosSpatialAttentionProcessor2_0#CosmosTemporalAttentionProcessor2_0r   c           	         t         |           || _        t        ||      | _        t        ||ddd      | _        t        ||ddd      | _        t        ||ddd      | _        t        j                  g       | _        | j                  j                  t        ||ddd             | j                  j                  t        j                  |             || _        | j                  t        d      y )NrH   r   r   r   z+CosmosCausalAttention requires a processor.)r#   r$   r   rG   rO   r   to_qto_kto_vrM   
ModuleListto_outappendr   r   r   )r'   r   r   rH   r   r   r+   s         r,   r$   zCosmosCausalAttention.__init__  s     	#6 )*<T	&'9;M[\efpqr	&'9;M[\efpqr	&'9;M[\efpqr	mmB'13EST]^hij	
 	2::g./">>!JKK "r-   r.   attention_maskc                 *    | j                  | ||      S )N)r.   r   )r   )r'   r.   r   s      r,   r;   zCosmosCausalAttention.forward  s    ~~d-P^~__r-   )r   r2   Nr   )r?   r@   rA   r"   r   r   r$   r7   rC   r   r;   rD   rE   s   @r,   r   r     s    
 hlL L  L 	L
 L deL 
L2`U\\ `8ELLCY `ejeqeq `r-   r   c            	       p    e Zd Zd Z	 ddedej                  deej                     dej                  fdZy)	r   c                 :    t        t        d      st        d      y Nscaled_dot_product_attentionzeCosmosSpatialAttentionProcessor2_0 requires PyTorch 2.0 or higher. To use it, please upgrade PyTorch.hasattrr9   ImportErrorr'   s    r,   r$   z+CosmosSpatialAttentionProcessor2_0.__init__  "    q89w  :r-   Nattnr.   r   r   c                 l   |j                   \  }}}}}|}	|j                  |      }|j                  |      }
|j                  |      }|j	                  |      }|
j                  ddddd      j                  dd      j                  dd      }
|j                  ddddd      j                  dd      j                  dd      }|j                  ddddd      j                  dd      j                  dd      }|
j                  d|j                  df      j                  dd      }
|j                  d|j                  df      j                  dd      }|j                  d|j                  df      j                  dd      }t        j                  |
|||      }|j                  dd      j                  dd      j                  |
      }|j                  d||f      j                  d||f      }|j                  ddddd      } |j                  d   |      } |j                  d   |      }||	z   S )Nr   r   r   rQ   r   rR   	attn_maskrh   rO   r   r   r   rT   rU   rV   r   	transposer9   r   type_asr   r'   r   r.   r   rW   rJ   r   r   r   r   querykeyr5   s                r,   __call__z+CosmosSpatialAttentionProcessor2_0.__call__  s    ?L>Q>Q;
L*fe 		-0		-(ii&		-( aAq!,44Q:BB1aHkk!Q1a(00A6>>q!DaAq!,44Q:BB1aH D$<$<b#ABLLQPQRmmA 8 8"=>HHAND$<$<b#ABLLQPQR66uc5Tbc%//15==aCKKER%//FE?CMMaR\^hQij%--aAq!<&A}5&A}5x''r-   r   	r?   r@   rA   r$   r   r7   rC   r   r
   r-   r,   r   r     G     rv()(:?,,(X`afamamXn(	(r-   r   c            	       p    e Zd Zd Z	 ddedej                  deej                     dej                  fdZy)	r   c                 :    t        t        d      st        d      y r   r   r   s    r,   r$   z,CosmosTemporalAttentionProcessor2_0.__init__  r   r-   Nr   r.   r   r   c                    |j                   \  }}}}}|}	|j                  |      }|j                  |      }
|j                  |      }|j	                  |      }|
j                  ddddd      j                  dd      }
|j                  ddddd      j                  dd      }|j                  ddddd      j                  dd      }|
j                  d|j                  df      j                  dd      }
|j                  d|j                  df      j                  dd      }|j                  d|j                  df      j                  dd      }t        j                  |
|||      }|j                  dd      j                  dd      j                  |
      }|j                  d|||f      }|j                  ddddd      } |j                  d   |      } |j                  d   |      }||	z   S )Nr   r   rQ   r   r   rR   r  r  r  s                r,   r
  z,CosmosTemporalAttentionProcessor2_0.__call__  s    ?L>Q>Q;
L*fe 		-0		-(ii&		-( aAq!,44Q:kk!Q1a(00A6aAq!,44Q: D$<$<b#ABLLQPQRmmA 8 8"=>HHAND$<$<b#ABLLQPQR66uc5Tbc%//15==aCKKER%//J3NO%--aAq!<&A}5&A}5x''r-   r   r  r  r-   r,   r   r     r  r-   r   c                   |     e Zd Zdedededededededed	d
f fdZdej                  d	ej                  fdZ	 xZ
S )CosmosDownBlock3dr   r   
num_layersr   use_attentionuse_downsampler   r   r   Nc	                    t         |           g g g }}
}	||}}t        |      D ]  }|	j                  t	        |||d             |}|rO|
j                  t        d|d|t                            |j                  t        d|d|t                            t|
j                  d        |j                  d         t        j                  |	      | _
        t        j                  |
      | _        t        j                  |      | _        d | _        |rAt        j                  g       | _        | j                  j                  t        |||             y y Nr   r   r   r   rH   r   r   )r#   r$   r   r   r   r   r   r   rM   r   resnets
attentionstemp_attentionsdownsamplersr   )r'   r   r   r  r   r  r  r   r   r  r  r  
in_channelout_channelr(   r+   s                  r,   r$   zCosmosDownBlock3d.__init__  s?    	/12r_"-|K
z" 	-ANN.z;\]^_$J!!),-+6#$ '"D"F  &&),-+6#$ '"E"G !!$'&&t,3	-6 }}W---
3!}}_=  "b 1D$$%7EWYl%mn r-   r.   c                 |   t        | j                  | j                  | j                        D ]h  \  }}} ||      }| ||      }||j	                  d      }t        j                  |j                  ||            j                         } |||      }j | j                  | j                  D ]
  } ||      } |S Nr   )
zipr  r  r  rS   r7   trilnew_onesr   r  )r'   r.   resnet	attentiontemp_attentionr   r   downsamplers           r,   r;   zCosmosDownBlock3d.forward@  s    14T\\4??TXThTh1i 	N-FI~"=1M$ )- 8)*//2
!&M,B,B:z,Z![!`!`!b .}n M	N (#00 ; +M :; r-   r?   r@   rA   r"   r   r   r$   r7   rC   r;   rD   rE   s   @r,   r  r    s    2o2o 2o 	2o
 2o 2o 2o !2o "2o 
2ohU\\ ell r-   r  c                   n     e Zd Zd
dededededdf
 fdZdej                  dej                  fd	Z xZ	S )CosmosMidBlock3dr   r  r   rH   r   Nc                    t         	|           g g g }}}|j                  t        ||||             t	        |      D ]m  }|j                  t        d|||t                            |j                  t        d|||t                            |j                  t        ||||             o t        j                  |      | _
        t        j                  |      | _        t        j                  |      | _        y )Nr   r  )r#   r$   r   r   r   r   r   r   rM   r   r  r  r  )
r'   r   r  r   rH   r  r  r  r(   r+   s
            r,   r$   zCosmosMidBlock3d.__init__R  s    /12r_*;WjYZz" 	_A%()'2)#@B ""%()'2)#AC NN.{KR\]^'	_* }}W---
3!}}_=r-   r.   c                 X    | j                   d   |      }t        | j                  | j                  | j                   dd        D ]c  \  }}}|j	                  d      }t        j                  |j                  ||            j                         } ||      } |||      } ||      }e |S )Nr   r   r   )	r  r!  r  r  rS   r7   r"  r#  r   )r'   r.   r%  r&  r$  r   r   s          r,   r;   zCosmosMidBlock3d.forwardq  s    'Q614T__dFZFZ\`\h\hijik\l1m 	2-I~v&++A.J"ZZ(>(>z:(VW\\^N%m4M*=.IM"=1M	2 r-   rX   r   rE   s   @r,   r*  r*  Q  sH    >C >S >5 >VY >bf >>U\\ ell r-   r*  c                   |     e Zd Zdedededededededed	d
f fdZdej                  d	ej                  fdZ	 xZ
S )CosmosUpBlock3dr   r   r  r   r  use_upsampler   r   r   Nc	                    t         |           g g g }}
}	||}}t        |      D ]  }|	j                  t	        |||d             |}|rO|
j                  t        d|d|t                            |j                  t        d|d|t                            t|
j                  d        |j                  d         t        j                  |	      | _
        t        j                  |
      | _        t        j                  |      | _        d | _        |rAt        j                  g       | _        | j                  j                  t        |||             y y r  )r#   r$   r   r   r   r   r   r   rM   r   r  r  r  
upsamplersr   )r'   r   r   r  r   r  r/  r   r   r  r%  r  r  r  r(   r+   s                  r,   r$   zCosmosUpBlock3d.__init__  s;    	.0"bO"-|K
z" 	-ANN.z;\]^_$J  ),-+6#$ '"D"F  &&),-+6#$ '"E"G   &&&t,3	-6 }}W---	2!}}_= mmB/DOOO""#3KAQSd#ef r-   r.   c                 |   t        | j                  | j                  | j                        D ]h  \  }}} ||      }| ||      }||j	                  d      }t        j                  |j                  ||            j                         } |||      }j | j                  | j                  D ]
  } ||      } |S r   )
r!  r  r  r  rS   r7   r"  r#  r   r1  )r'   r.   r$  r%  r&  r   r   	upsamplers           r,   r;   zCosmosUpBlock3d.forward  s    14T\\4??TXThTh1i 	N-FI~"=1M$ )- 8)*//2
!&M,B,B:z,Z![!`!`!b .}n M	N ??&!__ 9	 )- 89 r-   r(  rE   s   @r,   r.  r.    s    2g2g 2g 	2g
 2g 2g 2g 2g  2g 
2ghU\\ ell r-   r.  c                        e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddededeedf   dedeedf   deded	ed
edededdf fdZdej                  dej                  fdZ
 xZS )CosmosEncoder3dr   r   block_out_channels.num_resnet_blocksattention_resolutions
resolutionr\   
patch_typer   spatial_compression_ratiotemporal_compression_ratior   Nc                 .   t         |           ||dz  z  }t        t        j                  |
            t        t        j                  |            z
  }t        t        j                  |            t        t        j                  |            z
  }t        ||      | _        t        ||d         | _        ||z  }g }t        t        |      dz
        D ]]  }||   }||dz      }||v }dx}}|t        |      dz
  k  rd}||k  }||k  }|dz  }nd}|j                  t        ||||	||||             _ t        j                  |      | _        t!        |d   d|	d      | _        t%        |d   d	      | _        t        |d   |      | _        d| _        y )
Nr   r   r   Fr   TrR   r  r   rH   r   )r#   r$   r"   r   r   r[   patch_embedr   conv_inr   lenr   r  rM   r   down_blocksr*  	mid_blockrG   norm_outr   gradient_checkpointing)r'   r   r   r6  r7  r8  r9  r\   r:  r   r;  r<  	inner_dimnum_spatial_layersnum_temporal_layerscurrent_resolutionrB  ir  r  r  r   r   r  r+   s                           r,   r$   zCosmosEncoder3d.__init__  s    	*a-/	 +D!EFTYYWaMbIcc!$)),F"GH3tyyYcOdKee .j*E-i9KA9NO (:5s-.23 	A+A.J,QU3K.2GGM7<<!43)*Q..!%%&);%;"&'*=&=#%71%<"!&!%!"&'		4 ==5 **<R*@QX_lmn ..@.DQRS./A"/E|T&+#r-   r.   c                    | j                  |      }| j                  |      }t        j                         rL| j                  r@| j
                  D ]  }| j                  ||      } | j                  | j                  |      }n*| j
                  D ]
  } ||      } | j                  |      }| j                  |      }t        j                  |      }| j                  |      }|S r   )r?  r@  r7   is_grad_enabledrE  rB  _gradient_checkpointing_funcrC  rD  r9   r   r   r'   r.   blocks      r,   r;   zCosmosEncoder3d.forward  s    ((7]3  "t'B'B)) X $ A A% WX ==dnnm\M)) 5 %m 45 NN=9Mm4}-m4r-   )r               rT  r          rQ   r   r2   r   r   r?   r@   rA   r"   r   rB   r   r$   r7   rC   r;   rD   rE   s   @r,   r5  r5    s     .B!"16 )**+>,>, >, "#s(O	>,
 >,  %S#X>, >, >, >, >, $'>, %(>, 
>,@U\\ ell r-   r5  c                        e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddededeedf   dedeedf   deded	ed
edededdf fdZdej                  dej                  fdZ
 xZS )CosmosDecoder3dr   r   r6  .r7  r8  r9  r\   r:  r   r;  r<  r   Nc                    t         |           ||dz  z  }t        t        j                  |
            t        t        j                  |            z
  }t        t        j                  |            t        t        j                  |            z
  }t        t        |            }t        ||d         | _        t        |d   d|	d      | _
        ||z  dt        |      dz
  z  z  }g }t        t        |      dz
        D ]y  }||   }||dz      }||v }dx}}|t        |      dz
  k  r+d}d|cxk  xr |dz   k  nc }|xs ||k  xr ||kD  }|dz  }nd}|j                  t        |||dz   |	||||             { t        j                   |      | _        t%        |d   d	      | _        t        |d   |      | _        t+        ||      | _        d| _        y )
Nr   r   r   r>  r   FTrR   r   )r#   r$   r"   r   r   listreversedr   r@  r*  rC  rA  r   r   r.  rM   r   	up_blocksrG   rD  r   r   unpatch_embedrE  )r'   r   r   r6  r7  r8  r9  r\   r:  r   r;  r<  rF  rG  rH  reversed_block_out_channelsrI  r^  rJ  r  r  r  r   r   r/  r+   s                            r,   r$   zCosmosDecoder3d.__init__  s    	 :q=0	 +D!EFTYYWaMbIcc!$)),F"GH3tyyYcOdKee&*84F+G&H# .k;VWX;YZ **Ea*HUV`gtuv )J61EWAX[\A\;]]	s-.23 	A4Q7J5a!e<K.2GGM38803)*Q..#$%$C,?!,C$C!#4 $**W/ADW/W ! &8!%;"$%)! $%	!	8 y1 ..I".MZ[\./J2/NPYZ.z:F&+#r-   r.   c                 r   | j                  |      }| j                  |      }| j                  D ]=  }t        j                         r| j
                  r| j                  ||      }6 ||      }? | j                  |      }t        j                  |      }| j                  |      }| j                  |      }|S r   )r@  rC  r^  r7   rL  rE  rM  rD  r9   r   r   r_  rN  s      r,   r;   zCosmosDecoder3d.forward]  s    ]3}5^^ 	5E$$&4+F+F $ A A% W %m 4		5 m4}-m4**=9r-   )rP  r   rQ  r   rU  rW  rQ   r   r2   r   r   rX  rE   s   @r,   rZ  rZ    s     .B!"16 )**+A,A, A, "#s(O	A,
 A,  %S#XA, A, A, A, A, $'A, %(A, 
A,FU\\ ell r-   rZ  c            "           e Zd ZdZdZedddddddd	d
ddddeefdededede	edf   de	edf   de	edf   dededede
dedededeee      deee      ddf  fd        Z	 	 	 	 	 	 d3d!ee   d"ee   d#ee   d$ee   d%ee   d&ee   ddfd'Zd(ej"                  dej"                  fd)Zed4d(ej"                  d*edej"                  fd+       Zd4d,ej"                  d*edeee	ej"                     f   fd-Zed4d,ej"                  d*edeee	ej"                     f   fd.       Z	 	 	 d5d/ej"                  d0ed*ed1eej4                     dee	ej"                     ef   f
d2Z xZS )6AutoencoderKLCosmosa	  
    Autoencoder used in [Cosmos](https://huggingface.co/papers/2501.03575).

    Args:
        in_channels (`int`, defaults to `3`):
            Number of input channels.
        out_channels (`int`, defaults to `3`):
            Number of output channels.
        latent_channels (`int`, defaults to `16`):
            Number of latent channels.
        encoder_block_out_channels (`Tuple[int, ...]`, defaults to `(128, 256, 512, 512)`):
            Number of output channels for each encoder down block.
        decode_block_out_channels (`Tuple[int, ...]`, defaults to `(256, 512, 512, 512)`):
            Number of output channels for each decoder up block.
        attention_resolutions (`Tuple[int, ...]`, defaults to `(32,)`):
            List of image/video resolutions at which to apply attention.
        resolution (`int`, defaults to `1024`):
            Base image/video resolution used for computing whether a block should have attention layers.
        num_layers (`int`, defaults to `2`):
            Number of resnet blocks in each encoder/decoder block.
        patch_size (`int`, defaults to `4`):
            Patch size used for patching the input image/video.
        patch_type (`str`, defaults to `haar`):
            Patch type used for patching the input image/video. Can be either `haar` or `rearrange`.
        scaling_factor (`float`, defaults to `1.0`):
            The component-wise standard deviation of the trained latent space computed using the first batch of the
            training set. This is used to scale the latent space to have unit variance when training the diffusion
            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
            Synthesis with Latent Diffusion Models](https://huggingface.co/papers/2112.10752) paper. Not applicable in
            Cosmos, but we default to 1.0 for consistency.
        spatial_compression_ratio (`int`, defaults to `8`):
            The spatial compression ratio to apply in the VAE. The number of downsample blocks is determined using
            this.
        temporal_compression_ratio (`int`, defaults to `8`):
            The temporal compression ratio to apply in the VAE. The number of downsample blocks is determined using
            this.
    Tr   rP  rQ  )rS  rT  rT  rT  rU  rW  r   rQ   r   r   r   r   r   latent_channelsencoder_block_out_channels.decode_block_out_channelsr8  r9  r  r\   r:  scaling_factorr;  r<  latents_meanlatents_stdr   Nc                    t         |           t        |||||||	|
||
      | _        t	        |||||||	|
||
      | _        t        ||dd      | _        t        ||dd      | _        d| _	        d| _
        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d	| _        d	| _        d
| _        y )N)
r   r   r6  r7  r8  r9  r\   r:  r;  r<  r   r   )r   r   FrP  r   rT  i  r   )r#   r$   r5  encoderrZ  decoderr   
quant_convpost_quant_convuse_slicing
use_tilinguse_framewise_encodinguse_framewise_decodingnum_sample_frames_batch_sizenum_latent_frames_batch_sizetile_sample_min_heighttile_sample_min_widthtile_sample_min_num_framestile_sample_stride_heighttile_sample_stride_widthtile_sample_stride_num_frames)r'   r   r   rd  re  rf  r8  r9  r  r\   r:  rg  r;  r<  rh  ri  r+   s                   r,   r$   zAutoencoderKLCosmos.__init__  s    & 	&#(9("7!!!&?'A
 ''%8("7!!!&?'A
 -_o[\fgh1/?`aklm !
   ',#&+#
 -/),-) '*#%("*,' *-&(+%-.*r-   ru  rv  rw  rx  ry  rz  c                    d| _         |xs | j                  | _        |xs | j                  | _        |xs | j                  | _        |xs | j                  | _        |xs | j
                  | _        |xs | j                  | _        y)aX  
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.

        Args:
            tile_sample_min_height (`int`, *optional*):
                The minimum height required for a sample to be separated into tiles across the height dimension.
            tile_sample_min_width (`int`, *optional*):
                The minimum width required for a sample to be separated into tiles across the width dimension.
            tile_sample_stride_height (`int`, *optional*):
                The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
                no tiling artifacts produced across the height dimension.
            tile_sample_stride_width (`int`, *optional*):
                The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
                artifacts produced across the width dimension.
        TN)rp  ru  rv  rw  rx  ry  rz  )r'   ru  rv  rw  rx  ry  rz  s          r,   enable_tilingz!AutoencoderKLCosmos.enable_tiling  s}    4 &<&[@[@[#%:%Xd>X>X"*D*gHgHg')B)ddFdFd&(@(aDDaDa%-J-pdNpNp*r-   xc                 J    | j                  |      }| j                  |      }|S r   )rk  rm  )r'   r}  encs      r,   _encodezAutoencoderKLCosmos._encode
  s"    LLOooa 
r-   return_dictc                 (   | j                   rU|j                  d   dkD  rC|j                  d      D cg c]  }| j                  |       }}t	        j
                  |      }n| j                  |      }t        |      }|s|fS t        |      S c c}w )Nr   r   )latent_dist)ro  rh   r   r  r7   r8   r   r   )r'   r}  r  x_sliceencoded_slicesr   	posteriors          r,   encodezAutoencoderKLCosmos.encode  s~    
QCD771:Ndll73NNN		.)AQA(+	<"y99 Os   Bzc                 h    | j                  |      }| j                  |      }|s|fS t        |      S )Nsample)rn  rl  r   )r'   r  r  decs       r,   _decodezAutoencoderKLCosmos._decode  s4      #ll1o6MC((r-   c                 :   | j                   r_|j                  d   dkD  rM|j                  d      D cg c]  }| j                  |      j                   }}t        j                  |      }n| j                  |      j                  }|s|fS t        |      S c c}w )Nr   r   r  )ro  rh   r   r  r  r7   r8   r   )r'   r  r  z_slicedecoded_slicesdecodeds         r,   decodezAutoencoderKLCosmos.decode%  s    
QJK''RS*Uwdll73::UNUii/Gll1o,,G:G,, Vs   "Br  sample_posterior	generatorc                     |}| j                  |      j                  }|r|j                  |      }n|j                         }| j	                  |      j                  }|s|fS t        |      S )N)r  r  )r  r  r  r4   r  r   )	r'   r  r  r  r  r}  r  r  r  s	            r,   r;   zAutoencoderKLCosmos.forward1  sf     KKN..	  9 5A Akk!n##6MC((r-   )NNNNNN)T)FTN)r?   r@   rA   __doc__ _supports_gradient_checkpointingr	   LATENTS_MEANLATENTS_STDr"   r   rB   r   r   r   r$   r|  r7   rC   r  r   r   r  r   r   r  r  	Generatorr;   rD   rE   s   @r,   rc  rc  n  s   &P (,$ !6J5I16  #)**+.:-8!L/L/ L/ 	L/
 %*#s(OL/ $)c?L/  %S#XL/ L/ L/ L/ L/ L/ $'L/ %(L/ tE{+L/  d5k*!L/" 
#L/ L/` 15/34859489= q ( q  (} q %-SM	 q
 $,E? q #+5/ q (0 q 
 qD %,, 
 : :4 :5<< : :) )D )E-Y^_d_k_kYlJlDm ) 	- 	-4 	-5X]^c^j^jXkIkCl 	- 	- "' /3)) ) 	)
 EOO,) 
uU\\"M1	2)r-   rc  )4r   typingr   r   r   r   r7   torch.nnrM   torch.nn.functional
functionalr9   configuration_utilsr   r	   utilsr
   utils.accelerate_utilsr   modeling_outputsr   modeling_utilsr   vaer   r   r   r?   loggerr  r  tensorrd   Conv3dr   ModulerG   r[   r   r   r   r   r   r   r   r   r  r*  r.  r5  rZ  rc  r  r-   r,   <module>r     s    / /     B  8 2 ' F F 
H	 w1 y/ELL,.@ABsCj)	". ".JEHHOO 0MO MO`LK		 LK^
RYY 
$(")) $(N/ /d-ryy -``BII `<$( $(N$( $(NC		 CL+ryy +\Cbii CLQbii QhRbii RjS)*&6 S)r-   