
    i2              
       V   d dl mZ d dlmZmZmZmZmZmZ d dl	Z	d dl
mZ ddlmZmZ ddlmZmZ ddlmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&  ejN                  e(      Z)e G d de             Z* G d de!eeeee      Z+ G d de!eeee      Z,y)    )	dataclass)AnyDictListOptionalTupleUnionN   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixin)USE_PEFT_BACKEND
BaseOutputloggingscale_lora_layersunscale_lora_layers   )AttentionMixin)
CacheMixin)zero_module)Transformer2DModelOutput)
ModelMixin)QwenEmbedRopeQwenImageTransformerBlockQwenTimestepProjEmbeddingsRMSNormc                   2    e Zd ZU eej
                     ed<   y)QwenImageControlNetOutputcontrolnet_block_samplesN)__name__
__module____qualname__r   torchTensor__annotations__     {/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/diffusers/models/controlnets/controlnet_qwenimage.pyr   r   (   s    #ELL11r(   r   c                       e Zd ZdZe	 	 	 	 	 	 	 	 	 ddededee   dedededed	eeeef   d
ef fd       Ze		 	 	 	 	 ddededed
efd       Z
	 	 	 	 	 	 	 	 ddej                  dej                  dedej                  dej                  dej                  deeeeeef         deee      deeeef      dedeej*                  ef   fdZ xZS )QwenImageControlNetModelT
patch_sizein_channelsout_channels
num_layersattention_head_dimnum_attention_headsjoint_attention_dimaxes_dims_ropeextra_condition_channelsc
           
         t         |           |xs || _        ||z  | _        t	        dt        |      d      | _        t        | j                        | _        t        |d      | _
        t        j                  || j                        | _        t        j                  || j                        | _        t        j                  t!        |      D 
cg c]  }
t#        | j                  ||       c}
      | _        t        j                  g       | _        t!        t)        | j$                              D ]N  }
| j&                  j+                  t-        t        j                  | j                  | j                                     P t-        t.        j                  j                  ||	z   | j                              | _        d| _        y c c}
w )	Ni'  T)thetaaxes_dim
scale_rope)embedding_dimgư>)eps)dimr1   r0   F)super__init__r.   	inner_dimr   list	pos_embedr   time_text_embedr   txt_normnnLinearimg_intxt_in
ModuleListranger   transformer_blockscontrolnet_blockslenappendr   r$   controlnet_x_embeddergradient_checkpointing)selfr,   r-   r.   r/   r0   r1   r2   r3   r4   _	__class__s              r)   r=   z!QwenImageControlNetModel.__init__2   sj    	(7K,/AA&UT.=Q^bc9W 3>iiT^^<ii 3T^^D"$-- z*  *(;'9	#
 "$r!2s42234 	bA""))+biiPTP^P^6_*`a	b%0HHOOK*BBDNNS&
" ',#%s   Gc                    t        |j                        }||d<   ||d<   ||d<   ||d<   | j                  |      }|r|j                  j	                  |j                  j                                |j                  j	                  |j                  j                                |j                  j	                  |j                  j                                |j                  j	                  |j                  j                                |j                  j	                  |j                  j                         d       t        |j                        |_        |S )Nr/   r0   r1   r4   F)strict)dictconfigfrom_configr@   load_state_dict
state_dictrA   rE   rF   rI   r   rM   )	clstransformerr/   r0   r1   load_weights_from_transformerr4   rU   
controlnets	            r)   from_transformerz)QwenImageControlNetModel.from_transformera   s    k(())|'9#$(;$%-E)*__V,
(  001F1F1Q1Q1ST&&66{7R7R7]7]7_`--k.@.@.K.K.MN--k.@.@.K.K.MN))99+:X:X:c:c:ens9t/::;[;[/\J,r(   hidden_statescontrolnet_condconditioning_scaleencoder_hidden_statesencoder_hidden_states_masktimestep
img_shapestxt_seq_lensjoint_attention_kwargsreturn_dictreturnc           	         |	#|	j                         }	|	j                  dd      }nd}t        rt        | |       n)|	'|	j	                  dd      t
        j                  d       | j                  |      }|| j                  |      z   }| j                  ||      }| j                  |||j                        }|j                  |j                        }| j                  |      }| j                  |      }d}t!        | j"                        D ]V  \  }}t%        j&                         r&| j(                  r| j+                  ||||||      \  }}n |||||||	      \  }}||fz   }X d}t-        || j.                        D ]  \  }} ||      }||fz   } |D cg c]  }||z  	 }}t1        |      dk(  rdn|}t        rt3        | |       |
s|S t5        |	      S c c}w )
am  
        The [`FluxTransformer2DModel`] forward method.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
                Input `hidden_states`.
            controlnet_cond (`torch.Tensor`):
                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
            conditioning_scale (`float`, defaults to `1.0`):
                The scale factor for ControlNet outputs.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
                A list of tensors that if specified are added to the residuals of transformer blocks.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        Nscale      ?z\Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective.)devicer'   )r^   ra   rb   tembimage_rotary_embrf   r   )r    )copypopr   r   getloggerwarningrE   rM   rA   r@   rl   todtyperB   rF   	enumeraterI   r$   is_grad_enabledrN   _gradient_checkpointing_funcziprJ   rK   r   r   )rO   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   
lora_scalerm   rn   block_samplesindex_blockblockr    block_samplecontrolnet_blocksamples                        r)   forwardz QwenImageControlNetModel.forward}   s&   V "-%;%@%@%B"/33GSAJJdJ/%16L6P6PQXZ^6_6kr M2 &(B(B?(SS##Hm<>>*l=K_K_>`;;}223 $.C D $,A B"+D,C,C"D 	=K$$&4+F+F7;7X7X!).$84%} 8="/*?/I%5+A84%} *],<<M)	=. $& .1-AWAW.X 	R*L*+L9L'?</'Q$	R
 Og#gFF-?$?#g #g+./G+HA+M4Sk j1++(%=
 	
 $hs   G)	r   @      <         i   )r   8   r   r   )   r   r   Tr   )rk   NNNNNNT)r!   r"   r#    _supports_gradient_checkpointingr   intr   r   r=   classmethodr]   r$   r%   float
LongTensorr   r   strr   boolr	   FloatTensorr   r   __classcell__rQ   s   @r)   r+   r+   -   s    (,$ &("%#%#'/;(),,,, ,, sm	,,
 ,,  ,, !,, !,, c3m,,, #&,, ,,\  "%#%&*()   	
 ! #& > %(.237%);?,0;? p
||p
 p
 "	p

  %||p
 %*LLp
 ""p
 T%S#"678p
 tCy)p
 !)c3h 8p
 p
 
u  "::	;p
r(   r+   c                       e Zd ZdZ fdZ	 	 	 	 	 	 	 ddej                  deej                     dee	   dej                  dej                  dej                  d	eeeeeef         d
eee      deeeef      dedeeef   fdZ xZS )QwenImageMultiControlNetModela  
    `QwenImageMultiControlNetModel` wrapper class for Multi-QwenImageControlNetModel

    This module is a wrapper for multiple instances of the `QwenImageControlNetModel`. The `forward()` API is designed
    to be compatible with `QwenImageControlNetModel`.

    Args:
        controlnets (`List[QwenImageControlNetModel]`):
            Provides additional conditioning to the unet during the denoising process. You must set multiple
            `QwenImageControlNetModel` as a list.
    c                 V    t         |           t        j                  |      | _        y )N)r<   r=   rC   rG   nets)rO   controlnetsrQ   s     r)   r=   z&QwenImageMultiControlNetModel.__init__   s    MM+.	r(   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   c                 8   t        | j                        dk(  rr| j                  d   }t        t        ||            D ]I  \  }\  }} ||||||||||	|

      }|dk(  r|}#|&)t        ||      D cg c]
  \  }}||z    }}}K S t	        d      c c}}w )N   r   )
r^   r_   r`   ra   rb   rc   rd   re   rf   rg   zJQwenImageMultiControlNetModel only supports a single controlnet-union now.)rK   r   rv   ry   
ValueError)rO   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   r\   iimagerj   r{   control_block_samplescontrol_block_sampler~   s                      r)   r   z%QwenImageMultiControlNetModel.forward  s     tyy>Q1J%.s?DV/W%X !>E5 *"/$)',*?/I%)!-+A +! 6,9)$05J5V GJJ_anFo1 B 4l 1<?1- 1'4 %$ ijj1s   5B)NNNNNNT)r!   r"   r#   __doc__r=   r$   r   r   tensorr   r%   r   r   r   r   r   r   r   r   r	   r   r   r   r   s   @r)   r   r      s    
/ /337%);?,0;? ,%((,% ell+,% !K	,%
  %||,% %*LL,% "",% T%S#"678,% tCy),% !)c3h 8,% ,% 
(%/	0,%r(   r   )-dataclassesr   typingr   r   r   r   r   r	   r$   torch.nnrC   configuration_utilsr   r   loadersr   r   utilsr   r   r   r   r   	attentionr   cache_utilsr   controlnets.controlnetr   modeling_outputsr   modeling_utilsr   "transformers.transformer_qwenimager   r   r   r   
get_loggerr!   rr   r   r+   r   r'   r(   r)   <module>r      s    " : :   B ? b b & $ 0 7 '  
		H	% 2
 2 2@
-=?UWa@
F=%J=MOegq =%r(   