
    i                     J   d dl Z d dlmZmZmZmZmZmZmZ d dl	Z
d dlZd dlZd dlmZmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ dd	lmZ d
dlmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&  e       rd dl'm(c m)Z* dZ+ndZ+ ejX                  e-      Z.dZ/dZ0e$dfdee1   de1deeeejd                  jd                     eeejd                  jd                        f      fdZ3deeejd                  jd                        eejd                  jd                     z  de de4deeejd                  jd                        fdZ5de4de4de6fdZ7	 	 	 	 d*dee4   d eee1ejp                  f      d!eee4      d"eee6      fd#Z9	 d+d$ejt                  d%eejv                     d&e1fd'Z< G d( d)ee      Z=y),    N)AnyCallableDictListOptionalTupleUnion)AutoProcessor Mistral3ForConditionalGeneration   )Flux2LoraLoaderMixin)AutoencoderKLFlux2Flux2Transformer2DModel)FlowMatchEulerDiscreteScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DiffusionPipeline   )Flux2ImageProcessor)Flux2PipelineOutput)SYSTEM_MESSAGESYSTEM_MESSAGE_UPSAMPLING_I2ISYSTEM_MESSAGE_UPSAMPLING_T2ITFaU  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import Flux2Pipeline

        >>> pipe = Flux2Pipeline.from_pretrained("black-forest-labs/FLUX.2-dev", torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")
        >>> prompt = "A cat holding a sign that says hello world"
        >>> # Depending on the variant being used, the pipeline call will slightly vary.
        >>> # Refer to the pipeline documentation for more details.
        >>> image = pipe(prompt, num_inference_steps=50, guidance_scale=2.5).images[0]
        >>> image.save("flux.png")
        ```
i  	 promptssystem_messageimagesc           
         | D cg c]  }|j                  dd       }}|t        |      dk(  r |D cg c]  }dd|dgddd|dgdg c}S t        |      t        |       k(  sJ d	       |D cg c]  }dd|dgdg }}t        t        ||            D ]J  \  }\  }}|%|j	                  d|D 	cg c]  }	d
|	d	 c}	d       |j	                  dd||   dgd       L |S c c}w c c}w c c}w c c}	w )a  
    Format a batch of text prompts into the conversation format expected by apply_chat_template. Optionally, add images
    to the input.

    Args:
        prompts: List of text prompts
        system_message: System message to use (default: CREATIVE_SYSTEM_MESSAGE)
        images (optional): List of images to add to the input.

    Returns:
        List of conversations, where each conversation is a list of message dicts
    z[IMG] r   systemtext)typer#   )rolecontentuserz-Number of images must match number of promptsimage)r$   r(   )replacelen	enumeratezipappend)
r   r   r   promptcleaned_txt_messagesiel	image_objs
             r/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/diffusers/pipelines/flux2/pipeline_flux2.pyformat_inputr6   A   sb   ( >EE66>>'2.EKE~V) &	
  %)/ HI  ff-M,NO	
 		
 6{c'l*[,[[* !
 	 %)/ HI
 
  )Xv)>? 	OA|F!		 &[a#biWy$I#b II")/Q HI	" U F	

  $cs   C)C.2C35C8image_processorupsampling_max_image_sizereturnc           
      \   | sg S t        | d   t        j                  j                        r| D cg c]  }|g } }| D cg c]$  }t        |      dkD  r|j	                  |      gn|& } }| D cg c]"  }|D cg c]  }|j                  ||       c}$ } }| S c c}w c c}w c c}w c c}w )Nr   r   )
isinstancePILImager*   concatenate_images_resize_if_exceeds_area)r   r7   r8   imimg_is        r5   _validate_and_process_imagesrB      s     	 &)SYY__-!'(22$(( ekk[`SZ!^11%89QVVkFk
  affW\	0	08Q	RfF  M ) l 	gs#   
B )B/	B)8B$
B)$B)image_seq_len	num_stepsc                     d\  }}d\  }}| dkD  r|| z  |z   }t        |      S || z  |z   }|| z  |z   }||z
  dz  }	|d|	z  z
  }
|	|z  |
z   }t        |      S )N)gT	?gŒ_?)g w:/&?gDw:?i  g     g@g      i@)float)rC   rD   a1b1a2b2mum_200m_10abs              r5   compute_empirical_murP      s    'FB#FBt-"$Ry#E"D	A	A	
Y	B9    num_inference_stepsdevice	timestepssigmasc                    ||t        d      |dt        t        j                  | j                        j
                  j                               v }|st        d| j                   d       | j                  d
||d| | j                  }t        |      }||fS |dt        t        j                  | j                        j
                  j                               v }|st        d| j                   d       | j                  d
||d| | j                  }t        |      }||fS  | j                  |fd	|i| | j                  }||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    zYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesrT   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)rT   rS   rU   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)rU   rS   rS    )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__rT   r*   )	schedulerrR   rS   rT   rU   kwargsaccepts_timestepsaccept_sigmass           r5   retrieve_timestepsrd      s   > !3tuu'3w/@/@AXAX/Y/d/d/i/i/k+ll .y/B/B.C Da b  	 	M)FMfM''	!)n ))) 
	 C(9(9):Q:Q(R(](](b(b(d$ee.y/B/B.C D_ `  	 	GvfGG''	!)n ))) 	 	 3MFMfM''	)))rQ   encoder_output	generatorsample_modec                     t        | d      r |dk(  r| j                  j                  |      S t        | d      r|dk(  r| j                  j                         S t        | d      r| j                  S t        d      )Nlatent_distsampleargmaxlatentsz3Could not access latents of provided encoder_output)hasattrri   rj   moderl   AttributeError)re   rf   rg   s      r5   retrieve_latentsrp      st     ~}-+2I))00;;		/K84K))..00		+%%%RSSrQ   c            )       H    e Zd ZdZdZddgZdededede	d	e
f
 fd
Zedddedfdede	deeee   f   deej&                     deej(                     dededee   fd       Ze	 dDdej.                  deej.                     fd       Zedej.                  fd       Ze	 dEdeej.                     defd       Zed        Zed        Zed        Zedej.                  dej.                  d eej.                     fd!       Z	 	 	 dFdeeee   f   d"eee jB                  jB                     eee jB                  jB                        f   d#e"dej(                  d ee   f
d$Z#	 	 	 	 	 dGdeeee   f   deej(                     d&edeej.                     ded'e$e   fd(Z%d)ej.                  d*ejL                  fd+Z'	 dDd*ejL                  deej.                     fd,Z(d"eej.                     d*ejL                  fd-Z)	 	 dHd.Z*e+d/        Z,e+d0        Z-e+d1        Z.e+d2        Z/e+d3        Z0 ejb                          e2e3      ddddd4dd5d%dddd6d7dddgdddfd)eeee jB                  jB                     e jB                  jB                  f      deeee   f   d8ee   d9ee   d:ed;eee"      d<ee"   d&ed*eeejL                  eejL                     f      deej.                     deej.                     d=ee   d>e4d?ee5ee6f      d@ee7eee5gdf      dAee   ded'e$e   dBe"f&dC              Z8 xZ9S )IFlux2Pipelinea  
    The Flux2 pipeline for text-to-image generation.

    Reference: [https://bfl.ai/blog/flux-2](https://bfl.ai/blog/flux-2)

    Args:
        transformer ([`Flux2Transformer2DModel`]):
            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
        vae ([`AutoencoderKLFlux2`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`Mistral3ForConditionalGeneration`]):
            [Mistral3ForConditionalGeneration](https://huggingface.co/docs/transformers/en/model_doc/mistral3#transformers.Mistral3ForConditionalGeneration)
        tokenizer (`AutoProcessor`):
            Tokenizer of class
            [PixtralProcessor](https://huggingface.co/docs/transformers/en/model_doc/pixtral#transformers.PixtralProcessor).
    ztext_encoder->transformer->vaerl   prompt_embedsr`   vaetext_encoder	tokenizertransformerc                    t         |           | j                  |||||       t        | dd       r/dt	        | j
                  j                  j                        dz
  z  nd| _        t        | j                  dz        | _
        d| _        d| _        t        | _        t        | _        t"        | _        t&        | _        y )	N)rt   ru   rv   r`   rw   rt   r   r      )vae_scale_factor      )super__init__register_modulesgetattrr*   rt   configblock_out_channelsrz   r   r7   tokenizer_max_lengthdefault_sample_sizer   r   r   system_message_upsampling_t2ir   system_message_upsampling_i2iUPSAMPLING_MAX_IMAGE_SIZEr8   )selfr`   rt   ru   rv   rw   r_   s         r5   r~   zFlux2Pipeline.__init__  s     	%# 	 	
 W^^bdikoVpc$((//*L*L&MPQ&Q Rvw  3DDYDY\]D]^$'!#& ,-J*-J*)B&rQ   Nr{   
         r.   dtyperS   max_sequence_lengthr   hidden_states_layersc           
      *   || j                   n|}|| j                  n|}t        |t              r|gn|}t	        ||      }|j                  |dddddd|      }	|	d   j                  |      }
|	d   j                  |      } | |
|dd	      }t        j                  |D cg c]  }|j                  |    c}d
      }|j                  ||      }|j                  \  }}}}|j                  ddd
d      j                  ||||z        }|S c c}w )N)r   r   FTpt
max_lengthadd_generation_prompttokenizereturn_dictreturn_tensorspadding
truncationr   	input_idsattention_mask)r   r   output_hidden_states	use_cacher   dim)r   rS   r   r   r   )r   rS   r;   strr6   apply_chat_templatetotorchstackhidden_statesshapepermutereshape)ru   rv   r.   r   rS   r   r   r   messages_batchinputsr   r   outputkout
batch_sizenum_channelsseq_len
hidden_dimrs   s                       r5   "_get_mistral_3_small_prompt_embedsz0Flux2Pipeline._get_mistral_3_small_prompt_embeds/  s=    ',m""(.$$F'4&& &f^T .."' * / 	
 ;'**62	 0144V< )!%	
 kk<PQq6//2QWXYff5f08;		5
L':Aq!Q/77
G\\fMfg Rs   #Dxt_coordc                 x   | j                   \  }}}g }t        |      D ]  }|t        j                  d      n||   }t        j                  d      }t        j                  d      }	t        j                  |      }
t        j                  |||	|
      }|j                  |        t        j                  |      S )Nr   )r   ranger   arangecartesian_prodr-   r   )r   r   BLr0   out_idsr2   thwlcoordss               r5   _prepare_text_idszFlux2Pipeline._prepare_text_idsc  s    
 ''1aq 	#A#*?Q
AQAQAQA))!Q15FNN6"	# {{7##rQ   c                 D   | j                   \  }}}}t        j                  d      }t        j                  |      }t        j                  |      }t        j                  d      }t        j                  ||||      }	|	j	                  d      j                  |dd      }	|	S )a  
        Generates 4D position coordinates (T, H, W, L) for latent tensors.

        Args:
            latents (torch.Tensor):
                Latent tensor of shape (B, C, H, W)

        Returns:
            torch.Tensor:
                Position IDs tensor of shape (B, H*W, 4) All batches share the same coordinate structure: T=0,
                H=[0..H-1], W=[0..W-1], L=0
        r   r   )r   r   r   r   	unsqueezeexpand)
rl   r   r0   heightwidthr   r   r   r   
latent_idss
             r5   _prepare_latent_idsz!Flux2Pipeline._prepare_latent_idsv  s    " (/}}$
AvuLLOLL LLLLO ))!Q15
  ))!,33JBG
rQ   image_latentsscalec           	         t        | t              st        dt        |        d      t	        j
                  dt        |             D cg c]
  }|||z  z    }}|D cg c]  }|j                  d       }}g }t        | |      D ]  \  }}|j                  d      }|j                  \  }}}t	        j                  |t	        j
                  |      t	        j
                  |      t	        j
                  d            }	|j                  |	        t	        j                  |d      }|j                  d      }|S c c}w c c}w )a  
        Generates 4D time-space coordinates (T, H, W, L) for a sequence of image latents.

        This function creates a unique coordinate for every pixel/patch across all input latent with different
        dimensions.

        Args:
            image_latents (List[torch.Tensor]):
                A list of image latent feature tensors, typically of shape (C, H, W).
            scale (int, optional):
                A factor used to define the time separation (T-coordinate) between latents. T-coordinate for the i-th
                latent is: 'scale + scale * i'. Defaults to 10.

        Returns:
            torch.Tensor:
                The combined coordinate tensor. Shape: (1, N_total, 4) Where N_total is the sum of (H * W) for all
                input latents.

        Coordinate Components (Dimension 4):
            - T (Time): The unique index indicating which latent image the coordinate belongs to.
            - H (Height): The row index within that latent image.
            - W (Width): The column index within that latent image.
            - L (Seq. Length): A sequence length dimension, which is always fixed at 0 (size 1)
        z+Expected `image_latents` to be a list, got .r   r   r   r   )r;   listrX   r$   r   r   r*   viewr,   squeezer   r   r-   catr   )
r   r   r   t_coordsimage_latent_idsr   r0   r   r   x_idss
             r5   _prepare_image_idsz Flux2Pipeline._prepare_image_ids  s(   < -.J4P]K^J__`abb 05||As=?Q/RS!EEAI%SS(011AFF2J11x0 	+DAq		!A wwAvu((ELL,@%,,uBUW\WcWcdeWfgE##E*	+ !99%51=+55a8 T1s   
D<Ec                     | j                   \  }}}}| j                  |||dz  d|dz  d      } | j                  dddddd      } | j                  ||dz  |dz  |dz        } | S )Nr   r   r   r         )r   r   r   r   rl   r   num_channels_latentsr   r   s        r5   _patchify_latentszFlux2Pipeline._patchify_latents  s}    :A--7
(&%,,z+?1aQVZ[Q[]^_//!Q1a3//*.BQ.FRSUZ^_U_`rQ   c                     | j                   \  }}}}| j                  ||dz  dd||      } | j                  dddddd      } | j                  ||dz  |dz  |dz        } | S )Nr   r   r   r   r   r   r   r   r   r   s        r5   _unpatchify_latentsz!Flux2Pipeline._unpatchify_latents  sy    :A--7
(&%//*.Bu.MqRSU[]bc//!Q1a3//*.Bu.MvXYz[`cd[derQ   c                 v    | j                   \  }}}}| j                  ||||z        j                  ddd      } | S )zw
        pack latents: (batch_size, num_channels, height, width) -> (batch_size, height * width, num_channels)
        r   r   r   r   )rl   r   r   r   r   s        r5   _pack_latentszFlux2Pipeline._pack_latents  sC     3:--/
L&%//*lFUNKSSTUWXZ[\rQ   r   r9   c                    g }t        | |      D ]1  \  }}|j                  \  }}|dddf   j                  t        j                        }|dddf   j                  t        j                        }t        j
                  |      dz   }	t        j
                  |      dz   }
||
z  |z   }t        j                  |	|
z  |f|j                  |j                        }|j                  d|j                  d      j                  d|      |       |j                  |	|
|      j                  ddd      }|j                  |       4 t        j                  |d      S )zA
        using position ids to scatter tokens into place
        Nr   r   rS   r   r   r   r   )r,   r   r   r   int64maxzerosrS   r   scatter_r   r   r   r   r-   r   )r   r   x_listdataposr0   chh_idsw_idsr   r   flat_idsr   s                r5   _unpack_latents_with_idsz&Flux2Pipeline._unpack_latents_with_ids  s#   
 Q 	ID#JJEAr1ILL-E1ILL-E		% 1$A		% 1$Aqy5(H++q1ubk$++TZZPCLLH..q188R@$G ((1a$,,Q15CMM#!	$ {{6q))rQ   r   temperaturec           
         t        |t              r|gn|}|| j                  j                  n|}|t	        |      dk(  s|d   t
        }nt        }|r!t        || j                  | j                        }t        |||      }| j                  j                  |ddddddd      }|d   j                  |      |d<   |d	   j                  |      |d	<   d
|v r,|d
   j                  || j                  j                        |d
<    | j                  j                  di |dd|dd}|d   j                   d   }	|d d |	d f   }
| j                  j                  j#                  |
dd      }|S )Nr   )r   r   r   Tr   r   i   r   r   r   pixel_valuesr{   )max_new_tokens	do_sampler   r   r   )skip_special_tokensclean_up_tokenization_spacesrW   )r;   r   ru   rS   r*   r   r   rB   r7   r8   r6   rv   r   r   r   generater   batch_decode)r   r.   r   r   rS   r   r   r   generated_idsinput_lengthgenerated_tokensupsampled_prompts               r5   upsample_promptzFlux2Pipeline.upsample_prompt  s    (4&&-3^"")) >S[A-1B:N:N 1&$:N:NPTPnPnoF &f^\bc 33"&  4 	
 %[144V<{#)*:#;#>#>v#F V#%+N%;%>%>vtGXGXG^G^%_F>" 3))22 

#
 k*003(LM)9:>>33@@$UY A 
  rQ   r   num_images_per_prompttext_encoder_out_layersc           	         |xs | j                   }|d}t        |t              r|gn|}|6| j                  | j                  | j
                  |||| j                  |      }|j                  \  }}}	|j                  d|d      }|j                  ||z  |d      }| j                  |      }
|
j                  |      }
||
fS )Nr!   )ru   rv   r.   rS   r   r   r   r   r   )_execution_devicer;   r   r   ru   rv   r   r   repeatr   r   r   )r   r.   rS   r   rs   r   r   r   r   r0   text_idss              r5   encode_promptzFlux2Pipeline.encode_prompt;  s     1411>F'4&&  CC!....$7#22%< D M "/!4!4
GQ%,,Q0EqI%**:8M+MwXZ[))-8;;v&h&&rQ   r(   rf   c                 N   |j                   dk7  rt        d|j                    d      t        | j                  j	                  |      |d      }| j                  |      }| j                  j                  j                  j                  dddd      j                  |j                  |j                        }t        j                  | j                  j                  j                  j                  dddd      | j                  j                  j                   z         }||z
  |z  }|S )Nr   zExpected image dims 4, got r   rk   )rf   rg   r   r   )ndimrX   rp   rt   encoder   bnrunning_meanr   r   rS   r   r   sqrtrunning_varr   batch_norm_eps)r   r(   rf   r   latents_bn_meanlatents_bn_stds         r5   _encode_vae_imagezFlux2Pipeline._encode_vae_image^  s    ::?:5::,aHII()?9bjk..}=((++22772q!DGGH\H\^k^q^qrDHHKK$;$;$@$@B1$MPTPXPXP_P_PnPn$no&8NJrQ   c	                    dt        |      | j                  dz  z  z  }dt        |      | j                  dz  z  z  }||dz  |dz  |dz  f}	t        |t              r)t	        |      |k7  rt        dt	        |       d| d      |t        |	|||      }n|j                  ||      }| j                  |      }
|
j                  |      }
| j                  |      }||
fS )Nr   r   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)rf   rS   r   r   )
intrz   r;   r   r*   rX   r   r   r   r   )r   r   num_latents_channelsr   r   r   rS   rf   rl   r   r   s              r5   prepare_latentszFlux2Pipeline.prepare_latentsk  s    c&kd&;&;a&?@ASZD$9$9A$=>?1A5v{EQJOi&3y>Z+GA#i.AQ R&<'gi  ?"5IfTYZGjjej<G--g6
]]6*
$$W-
""rQ   c                    g }|D ]9  }|j                  ||      }| j                  ||      }|j                  |       ; | j                  |      }	g }
|D ]5  }| j	                  |      }|j                  d      }|
j                  |       7 t        j                  |
d      }|j                  d      }|j                  |dd      }|	j                  |dd      }	|	j                  |      }	||	fS )Nr   )r(   rf   r   r   r   )
r   r  r-   r   r   r   r   r   r   r  )r   r   r   rf   rS   r   r   r(   imagge_latentr   packed_latentslatentpackeds                r5   prepare_image_latentsz#Flux2Pipeline.prepare_image_latents  s
     	0EHHF%H8E 22)2TM  /	0
  22=A # 	*F''/F^^A&F!!&)		* 		.a8%//2%,,ZA>+22:q!D+..v6...rQ   c           
           || j                   dz  z  dk7  s|A| j                   dz  z  dk7  r,t        j                  d j                   dz   d| d| d       |Lt         fd|D              s8t	        d j
                   d	|D cg c]  }| j
                  vs| c}       ||t	        d
| d| d      ||t	        d      |9t        |t              s(t        |t              st	        dt        |             y y y c c}w )Nr   r   z-`height` and `width` have to be divisible by z	 but are z and z(. Dimensions will be resized accordinglyc              3   :   K   | ]  }|j                   v   y wN)_callback_tensor_inputs).0r   r   s     r5   	<genexpr>z-Flux2Pipeline.check_inputs.<locals>.<genexpr>  s#      F
23A---F
s   z2`callback_on_step_end_tensor_inputs` has to be in z, but found zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is )
rz   loggerwarningallrX   r  r;   r   r   r$   )r   r.   r   r   rs   "callback_on_step_end_tensor_inputsr   s   `      r5   check_inputszFlux2Pipeline.check_inputs  s    $//!349 ..23q8NN?@U@UXY@Y?ZZcdjckkpqvpw  x`  a .9# F
7YF
 C
 DTEaEaDbbn  |^  pHvw  bc  ko  kG  kG  bGpq  pH  oI  J  -";08N}o ^0 0  ^ 5w  FC)@TZ\`IaQRVW]R^Q_`aa Jb)@ pHs   	DDc                     | j                   S r  )_guidance_scaler   s    r5   guidance_scalezFlux2Pipeline.guidance_scale  s    ###rQ   c                     | j                   S r  )_joint_attention_kwargsr&  s    r5   joint_attention_kwargsz$Flux2Pipeline.joint_attention_kwargs  s    +++rQ   c                     | j                   S r  )_num_timestepsr&  s    r5   num_timestepszFlux2Pipeline.num_timesteps  s    """rQ   c                     | j                   S r  )_current_timestepr&  s    r5   current_timestepzFlux2Pipeline.current_timestep  s    %%%rQ   c                     | j                   S r  )
_interruptr&  s    r5   	interruptzFlux2Pipeline.interrupt  s    rQ   2   g      @pilTr   r   rR   rU   r'  output_typer   attention_kwargscallback_on_step_endr"  caption_upsample_temperaturec                    | j                  |||||       || _        || _        d| _        d| _        |t        |t              rd}n-|t        |t              rt        |      }n|j                  d   }| j                  }|r| j                  ||||      }| j                  ||||||      \  }}|t        |t              s|g}d}||D ]  }| j                  j                  |        g }|D ]  }|j                  \  }}||z  dkD  r+| j                  j!                  |d      }|j                  \  }}| j"                  d	z  }||z  |z  }||z  |z  }| j                  j%                  |||d
      }|j'                  |       |xs |}|xs |} |xs | j(                  | j"                  z  }|xs | j(                  | j"                  z  }| j*                  j,                  j.                  dz  }| j1                  ||z  ||||j2                  ||	|
      \  }
}d}d}|0| j5                  |||z  |	|| j6                  j2                        \  }}|t9        j:                  dd|z  |      n|}t=        | j>                  j,                  d      r"| j>                  j,                  j@                  rd}|
j                  d   } tC        | |      }!tE        | j>                  ||||!      \  }"}tG        t        |"      || j>                  jH                  z  z
  d      }#t        |"      | _%        tM        jN                  dg||tL        jP                        }$|$jS                  |
j                  d         }$| j>                  jU                  d       | jW                  |      5 }%tY        |"      D ]6  \  }&}'| jZ                  r|'| _        |'jS                  |
j                  d         j]                  |
j2                        }(|
j]                  | j*                  j2                        })|}*|UtM        j^                  |
|gd      j]                  | j*                  j2                        })tM        j^                  ||gd      }*| j+                  |)|(dz  |$|||*| j                  d      d   }+|+ddd|
j                  d      f   }+|
j2                  },| j>                  ja                  |+|'|
d      d   }
|
j2                  |,k7  r9tL        jb                  jd                  jg                         r|
j]                  |,      }
|Hi }-|D ]  }.ti               |.   |-|.<     || |&|'|-      }/|/jk                  d|
      }
|/jk                  d|      }|&t        |"      dz
  k(  s'|&dz   |#kD  r/|&dz   | j>                  jH                  z  dk(  r|%jm                          tn        s#tq        jr                          9 	 ddd       d| _        |dk(  r|
}nH| ju                  |
|      }
| j6                  jv                  jx                  j{                  dddd      j]                  |
j|                  |
j2                        }0tM        j~                  | j6                  jv                  j                  j{                  dddd      | j6                  j,                  j                  z         j]                  |
j|                  |
j2                        }1|
|1z  |0z   }
| j                  |
      }
| j6                  j                  |
d      d   }| j                  j                  ||      }| j                          |s|fS t        |      S # 1 sw Y   xY w)a  
        Function invoked when calling the pipeline for generation.

        Args:
            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
                numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
                or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
                latents as `image`, but if passing latents directly it is not encoded again.
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            guidance_scale (`float`, *optional*, defaults to 1.0):
                Embedded guiddance scale is enabled by setting `guidance_scale` > 1. Higher `guidance_scale` encourages
                a model to generate images more aligned with `prompt` at the expense of lower image quality.

                Guidance-distilled models approximates true classifer-free guidance for `guidance_scale` > 1. Refer to
                the [paper](https://huggingface.co/papers/2210.03142) to learn more.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image. This is set to 1024 by default for the best results.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image. This is set to 1024 by default for the best results.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            sigmas (`List[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.qwenimage.QwenImagePipelineOutput`] instead of a plain tuple.
            attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
            text_encoder_out_layers (`Tuple[int]`):
                Layer indices to use in the `text_encoder` to derive the final prompt embeddings.
            caption_upsample_temperature (`float`):
                When specified, we will try to perform caption upsampling for potentially improved outputs. We
                recommend setting it to 0.15 if caption upsampling is to be performed.

        Examples:

        Returns:
            [`~pipelines.flux2.Flux2PipelineOutput`] or `tuple`: [`~pipelines.flux2.Flux2PipelineOutput`] if
            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
            generated images.
        )r.   r   r   rs   r"  NFr   r   )r   r   rS   )r.   rs   rS   r   r   r   i   r   crop)r   r   resize_moder   )r   r  r   r   r   rS   rf   rl   )r   r   rf   rS   r   g      ?use_flow_sigmas)rC   rD   )rU   rK   r   )totalr   i  )r   timestepguidanceencoder_hidden_statestxt_idsimg_idsr*  r   )r   rl   rs   r  r   )r6  )r   )Gr#  r%  _attention_kwargsr/  r2  r;   r   r   r*   r   r   r   r  r7   check_image_inputsize_resize_to_target_arearz   
preprocessr-   r   rw   r   in_channelsr  r   r  rt   nplinspacerm   r`   r=  rP   rd   r   orderr,  r   fullfloat32r   set_begin_indexprogress_barr+   r3  r   r   stepbackendsmpsis_availablelocalspopupdateXLA_AVAILABLExm	mark_stepr   r  r  r   rS   r	  r
  r  r   decodepostprocessmaybe_free_model_hooksr   )2r   r(   r.   r   r   rR   rU   r'  r   rf   rl   rs   r6  r   r7  r8  r"  r   r   r9  r   rS   r  condition_imagesimgimage_widthimage_heightmultiple_ofr   r   r   r   rC   rK   rT   num_warmup_stepsr@  rP  r2   r   r?  latent_model_inputlatent_image_ids
noise_predlatents_dtypecallback_kwargsr   callback_outputsr  r  s2                                                     r5   __call__zFlux2Pipeline.__call__  sz   H 	'/Q 	 	
  .!1!% *VS"9JJvt$<VJ&,,Q/J'' ())u2NW] * F #'"4"4'"7 3$; #5 #
x Zt%<GE <$$66s;<  " -,/HH)\-;..EEc;WC03-K"33a7*k9[H , ;{J**55c,Vaou5v '',/<,- K433d6K6KKI11D4I4II  $//66BBaG"22!$99!5%% 3 	
 '.2.H.H'%(==#hhnn /I /+M+ TZSaS!&9"9;NOgm4>>((*;<AVAVAfAfFa(!I\]);NN*
&	& s9~0CdnnFZFZ0ZZ\]^!)n ::qc>&V??7==#34
 	&&q)%89 4	#\!), 3#1>>)*&88GMM!$4588G%,ZZ0@0@0F0F%G"#-  ,).G]3KQR)S)V)VW[WgWgWmWm)n&',yy*>N1OUV'W$!--"4%_%*7$,+/+A+A % . 	 	
 (+>W\\!_+>(>?
 !(..--j!WRW-XYZ[==M1~~))668")**]";'3&(O? 9-3Xa[*9';D!Q'X$.229gFG$4$8$8-$XM I**A9I/IqSTuX\XfXfXlXlNlpqNq '') LLNg3#4	#l "&("E33GZHG"hhkk66;;Ar1aHKKGNN\c\i\ijO"ZZ(?(?(D(DQAq(QTXT\T\TcTcTrTr(rsvvN .@G..w7GHHOOGO?BE((44U4TE 	##%8O"%00[4	# 4	#s   H-]"	]""],r  )r   )Ng333333?N)Nr   Nr{   r   )NN):__name__
__module____qualname____doc__model_cpu_offload_seqr  r   r   r   r
   r   r~   staticmethodr   r	   r   r   r   r   r   rS   r  r   Tensorr   r   r   r   r   r   r   r   r<   r=   rF   r   r   r  	Generatorr  r  r  r#  propertyr'  r*  r-  r0  r3  no_gradr   EXAMPLE_DOC_STRINGboolr   r   r   rj  __classcell__)r_   s   @r5   rr   rr      s   & =(/:C2C  C 7	C
 !C -C: 
 (,)-#&,*6161 1 c49n%1 $	1
 &1 !1 1 #3i1 1f  +/$<<$%,,'$ $$  >  / ELL)/ /  / b       *ELL * *$u||J\ * *8 MQ!#< c49n%<  d399??+T$syy2G-HHI<  	< 
 <  
c< B *.%&04#&.:!'c49n%!' &!'  #	!'
  -!' !!' "'s!'Fu||  , +/# ??# %,,'#B /U\\" / ??	 /N +/#bJ $ $ , , # # & &   U]]_12 JN(, $##%(,*-%&MQ*.04%* 59KO9B#&.:.2)_1d399??3SYY__DEF_1 c49n%_1 	_1
 }_1 !_1 e%_1 !_1  #_1 E%//43H"HIJ_1 %,,'_1  -_1 c]_1 _1 #4S>2_1  'xc40@$0F'GH!_1" -1I#_1$ !%_1& "'s'_1( ',)_1 3 _1rQ   rr   )NNNN)Nrj   )>rZ   typingr   r   r   r   r   r   r	   numpyrJ  r<   r   transformersr
   r   loadersr   modelsr   r   
schedulersr   utilsr   r   r   utils.torch_utilsr   pipeline_utilsr   r7   r   pipeline_outputr   system_messagesr   r   r   torch_xla.core.xla_modelcore	xla_modelrY  rX  
get_loggerrk  r  ru  r   r   r=   r6   r  rB   rF   rP   rS   rd   rq  rr  rp   rr   rW   rQ   r5   <module>r     s    D D D  
  H + A 9 O O - . 0 0 i i ))MM 
		H	%   #  )RV>#Y>> U4		0$tCIIOO7L2MMNO>Fciioo&'$syy*??(  # 
$syy
 	6   * *.15%)$(8*!#8* U3,-.8* S	"	8*
 T%[!8*z ck
TLL
T-5eoo-F
T\_
TM1%'; M1rQ   