
    i;+                     ^   d dl mZ d dlmZmZmZmZmZmZm	Z	 d dl
Z
ddlmZ er	  ee      Ze G d d             Ze G d d	             Z ed
       G d d             Z ed
       G d d             Zee	eef   e	eee   eedf   f   f   Ze	eee   eedf   f   Zeee	eef   f   Zy)    )	dataclass)TYPE_CHECKINGDictListLiteralOptionalTupleUnionN   )
get_loggerc                      e Zd ZU dZdZee   ed<   dZee   ed<   dZ	e
ed<   dZed   ed	<   dZeed
<   dZeed<   dZej"                  ed<   dZej&                  j(                  j*                  ed<   dZej&                  j(                  j*                  ed<   dZej&                  j(                  j*                  ed<   dZej&                  j(                  j*                  ed<   dZeed<   dZeed<   d Zedeeef   fd       Zedeeef   fd       Z dededej"                  dej&                  j(                  j*                  fdZ!y)ContextParallelConfiga  
    Configuration for context parallelism.

    Args:
        ring_degree (`int`, *optional*, defaults to `1`):
            Number of devices to use for Ring Attention. Sequence is split across devices. Each device computes
            attention between its local Q and KV chunks passed sequentially around ring. Lower memory (only holds 1/N
            of KV at a time), overlaps compute with communication, but requires N iterations to see all tokens. Best
            for long sequences with limited memory/bandwidth. Number of devices to use for ring attention within a
            context parallel region. Must be a divisor of the total number of devices in the context parallel mesh.
        ulysses_degree (`int`, *optional*, defaults to `1`):
            Number of devices to use for Ulysses Attention. Sequence split is across devices. Each device computes
            local QKV, then all-gathers all KV chunks to compute full attention in one pass. Higher memory (stores all
            KV), requires high-bandwidth all-to-all communication, but lower latency. Best for moderate sequences with
            good interconnect bandwidth.
        convert_to_fp32 (`bool`, *optional*, defaults to `True`):
            Whether to convert output and LSE to float32 for ring attention numerical stability.
        rotate_method (`str`, *optional*, defaults to `"allgather"`):
            Method to use for rotating key/value states across devices in ring attention. Currently, only `"allgather"`
            is supported.

    Nring_degreeulysses_degreeTconvert_to_fp32	allgather)r   alltoallrotate_method_rank_world_size_device_mesh_flattened_mesh
_ring_mesh_ulysses_mesh_ring_local_rank_ulysses_local_rankc                    | j                   d| _         | j                  d| _        | j                   dk(  r| j                  dk(  rt        d      | j                   dk  s| j                  dk  rt        d      | j                   dkD  r| j                  dkD  rt        d      | j                  dk7  rt	        d| j                   d      y )N   zfEither ring_degree or ulysses_degree must be greater than 1 in order to use context parallel inferencezF`ring_degree` and `ulysses_degree` must be greater than or equal to 1.znUnified Ulysses-Ring attention is not yet supported. Please set either `ring_degree` or `ulysses_degree` to 1.r   z=Only rotate_method='allgather' is supported for now, but got .)r   r   
ValueErrorr   NotImplementedErrorselfs    m/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/diffusers/models/_modeling_parallel.py__post_init__z#ContextParallelConfig.__post_init__Q   s    # D&"#Dq T%8%8A%=x  a4#6#6#:effaD$7$7!$; A  ,%OPTPbPbOccde  -    returnc                 2    | j                   | j                  fS N)r   r   r#   s    r%   
mesh_shapez ContextParallelConfig.mesh_shapef   s      $"5"566r'   c                      y)z$Dimension names for the device mesh.)ringulysses r#   s    r%   mesh_dim_namesz$ContextParallelConfig.mesh_dim_namesj   s     #r'   rank
world_sizedevicemeshc           	         || _         || _        || _        || _        | j                  | j
                  z  |kD  r)t        d| j
                   d| j                   d| d      | j                  j                         | _        | j                  d   | _	        | j                  d   | _
        | j                  j                         | _        | j                  j                         | _        y )NzThe product of `ring_degree` (z) and `ulysses_degree` (z") must not exceed the world size (z).r-   r.   )r   r   r   r   r   r   r!   _flattenr   r   r   get_local_rankr   r   r$   r1   r2   r3   r4   s        r%   setupzContextParallelConfig.setupo   s    
%
!1!11J>01A1A0BBZ[_[n[nZo  pR  S]  R^  ^`  a   $zz224**V,!ZZ	2 $ > > @#'#5#5#D#D#F r'   )"__name__
__module____qualname____doc__r   r   int__annotations__r   r   boolr   r   r   r   r   torchr3   r   distributeddevice_mesh
DeviceMeshr   r   r   r   r   r&   propertyr	   r+   strr0   r9   r/   r'   r%   r   r   (   sn   . "&K#%$(NHSM( OT 6AM723AE3K GU\\ 6:E5((33:@DOU&&22==D;?J!!--88?>BM5$$00;;B c ##* 7E#s(O 7 7 #c3h # #G# G3 G GEL]L]LiLiLtLt Gr'   r   c            
       "   e Zd ZU dZdZee   ed<   dZe	ed<   dZ
e	ed<   dZej                  ed<   dZej                  j                   j"                  ed<   ddd	e	d
e	dej                  deej                  j                   j"                     fdZy)ParallelConfigz
    Configuration for applying different parallelisms.

    Args:
        context_parallel_config (`ContextParallelConfig`, *optional*):
            Configuration for context parallelism.
    Ncontext_parallel_configr   r   r   r   )r4   r1   r2   r3   r4   c                    || _         || _        || _        || _        | j                  | j                  j                  ||||       y y r*   )r   r   r   r   rI   r9   r8   s        r%   r9   zParallelConfig.setup   sL     
%
''3((..tZN 4r'   )r:   r;   r<   r=   rI   r   r   r?   r   r>   r   r   rA   r3   r   rB   rC   rD   r9   r/   r'   r%   rH   rH      s     @DX&;<CE3K GU\\ 6:E5((33: DHOO O 	O u((44??@Or'   rH   T)frozenc                   D    e Zd ZU dZeed<   dZee   ed<   dZe	ed<   d Z
y)ContextParallelInputa  
    Configuration for splitting an input tensor across context parallel region.

    Args:
        split_dim (`int`):
            The dimension along which to split the tensor.
        expected_dims (`int`, *optional*):
            The expected number of dimensions of the tensor. If provided, a check will be performed to ensure that the
            tensor has the expected number of dimensions before splitting.
        split_output (`bool`, *optional*, defaults to `False`):
            Whether to split the output tensor of the layer along the given `split_dim` instead of the input tensor.
            This is useful for layers whose outputs should be split after it does some preprocessing on the inputs (ex:
            RoPE).
    	split_dimNexpected_dimsFsplit_outputc                 V    d| j                    d| j                   d| j                   dS )NzContextParallelInput(split_dim=, expected_dims=z, split_output=))rN   rO   rP   r#   s    r%   __repr__zContextParallelInput.__repr__   sK    00@@PQUQcQcPddstx  uF  uF  tG  GH  I  	Ir'   )r:   r;   r<   r=   r>   r?   rO   r   rP   r@   rT   r/   r'   r%   rM   rM      s,     N#'M8C='L$Ir'   rM   c                   6    e Zd ZU dZeed<   dZee   ed<   d Zy)ContextParallelOutputa  
    Configuration for gathering an output tensor across context parallel region.

    Args:
        gather_dim (`int`):
            The dimension along which to gather the tensor.
        expected_dims (`int`, *optional*):
            The expected number of dimensions of the tensor. If provided, a check will be performed to ensure that the
            tensor has the expected number of dimensions before gathering.
    
gather_dimNrO   c                 <    d| j                    d| j                   dS )Nz!ContextParallelOutput(gather_dim=rR   rS   )rW   rO   r#   s    r%   rT   zContextParallelOutput.__repr__   s&    24??2CCSTXTfTfSgghiir'   )	r:   r;   r<   r=   r>   r?   rO   r   rT   r/   r'   r%   rV   rV      s"    	 O#'M8C='jr'   rV   .)dataclassesr   typingr   r   r   r   r   r	   r
   rA   utilsr   r:   loggerr   rH   rM   rV   rF   r>   ContextParallelInputTypeContextParallelOutputTypeContextParallelModelPlanr/   r'   r%   <module>r`      s/  $ " M M M    
H	 UG UG UGp O O O@ $I I I0 $j j j0  	#s(OU/6J1KUSgilSlMmmnn  "4 56>SUX>X8YY   U+CE^+^%_ _` r'   