
    iT                        d dl Z d dlZd dlmZ d dlZd dlmZ ddl	m
Z
  e
j                  e      ZddefdZej                   j"                  j$                  ddfdedefd	Z G d
 dej(                        Z G d dej(                        Z G d dej(                        Z G d dej(                        Z G d dej(                        Zy)    N   )logging   key_chunk_sizec                     j                   dd \  }j                   d   t        |       t        j                        z   t	        j
                  t        j                  d      fd        fd}t        j                  j                  |t        j                  d|      	      \  }}}	t        j                  |	dd
      }
t        j                  |	|
z
        }|t        j                  |d      z  }||z  }|j                  d      }t        j                  |d      j                  d      }||z  S )zBMulti-head dot product attention with a limited number of queries.NF)prevent_csec                 \   t        j                  d| |      }t        j                  |dd      }t        j                  j                  |      }t        j                  ||z
        }t        j                  d||      }t        j                  d|      }||j                  d      |fS )	Nz...qhd,...khd->...qhk)	precisionr	   Taxiskeepdimsz...vhf,...qhv->...qhfz...qhk->...qhr   )jnpeinsummaxjaxlaxstop_gradientexpsum)querykeyvalueattn_weights	max_scoreexp_weights
exp_valuesr   s          i/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/diffusers/models/attention_flax.pysummarize_chunkz/_query_chunk_attention.<locals>.summarize_chunk#   s    zz"95#QZ[GGLrDA	GG)))4	gglY67ZZ 7W`a
JJ	:	KOOO4i@@    c           	      l   t         j                  j                  dgj                  dz
  z  | ddgz   t	        j
                  d d       gz         }t         j                  j                  
dg
j                  dz
  z  | ddgz   t	        
j
                  d d       	gz         } ||      S )Nr      r   operandstart_indicesslice_sizes)r   r   dynamic_slicendimlistshape)	chunk_idx	key_chunkvalue_chunk
k_featuresr   r   	num_headsr   r!   
v_featuresr   s      r    chunk_scannerz-_query_chunk_attention.<locals>.chunk_scanner0   s    GG))#A.)Q1BBSYYs^,	:/VV * 
	 gg++#a0Iq!3DDU[["-..)Z1XX , 
 ui==r"   r   )fxsTr   r   )r,   minr   sqrt	functoolspartialr   
checkpointr   maparanger   r   expand_dimsr   )r   r   r   r   r   num_kvr3   chunk_valueschunk_weights	chunk_max
global_max	max_diffs
all_valuesall_weightsr0   r1   r!   r2   s   `````         @@@@r    _query_chunk_attentionrF      s$   $'IIbcN!FIzRJ0NCHHZ((Es~~59
A :
A> >" .1WW[[=SZZXY[acqMr[-s*L-T:J	J./ICOOIB77LYM!!q!)J//-488a8@K##r"   i   query_chunk_sizec           	          	
  j                   dd \  
		
 f	d}t        j                  j                  |ddt	        j
                  
z              \  }}t        j                  |d      S )a  
    Flax Memory-efficient multi-head dot product attention. https://huggingface.co/papers/2112.05682v2
    https://github.com/AminRezaei0x443/memory-efficient-attention

    Args:
        query (`jnp.ndarray`): (batch..., query_length, head, query_key_depth_per_head)
        key (`jnp.ndarray`): (batch..., key_value_length, head, query_key_depth_per_head)
        value (`jnp.ndarray`): (batch..., key_value_length, head, value_depth_per_head)
        precision (`jax.lax.Precision`, *optional*, defaults to `jax.lax.Precision.HIGHEST`):
            numerical precision for computation
        query_chunk_size (`int`, *optional*, defaults to 1024):
            chunk size used to split the query array; the value must divide query_length equally without remainder
        key_chunk_size (`int`, *optional*, defaults to 4096):
            chunk size used to split the key and value arrays; the value must divide key_value_length equally without remainder

    Returns:
        (`jnp.ndarray`) with shape of (batch..., query_length, head, value_depth_per_head)
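
    Example:
        A rough usage sketch (the shapes are illustrative). With `query_length = 4096` and the default
        `query_chunk_size = 1024`, the query is processed in four chunks; each chunk streams over the key/value
        arrays `key_chunk_size` entries at a time using a numerically stable, chunked softmax:

            import jax.numpy as jnp

            q = jnp.ones((2, 4096, 8, 64))  # (batch, query_length, head, head_dim)
            k = jnp.ones((2, 4096, 8, 64))  # (batch, key_value_length, head, head_dim)
            v = jnp.ones((2, 4096, 8, 64))  # (batch, key_value_length, head, head_dim)

            out = jax_memory_efficient_attention(q, k, v)  # shape (2, 4096, 8, 64)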
    """
    num_q, num_heads, q_features = query.shape[-3:]

    def chunk_scanner(chunk_idx, _):
        # slice out one chunk of the query array
        query_chunk = jax.lax.dynamic_slice(
            operand=query,
            start_indices=([0] * (query.ndim - 3)) + [chunk_idx, 0, 0],  # [...,q,h,d]
            slice_sizes=list(query.shape[:-3]) + [min(query_chunk_size, num_q), num_heads, q_features],  # [...,q,h,d]
        )

        return (
            chunk_idx + query_chunk_size,  # carry; unused downstream
            _query_chunk_attention(
                query=query_chunk, key=key, value=value, precision=precision, key_chunk_size=key_chunk_size
            ),
        )

    _, res = jax.lax.scan(
        f=chunk_scanner,
        init=0,
        xs=None,
        length=math.ceil(num_q / query_chunk_size),
    )

    return jnp.concatenate(res, axis=-3)  # fuse the chunked result back together


class FlaxAttention(nn.Module):
    r"""
    A Flax multi-head attention module as described in: https://huggingface.co/papers/1706.03762

    Parameters:
        query_dim (:obj:`int`):
            Input hidden states dimension
        heads (:obj:`int`, *optional*, defaults to 8):
            Number of heads
        dim_head (:obj:`int`, *optional*, defaults to 64):
            Hidden states dimension inside each head
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://huggingface.co/papers/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
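
    Example:
        A minimal usage sketch (shapes and PRNG handling are illustrative). `FlaxAttention` is a regular Flax
        module, so parameters are created with `init` and used with `apply`:

            import jax
            import jax.numpy as jnp

            attn = FlaxAttention(query_dim=320, heads=8, dim_head=40)
            hidden_states = jnp.ones((1, 64 * 64, 320))  # (batch, sequence, query_dim)

            params = attn.init(jax.random.PRNGKey(0), hidden_states)
            out = attn.apply(params, hidden_states)  # same shape as `hidden_states`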

    """

    query_dim: int
    heads: int = 8
    dim_head: int = 64
    dropout: float = 0.0
    use_memory_efficient_attention: bool = False
    split_head_dim: bool = False
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        logger.warning(
            "Flax classes are deprecated and will be removed in Diffusers v1.0.0. We "
            "recommend migrating to PyTorch classes or pinning your version of Diffusers."
        )
        inner_dim = self.dim_head * self.heads
        self.scale = self.dim_head**-0.5

        # Weights were exported with old names {to_q, to_k, to_v, to_out}
        self.query = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_q")
        self.key = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_k")
        self.value = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_v")

        self.proj_attn = nn.Dense(self.query_dim, dtype=self.dtype, name="to_out_0")
        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def reshape_heads_to_batch_dim(self, tensor):
        batch_size, seq_len, dim = tensor.shape
        head_size = self.heads
        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
        tensor = jnp.transpose(tensor, (0, 2, 1, 3))
        tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
        return tensor

    def reshape_batch_dim_to_heads(self, tensor):
        batch_size, seq_len, dim = tensor.shape
        head_size = self.heads
        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
        tensor = jnp.transpose(tensor, (0, 2, 1, 3))
        tensor = tensor.reshape(batch_size // head_size, seq_len, dim * head_size)
        return tensor

    def __call__(self, hidden_states, context=None, deterministic=True):
        context = hidden_states if context is None else context

        query_proj = self.query(hidden_states)
        key_proj = self.key(context)
        value_proj = self.value(context)

        if self.split_head_dim:
            b = hidden_states.shape[0]
            query_states = jnp.reshape(query_proj, (b, -1, self.heads, self.dim_head))
            key_states = jnp.reshape(key_proj, (b, -1, self.heads, self.dim_head))
            value_states = jnp.reshape(value_proj, (b, -1, self.heads, self.dim_head))
        else:
            query_states = self.reshape_heads_to_batch_dim(query_proj)
            key_states = self.reshape_heads_to_batch_dim(key_proj)
            value_states = self.reshape_heads_to_batch_dim(value_proj)

        if self.use_memory_efficient_attention:
            query_states = query_states.transpose(1, 0, 2)
            key_states = key_states.transpose(1, 0, 2)
            value_states = value_states.transpose(1, 0, 2)

            # this if statement creates a chunk size for each layer of the unet
            # the chunk size is equal to the query_length dimension of the deepest layer of the unet
            flatten_latent_dim = query_states.shape[-3]
            if flatten_latent_dim % 64 == 0:
                query_chunk_size = int(flatten_latent_dim / 64)
            elif flatten_latent_dim % 16 == 0:
                query_chunk_size = int(flatten_latent_dim / 16)
            elif flatten_latent_dim % 4 == 0:
                query_chunk_size = int(flatten_latent_dim / 4)
            else:
                query_chunk_size = int(flatten_latent_dim)

            hidden_states = jax_memory_efficient_attention(
                query_states, key_states, value_states, query_chunk_size=query_chunk_size, key_chunk_size=4096 * 4
            )

            hidden_states = hidden_states.transpose(1, 0, 2)
        else:
            # compute attention scores
            if self.split_head_dim:
                attention_scores = jnp.einsum("b t n h, b f n h -> b n f t", key_states, query_states)
            else:
                attention_scores = jnp.einsum("b i d, b j d->b i j", query_states, key_states)

            attention_scores = attention_scores * self.scale
            attention_probs = nn.softmax(attention_scores, axis=-1 if self.split_head_dim else 2)

            # attend to values
            if self.split_head_dim:
                hidden_states = jnp.einsum("b n f t, b t n h -> b f n h", attention_probs, value_states)
                b = hidden_states.shape[0]
                hidden_states = jnp.reshape(hidden_states, (b, -1, self.heads * self.dim_head))
            else:
                hidden_states = jnp.einsum("b i j, b j d -> b i d", attention_probs, value_states)
                hidden_states = self.reshape_batch_dim_to_heads(hidden_states)

        hidden_states = self.proj_attn(hidden_states)
        return self.dropout_layer(hidden_states, deterministic=deterministic)


class FlaxBasicTransformerBlock(nn.Module):
    r"""
    A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
    https://huggingface.co/papers/1706.03762


    Parameters:
        dim (:obj:`int`):
            Inner hidden states dimension
        n_heads (:obj:`int`):
            Number of heads
        d_head (:obj:`int`):
            Hidden states dimension inside each head
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        only_cross_attention (`bool`, defaults to `False`):
            Whether to only apply cross attention.
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://huggingface.co/papers/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    """

    dim: int
    n_heads: int
    d_head: int
    dropout: float = 0.0
    only_cross_attention: bool = False
    dtype: jnp.dtype = jnp.float32
    use_memory_efficient_attention: bool = False
    split_head_dim: bool = False

    def setup(self):
        logger.warning(
            "Flax classes are deprecated and will be removed in Diffusers v1.0.0. We "
            "recommend migrating to PyTorch classes or pinning your version of Diffusers."
        )
        # self attention (or cross attention if only_cross_attention is True)
        self.attn1 = FlaxAttention(
            self.dim,
            self.n_heads,
            self.d_head,
            self.dropout,
            self.use_memory_efficient_attention,
            self.split_head_dim,
            dtype=self.dtype,
        )
        # cross attention
        self.attn2 = FlaxAttention(
            self.dim,
            self.n_heads,
            self.d_head,
            self.dropout,
            self.use_memory_efficient_attention,
            self.split_head_dim,
            dtype=self.dtype,
        )
        self.ff = FlaxFeedForward(dim=self.dim, dropout=self.dropout, dtype=self.dtype)
        self.norm1 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
        self.norm2 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
        self.norm3 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def __call__(self, hidden_states, context, deterministic=True):
        # self attention
        residual = hidden_states
        if self.only_cross_attention:
            hidden_states = self.attn1(self.norm1(hidden_states), context, deterministic=deterministic)
        else:
            hidden_states = self.attn1(self.norm1(hidden_states), deterministic=deterministic)
        hidden_states = hidden_states + residual

        # cross attention
        residual = hidden_states
        hidden_states = self.attn2(self.norm2(hidden_states), context, deterministic=deterministic)
        hidden_states = hidden_states + residual

        # feed forward
        residual = hidden_states
        hidden_states = self.ff(self.norm3(hidden_states), deterministic=deterministic)
        hidden_states = hidden_states + residual

        return self.dropout_layer(hidden_states, deterministic=deterministic)


class FlaxTransformer2DModel(nn.Module):
    r"""
    A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
    https://huggingface.co/papers/1506.02025


    Parameters:
        in_channels (:obj:`int`):
            Input number of channels
        n_heads (:obj:`int`):
            Number of heads
        d_head (:obj:`int`):
            Hidden states dimension inside each head
        depth (:obj:`int`, *optional*, defaults to 1):
            Number of transformer blocks
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        use_linear_projection (`bool`, defaults to `False`):
            Whether to use a Dense (linear) layer instead of a 1x1 convolution for the input and output projections
        only_cross_attention (`bool`, defaults to `False`):
            Whether the transformer blocks only apply cross attention (passed through to each block)
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://huggingface.co/papers/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    """

    in_channels: int
    n_heads: int
    d_head: int
    depth: int = 1
    dropout: float = 0.0
    use_linear_projection: bool = False
    only_cross_attention: bool = False
    dtype: jnp.dtype = jnp.float32
    use_memory_efficient_attention: bool = False
    split_head_dim: bool = False

    def setup(self):
        logger.warning(
            "Flax classes are deprecated and will be removed in Diffusers v1.0.0. We "
            "recommend migrating to PyTorch classes or pinning your version of Diffusers."
        )
        self.norm = nn.GroupNorm(num_groups=32, epsilon=1e-5)

        inner_dim = self.n_heads * self.d_head
        if self.use_linear_projection:
            self.proj_in = nn.Dense(inner_dim, dtype=self.dtype)
        else:
            self.proj_in = nn.Conv(
                inner_dim,
                kernel_size=(1, 1),
                strides=(1, 1),
                padding="VALID",
                dtype=self.dtype,
            )

        self.transformer_blocks = [
            FlaxBasicTransformerBlock(
                inner_dim,
                self.n_heads,
                self.d_head,
                dropout=self.dropout,
                only_cross_attention=self.only_cross_attention,
                dtype=self.dtype,
                use_memory_efficient_attention=self.use_memory_efficient_attention,
                split_head_dim=self.split_head_dim,
            )
            for _ in range(self.depth)
        ]

        if self.use_linear_projection:
            self.proj_out = nn.Dense(inner_dim, dtype=self.dtype)
        else:
            self.proj_out = nn.Conv(
                inner_dim,
                kernel_size=(1, 1),
                strides=(1, 1),
                padding="VALID",
                dtype=self.dtype,
            )

        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def __call__(self, hidden_states, context, deterministic=True):
        batch, height, width, channels = hidden_states.shape
        residual = hidden_states
        hidden_states = self.norm(hidden_states)
        if self.use_linear_projection:
            hidden_states = hidden_states.reshape(batch, height * width, channels)
            hidden_states = self.proj_in(hidden_states)
        else:
            hidden_states = self.proj_in(hidden_states)
            hidden_states = hidden_states.reshape(batch, height * width, channels)

        for transformer_block in self.transformer_blocks:
            hidden_states = transformer_block(hidden_states, context, deterministic=deterministic)

        if self.use_linear_projection:
            hidden_states = self.proj_out(hidden_states)
            hidden_states = hidden_states.reshape(batch, height, width, channels)
        else:
            hidden_states = hidden_states.reshape(batch, height, width, channels)
            hidden_states = self.proj_out(hidden_states)

        hidden_states = hidden_states + residual
        return self.dropout_layer(hidden_states, deterministic=deterministic)


class FlaxFeedForward(nn.Module):
    r"""
    Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's
    [`FeedForward`] class, with the following simplifications:
    - The activation function is currently hardcoded to a gated linear unit from:
    https://huggingface.co/papers/2002.05202
    - `dim_out` is equal to `dim`.
    - The number of hidden dimensions is hardcoded to `dim * 4` in [`FlaxGEGLU`].
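
    Put together, the forward pass is roughly the following (illustrative sketch; dropout layers omitted):

        hidden = Dense(dim * 4 * 2)(x)             # FlaxGEGLU projection
        linear, gate = split(hidden, 2, axis=-1)   # two halves of width dim * 4
        out = Dense(dim)(linear * gelu(gate))      # project back down to `dim`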

    Parameters:
        dim (:obj:`int`):
            Inner hidden states dimension
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
    """

    dim: int
    dropout: float = 0.0
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        logger.warning(
            "Flax classes are deprecated and will be removed in Diffusers v1.0.0. We "
            "recommend migrating to PyTorch classes or pinning your version of Diffusers."
        )
        # The second linear layer needs to be called
        # net_2 for now to match the index of the Sequential layer
        self.net_0 = FlaxGEGLU(self.dim, self.dropout, self.dtype)
        self.net_2 = nn.Dense(self.dim, dtype=self.dtype)

    def __call__(self, hidden_states, deterministic=True):
        hidden_states = self.net_0(hidden_states, deterministic=deterministic)
        hidden_states = self.net_2(hidden_states)
        return hidden_states


class FlaxGEGLU(nn.Module):
    r"""
    Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
    https://huggingface.co/papers/2002.05202.

    Parameters:
        dim (:obj:`int`):
            Input hidden states dimension
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
    """

    dim: int
    dropout: float = 0.0
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        logger.warning(
            "Flax classes are deprecated and will be removed in Diffusers v1.0.0. We "
            "recommend migrating to PyTorch classes or pinning your version of Diffusers."
        )
        inner_dim = self.dim * 4
        self.proj = nn.Dense(inner_dim * 2, dtype=self.dtype)
        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def __call__(self, hidden_states, deterministic=True):
        hidden_states = self.proj(hidden_states)
        hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=2)
        return self.dropout_layer(hidden_linear * nn.gelu(hidden_gelu), deterministic=deterministic)
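

if __name__ == "__main__":
    # Illustrative smoke-test sketch (a documentation aid assuming standard Flax `init`/`apply` usage):
    # `FlaxTransformer2DModel` takes an NHWC feature map plus a context sequence (e.g. text embeddings)
    # and returns a feature map of the same shape.
    model = FlaxTransformer2DModel(in_channels=320, n_heads=8, d_head=40, depth=1)

    hidden_states = jnp.ones((1, 64, 64, 320))  # (batch, height, width, channels)
    context = jnp.ones((1, 77, 768))  # (batch, tokens, context_dim)

    params = model.init(jax.random.PRNGKey(0), hidden_states, context)
    out = model.apply(params, hidden_states, context)
    print(out.shape)  # (1, 64, 64, 320)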