
    i7             $       :*   U d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZmZmZ d dlZej$                  j'                         r	d dlmc mZ ddlmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z) e	rdd	l*m+Z+ d
Z,dZ-dZ.dZ/dZ0dZ1 e       xr	  ede,      Z2 e       Z3 e       xr	  ede-      Z4 e       xr	  e de.      Z5 e"de/      Z6 e!       Z7 e#       xr	  e$de0      Z8 e%       xr	  e&de1      Z9e2rd dl:m;Z;m<Z< d dl=m>Z>m?Z? ndZ;dZ<dZ>dZ?e3rd dl@m;ZA d dl@m<ZB ndZAdZBe4rd dlCm;ZD ndZDe5rd dlEmFZFmGZGmHZHmIZImJZJmKZK ndZFdZIdZJdZGdZHdZKe6rd dlLmMc mNc mOZO e7rd dlPmQZQ ndZQe8rd dlRmSZT ndZTe9rd dlUmVZW ndZWej                  dk\  r-ej                  j                  Z[ej                  j                  Z]ndddddZ^dddddZ_e^Z[e_Z] ee`      Za G d debe      Zc G d d       Zde G d! d"             Zeecj                   eed#d$d%&      ecj                   eed#d'(      ecj                   eed)d$d&      ecj                   eed)d'd&      ecj                   eed*d+d&      iZkedeef   eld,<   e j                  ecj                  fd-eebecf   fd.       Zo	 	 	 	 	 	 dddd0d1ej                  d2ej                  d3ej                  d4eej                     d5eqd6erd7eeq   d8erd9eeebe
f      d-eec   d:ed;   d<ej                  fd=Zsd4eej                     d6erd<dfd>Ztd1ej                  d2ej                  d3ej                  d<dfd?Zud1ej                  d2ej                  d3ej                  d<dfd@ZvdAewdBewd<efdCZxd1ej                  d2ej                  d3ej                  d<dfdDZyd1ej                  d2ej                  d3ej                  d<dfdEZz	 dd1ej                  d2ej                  d3ej                  d4eej                     d<df
dFZ{d-ecd<dfdGZ| ej                  dHI      	 ddJewdKewdLewdMeej                     fdN       Z	 ddJewdKewd4ej                  dMeej                     fdOZ	 	 ddJewdKewdLewd4eej                     dMeej                     d<dfdPZd4ej                  dJewdQewd<ej                  fdRZdS Zd-ecd<dfdTZ e[dUdVdWX      	 	 	 	 	 	 	 	 	 	 	 	 ddYej                  dZej                  d[ej                  d\eeq   d]erd^eej                     d_eej                     d`eej                     daeej                     dbewdceqddewdeeer   dferdgewd<eej                  ej                  f   f dh       Z e]dU      	 	 	 	 	 	 	 	 	 	 	 	 ddYej                  dZej                  d[ej                  d\eeq   d]erd^eej                     d_eej                     d`eej                     daeej                     dbewdceqddewdeeer   dferdgewd<eej                  ej                  f   f di       Z	 	 	 	 	 	 	 	 ddkej                  j                  j                  d1ej                  d2ej                  d3ej                  d4eej                     d5eqd6erd7eeq   d8erdlerdmerdned;   fdoZdkej                  j                  j                  dpej                  fdqZ	 	 	 	 	 	 	 	 ddkej                  j                  j                  d1ej                  d2ej                  d3ej                  d4eej                     d5eqd6erd7eeq   d8erdlerdmerdned;   fdrZdkej                  j                  j                  dpej                  fdsZ	 	 	 	 	 	 	 	 ddkej                  j                  j                  d1ej                  d2ej                  d3ej                  d4eej                     d5eqd6erd7eeq   d8erdlerdmerdned;   fdtZdkej                  j                  j                  dpej                  fduZ	 	 	 	 	 	 	 	 ddkej                  j                  j                  d1ej                  d2ej                  d3ej                  d4eej                     d5eqd6erd7eeq   d8erdlerdmerdned;   fdvZdkej                  j                  j                  dpej                  fdwZdx Zdyej                  d<ej                  fdzZ G d{ d|ej                  j(                        Z G d} d~ej                  j(                        Z	 	 	 	 	 	 dddd1ej                  d2ej                  d3ej                  d4eej                     d5eqd6erd7eeq   d8erdlerdned;   fdZedj1                  ecj2                  eueze{gdj      	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d5eqd6erd7eeq   dlerdned;   d<ej                  fd       Zedj1                  ecj                  eueze{gd/      	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d5eqd6erd7eeq   dlerdned;   d<ej                  fd       Zedj1                  ecj                  eueze{gd/      	 	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d4eej                     d5eqd7eeq   d6erdlerdned;   d<ej                  fd       Zedj1                  ecj:                  eueze{g      	 	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d4eej                     d5eqd7eeq   d6erdlerdned;   d<ej                  fd       Zedj1                  ecj>                  eueze{g      	 	 	 	 dd1ej                  d2ej                  d3ej                  d7eeq   d6erdlerdned;   d<ej                  fd       Zedj1                  ecj                  eueze{gd/      	 	 	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d7eeq   d6erdeewewf   dceqdferderdned;   d<ej                  fd       Zedj1                  ecj                  eueze{gd/      	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d4eej                     d7eeq   d6erdlerdned;   d<ej                  fd       Zedj1                  ecjF                  eueze{g      	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d4eej                     d7eeq   d6erdlerdned;   d<ej                  fd       Zedj1                  ecjJ                  eveze{g      	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d5eqd6erd7eeq   dlerdned;   d<ej                  fd       Zedj1                  ecjN                  eteue{g      	 	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d4eeej                  df      d6erd7eeq   d8erdlerdned;   d<ej                  fd       Zedj1                  ecj                  eue{gdj      	 	 	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d4eej                     d5eqd6erd7eeq   d8erdlerdned;   d<ej                  fd       Zedj1                  ecjT                  eueze{gdj      	 	 	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d4eej                     d5eqd6erd7eeq   d8erdlerdned;   d<ej                  fd       Zedj1                  ecjX                  eue{g      	 	 	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d4eej                     d5eqd6erd7eeq   d8erdlerdned;   d<ej                  fd       Zedj1                  ecj\                  eueze{g      	 	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d5eqd6erd7eeq   d8erdlerdned;   d<ej                  fd       Zedj1                  ecj`                  eue{g      	 	 	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d4eej                     d5eqd6erd7eeq   d8erdlerdned;   d<ej                  fd       Zedj1                  ecjd                  eueze{g      	 	 	 	 dd1ej                  d2ej                  d3ej                  d5eqd7eeq   dlerdned;   d<ej                  fd       Zedj1                  ecjh                  eue{g      	 	 	 dd1ej                  d2ej                  d3ej                  d6erdlerdned;   d<ej                  fd       Zedj1                  ecjl                  eveze{gdj      	 	 	 	 dd1ej                  d2ej                  d3ej                  d6erd7eeq   dlerdned;   d<ej                  fd       Zedj1                  ecj                  eveze{gd/      	 	 	 	 dd1ej                  d2ej                  d3ej                  d6erd7eeq   dlerdned;   d<ej                  fd       Zedj1                  ecjr                  eveze{g      	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d4eej                     d6erd7eeq   dlerdned;   d<ej                  fd       Zedj1                  ecjv                   exdd       e{g      	 	 	 	 dd1ej                  d2ej                  d3ej                  d6erd7eeq   dlerdned;   d<ej                  fd       Zedj1                  ecjz                   exdd       e{g      	 	 	 	 dd1ej                  d2ej                  d3ej                  d6erd7eeq   dlerdned;   d<ej                  fd       Zedj1                  ecj~                   exdd       e{g      	 	 	 	 dd1ej                  d2ej                  d3ej                  d6erd7eeq   dlerdned;   d<ej                  fd       Zedj1                  ecj                   exdd       e{g      	 	 	 	 dd1ej                  d2ej                  d3ej                  d6erd7eeq   dlerdned;   d<ej                  fd       Zedj1                  ecj                  eteue{g      	 	 	 	 	 	 	 dd1ej                  d2ej                  d3ej                  d4eej                     d5eqd6erd7eeq   d8erdlerdned;   d<ej                  fd       Zy)    N)	dataclass)Enum)TYPE_CHECKINGAnyCallableDictListOptionalTupleUnion   )
get_loggeris_aiter_availableis_aiter_versionis_flash_attn_3_availableis_flash_attn_availableis_flash_attn_versionis_kernels_availableis_sageattention_availableis_sageattention_versionis_torch_npu_availableis_torch_versionis_torch_xla_availableis_torch_xla_versionis_xformers_availableis_xformers_version)DIFFUSERS_ATTN_BACKENDDIFFUSERS_ATTN_CHECKS   )ParallelConfigz2.6.3z0.1.5z2.1.12.5.0z2.2z0.0.29>=)flash_attn_funcflash_attn_varlen_func)_wrapped_flash_attn_backward_wrapped_flash_attn_forward)r#   )r$   )sageattnsageattn_qk_int8_pv_fp8_cuda!sageattn_qk_int8_pv_fp8_cuda_sm90sageattn_qk_int8_pv_fp16_cudasageattn_qk_int8_pv_fp16_tritonsageattn_varlen)npu_fusion_attention)flash_attentionz2.4.0)device_typesschemac                   d }||S |S )Nc                     | S N funcs    m/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/diffusers/models/attention_dispatch.pywrapzcustom_op_no_op.<locals>.wrap       K    r4   )namefnmutates_argsr/   r0   r8   s         r7   custom_op_no_opr>          	 zt)r)r:   )lib_stacklevelc                   d }||S |S )Nc                     | S r3   r4   r5   s    r7   r8   z!register_fake_no_op.<locals>.wrap   r9   r:   r4   )opr<   r@   rA   r8   s        r7   register_fake_no_oprE      r?   r:   c                   p    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZy)AttentionBackendNameflash	flash_hubflash_varlenflash_varlen_hub_flash_3_flash_varlen_3_flash_3_hub_flash_3_varlen_hubaiterflexnative_native_cudnn_native_efficient_native_flash_native_math_native_npu_native_xlasagesage_hubsage_varlen_sage_qk_int8_pv_fp8_cuda_sage_qk_int8_pv_fp8_cuda_sm90_sage_qk_int8_pv_fp16_cuda_sage_qk_int8_pv_fp16_tritonxformersN)__name__
__module____qualname__FLASH	FLASH_HUBFLASH_VARLENFLASH_VARLEN_HUB_FLASH_3_FLASH_VARLEN_3_FLASH_3_HUB_FLASH_3_VARLEN_HUBAITERFLEXNATIVE_NATIVE_CUDNN_NATIVE_EFFICIENT_NATIVE_FLASH_NATIVE_MATH_NATIVE_NPU_NATIVE_XLASAGESAGE_HUBSAGE_VARLEN_SAGE_QK_INT8_PV_FP8_CUDA_SAGE_QK_INT8_PV_FP8_CUDA_SM90_SAGE_QK_INT8_PV_FP16_CUDA_SAGE_QK_INT8_PV_FP16_TRITONXFORMERSr4   r:   r7   rG   rG      s     EI!L)H'O!L/ E DF#M+#M!LKK DHK ;%E"!=#A  Hr:   rG   c                       e Zd Zi Zi Zi Z e       Z ee	      Z
eZe	 	 d
dedeee      defd       Zed        Zed        Zededefd	       Zy)_AttentionBackendRegistryNbackendconstraintssupports_context_parallelc                 T     t         j                  d d         fd}|S )NzRegistering attention backend: z with constraints: c                     | j                   <   xs g j                  <   t        t        j                  |       j
                  j                               j                  <   r%j                  j                  j                         | S r3   )	_backends_constraintssetinspect	signature
parameterskeys_supported_arg_names_supports_context_paralleladdvalue)r6   r   clsr   r   s    r7   	decoratorz5_AttentionBackendRegistry.register.<locals>.decorator   su    %)CMM'"(3(9rCW%03G4E4Ed4K4V4V4[4[4]0^C$$W-(..227==AKr:   )loggerdebug)r   r   r   r   r   s   ```` r7   registerz"_AttentionBackendRegistry.register   s/     	6wi?RS^R_`a	 r:   c                 L    | j                   | j                  | j                      fS r3   )_active_backendr   r   s    r7   get_active_backendz,_AttentionBackendRegistry.get_active_backend   s"    ""CMM#2E2E$FFFr:   c                 H    t        | j                  j                               S r3   )listr   r   r   s    r7   list_backendsz'_AttentionBackendRegistry.list_backends   s    CMM&&())r:   returnc                 6    |j                   | j                  v }|S r3   )r   r   )r   r   r   s      r7   _is_context_parallel_availablez8_AttentionBackendRegistry._is_context_parallel_available   s    
 %,MMS5S5S$S!((r:   )NF)ra   rb   rc   r   r   r   r   r   rG   r   r   r   _checks_enabledclassmethodr
   r	   r   boolr   r   r   r   r4   r:   r7   r~   r~      s    IL!$*+ABO+O 15*/	% d8n- $(	 & G G * * )%) 
) )r:   r~   c                   N    e Zd ZU dZeed<   eed<   dZee   ed<   dZee	   ed<   y)_HubKernelConfigzEConfiguration for downloading and using a hub-based attention kernel.repo_idfunction_attrNrevision	kernel_fn)
ra   rb   rc   __doc__str__annotations__r   r
   r   r   r4   r:   r7   r   r      s-    OL"Hhsm"$(Ix!(r:   r   zkernels-community/flash-attn3r#   zfake-ops-return-probs)r   r   r   r$   )r   r   zkernels-community/flash-attn2z kernels-community/sage_attentionr'   _HUB_KERNELS_REGISTRYr   c              #     K   | t         j                  vrt        d|  d      t        |       } t	        |        t        |        t         j                  }| t         _        	 d |t         _        y# |t         _        w xY ww)z>
    Context manager to set the active attention backend.
    zBackend z is not registered.N)r~   r   
ValueErrorrG   %_check_attention_backend_requirements"_maybe_download_kernel_for_backendr   )r   old_backends     r7   attention_backendr     su     
 /9998G9,?@AA"7+G)'2&w/+;;K07-@4?!1K!1s   AB!A1 %B1A>>BF)r   parallel_configquerykeyr   	attn_mask	dropout_p	is_causalscale
enable_gqaattention_kwargsr   r    r   c	                j   |xs i }|	t         j                         \  }}n*t        |	      }t         j                  j	                  |      }| ||||||d|d|
i}t        dd      r||d<   t         j                  rrt        |      t        t         j                  |         z
  }|rt        j                  d| d| d       t         j                  j	                  |      D ]
  } |d	i |  |j                         D ci c]  \  }}|t         j                  |   v s||  }}} |d	i |S c c}}w )
N)r   r   r   r   r   r   r   _parallel_configr"   r!   r   z5Removing unsupported arguments for attention backend z: .r4   )r~   r   rG   r   getr   r   r   r   r   warningr   items)r   r   r   r   r   r   r   r   r   r   r   backend_name
backend_fnkwargsremoved_kwargscheckkvs                     r7   dispatch_attention_fnr   1  sU    (-2 $=#O#O#Q j+G4.88<<\J
 
 
 	O
F g&)| 00Vs+D+Y+YZf+g'hhNNRS_R``bcqbrrstu.;;??M 	EOFO	  &||~stq!6O6d6deq6r1radsFs ts   D/D/c                 $    | |rt        d      y y )Nz8`is_causal` cannot be True when `attn_mask` is not None.)r   )r   r   r   s      r7   _check_attn_mask_or_causalr   f  s    STT "+r:   c                     | j                   |j                   k7  s| j                   |j                   k7  rt        d      | j                  |j                  k7  s| j                  |j                  k7  rt        d      y )Nz1Query, key, and value must be on the same device.z/Query, key, and value must have the same dtype.)devicer   dtyper   r   r   r   s       r7   _check_devicer   k  s]    ||szz!U\\U\\%ALMM{{cii5;;%++#=JKK $>r:   c                 f    t        | ||       | j                  j                  dk7  rt        d      y )Ncudaz/Query, key, and value must be on a CUDA device.)r   r   typer   r   s       r7   _check_device_cudar   r  s1    %e$||F"JKK #r:   majorminorc                 |     dt         j                  dt         j                  dt         j                  dd f fd}|S )Nr   r   r   r   c                     t        | ||       t        j                  j                  | j                        fk  rt        d d d      y )NzJQuery, key, and value must be on a CUDA device with compute capability >= r   )r   torchr   get_device_capabilityr   r   )r   r   r   r   r   r   s       r7   check_device_cudaz:_check_device_cuda_atleast_smXY.<locals>.check_device_cuday  sY    5#u-::++ELL9UENJ\]b\ccdejdkklm  Kr:   )r   Tensor)r   r   r   s   `` r7   _check_device_cuda_atleast_smXYr   x  s9     ELL  dh  r:   c                     | j                   |j                   k7  rt        d      | j                   |j                   k7  rt        d      y )Nz'Query and key must have the same dtype.z)Query and value must have the same dtype.)r   r   r   s       r7   _check_qkv_dtype_matchr     s?    {{ciiBCC{{ekk!DEE "r:   c                     t        | ||       | j                  t        j                  t        j                  fvrt        d      y )Nz9Query, key, and value must be either bfloat16 or float16.)r   r   r   bfloat16float16r   r   s       r7   _check_qkv_dtype_bf16_or_fp16r     s8    5#u-{{5>>5==99TUU :r:   c                    | j                   d   |j                   d   k7  rt        d      |j                   d   |j                   d   k7  rt        d      |+|j                   d   |j                   d   k7  rt        d      y y )Nz0Query and key must have the same head dimension.z1Key and value must have the same sequence length.z4Attention mask must match the key's sequence length.)shaper   )r   r   r   r   r   s        r7   _check_shaper     s     {{2#))B-'KLL
yy}B'LMM!4		"!EOPP "Fr:   c                 2   | t         j                  t         j                  fv r't        s t	        d| j
                   dt         d      y | t         j                  t         j                  fv r t        st	        d| j
                   d      y | t         j                  t         j                  t         j                  t         j                  t         j                  fv r$t               st	        d| j
                   d      y | t         j                   k(  r't"        s t	        d| j
                   d	t$         d      y | t         j&                  t         j(                  t         j*                  t         j,                  t         j.                  t         j0                  fv r't2        s t	        d
| j
                   dt4         d      y | t         j6                  k(  r t8        st	        d| j
                   d      y | t         j:                  k(  r t<        st	        d| j
                   d      y | t         j>                  k(  r't@        s t	        d| j
                   dtB         d      y | t         jD                  k(  r'tF        s t	        d| j
                   dtH         d      y y )NzFlash Attention backend 'zb' is not usable because of missing package or the version is too old. Please install `flash-attn>=z`.zFlash Attention 3 backend 'zp' is not usable because of missing package or the version is too old. Please build FA3 beta release from source.z	Backend 'zl' is not usable because the `kernels` package isn't available. Please install it with `pip install kernels`.zAiter Attention backend 'z]' is not usable because of missing package or the version is too old. Please install `aiter>=zSage Attention backend 'ze' is not usable because of missing package or the version is too old. Please install `sageattention>=zFlex Attention backend 'zd' is not usable because of missing package or the version is too old. Please install `torch>=2.5.0`.zNPU Attention backend 'za' is not usable because of missing package or the version is too old. Please install `torch_npu`.zXLA Attention backend 'za' is not usable because of missing package or the version is too old. Please install `torch_xla>=zXformers Attention backend 'z`' is not usable because of missing package or the version is too old. Please install `xformers>=)%rG   rd   rf   _CAN_USE_FLASH_ATTNRuntimeErrorr   _REQUIRED_FLASH_VERSIONrh   ri   _CAN_USE_FLASH_ATTN_3re   rg   rj   rk   rv   r   rl   _CAN_USE_AITER_ATTN_REQUIRED_AITER_VERSIONru   rw   rx   ry   rz   r{   _CAN_USE_SAGE_ATTN_REQUIRED_SAGE_VERSIONrm   _CAN_USE_FLEX_ATTNrs   _CAN_USE_NPU_ATTNrt   _CAN_USE_XLA_ATTN_REQUIRED_XLA_VERSIONr|   _CAN_USE_XFORMERS_ATTN_REQUIRED_XFORMERS_VERSION)r   s    r7   r   r     s   '--/C/P/PQQ"+GMM?  ;]  ^u  ]v  vx  y  #
 
)224H4X4XY	Y$-gmm_  =m  n  %
 
&&--))00%% 
 $%GMM?  +W  X  &
 
(..	."+GMM?  ;X  Yp  Xq  qs  t  #
 
!!((66;;7799 
 "*7==/  :_  `v  _w  wy  z  "
 
(--	-!*7==/  :^  _  "
 
(44	4 )'--  9Z  [  !
 
(44	4 )'--  9Z  [p  Zq  qs  t  !
 
(11	1%.w}}o  >^  _y  ^z  z|  }  & 
2r:      )maxsize
batch_size	seq_len_q
seq_len_kvr   c                 <   t        j                  | f|t         j                  |      }t        j                  | f|t         j                  |      }t        j                  | dz   t         j                  |      }t        j                  | dz   t         j                  |      }t        j                  |d      |dd  t        j                  |d      |dd  |j                         j                         }|j                         j                         }	||f||f||	ffS )Nr   r   r   r   dim)r   fullint32zeroscumsummaxitem)
r   r   r   r   	seqlens_q	seqlens_kcu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_ks
             r7   3_prepare_for_flash_attn_or_sage_varlen_without_maskr    s     

J=)5;;vVI

J=*EKKPVWI;;zA~U[[PL;;zA~U[[PL||I15L||I15L==?'')L==?'')Ly!L,#?,P\A]]]r:   c                 .   t        j                  | f|t         j                  |      }|j                  dt         j                        }t        j                  | dz   t         j                  |      }t        j                  | dz   t         j                  |      }t        j
                  |d      |dd  t        j
                  |d      |dd  |j                         j                         }|j                         j                         }	||f||f||	ffS )Nr   r   )r   r   r   r   )r   r   r   sumr  r  r  r  )
r   r   r   r   r  r  r  r  r	  r
  s
             r7   0_prepare_for_flash_attn_or_sage_varlen_with_maskr    s     

J=)5;;vVI!5;;7I;;zA~U[[PL;;zA~U[[PL||I15L||I15L==?'')L==?'')Ly!L,#?,P\A]]]r:   c                 >    |t        | |||      S t        | |||      S r3   )r  r  )r   r   r   r   r   s        r7   &_prepare_for_flash_attn_or_sage_varlenr    s0     B:yZdflmm;J	S\^deer:   	seq_len_kc           	         | j                   t        j                  k7  rt        d| j                    d      | j                  dk(  r#| j                  d      j                  ||      } n@| j                  dk(  rG| j                  d      d|fvrt        d| j                  d    d| d      | j                  ||      } n| j                  d	k(  rY| j                  d      d|fvrt        d| j                  d    d| d
      | j                  d      } | j                  ||      } n| j                  dk(  r[| j                  d      d|fvrt        d| j                  d    d| d      | j                  |dd|      } | j                  d      } nt        d| j                         | j                  ||fk7  rt        d| j                   d| d| d      | S )z
    Normalize an attention mask to shape [batch_size, seq_len_k] (bool) suitable for inferring seqlens_[q|k] in
    FlashAttention/Sage varlen.

    Supports 1D to 4D shapes and common broadcasting patterns.
    z)Attention mask must be of type bool, got r   r   r   r   zattn_mask.shape[0] (z) must be 1 or z for 2D attention mask.   z for 3D attention mask.r      z for 4D attention mask.r   )r   r   z"Unsupported attention mask shape: z.Normalized attention mask shape mismatch: got z, expected (z, ))
r   r   r   r   ndim	unsqueezeexpandsizer   any)r   r   r  s      r7   _normalize_attn_maskr    s    %**$DY__DUUVWXX~~''*11*iH		1	>>!Q
O3&yq'9&:/*Ulm  $$Z;		1	 >>!Q
O3&yq'9&:/*Ulm  MMaM(	$$Z;		1	>>!Q
O3&yq'9&:/*Ulm  $$ZRC	MMfM-	 =ioo=NOPP:y11<Y__<M\ZdYeeghqgrrst
 	
 r:   c                     ||k\  S r3   r4   )	batch_idxhead_idxq_idxkv_idxs       r7   _flex_attention_causal_mask_modr!  P  s    F?r:   c                 4   | t         vry t         |    }|j                  y 	 ddlm}  ||j                  |j
                        }t        ||j                        }||_        y # t        $ r+}t        j                  d|j                   d|         d }~ww xY w)Nr   )
get_kernel)r   z)An error occurred while fetching kernel 'z' from the Hub: )r   r   kernelsr#  r   r   getattrr   	Exceptionr   error)r   configr#  kernel_modulekernel_funces         r7   r   r   U  s    ++"7+F#&"6>>FOOLmV-A-AB ' @@PP`ab`cdes   AA# #	B,&BBz,_diffusers_flash_attn_3::_flash_attn_forwardr4   r   )r=   r/   qr   r   softmax_scalecausalqv	q_descale	k_descale	v_descaleattention_chunksoftcap
num_splitspack_gqadeterministic	sm_marginc                     d}t        di d| d|d|d|d|d|d|d	|d
|d|d|	d|
d|d|d|d|^}}}|j                  ddd      }||fS )Nr   r   r,  r   r   r-  r.  r/  r0  r1  r2  window_sizer3  r4  r5  r6  r7  r8  r   r   r   r4   )flash_attn_3_funcpermute)r,  r   r   r-  r.  r/  r0  r1  r2  r3  r4  r5  r6  r7  r8  r;  outlse_s                      r7   _wrapped_flash_attn_3rA  o  s    & K$ 

  $	
        (    $  !LCq$ ++aA
C8Or:   c                 ~    d}| j                   \  }}}}|||f}t        j                  |       | j                  |      fS )Nr:  )r   r   
empty_like	new_empty)r,  r   r   r-  r.  r/  r0  r1  r2  r3  r4  r5  r6  r7  r8  r;  r   seq_len	num_headshead_dim	lse_shapes                        r7   r@  r@    sH    $ K 01ww,JHWi0IAI 666r:   Tctx
return_lse	_save_ctxr   c           
      :   |	rt        d      |
r6| j                  |||       || _        || _        || _        || _        || _        d |||fD        \  }}}t        j                  j                  j                  ||||||||      }|j                  dddd      }|S )Nz1Native attention does not support return_lse=Truec              3   D   K   | ]  }|j                  d ddd        ywr   r   r   r  Nr=  .0xs     r7   	<genexpr>z/_native_attention_forward_op.<locals>.<genexpr>       L11aA.L    r   r   r   r   r   r   r   r   r   r   r   r  )r   save_for_backwardr   r   r   r   r   r   nn
functionalscaled_dot_product_attentionr=  )rI  r   r   r   r   r   r   r   r   rJ  rK  r   r>  s                r7   _native_attention_forward_opr[    s     LMM eS%0!!!	#LU8KLE3
((


:
: ; 	C ++aAq
!CJr:   grad_outc           
         | j                   \  }}}|j                  d       |j                  d       |j                  d       d |||fD        \  }}}	t        j                  j                  j                  |||	| j                  | j                  | j                  | j                  | j                        }
|
j                  dddd      }
|j                  dddd      }t        j                  j                  |
|||	g|d	      \  }}}|j                  dddd      }|j                  dddd      }|j                  dddd      }|||fS )
NTc              3   D   K   | ]  }|j                  d ddd        ywrN  rO  rP  s     r7   rS  z0_native_attention_backward_op.<locals>.<genexpr>  s     RqyyAq!4RrU  rV  r   r   r   r  F)outputsinputsgrad_outputsretain_graph)saved_tensorsrequires_grad_r   rX  rY  rZ  r   r   r   r   r   r=  autogradgrad)rI  r\  argsr   r   r   r   query_tkey_tvalue_tr>  
grad_out_tgrad_query_t
grad_key_tgrad_value_t
grad_querygrad_key
grad_values                     r7   _native_attention_backward_oprr    sR    ))E3	t	Ruc5>QRGUG
((


:
:------ii>> ; 	C ++aAq
!C!!!Q1-J-2^^-@-@WeW5J]b .A .*L*l %%aAq1J!!!Q1-H%%aAq1Jx++r:   c                    |rt        d      d}|j                  dd      j                         }|j                  dd      j                         }|j                  dd      j                         }||||fz  }t        j                  j
                  j                  |||||	||d|	      \	  }}}}}}}}}|||||||fz  }|
r9 | j                  |  || _        || _	        || _
        || _        || _        || _        |j                  dd      j                         }| |j                  dd      j                         }|	r||fS |S )Nz6`enable_gqa` is not yet supported for cuDNN attention.r4   r   r   F)	r   r   r   	attn_biascompute_log_sumexpr   r   return_debug_maskr   )r   	transpose
contiguousr   opsaten#_scaled_dot_product_cudnn_attentionrW  r   r   r   r   max_qmax_k)rI  r   r   r   r   r   r   r   r   rJ  rK  r   tensors_to_saver>  r?  	cum_seq_q	cum_seq_kr|  r}  philox_seedphilox_offsetdebug_attn_masks                         r7   _cudnn_attention_forward_opr  	  sd    QRRO OOAq!,,.E
--1

(
(
*COOAq!,,.EsE**O 			::)# 	; 
	
 ^CiE5+}o S)Y]SSO/!!	!		
--1

(
(
*C
mmAq!,,.#C:,,r:   c                 
   | j                   \	  }}}}}}	}
}}|j                  dd      j                         }|j                  dd      j                         }|j                  dd      j                         }t        j                  j
                  j                  ||||||||| j                  |	|
| j                  | j                  | j                  | j                  | j                        \  }}}d |||fD        \  }}}|||fS )Nr   r   )	logsumexpr  r  rt  r  r  r|  r}  r   r   r   c              3   \   K   | ]$  }|j                  d d      j                          & ywr   r   Nrw  rx  rP  s     r7   rS  z/_cudnn_attention_backward_op.<locals>.<genexpr>b  s$     's1Aq(9(D(D(F's   *,)rc  rw  rx  r   ry  rz  ,_scaled_dot_product_cudnn_attention_backwardr   r|  r}  r   r   r   )rI  r\  rg  r   r   r   r   r>  r?  r  r  r  r  ro  rp  rq  s                   r7   _cudnn_attention_backward_opr  C  s    UXTeTeQE3sCI{M!!!Q'224H
--1

(
(
*COOAq!,,.E (-yy~~'b'b#--iiii----ii! (c ($J*$ (tQ[]egqPr's$J*x++r:   c                 @   |t        d      |rt        d      d}d}d }d}t        d |||fD              }||j                  d   dz  }|s|"|j                  j                  d	kD  r	|d
kD  r|nd}t        j                  |      5  t        |||||||d
   |d	   |||	      \  }}}}|j                  d
dd	      }d d d        |
rG| j                  |||       || _
        || _        || _        || _        || _        || _        || _        |	rfS S # 1 sw Y   ZxY w)Nz2`attn_mask` is not yet supported for flash-attn 2.z3`enable_gqa` is not yet supported for flash-attn 2.r:          Fc              3   4   K   | ]  }|j                     y wr3   )requires_gradrP  s     r7   rS  z._flash_attention_forward_op.<locals>.<genexpr>  s     D1qDs   r   g      r   r   gKH9r   )r   r  r   context_parallel_config_world_sizer   set_grad_enabledr&   r=  rW  r   r   r   r;  r4  alibi_slopesr7  )rI  r   r   r   r   r   r   r   r   rJ  rK  r   r;  r4  r  r7  grad_enabledr>  r?  S_dmask	rng_states                        r7   _flash_attention_forward_opr  h  sj    MNNNOO KGLMDU0CDDL}BD) (49I9a9a9m9mpq9q!*QIE					- #'BNN(
$S'9 kk!Q"#  eS%c9E!	!%')#C:,,5# #s   4DDc                    | j                   \  }}}}}}	t        j                  |      t        j                  |      t        j                  |      }}}
t        |||||||
||| j                  | j
                  | j                  | j                  d   | j                  d   | j                  | j                  | j                  |	      }|
dd |j                  d   f   }
|dd |j                  d   f   }|dd |j                  d   f   }|
||fS )Nr   r   .r   )rc  r   rC  r%   r   r   r   r;  r4  r  r7  r   )rI  r\  rg  r   r   r   r   r>  r?  r  ro  rp  rq  lse_ds                 r7   _flash_attention_backward_opr    s$    .1->->*E3sC','7'7'>@P@PQT@UW\WgWghmWn*J(		%E, C!58>>"#5!556J1x~~b1112HC!58>>"#5!556Jx++r:   c           	          |t        d      |dkD  rt        d      |rt        d      t        |||d|||	      }d }|	r|^}}}|j                  ddd	      }|	r||fS |S )
Nz4`attn_mask` is not yet supported for Sage attention.r  z4`dropout_p` is not yet supported for Sage attention.z5`enable_gqa` is not yet supported for Sage attention.NHDr,  r   r   tensor_layoutr   sm_scalerJ  r   r   r   )r   r'   r=  )rI  r   r   r   r   r   r   r   r   rJ  rK  r   r>  r?  r@  s                  r7   _sage_attention_forward_opr    s     OPP3OPPPQQ



C CS1kk!Q"#C:,,r:   c                     t        d      )Nz4Backward pass is not implemented for Sage attention.)NotImplementedError)rI  r\  rg  s      r7   _sage_attention_backward_opr    s    
 T
UUr:   c                 Z    t        | t        j                        r| j                         } | S r3   )
isinstancefuncolAsyncCollectiveTensorwait)tensors    r7   _wait_tensorr    s"    &&667Mr:   rR  c                     | j                   }| j                         } t        j                  | d d |      } | j	                  |      } t        |       } | S r3   )r   flattenr  all_to_all_singlereshaper  )rR  groupr   s      r7   _all_to_all_singler    sJ    GGE
 	
		A  D$6A			%AQAHr:   c                   V   e Zd Ze	 ddej
                  j                  j                  dej                  dej                  dej                  de	ej                     de
ded	e	e
   d
edede	d   fd       Zedej
                  j                  j                  dej                  fd       Zy)TemplatedRingAttentionNrI  r   r   r   r   r   r   r   r   rJ  r   r    c                    |j                   j                  }|j                   j                  }|j                   j                  }|dz   |z  }d x}}|
| _        || _        |j                  | _        |j                  | _        || _	        t        j                  |j                         |j                         g      j                         }t        j                  |d|j!                               }|j#                  |      }t%        |      D ]6  }|dkD  rE||   }|j'                         }|d | j)                  |      }||d  j)                  |      }|dz   |z  } |
| ||||||||d|dk(  |      \  }}|j                   j*                  r>|j-                  t        j.                        }|j-                  t        j.                        }|j1                  d      }|d|t        j2                  j4                  j7                  ||z
        ||z
  z  z
  }|t        j2                  j4                  j9                  ||z
        z
  }|}|}9 j-                  |j:                        }j=                  d      }|	r||fS |S )Nr   r   
gather_dimr  TrK  r   r   )r  
_ring_mesh_ring_local_rankring_degree
forward_opbackward_opr   q_shapekv_shaper   r   catr  rx  r  all_gather_tensor	get_groupchunkrangenumel
reshape_asconvert_to_fp32tofloat32r  rX  rY  sigmoid
logsigmoidr   squeeze)rI  r   r   r   r   r   r   r   r   rJ  r  r  r   	ring_meshrank
world_size	next_rankprev_outprev_lse	kv_bufferikv	key_numelr>  r?  s                            r7   forwardzTemplatedRingAttention.forward  sG     %<<GG	77HH%==II
AX+	""8#%kkyy/IIs{{}emmo>?JJL	,,Y1IL_L_Lab	OOJ/	z"  	A1uy)IIK	)n//49:11%8&]j8	!q&!1HC  77GGffU]]+ffU]]+--#C#!4!4!<!<S8^!LPX[^P^!__!4!4!?!?3!OOHHA 	D ffU[[!kk"o'Sz0S0r:   r\  c                 J   | j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }|dz   |z  }t        t        d|            dgz   }| j                   j                  j                  rt        j                  nj                  }t        j                  | j                  |j                        }	t        j                  | j                  |j                        }
t        j                  | j                  |j                        }d }| j                  ^}}}}t        j                   |j#                         |j#                         g      j%                         }t'        j(                  |d|j+                               }|j-                  |      }t        |      D ]*  }|dkD  rE||   }|j/                         }|d | j1                  |      }||d  j1                  |      }|dz   |z  }| j3                  |       ^}}}}|dkD  rCt5        |      }|
j/                         }|d | j1                  |
      }
||d  j1                  |      }|	|z  }	|
|z  }
||z  }||dz
  k  st        j                   |
j#                         |j#                         g      j%                         }t'        j6                  |||j+                               }- fd|	|
|fD        \  }	}
}|	|
|d d d d d d d d fS )Nr   r   r   r  )r  c              3   T   K   | ]  }|j                  j                         ! y wr3   )r  r   )rQ  rR  r\  s     r7   rS  z2TemplatedRingAttention.backward.<locals>.<genexpr>  s     +mQADD,@+ms   %()r   r  r  r  r  r   r  r  r   r  r   r  r  r   r  rc  r  r  rx  r  r  r  r  r  r  r  r  permute_tensor)rI  r\  rg  r  r  r  r  
next_ranksaccum_dtypero  rp  rq  next_grad_kvr   r   r   r@  r  r  r  r  grad_query_opgrad_key_opgrad_value_opgrad_kv_buffergrad_key_numels    `                        r7   backwardzTemplatedRingAttention.backward\  s    ((@@KK	##;;LL))AAMM
AX+	%:./1#5
'*';';'S'S'c'cemmiqiwiw[[KX
;;s||;xW[[[Y
 # 1 1sEAIIs{{}emmo>?JJL	,,Y1IL_L_Lab	OOJ/	z" 	nA1uy)IIK	)n//49:11%8&]j8	<?OOCQY<Z9M;1u!-l!;!)!1)/>:EEhO+NO<GG
S
-'J#H-'J:>!!&H,<,<,>
@R@R@T+U!V!a!a!c%44^ZW`WjWjWlm-	n0 ,n:W_akJl+m(
Hj8ZtT4tUY[___r:   r3   ra   rb   rc   staticmethodr   re  functionFunctionCtxr   r
   floatr   r  r  r4   r:   r7   r  r    s     8<D1^^$$00D1||D1 \\D1 ||	D1
 ELL)D1 D1 D1 D1 D1 D1 ##34D1 D1L 0`^^$$000`,,0` 0`r:   r  c                   V   e Zd Ze	 ddej
                  j                  j                  dej                  dej                  dej                  de	ej                     de
ded	e	e
   d
edede	d   fd       Zedej
                  j                  j                  dej                  fd       Zy)TemplatedUlyssesAttentionNrI  r   r   r   r   r   r   r   r   rJ  r   r    c                    |j                   j                  }|j                   j                  }|j                         |
| _        || _        || _        |j                  \  }}}}|j                  \  }}}}||z  }|j                  |||||      j                  ddddd      j                         }|j                  |||||      j                  ddddd      j                         }|j                  |||||      j                  ddddd      j                         }fd|||fD        \  }}}d |||fD        \  }}} |
| |||||||||	d|	      }|	r|^}}}|j                  |||||      j                  ddddd      j                         }t        |      }|j                  dd      j                  dddd      j                         }|	rrj                  ||||      j                  dddd      j                         }t        |      }|j                  dd      j                  ddd      j                         }nd }|	r||fS |S )
Nr   r   r   r  r  c              3   6   K   | ]  }t        |        y wr3   r  rQ  rR  r  s     r7   rS  z4TemplatedUlyssesAttention.forward.<locals>.<genexpr>  s     Wa/59W   c              3      K   | ]6  }|j                  d d      j                  dd dd      j                          8 ywr   r   r   r  Nr  r=  rx  rP  s     r7   rS  z4TemplatedUlyssesAttention.forward.<locals>.<genexpr>  s5     kRSQYYq!_44Q1a@KKMk   <>Tr  )r  _ulysses_meshulysses_degreer  r  r  r   r   r  r=  rx  r  r  )rI  r   r   r   r   r   r   r   r   rJ  r  r  r   ulysses_meshr  B	S_Q_LOCALHDr@  
S_KV_LOCALH_LOCALr>  r?  r  s                           @r7   r  z!TemplatedUlyssesAttention.forward  so     (??MM%==LL
&&(#%/"[[9a!ii:q!z/aJCKKAqRSUVXYZeegkk!ZWa@HHAqRSUVWbbdaZ!DLLQPQSTVWYZ[ffhWE3PUCVWsEkX]_bdiWjksE-
 LCqkk!ZGQ?GG1aQRTUVaac e,kk!Q''1a3>>@++aY@HHAqRST__aC$S%0C++a#++Aq!4??ACC'Sz0S0r:   r\  c                 x   | j                   j                  j                  }| j                   j                  j                  |j	                         |j
                  \  }|z  |j                        j                  ddddd      j                         }t        |      }|j                  dd      j                  dddd      j                         }| j                  | |      ^}}}}fd|||fD        \  }	}
}fd|	|
|fD        \  }	}
}d |	|
|fD        \  }	}
}|	|
|d d d d d d d d fS )	Nr   r   r   r  r  c              3      K   | ]:  }|j                        j                  d dddd      j                          < yw)r   r  r   r   r  N)r  r=  rx  )rQ  rR  r  r  r  S_LOCALr  s     r7   rS  z5TemplatedUlyssesAttention.backward.<locals>.<genexpr>  sF      ,
 IIaWgq9AA!Q1aP[[],
s   A Ac              3   6   K   | ]  }t        |        y wr3   r  r  s     r7   rS  z5TemplatedUlyssesAttention.backward.<locals>.<genexpr>  s     +uQ,>q%,H+ur  c              3      K   | ]6  }|j                  d d      j                  ddd d      j                          8 ywr  r  rP  s     r7   rS  z5TemplatedUlyssesAttention.backward.<locals>.<genexpr>  s9      ,
ABAIIaO##Aq!Q/::<,
r  )r   r  r  r  r  r   r  r=  rx  r  r  r  )rI  r\  rg  r  r  r  r  r  r@  ro  rp  rq  r  r  r  r  r  r  s               @@@@@@r7   r  z"TemplatedUlyssesAttention.backward  sa    ++CCQQ))AAPP
&&(#>>7Aqz/##Aw
GQGOOPQSTVWYZ\]^iik%h6##Aq)11!Q1=HHJ8;X8V5{MA,
#[-@,
(
Hj ,vS]_gisRt+u(
Hj,
GQS[]gFh,
(
Hj 8ZtT4tUY[___r:   r3   r  r4   r:   r7   r  r    s     8<<1^^$$00<1||<1 \\<1 ||	<1
 ELL)<1 <1 <1 <1 <1 <1 ##34<1 <1| `^^$$00`,,` `r:   r  )r   c	                J   |t        d      |rt        d      |rt        d      |j                  j                  dkD  r t        j	                  | |||||||||	|
|      S |j                  j
                  dkD  r t        j	                  | |||||||||	|
|      S t        d      )Nz<Attention mask is not yet supported for templated attention.z>Causal attention is not yet supported for templated attention.z1GQA is not yet supported for templated attention.r   z@Reaching this branch of code is unexpected. Please report a bug.)r   r  r  r  applyr  r  )r   r   r   r   r   r   r   r   rJ  r  r  r   s               r7   %_templated_context_parallel_attentionr    s     WXXYZZLMM //;;a?%++
 	
 
	1	1	@	@1	D(..
 	
 [\\r:   )r   r   c                     d }|t        | ||||||      }	|r,|	^}	}}
n&t        | ||d |||d|t        t        |      }	|r|	\  }	}|r|	|fS |	S )Nr,  r   r   r   r-  r.  return_attn_probsFr  r  r   )r#   r  r  r  )r   r   r   r   r   r   rJ  r   r?  r>  r@  s              r7   _flash_attentionr  +  s     C(
 LCq324-
 HC#C:,,r:   c           	          d }t         t        j                     j                  }	 |	| ||||||      }
|r|
^}
}}|r|
|fS |
S )Nr  )r   rG   re   r   )r   r   r   r   r   r   rJ  r   r?  r6   r>  r@  s               r7   _flash_attention_hubr
  \  s`     C !5!?!?@JJD



$C S1#C:,,r:   c	                 X   | j                   \  }	}
}}|j                   \  }}}}|t        ||	|      }t        |	|
||| j                        \  \  }}\  }}\  }}g g }}t	        |	      D ]7  }||   }|j                  ||d |f          |j                  ||d |f          9 | j                  dd      }t        j                  |d      }t        j                  |d      }t        t        j                     j                  } ||||||||||||      }|j                  d|	df      }|S Nr   r   r   r   r   )r,  r   r   r  r  r	  r
  r   r-  r.  r  r   )r   r  r  r   r  appendr  r   r  r   rG   rg   r   	unflatten)r   r   r   r   r   r   r   rJ  r   r   r   r@  r   r  r  r  r	  r
  	key_validvalue_validb	valid_lenquery_packed
key_packedvalue_packedr6   r>  s                              r7   _flash_varlen_attention_hubr  |  s]     #(++J	1a))Az1a(J
K	 	/	:5<<	
 ONQ	0\<2N<  {I: 1aL	Q

]+,5JYJ/01
 ==A&L9!,J99[a0L !5!F!FGQQD



!!!!$C --J+
,CJr:   )r   c	                    | j                   \  }	}
}}|j                   \  }}}}|t        ||	|      }t        |	|
||| j                        \  \  }}\  }}\  }}g g }}t	        |	      D ]7  }||   }|j                  ||d |f          |j                  ||d |f          9 | j                  dd      }t        j                  |d      }t        j                  |d      }t        |||||||||||      }|j                  d|	df      }|S r  )r   r  r  r   r  r  r  r   r  r$   r  )r   r   r   r   r   r   r   rJ  r   r   r   r@  r   r  r  r  r	  r
  r  r  r  r  r  r  r  r>  s                             r7   _flash_varlen_attentionr    sF    #(++J	1a))Az1a(J
K	 	/	:5<<	
 ONQ	0\<2N<  {I: 1aL	Q

]+,5JYJ/01
 ==A&L9!,J99[a0L
 


!!!!$C --J+
,CJr:   c                 8    t        | ||||      \  }}|r||fS |S )N)r,  r   r   r-  r.  )rA  )	r   r   r   r   r   rJ  r   r>  r?  s	            r7   _flash_attention_3r    s4     %


HC $C:,,r:   r;  r  c
                    |	r&t        t        j                  j                   d      t        t        j                     j
                  }
 |
di d| d|d|d|d|dd dd d	d d
d d|d|dddd d|ddd|}|r
|d   |d   fS |S )Nz( is not implemented for parallelism yet.r,  r   r   r-  r.  r/  r0  r1  r2  r;  r4  r5  r   r6  r7  r8  r   r  r4   )r  rG   rj   r   r   r   )r   r   r   r   r   r;  r4  r7  r  r   r6   r>  s               r7   _flash_attention_3_hubr    s   " !%9%F%F%L%L$MMu"vww !5!B!BCMMD
 

  	
           $   ,!C(  1CFCF9c9r:   c                 f   | j                   \  }}	}
}
|j                   \  }
}}
}
|t        |||      }t        ||	||| j                        \  \  }
}\  }}\  }}g g }}t	        |      D ]7  }||   }|j                  ||d |f          |j                  ||d |f          9 | j                  dd      }t        j                  |d      }t        j                  |d      }t        t        j                     j                  } ||||||||||	      ^}}}
|j                  d|df      }|r||fS |S Nr  r   r   r   )	r,  r   r   r  r  r	  r
  r-  r.  r   )r   r  r  r   r  r  r  r   r  r   rG   rk   r   r  )r   r   r   r   r   r   rJ  r   r   r   r@  r   r  r  r  r	  r
  r  r  r  r  r  r  r  r6   r>  r?  s                              r7   _flash_attention_3_varlen_hubr   /  si    #(++J	1a))Az1a(J
K	 	/	:5<<	
 ONQ	0\<2N<  {I: 1aL	Q

]+,5JYJ/01
 ==A&L9!,J99[a0L !5!I!IJTTD


!!!!
LCq --J+
,C#C:,,r:   c                 *   | j                   \  }}	}
}
|j                   \  }
}}
}
|t        |||      }t        ||	||| j                        \  \  }
}\  }}\  }}g g }}t	        |      D ]7  }||   }|j                  ||d |f          |j                  ||d |f          9 | j                  dd      }t        j                  |d      }t        j                  |d      }t        |||||||||	      ^}}}
|j                  d|df      }|r||fS |S r  )r   r  r  r   r  r  r  r   r  flash_attn_3_varlen_funcr  )r   r   r   r   r   r   rJ  r   r   r   r@  r   r  r  r  r	  r
  r  r  r  r  r  r  r  r>  r?  s                             r7   _flash_varlen_attention_3r#  e  sR    #(++J	1a))Az1a(J
K	 	/	:5<<	
 ONQ	0\<2N<  {I: 1aL	Q

]+,5JYJ/01
 ==A&L9!,J99[a0L+


!!!!
LCq --J+
,C#C:,,r:   c           	          |s*t        j                         rt        | |||||d      ^}}	}
nt        | ||||||      }|r|^}}	}
|r|	fS |S )NT)r,  r   r   r   r-  r.  rJ  )r   is_grad_enabledaiter_flash_attn_func)r   r   r   r   r   r   rJ  r   r>  r?  r@  s              r7   _aiter_flash_attentionr'    sy     %//1,
S1 $!
 LCq#C:,,r:   zflex_attention.BlockMaskc	           
         d }	d }
| j                   \  }}}}|j                   \  }}}}t        t        j                        r}
n|r)t        j                  t
        ||||| j                        }
nt        j                        rj                  dk(  r2j                  j                  d      dj                  d      d      j                  ||||      j                  t        j                  k(  r*fd}t        j                  ||d ||| j                        }
nfd}	nt        d      d | ||fD        \  } }}t        j                  | |||	|
|||      }|j!                  dddd	      }|S )
Nr   r   r   c                     | |||f   S r3   r4   )r  r  r  r   r   s       r7   mask_modz(_native_flex_attention.<locals>.mask_mod  s     HeV!CDDr:   c                     | ||||f   z   S r3   r4   )scorer  r  r  r   r   s        r7   	score_modz)_native_flex_attention.<locals>.score_mod  s    yHeV)KLLLr:   zCAttention mask must be either None, a BlockMask, or a 2D/4D tensor.c              3   D   K   | ]  }|j                  d ddd        ywrN  rO  rP  s     r7   rS  z)_native_flex_attention.<locals>.<genexpr>  rT  rU  )r   r   r   r-  
block_maskr   r   rJ  r  )r   r  flex_attention	BlockMaskcreate_block_maskr!  r   r   	is_tensorr  viewr  r  r   r   r   r=  )r   r   r   r   r   r   r   rJ  r   r-  r/  r   r   rF  r@  r   r*  r>  s      `              r7   _native_flex_attentionr5    ss     IJ*/++'J	9a))Az1aJy.2J2JK
	#55+ZIz[`[g[g

 
	#>>Q!y~~a'8!Y^^A=NPQRI$$ZIzR	??ejj(E (99*dIz5<<J
M ^__LU8KLE3

'
'	C ++aAq
!CJr:   c
                    |rt        d      |	Wd | ||fD        \  } }}t        j                  j                  j	                  | |||||||      }
|
j                  dddd      }
|
S t        | ||||||||t        t        |	      }
|
S )	NzDNative attention backend does not support setting `return_lse=True`.c              3   D   K   | ]  }|j                  d ddd        ywrN  rO  rP  s     r7   rS  z$_native_attention.<locals>.<genexpr>  s     PqQYYq!Q2PrU  rV  r   r   r   r  r  )	r   r   rX  rY  rZ  r=  r  r[  rr  r   r   r   r   r   r   r   r   rJ  r   r>  s              r7   _native_attentionr9    s    " _``PUC<OPsEhh!!>>! ? 	
 kk!Q1%" J 435-
 Jr:   c
                    d }
|	|sd | ||fD        \  } }}t         j                  j                  j                  t         j                  j                  j                  j
                        5  t         j                  j                  j                  | |||||||      }d d d        j                  dddd      }n&t        | ||||||||t        t        |	      }|r|\  }}
|r||
fS |S # 1 sw Y   LxY w)Nc              3   `   K   | ]&  }|j                  d ddd      j                          ( ywrN  )r=  rx  rP  s     r7   rS  z*_native_cudnn_attention.<locals>.<genexpr>G  s(     ]AQYYq!Q2==?]s   ,.rV  r   r   r   r  r  )r   rX  	attentionsdpa_kernel
SDPBackendCUDNN_ATTENTIONrY  rZ  r=  r  r  r  )r   r   r   r   r   r   r   r   rJ  r   r?  r>  s               r7   _native_cudnn_attentionr@  4  s	   " C
]%QTV[I\]sEXX++EHH,>,>,I,I,Y,YZ 
	((%%BB###% C 	C
	 kk!Q1%324-
 HC#C:,,=
	 
	s   ,2C))C2c
                    |rt        d      d | ||fD        \  } }}t        j                  j                  j	                  t        j                  j                  j
                  j                        5  t        j                  j                  j                  | |||||||      }
d d d        
j                  dddd      }
|
S # 1 sw Y   xY w)NzNNative efficient attention backend does not support setting `return_lse=True`.c              3   D   K   | ]  }|j                  d ddd        ywrN  rO  rP  s     r7   rS  z._native_efficient_attention.<locals>.<genexpr>{  rT  rU  rV  r   r   r   r  )
r   r   rX  r<  r=  r>  EFFICIENT_ATTENTIONrY  rZ  r=  r8  s              r7   _native_efficient_attentionrD  i  s      ijjLU8KLE3				'	'(:(:(E(E(Y(Y	Z 

hh!!>>! ? 	


 ++aAq
!CJ

 

   32CCc	                    |rt        d      d | ||fD        \  } }}t        j                  j                  j	                  t        j                  j                  j
                  j                        5  t        j                  j                  j                  | ||d ||||      }	d d d        	j                  dddd      }	|	S # 1 sw Y   xY w)NzJNative flash attention backend does not support setting `return_lse=True`.c              3   D   K   | ]  }|j                  d ddd        ywrN  rO  rP  s     r7   rS  z*_native_flash_attention.<locals>.<genexpr>  rT  rU  rV  r   r   r   r  )
r   r   rX  r<  r=  r>  FLASH_ATTENTIONrY  rZ  r=  )
r   r   r   r   r   r   r   rJ  r   r>  s
             r7   _native_flash_attentionrI    s     effLU8KLE3				'	'(:(:(E(E(U(U	V 

hh!!>>! ? 	


 ++aAq
!CJ

 

rE  c
                    |rt        d      d | ||fD        \  } }}t        j                  j                  j	                  t        j                  j                  j
                  j                        5  t        j                  j                  j                  | |||||||      }
d d d        
j                  dddd      }
|
S # 1 sw Y   xY w)NzINative math attention backend does not support setting `return_lse=True`.c              3   D   K   | ]  }|j                  d ddd        ywrN  rO  rP  s     r7   rS  z)_native_math_attention.<locals>.<genexpr>  rT  rU  rV  r   r   r   r  )
r   r   rX  r<  r=  r>  MATHrY  rZ  r=  r8  s              r7   _native_math_attentionrM    s      deeLU8KLE3				'	'(:(:(E(E(J(J	K 

hh!!>>! ? 	


 ++aAq
!CJ

 

rE  c                 &   |rt        d      d | ||fD        \  } }}t        | ||| j                  d      dd |%dt        j                  | j
                  d         z  n|ddd|z
  dd	
      d	   }|j                  dd      j                         }|S )NzANPU attention backend does not support setting `return_lse=True`.c              3   \   K   | ]$  }|j                  d d      j                          & ywr  r  rP  s     r7   rS  z(_native_npu_attention.<locals>.<genexpr>  s$     UAQ*557Ur  r   BNSDg      ?r   i   Fr   )input_layoutpser   pre_tockensnext_tockens	keep_probsyncinner_preciser   )r   r-   r  mathsqrtr   rw  rx  )r   r   r   r   r   rJ  r   r>  s           r7   _native_npu_attentionrZ    s     \]]U%eATUE3


127-cDIIekk"o..U	/ 	C --1

(
(
*CJr:   c                     |rt        d      d | ||fD        \  } }}| t        j                  | j                  d         z  } t	        | |||      }|j                  dddd      }|S )	NzAXLA attention backend does not support setting `return_lse=True`.c              3   D   K   | ]  }|j                  d ddd        ywrN  rO  rP  s     r7   rS  z(_native_xla_attention.<locals>.<genexpr>  rT  rU  r   )r,  r   r   r.  r   r   r   r  )r   rX  rY  r   xla_flash_attentionr=  )r   r   r   r   rJ  r   r>  s          r7   _native_xla_attentionr^    sw     \]]LU8KLE3DIIekk"o..E



	C ++aAq
!CJr:   c                     d }|t        | ||d|||      }|r,|^}}}	n&t        | ||d d||d|t        t        |      }|r|\  }}|r||fS |S )Nr  r  r  Fr  )r'   r  r  r  )
r   r   r   r   r   rJ  r   r?  r>  r@  s
             r7   _sage_attentionr`    s     C!
 LCq313-
 HC#C:,,r:   c           	          d }t         t        j                     j                  }| || ||d|||      }	|r|	^}	}}
|r	|fS 	S Nr  r  )r   rG   rv   r   )r   r   r   r   r   rJ  r   r?  r6   r>  r@  s              r7   _sage_attention_hubrc  ;  sf     C !5!>!>?IID!
 LCq#C:,,r:   c                 2   |rt        d      | j                  \  }}	}
}
|j                  \  }
}}
}
|t        |||      }t        ||	||| j                        \  \  }
}\  }}\  }}g g }}t        |      D ]7  }||   }|j                  ||d |f          |j                  ||d |f          9 | j                  dd      }t        j                  |d      }t        j                  |d      }t        |||||||||	      }|j                  d|df      }|S )Nz?Sage varlen backend does not support setting `return_lse=True`.r  r   r   r   )	r,  r   r   r  r  r	  r
  r   r  r   )r   r   r  r  r   r  r  r  r   r  r,   r  )r   r   r   r   r   r   rJ  r   r   r   r@  r   r  r  r  r	  r
  r  r  r  r  r  r  r  r>  s                            r7   _sage_varlen_attentionre  [  sO    Z[["'++J	1a))Az1a(J
K	 	/	:5<<	
 ONQ	0\<2N<  {I: 1aL	Q

]+,5JYJ/01
 ==A&L9!,J99[a0L



!!!!
C --J+
,CJr:   	   c           	      &    t        | ||d|||      S rb  )r(   r   r   r   r   r   rJ  r   s          r7   #_sage_qk_int8_pv_fp8_cuda_attentionri    s&     (


 r:   c           	      &    t        | ||d|||      S rb  )r)   rh  s          r7   (_sage_qk_int8_pv_fp8_cuda_sm90_attentionrk    s&     -


 r:      c           	      &    t        | ||d|||      S rb  )r*   rh  s          r7   $_sage_qk_int8_pv_fp16_cuda_attentionrn    s&     )


 r:   c           	      &    t        | ||d|||      S rb  )r+   rh  s          r7   &_sage_qk_int8_pv_fp16_triton_attentionrp    s&     +


 r:   c
                    |rt        d      | j                  \  }
}}}|j                  \  }}}}|rt        j                         }n||j                  dk(  r3|j                  |j                  d      d|j                  d      d      }n|j                  dk7  rt        d      |j                  |
|||      j                  |       }|rz||z  dk7  rt        d      ||z  }| j                  d|df      } |j                  d|df      j                  ddd|d      }|j                  d|df      j                  ddd|d      }t        j                  | |||||      }|r|j                  dd	      }|S )
NzFxformers attention backend does not support setting `return_lse=True`.r   r   r   r  zDOnly 2D and 4D attention masks are supported for xformers attention.zKNumber of heads in query must be divisible by number of heads in key/value.r   r  )r   r   xopsLowerTriangularMaskr  r4  r  r  type_asr  memory_efficient_attentionr  )r   r   r   r   r   r   r   r   rJ  r   r   r   num_heads_qr@  r   num_heads_kvnum_heads_per_groupr>  s                     r7   _xformers_attentionry    s~     abb,1KK)J	;%(YY"Az<,,.			>>Q!y~~a'8!Y^^A=NPQRI^^q cdd$$ZiT\\]bc	%*jkk)\9L"#56mmAb1299"b"FY[]^L"#56==b"bJ]_ab

)
)%eY	SX
YCkk!QJr:   r3   )Nr  FNFN)NN)NFNNNNr   r  r   NFr   )Nr  FNFFTN)Nr  FNFF)r  FNFN)Nr  NFFN)NFFN)NFr:  r  FFN)NNFFN)NFNFFN)Nr  FNFFN)r  FNFFN)r  NFN)FFN)FNFN)NFNFN)
contextlib	functoolsr   rX  dataclassesr   enumr   typingr   r   r   r   r	   r
   r   r   r   distributedis_available)torch.distributed._functional_collectives_functional_collectivesr  utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   utils.constantsr   r   _modeling_parallelr    r   r   r   _REQUIRED_FLEX_VERSIONr   r   r   r   r   r   r   r   r   r   
flash_attnr#   r$   flash_attn.flash_attn_interfacer%   r&   flash_attn_interfacer<  r"  rP   r&  sageattentionr'   r(   r)   r*   r+   r,   !torch.nn.attention.flex_attentionrX  r<  r0  	torch_npur-   $torch_xla.experimental.custom_kernelr.   r]  xformers.opsry  rr  __version__library	custom_op
_custom_opregister_fake_register_faker>   rE   ra   r   r   rG   r~   r   rj   rk   re   rg   rv   r   r   contextmanagerrn   r   r   r  r   r   r   r   r   intr   r   r   r   r   	lru_cacher   r  r  r  r  r!  r   rA  r@  re  r  r  r[  rr  r  r  r  r  r  r  r  r  Functionr  r  r  r   rd   r  r
  r  rf   r  rh   r  r  r   ri   r#  rl   r'  rm   r5  r9  ro   r@  rp   rD  rq   rI  rr   rM  rs   rZ  rt   r^  ru   r`  rc  rw   re  rx   ri  ry   rk  rz   rn  r{   rp  r|   ry  r4   r:   r7   <module>r     s        !  S S S  	!!#>>    " L 2! !      % -/h4I$Pg4h 13 (*^/?F]/^ /1l6NtUk6l %d,BC *, *,b1EdLa1b .0j5HOi5j  BiiO!#' "& IW#>    H$(!&*##' (,%O  ?> . [ D 	((J]]00N*UY **4Q * !J(N 
H	'3 'T*) *)Z ) ) ) %%'7/?P[r( ,,.>/./
 ""$4/?P[_% ))+;/?Wbf, !!#32*W[$!I t24DDE , BVB]B] @uS*>%>? @ @0 )-!15.  /326. <<. 	.  <<.  %	. 
 .  .  E?.  .  tCH~..  *+.  ./.  \\. jU(5<<*@ UT U`d U
L LELL L L\` LLell L Lell Lae L3 s x F%,, FU\\ F%,, Fei FV VELL VQVQ]Q] Vlp V )-	Q<<Q	Q <<Q %	Q 
Q0B3G BD BJ S!
 &*	^^^ ^ U\\"	^ "^* &*	^^^ ||^ U\\"	^* )-%)	f	f	f 	f %		f
 U\\"	f 
	f1ELL 1c 1c 1V[VbVb 1h
0D  4 :Z`a
 &*!%(,(,(,#&||&||& ||& E?	&
 & 	& %& %& %& & & & tn& & &  5<<%&!& b&R >?
 &*!%(,(,(,#7||7||7 ||7 E?	7
 7 	7 %7 %7 %7 7 7 7 tn7 7 7  5<<%&!7 @7B )-!37(		 	 	,	,(<<( 
( <<	(
 %( ( ( E?( ( ( ( /0(V",		 	 	,	,",ll",Z )-!375-		 	 	,	,5-<<5- 
5- <<	5-
 %5- 5- 5- E?5- 5- 5- 5- /05-t!,		 	 	,	,!,ll!,T )-!37;-		 	 	,	,;-<<;- 
;- <<	;-
 %;- ;- ;- E?;- ;- ;- ;- /0;-|#,		 	 	,	,#,ll#,V )-!37#-		 	 	,	,#-<<#- 
#- <<	#-
 %#- #- #- E?#- #- #- #- /0#-LV		 	 	,	,VllV
%,, 
%,, 
y`U^^44 y`x\` 7 7 \`F )-!6] 486]<<6]	6] <<6] %	6]
 6] 6] E?6] 6] 6] /06]x ## =|L" $  !37)-<<)-	)- <<)- 	)-
 )- E?)- )- /0)- \\)-
)-X ##"" =|L# $  !37-<<-	- <<- 	-
 - E?- - /0- \\-
-6 ##)) =|L# $  )-!371<<1	1 <<1 %	1
 1 E?1 1 1 /01 \\1
1h ##%% =|L $  )-!370<<0	0 <<0 %	0
 0 E?0 0 0 /00 \\0	0f ##!! =|L $  "37-<<-	- <<- E?	-
 - - /0- \\-	-& ##%% =|L# $  "#+#37$:<<$:	$: <<$: E?	$:
 $: sCx$: $: $: $: /0$: \\$:
$:N ##,, =|L# $  )-!37.-<<.-	.- <<.- %	.-
 E?.- .- .- /0.- \\.-
.-b ##(( =|L $  )-!37--<<--	-- <<-- %	--
 E?-- -- -- /0-- \\--	--` ###%BLQ $  !37"-<<"-	"- <<"- 	"-
 "- E?"- "- /0"- \\"-	"-J ##+]LI $  LP!378<<8	8 <<8 ell,FFGH	8
 8 E?8 8 8 /08 \\8	8v ##-" $  )-!37+<<+	+ <<+ %	+
 + + E?+ + + /0+ \\+
+\ ##&& =|L" $  )-!37--<<--	-- <<-- %	--
 -- -- E?-- -- -- /0-- \\--
--` ##**- $  )-!37<<	 << %	
   E?   /0 \\	< ##&& =|L $  !37<<	 << 	
  E?   /0 \\	: ##%%- $  )-!37<<	 << %	
   E?   /0 \\	< ##$$ =|L $  !37<<	 << 	
 E?  /0 \\	> ##$$- $  37<<	 << 	
  /0 \\	, ###%BLQ" $  !37(-<<(-	(- <<(- 	(-
 E?(- (- /0(- \\(-
(-V ##!!#%BLQ# $  !37-<<-	- <<- 	-
 E?- - /0- \\-
-6 ##$$#%BLQ $  )-!370<<0	0 <<0 %	0
 0 E?0 0 /00 \\0	0f ##220A6E $  !37<<	 << 	
 E?  /0 \\	( ##770A6E $  !37<<	 << 	
 E?  /0 \\	( ##330A6E $  !37<<	 << 	
 E?  /0 \\	( ##550A6E $  !37<<	 << 	
 E?  /0 \\	( ##!!+]LI $  )-!37(<<(	( <<( %	(
 ( ( E?( ( ( /0( \\(	(r:   