
    >̄i"                     d   d dl Z ddlmZmZmZmZmZ ddlmZ ddl	m
Z
mZ e j                  e j                  e j                  gZd Zd Zd	 Z e ed
dddddd       edd
ddddd       eddddddd       eddddddd       ed
d
ddddd       ed
dddddd       edd
ddddd       ed
dddddd       eddddddd       ed
dd
dddd       edd
d
dddd       eddd
dddd       eddd
dddd       ed
d
d
dddd       ed
dddddd       edd
ddddd       ed
dddddd       eddddddd      g e       z   g de
edd       edd i      edej&                  dej&                  dej&                  dej&                  dej&                  dej&                  d ej&                  d!ej&                  dej&                  d"ej&                  fd#                     Z G d$ d%e j*                  j,                        Zej0                  Zy)&    N   )Configautotunecdiv
heuristicsjit)language   )early_config_pruneestimate_matmul_timec                 n    | |u r| S | t         v sJ |t         v sJ t         D ]  }| |u r|c S ||u s| c S  y N)_ordered_datatypes)abds      [/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/triton/ops/matmul.pyget_higher_dtyper   
   sS    Av"""""""" 6H6H	    c                       fdS )Nc                 *    |    j                         S r   )zero_)nargsnames    r   <lambda>zinit_to_zero.<locals>.<lambda>   s    t**, r    )r   s   `r   init_to_zeror      s	    ,,r   c                      g } dD ]u  }dD ]n  }dD ]g  }dD ]`  }|dk  rdnd}| j                  t        |||dd	||
             dD ].  }| j                  t        ||||d	||t        d                   0 b i p w | S )N)r               )       )r$   @   )r$   r%         r%   r   r    r
   BLOCK_MBLOCK_NBLOCK_KSPLIT_K
num_stages	num_warps)r   r       r#   C)r.   r/   pre_hook)appendr   r   )configsr.   block_mblock_kblock_nr/   split_ks          r   get_configs_io_boundr9      s    G% l
 	lG# 
l1 	lG%,]INN7wSZghi*4	KL $1 l"w7W^kr#s.8IXdehXikll	l
l	ll Nr   r&   r'   r$   r(   r   r0   r-   r%   r    r!   )MNK
   )r   
perf_modeltop_k)r4   keyprune_configs_byEVEN_Kc                 *    | d   | d   | d   z  z  dk(  S )Nr<   r+   r,   r   r   )argss    r   r   r   M   s!    49Y$y/(IJaO r   dot_out_dtype
allow_tf32fp8_fast_accumr)   r*   r+   GROUP_Mr,   AB_DTYPEc                 t   t        j                  d      }t        j                  d      }t        j                  ||      }t        j                  ||      }||z  }||z  }t        |||z  z
  |      }||z  ||z  z   }||z  |z  }||z  t        j                  d|      z   }||z  t        j                  d|      z   } t        j
                  t        j                  ||z  |      |      }!t        j
                  t        j                  | |z  |      |      }"||z  t        j                  d|      z   }#| |!d d d f   |z  |#d d d f   |z  z   z   } ||#d d d f   |z  |"d d d f   |	z  z   z   }t        j                  ||f|      }$t        dt        j                  |||z              D ]E  }%|r+t        j                  |       }&t        j                  |      }'nz||%||z  z  z
  }(t        j                  d|j                  j                        })t        j                  | |#d d d f   |(k  |)      }&t        j                  ||#d d d f   |(k  |)      }'|rJ|&j                  |j                  j                        }&|'j                  |j                  j                        }'|rt        j                  |&|'|$||      }$n|$t        j                  |&|'||      z  }$| ||z  |z  z  } |||z  |z  z  }H |$j                  |j                  j                        }$||z  t        j                  d|      z   }||z  t        j                  d|      z   } ||d d d f   |
z  | d d d f   |z  z   z   }||k  d d d f   | |k  d d d f   z  }*|dk(  rt        j                  ||$|*       y t        j                  ||$|*       y )Nr   r
   )dtype)r
   r
   )maskother)	out_dtyperF   )rL   )tl
program_idr   minarangemax_contiguousmultiple_ofzerosrangeloadrK   
element_tytodotstore
atomic_add)+ABr1   r:   r;   r<   	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnrE   rF   rG   r)   r*   r+   rH   r,   rB   rI   pidpid_zgrid_mgrid_nwidthgroup_id
group_sizepid_mpid_nrmrnramrbnrkacckr   r   k_remaining_0rL   s+                                              r   _kernelrw   .   s|   Z --
CMM!EWWQ FWWQ FfEe|HVh00':Jw#
"23E5[j)E	299Q0	0B	299Q0	0B


BNN267;W
EC


BNN267;W
EC	299Q0	0B	SD\I%47i(??@A	R4[9$s47|i'??@A
((GW%]
;C1bgga7!234 +
A
Aa7W#455K&(:(:;B47k 9DA1d7k 9DAQWW''(AQWW''(A&&Asm
SC266!Q-JOOC	Ww**	Ww**#+$ &&##
$C	299Q0	0B	299Q0	0B	R4[9$r$'{Y'>>?AFAtGQa00D!|
Cd#
a4(r   c                   6    e Zd ZeZi Zed        Zedd       Zy)_matmulc                    | j                   }| j                  d      dkD  r$| j                  d      dkD  r| j                         } |j                  d      dkD  r$|j                  d      dkD  r|j                         }| j                  d   |j                  d   k(  sJ d       | j                  \  }|j                  \  }| j                  t
        j                  t
        j                  t
        j                  fv s;|j                  t
        j                  t
        j                  t
        j                  fv rt        j                  }nk| j                  t        j                  fv s|j                  t        j                  fv rt        j                  }n t        | j                  |j                        }t        j                  f||      }	|S|t        j                  t        j                  t        j                   fv rt
        j                  }nt
        j                  }nt#        |t        j                        sJ d       |t        j                  k(  rt
        j                  }nC|t        j                  t        j                   fv rt
        j                  }nt
        j                  }d}
| j                  t
        j                  t
        j                  fv r.|j                  t
        j                  t
        j                  fv rd}
| j                  t        j                  fv r|j                  t        j                  fv rd}
fd}t%        |   | ||	|| j                  d      | j                  d      |j                  d      |j                  d      |	j                  d      |	j                  d      |||d	|

       |	S )Nr   r
   zincompatible dimensions)devicerK   z#dot_out_dtype must be a torch.dtypeTFc                 L    t        | d         t        | d         z  | d   fS )Nr)   r*   r,   )r   )METAr:   r;   s    r   r   z_matmul._call.<locals>.<lambda>   s.    T!T)_5QY8PPRVW`Rab r   r0   )rE   rF   rG   rH   rI   )r{   stride
contiguousshaperK   rO   
float8e4nvfloat8e4b15float8e5torchfloat16int8int32r   emptyfloat32bfloat16
isinstancerw   )r   r   rE   rF   rG   r{   r<   _c_dtypecab_dtypegridr:   r;   s               @@r   _callz_matmul._call   s   88A;?qxx{QA88A;?qxx{QAwwqzQWWQZ'B)BB'ww1ww177r}}bnnbkkBB77r}}bnnbkkBBmmGWW$EJJ<(?kkG&qww8GKKAvW= 5==%--HH "

 "mU[[9`;``9- "

5==%.."AA "

 "77r}}bkk22qww2==RTR]R]B^7^H77uzzl"qww5::,'>Hbq!Q1HHQK!HHQK!HHQK!'!)	* r   Nc                 6    t         j                  |||||      S )N)rE   rF   rG   )ry   r   )ctxr   r   rE   rF   rG   s         r   forwardz_matmul.forward   s    }}Q:ft}uur   )NTT)	__name__
__module____qualname__rw   kernel_locksstaticmethodr   r   r   r   r   ry   ry      s5    FF1 1f v vr   ry   )r    r   r   r   r   r   r	   rO   matmul_perf_modelr   r   r   r   r   r   r   r   r9   	constexprrw   autogradFunctionry   applymatmulr   r   r   <module>r      s    6 6  GmmU^^U]]C -$ 
 	332!LYZfgh332!LYZfgh32"KXYefg2#"KXYefg332!LYZfgh32"KXYefg2#"KXYefg32"KXYefg2"JWXdef3331MZ[ghi3331MZ[ghi32#!LYZfgh2##!LYZfgh3331MZ[ghi32"KXYefg2#"KXYefg32"KXYefg2"JWXdef)* 	+, 	0*1< O  ;) <<	;)
 ;) LL;) \\;) -/LL;) DF<<;) \\;) -/LL;) CE,,;) []ZfZf;) =D;)|;venn%% ;v| 
r   