
    >̄i                     r    d dl Z d dlZddlmZ ddlmZ ddlmZ ddlmZm	Z	m
Z
mZ d Zd Zd	 Z	 dd
Zd Zy)    N   )cdiv)runtime)driver)get_dram_gbpsget_max_simd_tflopsget_max_tensorcore_tflopsnvsmic                     |t        |d      z  }t        j                  j                  |      d   dz  }t	        dg      d   }t        ||      |z  t        ||| |      z  }|S z# return compute throughput in TOPS    multiprocessor_countzclocks.max.smr   )minr   utilsget_device_propertiesr
   r	   	backenddevicenum_ctas	num_warpsdtypetotal_warpsnum_subcorescur_sm_clocktflopss	            f/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/triton/ops/matmul_perf_model.pyget_tensorcore_tflopsr      sq    SA..K<<55f=>TUXYYL/*+A.L{+l:=V|Wf>. .FM    c                     |t        |d      z  }t        j                  j                  |      d   dz  }t	        dg      d   }t        ||      |z  t        ||| |      z  }|S r   )r   r   r   r   r
   r   r   s	            r   get_simd_tflopsr       sp    SA..K<<55f=>TUXYYL/*+A.L{+l:=PQVXdfmou=vvFMr   c                     t         j                  j                  |      }|d   dk  r"|t         j                  k(  rt	        | ||||      S t        | ||||      S )Nr      )torchcudaget_device_capabilityfloat32r    r   )r   r   r   r   r   
capabilitys         r   
get_tflopsr(      sS    11&9J!}qUemm3w)UKK &(IuMMr   c                    t         j                  j                  }t        j                  j                         }|j                  }|j                         }t        ||      }t        ||	      }|}||z  |z  }t        ||      t        ||	      }}d|z  |z  |z  dz  }t        |||| |      }||z  }t        j                  j                  |      d   }t        d||z        }t        d|dz        }t        t        d|dz
  dz        d      }t        ||      |dz  |d	z  z   z  }|d
z  }||z  |z  dd|dz
  z  z   z  }||z  |z  dz  |dz
  z  } ||z  |z  dd|dz
  z  z   z  }!||z  |z  dz  |dz
  z  }"||!z   dz  }#| |"z   dz  }$|#|z  |$|z  z   }%|dz  }&||z  |z  |z  dz  }'|dk(  r|'|&z  }(n|&})|'|)z  }(||z  dz  dz  |&z  }*|(|*z  }(t        ||%      |(z   }+|rt!        d|+ d| d|% d|( d|dz   d       |+S )zO return estimated running time in ms
          = max(compute, loading) + store r   i   @r          L   r   gffffff?g?r   g?g?i   g333333?zTotal time: zms, compute time: zms, loading time: zms, store time: zms, Activate CTAs: d   %)r   r   CUDAr#   r$   current_devicer   element_sizer   maxr(   r   r   r   r   r   print),r   
num_stagesABCMNKBLOCK_MBLOCK_NBLOCK_KSPLIT_Kdebugkwargsr   r   r   dtsize	num_cta_m	num_cta_n	num_cta_kr   	total_opstput
compute_msnum_smactive_cta_ratioactive_cta_ratio_bw1active_cta_ratio_bw2dram_bwl2_bwload_a_dram	load_a_l2load_b_dram	load_b_l2
total_dramtotal_l2load_msstore_bwstore_c_dramstore_ms	reduce_bwzero_mstotal_time_mss,                                               r   estimate_matmul_timer[   %   s    oo""GZZ&&(FGGE^^FQ IQ II9$y0H q'?C7OqA A	A!34IgvxEBDT!J \\//78NOF1h/0q(R-0s1x"}&BCQGGV,0Dt0KNbeiNi0ijGaKEa%&.Ay1}(=$=>KA$	A6Ia%&.Ay1}(=$=>KA$	A6I+<JI%+6H7"X%55G }Hq56>G+{;L!|(*	)+a%!){+h6G
G,x7M]O+=j\ J&i'7z B  0 45Q8 	9 r   c                 t   t         j                  j                         }t         j                  j                         }|d   j	                         }|d   j
                  }g }| D ]s  }|j                  }|d   |d   |d   |j                  f\  }	}
}}t        j                  j                  |      d   }|	|
z   |z  |z  |z  }||k  sc|j                  |       u |} |t         j                  t         j                  fvr"| D cg c]  }|j                  d   dk(  s| } }i }| D ]g  }|j                  }|d   |d   |d   |d   |j                  |j                  f\  }	}
}}}}|	|
|||f}||v r||   j                  ||f       `||fg||<   i g }|j                         D ]  \  }}|\  }	}
}}}|d   d	k\  r[|	|
z  |z  d
z  }|t!        d|      z  d	z  }d}||z  t#        j$                  d|fd      }|D ]  }|j                  |d           q|d   d   }d|_        |j                  |        |S c c}w )Nr5   r;   r<   r=   max_shared_memr>   r*   r   r"   i   r   i,  r   c                 R    | d   z
  dk  rdt        | d   z
        z   S | d   z
  S )Nr*   r   
   )abs)xoptimal_num_stagess    r   <lambda>z$early_config_prune.<locals>.<lambda>   sB    aD--2 %'QqT4F-F)G$G 89!?Q8Q r   )key)r#   r$   r0   r%   r1   r   r@   r4   r   r   r   appendfloat16r&   r   itemsr   heapq	nsmallest)configs
named_argsr   r'   rA   r   pruned_configsconfigkwr;   r<   r=   r4   max_shared_memoryrequired_shared_memoryconfigs_mapr>   r   rd   kvmmas
mma_cyclesldgsts_latencynearestnrandom_configrb   s                              @r   early_config_prunerz   g   s   ZZ&&(F113J_))+FsO!!E N *]]yM2i="Y-9J9JJ 	.': #LL>>vFGWX")G"3w!>!Kf!T!%66!!&)* G U]]EMM22(/Qf6==3Kq3P6QQ K 	6]]yM2i="Y-IHXHXZ`ZkZkk 	B'7Iz '9=+##VZ$89!' 45K	6 N!!# 11895'7Ia=AW$w.+>DAy 11A5J N!/*!< oo1 RSG  ,%%ad+, aDGM'(M$!!-0)1* K Rs   H5H5)F)rh   r#    r   _C.libtriton.tritonr   r   testingr   r   r	   r
   r   r    r(   r[   rz    r   r   <module>r      s:       )  \ \N ?D;r   