B
    L·e(E  ã               @   sÂ   d dl Z d dlZd dlZd dlZd dlmZ ddlmZ dd„ Zd'dd	„Z	d(dd„Z
d)dd„ZG dd„ dƒZG dd„ dƒZdd„ Zd*dd„Zd+dd„Zdd„ Zdd „ Zed,d#d$„ƒZd-d%d&„ZdS ).é    N)Úcontextmanageré   )Úruntimec             C   sL   d  | ¡} dddd|  dg}t |¡}| tjj¡ d¡}dd„ |D ƒ}|S )	Nú,z
nvidia-smiz-iÚ0z--query-gpu=z--format=csv,noheader,nounitsc             S   s   g | ]}t |ƒ‘qS © )Úint)Ú.0Úxr   r   úQ/home/ankuromar296_gmail_com/.local/lib/python3.7/site-packages/triton/testing.pyú
<listcomp>   s    znvsmi.<locals>.<listcomp>)ÚjoinÚ
subprocessÚcheck_outputÚdecodeÚsysÚstdoutÚencodingÚsplit)ÚattrsÚcmdÚoutÚretr   r   r   Únvsmi
   s    

r   é   c          	      sÚ  dd l ‰ˆj ¡ ˆj ¡ kr$tdƒ‚| ƒ  |d k	rXx$|D ]}| ¡  | d¡ d |_q8W ˆj ¡ ‰ ˆj 	ˆ ¡ | ƒ  W d Q R X ˆj 
¡  ‡ fdd„} ˆjjdd}ˆjjdd}| ¡  | ƒ  | ¡  ˆj 
¡  | |¡}tdt|| ƒƒ}‡fdd	„t|ƒD ƒ}‡fd
d	„t|ƒD ƒ}g }d}	x¢t|	ƒD ]–}
ˆj 
¡  xPt|ƒD ]D}|d k	rhx|D ]}d |_qXW ||  ¡  | ƒ  ||  ¡  qDW ˆj 
¡  ˆ dd	„ t||ƒD ƒ¡}| ˆ |¡¡ q,W ˆ ˆ |¡¡ ¡ S )Nr   zQCannot capture graph in default stream. Please use side stream in benchmark code.Tc                  s   ˆ   ¡ S )N)Zreplayr   )Úgr   r   Ú<lambda>,   ó    z$do_bench_cudagraph.<locals>.<lambda>)Úenable_timingr   c                s   g | ]}ˆ j jd d‘qS )T)r   )ÚcudaÚEvent)r	   Úi)Útorchr   r   r   8   s    z&do_bench_cudagraph.<locals>.<listcomp>c                s   g | ]}ˆ j jd d‘qS )T)r   )r   r    )r	   r!   )r"   r   r   r   9   s    é2   c             S   s   g | ]\}}|  |¡‘qS r   )Úelapsed_time)r	   ÚsÚer   r   r   r   K   s    )r"   r   Zcurrent_streamZdefault_streamÚRuntimeErrorZdetach_Zrequires_grad_ÚgradZ	CUDAGraphÚgraphÚsynchronizer    Úrecordr$   Úmaxr   ÚrangeÚtensorÚzipÚappendÚminÚmeanÚitem)ÚfnÚrepÚgrad_to_noner
   Ústart_eventÚ	end_eventÚestimate_msÚn_repeatr   Z	n_retriesÚ_r!   Útimesr   )r   r"   r   Údo_bench_cudagraph   sN    










r=   é   éd   Tr2   c                sú  |dkst ‚dd l‰ | ƒ  ˆ j ¡  |r@ˆ jtdƒˆ jdd}nˆ jtdƒˆ jdd}ˆ jjdd}ˆ jjdd}	| ¡  xt	d	ƒD ]}
| 
¡  | ƒ  q„W |	 ¡  ˆ j ¡  | |	¡d	 }td
t|| ƒƒ}td
t|| ƒƒ}‡ fdd„t	|ƒD ƒ}‡ fdd„t	|ƒD ƒ}	xt	|ƒD ]}
| ƒ  qW xXt	|ƒD ]L}|d k	rRx|D ]}d |_qBW | 
¡  ||  ¡  | ƒ  |	|  ¡  q.W ˆ j ¡  ˆ jdd„ t||	ƒD ƒˆ jd}|d k	rèˆ  |ˆ j|ˆ jd¡ ¡ }t|ƒd
krä|d }|S tˆ |ƒ|ƒ ¡ S )N)r1   r,   r2   Zmedianr   g    €„ŽAr   )ÚdtypeÚdeviceg    €„®AT)r   é   r   c                s   g | ]}ˆ j jd d‘qS )T)r   )r   r    )r	   r!   )r"   r   r   r      s    zdo_bench.<locals>.<listcomp>c                s   g | ]}ˆ j jd d‘qS )T)r   )r   r    )r	   r!   )r"   r   r   r   ‚   s    c             S   s   g | ]\}}|  |¡‘qS r   )r$   )r	   r%   r&   r   r   r   r   –   s    )r@   )ÚAssertionErrorr"   r   r*   Úemptyr   Úint8r    r+   r-   Zzero_r$   r,   r(   r.   r/   ÚfloatZquantileÚtolistÚlenÚgetattrr3   )r4   Zwarmupr5   r6   Z	quantilesZ
fast_flushZreturn_modeÚcacher7   r8   r;   r9   Zn_warmupr:   r!   r
   r<   r   r   )r"   r   Údo_benchP   sL    





 
rK   Ú c             C   sN  dd l }dd l}t| |jƒs&| | ¡} t||jƒs<| |¡}|d krHd}t|ƒrZ|| jƒn|}|d krjd}t|ƒr||| jƒn|}t| |jƒr°| j|jkr |  ¡ } |  	¡  
¡   ¡ } t||jƒrà|j|jkrÐ| ¡ }| 	¡  
¡   ¡ }| jdksö|jdkr|jj| |||dd d S |j| |||dsJt|› d| › d	|› d
|› d|› d
ƒ‚d S )Nr   g{®Gáz„?g        r   T)ÚatolÚrtolZ	equal_nan)rM   rN   ú z is not close to z (atol=z, rtol=ú))Únumpyr"   Ú
isinstanceZTensorr.   Úcallabler@   Úbfloat16rF   ÚcpuÚdetachÚsizeÚtestingZassert_allcloseZallcloserC   )r
   ÚyrM   rN   Úerr_msgÚnpr"   r   r   r   Úassert_closeŸ   s2    

r\   c               @   s   e Zd ZdZddd„ZdS )Ú	Benchmarkzk
    This class is used by the :code:`perf_report` function to generate line plots with a concise API.
    rL   FNc             C   sL   || _ || _|
| _|| _|| _|| _|| _|| _|| _|	| _	|| _
|| _dS )a  
        Constructor

        :param x_names: Name of the arguments that should appear on the x axis of the plot. If the list contains more than one element, all the arguments are assumed to have the same value.
        :type x_names: List[str]
        :param x_vals: List of values to use for the arguments in :code:`x_names`.
        :type x_vals: List[Any]
        :param line_arg: Argument name for which different values correspond to different lines in the plot.
        :type line_arg: str
        :param line_vals: List of values to use for the arguments in :code:`line_arg`.
        :type line_vals: List[str]
        :param line_names: Label names for the different lines.
        :type line_names: List[str]
        :param plot_name: Name of the plot.
        :type plot_name: str
        :param args: List of arguments to remain fixed throughout the benchmark.
        :type args: List[str]
        :param xlabel: Label for the x axis of the plot.
        :type xlabel: str, optional
        :param ylabel: Label for the y axis of the plot.
        :type ylabel: str, optional
        :param x_log: Whether the x axis should be log scale.
        :type x_log: bool, optional
        :param y_log: Whether the y axis should be log scale.
        :type y_log: bool, optional
        N)Úx_namesÚx_valsÚx_logÚline_argÚ	line_valsÚ
line_namesÚy_logÚstylesÚxlabelÚylabelÚ	plot_nameÚargs)Úselfr^   r_   ra   rb   rc   rh   ri   rf   rg   r`   rd   Úcolorre   r   r   r   Ú__init__É   s    *zBenchmark.__init__)rL   rL   FFNN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__rl   r   r   r   r   r]   Ä   s        r]   c               @   s&   e Zd Zdd„ Zdd„ Zd
dd„Zd	S )ÚMarkc             C   s   || _ || _d S )N)r4   Ú
benchmarks)rj   r4   rr   r   r   r   rl     s    zMark.__init__c          
      sò  dd l }dd lm} dd l}|j}dd„ |jD ƒ}	dd„ |jD ƒ}
|j|jd g| |	 |
 d}xÊ|jD ]À‰ ‡ fdd„|jD ƒ}g g g   }}}xz|jD ]p}| j	f ||j
|i|j—Ž}y|\}}	}
W n$ tk
rê   |d d   }}	}
Y nX ||g7 }||	g7 }||
g7 }qšW ˆ g| | | |jt|ƒ< qjW |jr”| ¡  | ¡ }|jd ‰ x®t|jƒD ] \}}||d  ||d	   }	}
|jr”|j| d nd }|jr®|j| d
 nd }|j|ˆ  || |||d |	d k	r\|
d k	r\|j|ˆ  |	|
d|d q\W | ¡  |jr|jn
d |j¡}| |¡ | |j¡ | |jrHdnd¡ | |jr^dnd¡ |rr|  ¡  |r”| !|j" ||j› d¡¡ ||jd g|j  }|rÆt#|jd ƒ t#|ƒ |rî|j$|j" ||j› d¡ddd d S )Nr   c             S   s   g | ]}|› d ‘qS )z-minr   )r	   r
   r   r   r   r     s    zMark._run.<locals>.<listcomp>c             S   s   g | ]}|› d ‘qS )z-maxr   )r	   r
   r   r   r   r     s    )Úcolumnsc                s   i | ]
}ˆ |“qS r   r   )r	   Zx_name)r
   r   r   ú
<dictcomp>  s    zMark._run.<locals>.<dictcomp>z-minz-maxr   )Úlabelrk   Zlsg333333Ã?)Úalphark   z = ÚlogZlinearz.pngú:z.csvz%.1fF)Zfloat_formatÚindex)%ÚosZmatplotlib.pyplotZpyplotZpandasrc   Z	DataFramer^   r_   rb   r4   ra   ri   Ú	TypeErrorÚlocrH   rh   ZfigureZsubplotÚ	enumeratere   ZplotZfill_betweenZlegendrf   r   Z
set_xlabelZ
set_ylabelrg   Z
set_xscaler`   Z
set_yscalerd   ÚshowZsavefigÚpathÚprintZto_csv)rj   ÚbenchÚ	save_pathÚ
show_plotsÚ
print_datarz   ZpltÚpdZy_meanZy_minZy_maxÚdfZx_argsZrow_meanZrow_minZrow_maxrY   r   Zaxr!   ÚcolZstyrf   r   )r
   r   Ú_run  s^     

 

z	Mark._runFrL   c             C   sˆ   t | jtƒ}|r| jgn| j}|r@ttj |d¡dƒ}| d¡ x4|D ],}|  ||||¡ |rF| d|j	› d¡ qFW |r„| d¡ d S )Nzresults.htmlÚwz<html><body>
z<image src="z.png"/>
z</body></html>
)
rR   rr   r]   Úopenrz   r   r   Úwriterˆ   rh   )rj   rƒ   r„   r‚   Zhas_single_benchrr   Úhtmlr   r   r   r   Úrun:  s    

zMark.runN)FFrL   )rm   rn   ro   rl   rˆ   r   r   r   r   r   rq     s   3rq   c                s   ‡ fdd„}|S )zê
    Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value.

    :param benchmarks: Benchmarking configurations.
    :type benchmarks: List of :class:`Benchmark`
    c                s
   t | ˆ ƒS )N)rq   )r4   )rr   r   r   r   O  r   zperf_report.<locals>.<lambda>r   )rr   Úwrapperr   )rr   r   Úperf_reportH  s    r   c             C   sf   ddl }ddlm} | s tjj} |s.|j ¡ }|j |¡d }|j |¡d }|| d d d	 }|S )
z return DRAM bandwidth in GB/s r   Nr   )ÚdriverZmem_clock_rateZmem_bus_widthé   g    €„.Aé   )	r"   r   r   ÚbackendÚCUDAr   Úcurrent_deviceÚutilsÚget_device_properties)r“   rA   r"   r   Zmem_clock_khzZ	bus_widthZbw_gbpsr   r   r   Úget_dram_gbpsS  s    
r˜   c       
      C   sÔ   dd l }ddlm} |s tjj}|s.|j ¡ }|j |¡d d }|sV|j |¡d }|j 	|¡}|d dk r‚| |j
ks|t‚d}n>| |jkr’d}n.| |j
|jgkr¨d	}n| |jkr¸d
}ntdƒ‚|| | d }	|	S )Nr   r   )r   Úmultiprocessor_counté   Úsm_clock_rater’   é   i   i   zdtype not supportedg•Ö&è.>)r"   r   r   r“   r”   r   r•   r–   r—   Úget_device_capabilityÚfloat16rC   Úfloat32rT   rE   r'   )
r@   r“   rA   Ú
clock_rater"   r   Únum_subcoresÚ
capabilityÚops_per_sub_coreÚtflopsr   r   r   Úget_max_tensorcore_tflopsb  s,    


r¥   c                 s   ‡ fdd„}|S )Nc                s   t  ˆ ¡‡‡ fdd„ƒ}|S )Nc        
         sÚ   dd l }| t ¡ ¡ ¡ }ˆ  ¡ | ¡ k}|rÌ|dkrÌtj ˆjd ¡}tj	d ddœ}d|ksht
dƒ‚|d jjj}|› d	ˆj› d
|› d}tjddd|gd|d}	|	jdks¸t
dƒ‚dt|	jƒksÖt
‚n
ˆ| |Ž d S )Nr   zcuda-memcheckÚ__file__ÚPATHÚ1)r§   ZPYTORCH_NO_CUDA_MEMORY_CACHINGÚrequestz@memcheck'ed test must have a (possibly unused) `request` fixturez::ú[ú]Zpytestz-vsT)Úcapture_outputÚenvz7cuda-memcheck returned an error: bounds checking failedzERROR SUMMARY: 0 errors)ÚpsutilÚProcessrz   ÚgetppidÚnameÚitemsr   ÚrealpathÚ__globals__ÚenvironrC   ÚnodeZcallspecÚidrm   r   r   Ú
returncodeÚstrr   )
ri   Úkwargsr®   Z	ppid_nameZrun_cuda_memcheckr   r­   Ztest_idr   r   )Útarget_kwargsÚtest_fnr   r   rŽ   „  s    z1cuda_memcheck.<locals>.decorator.<locals>.wrapper)Ú	functoolsÚwraps)r¼   rŽ   )r»   )r¼   r   Ú	decoratorƒ  s    z cuda_memcheck.<locals>.decoratorr   )r»   r¿   r   )r»   r   Úcuda_memcheck‚  s    rÀ   c             C   sL   d  | ¡} dddd|  dg}t |¡}| tjj¡ d¡}dd„ |D ƒ}|S )	Nr   z
nvidia-smiz-ir   z--query-gpu=z--format=csv,noheader,nounitsc             S   s   g | ]}t |ƒ‘qS r   )r   )r	   r
   r   r   r   r   ¤  s    znvsmi_attr.<locals>.<listcomp>)r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   Ú
nvsmi_attr™  s    

rÁ   éF  é¿  c             c   s  zÎt  dddddg¡ t  dddd| › d| › g¡ t  dddd|› d|› g¡ td	gƒd
 }tdgƒd
 }t||  ƒdk sŽtd| › dƒ‚t|| ƒdk s®td|› dƒ‚d|  }d| d }||fV  W d t  dddddg¡ t  ddddg¡ t  ddddg¡ X d S )Nz
nvidia-smiz-ir   z-pmr¨   z--lock-gpu-clocks=r   z--lock-memory-clocks=zclocks.current.smr   zclocks.current.memoryé
   zGPU SMs must run at z MHzgÞ 3ßÁOÌ?i   gü©ñÒMbP?z-rgcz-rmc)r   r   rÁ   ÚabsrC   )Zref_sm_clockZref_mem_clockZcur_sm_clockZcur_mem_clockr¤   Zgbpsr   r   r   Úset_gpu_clock¨  s,      rÆ   c       
      C   sÔ   dd l }ddlm} |s tjj}|s.|j ¡ }|j |¡d d }|j |¡d }|j 	¡ }|d dk r’| |j
krxd}qÀ| |jkrˆd	}qÀtd
ƒ‚n.| |j
kr¢d}n| |j|jgkr¸d	}ntd
ƒ‚|| | d }	|	S )Nr   r   )r   r™   rš   r›   r’   é    é@   zdtype not supportedg•Ö&è.>)r"   r   r   r“   r”   r   r•   r–   r—   r   rŸ   rž   r'   rT   )
r@   r“   rA   r"   r   r¡   r    r¢   r£   r¤   r   r   r   Úget_max_simd_tflopsÉ  s,    





rÉ   )r   N)r>   r?   NNTr2   )NNrL   )NN)NNN)rÂ   rÃ   )NN)r½   rz   r   r   Ú
contextlibr   Z_C.libtriton.tritonr   r   r=   rK   r\   r]   rq   r   r˜   r¥   rÀ   rÁ   rÆ   rÉ   r   r   r   r   Ú<module>   s,   	
=   
L
%>F

  