diff options
| author | root <root@ipepdvdev1.ipe.kit.edu> | 2016-05-24 00:04:46 +0200 | 
|---|---|---|
| committer | root <root@ipepdvdev1.ipe.kit.edu> | 2016-05-24 00:04:46 +0200 | 
| commit | ca1b1cea796bcfaeb86f201cf35065a606921cc1 (patch) | |
| tree | 9dbe7a85bad655dbbc7852149c749d9e85573601 | |
| parent | 16e0aeeed527f8452e336685f664d7aa848702d3 (diff) | |
| download | gdrtest-ca1b1cea796bcfaeb86f201cf35065a606921cc1.tar.gz gdrtest-ca1b1cea796bcfaeb86f201cf35065a606921cc1.tar.bz2 gdrtest-ca1b1cea796bcfaeb86f201cf35065a606921cc1.tar.xz gdrtest-ca1b1cea796bcfaeb86f201cf35065a606921cc1.zip | |
Read hw counter
| -rw-r--r-- | config.h | 6 | ||||
| -rw-r--r-- | gdr_test.cu | 11 | ||||
| -rw-r--r-- | ipedma.h | 15 | ||||
| -rw-r--r-- | kernels.cu | 16 | 
4 files changed, 39 insertions, 9 deletions
| @@ -1,9 +1,13 @@  #define ITERS 100 -#define GPU_ITERS 1000 +#define GPU_ITERS 100 +  #define TLP_SIZE 64  #define GPU_PAGE 65536 +  #define PAGE_SIZE 4096  #define VERBOSE  #define GPU_DESC + +#define USE_HW_CONTER diff --git a/gdr_test.cu b/gdr_test.cu index 13af482..cc624fc 100644 --- a/gdr_test.cu +++ b/gdr_test.cu @@ -172,6 +172,8 @@ int main(int argc, char *argv[]) {  //    initAssert (cuMemHostRegister ((void*)((((uintptr_t)bar)/65536)*65536), 65536, CU_MEMHOSTREGISTER_DEVICEMAP));      initAssert (cuMemHostRegister ((void*)bar, 4096, CU_MEMHOSTREGISTER_IOMEMORY));      initAssert (cuMemHostGetDevicePointer(&dBAR, (void*)bar, 0)); +	// no effect +    //initAssert (cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dBAR));      bar_info = pcilib_get_bar_info(pci, BAR);      printf("%p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size); @@ -208,6 +210,7 @@ int main(int argc, char *argv[]) {      WR32 (REG_UPDATE_THRESHOLD, 0);      WR64 (REG_UPDATE_ADDRESS, desc_bus);      WR32 (REG_DMA, 1); +    WR32 (REG_INTERCONNECT, 0x232);      WR32 (REG_COUNTER, 1);  #ifdef VERBOSE @@ -244,13 +247,19 @@ int main(int argc, char *argv[]) {  	double lat = 1000. * kbuf[0] / gpu_props.clockRate;  	double latk = 1000. * kbuf[1] / gpu_props.clockRate;  	double latc = ((tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS; +# ifdef USE_HW_CONTER +	double lath = 4. * RD32 (0x20) / 1000; +# else +	double lath = 0; +# endif  #else  	double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;  	double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.;  	double latc = 0; +	double lath = 0;  #endif -	printf("Latency: %.3lf us / %.3lf us (%.3lf us) %x %x %x %x\n", lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]); +	printf("hw: % 6.3lf us, sw: % 6.3lf us, +krn: % 6.3lf us, total: % 7.3lf us: %x %x %x %x\n", lath, lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]);  #else  	if (!i)  gettimeofday(&tvs, NULL);  #endif /* VERBOSE */ @@ -1,20 +1,21 @@  #define REG_RESET_DMA                   0x00  #define REG_DMA                         0x04  #define REG_NUM_PACKETS_PER_DESCRIPTOR  0x10 -#define REG_PERF_COUNTER                0x28 +#define REG_PERF_COUNTER		0x20 +//#define REG_PERF_COUNTER                0x28  #define REG_PACKET_LENGTH               0x0C  #define REG_DESCRIPTOR_ADDRESS          0x50  #define REG_UPDATE_ADDRESS              0x58  #define REG_UPDATE_THRESHOLD            0x60 - +#define REG_INTERCONNECT		0x9048  #define REG_COUNTER                     0x9000 -#define WR32(addr, value) *(uint32_t *) (((char*)(bar)) + (addr)) = (value); -#define RD32(addr) (*(uint32_t *) (((char*)(bar)) + (addr))) -#define WR32_sleep(addr, value) *(uint32_t *) (((char*)(bar)) + (addr)) = (value); usleep (100); +#define WR32(addr, value) *(volatile uint32_t *) (((char*)(bar)) + (addr)) = (value); +#define RD32(addr) (*(volatile uint32_t *) (((char*)(bar)) + (addr))) +#define WR32_sleep(addr, value) *(volatile uint32_t *) (((char*)(bar)) + (addr)) = (value); usleep (100); -#define WR64(addr, value) *(uint64_t *) (((char*)(bar)) + (addr)) = (value); -#define RD64(addr) (*(uint64_t *) (((char*)(bar)) + (addr))) +#define WR64(addr, value) *(volatile uint64_t *) (((char*)(bar)) + (addr)) = (value); +#define RD64(addr) (*(volatile uint64_t *) (((char*)(bar)) + (addr)))  #define WR64_sleep(addr, value) *(uint64_t *) (((char*)(bar)) + (addr)) = (value); usleep (100); @@ -17,8 +17,20 @@ __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t  	desc[1] = 0; +	    // Reset counter +#ifdef USE_HW_CONTER +        WR32 (REG_DMA, 0); +	WR32 (REG_PERF_COUNTER, 0); + +	WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); + +        t1 = clock64(); +	WR32 (REG_DMA, 1); +#else          t1 = clock64();  	WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); +#endif +          do {  	    if (++wait > 0x10000) break; @@ -26,6 +38,10 @@ __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t  	t2 = clock64(); +#ifdef USE_HW_CONTER +	WR32 (REG_PERF_COUNTER, 1); +#endif +  	null<<<1,1>>>(data);  	cudaDeviceSynchronize();  	t3 = clock64(); | 
