diff options
Diffstat (limited to 'gdr_test.cu')
-rw-r--r-- | gdr_test.cu | 106 |
1 files changed, 95 insertions, 11 deletions
diff --git a/gdr_test.cu b/gdr_test.cu index d0118d5..42f7cc5 100644 --- a/gdr_test.cu +++ b/gdr_test.cu @@ -14,6 +14,7 @@ #include <pcilib/bar.h> #include <pcilib/kmem.h> +#include "gpudirect.h" #include "config.h" #include "ipedma.h" #include "kernels.h" @@ -119,9 +120,17 @@ int main(int argc, char *argv[]) { initAssert(cuDeviceGetProperties(&gpu_props, gpu)); printf (" GPU Clock %lu KHz\n", gpu_props.clockRate); - CUdeviceptr d_A, d_D; + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t events[GPU_ITERS]; + for (int i = 0; i < GPU_ITERS; i++) + cudaEventCreate(&events[i]); + + CUdeviceptr d_A, d_D, d_RES; initAssert(cuMemAlloc(&d_D, GPU_PAGE)); // Should be multiple of GPU page, or mapping of next allocation will segfault the gdrcopy module initAssert(cuMemAlloc(&d_A, PAGE_SIZE)); + initAssert(cuMemAlloc(&d_RES, GPU_ITERS * sizeof(uint64_t))); unsigned int flag = 1; initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_D)); initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_A)); @@ -199,7 +208,7 @@ int main(int argc, char *argv[]) { printf("\nSize: %lu bytes (%lu %lu-byte descriptors with packet length set to %lu), GPU itertions: %lu, Iterations: %lu\n", SIZE, NUM_PAGES, PAGE_SIZE, TLP_SIZE, GPU_ITERS, ITERS); memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t)); - volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + 2 * sizeof(uint32_t)); + volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t)); WR32 (REG_RESET_DMA, 1); usleep (100000); @@ -209,7 +218,7 @@ int main(int argc, char *argv[]) { WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE)); WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE); WR32_sleep (REG_UPDATE_THRESHOLD, 1); - WR64_sleep (REG_UPDATE_COUNTER, desc_bus); + WR64_sleep (REG_UPDATE_COUNTER, D_info.bus_addr); WR64_sleep (REG_UPDATE_ADDRESS, desc_bus + DESCRIPTOR_OFFSET); WR32_sleep (REG_DMA, 1); WR32_sleep (REG_INTERCONNECT, 0x232); @@ -232,14 +241,48 @@ int main(int argc, char *argv[]) { #ifdef GPU_DESC ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint32_t*)d_D, (uint64_t*)(d_D + DESCRIPTOR_OFFSET), (uint32_t*)d_A); #else +#ifdef CUDA8 + cudaDeviceSynchronize(); + + *(uint32_t*)D = 0; + WR32 (REG_DMA, 0); + WR32 (REG_PERF_COUNTER, 0); + +/* + for (int j = 0; j < GPU_ITERS; j++) { + WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); + } + WR32 (REG_DMA, 1); + usleep(10000); +*/ + WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); + + for (int j = 0; j < GPU_ITERS; j++) { + initAssert(cuStreamWaitValue32(stream, d_D, j + 1, CU_STREAM_WAIT_VALUE_GEQ|CU_STREAM_WAIT_VALUE_FLUSH)); + measure<<<1, 1, 0, stream>>>(j, (void*)dBAR, A_info.bus_addr, (uint64_t*)d_RES, (uint32_t*)d_A); + cudaEventRecord(events[j], stream); + } +// printf("D = %u\n", *(uint32_t*)D); + + // Wait until all is pushed down. + usleep(1000); + + clock_gettime(CLOCK_REALTIME, &tss); + WR32 (REG_DMA, 1); + memcpy(&tse, &tss, sizeof(struct timeval)); + +# else // WR64 (REG_DESCRIPTOR_ADDRESS, kbuf_bus); + WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); + do { } while (*hwaddr == 0); clock_gettime(CLOCK_REALTIME, &tse); null<<<1, 1>>>((uint32_t*)d_A); +# endif #endif err = cudaDeviceSynchronize(); if (err) printf("Oopps, synchronization error %i", err); @@ -251,23 +294,57 @@ int main(int argc, char *argv[]) { #ifdef VERBOSE initAssert(cuMemcpyDtoH((void*)kbuf, d_A, PAGE_SIZE)); -# ifdef GPU_DESC - double lat = 1000. * kbuf[0] / gpu_props.clockRate; - double latk = 1000. * kbuf[1] / gpu_props.clockRate; - double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS; # ifdef USE_HW_CONTER double lath = 4. * RD32 (0x20) / 1000; # else double lath = 0; # endif -#else + + double disp = 0, min = 1E+10, max = 0; + long num = 0; +# ifdef GPU_DESC + double lat = 1000. * kbuf[0] / gpu_props.clockRate; + double latk = 1000. * kbuf[1] / gpu_props.clockRate; + double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS; +# else double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.; double latc = 0; - double lath = 0; -#endif +#ifdef CUDA8 + uint64_t res[GPU_ITERS]; + cudaMemcpy(res, (void*)d_RES, GPU_ITERS * sizeof(uint64_t), cudaMemcpyDeviceToHost); + printf("Iterations (us):"); + + for (int j = 1; j < GPU_ITERS; j++) { + float ms; + cudaEventElapsedTime(&ms, events[j - 1], events[j]); + double lati = ms * 1000.; +// double lati = 4. * (res[j] - res[j - 1]) / 1000; +// double lati = 1000. * (res[j] - res[j - 1]) / gpu_props.clockRate; + + lat += lati; + if (j > 1) disp += pow(lat - lati * j, 2) / (j * (j - 1)); + + if (lati > max) max = lati; + if (lati < min) min = lati; + if (lati > 11) num++; + + printf(" % 6.3lf", lati); + } + printf("\n"); + + + lat /= GPU_ITERS; + latk /= GPU_ITERS; + latc /= GPU_ITERS; + lath /= GPU_ITERS; + + disp = sqrt(disp / (GPU_ITERS - 1)); + +#endif +# endif - printf("hw: % 6.3lf us, sw: % 6.3lf us, +krn: % 6.3lf us, total: % 7.3lf us: %x %x %x %x\n", lath, lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]); + printf("hw: % 8.3lf us, sw: % 8.3lf us (% 8.3lf - % 8.3lf / % 8.3lf / % 3lu), +krn: % 8.3lf us, total: % 8.3lf us: %x %x %x %x\n", lath, lat, min, max, disp, num, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]); #else if (!i) gettimeofday(&tvs, NULL); #endif /* VERBOSE */ @@ -306,7 +383,14 @@ int main(int argc, char *argv[]) { gdr_unpin_buffer(g, D_mh); gdr_close(g); + cuMemFree(d_RES); cuMemFree(d_A); cuMemFree(d_D); + + for (int i = 0; i < GPU_ITERS; i++) + cudaEventDestroy(events[i]); + + cudaStreamDestroy(stream); + printf("GDR closed\n"); } |