diff options
| -rw-r--r-- | config.h | 3 | ||||
| -rw-r--r-- | gdr_test.cu | 106 | ||||
| -rw-r--r-- | gpudirect.h | 29 | ||||
| -rw-r--r-- | kernels.cu | 18 | ||||
| -rw-r--r-- | kernels.h | 3 | 
5 files changed, 147 insertions, 12 deletions
| @@ -1,5 +1,6 @@  #define VERBOSE -#define GPU_DESC +//#define GPU_DESC +#define CUDA8  #define USE_HW_CONTER diff --git a/gdr_test.cu b/gdr_test.cu index d0118d5..42f7cc5 100644 --- a/gdr_test.cu +++ b/gdr_test.cu @@ -14,6 +14,7 @@  #include <pcilib/bar.h>  #include <pcilib/kmem.h> +#include "gpudirect.h"  #include "config.h"  #include "ipedma.h"  #include "kernels.h" @@ -119,9 +120,17 @@ int main(int argc, char *argv[]) {      initAssert(cuDeviceGetProperties(&gpu_props, gpu));      printf (" GPU Clock %lu KHz\n", gpu_props.clockRate); -    CUdeviceptr d_A, d_D; +    cudaStream_t stream; +    cudaStreamCreate(&stream); +     +    cudaEvent_t events[GPU_ITERS]; +    for (int i = 0; i < GPU_ITERS; i++) +	cudaEventCreate(&events[i]); + +    CUdeviceptr d_A, d_D, d_RES;      initAssert(cuMemAlloc(&d_D, GPU_PAGE));	// Should be multiple of GPU page, or mapping of next allocation will segfault the gdrcopy module      initAssert(cuMemAlloc(&d_A, PAGE_SIZE)); +    initAssert(cuMemAlloc(&d_RES, GPU_ITERS * sizeof(uint64_t)));      unsigned int flag = 1;      initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_D));      initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_A)); @@ -199,7 +208,7 @@ int main(int argc, char *argv[]) {      printf("\nSize: %lu bytes (%lu %lu-byte descriptors with packet length set to %lu), GPU itertions: %lu, Iterations: %lu\n", SIZE, NUM_PAGES, PAGE_SIZE, TLP_SIZE, GPU_ITERS, ITERS);      memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t)); -    volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + 2 * sizeof(uint32_t)); +    volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t));      WR32 (REG_RESET_DMA, 1);      usleep (100000); @@ -209,7 +218,7 @@ int main(int argc, char *argv[]) {      WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE));      WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);      WR32_sleep (REG_UPDATE_THRESHOLD, 1); -    WR64_sleep (REG_UPDATE_COUNTER, desc_bus); +    WR64_sleep (REG_UPDATE_COUNTER, D_info.bus_addr);      WR64_sleep (REG_UPDATE_ADDRESS, desc_bus + DESCRIPTOR_OFFSET);      WR32_sleep (REG_DMA, 1);      WR32_sleep (REG_INTERCONNECT, 0x232); @@ -232,14 +241,48 @@ int main(int argc, char *argv[]) {  #ifdef GPU_DESC  	ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint32_t*)d_D, (uint64_t*)(d_D + DESCRIPTOR_OFFSET), (uint32_t*)d_A);  #else +#ifdef CUDA8 +	cudaDeviceSynchronize(); +	 +	*(uint32_t*)D = 0; +	WR32 (REG_DMA, 0); +	WR32 (REG_PERF_COUNTER, 0); + +/* +	for (int j = 0; j < GPU_ITERS; j++) { +	    WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); +	} +	WR32 (REG_DMA, 1); +	usleep(10000); +*/ +  	WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); + +	for (int j = 0; j < GPU_ITERS; j++) { +	    initAssert(cuStreamWaitValue32(stream, d_D, j + 1, CU_STREAM_WAIT_VALUE_GEQ|CU_STREAM_WAIT_VALUE_FLUSH)); +	    measure<<<1, 1, 0, stream>>>(j, (void*)dBAR, A_info.bus_addr, (uint64_t*)d_RES,  (uint32_t*)d_A); +	    cudaEventRecord(events[j], stream); +	} +//	printf("D = %u\n", *(uint32_t*)D); + +	    // Wait until all is pushed down. +	usleep(1000); + +	clock_gettime(CLOCK_REALTIME, &tss); +	WR32 (REG_DMA, 1); +	memcpy(&tse, &tss, sizeof(struct timeval)); + +# else  //    WR64 (REG_DESCRIPTOR_ADDRESS, kbuf_bus); +	WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); +  	do {  	} while (*hwaddr == 0);  	clock_gettime(CLOCK_REALTIME, &tse);  	null<<<1, 1>>>((uint32_t*)d_A); +# endif  #endif  	err = cudaDeviceSynchronize();  	if (err) printf("Oopps, synchronization error %i", err); @@ -251,23 +294,57 @@ int main(int argc, char *argv[]) {  #ifdef VERBOSE          initAssert(cuMemcpyDtoH((void*)kbuf, d_A, PAGE_SIZE)); -# ifdef GPU_DESC -	double lat = 1000. * kbuf[0] / gpu_props.clockRate; -	double latk = 1000. * kbuf[1] / gpu_props.clockRate; -	double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS;  # ifdef USE_HW_CONTER  	double lath = 4. * RD32 (0x20) / 1000;  # else  	double lath = 0;  # endif -#else + +	double disp = 0, min = 1E+10, max = 0; +	long num = 0; +# ifdef GPU_DESC +	double lat = 1000. * kbuf[0] / gpu_props.clockRate; +	double latk = 1000. * kbuf[1] / gpu_props.clockRate; +	double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS; +# else  	double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;  	double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.;  	double latc = 0; -	double lath = 0; -#endif +#ifdef CUDA8 +	uint64_t res[GPU_ITERS]; +	cudaMemcpy(res, (void*)d_RES, GPU_ITERS * sizeof(uint64_t),  cudaMemcpyDeviceToHost); +	printf("Iterations (us):"); + +	for (int j = 1; j < GPU_ITERS; j++) { +	    float ms; +	    cudaEventElapsedTime(&ms, events[j - 1], events[j]); +	    double lati = ms * 1000.; +//	    double lati = 4. * (res[j] - res[j - 1]) / 1000; +//	    double lati = 1000. * (res[j] - res[j - 1]) / gpu_props.clockRate;  + +	    lat += lati; +	    if (j > 1) disp += pow(lat - lati * j, 2) / (j * (j - 1)); + +	    if (lati > max) max = lati; +	    if (lati < min) min = lati; +	    if (lati > 11) num++; + +	    printf(" % 6.3lf", lati); +	} +	printf("\n"); +	 +	 +	lat /= GPU_ITERS; +	latk /= GPU_ITERS; +	latc /= GPU_ITERS; +	lath /= GPU_ITERS; +	 +	disp = sqrt(disp / (GPU_ITERS - 1)); +	 +#endif  +# endif -	printf("hw: % 6.3lf us, sw: % 6.3lf us, +krn: % 6.3lf us, total: % 7.3lf us: %x %x %x %x\n", lath, lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]); +	printf("hw: % 8.3lf us, sw: % 8.3lf us (% 8.3lf - % 8.3lf / % 8.3lf / % 3lu), +krn: % 8.3lf us, total: % 8.3lf us: %x %x %x %x\n", lath, lat, min, max, disp, num, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]);  #else  	if (!i)  gettimeofday(&tvs, NULL);  #endif /* VERBOSE */ @@ -306,7 +383,14 @@ int main(int argc, char *argv[]) {      gdr_unpin_buffer(g, D_mh);      gdr_close(g); +    cuMemFree(d_RES);      cuMemFree(d_A);      cuMemFree(d_D); + +    for (int i = 0; i < GPU_ITERS; i++) +	cudaEventDestroy(events[i]); + +    cudaStreamDestroy(stream); +      printf("GDR closed\n");  } diff --git a/gpudirect.h b/gpudirect.h new file mode 100644 index 0000000..9449ff9 --- /dev/null +++ b/gpudirect.h @@ -0,0 +1,29 @@ +enum CU_STREAM_WAIT_FLAGS { +    CU_STREAM_WAIT_VALUE_GEQ			=	0x0, +    CU_STREAM_WAIT_VALUE_EQ			=	0x1, +    CU_STREAM_WAIT_VALUE_AND			=	0x2, +    CU_STREAM_WAIT_VALUE_FLUSH			=	1<<30 +}; + +enum CU_STREAM_WRITE_FLAGS { +    CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER	=	0x1 +}; + +enum CU_STREAM_MEM_OP_FLAGS { +    CU_STREAM_MEM_OP_WAIT_VALUE_32		=	1, +    CU_STREAM_MEM_OP_WRITE_VALUE_32		=	2, +    CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES	=	3 +}; + +# ifdef __cplusplus +extern "C" { +# endif + +CUresult	cuStreamWaitValue32(CUstream	stream,	CUdeviceptr	addr,	uint32_t	value,	unsigned	int	flags); +CUresult	cuStreamWriteValue32(CUstream	stream,	CUdeviceptr	addr,	uint32_t	value,	unsigned	int	flags); +//CUresult	cuStreamBatchMemOp(CUstream	stream,	unsigned	int	count, CUstreamBatchMemOpParams	*paramArray,	unsigned int flags); + + +# ifdef __cplusplus +} +# endif @@ -7,6 +7,24 @@  __global__ void null(uint32_t *data) {  } +__global__ void feedback(volatile void *bar, uint32_t *data) { +    WR32 (REG_PERF_COUNTER, 1); +} + +__global__ void measure(int iter, volatile void *bar, uint64_t bus_addr, uint64_t *res, uint32_t *data) { +	// Clocks are incorrect as not running while waiting. +    //res[iter] = clock64(); +	// Thats does not work either (no RD support in kernels?) +    //res[iter] = RD32(REG_PERF_COUNTER); + +    WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); + +    if ((iter + 1) == GPU_ITERS) { +	WR32 (REG_PERF_COUNTER, 1); +    } +} + +  __device__ void ksleep(uint32_t clocks) {      clock_t start = clock64(), now; @@ -1,4 +1,7 @@  __global__ void null(uint32_t *data); +__global__ void feedback(volatile void *bar, uint32_t *data); +__global__ void measure(int iter, volatile void *bar, uint64_t bus_addr, uint64_t *res, uint32_t *data); +  __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint32_t *counter, volatile uint64_t *desc, uint32_t *data); | 
