diff options
| -rw-r--r-- | CMakeLists.txt | 8 | ||||
| -rw-r--r-- | config.h | 37 | ||||
| -rw-r--r-- | gdr_test.cu | 48 | ||||
| -rw-r--r-- | ipedma.h | 4 | ||||
| -rw-r--r-- | kernels.cu | 52 | ||||
| -rw-r--r-- | kernels.h | 2 | ||||
| -rwxr-xr-x | test.sh | 12 | 
7 files changed, 119 insertions, 44 deletions
| diff --git a/CMakeLists.txt b/CMakeLists.txt index f3369d9..1290255 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,10 +6,16 @@ set(IPECAMERA_ABI_VERSION "0")  cmake_minimum_required(VERSION 2.6)  list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") -add_definitions("-fPIC --std=gnu99 -Wall -O2 -gdwarf-2 -g3  -fno-omit-frame-pointer") +add_definitions("-fPIC --std=gnu99 -O2 -gdwarf-2 -g3  -fno-omit-frame-pointer")  find_package(CUDA REQUIRED)  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_35,code=sm_35;-rdc=true) + +if (DEFINED SIZE) +message(" * Setting size to ${SIZE}") +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-Xcompiler;-DSIZE=${SIZE}) +endif () +  set(CUDA_SEPARABLE_COMPILATION ON) @@ -1,13 +1,36 @@ -#define ITERS 100 -#define GPU_ITERS 100 +#define VERBOSE +#define GPU_DESC +#define USE_HW_CONTER -#define TLP_SIZE 64  #define GPU_PAGE 65536 -#define PAGE_SIZE 4096 +#define MIN(a, b) (((a) > (b))?(b):(a)) +#define MAX(a, b) (((a) < (b))?(b):(a)) -#define VERBOSE -#define GPU_DESC +#ifdef SIZE +# if SIZE >= 65536 +#   define TLP_SIZE 64 +#   define PAGE_SIZE 65536 +#   define NUM_PAGES (SIZE / GPU_PAGE) +# elif SIZE >= 256 +#   define TLP_SIZE 64 +#   define PAGE_SIZE SIZE +#   define NUM_PAGES 1 +# else +#   define TLP_SIZE (SIZE / 4) +#   define PAGE_SIZE SIZE +#   define NUM_PAGES 1 +# endif -#define USE_HW_CONTER +# define GPU_ITERS MIN(100, MAX(4, 4l * 1024 * 1024 * 1024 / SIZE)) +# define ITERS MIN(100, MAX(10, 4l * 1024 * 1024 * 1024 / SIZE)) +#else  +# define ITERS 100 +# define GPU_ITERS 100 + +# define TLP_SIZE 64 +# define PAGE_SIZE 4096 +# define NUM_PAGES 4 +# define SIZE (NUM_PAGES * PAGE_SIZE) +#endif diff --git a/gdr_test.cu b/gdr_test.cu index cc624fc..d0118d5 100644 --- a/gdr_test.cu +++ b/gdr_test.cu @@ -93,11 +93,11 @@ int main(int argc, char *argv[]) {          char gpu_name[30] = {0};          initAssert (cuDeviceGetName (gpu_name, 30, current_gpu)); -        printf("GPU %i: %s\n", num_gpus, gpu_name); +        printf(" GPU %i: %s\n", num_gpus, gpu_name);          if (strncmp (gpu_name, "Tesla K40", 9) == 0) { -            printf ("Found a Tesla GPU! I'll use that one.\n"); +            printf ("  Found a Tesla GPU! I'll use that one.\n");              gpu = current_gpu;              break;          } @@ -112,12 +112,12 @@ int main(int argc, char *argv[]) {          //(see https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/html/group__CUDART__DRIVER.html)      unsigned int api_version;      initAssert (cuCtxGetApiVersion (context, &api_version)); -    printf ("CUDA API Version: %u\n", api_version); -    printf ("CUDA init done\n\n"); +    printf (" CUDA API Version: %u\n", api_version); +    //printf ("CUDA init done\n\n");      CUdevprop gpu_props;      initAssert(cuDeviceGetProperties(&gpu_props, gpu)); -    printf ("Clock %lu KHz\n", gpu_props.clockRate); +    printf (" GPU Clock %lu KHz\n", gpu_props.clockRate);      CUdeviceptr d_A, d_D;      initAssert(cuMemAlloc(&d_D, GPU_PAGE));	// Should be multiple of GPU page, or mapping of next allocation will segfault the gdrcopy module @@ -149,7 +149,7 @@ int main(int argc, char *argv[]) {      int A_bar_off = A_info.va - d_A;      volatile uint32_t *A = (uint32_t *)((char *)A_bar_ptr + A_bar_off); -    printf("DevicePtr: %lx, GDR ptr: %p, Bus ptr: %lx, (Bar: %p, Offset: %i), VA: 0x%lx, Size: %lu, Page: %lu\n", d_A, A, A_info.bus_addr, A_bar_ptr, A_bar_off, A_info.va, A_info.mapped_size, A_info.page_size); +    printf("\nDevicePtr: %lx, GDR ptr: %p, Bus ptr: %lx, (Bar: %p, Offset: %i), VA: 0x%lx, Size: %lu\n", d_A, A, A_info.bus_addr, A_bar_ptr, A_bar_off, A_info.va, A_info.mapped_size);      pcilib_t *pci;      volatile void *bar; @@ -166,7 +166,7 @@ int main(int argc, char *argv[]) {          printf("map bar\n");          exit(1);      } -    printf("BAR mapped to: %p\n", bar); +    //printf("BAR mapped to: %p\n", bar);      CUdeviceptr dBAR;  //    initAssert (cuMemHostRegister ((void*)((((uintptr_t)bar)/65536)*65536), 65536, CU_MEMHOSTREGISTER_DEVICEMAP)); @@ -176,7 +176,7 @@ int main(int argc, char *argv[]) {      //initAssert (cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dBAR));      bar_info = pcilib_get_bar_info(pci, BAR); -    printf("%p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size); +    printf("Bar: %p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size);      pcilib_kmem_handle_t *kdesc_kmem = pcilib_alloc_kernel_memory (pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);      uintptr_t kdesc_bus = pcilib_kmem_get_block_ba (pci, kdesc_kmem, 0); @@ -196,6 +196,7 @@ int main(int argc, char *argv[]) {      uintptr_t desc_bus = kdesc_bus;  #endif +    printf("\nSize: %lu bytes (%lu %lu-byte descriptors with packet length set to %lu), GPU itertions: %lu, Iterations: %lu\n", SIZE, NUM_PAGES, PAGE_SIZE, TLP_SIZE, GPU_ITERS, ITERS);      memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t));      volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + 2 * sizeof(uint32_t)); @@ -205,16 +206,22 @@ int main(int argc, char *argv[]) {      WR32 (REG_RESET_DMA, 0);      usleep (100000); -    WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE)); -    WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE); -    WR32 (REG_UPDATE_THRESHOLD, 0); -    WR64 (REG_UPDATE_ADDRESS, desc_bus); -    WR32 (REG_DMA, 1); -    WR32 (REG_INTERCONNECT, 0x232); -    WR32 (REG_COUNTER, 1); +    WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE)); +    WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE); +    WR32_sleep (REG_UPDATE_THRESHOLD, 1); +    WR64_sleep (REG_UPDATE_COUNTER, desc_bus); +    WR64_sleep (REG_UPDATE_ADDRESS, desc_bus + DESCRIPTOR_OFFSET); +    WR32_sleep (REG_DMA, 1); +    WR32_sleep (REG_INTERCONNECT, 0x232); +    WR32_sleep (REG_COUNTER, 1); +     +    usleep(100000);  #ifdef VERBOSE -    struct timespec tss, tse, tsk; +    struct timespec tss, tsk; +# ifndef GPU_DESC +    struct timespec tse; +# endif  #else      struct timeval tvs, tve;  #endif /* VERBOSE */ @@ -223,7 +230,7 @@ int main(int argc, char *argv[]) {  	clock_gettime(CLOCK_REALTIME, &tss);  #ifdef GPU_DESC -	ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint64_t*)d_D, (uint32_t*)d_A); +	ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint32_t*)d_D, (uint64_t*)(d_D + DESCRIPTOR_OFFSET), (uint32_t*)d_A);  #else  	WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr);  //    WR64 (REG_DESCRIPTOR_ADDRESS, kbuf_bus); @@ -234,7 +241,8 @@ int main(int argc, char *argv[]) {  	null<<<1, 1>>>((uint32_t*)d_A);  #endif -	cudaDeviceSynchronize(); +	err = cudaDeviceSynchronize(); +	if (err) printf("Oopps, synchronization error %i", err);  	clock_gettime(CLOCK_REALTIME, &tsk); @@ -246,7 +254,7 @@ int main(int argc, char *argv[]) {  # ifdef GPU_DESC  	double lat = 1000. * kbuf[0] / gpu_props.clockRate;  	double latk = 1000. * kbuf[1] / gpu_props.clockRate; -	double latc = ((tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS; +	double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS;  # ifdef USE_HW_CONTER  	double lath = 4. * RD32 (0x20) / 1000;  # else @@ -288,7 +296,7 @@ int main(int argc, char *argv[]) {      pcilib_free_kernel_memory(pci, kdesc_kmem,  KMEM_DEFAULT_FLAGS);      pcilib_close(pci); -    printf("PCI closed\n"); +    printf("\nPCI closed\n");      gdr_unmap(g, A_mh, A_bar_ptr, PAGE_SIZE); @@ -7,10 +7,14 @@  #define REG_DESCRIPTOR_ADDRESS          0x50  #define REG_UPDATE_ADDRESS              0x58  #define REG_UPDATE_THRESHOLD            0x60 +#define REG_UPDATE_COUNTER		0x70  #define REG_INTERCONNECT		0x9048  #define REG_COUNTER                     0x9000 + +#define DESCRIPTOR_OFFSET		256 +  #define WR32(addr, value) *(volatile uint32_t *) (((char*)(bar)) + (addr)) = (value);  #define RD32(addr) (*(volatile uint32_t *) (((char*)(bar)) + (addr)))  #define WR32_sleep(addr, value) *(volatile uint32_t *) (((char*)(bar)) + (addr)) = (value); usleep (100); @@ -7,34 +7,53 @@  __global__ void null(uint32_t *data) {  } +__device__ void ksleep(uint32_t clocks) { +    clock_t start = clock64(), now; -__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t *desc, uint32_t *data) { -    int i; +    do { +	now = clock64(); +    } while ((start < now)&&((now - start) < clocks)); +} + + +__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint32_t *counter, volatile uint64_t *desc, uint32_t *data) { +    int i, j;      clock_t sum = 0, sumk = 0, t1, t2, t3;      for (i = 0; i < GPU_ITERS; i++) {  	long wait = 0; -	desc[1] = 0; +	    // It would not work as we don't know in which order threads/blocks are executed. We also can't push 0 in all threads +	    // as we unsure if it will overwrite. In non-iterative use case we do not need to push zero and it could work. +	    // Single thread of block should poll and, then, we synchronize. Limiting amount of blocks will be good... +	if ((threadIdx.x == 0)&&(blockIdx.x == 0)) { +		// Reset counter +	    //desc[1] = 0; +	    *counter = 0; -	    // Reset counter  #ifdef USE_HW_CONTER -        WR32 (REG_DMA, 0); -	WR32 (REG_PERF_COUNTER, 0); +	    WR32 (REG_DMA, 0); +	    WR32 (REG_PERF_COUNTER, 0); -	WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); +	    for (j = 0; j <  NUM_PAGES; j++) { +		WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); +//	    	ksleep(10000); +	    } -        t1 = clock64(); -	WR32 (REG_DMA, 1); +    	    t1 = clock64(); +	    WR32 (REG_DMA, 1);  #else -        t1 = clock64(); -	WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); +    	    t1 = clock64(); +    	    for (j = 0; j <  NUM_PAGES; j++) { +		WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); +//	    	ksleep(10000); +	    }  #endif - +	}          do {  	    if (++wait > 0x10000) break; -	} while (desc[1] == 0); +	} while (((*counter) < (NUM_PAGES))/*||(desc[1] == 0)*/);  	t2 = clock64(); @@ -50,8 +69,11 @@ __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t  	sumk += t3 - t1;      } -    data[0] = sum / GPU_ITERS; -    data[1] = sumk / GPU_ITERS; +    if ((threadIdx.x == 0)&&(blockIdx.x == 0)) { +	data[0] = sum / GPU_ITERS; +	data[1] = sumk / GPU_ITERS; +	data[2] = *counter; +    }  } @@ -1,5 +1,5 @@  __global__ void null(uint32_t *data); -__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t *desc, uint32_t *data); +__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint32_t *counter, volatile uint64_t *desc, uint32_t *data); @@ -0,0 +1,12 @@ +#! /bin/bash + +rm results.txt +date > results.txt + +for pow in `seq 7 30`; do +    size=`echo "2^$pow" | bc` +    echo $size +    cmake -DSIZE=$size +    make +    ./gdr_test >> results.txt +done | 
