5 files changed, 147 insertions, 12 deletions
diff --git a/config.h b/config.h
index a0d29f3..a461638 100644
--- a/config.h
+++ b/config.h
@@ -1,5 +1,6 @@
 #define VERBOSE
-#define GPU_DESC
+//#define GPU_DESC
+#define CUDA8
 
 #define USE_HW_CONTER
 
diff --git a/gdr_test.cu b/gdr_test.cu
index d0118d5..42f7cc5 100644
--- a/gdr_test.cu
+++ b/gdr_test.cu
@@ -14,6 +14,7 @@
 #include <pcilib/bar.h>
 #include <pcilib/kmem.h>
 
+#include "gpudirect.h"
 #include "config.h"
 #include "ipedma.h"
 #include "kernels.h"
@@ -119,9 +120,17 @@ int main(int argc, char *argv[]) {
     initAssert(cuDeviceGetProperties(&gpu_props, gpu));
     printf (" GPU Clock %lu KHz\n", gpu_props.clockRate);
 
-    CUdeviceptr d_A, d_D;
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+    
+    cudaEvent_t events[GPU_ITERS];
+    for (int i = 0; i < GPU_ITERS; i++)
+	cudaEventCreate(&events[i]);
+
+    CUdeviceptr d_A, d_D, d_RES;
     initAssert(cuMemAlloc(&d_D, GPU_PAGE));	// Should be multiple of GPU page, or mapping of next allocation will segfault the gdrcopy module
     initAssert(cuMemAlloc(&d_A, PAGE_SIZE));
+    initAssert(cuMemAlloc(&d_RES, GPU_ITERS * sizeof(uint64_t)));
     unsigned int flag = 1;
     initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_D));
     initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_A));
@@ -199,7 +208,7 @@ int main(int argc, char *argv[]) {
     printf("\nSize: %lu bytes (%lu %lu-byte descriptors with packet length set to %lu), GPU itertions: %lu, Iterations: %lu\n", SIZE, NUM_PAGES, PAGE_SIZE, TLP_SIZE, GPU_ITERS, ITERS);
 
     memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t));
-    volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + 2 * sizeof(uint32_t));
+    volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t));
 
     WR32 (REG_RESET_DMA, 1);
     usleep (100000);
@@ -209,7 +218,7 @@ int main(int argc, char *argv[]) {
     WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE));
     WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
     WR32_sleep (REG_UPDATE_THRESHOLD, 1);
-    WR64_sleep (REG_UPDATE_COUNTER, desc_bus);
+    WR64_sleep (REG_UPDATE_COUNTER, D_info.bus_addr);
     WR64_sleep (REG_UPDATE_ADDRESS, desc_bus + DESCRIPTOR_OFFSET);
     WR32_sleep (REG_DMA, 1);
     WR32_sleep (REG_INTERCONNECT, 0x232);
@@ -232,14 +241,48 @@ int main(int argc, char *argv[]) {
 #ifdef GPU_DESC
 	ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint32_t*)d_D, (uint64_t*)(d_D + DESCRIPTOR_OFFSET), (uint32_t*)d_A);
 #else
+#ifdef CUDA8
+	cudaDeviceSynchronize();
+	
+	*(uint32_t*)D = 0;
+	WR32 (REG_DMA, 0);
+	WR32 (REG_PERF_COUNTER, 0);
+
+/*
+	for (int j = 0; j < GPU_ITERS; j++) {
+	    WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr);
+	}
+	WR32 (REG_DMA, 1);
+	usleep(10000);
+*/
+
 	WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr);
+
+	for (int j = 0; j < GPU_ITERS; j++) {
+	    initAssert(cuStreamWaitValue32(stream, d_D, j + 1, CU_STREAM_WAIT_VALUE_GEQ|CU_STREAM_WAIT_VALUE_FLUSH));
+	    measure<<<1, 1, 0, stream>>>(j, (void*)dBAR, A_info.bus_addr, (uint64_t*)d_RES,  (uint32_t*)d_A);
+	    cudaEventRecord(events[j], stream);
+	}
+//	printf("D = %u\n", *(uint32_t*)D);
+
+	    // Wait until all is pushed down.
+	usleep(1000);
+
+	clock_gettime(CLOCK_REALTIME, &tss);
+	WR32 (REG_DMA, 1);
+	memcpy(&tse, &tss, sizeof(struct timeval));
+
+# else
 //    WR64 (REG_DESCRIPTOR_ADDRESS, kbuf_bus);
+	WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr);
+
 
 	do {
 	} while (*hwaddr == 0);
 	clock_gettime(CLOCK_REALTIME, &tse);
 
 	null<<<1, 1>>>((uint32_t*)d_A);
+# endif
 #endif
 	err = cudaDeviceSynchronize();
 	if (err) printf("Oopps, synchronization error %i", err);
@@ -251,23 +294,57 @@ int main(int argc, char *argv[]) {
 #ifdef VERBOSE
         initAssert(cuMemcpyDtoH((void*)kbuf, d_A, PAGE_SIZE));
 
-# ifdef GPU_DESC
-	double lat = 1000. * kbuf[0] / gpu_props.clockRate;
-	double latk = 1000. * kbuf[1] / gpu_props.clockRate;
-	double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS;
 # ifdef USE_HW_CONTER
 	double lath = 4. * RD32 (0x20) / 1000;
 # else
 	double lath = 0;
 # endif
-#else
+
+	double disp = 0, min = 1E+10, max = 0;
+	long num = 0;
+# ifdef GPU_DESC
+	double lat = 1000. * kbuf[0] / gpu_props.clockRate;
+	double latk = 1000. * kbuf[1] / gpu_props.clockRate;
+	double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS;
+# else
 	double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
 	double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.;
 	double latc = 0;
-	double lath = 0;
-#endif
+#ifdef CUDA8
+	uint64_t res[GPU_ITERS];
+	cudaMemcpy(res, (void*)d_RES, GPU_ITERS * sizeof(uint64_t),  cudaMemcpyDeviceToHost);
+	printf("Iterations (us):");
+
+	for (int j = 1; j < GPU_ITERS; j++) {
+	    float ms;
+	    cudaEventElapsedTime(&ms, events[j - 1], events[j]);
+	    double lati = ms * 1000.;
+//	    double lati = 4. * (res[j] - res[j - 1]) / 1000;
+//	    double lati = 1000. * (res[j] - res[j - 1]) / gpu_props.clockRate; 
+
+	    lat += lati;
+	    if (j > 1) disp += pow(lat - lati * j, 2) / (j * (j - 1));
+
+	    if (lati > max) max = lati;
+	    if (lati < min) min = lati;
+	    if (lati > 11) num++;
+
+	    printf(" % 6.3lf", lati);
+	}
+	printf("\n");
+	
+	
+	lat /= GPU_ITERS;
+	latk /= GPU_ITERS;
+	latc /= GPU_ITERS;
+	lath /= GPU_ITERS;
+	
+	disp = sqrt(disp / (GPU_ITERS - 1));
+	
+#endif 
+# endif
 
-	printf("hw: % 6.3lf us, sw: % 6.3lf us, +krn: % 6.3lf us, total: % 7.3lf us: %x %x %x %x\n", lath, lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]);
+	printf("hw: % 8.3lf us, sw: % 8.3lf us (% 8.3lf - % 8.3lf / % 8.3lf / % 3lu), +krn: % 8.3lf us, total: % 8.3lf us: %x %x %x %x\n", lath, lat, min, max, disp, num, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]);
 #else
 	if (!i)  gettimeofday(&tvs, NULL);
 #endif /* VERBOSE */
@@ -306,7 +383,14 @@ int main(int argc, char *argv[]) {
     gdr_unpin_buffer(g, D_mh);
 
     gdr_close(g);
+    cuMemFree(d_RES);
     cuMemFree(d_A);
     cuMemFree(d_D);
+
+    for (int i = 0; i < GPU_ITERS; i++)
+	cudaEventDestroy(events[i]);
+
+    cudaStreamDestroy(stream);
+
     printf("GDR closed\n");
 }
diff --git a/gpudirect.h b/gpudirect.h
new file mode 100644
index 0000000..9449ff9
--- /dev/null
+++ b/gpudirect.h
@@ -0,0 +1,29 @@
+enum CU_STREAM_WAIT_FLAGS {
+    CU_STREAM_WAIT_VALUE_GEQ			=	0x0,
+    CU_STREAM_WAIT_VALUE_EQ			=	0x1,
+    CU_STREAM_WAIT_VALUE_AND			=	0x2,
+    CU_STREAM_WAIT_VALUE_FLUSH			=	1<<30
+};
+
+enum CU_STREAM_WRITE_FLAGS {
+    CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER	=	0x1
+};
+
+enum CU_STREAM_MEM_OP_FLAGS {
+    CU_STREAM_MEM_OP_WAIT_VALUE_32		=	1,
+    CU_STREAM_MEM_OP_WRITE_VALUE_32		=	2,
+    CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES	=	3
+};
+
+# ifdef __cplusplus
+extern "C" {
+# endif
+
+CUresult	cuStreamWaitValue32(CUstream	stream,	CUdeviceptr	addr,	uint32_t	value,	unsigned	int	flags);
+CUresult	cuStreamWriteValue32(CUstream	stream,	CUdeviceptr	addr,	uint32_t	value,	unsigned	int	flags);
+//CUresult	cuStreamBatchMemOp(CUstream	stream,	unsigned	int	count, CUstreamBatchMemOpParams	*paramArray,	unsigned int flags);
+
+
+# ifdef __cplusplus
+}
+# endif
diff --git a/kernels.cu b/kernels.cu
index f4ea114..7e7e689 100644
--- a/kernels.cu
+++ b/kernels.cu
@@ -7,6 +7,24 @@
 __global__ void null(uint32_t *data) {
 }
 
+__global__ void feedback(volatile void *bar, uint32_t *data) {
+    WR32 (REG_PERF_COUNTER, 1);
+}
+
+__global__ void measure(int iter, volatile void *bar, uint64_t bus_addr, uint64_t *res, uint32_t *data) {
+	// Clocks are incorrect as not running while waiting.
+    //res[iter] = clock64();
+	// Thats does not work either (no RD support in kernels?)
+    //res[iter] = RD32(REG_PERF_COUNTER);
+
+    WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
+
+    if ((iter + 1) == GPU_ITERS) {
+	WR32 (REG_PERF_COUNTER, 1);
+    }
+}
+
+
 __device__ void ksleep(uint32_t clocks) {
     clock_t start = clock64(), now;
 
diff --git a/kernels.h b/kernels.h
index 9158401..12eaa5e 100644
--- a/kernels.h
+++ b/kernels.h
@@ -1,4 +1,7 @@
 __global__ void null(uint32_t *data);
+__global__ void feedback(volatile void *bar, uint32_t *data);
+__global__ void measure(int iter, volatile void *bar, uint64_t bus_addr, uint64_t *res, uint32_t *data);
+
 
 __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint32_t *counter, volatile uint64_t *desc, uint32_t *data);