diff options
Diffstat (limited to 'kernels.cu')
-rw-r--r-- | kernels.cu | 52 |
1 files changed, 37 insertions, 15 deletions
@@ -7,34 +7,53 @@ __global__ void null(uint32_t *data) { } +__device__ void ksleep(uint32_t clocks) { + clock_t start = clock64(), now; -__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t *desc, uint32_t *data) { - int i; + do { + now = clock64(); + } while ((start < now)&&((now - start) < clocks)); +} + + +__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint32_t *counter, volatile uint64_t *desc, uint32_t *data) { + int i, j; clock_t sum = 0, sumk = 0, t1, t2, t3; for (i = 0; i < GPU_ITERS; i++) { long wait = 0; - desc[1] = 0; + // It would not work as we don't know in which order threads/blocks are executed. We also can't push 0 in all threads + // as we unsure if it will overwrite. In non-iterative use case we do not need to push zero and it could work. + // Single thread of block should poll and, then, we synchronize. Limiting amount of blocks will be good... + if ((threadIdx.x == 0)&&(blockIdx.x == 0)) { + // Reset counter + //desc[1] = 0; + *counter = 0; - // Reset counter #ifdef USE_HW_CONTER - WR32 (REG_DMA, 0); - WR32 (REG_PERF_COUNTER, 0); + WR32 (REG_DMA, 0); + WR32 (REG_PERF_COUNTER, 0); - WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); + for (j = 0; j < NUM_PAGES; j++) { + WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); +// ksleep(10000); + } - t1 = clock64(); - WR32 (REG_DMA, 1); + t1 = clock64(); + WR32 (REG_DMA, 1); #else - t1 = clock64(); - WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); + t1 = clock64(); + for (j = 0; j < NUM_PAGES; j++) { + WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); +// ksleep(10000); + } #endif - + } do { if (++wait > 0x10000) break; - } while (desc[1] == 0); + } while (((*counter) < (NUM_PAGES))/*||(desc[1] == 0)*/); t2 = clock64(); @@ -50,8 +69,11 @@ __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t sumk += t3 - t1; } - data[0] = sum / GPU_ITERS; - data[1] = sumk / GPU_ITERS; + if ((threadIdx.x == 0)&&(blockIdx.x == 0)) { + data[0] = sum / GPU_ITERS; + data[1] = sumk / GPU_ITERS; + data[2] = *counter; + } } |