diff options
author | root <root@ipepdvdev1.ipe.kit.edu> | 2016-05-25 06:35:59 +0200 |
---|---|---|
committer | root <root@ipepdvdev1.ipe.kit.edu> | 2016-05-25 06:35:59 +0200 |
commit | a6f3e96f2cafc183ab29e53007a86bb968d654b8 (patch) | |
tree | 5248f06934289d65e366507890ebac3c7a4816ab /kernels.cu | |
parent | ca1b1cea796bcfaeb86f201cf35065a606921cc1 (diff) | |
download | gdrtest-a6f3e96f2cafc183ab29e53007a86bb968d654b8.tar.gz gdrtest-a6f3e96f2cafc183ab29e53007a86bb968d654b8.tar.bz2 gdrtest-a6f3e96f2cafc183ab29e53007a86bb968d654b8.tar.xz gdrtest-a6f3e96f2cafc183ab29e53007a86bb968d654b8.zip |
Support bigger writes
Diffstat (limited to 'kernels.cu')
-rw-r--r-- | kernels.cu | 52 |
1 files changed, 37 insertions, 15 deletions
@@ -7,34 +7,53 @@ __global__ void null(uint32_t *data) { } +__device__ void ksleep(uint32_t clocks) { + clock_t start = clock64(), now; -__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t *desc, uint32_t *data) { - int i; + do { + now = clock64(); + } while ((start < now)&&((now - start) < clocks)); +} + + +__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint32_t *counter, volatile uint64_t *desc, uint32_t *data) { + int i, j; clock_t sum = 0, sumk = 0, t1, t2, t3; for (i = 0; i < GPU_ITERS; i++) { long wait = 0; - desc[1] = 0; + // It would not work as we don't know in which order threads/blocks are executed. We also can't push 0 in all threads + // as we unsure if it will overwrite. In non-iterative use case we do not need to push zero and it could work. + // Single thread of block should poll and, then, we synchronize. Limiting amount of blocks will be good... + if ((threadIdx.x == 0)&&(blockIdx.x == 0)) { + // Reset counter + //desc[1] = 0; + *counter = 0; - // Reset counter #ifdef USE_HW_CONTER - WR32 (REG_DMA, 0); - WR32 (REG_PERF_COUNTER, 0); + WR32 (REG_DMA, 0); + WR32 (REG_PERF_COUNTER, 0); - WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); + for (j = 0; j < NUM_PAGES; j++) { + WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); +// ksleep(10000); + } - t1 = clock64(); - WR32 (REG_DMA, 1); + t1 = clock64(); + WR32 (REG_DMA, 1); #else - t1 = clock64(); - WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); + t1 = clock64(); + for (j = 0; j < NUM_PAGES; j++) { + WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); +// ksleep(10000); + } #endif - + } do { if (++wait > 0x10000) break; - } while (desc[1] == 0); + } while (((*counter) < (NUM_PAGES))/*||(desc[1] == 0)*/); t2 = clock64(); @@ -50,8 +69,11 @@ __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t sumk += t3 - t1; } - data[0] = sum / GPU_ITERS; - data[1] = sumk / GPU_ITERS; + if ((threadIdx.x == 0)&&(blockIdx.x == 0)) { + data[0] = sum / GPU_ITERS; + data[1] = sumk / GPU_ITERS; + data[2] = *counter; + } } |