Diffstat (limited to 'cuda/2d/util.cu')
-rw-r--r--  cuda/2d/util.cu  244
1 file changed, 244 insertions, 0 deletions
diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu
new file mode 100644
index 0000000..06f6714
--- /dev/null
+++ b/cuda/2d/util.cu
@@ -0,0 +1,244 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include "util.h"
+
+namespace astraCUDA {
+
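+// The copy routines below maintain a zero border around the data on the
+// GPU: volumes get a one-pixel border on all four sides (hence the
+// (width+2)x(height+2) memset and the out_pitch + 1 offset), while
+// sinograms only get a one-pixel border at the left and right of each row.
+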
+bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch,
+ unsigned int width, unsigned int height,
+ float* outD_data, unsigned int out_pitch)
+{
+ // TODO: a full memset isn't necessary. Only the edges.
+ cudaError_t err;
+ err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, sizeof(float)*(width+2), height+2);
+ ASTRA_CUDA_ASSERT(err);
+ err = cudaMemcpy2D(outD_data + out_pitch + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
+ ASTRA_CUDA_ASSERT(err);
+ return true;
+}
+
+bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch,
+ unsigned int width, unsigned int height,
+ float* inD_data, unsigned int in_pitch)
+{
+ cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + (in_pitch + 1), sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
+ ASTRA_CUDA_ASSERT(err);
+ return true;
+}
+
+
+bool copySinogramFromDevice(float* out_data, unsigned int out_pitch,
+ unsigned int width, unsigned int height,
+ float* inD_data, unsigned int in_pitch)
+{
+ cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + 1, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
+ ASTRA_CUDA_ASSERT(err);
+ return true;
+}
+
+bool copySinogramToDevice(const float* in_data, unsigned int in_pitch,
+ unsigned int width, unsigned int height,
+ float* outD_data, unsigned int out_pitch)
+{
+ // TODO: a full memset isn't necessary. Only the edges.
+ cudaError_t err;
+ err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, (width+2)*sizeof(float), height);
+ ASTRA_CUDA_ASSERT(err);
+ err = cudaMemcpy2D(outD_data + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
+ ASTRA_CUDA_ASSERT(err);
+ return true;
+}
+
+
+bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsigned int& pitch)
+{
+ size_t p;
+ cudaError_t ret = cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height);
+ if (ret != cudaSuccess) {
+ reportCudaError(ret);
+ fprintf(stderr, "Failed to allocate %dx%d GPU buffer\n", width, height);
+ return false;
+ }
+
+ assert(p % sizeof(float) == 0);
+
+ pitch = p / sizeof(float);
+
+ return true;
+}
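+
+// Usage sketch (illustrative only; not part of the toolbox API). The
+// padded allocation size is an assumption based on the copy routines
+// above, which expect room for the one-pixel border:
+//
+//   float* D_vol; unsigned int pitch;
+//   if (allocateVolume(D_vol, width+2, height+2, pitch)) {
+//       copyVolumeToDevice(h_data, width, width, height, D_vol, pitch);
+//       // ... run kernels on D_vol ...
+//       copyVolumeFromDevice(h_data, width, width, height, D_vol, pitch);
+//       cudaFree(D_vol);
+//   }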
+
+void zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ cudaError_t err;
+ err = cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height);
+ ASTRA_CUDA_ASSERT(err);
+}
+
+
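+// Block-wide sum reduction, following the classic CUDA SDK reduction
+// pattern. The final stage (tid < 32) relies on implicit warp-synchronous
+// execution, which is why it uses a volatile pointer instead of
+// __syncthreads(); for blockSize <= 32 the launches below allocate twice
+// blockSize floats of shared memory so the unrolled steps stay in bounds.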
+template <unsigned int blockSize>
+__global__ void reduce1D(float *g_idata, float *g_odata, unsigned int n)
+{
+ extern __shared__ float sdata[];
+ unsigned int tid = threadIdx.x;
+
+ // grid-stride accumulation over the n inputs; this kernel is only
+ // launched with a single block, so i starts at tid and advances by
+ // blockSize
+ unsigned int i = blockIdx.x*blockSize + tid;
+ unsigned int gridSize = blockSize*gridDim.x;
+ sdata[tid] = 0;
+ while (i < n) { sdata[tid] += g_idata[i]; i += gridSize; }
+ __syncthreads();
+ if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
+ if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
+ if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
+ if (tid < 32) {
+ volatile float* smem = sdata;
+ if (blockSize >= 64) smem[tid] += smem[tid + 32];
+ if (blockSize >= 32) smem[tid] += smem[tid + 16];
+ if (blockSize >= 16) smem[tid] += smem[tid + 8];
+ if (blockSize >= 8) smem[tid] += smem[tid + 4];
+ if (blockSize >= 4) smem[tid] += smem[tid + 2];
+ if (blockSize >= 2) smem[tid] += smem[tid + 1];
+ }
+ if (tid == 0) g_odata[blockIdx.x] = sdata[0];
+}
+
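+// Computes per-block partial sums of squares over the nx x ny region
+// starting at (padX, padY) of a pitched 2D buffer; each 16x16 block
+// writes one partial result to g_odata.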
+__global__ void reduce2D(float *g_idata, float *g_odata,
+ unsigned int pitch,
+ unsigned int nx, unsigned int ny,
+ unsigned int padX, unsigned int padY)
+{
+ extern __shared__ float sdata[];
+ const unsigned int tidx = threadIdx.x;
+ const unsigned int tidy = threadIdx.y;
+ const unsigned int tid = tidy * 16 + tidx;
+
+ unsigned int x = blockIdx.x*16 + tidx;
+ unsigned int y = blockIdx.y*16 + tidy;
+
+ sdata[tid] = 0;
+
+ if (x >= padX && x < padX + nx) {
+
+ while (y < padY + ny) {
+ if (y >= padY)
+ sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]);
+ y += 16 * gridDim.y;
+ }
+
+ }
+
+ __syncthreads();
+
+ if (tid < 128)
+ sdata[tid] += sdata[tid + 128];
+ __syncthreads();
+
+ if (tid < 64)
+ sdata[tid] += sdata[tid + 64];
+ __syncthreads();
+
+ if (tid < 32) { // 32 is warp size
+ volatile float* smem = sdata;
+ smem[tid] += smem[tid + 32];
+ smem[tid] += smem[tid + 16];
+ smem[tid] += smem[tid + 8];
+ smem[tid] += smem[tid + 4];
+ smem[tid] += smem[tid + 2];
+ smem[tid] += smem[tid + 1];
+ }
+
+ if (tid == 0)
+ g_odata[blockIdx.y * gridDim.x + blockIdx.x] = sdata[0];
+}
+
+float dotProduct2D(float* D_data, unsigned int pitch,
+ unsigned int width, unsigned int height,
+ unsigned int padX, unsigned int padY)
+{
+ // 16x16 thread blocks; each block covers a 16-pixel-wide column strip
+ // and loops over rows in steps of 16*gridDim.y, so one block row
+ // handles up to 128 image rows
+ unsigned int bx = ((width+padX) + 15) / 16;
+ unsigned int by = ((height+padY) + 127) / 128;
+ unsigned int shared_mem2 = sizeof(float) * 16 * 16;
+
+ dim3 dimBlock2(16, 16);
+ dim3 dimGrid2(bx, by);
+
+ float* D_buf;
+ cudaError_t err = cudaMalloc(&D_buf, sizeof(float) * (bx * by + 1) );
+ ASTRA_CUDA_ASSERT(err);
+
+ // Step 1: reduce 2D from image to a single vector, taking sum of squares
+
+ reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height, padX, padY);
+ cudaTextForceKernelsCompletion();
+
+ // Step 2: reduce 1D: add up elements in vector
+ if (bx * by > 512)
+ reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_buf+(bx*by), bx*by);
+ else if (bx * by > 128)
+ reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_buf+(bx*by), bx*by);
+ else if (bx * by > 32)
+ reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_buf+(bx*by), bx*by);
+ else if (bx * by > 8)
+ reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_buf+(bx*by), bx*by);
+ else
+ reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_buf+(bx*by), bx*by);
+
+ float x;
+ err = cudaMemcpy(&x, D_buf+(bx*by), sizeof(float), cudaMemcpyDeviceToHost);
+ ASTRA_CUDA_ASSERT(err);
+
+ cudaTextForceKernelsCompletion();
+
+ cudaFree(D_buf);
+
+ return x;
+}
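+
+// Usage sketch (illustrative): the squared L2 norm of a volume stored
+// with the one-pixel zero border described above:
+//
+//   float norm2 = dotProduct2D(D_vol, pitch, width, height, 1, 1);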
+
+
+bool cudaTextForceKernelsCompletion()
+{
+ cudaError_t returnedCudaError = cudaDeviceSynchronize();
+
+ if(returnedCudaError != cudaSuccess) {
+ fprintf(stderr, "Failed to force completion of cuda kernels: %d: %s.\n", returnedCudaError, cudaGetErrorString(returnedCudaError));
+ return false;
+ }
+
+ return true;
+}
+
+void reportCudaError(cudaError_t err)
+{
+ if(err != cudaSuccess)
+ fprintf(stderr, "CUDA error %d: %s.\n", err, cudaGetErrorString(err));
+}
+
+
+
+}