From 3a6769465bee7d56d0ddff36613b886446421e07 Mon Sep 17 00:00:00 2001 From: Willem Jan Palenstijn Date: Wed, 16 Apr 2014 11:12:55 +0000 Subject: Remove padding in 2D cuda in favour of Border mode --- cuda/2d/algo.cu | 10 +-- cuda/2d/arith.cu | 182 +++++++++++++++++++++++--------------------------- cuda/2d/arith.h | 37 +++++----- cuda/2d/astra.cu | 44 ++++++------ cuda/2d/cgls.cu | 52 +++++++-------- cuda/2d/darthelper.cu | 62 ++++++++--------- cuda/2d/dataop.cu | 16 ++--- cuda/2d/em.cu | 44 ++++++------ cuda/2d/fan_bp.cu | 28 ++++---- cuda/2d/fan_fp.cu | 26 ++++---- cuda/2d/fft.cu | 8 +-- cuda/2d/fft.h | 4 +- cuda/2d/par_bp.cu | 28 ++++---- cuda/2d/par_fp.cu | 38 +++++------ cuda/2d/sart.cu | 60 ++++++++--------- cuda/2d/sirt.cu | 72 ++++++++++---------- cuda/2d/util.cu | 46 ++++++------- cuda/2d/util.h | 4 +- 18 files changed, 365 insertions(+), 396 deletions(-) (limited to 'cuda/2d') diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu index f04607f..71cbfb3 100644 --- a/cuda/2d/algo.cu +++ b/cuda/2d/algo.cu @@ -214,11 +214,11 @@ bool ReconAlgo::setMaxConstraint(float fMax) bool ReconAlgo::allocateBuffers() { bool ok; - ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); if (!ok) return false; - ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch); if (!ok) { cudaFree(D_volumeData); D_volumeData = 0; @@ -226,7 +226,7 @@ bool ReconAlgo::allocateBuffers() } if (useVolumeMask) { - ok = allocateVolume(D_maskData, dims.iVolWidth+2, dims.iVolHeight+2, maskPitch); + ok = allocateVolume(D_maskData, dims.iVolWidth, dims.iVolHeight, maskPitch); if (!ok) { cudaFree(D_volumeData); cudaFree(D_sinoData); @@ -237,7 +237,7 @@ bool ReconAlgo::allocateBuffers() } if (useSinogramMask) { - ok = allocateVolume(D_smaskData, dims.iProjDets+2, dims.iProjAngles, smaskPitch); + ok = allocateVolume(D_smaskData, dims.iProjDets, dims.iProjAngles, smaskPitch); if (!ok) { cudaFree(D_volumeData); cudaFree(D_sinoData); @@ -271,7 +271,7 @@ bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPit return false; // rescale sinogram to adjust for pixel size - processVol(D_sinoData, fSinogramScale, + processVol(D_sinoData, fSinogramScale, //1.0f/(fPixelSize*fPixelSize), sinoPitch, dims.iProjDets, dims.iProjAngles); diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu index 1ee02ca..42c2c98 100644 --- a/cuda/2d/arith.cu +++ b/cuda/2d/arith.cu @@ -144,14 +144,14 @@ struct opMulMask { -template +template __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -161,14 +161,14 @@ __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, uns } } -template +template __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -178,14 +178,14 @@ __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned } } -template +template __global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -197,14 +197,14 @@ __global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fPa -template +template __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -214,14 +214,14 @@ __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, uns } } -template +template __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -231,14 +231,14 @@ __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned } } -template +template __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -248,14 +248,14 @@ __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, u } } -template +template __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -280,51 +280,51 @@ __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, -template +template void processVolCopy(float* out, unsigned int width, unsigned int height) { float* D_out; unsigned int pitch; - allocateVolume(D_out, width+2, height+2, pitch); + allocateVolume(D_out, width, height, pitch); copyVolumeToDevice(out, width, width, height, D_out, pitch); - processVol(D_out, pitch, width, height); + processVol(D_out, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_out, pitch); cudaFree(D_out); } -template +template void processVolCopy(float* out, float param, unsigned int width, unsigned int height) { float* D_out; unsigned int pitch; - allocateVolume(D_out, width+2, height+2, pitch); + allocateVolume(D_out, width, height, pitch); copyVolumeToDevice(out, width, width, height, D_out, pitch); - processVol(D_out, param, pitch, width, height); + processVol(D_out, param, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_out, pitch); cudaFree(D_out); } -template +template void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height) { float* D_out1; float* D_out2; unsigned int pitch; - allocateVolume(D_out1, width+2, height+2, pitch); + allocateVolume(D_out1, width, height, pitch); copyVolumeToDevice(out1, width, width, height, D_out1, pitch); - allocateVolume(D_out2, width+2, height+2, pitch); + allocateVolume(D_out2, width, height, pitch); copyVolumeToDevice(out2, width, width, height, D_out2, pitch); - processVol(D_out1, D_out2, param1, param2, pitch, width, height); + processVol(D_out1, D_out2, param1, param2, pitch, width, height); copyVolumeFromDevice(out1, width, width, height, D_out1, pitch); copyVolumeFromDevice(out2, width, width, height, D_out2, pitch); @@ -334,19 +334,19 @@ void processVolCopy(float* out1, float* out2, float param1, float param2, unsign } -template +template void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height) { float* D_out; float* D_in; unsigned int pitch; - allocateVolume(D_out, width+2, height+2, pitch); + allocateVolume(D_out, width, height, pitch); copyVolumeToDevice(out, width, width, height, D_out, pitch); - allocateVolume(D_in, width+2, height+2, pitch); + allocateVolume(D_in, width, height, pitch); copyVolumeToDevice(in, width, width, height, D_in, pitch); - processVol(D_out, D_in, pitch, width, height); + processVol(D_out, D_in, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_out, pitch); @@ -354,19 +354,19 @@ void processVolCopy(float* out, const float* in, unsigned int width, unsigned in cudaFree(D_in); } -template +template void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height) { float* D_out; float* D_in; unsigned int pitch; - allocateVolume(D_out, width+2, height+2, pitch); + allocateVolume(D_out, width, height, pitch); copyVolumeToDevice(out, width, width, height, D_out, pitch); - allocateVolume(D_in, width+2, height+2, pitch); + allocateVolume(D_in, width, height, pitch); copyVolumeToDevice(in, width, width, height, D_in, pitch); - processVol(D_out, D_in, param, pitch, width, height); + processVol(D_out, D_in, param, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_out, pitch); @@ -374,7 +374,7 @@ void processVolCopy(float* out, const float* in, float param, unsigned int width cudaFree(D_in); } -template +template void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height) { float* D_out; @@ -382,14 +382,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int float* D_in2; unsigned int pitch; - allocateVolume(D_out, width+2, height+2, pitch); + allocateVolume(D_out, width, height, pitch); copyVolumeToDevice(out, width, width, height, D_out, pitch); - allocateVolume(D_in1, width+2, height+2, pitch); + allocateVolume(D_in1, width, height, pitch); copyVolumeToDevice(in1, width, width, height, D_in1, pitch); - allocateVolume(D_in2, width+2, height+2, pitch); + allocateVolume(D_in2, width, height, pitch); copyVolumeToDevice(in2, width, width, height, D_in2, pitch); - processVol(D_out, D_in1, D_in2, pitch, width, height); + processVol(D_out, D_in1, D_in2, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_out, pitch); @@ -398,7 +398,7 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int cudaFree(D_in2); } -template +template void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height) { float* D_out; @@ -406,14 +406,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param, float* D_in2; unsigned int pitch; - allocateVolume(D_out, width+2, height+2, pitch); + allocateVolume(D_out, width, height, pitch); copyVolumeToDevice(out, width, width, height, D_out, pitch); - allocateVolume(D_in1, width+2, height+2, pitch); + allocateVolume(D_in1, width, height, pitch); copyVolumeToDevice(in1, width, width, height, D_in1, pitch); - allocateVolume(D_in2, width+2, height+2, pitch); + allocateVolume(D_in2, width, height, pitch); copyVolumeToDevice(in2, width, width, height, D_in2, pitch); - processVol(D_out, D_in1, D_in2, param, pitch, width, height); + processVol(D_out, D_in1, D_in2, param, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_out, pitch); @@ -430,80 +430,80 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param, -template +template void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+511)/512); - devtoD<<>>(pfOut, pitch, width, height); + devtoD<<>>(pfOut, pitch, width, height); cudaTextForceKernelsCompletion(); } -template +template void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); - devFtoD<<>>(pfOut, fParam, pitch, width, height); + devFtoD<<>>(pfOut, fParam, pitch, width, height); cudaTextForceKernelsCompletion(); } -template +template void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); - devFFtoDD<<>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height); + devFFtoDD<<>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height); cudaTextForceKernelsCompletion(); } -template +template void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); - devDtoD<<>>(pfOut, pfIn, pitch, width, height); + devDtoD<<>>(pfOut, pfIn, pitch, width, height); cudaTextForceKernelsCompletion(); } -template +template void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); - devDFtoD<<>>(pfOut, pfIn, fParam, pitch, width, height); + devDFtoD<<>>(pfOut, pfIn, fParam, pitch, width, height); cudaTextForceKernelsCompletion(); } -template +template void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); - devDDFtoD<<>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); + devDDFtoD<<>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); cudaTextForceKernelsCompletion(); } -template +template void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); - devDDtoD<<>>(pfOut, pfIn1, pfIn2, pitch, width, height); + devDDtoD<<>>(pfOut, pfIn1, pfIn2, pitch, width, height); cudaTextForceKernelsCompletion(); } @@ -533,7 +533,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devtoD<<>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devtoD<<>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; } @@ -549,7 +549,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devFtoD<<>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devFtoD<<>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; } @@ -566,7 +566,7 @@ void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, flo unsigned int step = out1.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devFFtoDD<<>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devFFtoDD<<>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut1 += step; pfOut2 += step; } @@ -585,7 +585,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDtoD<<>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDtoD<<>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn += step; } @@ -603,7 +603,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDFtoD<<>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDFtoD<<>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn += step; } @@ -622,7 +622,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDDFtoD<<>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDDFtoD<<>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -642,7 +642,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDDtoD<<>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDDtoD<<>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -672,7 +672,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devtoD<<>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devtoD<<>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; } @@ -688,7 +688,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devFtoD<<>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devFtoD<<>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; } @@ -705,7 +705,7 @@ void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, fl unsigned int step = out1.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devFFtoDD<<>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devFFtoDD<<>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut1 += step; pfOut2 += step; } @@ -724,7 +724,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDtoD<<>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDtoD<<>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn += step; } @@ -742,7 +742,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDFtoD<<>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDFtoD<<>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn += step; } @@ -761,7 +761,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDDFtoD<<>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDDFtoD<<>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -781,7 +781,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDDtoD<<>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDDtoD<<>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -808,59 +808,45 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit #define INST_DFtoD(name) \ - template void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height); \ - template void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height); \ - template void processVol(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height); \ + template void processVol(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); #define INST_DtoD(name) \ - template void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height); \ - template void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height); \ - template void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height); \ + template void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); #define INST_DDtoD(name) \ - template void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ - template void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ - template void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ + template void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); #define INST_DDFtoD(name) \ - template void processVolCopy(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ - template void processVolCopy(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ - template void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ + template void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); #define INST_toD(name) \ - template void processVolCopy(float* out, unsigned int width, unsigned int height); \ - template void processVolCopy(float* out, unsigned int width, unsigned int height); \ - template void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out, unsigned int width, unsigned int height); \ + template void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims); #define INST_FtoD(name) \ - template void processVolCopy(float* out, float param, unsigned int width, unsigned int height); \ - template void processVolCopy(float* out, float param, unsigned int width, unsigned int height); \ - template void processVol(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out, float param, unsigned int width, unsigned int height); \ + template void processVol(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D(cudaPitchedPtr& out, float param, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, float param, const SDimensions3D& dims); #define INST_FFtoDD(name) \ - template void processVolCopy(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ - template void processVolCopy(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ - template void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ + template void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); diff --git a/cuda/2d/arith.h b/cuda/2d/arith.h index c8c7b41..d745aef 100644 --- a/cuda/2d/arith.h +++ b/cuda/2d/arith.h @@ -55,28 +55,21 @@ struct opSetMaskedValues; struct opMulMask; - -enum VolType { - SINO = 0, - VOL = 1 -}; - - -template void processVolCopy(float* out, unsigned int width, unsigned int height); -template void processVolCopy(float* out, float param, unsigned int width, unsigned int height); -template void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height); -template void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height); -template void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height); -template void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); -template void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height); - -template void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height); -template void processVol(float* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); -template void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); -template void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); -template void processVol(float* out, const float* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); -template void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); -template void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); +template void processVolCopy(float* out, unsigned int width, unsigned int height); +template void processVolCopy(float* out, float param, unsigned int width, unsigned int height); +template void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height); +template void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height); +template void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height); +template void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); +template void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height); + +template void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height); +template void processVol(float* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); +template void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); +template void processVol(float* out, const float* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); template void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims); template void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu index 2240629..1c2e623 100644 --- a/cuda/2d/astra.cu +++ b/cuda/2d/astra.cu @@ -206,13 +206,13 @@ bool AstraFBP::init(int iGPUIndex) } } - bool ok = allocateVolume(pData->D_volumeData, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2, pData->volumePitch); + bool ok = allocateVolume(pData->D_volumeData, pData->dims.iVolWidth, pData->dims.iVolHeight, pData->volumePitch); if (!ok) { return false; } - ok = allocateVolume(pData->D_sinoData, pData->dims.iProjDets+2, pData->dims.iProjAngles, pData->sinoPitch); + ok = allocateVolume(pData->D_sinoData, pData->dims.iProjDets, pData->dims.iProjAngles, pData->sinoPitch); if (!ok) { cudaFree(pData->D_volumeData); @@ -241,7 +241,7 @@ bool AstraFBP::setSinogram(const float* pfSinogram, return false; // rescale sinogram to adjust for pixel size - processVol(pData->D_sinoData, + processVol(pData->D_sinoData, 1.0f/(pData->fPixelSize*pData->fPixelSize), pData->sinoPitch, pData->dims.iProjDets, pData->dims.iProjAngles); @@ -270,7 +270,7 @@ bool AstraFBP::run() return false; } - zeroVolume(pData->D_volumeData, pData->volumePitch, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2); + zeroVolume(pData->D_volumeData, pData->volumePitch, pData->dims.iVolWidth, pData->dims.iVolHeight); bool ok = false; @@ -283,11 +283,11 @@ bool AstraFBP::run() allocateComplexOnDevice(pData->dims.iProjAngles, iFFTFourDetCount, &pDevComplexSinogram); - runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram); + runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram); applyFilter(pData->dims.iProjAngles, iFFTFourDetCount, pDevComplexSinogram, pData->m_pDevFilter); - runCudaIFFT(pData->dims.iProjAngles, pDevComplexSinogram, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount); + runCudaIFFT(pData->dims.iProjAngles, pDevComplexSinogram, pData->D_sinoData, pData->sinoPitch, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount); freeComplexOnDevice(pDevComplexSinogram); @@ -299,7 +299,7 @@ bool AstraFBP::run() return false; } - processVol(pData->D_volumeData, + processVol(pData->D_volumeData, (M_PI / 2.0f) / (float)pData->dims.iProjAngles, pData->volumePitch, pData->dims.iVolWidth, pData->dims.iVolHeight); @@ -443,7 +443,7 @@ bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* = cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice); delete[] pfHostRealFilter; - runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter); + runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter); cudaFree(pfDevRealFilter); @@ -478,7 +478,7 @@ bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* = cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice); delete[] pfHostRealFilter; - runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter); + runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter); cudaFree(pfDevRealFilter); @@ -515,7 +515,7 @@ bool BPalgo::init() bool BPalgo::iterate(unsigned int) { // TODO: This zeroVolume makes an earlier memcpy of D_volumeData redundant - zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight); callBP(D_volumeData, volumePitch, D_sinoData, sinoPitch); return true; } @@ -525,12 +525,12 @@ float BPalgo::computeDiffNorm() float *D_projData; unsigned int projPitch; - allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch); - cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*dims.iProjDets, dims.iProjAngles, cudaMemcpyDeviceToDevice); callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); - float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); cudaFree(D_projData); @@ -579,14 +579,14 @@ bool astraCudaFP(const float* pfVolume, float* pfSinogram, float* D_volumeData; unsigned int volumePitch; - ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); if (!ok) return false; float* D_sinoData; unsigned int sinoPitch; - ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch); if (!ok) { cudaFree(D_volumeData); return false; @@ -601,7 +601,7 @@ bool astraCudaFP(const float* pfVolume, float* pfSinogram, return false; } - zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles); ok = FP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pfAngles, pfOffsets, 1.0f); if (!ok) { cudaFree(D_volumeData); @@ -666,14 +666,14 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, float* D_volumeData; unsigned int volumePitch; - ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); if (!ok) return false; float* D_sinoData; unsigned int sinoPitch; - ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch); if (!ok) { cudaFree(D_volumeData); return false; @@ -688,7 +688,7 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, return false; } - zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles); // TODO: Turn this geometry conversion into a util function SFanProjection* projs = new SFanProjection[dims.iProjAngles]; @@ -777,14 +777,14 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, float* D_volumeData; unsigned int volumePitch; - ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); if (!ok) return false; float* D_sinoData; unsigned int sinoPitch; - ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch); if (!ok) { cudaFree(D_volumeData); return false; @@ -799,7 +799,7 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, return false; } - zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles); ok = FanFP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pAngles, 1.0f); diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu index 5b1cf46..df8db0b 100644 --- a/cuda/2d/cgls.cu +++ b/cuda/2d/cgls.cu @@ -73,16 +73,16 @@ void CGLS::reset() bool CGLS::init() { // Lifetime of z: within an iteration - allocateVolume(D_z, dims.iVolWidth+2, dims.iVolHeight+2, zPitch); + allocateVolume(D_z, dims.iVolWidth, dims.iVolHeight, zPitch); // Lifetime of p: full algorithm - allocateVolume(D_p, dims.iVolWidth+2, dims.iVolHeight+2, pPitch); + allocateVolume(D_p, dims.iVolWidth, dims.iVolHeight, pPitch); // Lifetime of r: full algorithm - allocateVolume(D_r, dims.iProjDets+2, dims.iProjAngles, rPitch); + allocateVolume(D_r, dims.iProjDets, dims.iProjAngles, rPitch); // Lifetime of w: within an iteration - allocateVolume(D_w, dims.iProjDets+2, dims.iProjAngles, wPitch); + allocateVolume(D_w, dims.iProjDets, dims.iProjAngles, wPitch); // TODO: check if allocations succeeded return true; @@ -120,13 +120,13 @@ bool CGLS::iterate(unsigned int iterations) if (!sliceInitialized) { // copy sinogram - cudaMemcpy2D(D_r, sizeof(float)*rPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + cudaMemcpy2D(D_r, sizeof(float)*rPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice); // r = sino - A*x if (useVolumeMask) { // Use z as temporary storage here since it is unused - cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); - processVol(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); + cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); + processVol(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); callFP(D_z, zPitch, D_r, rPitch, -1.0f); } else { callFP(D_volumeData, volumePitch, D_r, rPitch, -1.0f); @@ -134,13 +134,13 @@ bool CGLS::iterate(unsigned int iterations) // p = A'*r - zeroVolume(D_p, pPitch, dims.iVolWidth+2, dims.iVolHeight+2); + zeroVolume(D_p, pPitch, dims.iVolWidth, dims.iVolHeight); callBP(D_p, pPitch, D_r, rPitch); if (useVolumeMask) - processVol(D_p, D_maskData, pPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_p, D_maskData, pPitch, dims.iVolWidth, dims.iVolHeight); - gamma = dotProduct2D(D_p, pPitch, dims.iVolWidth, dims.iVolHeight, 1, 1); + gamma = dotProduct2D(D_p, pPitch, dims.iVolWidth, dims.iVolHeight); sliceInitialized = true; } @@ -150,32 +150,32 @@ bool CGLS::iterate(unsigned int iterations) for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { // w = A*p - zeroVolume(D_w, wPitch, dims.iProjDets+2, dims.iProjAngles); + zeroVolume(D_w, wPitch, dims.iProjDets, dims.iProjAngles); callFP(D_p, pPitch, D_w, wPitch, 1.0f); // alpha = gamma / - float ww = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + float ww = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles); float alpha = gamma / ww; // x += alpha*p - processVol(D_volumeData, D_p, alpha, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, D_p, alpha, volumePitch, dims.iVolWidth, dims.iVolHeight); // r -= alpha*w - processVol(D_r, D_w, -alpha, rPitch, dims.iProjDets, dims.iProjAngles); + processVol(D_r, D_w, -alpha, rPitch, dims.iProjDets, dims.iProjAngles); // z = A'*r - zeroVolume(D_z, zPitch, dims.iVolWidth+2, dims.iVolHeight+2); + zeroVolume(D_z, zPitch, dims.iVolWidth, dims.iVolHeight); callBP(D_z, zPitch, D_r, rPitch); if (useVolumeMask) - processVol(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); float beta = 1.0f / gamma; - gamma = dotProduct2D(D_z, zPitch, dims.iVolWidth, dims.iVolHeight, 1, 1); + gamma = dotProduct2D(D_z, zPitch, dims.iVolWidth, dims.iVolHeight); beta *= gamma; // p = z + beta*p - processVol(D_p, D_z, beta, pPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_p, D_z, beta, pPitch, dims.iVolWidth, dims.iVolHeight); } @@ -189,12 +189,12 @@ float CGLS::computeDiffNorm() // used outside of iterations. // copy sinogram to w - cudaMemcpy2D(D_w, sizeof(float)*wPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + cudaMemcpy2D(D_w, sizeof(float)*wPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice); // do FP, subtracting projection from sinogram if (useVolumeMask) { - cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); - processVol(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); + cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); + processVol(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); callFP(D_z, zPitch, D_w, wPitch, -1.0f); } else { callFP(D_volumeData, volumePitch, D_w, wPitch, -1.0f); @@ -202,7 +202,7 @@ float CGLS::computeDiffNorm() // compute norm of D_w - float s = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + float s = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles); return sqrt(s); } @@ -264,12 +264,12 @@ int main() dims.iRaysPerDet = 1; unsigned int volumePitch, sinoPitch; - allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); - zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); + zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight); printf("pitch: %u\n", volumePitch); - allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); - zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch); + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles); printf("pitch: %u\n", sinoPitch); unsigned int y, x; diff --git a/cuda/2d/darthelper.cu b/cuda/2d/darthelper.cu index 768d14e..b61eaae 100644 --- a/cuda/2d/darthelper.cu +++ b/cuda/2d/darthelper.cu @@ -33,7 +33,7 @@ $Id$ namespace astraCUDA { // CUDA function for the selection of ROI -__global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsigned int width, unsigned int height) { float x = (float)(threadIdx.x + 16*blockIdx.x); float y = (float)(threadIdx.y + 16*blockIdx.y); @@ -44,7 +44,7 @@ __global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsign if ((x-w)*(x-w) + (y-h)*(y-h) > radius * radius * 0.25f) { float* d = (float*)in; - unsigned int o = (y+padY)*pitch+x+padX; + unsigned int o = y*pitch+x; d[o] = 0.0f; } } @@ -54,12 +54,12 @@ void roiSelect(float* out, float radius, unsigned int width, unsigned int height float* D_data; unsigned int pitch; - allocateVolume(D_data, width+2, height+2, pitch); + allocateVolume(D_data, width, height, pitch); copyVolumeToDevice(out, width, width, height, D_data, pitch); dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); - devRoiSelect<<>>(D_data, radius, pitch, width, height, 1, 1); + devRoiSelect<<>>(D_data, radius, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_data, pitch); @@ -70,7 +70,7 @@ void roiSelect(float* out, float radius, unsigned int width, unsigned int height // CUDA function for the masking of DART with a radius == 1 -__global__ void devDartMask(float* mask, const float* in, unsigned int conn, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devDartMask(float* mask, const float* in, unsigned int conn, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; unsigned int y = threadIdx.y + 16*blockIdx.y; @@ -80,7 +80,7 @@ __global__ void devDartMask(float* mask, const float* in, unsigned int conn, uns float* d = (float*)in; float* m = (float*)mask; - unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. + unsigned int o2 = y*pitch+x; // On this row. unsigned int o1 = o2 - pitch; // On previous row. unsigned int o3 = o2 + pitch; // On next row. @@ -101,7 +101,7 @@ __global__ void devDartMask(float* mask, const float* in, unsigned int conn, uns // CUDA function for the masking of DART with a radius > 1 -__global__ void devDartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devDartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; unsigned int y = threadIdx.y + 16*blockIdx.y; @@ -115,13 +115,13 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con int r = radius; // o2: index of the current center pixel - int o2 = (y+padY)*pitch+x+padX; + int o2 = y*pitch+x; if (conn == 8) // 8-connected { for (int row = -r; row <= r; row++) { - int o1 = (y+padY+row)*pitch+x+padX; + int o1 = (y+row)*pitch+x; for (int col = -r; col <= r; col++) { if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;} @@ -131,7 +131,7 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con else if (conn == 4) // 4-connected { // horizontal - unsigned int o1 = (y+padY)*pitch+x+padX; + unsigned int o1 = y*pitch+x; for (int col = -r; col <= r; col++) { if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;} @@ -140,7 +140,7 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con // vertical for (int row = -r; row <= r; row++) { - unsigned int o1 = (y+padY+row)*pitch+x+padX; + unsigned int o1 = (y+row)*pitch+x; if (d[o1] != d[o2]) {m[o2] = 1.0f; return;} } } @@ -149,7 +149,7 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con // CUDA function for the masking of ADART with a radius == 1 -__global__ void devADartMask(float* mask, const float* in, unsigned int conn, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devADartMask(float* mask, const float* in, unsigned int conn, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; unsigned int y = threadIdx.y + 16*blockIdx.y; @@ -159,7 +159,7 @@ __global__ void devADartMask(float* mask, const float* in, unsigned int conn, un float* d = (float*)in; float* m = (float*)mask; - unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. + unsigned int o2 = y*pitch+x; // On this row. unsigned int o1 = o2 - pitch; // On previous row. unsigned int o3 = o2 + pitch; // On next row. @@ -186,7 +186,7 @@ __global__ void devADartMask(float* mask, const float* in, unsigned int conn, un // CUDA function for the masking of ADART with a radius > 1 -__global__ void devADartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devADartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; unsigned int y = threadIdx.y + 16*blockIdx.y; @@ -199,13 +199,13 @@ __global__ void devADartMaskRadius(float* mask, const float* in, unsigned int co int r = radius; - unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. + unsigned int o2 = y*pitch+x; // On this row. if (conn == 8) { for (int row = -r; row <= r; row++) { - unsigned int o1 = (y+padY+row)*pitch+x+padX; + unsigned int o1 = (y+row)*pitch+x; for (int col = -r; col <= r; col++) { if (d[o1+col] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} @@ -223,7 +223,7 @@ __global__ void devADartMaskRadius(float* mask, const float* in, unsigned int co // vertical for (int row = -r; row <= r; row++) { - unsigned int o1 = (y+padY+row)*pitch+x+padX; + unsigned int o1 = (y+row)*pitch+x; if (d[o1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} } } @@ -237,23 +237,23 @@ void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigne float* D_maskData; unsigned int pitch; - allocateVolume(D_segmentationData, width+2, height+2, pitch); + allocateVolume(D_segmentationData, width, height, pitch); copyVolumeToDevice(segmentation, width, width, height, D_segmentationData, pitch); - allocateVolume(D_maskData, width+2, height+2, pitch); - zeroVolume(D_maskData, pitch, width+2, height+2); + allocateVolume(D_maskData, width, height, pitch); + zeroVolume(D_maskData, pitch, width, height); dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); if (threshold == 1 && radius == 1) - devDartMask<<>>(D_maskData, D_segmentationData, conn, pitch, width, height, 1, 1); + devDartMask<<>>(D_maskData, D_segmentationData, conn, pitch, width, height); else if (threshold > 1 && radius == 1) - devADartMask<<>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height, 1, 1); + devADartMask<<>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height); else if (threshold == 1 && radius > 1) - devDartMaskRadius<<>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height, 1, 1); + devDartMaskRadius<<>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height); else - devADartMaskRadius<<>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height, 1, 1); + devADartMaskRadius<<>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height); copyVolumeFromDevice(mask, width, width, height, D_maskData, pitch); @@ -263,7 +263,7 @@ void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigne } -__global__ void devDartSmoothingRadius(float* out, const float* in, float b, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devDartSmoothingRadius(float* out, const float* in, float b, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; unsigned int y = threadIdx.y + 16*blockIdx.y; @@ -274,13 +274,13 @@ __global__ void devDartSmoothingRadius(float* out, const float* in, float b, uns float* d = (float*)in; float* m = (float*)out; - unsigned int o2 = (y+padY)*pitch+x+padX; + unsigned int o2 = y*pitch+x; int r = radius; float res = -d[o2]; for (int row = -r; row < r; row++) { - unsigned int o1 = (y+padY+row)*pitch+x+padX; + unsigned int o1 = (y+row)*pitch+x; for (int col = -r; col <= r; col++) { res += d[o1+col]; @@ -295,7 +295,7 @@ __global__ void devDartSmoothingRadius(float* out, const float* in, float b, uns } -__global__ void devDartSmoothing(float* out, const float* in, float b, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devDartSmoothing(float* out, const float* in, float b, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; unsigned int y = threadIdx.y + 16*blockIdx.y; @@ -305,7 +305,7 @@ __global__ void devDartSmoothing(float* out, const float* in, float b, unsigned float* d = (float*)in; float* m = (float*)out; - unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. + unsigned int o2 = y*pitch+x; // On this row. unsigned int o1 = o2 - pitch; // On previous row. unsigned int o3 = o2 + pitch; // On next row. @@ -329,9 +329,9 @@ void dartSmoothing(float* out, const float* in, float b, unsigned int radius, un dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); if (radius == 1) - devDartSmoothing<<>>(D_outData, D_inData, b, pitch, width, height, 1, 1); + devDartSmoothing<<>>(D_outData, D_inData, b, pitch, width, height); else - devDartSmoothingRadius<<>>(D_outData, D_inData, b, radius, pitch, width, height, 1, 1); + devDartSmoothingRadius<<>>(D_outData, D_inData, b, radius, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_outData, pitch); diff --git a/cuda/2d/dataop.cu b/cuda/2d/dataop.cu index 68573b2..9dc193e 100644 --- a/cuda/2d/dataop.cu +++ b/cuda/2d/dataop.cu @@ -39,10 +39,10 @@ void operationVolumeMult(float* data1, float* data2, unsigned int width, unsigne float* D_data2; unsigned int pitch; - allocateVolume(D_data1, width+2, height+2, pitch); + allocateVolume(D_data1, width, height, pitch); copyVolumeToDevice(data1, width, width, height, D_data1, pitch); - allocateVolume(D_data2, width+2, height+2, pitch); + allocateVolume(D_data2, width, height, pitch); copyVolumeToDevice(data2, width, width, height, D_data2, pitch); processVol(D_data1, D_data2, pitch, width, height); @@ -59,10 +59,10 @@ void operationVolumeMultScalarMask(float* data, float* mask, float scalar, unsig float* D_mask; unsigned int pitch; - allocateVolume(D_data, width+2, height+2, pitch); + allocateVolume(D_data, width, height, pitch); copyVolumeToDevice(data, width, width, height, D_data, pitch); - allocateVolume(D_mask, width+2, height+2, pitch); + allocateVolume(D_mask, width, height, pitch); copyVolumeToDevice(mask, width, width, height, D_mask, pitch); processVol(D_data, D_mask, scalar, pitch, width, height); @@ -79,7 +79,7 @@ void operationVolumeMultScalar(float* data, float scalar, unsigned int width, un float* D_data; unsigned int pitch; - allocateVolume(D_data, width+2, height+2, pitch); + allocateVolume(D_data, width, height, pitch); copyVolumeToDevice(data, width, width, height, D_data, pitch); processVol(D_data, scalar, pitch, width, height); @@ -96,10 +96,10 @@ void operationVolumeAdd(float* data1, float* data2, unsigned int width, unsigned float* D_data2; unsigned int pitch; - allocateVolume(D_data1, width+2, height+2, pitch); + allocateVolume(D_data1, width, height, pitch); copyVolumeToDevice(data1, width, width, height, D_data1, pitch); - allocateVolume(D_data2, width+2, height+2, pitch); + allocateVolume(D_data2, width, height, pitch); copyVolumeToDevice(data2, width, width, height, D_data2, pitch); processVol(D_data1, D_data2, pitch, width, height); @@ -116,7 +116,7 @@ void operationVolumeAddScalar(float* data, float scalar, unsigned int width, uns float* D_data; unsigned int pitch; - allocateVolume(D_data, width+2, height+2, pitch); + allocateVolume(D_data, width, height, pitch); copyVolumeToDevice(data, width, width, height, D_data, pitch); processVol(D_data, scalar, pitch, width, height); diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu index 74d1bbf..cf3f10c 100644 --- a/cuda/2d/em.cu +++ b/cuda/2d/em.cu @@ -73,14 +73,14 @@ void EM::reset() bool EM::init() { - allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch); - zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); + allocateVolume(D_pixelWeight, dims.iVolWidth, dims.iVolHeight, pixelPitch); + zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); - allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); - zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight); - allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); - zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch); + zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); // We can't precompute pixelWeights when using a volume mask #if 0 @@ -94,22 +94,22 @@ bool EM::init() bool EM::precomputeWeights() { - zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); + zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); #if 0 if (useSinogramMask) { callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch); } else #endif { - processVol(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles); + processVol(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles); callBP(D_pixelWeight, pixelPitch, D_projData, projPitch); } - processVol(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); #if 0 if (useVolumeMask) { // scale pixel weights with mask to zero out masked pixels - processVol(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight); } #endif @@ -129,18 +129,18 @@ bool EM::iterate(unsigned int iterations) for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { // Do FP of volumeData - zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); callFP(D_volumeData, volumePitch, D_projData, projPitch, 1.0f); // Divide sinogram by FP (into projData) - processVol(D_projData, D_sinoData, projPitch, dims.iProjDets, dims.iProjAngles); + processVol(D_projData, D_sinoData, projPitch, dims.iProjDets, dims.iProjAngles); // Do BP of projData into tmpData - zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight); callBP(D_tmpData, tmpPitch, D_projData, projPitch); // Multiply volumeData with tmpData divided by pixel weights - processVol(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); } @@ -150,12 +150,12 @@ bool EM::iterate(unsigned int iterations) float EM::computeDiffNorm() { // copy sinogram to projection data - cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice); // do FP, subtracting projection from sinogram if (useVolumeMask) { - cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); - processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); + processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); } else { callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); @@ -164,7 +164,7 @@ float EM::computeDiffNorm() // compute norm of D_projData - float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); return sqrt(s); } @@ -218,12 +218,12 @@ int main() dims.iRaysPerDet = 1; unsigned int volumePitch, sinoPitch; - allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); - zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); + zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight); printf("pitch: %u\n", volumePitch); - allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); - zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch); + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles); printf("pitch: %u\n", sinoPitch); unsigned int y, x; diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu index 12086c0..9bec25d 100644 --- a/cuda/2d/fan_bp.cu +++ b/cuda/2d/fan_bp.cu @@ -61,12 +61,12 @@ __constant__ float gC_DetUX[g_MaxAngles]; __constant__ float gC_DetUY[g_MaxAngles]; -static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height) +static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder) { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - gT_FanProjTexture.addressMode[0] = cudaAddressModeClamp; - gT_FanProjTexture.addressMode[1] = cudaAddressModeClamp; + gT_FanProjTexture.addressMode[0] = mode; + gT_FanProjTexture.addressMode[1] = mode; gT_FanProjTexture.filterMode = cudaFilterModeLinear; gT_FanProjTexture.normalized = false; @@ -116,12 +116,12 @@ __global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int s const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX; const float fDen = fDetUX * fYD - fDetUY * fXD; - const float fT = fNum / fDen + 1.0f; + const float fT = fNum / fDen; fVal += tex2D(gT_FanProjTexture, fT, fA); fA += 1.0f; } - volData[(Y+1)*volPitch+X+1] += fVal; + volData[Y*volPitch+X] += fVal; } // supersampling version @@ -171,7 +171,7 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX; const float fDen = fDetUX * fYD - fDetUY * fXD; - const float fT = fNum / fDen + 1.0f; + const float fT = fNum / fDen; fVal += tex2D(gT_FanProjTexture, fT, fA); fY -= fSubStep; } @@ -180,7 +180,7 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in fA += 1.0f; } - volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim); + volData[Y*volPitch+X] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim); } @@ -222,7 +222,7 @@ __global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDi const float fT = fNum / fDen; const float fVal = tex2D(gT_FanProjTexture, fT, 0.5f); - volData[(Y+1)*volPitch+X+1] += fVal; + volData[Y*volPitch+X] += fVal; } // Weighted BP for use in fan beam FBP @@ -268,12 +268,12 @@ __global__ void devFanBP_FBPWeighted(float* D_volData, unsigned int volPitch, un const float fWeight = fXD*fXD + fYD*fYD; - const float fT = fNum / fDen + 1.0f; + const float fT = fNum / fDen; fVal += tex2D(gT_FanProjTexture, fT, fA) / fWeight; fA += 1.0f; } - volData[(Y+1)*volPitch+X+1] += fVal; + volData[Y*volPitch+X] += fVal; } @@ -331,7 +331,7 @@ bool FanBP_FBPWeighted(float* D_volumeData, unsigned int volumePitch, // TODO: process angles block by block assert(dims.iProjAngles <= g_MaxAngles); - bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); // transfer angles to constant memory float* tmp = new float[dims.iProjAngles]; @@ -376,7 +376,7 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch, const SDimensions& dims, const SFanProjection* angles) { // only one angle - bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1); + bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1, cudaAddressModeClamp); // transfer angle to constant memory #define TRANSFER_TO_CONSTANT(name) do { cudaMemcpyToSymbol(gC_##name, &(angles[angle].f##name), sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) @@ -442,10 +442,10 @@ int main() #undef ROTATE0 - allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); printf("pitch: %u\n", volumePitch); - allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch); printf("pitch: %u\n", projPitch); unsigned int y, x; diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu index cf9f352..b24029c 100644 --- a/cuda/2d/fan_fp.cu +++ b/cuda/2d/fan_fp.cu @@ -67,8 +67,8 @@ static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned i cudaMallocArray(&dataArray, &channelDesc, width, height); cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice); - gT_FanVolumeTexture.addressMode[0] = cudaAddressModeClamp; - gT_FanVolumeTexture.addressMode[1] = cudaAddressModeClamp; + gT_FanVolumeTexture.addressMode[0] = cudaAddressModeBorder; + gT_FanVolumeTexture.addressMode[1] = cudaAddressModeBorder; gT_FanVolumeTexture.filterMode = cudaFilterModeLinear; gT_FanVolumeTexture.normalized = false; @@ -129,8 +129,8 @@ __global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsig // intersect ray with first slice - float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 1.5f; - float fX = startSlice + 1.5f; + float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 0.5f; + float fX = startSlice + 0.5f; int endSlice = startSlice + g_blockSlices; if (endSlice > dims.iVolWidth) @@ -148,7 +148,7 @@ __global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsig } - projData[angle*projPitch+detector+1] += fVal; + projData[angle*projPitch+detector] += fVal; } @@ -201,8 +201,8 @@ __global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigne // intersect ray with first slice - float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 1.5f; - float fY = startSlice + 1.5f; + float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 0.5f; + float fY = startSlice + 0.5f; int endSlice = startSlice + g_blockSlices; if (endSlice > dims.iVolHeight) @@ -221,7 +221,7 @@ __global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigne } - projData[angle*projPitch+detector+1] += fVal; + projData[angle*projPitch+detector] += fVal; } bool FanFP(float* D_volumeData, unsigned int volumePitch, @@ -233,7 +233,7 @@ bool FanFP(float* D_volumeData, unsigned int volumePitch, assert(dims.iProjAngles <= g_MaxAngles); cudaArray* D_dataArray; - bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight); // transfer angles to constant memory float* tmp = new float[dims.iProjAngles]; @@ -325,10 +325,10 @@ int main() #undef ROTATE0 - allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); printf("pitch: %u\n", volumePitch); - allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch); printf("pitch: %u\n", projPitch); unsigned int y, x; @@ -358,8 +358,8 @@ int main() s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x]; printf("cpu norm: %f\n", s); - //zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); - s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + //zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); + s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); printf("gpu norm: %f\n", s); saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino); diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu index 79e4be7..75bd92c 100644 --- a/cuda/2d/fft.cu +++ b/cuda/2d/fft.cu @@ -209,7 +209,7 @@ bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount, } bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource, - int _iSourcePitch, int _iSourcePadX, int _iProjDets, + int _iSourcePitch, int _iProjDets, int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount, cufftComplex * _pDevTargetComplex) { @@ -221,7 +221,7 @@ bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource, for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) { - const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch + _iSourcePadX; + const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch; float * pfTargetLocation = pfDevRealFFTSource + iProjectionIndex * _iFFTRealDetectorCount; SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice)); @@ -241,7 +241,7 @@ bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource, bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex, float * _pfRealTarget, - int _iTargetPitch, int _iTargetPadX, int _iProjDets, + int _iTargetPitch, int _iProjDets, int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount) { float * pfDevRealFFTTarget = NULL; @@ -264,7 +264,7 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex, for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) { const float * pfSourceLocation = pfDevRealFFTTarget + iProjectionIndex * _iFFTRealDetectorCount; - float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch + _iTargetPadX; + float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch; SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice)); } diff --git a/cuda/2d/fft.h b/cuda/2d/fft.h index 55324e5..d4428a5 100644 --- a/cuda/2d/fft.h +++ b/cuda/2d/fft.h @@ -45,13 +45,13 @@ bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount, cufftComplex * _pDevComplexTarget); bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource, - int _iSourcePitch, int _iSourcePadX, int _iProjDets, + int _iSourcePitch, int _iProjDets, int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount, cufftComplex * _pDevTargetComplex); bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex, float * _pfRealTarget, - int _iTargetPitch, int _iTargetPadX, int _iProjDets, + int _iTargetPitch, int _iProjDets, int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount); void applyFilter(int _iProjectionCount, int _iFreqBinCount, diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu index d202341..fc99102 100644 --- a/cuda/2d/par_bp.cu +++ b/cuda/2d/par_bp.cu @@ -57,12 +57,12 @@ __constant__ float gC_angle_sin[g_MaxAngles]; __constant__ float gC_angle_cos[g_MaxAngles]; __constant__ float gC_angle_offset[g_MaxAngles]; -static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height) +static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder) { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - gT_projTexture.addressMode[0] = cudaAddressModeClamp; - gT_projTexture.addressMode[1] = cudaAddressModeClamp; + gT_projTexture.addressMode[0] = mode; + gT_projTexture.addressMode[1] = mode; gT_projTexture.filterMode = cudaFilterModeLinear; gT_projTexture.normalized = false; @@ -94,7 +94,7 @@ __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int star float fVal = 0.0f; float fA = startAngle + 0.5f; - const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f; + const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f; if (offsets) { @@ -123,7 +123,7 @@ __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int star } - volData[(Y+1)*volPitch+X+1] += fVal; + volData[Y*volPitch+X] += fVal; } // supersampling version @@ -150,7 +150,7 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s float fVal = 0.0f; float fA = startAngle + 0.5f; - const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f; + const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f; if (offsets) { @@ -196,7 +196,7 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s } - volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim); + volData[Y*volPitch+X] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim); } __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset, float angle_sin, float angle_cos, const SDimensions dims) @@ -218,7 +218,7 @@ __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset const float fT = fT_base + fX * angle_cos - fY * angle_sin + offset; const float fVal = tex2D(gT_projTexture, fT, 0.5f); - D_volData[(Y+1)*volPitch+X+1] += fVal; + D_volData[Y*volPitch+X] += fVal; } @@ -232,7 +232,7 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch, float* angle_sin = new float[dims.iProjAngles]; float* angle_cos = new float[dims.iProjAngles]; - bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); for (unsigned int i = 0; i < dims.iProjAngles; ++i) { angle_sin[i] = sinf(angles[i]); @@ -302,8 +302,10 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch, unsigned int angle, const SDimensions& dims, const float* angles, const float* TOffsets) { - // only one angle - bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1); + // Only one angle. + // We need to Clamp to the border pixels instead of to zero, because + // SART weights with ray length. + bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1, cudaAddressModeClamp); float angle_sin = sinf(angles[angle]); float angle_cos = cosf(angles[angle]); @@ -346,10 +348,10 @@ int main() unsigned int volumePitch, projPitch; - allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); printf("pitch: %u\n", volumePitch); - allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch); printf("pitch: %u\n", projPitch); unsigned int y, x; diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu index f811f98..097122b 100644 --- a/cuda/2d/par_fp.cu +++ b/cuda/2d/par_fp.cu @@ -74,8 +74,8 @@ static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned i cudaMallocArray(&dataArray, &channelDesc, width, height); cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice); - gT_volumeTexture.addressMode[0] = cudaAddressModeClamp; - gT_volumeTexture.addressMode[1] = cudaAddressModeClamp; + gT_volumeTexture.addressMode[0] = cudaAddressModeBorder; + gT_volumeTexture.addressMode[1] = cudaAddressModeBorder; gT_volumeTexture.filterMode = cudaFilterModeLinear; gT_volumeTexture.normalized = false; @@ -153,8 +153,8 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned float fVal = 0.0f; // project detector on slice - float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f; - float fS = startSlice + 1.5f; + float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 0.5f; + float fS = startSlice + 0.5f; int endSlice = startSlice + g_blockSlices; if (endSlice > dims.iVolWidth) endSlice = dims.iVolWidth; @@ -189,7 +189,7 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned } - D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; + D_projData[angle*projPitch+detector] += fVal * fDistCorr; } // projection for angles that are roughly vertical @@ -255,8 +255,8 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i fDistCorr *= outputScale; float fVal = 0.0f; - float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f; - float fS = startSlice+1.5f; + float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f; + float fS = startSlice+0.5f; int endSlice = startSlice + g_blockSlices; if (endSlice > dims.iVolHeight) endSlice = dims.iVolHeight; @@ -290,7 +290,7 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i } - D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; + D_projData[angle*projPitch+detector] += fVal * fDistCorr; } // projection for angles that are roughly horizontal @@ -331,8 +331,8 @@ __global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, u float fVal = 0.0f; // project detector on slice - float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f; - float fS = startSlice + 1.5f; + float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 0.5f; + float fS = startSlice + 0.5f; int endSlice = startSlice + g_blockSlices; if (endSlice > dims.iVolWidth) endSlice = dims.iVolWidth; @@ -367,7 +367,7 @@ __global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, u } - D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; + D_projData[angle*projPitch+detector] += fVal * fDistCorr; } @@ -408,8 +408,8 @@ __global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, uns fDistCorr *= outputScale; float fVal = 0.0f; - float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f; - float fS = startSlice+1.5f; + float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f; + float fS = startSlice+0.5f; int endSlice = startSlice + g_blockSlices; if (endSlice > dims.iVolHeight) endSlice = dims.iVolHeight; @@ -443,7 +443,7 @@ __global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, uns } - D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; + D_projData[angle*projPitch+detector] += fVal * fDistCorr; } @@ -457,7 +457,7 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch, assert(dims.iProjAngles <= g_MaxAngles); cudaArray* D_dataArray; - bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight); cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); @@ -579,7 +579,7 @@ bool FP(float* D_volumeData, unsigned int volumePitch, assert(dims.fDetScale > 0.9999f); cudaArray* D_dataArray; - bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight); cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); @@ -682,10 +682,10 @@ int main() dims.iRaysPerDet = 1; unsigned int volumePitch, projPitch; - allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); printf("pitch: %u\n", volumePitch); - allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch); printf("pitch: %u\n", projPitch); unsigned int y, x; @@ -715,7 +715,7 @@ int main() s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x]; printf("cpu norm: %f\n", s); - //zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + //zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); printf("gpu norm: %f\n", s); diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu index a40176d..7f499ce 100644 --- a/cuda/2d/sart.cu +++ b/cuda/2d/sart.cu @@ -39,14 +39,13 @@ $Id$ namespace astraCUDA { - +// FIXME: Remove these functions. (Outdated) __global__ void devMUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; - // Copy result down and left one pixel. - pfOut[x + pitch] = pfOut[x + 1] * pfIn[x + 1]; + pfOut[x] *= pfIn[x]; } void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width) @@ -106,18 +105,15 @@ void SART::reset() bool SART::init() { if (useVolumeMask) { - allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); - zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight); } - // HACK: D_projData consists of two lines. The first is used padded, - // the second unpadded. This is to satisfy the alignment requirements - // of resp. FP and BP_SART. - allocateVolume(D_projData, dims.iProjDets+2, 2, projPitch); - zeroVolume(D_projData, projPitch, dims.iProjDets+2, 1); + allocateVolume(D_projData, dims.iProjDets, 1, projPitch); + zeroVolume(D_projData, projPitch, dims.iProjDets, 1); - allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch); - zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); + allocateVolume(D_lineWeight, dims.iProjDets, dims.iProjAngles, linePitch); + zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); // We can't precompute lineWeights when using a mask if (!useVolumeMask) @@ -142,23 +138,23 @@ bool SART::setProjectionOrder(int* _projectionOrder, int _projectionCount) bool SART::precomputeWeights() { - zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); + zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); if (useVolumeMask) { callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f); } else { // Allocate tmpData temporarily - allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); - zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight); - processVol(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight); callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f); cudaFree(D_tmpData); D_tmpData = 0; } - processVol(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); + processVol(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); return true; } @@ -181,12 +177,12 @@ bool SART::iterate(unsigned int iterations) } // copy one line of sinogram to projection data - cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData + angle*sinoPitch, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), 1, cudaMemcpyDeviceToDevice); + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData + angle*sinoPitch, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), 1, cudaMemcpyDeviceToDevice); // do FP, subtracting projection from sinogram if (useVolumeMask) { - cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); - processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); + processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); callFP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle, -1.0f); } else { callFP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle, -1.0f); @@ -197,17 +193,17 @@ bool SART::iterate(unsigned int iterations) if (useVolumeMask) { // BP, mask, and add back // TODO: Try putting the masking directly in the BP - zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight); callBP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle); - processVol(D_volumeData, D_maskData, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, D_maskData, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight); } else { callBP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle); } if (useMinConstraint) - processVol(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); if (useMaxConstraint) - processVol(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); iteration++; @@ -220,16 +216,16 @@ float SART::computeDiffNorm() { unsigned int pPitch; float *D_p; - allocateVolume(D_p, dims.iProjDets+2, dims.iProjAngles, pPitch); - zeroVolume(D_p, pPitch, dims.iProjDets+2, dims.iProjAngles); + allocateVolume(D_p, dims.iProjDets, dims.iProjAngles, pPitch); + zeroVolume(D_p, pPitch, dims.iProjDets, dims.iProjAngles); // copy sinogram to D_p - cudaMemcpy2D(D_p, sizeof(float)*pPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + cudaMemcpy2D(D_p, sizeof(float)*pPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice); // do FP, subtracting projection from sinogram if (useVolumeMask) { - cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); - processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); + processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); } else { callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); @@ -237,7 +233,7 @@ float SART::computeDiffNorm() // compute norm of D_p - float s = dotProduct2D(D_p, pPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + float s = dotProduct2D(D_p, pPitch, dims.iProjDets, dims.iProjAngles); cudaFree(D_p); @@ -267,11 +263,11 @@ bool SART::callBP_SART(float* D_volumeData, unsigned int volumePitch, { if (angles) { assert(!fanProjs); - return BP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch, + return BP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle, dims, angles, TOffsets); } else { assert(fanProjs); - return FanBP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch, + return FanBP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle, dims, fanProjs); } diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu index 31954e4..eb65962 100644 --- a/cuda/2d/sirt.cu +++ b/cuda/2d/sirt.cu @@ -88,17 +88,17 @@ void SIRT::reset() bool SIRT::init() { - allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch); - zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); + allocateVolume(D_pixelWeight, dims.iVolWidth, dims.iVolHeight, pixelPitch); + zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); - allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); - zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight); - allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); - zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch); + zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); - allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch); - zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); + allocateVolume(D_lineWeight, dims.iProjDets, dims.iProjAngles, linePitch); + zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); // We can't precompute lineWeights and pixelWeights when using a mask if (!useVolumeMask && !useSinogramMask) @@ -110,33 +110,33 @@ bool SIRT::init() bool SIRT::precomputeWeights() { - zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); + zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); if (useVolumeMask) { callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f); } else { - processVol(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight); callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f); } - processVol(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); + processVol(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); if (useSinogramMask) { // scale line weights with sinogram mask to zero out masked sinogram pixels - processVol(D_lineWeight, D_smaskData, linePitch, dims.iProjDets, dims.iProjAngles); + processVol(D_lineWeight, D_smaskData, linePitch, dims.iProjDets, dims.iProjAngles); } - zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); + zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); if (useSinogramMask) { callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch); } else { - processVol(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles); + processVol(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles); callBP(D_pixelWeight, pixelPitch, D_projData, projPitch); } - processVol(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); if (useVolumeMask) { // scale pixel weights with mask to zero out masked pixels - processVol(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight); } return true; @@ -160,7 +160,7 @@ bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskD freeMinMaxMasks = true; bool ok = true; if (pfMinMaskData) { - allocateVolume(D_minMaskData, dims.iVolWidth+2, dims.iVolHeight+2, minMaskPitch); + allocateVolume(D_minMaskData, dims.iVolWidth, dims.iVolHeight, minMaskPitch); ok = copyVolumeToDevice(pfMinMaskData, iPitch, dims.iVolWidth, dims.iVolHeight, D_minMaskData, minMaskPitch); @@ -169,7 +169,7 @@ bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskD return false; if (pfMaxMaskData) { - allocateVolume(D_maxMaskData, dims.iVolWidth+2, dims.iVolHeight+2, maxMaskPitch); + allocateVolume(D_maxMaskData, dims.iVolWidth, dims.iVolHeight, maxMaskPitch); ok = copyVolumeToDevice(pfMaxMaskData, iPitch, dims.iVolWidth, dims.iVolHeight, D_maxMaskData, maxMaskPitch); @@ -191,33 +191,33 @@ bool SIRT::iterate(unsigned int iterations) for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { // copy sinogram to projection data - cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice); // do FP, subtracting projection from sinogram if (useVolumeMask) { - cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); - processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); + processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); } else { callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); } - processVol(D_projData, D_lineWeight, projPitch, dims.iProjDets, dims.iProjAngles); + processVol(D_projData, D_lineWeight, projPitch, dims.iProjDets, dims.iProjAngles); - zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight); callBP(D_tmpData, tmpPitch, D_projData, projPitch); - processVol(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight); if (useMinConstraint) - processVol(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); if (useMaxConstraint) - processVol(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); if (D_minMaskData) - processVol(D_volumeData, D_minMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, D_minMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight); if (D_maxMaskData) - processVol(D_volumeData, D_maxMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, D_maxMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight); } return true; @@ -226,12 +226,12 @@ bool SIRT::iterate(unsigned int iterations) float SIRT::computeDiffNorm() { // copy sinogram to projection data - cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice); // do FP, subtracting projection from sinogram if (useVolumeMask) { - cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); - processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); + processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); } else { callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); @@ -240,7 +240,7 @@ float SIRT::computeDiffNorm() // compute norm of D_projData - float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); return sqrt(s); } @@ -300,12 +300,12 @@ int main() dims.iRaysPerDet = 1; unsigned int volumePitch, sinoPitch; - allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); - zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); + zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight); printf("pitch: %u\n", volumePitch); - allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); - zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch); + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles); printf("pitch: %u\n", sinoPitch); unsigned int y, x; diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu index 06f6714..8bb2f2f 100644 --- a/cuda/2d/util.cu +++ b/cuda/2d/util.cu @@ -36,11 +36,8 @@ bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch, unsigned int width, unsigned int height, float* outD_data, unsigned int out_pitch) { - // TODO: a full memset isn't necessary. Only the edges. cudaError_t err; - err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, sizeof(float)*(width+2), height+2); - ASTRA_CUDA_ASSERT(err); - err = cudaMemcpy2D(outD_data + out_pitch + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); + err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); ASTRA_CUDA_ASSERT(err); assert(err == cudaSuccess); return true; @@ -50,7 +47,7 @@ bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch, unsigned int width, unsigned int height, float* inD_data, unsigned int in_pitch) { - cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + (in_pitch + 1), sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); + cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); ASTRA_CUDA_ASSERT(err); return true; } @@ -60,7 +57,7 @@ bool copySinogramFromDevice(float* out_data, unsigned int out_pitch, unsigned int width, unsigned int height, float* inD_data, unsigned int in_pitch) { - cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + 1, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); + cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); ASTRA_CUDA_ASSERT(err); return true; } @@ -69,11 +66,8 @@ bool copySinogramToDevice(const float* in_data, unsigned int in_pitch, unsigned int width, unsigned int height, float* outD_data, unsigned int out_pitch) { - // TODO: a full memset isn't necessary. Only the edges. cudaError_t err; - err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, (width+2)*sizeof(float), height); - ASTRA_CUDA_ASSERT(err); - err = cudaMemcpy2D(outD_data + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); + err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); ASTRA_CUDA_ASSERT(err); return true; } @@ -132,8 +126,7 @@ __global__ void reduce1D(float *g_idata, float *g_odata, unsigned int n) __global__ void reduce2D(float *g_idata, float *g_odata, unsigned int pitch, - unsigned int nx, unsigned int ny, - unsigned int padX, unsigned int padY) + unsigned int nx, unsigned int ny) { extern __shared__ float sdata[]; const unsigned int tidx = threadIdx.x; @@ -145,11 +138,10 @@ __global__ void reduce2D(float *g_idata, float *g_odata, sdata[tid] = 0; - if (x >= padX && x < padX + nx) { + if (x < nx) { - while (y < padY + ny) { - if (y >= padY) - sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]); + while (y < ny) { + sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]); y += 16 * gridDim.y; } @@ -180,11 +172,10 @@ __global__ void reduce2D(float *g_idata, float *g_odata, } float dotProduct2D(float* D_data, unsigned int pitch, - unsigned int width, unsigned int height, - unsigned int padX, unsigned int padY) + unsigned int width, unsigned int height) { - unsigned int bx = ((width+padX) + 15) / 16; - unsigned int by = ((height+padY) + 127) / 128; + unsigned int bx = (width + 15) / 16; + unsigned int by = (height + 127) / 128; unsigned int shared_mem2 = sizeof(float) * 16 * 16; dim3 dimBlock2(16, 16); @@ -192,26 +183,27 @@ float dotProduct2D(float* D_data, unsigned int pitch, float* D_buf; cudaMalloc(&D_buf, sizeof(float) * (bx * by + 1) ); + float* D_res = D_buf + (bx*by); // Step 1: reduce 2D from image to a single vector, taking sum of squares - reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height, padX, padY); + reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height); cudaTextForceKernelsCompletion(); // Step 2: reduce 1D: add up elements in vector if (bx * by > 512) - reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_buf+(bx*by), bx*by); + reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_res, bx*by); else if (bx * by > 128) - reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_buf+(bx*by), bx*by); + reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_res, bx*by); else if (bx * by > 32) - reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_buf+(bx*by), bx*by); + reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_res, bx*by); else if (bx * by > 8) - reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_buf+(bx*by), bx*by); + reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_res, bx*by); else - reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_buf+(bx*by), bx*by); + reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_res, bx*by); float x; - cudaMemcpy(&x, D_buf+(bx*by), 4, cudaMemcpyDeviceToHost); + cudaMemcpy(&x, D_res, 4, cudaMemcpyDeviceToHost); cudaTextForceKernelsCompletion(); diff --git a/cuda/2d/util.h b/cuda/2d/util.h index d31e2eb..ed4a45c 100644 --- a/cuda/2d/util.h +++ b/cuda/2d/util.h @@ -82,8 +82,8 @@ void reportCudaError(cudaError_t err); float dotProduct2D(float* D_data, unsigned int pitch, - unsigned int width, unsigned int height, - unsigned int padX, unsigned int padY); + unsigned int width, unsigned int height); + } -- cgit v1.2.3