diff options
Diffstat (limited to 'cuda/2d')
-rw-r--r-- | cuda/2d/algo.cu | 356 | ||||
-rw-r--r-- | cuda/2d/algo.h | 155 | ||||
-rw-r--r-- | cuda/2d/arith.cu | 893 | ||||
-rw-r--r-- | cuda/2d/arith.h | 101 | ||||
-rw-r--r-- | cuda/2d/astra.cu | 824 | ||||
-rw-r--r-- | cuda/2d/astra.h | 205 | ||||
-rw-r--r-- | cuda/2d/cgls.cu | 304 | ||||
-rw-r--r-- | cuda/2d/cgls.h | 92 | ||||
-rw-r--r-- | cuda/2d/darthelper.cu | 358 | ||||
-rw-r--r-- | cuda/2d/darthelper.h | 44 | ||||
-rw-r--r-- | cuda/2d/dataop.cu | 130 | ||||
-rw-r--r-- | cuda/2d/dataop.h | 47 | ||||
-rw-r--r-- | cuda/2d/dims.h | 68 | ||||
-rw-r--r-- | cuda/2d/em.cu | 262 | ||||
-rw-r--r-- | cuda/2d/em.h | 77 | ||||
-rw-r--r-- | cuda/2d/fan_bp.cu | 374 | ||||
-rw-r--r-- | cuda/2d/fan_bp.h | 45 | ||||
-rw-r--r-- | cuda/2d/fan_fp.cu | 370 | ||||
-rw-r--r-- | cuda/2d/fan_fp.h | 41 | ||||
-rw-r--r-- | cuda/2d/fbp_filters.h | 58 | ||||
-rw-r--r-- | cuda/2d/fft.cu | 873 | ||||
-rw-r--r-- | cuda/2d/fft.h | 69 | ||||
-rw-r--r-- | cuda/2d/par_bp.cu | 357 | ||||
-rw-r--r-- | cuda/2d/par_bp.h | 48 | ||||
-rw-r--r-- | cuda/2d/par_fp.cu | 704 | ||||
-rw-r--r-- | cuda/2d/par_fp.h | 41 | ||||
-rw-r--r-- | cuda/2d/sart.cu | 283 | ||||
-rw-r--r-- | cuda/2d/sart.h | 85 | ||||
-rw-r--r-- | cuda/2d/sirt.cu | 342 | ||||
-rw-r--r-- | cuda/2d/sirt.h | 90 | ||||
-rw-r--r-- | cuda/2d/util.cu | 244 | ||||
-rw-r--r-- | cuda/2d/util.h | 90 |
32 files changed, 8030 insertions, 0 deletions
diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu new file mode 100644 index 0000000..5ae5d08 --- /dev/null +++ b/cuda/2d/algo.cu @@ -0,0 +1,356 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cassert> + +#include "algo.h" +#include "par_fp.h" +#include "fan_fp.h" +#include "par_bp.h" +#include "fan_bp.h" +#include "util.h" +#include "arith.h" + +namespace astraCUDA { + +ReconAlgo::ReconAlgo() +{ + angles = 0; + TOffsets = 0; + fanProjs = 0; + shouldAbort = false; + + useVolumeMask = false; + useSinogramMask = false; + D_maskData = 0; + D_smaskData = 0; + + D_sinoData = 0; + D_volumeData = 0; + + useMinConstraint = false; + useMaxConstraint = false; + + freeGPUMemory = false; +} + +ReconAlgo::~ReconAlgo() +{ + reset(); +} + +void ReconAlgo::reset() +{ + delete[] angles; + delete[] TOffsets; + delete[] fanProjs; + + if (freeGPUMemory) { + cudaFree(D_maskData); + cudaFree(D_smaskData); + cudaFree(D_sinoData); + cudaFree(D_volumeData); + } + + angles = 0; + TOffsets = 0; + fanProjs = 0; + shouldAbort = false; + + useVolumeMask = false; + useSinogramMask = false; + + D_maskData = 0; + D_smaskData = 0; + + D_sinoData = 0; + D_volumeData = 0; + + useMinConstraint = false; + useMaxConstraint = false; + + freeGPUMemory = false; +} + +bool ReconAlgo::setGPUIndex(int iGPUIndex) +{ + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + return true; +} + +bool ReconAlgo::enableVolumeMask() +{ + useVolumeMask = true; + return true; +} + +bool ReconAlgo::enableSinogramMask() +{ + useSinogramMask = true; + return true; +} + + +bool ReconAlgo::setGeometry(const SDimensions& _dims, const float* _angles) +{ + dims = _dims; + + angles = new float[dims.iProjAngles]; + + memcpy(angles, _angles, sizeof(angles[0]) * dims.iProjAngles); + + delete[] fanProjs; + fanProjs = 0; + + return true; +} + +bool ReconAlgo::setFanGeometry(const SDimensions& _dims, + const SFanProjection* _projs) +{ + dims = _dims; + fanProjs = new SFanProjection[dims.iProjAngles]; + + memcpy(fanProjs, _projs, sizeof(fanProjs[0]) * dims.iProjAngles); + + delete[] angles; + angles = 0; + + return true; +} + + +bool ReconAlgo::setTOffsets(const float* _TOffsets) +{ + // TODO: determine if they're all zero? + TOffsets = new float[dims.iProjAngles]; + memcpy(TOffsets, _TOffsets, sizeof(angles[0]) * dims.iProjAngles); + + return true; +} + + + +bool ReconAlgo::setVolumeMask(float* _D_maskData, unsigned int _maskPitch) +{ + assert(useVolumeMask); + + D_maskData = _D_maskData; + maskPitch = _maskPitch; + + return true; +} + +bool ReconAlgo::setSinogramMask(float* _D_smaskData, unsigned int _smaskPitch) +{ + assert(useSinogramMask); + + D_smaskData = _D_smaskData; + smaskPitch = _smaskPitch; + + return true; +} + +bool ReconAlgo::setBuffers(float* _D_volumeData, unsigned int _volumePitch, + float* _D_projData, unsigned int _projPitch) +{ + D_volumeData = _D_volumeData; + volumePitch = _volumePitch; + D_sinoData = _D_projData; + sinoPitch = _projPitch; + + return true; +} + +bool ReconAlgo::setMinConstraint(float fMin) +{ + fMinConstraint = fMin; + useMinConstraint = true; + return true; +} + +bool ReconAlgo::setMaxConstraint(float fMax) +{ + fMaxConstraint = fMax; + useMaxConstraint = true; + return true; +} + + + +bool ReconAlgo::allocateBuffers() +{ + bool ok; + ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + if (!ok) + return false; + + ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + D_volumeData = 0; + return false; + } + + if (useVolumeMask) { + ok = allocateVolume(D_maskData, dims.iVolWidth+2, dims.iVolHeight+2, maskPitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + D_volumeData = 0; + D_sinoData = 0; + return false; + } + } + + if (useSinogramMask) { + ok = allocateVolume(D_smaskData, dims.iProjDets+2, dims.iProjAngles, smaskPitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + cudaFree(D_maskData); + D_volumeData = 0; + D_sinoData = 0; + D_maskData = 0; + return false; + } + } + + freeGPUMemory = true; + return true; +} + +bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale, + const float* pfReconstruction, unsigned int iReconstructionPitch, + const float* pfVolMask, unsigned int iVolMaskPitch, + const float* pfSinoMask, unsigned int iSinoMaskPitch) +{ + if (!pfSinogram) + return false; + if (!pfReconstruction) + return false; + + bool ok = copySinogramToDevice(pfSinogram, iSinogramPitch, + dims.iProjDets, + dims.iProjAngles, + D_sinoData, sinoPitch); + if (!ok) + return false; + + // rescale sinogram to adjust for pixel size + processVol<opMul,SINO>(D_sinoData, fSinogramScale, + //1.0f/(fPixelSize*fPixelSize), + sinoPitch, + dims.iProjDets, dims.iProjAngles); + + ok = copyVolumeToDevice(pfReconstruction, iReconstructionPitch, + dims.iVolWidth, dims.iVolHeight, + D_volumeData, volumePitch); + if (!ok) + return false; + + + + if (useVolumeMask) { + if (!pfVolMask) + return false; + + ok = copyVolumeToDevice(pfVolMask, iVolMaskPitch, + dims.iVolWidth, dims.iVolHeight, + D_maskData, maskPitch); + if (!ok) + return false; + } + + if (useSinogramMask) { + if (!pfSinoMask) + return false; + + ok = copySinogramToDevice(pfSinoMask, iSinoMaskPitch, + dims.iProjDets, dims.iProjAngles, + D_smaskData, smaskPitch); + if (!ok) + return false; + } + + return true; +} + +bool ReconAlgo::getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const +{ + bool ok = copyVolumeFromDevice(pfReconstruction, iReconstructionPitch, + dims.iVolWidth, + dims.iVolHeight, + D_volumeData, volumePitch); + if (!ok) + return false; + + return true; +} + + +bool ReconAlgo::callFP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + float outputScale) +{ + if (angles) { + assert(!fanProjs); + return FP(D_volumeData, volumePitch, D_projData, projPitch, + dims, angles, TOffsets, outputScale); + } else { + assert(fanProjs); + return FanFP(D_volumeData, volumePitch, D_projData, projPitch, + dims, fanProjs, outputScale); + } +} + +bool ReconAlgo::callBP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch) +{ + if (angles) { + assert(!fanProjs); + return BP(D_volumeData, volumePitch, D_projData, projPitch, + dims, angles, TOffsets); + } else { + assert(fanProjs); + return FanBP(D_volumeData, volumePitch, D_projData, projPitch, + dims, fanProjs); + } + +} + + + +} diff --git a/cuda/2d/algo.h b/cuda/2d/algo.h new file mode 100644 index 0000000..96195a3 --- /dev/null +++ b/cuda/2d/algo.h @@ -0,0 +1,155 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ALGO_H +#define _CUDA_ALGO_H + +#include "util.h" + +namespace astraCUDA { + +class _AstraExport ReconAlgo { +public: + ReconAlgo(); + virtual ~ReconAlgo(); + + bool setGPUIndex(int iGPUIndex); + + bool setGeometry(const SDimensions& dims, const float* angles); + bool setFanGeometry(const SDimensions& dims, const SFanProjection* projs); + + // setTOffsets should (optionally) be called after setGeometry + bool setTOffsets(const float* TOffsets); + + void signalAbort() { shouldAbort = true; } + + virtual bool enableVolumeMask(); + virtual bool enableSinogramMask(); + + // init should be called after setting all geometry + virtual bool init() = 0; + + // setVolumeMask should be called after init and before iterate, + // but only if enableVolumeMask was called before init. + // It may be called again after iterate. + bool setVolumeMask(float* D_maskData, unsigned int maskPitch); + + // setSinogramMask should be called after init and before iterate, + // but only if enableSinogramMask was called before init. + // It may be called again after iterate. + bool setSinogramMask(float* D_smaskData, unsigned int smaskPitch); + + + // setBuffers should be called after init and before iterate. + // It may be called again after iterate. + virtual bool setBuffers(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch); + + + // instead of calling setBuffers, you can also call allocateBuffers + // to let ReconAlgo manage its own GPU memory + virtual bool allocateBuffers(); + virtual bool copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale, + const float* pfReconstruction, unsigned int iReconstructionPitch, + const float* pfVolMask, unsigned int iVolMaskPitch, + const float* pfSinoMask, unsigned int iSinoMaskPitch); + + + + // set Min/Max constraints. They may be called at any time, and will affect + // any iterate() calls afterwards. + virtual bool setMinConstraint(float fMin); + virtual bool setMaxConstraint(float fMax); + + + // iterate should be called after init and setBuffers. + // It may be called multiple times. + virtual bool iterate(unsigned int iterations) = 0; + + // Compute the norm of the difference of the FP of the current + // reconstruction and the sinogram. (This performs one FP.) + // It can be called after iterate. + virtual float computeDiffNorm() = 0; + // TODO: computeDiffNorm shouldn't be virtual, but for it to be + // implemented in ReconAlgo, it needs a way to get a suitable + // temporary sinogram buffer. + + bool getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const; + + + +protected: + void reset(); + + bool callFP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + float outputScale); + bool callBP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch); + + + SDimensions dims; + float* angles; + float* TOffsets; + SFanProjection* fanProjs; + + volatile bool shouldAbort; + + bool freeGPUMemory; + + // Input/output + float* D_sinoData; + unsigned int sinoPitch; + + float* D_volumeData; + unsigned int volumePitch; + + // Masks + bool useVolumeMask; + bool useSinogramMask; + + float* D_maskData; + unsigned int maskPitch; + float* D_smaskData; + unsigned int smaskPitch; + + // Min/max + bool useMinConstraint; + bool useMaxConstraint; + float fMinConstraint; + float fMaxConstraint; + + +}; + + +} + +#endif + diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu new file mode 100644 index 0000000..1ee02ca --- /dev/null +++ b/cuda/2d/arith.cu @@ -0,0 +1,893 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "util.h" +#include "arith.h" +#include <cassert> + +namespace astraCUDA { + + +struct opAddScaled { + __device__ void operator()(float& out, const float in, const float inp) { + out += in * inp; + } +}; +struct opScaleAndAdd { + __device__ void operator()(float& out, const float in, const float inp) { + out = in + out * inp; + } +}; +struct opAddMulScaled { + __device__ void operator()(float& out, const float in1, const float in2, const float inp) { + out += in1 * in2 * inp; + } +}; +struct opAddMul { + __device__ void operator()(float& out, const float in1, const float in2) { + out += in1 * in2; + } +}; +struct opAdd { + __device__ void operator()(float& out, const float in) { + out += in; + } +}; +struct opAdd2 { + __device__ void operator()(float& out, const float in1, const float in2) { + out += in1 + in2; + } +}; +struct opMul { + __device__ void operator()(float& out, const float in) { + out *= in; + } +}; +struct opMul2 { + __device__ void operator()(float& out, const float in1, const float in2) { + out *= in1 * in2; + } +}; +struct opDividedBy { + __device__ void operator()(float& out, const float in) { + if (out > 0.000001f) // out is assumed to be positive + out = in / out; + else + out = 0.0f; + } +}; +struct opInvert { + __device__ void operator()(float& out) { + if (out > 0.000001f) // out is assumed to be positive + out = 1 / out; + else + out = 0.0f; + } +}; +struct opSet { + __device__ void operator()(float& out, const float inp) { + out = inp; + } +}; +struct opClampMin { + __device__ void operator()(float& out, const float inp) { + if (out < inp) + out = inp; + } +}; +struct opClampMax { + __device__ void operator()(float& out, const float inp) { + if (out > inp) + out = inp; + } +}; +struct opClampMinMask { + __device__ void operator()(float& out, const float in) { + if (out < in) + out = in; + } +}; +struct opClampMaxMask { + __device__ void operator()(float& out, const float in) { + if (out > in) + out = in; + } +}; +struct opSetMaskedValues { + __device__ void operator()(float& out, const float in, const float inp) { + if (!in) + out = inp; + } +}; +struct opSegmentAndMask { + __device__ void operator()(float& out1, float& out2, const float inp1, const float inp2) { + if (out1 >= inp1) { + out1 = inp2; + out2 = 0.0f; + } + + } + +}; +struct opMulMask { + __device__ void operator()(float& out, const float mask, const float in) { + if (mask > 0.0f) { + out *= in; + } + } +}; + + + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], fParam); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut1[off], pfOut2[off], fParam1, fParam2); + off += pitch; + y++; + } +} + + + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn[off], fParam); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn1[off], pfIn2[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn1[off], pfIn2[off], fParam); + off += pitch; + y++; + } +} + + + + + + + + + + + + + + + + +template<typename op, VolType t> +void processVolCopy(float* out, unsigned int width, unsigned int height) +{ + float* D_out; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + + processVol<op, t>(D_out, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); +} + +template<typename op, VolType t> +void processVolCopy(float* out, float param, unsigned int width, unsigned int height) +{ + float* D_out; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + + processVol<op, t>(D_out, param, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); +} + +template<typename op, VolType t> +void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height) +{ + float* D_out1; + float* D_out2; + + unsigned int pitch; + allocateVolume(D_out1, width+2, height+2, pitch); + copyVolumeToDevice(out1, width, width, height, D_out1, pitch); + allocateVolume(D_out2, width+2, height+2, pitch); + copyVolumeToDevice(out2, width, width, height, D_out2, pitch); + + processVol<op, t>(D_out1, D_out2, param1, param2, pitch, width, height); + + copyVolumeFromDevice(out1, width, width, height, D_out1, pitch); + copyVolumeFromDevice(out2, width, width, height, D_out2, pitch); + + cudaFree(D_out1); + cudaFree(D_out2); +} + + +template<typename op, VolType t> +void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height) +{ + float* D_out; + float* D_in; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolume(D_in, width+2, height+2, pitch); + copyVolumeToDevice(in, width, width, height, D_in, pitch); + + processVol<op, t>(D_out, D_in, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); + cudaFree(D_in); +} + +template<typename op, VolType t> +void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height) +{ + float* D_out; + float* D_in; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolume(D_in, width+2, height+2, pitch); + copyVolumeToDevice(in, width, width, height, D_in, pitch); + + processVol<op, t>(D_out, D_in, param, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); + cudaFree(D_in); +} + +template<typename op, VolType t> +void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height) +{ + float* D_out; + float* D_in1; + float* D_in2; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolume(D_in1, width+2, height+2, pitch); + copyVolumeToDevice(in1, width, width, height, D_in1, pitch); + allocateVolume(D_in2, width+2, height+2, pitch); + copyVolumeToDevice(in2, width, width, height, D_in2, pitch); + + processVol<op, t>(D_out, D_in1, D_in2, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); + cudaFree(D_in1); + cudaFree(D_in2); +} + +template<typename op, VolType t> +void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height) +{ + float* D_out; + float* D_in1; + float* D_in2; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolume(D_in1, width+2, height+2, pitch); + copyVolumeToDevice(in1, width, width, height, D_in1, pitch); + allocateVolume(D_in2, width+2, height+2, pitch); + copyVolumeToDevice(in2, width, width, height, D_in2, pitch); + + processVol<op, t>(D_out, D_in1, D_in2, param, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); + cudaFree(D_in1); + cudaFree(D_in2); +} + + + + + + + + + +template<typename op, VolType t> +void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+511)/512); + + devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devFFtoDD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + + +template<typename op, VolType t> +void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + + + + + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut1 = (float*)out1.ptr; + float *pfOut2 = (float*)out2.ptr; + unsigned int step = out1.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut1 += step; + pfOut2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut1 = (float*)out1.ptr; + float *pfOut2 = (float*)out2.ptr; + unsigned int step = out1.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut1 += step; + pfOut2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + + + + + + +#define INST_DFtoD(name) \ + template void processVolCopy<name, VOL>(float* out, const float* in, float param, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, const float* in, float param, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); + +#define INST_DtoD(name) \ + template void processVolCopy<name, VOL>(float* out, const float* in, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, const float* in, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); + +#define INST_DDtoD(name) \ + template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); + +#define INST_DDFtoD(name) \ + template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); + + +#define INST_toD(name) \ + template void processVolCopy<name, VOL>(float* out, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); + +#define INST_FtoD(name) \ + template void processVolCopy<name, VOL>(float* out, float param, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, float param, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); + +#define INST_FFtoDD(name) \ + template void processVolCopy<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); + + + +INST_DFtoD(opAddScaled) +INST_DFtoD(opScaleAndAdd) +INST_DDFtoD(opAddMulScaled) +INST_DDtoD(opAddMul) +INST_DDtoD(opMul2) +INST_DDtoD(opAdd2) +INST_DtoD(opMul) +INST_DDtoD(opMulMask) +INST_DtoD(opAdd) +INST_DtoD(opDividedBy) +INST_toD(opInvert) +INST_FtoD(opSet) +INST_FtoD(opMul) +INST_DFtoD(opMulMask) +INST_FtoD(opAdd) +INST_FtoD(opClampMin) +INST_FtoD(opClampMax) +INST_DtoD(opClampMinMask) +INST_DtoD(opClampMaxMask) + +// PDART-specific: +INST_DFtoD(opSetMaskedValues) +INST_FFtoDD(opSegmentAndMask) + +} diff --git a/cuda/2d/arith.h b/cuda/2d/arith.h new file mode 100644 index 0000000..c8c7b41 --- /dev/null +++ b/cuda/2d/arith.h @@ -0,0 +1,101 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ARITH_H +#define _CUDA_ARITH_H + +#include <cuda.h> + +namespace astraCUDA { + + +struct opAddScaled; +struct opScaleAndAdd; +struct opAddMulScaled; +struct opAddMul; +struct opAdd; +struct opAdd2; +struct opMul; +struct opMul2; +struct opDividedBy; +struct opInvert; +struct opSet; +struct opClampMin; +struct opClampMax; +struct opClampMinMask; +struct opClampMaxMask; +struct opSegmentAndMask; +struct opSetMaskedValues; + +struct opMulMask; + + + +enum VolType { + SINO = 0, + VOL = 1 +}; + + +template<typename op, VolType t> void processVolCopy(float* out, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVolCopy(float* out, float param, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height); + +template<typename op, VolType t> void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(float* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(float* out, const float* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); + +template<typename op> void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); + +template<typename op> void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); + + + +} + +#endif diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu new file mode 100644 index 0000000..71ed025 --- /dev/null +++ b/cuda/2d/astra.cu @@ -0,0 +1,824 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "util.h" +#include "par_fp.h" +#include "fan_fp.h" +#include "par_bp.h" +#include "arith.h" +#include "astra.h" + +#include "fft.h" + +#include <fstream> +#include <cuda.h> + +#include "../../include/astra/Logger.h" + +using namespace astraCUDA; +using namespace std; + + +namespace astra { + +enum CUDAProjectionType { + PROJ_PARALLEL, + PROJ_FAN +}; + + +class AstraFBP_internal { +public: + SDimensions dims; + float* angles; + float* TOffsets; + + float fPixelSize; + + bool initialized; + bool setStartReconstruction; + + float* D_sinoData; + unsigned int sinoPitch; + + float* D_volumeData; + unsigned int volumePitch; + + cufftComplex * m_pDevFilter; +}; + +AstraFBP::AstraFBP() +{ + pData = new AstraFBP_internal(); + + pData->angles = 0; + pData->D_sinoData = 0; + pData->D_volumeData = 0; + + pData->dims.iVolWidth = 0; + pData->dims.iProjAngles = 0; + pData->dims.fDetScale = 1.0f; + pData->dims.iRaysPerDet = 1; + pData->dims.iRaysPerPixelDim = 1; + + pData->initialized = false; + pData->setStartReconstruction = false; + + pData->m_pDevFilter = NULL; +} + +AstraFBP::~AstraFBP() +{ + delete[] pData->angles; + pData->angles = 0; + + delete[] pData->TOffsets; + pData->TOffsets = 0; + + cudaFree(pData->D_sinoData); + pData->D_sinoData = 0; + + cudaFree(pData->D_volumeData); + pData->D_volumeData = 0; + + if(pData->m_pDevFilter != NULL) + { + freeComplexOnDevice(pData->m_pDevFilter); + pData->m_pDevFilter = NULL; + } + + delete pData; + pData = 0; +} + +bool AstraFBP::setReconstructionGeometry(unsigned int iVolWidth, + unsigned int iVolHeight, + float fPixelSize) +{ + if (pData->initialized) + return false; + + pData->dims.iVolWidth = iVolWidth; + pData->dims.iVolHeight = iVolHeight; + + pData->fPixelSize = fPixelSize; + + return (iVolWidth > 0 && iVolHeight > 0 && fPixelSize > 0.0f); +} + +bool AstraFBP::setProjectionGeometry(unsigned int iProjAngles, + unsigned int iProjDets, + const float* pfAngles, + float fDetSize) +{ + if (pData->initialized) + return false; + + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjDets = iProjDets; + pData->dims.fDetScale = fDetSize / pData->fPixelSize; + + if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0) + return false; + + pData->angles = new float[iProjAngles]; + memcpy(pData->angles, pfAngles, iProjAngles * sizeof(pfAngles[0])); + + return true; +} + +bool AstraFBP::setPixelSuperSampling(unsigned int iPixelSuperSampling) +{ + if (pData->initialized) + return false; + + if (iPixelSuperSampling == 0) + return false; + + pData->dims.iRaysPerPixelDim = iPixelSuperSampling; + + return true; +} + + +bool AstraFBP::setTOffsets(const float* pfTOffsets) +{ + if (pData->initialized) + return false; + + if (pfTOffsets == 0) + return false; + + pData->TOffsets = new float[pData->dims.iProjAngles]; + memcpy(pData->TOffsets, pfTOffsets, pData->dims.iProjAngles * sizeof(pfTOffsets[0])); + + return true; +} + +bool AstraFBP::init(int iGPUIndex) +{ + if (pData->initialized) + { + return false; + } + + if (pData->dims.iProjAngles == 0 || pData->dims.iVolWidth == 0) + { + return false; + } + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + { + return false; + } + + bool ok = allocateVolume(pData->D_volumeData, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2, pData->volumePitch); + if (!ok) + { + return false; + } + + ok = allocateVolume(pData->D_sinoData, pData->dims.iProjDets+2, pData->dims.iProjAngles, pData->sinoPitch); + if (!ok) + { + cudaFree(pData->D_volumeData); + pData->D_volumeData = 0; + return false; + } + + pData->initialized = true; + + return true; +} + +bool AstraFBP::setSinogram(const float* pfSinogram, + unsigned int iSinogramPitch) +{ + if (!pData->initialized) + return false; + if (!pfSinogram) + return false; + + bool ok = copySinogramToDevice(pfSinogram, iSinogramPitch, + pData->dims.iProjDets, + pData->dims.iProjAngles, + pData->D_sinoData, pData->sinoPitch); + if (!ok) + return false; + + // rescale sinogram to adjust for pixel size + processVol<opMul,SINO>(pData->D_sinoData, + 1.0f/(pData->fPixelSize*pData->fPixelSize), + pData->sinoPitch, + pData->dims.iProjDets, pData->dims.iProjAngles); + + pData->setStartReconstruction = false; + + return true; +} + +static int calcNextPowerOfTwo(int _iValue) +{ + int iOutput = 1; + + while(iOutput < _iValue) + { + iOutput *= 2; + } + + return iOutput; +} + +bool AstraFBP::run() +{ + if (!pData->initialized) + { + return false; + } + + zeroVolume(pData->D_volumeData, pData->volumePitch, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2); + + bool ok = false; + + if (pData->m_pDevFilter) { + + int iFFTRealDetCount = calcNextPowerOfTwo(2 * pData->dims.iProjDets); + int iFFTFourDetCount = calcFFTFourSize(iFFTRealDetCount); + + cufftComplex * pDevComplexSinogram = NULL; + + allocateComplexOnDevice(pData->dims.iProjAngles, iFFTFourDetCount, &pDevComplexSinogram); + + runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram); + + applyFilter(pData->dims.iProjAngles, iFFTFourDetCount, pDevComplexSinogram, pData->m_pDevFilter); + + runCudaIFFT(pData->dims.iProjAngles, pDevComplexSinogram, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount); + + freeComplexOnDevice(pDevComplexSinogram); + + } + + ok = BP(pData->D_volumeData, pData->volumePitch, pData->D_sinoData, pData->sinoPitch, pData->dims, pData->angles, pData->TOffsets); + if(!ok) + { + return false; + } + + processVol<opMul,VOL>(pData->D_volumeData, + (M_PI / 2.0f) / (float)pData->dims.iProjAngles, + pData->volumePitch, + pData->dims.iVolWidth, pData->dims.iVolHeight); + + return true; +} + +bool AstraFBP::getReconstruction(float* pfReconstruction, unsigned int iReconstructionPitch) const +{ + if (!pData->initialized) + return false; + + bool ok = copyVolumeFromDevice(pfReconstruction, iReconstructionPitch, + pData->dims.iVolWidth, + pData->dims.iVolHeight, + pData->D_volumeData, pData->volumePitch); + if (!ok) + return false; + + return true; +} + +int AstraFBP::calcFourierFilterSize(int _iDetectorCount) +{ + int iFFTRealDetCount = calcNextPowerOfTwo(2 * _iDetectorCount); + int iFreqBinCount = calcFFTFourSize(iFFTRealDetCount); + + // CHECKME: Matlab makes this at least 64. Do we also need to? + return iFreqBinCount; +} + +bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* = NULL */, int _iFilterWidth /* = 0 */, float _fD /* = 1.0f */, float _fFilterParameter /* = -1.0f */) +{ + if(pData->m_pDevFilter != 0) + { + freeComplexOnDevice(pData->m_pDevFilter); + pData->m_pDevFilter = 0; + } + + if (_eFilter == FILTER_NONE) + return true; // leave pData->m_pDevFilter set to 0 + + + int iFFTRealDetCount = calcNextPowerOfTwo(2 * pData->dims.iProjDets); + int iFreqBinCount = calcFFTFourSize(iFFTRealDetCount); + + cufftComplex * pHostFilter = new cufftComplex[pData->dims.iProjAngles * iFreqBinCount]; + memset(pHostFilter, 0, sizeof(cufftComplex) * pData->dims.iProjAngles * iFreqBinCount); + + allocateComplexOnDevice(pData->dims.iProjAngles, iFreqBinCount, &(pData->m_pDevFilter)); + + switch(_eFilter) + { + case FILTER_NONE: + // handled above + break; + case FILTER_RAMLAK: + case FILTER_SHEPPLOGAN: + case FILTER_COSINE: + case FILTER_HAMMING: + case FILTER_HANN: + case FILTER_TUKEY: + case FILTER_LANCZOS: + case FILTER_TRIANGULAR: + case FILTER_GAUSSIAN: + case FILTER_BARTLETTHANN: + case FILTER_BLACKMAN: + case FILTER_NUTTALL: + case FILTER_BLACKMANHARRIS: + case FILTER_BLACKMANNUTTALL: + case FILTER_FLATTOP: + { + genFilter(_eFilter, _fD, pData->dims.iProjAngles, pHostFilter, iFFTRealDetCount, iFreqBinCount, _fFilterParameter); + uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter); + + break; + } + case FILTER_PROJECTION: + { + // make sure the offered filter has the correct size + assert(_iFilterWidth == iFreqBinCount); + + for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++) + { + float fValue = _pfHostFilter[iFreqBinIndex]; + + for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++) + { + pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue; + pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f; + } + } + uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter); + break; + } + case FILTER_SINOGRAM: + { + // make sure the offered filter has the correct size + assert(_iFilterWidth == iFreqBinCount); + + for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++) + { + for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++) + { + float fValue = _pfHostFilter[iFreqBinIndex + iProjectionIndex * _iFilterWidth]; + + pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue; + pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f; + } + } + uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter); + break; + } + case FILTER_RPROJECTION: + { + int iProjectionCount = pData->dims.iProjAngles; + int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount; + float * pfHostRealFilter = new float[iRealFilterElementCount]; + memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount); + + int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount); + int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2; + int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth; + + int iFilterShiftSize = _iFilterWidth / 2; + + for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++) + { + int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount; + float fValue = _pfHostFilter[iDetectorIndex]; + + for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++) + { + pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue; + } + } + + float* pfDevRealFilter = NULL; + cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors + cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice); + delete[] pfHostRealFilter; + + runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter); + + cudaFree(pfDevRealFilter); + + break; + } + case FILTER_RSINOGRAM: + { + int iProjectionCount = pData->dims.iProjAngles; + int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount; + float* pfHostRealFilter = new float[iRealFilterElementCount]; + memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount); + + int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount); + int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2; + int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth; + + int iFilterShiftSize = _iFilterWidth / 2; + + for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++) + { + int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount; + + for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++) + { + float fValue = _pfHostFilter[iDetectorIndex + iProjectionIndex * _iFilterWidth]; + pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue; + } + } + + float* pfDevRealFilter = NULL; + cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors + cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice); + delete[] pfHostRealFilter; + + runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter); + + cudaFree(pfDevRealFilter); + + break; + } + default: + { + fprintf(stderr, "AstraFBP::setFilter: Weird filter type requested"); + delete [] pHostFilter; + return false; + } + } + + delete [] pHostFilter; + + return true; +} + +BPalgo::BPalgo() +{ + +} + +BPalgo::~BPalgo() +{ + +} + +bool BPalgo::init() +{ + return true; +} + +bool BPalgo::iterate(unsigned int) +{ + // TODO: This zeroVolume makes an earlier memcpy of D_volumeData redundant + zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + callBP(D_volumeData, volumePitch, D_sinoData, sinoPitch); + return true; +} + +float BPalgo::computeDiffNorm() +{ + float *D_projData; + unsigned int projPitch; + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); + + float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + + cudaFree(D_projData); + + return sqrt(s); +} + + +bool astraCudaFP(const float* pfVolume, float* pfSinogram, + unsigned int iVolWidth, unsigned int iVolHeight, + unsigned int iProjAngles, unsigned int iProjDets, + const float *pfAngles, const float *pfOffsets, + float fDetSize, unsigned int iDetSuperSampling, + int iGPUIndex) +{ + SDimensions dims; + + if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjDets = iProjDets; + dims.fDetScale = fDetSize; + + if (iDetSuperSampling == 0) + return false; + + dims.iRaysPerDet = iDetSuperSampling; + + if (iVolWidth <= 0 || iVolHeight <= 0) + return false; + + dims.iVolWidth = iVolWidth; + dims.iVolHeight = iVolHeight; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + bool ok; + + float* D_volumeData; + unsigned int volumePitch; + + ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + if (!ok) + return false; + + float* D_sinoData; + unsigned int sinoPitch; + + ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + return false; + } + + ok = copyVolumeToDevice(pfVolume, dims.iVolWidth, + dims.iVolWidth, dims.iVolHeight, + D_volumeData, volumePitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + ok = FP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pfAngles, pfOffsets, 1.0f); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + ok = copySinogramFromDevice(pfSinogram, dims.iProjDets, + dims.iProjDets, + dims.iProjAngles, + D_sinoData, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return true; +} + +bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, + unsigned int iVolWidth, unsigned int iVolHeight, + unsigned int iProjAngles, unsigned int iProjDets, + const float *pfAngles, float fOriginSourceDistance, + float fOriginDetectorDistance, float fPixelSize, + float fDetSize, + unsigned int iDetSuperSampling, + int iGPUIndex) +{ + SDimensions dims; + + if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjDets = iProjDets; + + if (iDetSuperSampling == 0) + return false; + + dims.iRaysPerDet = iDetSuperSampling; + + if (iVolWidth <= 0 || iVolHeight <= 0) + return false; + + dims.iVolWidth = iVolWidth; + dims.iVolHeight = iVolHeight; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + bool ok; + + float* D_volumeData; + unsigned int volumePitch; + + ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + if (!ok) + return false; + + float* D_sinoData; + unsigned int sinoPitch; + + ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + return false; + } + + ok = copyVolumeToDevice(pfVolume, dims.iVolWidth, + dims.iVolWidth, dims.iVolHeight, + D_volumeData, volumePitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + + // TODO: Turn this geometry conversion into a util function + SFanProjection* projs = new SFanProjection[dims.iProjAngles]; + + float fSrcX0 = 0.0f; + float fSrcY0 = -fOriginSourceDistance / fPixelSize; + float fDetUX0 = fDetSize / fPixelSize; + float fDetUY0 = 0.0f; + float fDetSX0 = dims.iProjDets * fDetUX0 / -2.0f; + float fDetSY0 = fOriginDetectorDistance / fPixelSize; + +#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = f##name##X0 * cos(alpha) - f##name##Y0 * sin(alpha); projs[i].f##name##Y = f##name##X0 * sin(alpha) + f##name##Y0 * cos(alpha); } while(0) + for (int i = 0; i < dims.iProjAngles; ++i) { + ROTATE0(Src, i, pfAngles[i]); + ROTATE0(DetS, i, pfAngles[i]); + ROTATE0(DetU, i, pfAngles[i]); + } + +#undef ROTATE0 + + ok = FanFP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, projs, 1.0f); + delete[] projs; + + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + ok = copySinogramFromDevice(pfSinogram, dims.iProjDets, + dims.iProjDets, + dims.iProjAngles, + D_sinoData, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + cudaFree(D_volumeData); + cudaFree(D_sinoData); + + return true; + +} + + +bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, + unsigned int iVolWidth, unsigned int iVolHeight, + unsigned int iProjAngles, unsigned int iProjDets, + const SFanProjection *pAngles, + unsigned int iDetSuperSampling, + int iGPUIndex) +{ + SDimensions dims; + + if (iProjAngles == 0 || iProjDets == 0 || pAngles == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjDets = iProjDets; + dims.fDetScale = 1.0f; // TODO? + + if (iDetSuperSampling == 0) + return false; + + dims.iRaysPerDet = iDetSuperSampling; + + if (iVolWidth <= 0 || iVolHeight <= 0) + return false; + + dims.iVolWidth = iVolWidth; + dims.iVolHeight = iVolHeight; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + bool ok; + + float* D_volumeData; + unsigned int volumePitch; + + ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + if (!ok) + return false; + + float* D_sinoData; + unsigned int sinoPitch; + + ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + return false; + } + + ok = copyVolumeToDevice(pfVolume, dims.iVolWidth, + dims.iVolWidth, dims.iVolHeight, + D_volumeData, volumePitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + + ok = FanFP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pAngles, 1.0f); + + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + ok = copySinogramFromDevice(pfSinogram, dims.iProjDets, + dims.iProjDets, + dims.iProjAngles, + D_sinoData, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + cudaFree(D_volumeData); + cudaFree(D_sinoData); + + return true; + +} + + +} diff --git a/cuda/2d/astra.h b/cuda/2d/astra.h new file mode 100644 index 0000000..9e58301 --- /dev/null +++ b/cuda/2d/astra.h @@ -0,0 +1,205 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ASTRA_H +#define _CUDA_ASTRA_H + +#include "fft.h" +#include "fbp_filters.h" +#include "dims.h" +#include "algo.h" + +using astraCUDA::SFanProjection; + +namespace astra { + +enum Cuda2DProjectionKernel { + ker2d_default = 0 +}; + +class AstraFBP_internal; + +class _AstraExport AstraFBP { +public: + // Constructor + AstraFBP(); + + // Destructor + ~AstraFBP(); + + // Set the size of the reconstruction rectangle. + // Volume pixels are currently assumed to be 1x1 squares. + bool setReconstructionGeometry(unsigned int iVolWidth, + unsigned int iVolHeight, + float fPixelSize = 1.0f); + + // Set the projection angles and number of detector pixels per angle. + // pfAngles must be a float array of length iProjAngles. + // fDetSize indicates the size of a detector pixel compared to a + // volume pixel edge. + // + // pfAngles will only be read from during this call. + bool setProjectionGeometry(unsigned int iProjAngles, + unsigned int iProjDets, + const float *pfAngles, + float fDetSize = 1.0f); + + // Set linear supersampling factor for the BP. + // (The number of rays is the square of this) + // + // This may optionally be called before init(). + bool setPixelSuperSampling(unsigned int iPixelSuperSampling); + + // Set per-detector shifts. + // + // pfTOffsets will only be read from during this call. + bool setTOffsets(const float *pfTOffsets); + + // Returns the required size of a filter in the fourier domain + // when multiplying it with the fft of the projection data. + // Its value is equal to the smallest power of two larger than + // or equal to twice the number of detectors in the spatial domain. + // + // _iDetectorCount is the number of detectors in the spatial domain. + static int calcFourierFilterSize(int _iDetectorCount); + + // Sets the filter type. Some filter types require the user to supply an + // array containing the filter. + // The number of elements in a filter in the fourier domain should be equal + // to the value returned by calcFourierFilterSize(). + // The following types require a filter: + // + // - FILTER_PROJECTION: + // The filter size should be equal to the output of + // calcFourierFilterSize(). The filtered sinogram is + // multiplied with the supplied filter. + // + // - FILTER_SINOGRAM: + // Same as FILTER_PROJECTION, but now the filter should contain a row for + // every projection direction. + // + // - FILTER_RPROJECTION: + // The filter should now contain one kernel (= ifft of filter), with the + // peak in the center. The filter width + // can be any value. If odd, the peak is assumed to be in the center, if + // even, it is assumed to be at floor(filter-width/2). + // + // - FILTER_RSINOGRAM + // Same as FILTER_RPROJECTION, but now the supplied filter should contain a + // row for every projection direction. + // + // A large number of other filters (FILTER_RAMLAK, FILTER_SHEPPLOGAN, + // FILTER_COSINE, FILTER_HAMMING, and FILTER_HANN) + // have a D variable, which gives the cutoff point in the frequency domain. + // Setting this value to 1.0 will include the whole filter + bool setFilter(E_FBPFILTER _eFilter, + const float * _pfHostFilter = NULL, + int _iFilterWidth = 0, float _fD = 1.0f, float _fFilterParameter = -1.0f); + + // Initialize CUDA, allocate GPU buffers and + // precompute geometry-specific data. + // + // CUDA is set up to use GPU number iGPUIndex. + // + // This must be called after calling setReconstructionGeometry() and + // setProjectionGeometry(). + bool init(int iGPUIndex = 0); + + // Setup input sinogram for a slice. + // pfSinogram must be a float array of size iProjAngles*iSinogramPitch. + // NB: iSinogramPitch is measured in floats, not in bytes. + // + // This must be called after init(), and before iterate(). It may be + // called again after iterate()/getReconstruction() to start a new slice. + // + // pfSinogram will only be read from during this call. + bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch); + + // Runs an FBP reconstruction. + // This must be called after setSinogram(). + // + // run can be called before setFilter, but will then use the default Ram-Lak filter + bool run(); + + // Get the reconstructed slice. + // pfReconstruction must be a float array of size + // iVolHeight*iReconstructionPitch. + // NB: iReconstructionPitch is measured in floats, not in bytes. + // + // This may be called after run(). + bool getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const; + +private: + AstraFBP_internal* pData; +}; + +class _AstraExport BPalgo : public astraCUDA::ReconAlgo { +public: + BPalgo(); + ~BPalgo(); + + virtual bool init(); + + virtual bool iterate(unsigned int iterations); + + virtual float computeDiffNorm(); +}; + + + + +// TODO: Clean up this interface to FP + +// Do a single forward projection +_AstraExport bool astraCudaFP(const float* pfVolume, float* pfSinogram, + unsigned int iVolWidth, unsigned int iVolHeight, + unsigned int iProjAngles, unsigned int iProjDets, + const float *pfAngles, const float *pfOffsets, + float fDetSize = 1.0f, unsigned int iDetSuperSampling = 1, + int iGPUIndex = 0); + +// Do a single forward projection, fan beam +_AstraExport bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, + unsigned int iVolWidth, unsigned int iVolHeight, + unsigned int iProjAngles, unsigned int iProjDets, + const float *pfAngles, float fOriginSourceDistance, + float fOriginDetectorDistance, float fPixelSize = 1.0f, + float fDetSize = 1.0f, + unsigned int iDetSuperSampling = 1, + int iGPUIndex = 0); + +_AstraExport bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, + unsigned int iVolWidth, unsigned int iVolHeight, + unsigned int iProjAngles, unsigned int iProjDets, + const SFanProjection *pAngles, + unsigned int iDetSuperSampling = 1, + int iGPUIndex = 0); + +} +#endif diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu new file mode 100644 index 0000000..5b1cf46 --- /dev/null +++ b/cuda/2d/cgls.cu @@ -0,0 +1,304 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "cgls.h" +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +namespace astraCUDA { + +CGLS::CGLS() : ReconAlgo() +{ + D_z = 0; + D_p = 0; + D_r = 0; + D_w = 0; + + sliceInitialized = false; +} + + +CGLS::~CGLS() +{ + reset(); +} + +void CGLS::reset() +{ + cudaFree(D_z); + cudaFree(D_p); + cudaFree(D_r); + cudaFree(D_w); + + D_z = 0; + D_p = 0; + D_r = 0; + D_w = 0; + + ReconAlgo::reset(); +} + +bool CGLS::init() +{ + // Lifetime of z: within an iteration + allocateVolume(D_z, dims.iVolWidth+2, dims.iVolHeight+2, zPitch); + + // Lifetime of p: full algorithm + allocateVolume(D_p, dims.iVolWidth+2, dims.iVolHeight+2, pPitch); + + // Lifetime of r: full algorithm + allocateVolume(D_r, dims.iProjDets+2, dims.iProjAngles, rPitch); + + // Lifetime of w: within an iteration + allocateVolume(D_w, dims.iProjDets+2, dims.iProjAngles, wPitch); + + // TODO: check if allocations succeeded + return true; +} + + +bool CGLS::setBuffers(float* _D_volumeData, unsigned int _volumePitch, + float* _D_projData, unsigned int _projPitch) +{ + bool ok = ReconAlgo::setBuffers(_D_volumeData, _volumePitch, + _D_projData, _projPitch); + + if (!ok) + return false; + + sliceInitialized = false; + + return true; +} + +bool CGLS::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale, + const float* pfReconstruction, unsigned int iReconstructionPitch, + const float* pfVolMask, unsigned int iVolMaskPitch, + const float* pfSinoMask, unsigned int iSinoMaskPitch) +{ + sliceInitialized = false; + + return ReconAlgo::copyDataToGPU(pfSinogram, iSinogramPitch, fSinogramScale, pfReconstruction, iReconstructionPitch, pfVolMask, iVolMaskPitch, pfSinoMask, iSinoMaskPitch); +} + +bool CGLS::iterate(unsigned int iterations) +{ + shouldAbort = false; + + if (!sliceInitialized) { + + // copy sinogram + cudaMemcpy2D(D_r, sizeof(float)*rPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + + // r = sino - A*x + if (useVolumeMask) { + // Use z as temporary storage here since it is unused + cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_z, zPitch, D_r, rPitch, -1.0f); + } else { + callFP(D_volumeData, volumePitch, D_r, rPitch, -1.0f); + } + + + // p = A'*r + zeroVolume(D_p, pPitch, dims.iVolWidth+2, dims.iVolHeight+2); + callBP(D_p, pPitch, D_r, rPitch); + if (useVolumeMask) + processVol<opMul, VOL>(D_p, D_maskData, pPitch, dims.iVolWidth, dims.iVolHeight); + + + gamma = dotProduct2D(D_p, pPitch, dims.iVolWidth, dims.iVolHeight, 1, 1); + + sliceInitialized = true; + } + + + // iteration + for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { + + // w = A*p + zeroVolume(D_w, wPitch, dims.iProjDets+2, dims.iProjAngles); + callFP(D_p, pPitch, D_w, wPitch, 1.0f); + + // alpha = gamma / <w,w> + float ww = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + float alpha = gamma / ww; + + // x += alpha*p + processVol<opAddScaled, VOL>(D_volumeData, D_p, alpha, volumePitch, dims.iVolWidth, dims.iVolHeight); + + // r -= alpha*w + processVol<opAddScaled, SINO>(D_r, D_w, -alpha, rPitch, dims.iProjDets, dims.iProjAngles); + + + // z = A'*r + zeroVolume(D_z, zPitch, dims.iVolWidth+2, dims.iVolHeight+2); + callBP(D_z, zPitch, D_r, rPitch); + if (useVolumeMask) + processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); + + float beta = 1.0f / gamma; + gamma = dotProduct2D(D_z, zPitch, dims.iVolWidth, dims.iVolHeight, 1, 1); + beta *= gamma; + + // p = z + beta*p + processVol<opScaleAndAdd, VOL>(D_p, D_z, beta, pPitch, dims.iVolWidth, dims.iVolHeight); + + } + + return true; +} + + +float CGLS::computeDiffNorm() +{ + // We can use w and z as temporary storage here since they're not + // used outside of iterations. + + // copy sinogram to w + cudaMemcpy2D(D_w, sizeof(float)*wPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_z, zPitch, D_w, wPitch, -1.0f); + } else { + callFP(D_volumeData, volumePitch, D_w, wPitch, -1.0f); + } + + // compute norm of D_w + + float s = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + + return sqrt(s); +} + +bool doCGLS(float* D_volumeData, unsigned int volumePitch, + float* D_sinoData, unsigned int sinoPitch, + const SDimensions& dims, /*const SAugmentedData& augs,*/ + const float* angles, const float* TOffsets, unsigned int iterations) +{ + CGLS cgls; + bool ok = true; + + ok &= cgls.setGeometry(dims, angles); +#if 0 + if (D_maskData) + ok &= cgls.enableVolumeMask(); +#endif + if (TOffsets) + ok &= cgls.setTOffsets(TOffsets); + + if (!ok) + return false; + + ok = cgls.init(); + if (!ok) + return false; + +#if 0 + if (D_maskData) + ok &= cgls.setVolumeMask(D_maskData, maskPitch); +#endif + + ok &= cgls.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); + if (!ok) + return false; + + ok = cgls.iterate(iterations); + + return ok; +} + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_sinoData; + + SDimensions dims; + dims.iVolWidth = 1024; + dims.iVolHeight = 1024; + dims.iProjAngles = 512; + dims.iProjDets = 1536; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + unsigned int volumePitch, sinoPitch; + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + printf("pitch: %u\n", sinoPitch); + + unsigned int y, x; + float* sino = loadImage("sino.png", y, x); + + float* img = new float[dims.iVolWidth*dims.iVolHeight]; + + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch); + + float* angle = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) + angle[i] = i*(M_PI/dims.iProjAngles); + + CGLS cgls; + + cgls.setGeometry(dims, angle); + cgls.init(); + + cgls.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); + + cgls.iterate(25); + + delete[] angle; + + copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + + saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img); + + return 0; +} +#endif diff --git a/cuda/2d/cgls.h b/cuda/2d/cgls.h new file mode 100644 index 0000000..1013bf8 --- /dev/null +++ b/cuda/2d/cgls.h @@ -0,0 +1,92 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_CGLS_H +#define _CUDA_CGLS_H + +#include "util.h" +#include "algo.h" + +namespace astraCUDA { + +class _AstraExport CGLS : public ReconAlgo { +public: + CGLS(); + virtual ~CGLS(); + + // disable some features + virtual bool enableSinogramMask() { return false; } + virtual bool setMinConstraint(float) { return false; } + virtual bool setMaxConstraint(float) { return false; } + + virtual bool init(); + + virtual bool setBuffers(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch); + + virtual bool copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale, + const float* pfReconstruction, unsigned int iReconstructionPitch, + const float* pfVolMask, unsigned int iVolMaskPitch, + const float* pfSinoMask, unsigned int iSinoMaskPitch); + + + virtual bool iterate(unsigned int iterations); + + virtual float computeDiffNorm(); + +protected: + void reset(); + + bool sliceInitialized; + + // Buffers + float* D_r; + unsigned int rPitch; + + float* D_w; + unsigned int wPitch; + + float* D_z; + unsigned int zPitch; + + float* D_p; + unsigned int pPitch; + + + float gamma; +}; + + +_AstraExport bool doCGLS(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, unsigned int iterations); + +} + +#endif diff --git a/cuda/2d/darthelper.cu b/cuda/2d/darthelper.cu new file mode 100644 index 0000000..db0036e --- /dev/null +++ b/cuda/2d/darthelper.cu @@ -0,0 +1,358 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "util.h" +#include "darthelper.h" +#include <cassert> + +namespace astraCUDA { + +// CUDA function for the selection of ROI +__global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + float x = (float)(threadIdx.x + 16*blockIdx.x); + float y = (float)(threadIdx.y + 16*blockIdx.y); + + float w = (width-1.0f)*0.5f; + float h = (height-1.0f)*0.5f; + + if ((x-w)*(x-w) + (y-h)*(y-h) > radius * radius * 0.25f) + { + float* d = (float*)in; + unsigned int o = (y+padY)*pitch+x+padX; + d[o] = 0.0f; + } +} + +void roiSelect(float* out, float radius, unsigned int width, unsigned int height) +{ + float* D_data; + + unsigned int pitch; + allocateVolume(D_data, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_data, pitch); + + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + devRoiSelect<<<gridSize, blockSize>>>(D_data, radius, pitch, width, height, 1, 1); + + copyVolumeFromDevice(out, width, width, height, D_data, pitch); + + cudaFree(D_data); +} + + + + +// CUDA function for the masking of DART with a radius == 1 +__global__ void devDartMask(float* mask, const float* in, unsigned int conn, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > 0 && x < width - 1 && y > 0 && y < height - 1) { + float* d = (float*)in; + float* m = (float*)mask; + + unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. + unsigned int o1 = o2 - pitch; // On previous row. + unsigned int o3 = o2 + pitch; // On next row. + + if ((conn == 8 && // 8-connected + (d[o1 - 1] != d[o2] || d[o1] != d[o2] || d[o1 + 1] != d[o2] || + d[o2 - 1] != d[o2] || d[o2 + 1] != d[o2] || + d[o3 - 1] != d[o2] || d[o3] != d[o2] || d[o3 + 1] != d[o2] )) + || + (conn == 4 && // 4-connected + ( d[o1] != d[o2] || + d[o2 - 1] != d[o2] || d[o3 + 1] != d[o2] || + d[o3] != d[o2] ))) + { + m[o2] = 1.0f; + } + } +} + + +// CUDA function for the masking of DART with a radius > 1 +__global__ void devDartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > radius-1 && x < width - radius && y > radius-1 && y < height - radius) + { + float* d = (float*)in; + float* m = (float*)mask; + + int r = radius; + + // o2: index of the current center pixel + int o2 = (y+padY)*pitch+x+padX; + + if (conn == 8) // 8-connected + { + for (int row = -r; row <= r; row++) + { + int o1 = (y+padY+row)*pitch+x+padX; + for (int col = -r; col <= r; col++) + { + if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;} + } + } + } + else if (conn == 4) // 4-connected + { + // horizontal + unsigned int o1 = (y+padY)*pitch+x+padX; + for (int col = -r; col <= r; col++) + { + if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;} + } + + // vertical + for (int row = -r; row <= r; row++) + { + unsigned int o1 = (y+padY+row)*pitch+x+padX; + if (d[o1] != d[o2]) {m[o2] = 1.0f; return;} + } + } + } +} + + +// CUDA function for the masking of ADART with a radius == 1 +__global__ void devADartMask(float* mask, const float* in, unsigned int conn, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > 0 && x < width - 1 && y > 0 && y < height - 1) { + float* d = (float*)in; + float* m = (float*)mask; + + unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. + unsigned int o1 = o2 - pitch; // On previous row. + unsigned int o3 = o2 + pitch; // On next row. + + if (conn == 8) + { + if (d[o1 - 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o1 ] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o1 + 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o2 - 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o2 + 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o3 - 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o3 ] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o3 + 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + } + else if (conn == 4) + { + if (d[o1 ] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o2 - 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o2 + 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o3 ] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + } + } +} + + +// CUDA function for the masking of ADART with a radius > 1 +__global__ void devADartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > radius-1 && x < width - radius && y > radius-1 && y < height - radius) + { + float* d = (float*)in; + float* m = (float*)mask; + + int r = radius; + + unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. + + if (conn == 8) + { + for (int row = -r; row <= r; row++) + { + unsigned int o1 = (y+padY+row)*pitch+x+padX; + for (int col = -r; col <= r; col++) + { + if (d[o1+col] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + } + } + } + else if (conn == 4) + { + // horizontal + for (int col = -r; col <= r; col++) + { + if (d[o2+col] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + } + + // vertical + for (int row = -r; row <= r; row++) + { + unsigned int o1 = (y+padY+row)*pitch+x+padX; + if (d[o1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + } + } + } +} + + +void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int width, unsigned int height) +{ + float* D_segmentationData; + float* D_maskData; + + unsigned int pitch; + allocateVolume(D_segmentationData, width+2, height+2, pitch); + copyVolumeToDevice(segmentation, width, width, height, D_segmentationData, pitch); + + allocateVolume(D_maskData, width+2, height+2, pitch); + zeroVolume(D_maskData, pitch, width+2, height+2); + + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + if (threshold == 1 && radius == 1) + devDartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, pitch, width, height, 1, 1); + else if (threshold > 1 && radius == 1) + devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height, 1, 1); + else if (threshold == 1 && radius > 1) + devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height, 1, 1); + else + devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height, 1, 1); + + copyVolumeFromDevice(mask, width, width, height, D_maskData, pitch); + + cudaFree(D_segmentationData); + cudaFree(D_maskData); + +} + + +__global__ void devDartSmoothingRadius(float* out, const float* in, float b, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > radius-1 && x < width - radius && y > radius-1 && y < height - radius) + { + float* d = (float*)in; + float* m = (float*)out; + + unsigned int o2 = (y+padY)*pitch+x+padX; + int r = radius; + float res = -d[o2]; + + for (int row = -r; row < r; row++) + { + unsigned int o1 = (y+padY+row)*pitch+x+padX; + for (int col = -r; col <= r; col++) + { + res += d[o1+col]; + } + } + + res *= b / 4*r*(r+1); + res += (1.0f-b) * d[o2]; + + m[o2] = res; + } +} + + +__global__ void devDartSmoothing(float* out, const float* in, float b, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > 0 && x < width - 1 && y > 0 && y < height - 1) { + float* d = (float*)in; + float* m = (float*)out; + + unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. + unsigned int o1 = o2 - pitch; // On previous row. + unsigned int o3 = o2 + pitch; // On next row. + + m[o2] = (1.0f-b) * d[o2] + b * 0.125f * (d[o1 - 1] + d[o1] + d[o1 + 1] + d[o2 - 1] + d[o2 + 1] + d[o3 - 1] + d[o3] + d[o3 + 1]); + } +} + + +void dartSmoothing(float* out, const float* in, float b, unsigned int radius, unsigned int width, unsigned int height) +{ + float* D_inData; + float* D_outData; + + unsigned int pitch; + allocateVolume(D_inData, width+2, height+2, pitch); + copyVolumeToDevice(in, width, width, height, D_inData, pitch); + + allocateVolume(D_outData, width+2, height+2, pitch); + zeroVolume(D_outData, pitch, width+2, height+2); + + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + if (radius == 1) + devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, pitch, width, height, 1, 1); + else + devDartSmoothingRadius<<<gridSize, blockSize>>>(D_outData, D_inData, b, radius, pitch, width, height, 1, 1); + + copyVolumeFromDevice(out, width, width, height, D_outData, pitch); + + cudaFree(D_outData); + cudaFree(D_inData); + +} + + + +bool setGPUIndex(int iGPUIndex) +{ + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + return true; +} + + +} diff --git a/cuda/2d/darthelper.h b/cuda/2d/darthelper.h new file mode 100644 index 0000000..e05f01e --- /dev/null +++ b/cuda/2d/darthelper.h @@ -0,0 +1,44 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ARITH2_H +#define _CUDA_ARITH2_H + +#include <cuda.h> + +namespace astraCUDA { + + void roiSelect(float* out, float radius, unsigned int width, unsigned int height); + void dartMask(float* out, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int width, unsigned int height); + void dartSmoothing(float* out, const float* in, float b, unsigned int radius, unsigned int width, unsigned int height); + + bool setGPUIndex(int index); + +} + +#endif diff --git a/cuda/2d/dataop.cu b/cuda/2d/dataop.cu new file mode 100644 index 0000000..68573b2 --- /dev/null +++ b/cuda/2d/dataop.cu @@ -0,0 +1,130 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "util.h" +#include "dataop.h" +#include "arith.h" +#include <cassert> + +namespace astraCUDA { + +void operationVolumeMult(float* data1, float* data2, unsigned int width, unsigned int height) +{ + float* D_data1; + float* D_data2; + + unsigned int pitch; + allocateVolume(D_data1, width+2, height+2, pitch); + copyVolumeToDevice(data1, width, width, height, D_data1, pitch); + + allocateVolume(D_data2, width+2, height+2, pitch); + copyVolumeToDevice(data2, width, width, height, D_data2, pitch); + + processVol<opMul, VOL>(D_data1, D_data2, pitch, width, height); + + copyVolumeFromDevice(data1, width, width, height, D_data1, pitch); + + cudaFree(D_data1); + cudaFree(D_data2); +} + +void operationVolumeMultScalarMask(float* data, float* mask, float scalar, unsigned int width, unsigned int height) +{ + float* D_data; + float* D_mask; + + unsigned int pitch; + allocateVolume(D_data, width+2, height+2, pitch); + copyVolumeToDevice(data, width, width, height, D_data, pitch); + + allocateVolume(D_mask, width+2, height+2, pitch); + copyVolumeToDevice(mask, width, width, height, D_mask, pitch); + + processVol<opMulMask, VOL>(D_data, D_mask, scalar, pitch, width, height); + + copyVolumeFromDevice(data, width, width, height, D_data, pitch); + + cudaFree(D_data); + cudaFree(D_mask); +} + + +void operationVolumeMultScalar(float* data, float scalar, unsigned int width, unsigned int height) +{ + float* D_data; + + unsigned int pitch; + allocateVolume(D_data, width+2, height+2, pitch); + copyVolumeToDevice(data, width, width, height, D_data, pitch); + + processVol<opMul, VOL>(D_data, scalar, pitch, width, height); + + copyVolumeFromDevice(data, width, width, height, D_data, pitch); + + cudaFree(D_data); +} + + +void operationVolumeAdd(float* data1, float* data2, unsigned int width, unsigned int height) +{ + float* D_data1; + float* D_data2; + + unsigned int pitch; + allocateVolume(D_data1, width+2, height+2, pitch); + copyVolumeToDevice(data1, width, width, height, D_data1, pitch); + + allocateVolume(D_data2, width+2, height+2, pitch); + copyVolumeToDevice(data2, width, width, height, D_data2, pitch); + + processVol<opAdd, VOL>(D_data1, D_data2, pitch, width, height); + + copyVolumeFromDevice(data1, width, width, height, D_data1, pitch); + + cudaFree(D_data1); + cudaFree(D_data2); +} + + +void operationVolumeAddScalar(float* data, float scalar, unsigned int width, unsigned int height) +{ + float* D_data; + + unsigned int pitch; + allocateVolume(D_data, width+2, height+2, pitch); + copyVolumeToDevice(data, width, width, height, D_data, pitch); + + processVol<opAdd, VOL>(D_data, scalar, pitch, width, height); + + copyVolumeFromDevice(data, width, width, height, D_data, pitch); + + cudaFree(D_data); +} + + +} diff --git a/cuda/2d/dataop.h b/cuda/2d/dataop.h new file mode 100644 index 0000000..3e9c7e2 --- /dev/null +++ b/cuda/2d/dataop.h @@ -0,0 +1,47 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_DATAOP_H +#define _CUDA_DATAOP_H + +#include <cuda.h> + +namespace astraCUDA { + + void operationVolumeMult(float* data1, float* data2, unsigned int width, unsigned int height); + + void operationVolumeMultScalar(float* data, float scalar, unsigned int width, unsigned int height); + void operationVolumeMultScalarMask(float* data, float* mask, float scalar, unsigned int width, unsigned int height); + + void operationVolumeAdd(float* data1, float* data2, unsigned int width, unsigned int height); + + void operationVolumeAddScalar(float* data, float scalar, unsigned int width, unsigned int height); + +} + +#endif diff --git a/cuda/2d/dims.h b/cuda/2d/dims.h new file mode 100644 index 0000000..21ccb31 --- /dev/null +++ b/cuda/2d/dims.h @@ -0,0 +1,68 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_DIMS_H +#define _CUDA_DIMS_H + +namespace astraCUDA { + +struct SFanProjection { + // the source + float fSrcX, fSrcY; + + // the start of the (linear) detector + float fDetSX, fDetSY; + + // the length of a single detector pixel + float fDetUX, fDetUY; +}; + + +struct SDimensions { + unsigned int iVolWidth; + unsigned int iVolHeight; + unsigned int iProjAngles; + unsigned int iProjDets; + float fDetScale; // size of detector compared to volume pixels + unsigned int iRaysPerDet; + unsigned int iRaysPerPixelDim; +}; + +struct SDimensions3D { + unsigned int iVolX; + unsigned int iVolY; + unsigned int iVolZ; + unsigned int iProjAngles; + unsigned int iProjU; // number of detectors in the U direction + unsigned int iProjV; // number of detectors in the V direction +}; + +} + +#endif + diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu new file mode 100644 index 0000000..74d1bbf --- /dev/null +++ b/cuda/2d/em.cu @@ -0,0 +1,262 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "em.h" +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +namespace astraCUDA { + + +// TODO: ensure non-negativity somewhere?? + + +EM::EM() +{ + D_projData = 0; + D_tmpData = 0; + D_pixelWeight = 0; + +} + + +EM::~EM() +{ + reset(); +} + +void EM::reset() +{ + cudaFree(D_projData); + cudaFree(D_tmpData); + cudaFree(D_pixelWeight); + + D_projData = 0; + D_tmpData = 0; + D_pixelWeight = 0; + + ReconAlgo::reset(); +} + + +bool EM::init() +{ + allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch); + zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); + + allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + + // We can't precompute pixelWeights when using a volume mask +#if 0 + if (!useVolumeMask) +#endif + precomputeWeights(); + + // TODO: check if allocations succeeded + return true; +} + +bool EM::precomputeWeights() +{ + zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); +#if 0 + if (useSinogramMask) { + callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch); + } else +#endif + { + processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles); + callBP(D_pixelWeight, pixelPitch, D_projData, projPitch); + } + processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); + +#if 0 + if (useVolumeMask) { + // scale pixel weights with mask to zero out masked pixels + processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight); + } +#endif + + return true; +} + +bool EM::iterate(unsigned int iterations) +{ + shouldAbort = false; + +#if 0 + if (useVolumeMask) + precomputeWeights(); +#endif + + // iteration + for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { + + // Do FP of volumeData + zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + callFP(D_volumeData, volumePitch, D_projData, projPitch, 1.0f); + + // Divide sinogram by FP (into projData) + processVol<opDividedBy, SINO>(D_projData, D_sinoData, projPitch, dims.iProjDets, dims.iProjAngles); + + // Do BP of projData into tmpData + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + callBP(D_tmpData, tmpPitch, D_projData, projPitch); + + // Multiply volumeData with tmpData divided by pixel weights + processVol<opMul2, VOL>(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); + + } + + return true; +} + +float EM::computeDiffNorm() +{ + // copy sinogram to projection data + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); + } else { + callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); + } + + + // compute norm of D_projData + + float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + + return sqrt(s); +} + + +bool doEM(float* D_volumeData, unsigned int volumePitch, + float* D_sinoData, unsigned int sinoPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, unsigned int iterations) +{ + EM em; + bool ok = true; + + ok &= em.setGeometry(dims, angles); + if (TOffsets) + ok &= em.setTOffsets(TOffsets); + + if (!ok) + return false; + + ok = em.init(); + if (!ok) + return false; + + ok &= em.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); + if (!ok) + return false; + + ok = em.iterate(iterations); + + return ok; +} + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_sinoData; + + SDimensions dims; + dims.iVolWidth = 1024; + dims.iVolHeight = 1024; + dims.iProjAngles = 512; + dims.iProjDets = 1536; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + unsigned int volumePitch, sinoPitch; + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + printf("pitch: %u\n", sinoPitch); + + unsigned int y, x; + float* sino = loadImage("sino.png", y, x); + + float* img = new float[dims.iVolWidth*dims.iVolHeight]; + + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch); + + float* angle = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) + angle[i] = i*(M_PI/dims.iProjAngles); + + EM em; + + em.setGeometry(dims, angle); + em.init(); + + // TODO: Initialize D_volumeData with an unfiltered backprojection + + em.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); + + em.iterate(25); + + + delete[] angle; + + copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + + saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img); + + return 0; +} + +#endif diff --git a/cuda/2d/em.h b/cuda/2d/em.h new file mode 100644 index 0000000..5a9ffed --- /dev/null +++ b/cuda/2d/em.h @@ -0,0 +1,77 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_EM_H +#define _CUDA_EM_H + +#include "util.h" +#include "algo.h" + +namespace astraCUDA { + +class _AstraExport EM : public ReconAlgo { +public: + EM(); + virtual ~EM(); + + // disable some features + virtual bool enableSinogramMask() { return false; } + virtual bool enableVolumeMask() { return false; } + virtual bool setMinConstraint(float) { return false; } + virtual bool setMaxConstraint(float) { return false; } + + virtual bool init(); + + virtual bool iterate(unsigned int iterations); + + virtual float computeDiffNorm(); + +protected: + void reset(); + bool precomputeWeights(); + + // Temporary buffers + float* D_projData; + unsigned int projPitch; + + float* D_tmpData; + unsigned int tmpPitch; + + // Geometry-specific precomputed data + float* D_pixelWeight; + unsigned int pixelPitch; +}; + +_AstraExport bool doEM(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, unsigned int iterations); + +} + +#endif diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu new file mode 100644 index 0000000..1edc6d9 --- /dev/null +++ b/cuda/2d/fan_bp.cu @@ -0,0 +1,374 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> + +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +#define PIXELTRACE + + +typedef texture<float, 2, cudaReadModeElementType> texture2D; + +static texture2D gT_FanProjTexture; + + +namespace astraCUDA { + +const unsigned int g_anglesPerBlock = 16; +const unsigned int g_blockSliceSize = 32; +const unsigned int g_blockSlices = 16; + +const unsigned int g_MaxAngles = 2560; + +__constant__ float gC_SrcX[g_MaxAngles]; +__constant__ float gC_SrcY[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; + + +static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_FanProjTexture.addressMode[0] = cudaAddressModeClamp; + gT_FanProjTexture.addressMode[1] = cudaAddressModeClamp; + gT_FanProjTexture.filterMode = cudaFilterModeLinear; + gT_FanProjTexture.normalized = false; + + cudaBindTexture2D(0, gT_FanProjTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch); + + // TODO: error value? + + return true; +} + +__global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims) +{ + const int relX = threadIdx.x; + const int relY = threadIdx.y; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + const int X = blockIdx.x * g_blockSlices + relX; + const int Y = blockIdx.y * g_blockSliceSize + relY; + + if (X >= dims.iVolWidth || Y >= dims.iVolHeight) + return; + + const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ); + const float fY = - ( Y - 0.5f*dims.iVolHeight + 0.5f ); + + float* volData = (float*)D_volData; + + float fVal = 0.0f; + float fA = startAngle + 0.5f; + + // TODO: Distance correction? + + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float fSrcX = gC_SrcX[angle]; + const float fSrcY = gC_SrcY[angle]; + const float fDetSX = gC_DetSX[angle]; + const float fDetSY = gC_DetSY[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + + const float fXD = fSrcX - fX; + const float fYD = fSrcY - fY; + + const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX; + const float fDen = fDetUX * fYD - fDetUY * fXD; + + const float fT = fNum / fDen + 1.0f; + fVal += tex2D(gT_FanProjTexture, fT, fA); + fA += 1.0f; + } + + volData[(Y+1)*volPitch+X+1] += fVal; +} + +// supersampling version +__global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims) +{ + const int relX = threadIdx.x; + const int relY = threadIdx.y; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + const int X = blockIdx.x * g_blockSlices + relX; + const int Y = blockIdx.y * g_blockSliceSize + relY; + + if (X >= dims.iVolWidth || Y >= dims.iVolHeight) + return; + + const float fXb = ( X - 0.5f*dims.iVolWidth + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim); + const float fYb = - ( Y - 0.5f*dims.iVolHeight + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim); + + const float fSubStep = 1.0f/dims.iRaysPerPixelDim; + + float* volData = (float*)D_volData; + + float fVal = 0.0f; + float fA = startAngle + 0.5f; + + // TODO: Distance correction? + + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float fSrcX = gC_SrcX[angle]; + const float fSrcY = gC_SrcY[angle]; + const float fDetSX = gC_DetSX[angle]; + const float fDetSY = gC_DetSY[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + + // TODO: Optimize these loops... + float fX = fXb; + for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) { + float fY = fYb; + for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) { + const float fXD = fSrcX - fX; + const float fYD = fSrcY - fY; + + const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX; + const float fDen = fDetUX * fYD - fDetUY * fXD; + + const float fT = fNum / fDen + 1.0f; + fVal += tex2D(gT_FanProjTexture, fT, fA); + fY -= fSubStep; + } + fX += fSubStep; + } + fA += 1.0f; + } + + volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim); +} + + +// BP specifically for SART. +// It includes (free) weighting with voxel weight. +// It assumes the proj texture is set up _without_ padding, unlike regular BP. +__global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDimensions dims) +{ + const int relX = threadIdx.x; + const int relY = threadIdx.y; + + const int X = blockIdx.x * g_blockSlices + relX; + const int Y = blockIdx.y * g_blockSliceSize + relY; + + if (X >= dims.iVolWidth || Y >= dims.iVolHeight) + return; + + const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ); + const float fY = - ( Y - 0.5f*dims.iVolHeight + 0.5f ); + + float* volData = (float*)D_volData; + + // TODO: Distance correction? + + // TODO: Constant memory vs parameters. + const float fSrcX = gC_SrcX[0]; + const float fSrcY = gC_SrcY[0]; + const float fDetSX = gC_DetSX[0]; + const float fDetSY = gC_DetSY[0]; + const float fDetUX = gC_DetUX[0]; + const float fDetUY = gC_DetUY[0]; + + const float fXD = fSrcX - fX; + const float fYD = fSrcY - fY; + + const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX; + const float fDen = fDetUX * fYD - fDetUY * fXD; + + const float fT = fNum / fDen; + const float fVal = tex2D(gT_FanProjTexture, fT, 0.5f); + + volData[(Y+1)*volPitch+X+1] += fVal; +} + + +bool FanBP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const SFanProjection* angles) +{ + // TODO: process angles block by block + assert(dims.iProjAngles <= g_MaxAngles); + + bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(SrcX); + TRANSFER_TO_CONSTANT(SrcY); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + dim3 dimBlock(g_blockSlices, g_blockSliceSize); + dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices, + (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) { + if (dims.iRaysPerPixelDim > 1) + devFanBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims); + else + devFanBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims); + } + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + cudaStreamDestroy(stream); + + return true; +} + + +// D_projData is a pointer to one padded sinogram line +bool FanBP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle, + const SDimensions& dims, const SFanProjection* angles) +{ + // only one angle + bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1); + + // transfer angle to constant memory +#define TRANSFER_TO_CONSTANT(name) do { cudaMemcpyToSymbol(gC_##name, &(angles[angle].f##name), sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(SrcX); + TRANSFER_TO_CONSTANT(SrcY); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + +#undef TRANSFER_TO_CONSTANT + + dim3 dimBlock(g_blockSlices, g_blockSliceSize); + dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices, + (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize); + + devFanBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, dims); + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + return true; +} + + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_projData; + + SDimensions dims; + dims.iVolWidth = 128; + dims.iVolHeight = 128; + dims.iProjAngles = 180; + dims.iProjDets = 256; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + unsigned int volumePitch, projPitch; + + SFanProjection projs[180]; + + projs[0].fSrcX = 0.0f; + projs[0].fSrcY = 1536.0f; + projs[0].fDetSX = 128.0f; + projs[0].fDetSY = -512.0f; + projs[0].fDetUX = -1.0f; + projs[0].fDetUY = 0.0f; + +#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = projs[0].f##name##X * cos(alpha) - projs[0].f##name##Y * sin(alpha); projs[i].f##name##Y = projs[0].f##name##X * sin(alpha) + projs[0].f##name##Y * cos(alpha); } while(0) + + for (int i = 1; i < 180; ++i) { + ROTATE0(Src, i, i*2*M_PI/180); + ROTATE0(DetS, i, i*2*M_PI/180); + ROTATE0(DetU, i, i*2*M_PI/180); + } + +#undef ROTATE0 + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + printf("pitch: %u\n", projPitch); + + unsigned int y, x; + float* sino = loadImage("sino.png", y, x); + + float* img = new float[dims.iVolWidth*dims.iVolHeight]; + + memset(img, 0, dims.iVolWidth*dims.iVolHeight*sizeof(float)); + + copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + FanBP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs); + + copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + + saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img); + + return 0; +} +#endif diff --git a/cuda/2d/fan_bp.h b/cuda/2d/fan_bp.h new file mode 100644 index 0000000..a4e62be --- /dev/null +++ b/cuda/2d/fan_bp.h @@ -0,0 +1,45 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_FAN_BP_H +#define _CUDA_FAN_BP_H + +namespace astraCUDA { + +_AstraExport bool FanBP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const SFanProjection* angles); + +_AstraExport bool FanBP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle, + const SDimensions& dims, const SFanProjection* angles); + +} + +#endif diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu new file mode 100644 index 0000000..cf9f352 --- /dev/null +++ b/cuda/2d/fan_fp.cu @@ -0,0 +1,370 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + + +typedef texture<float, 2, cudaReadModeElementType> texture2D; + +static texture2D gT_FanVolumeTexture; + + +namespace astraCUDA { + +static const unsigned g_MaxAngles = 2560; +__constant__ float gC_SrcX[g_MaxAngles]; +__constant__ float gC_SrcY[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; + + +// optimization parameters +static const unsigned int g_anglesPerBlock = 16; +static const unsigned int g_detBlockSize = 32; +static const unsigned int g_blockSlices = 64; + +static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + dataArray = 0; + cudaMallocArray(&dataArray, &channelDesc, width, height); + cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice); + + gT_FanVolumeTexture.addressMode[0] = cudaAddressModeClamp; + gT_FanVolumeTexture.addressMode[1] = cudaAddressModeClamp; + gT_FanVolumeTexture.filterMode = cudaFilterModeLinear; + gT_FanVolumeTexture.normalized = false; + + // TODO: For very small sizes (roughly <=512x128) with few angles (<=180) + // not using an array is more efficient. + //cudaBindTexture2D(0, gT_FanVolumeTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch); + cudaBindTextureToArray(gT_FanVolumeTexture, dataArray, channelDesc); + + // TODO: error value? + + return true; +} + +// projection for angles that are roughly horizontal +// (detector roughly vertical) +__global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) +{ + float* projData = (float*)D_projData; + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + const int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + if (angle >= endAngle) + return; + + const int detector = blockIdx.y * g_detBlockSize + relDet; + + if (detector < 0 || detector >= dims.iProjDets) + return; + + const float fSrcX = gC_SrcX[angle]; + const float fSrcY = gC_SrcY[angle]; + const float fDetSX = gC_DetSX[angle]; + const float fDetSY = gC_DetSY[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + + float fVal = 0.0f; + + const float fdx = fabsf(fDetSX + detector*fDetUX + 0.5f - fSrcX); + const float fdy = fabsf(fDetSY + detector*fDetUY + 0.5f - fSrcY); + + if (fdy > fdx) + return; + + + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + const float fDet = detector + (0.5f + iSubT) / dims.iRaysPerDet; + + const float fDetX = fDetSX + fDet * fDetUX; + const float fDetY = fDetSY + fDet * fDetUY; + + // ray: y = alpha * x + beta + const float alpha = (fSrcY - fDetY) / (fSrcX - fDetX); + const float beta = fSrcY - alpha * fSrcX; + + const float fDistCorr = sqrt(alpha*alpha+1.0f) * outputScale / dims.iRaysPerDet; + + // intersect ray with first slice + + float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 1.5f; + float fX = startSlice + 1.5f; + + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolWidth) + endSlice = dims.iVolWidth; + + float fV = 0.0f; + for (int slice = startSlice; slice < endSlice; ++slice) + { + fV += tex2D(gT_FanVolumeTexture, fX, fY); + fY -= alpha; + fX += 1.0f; + } + + fVal += fV * fDistCorr; + + } + + projData[angle*projPitch+detector+1] += fVal; +} + + +// projection for angles that are roughly vertical +// (detector roughly horizontal) +__global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) +{ + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + const int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + + if (angle >= endAngle) + return; + + const int detector = blockIdx.y * g_detBlockSize + relDet; + + if (detector < 0 || detector >= dims.iProjDets) + return; + + float* projData = (float*)D_projData; + + const float fSrcX = gC_SrcX[angle]; + const float fSrcY = gC_SrcY[angle]; + const float fDetSX = gC_DetSX[angle]; + const float fDetSY = gC_DetSY[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + + float fVal = 0.0f; + + const float fdx = fabsf(fDetSX + detector*fDetUX + 0.5f - fSrcX); + const float fdy = fabsf(fDetSY + detector*fDetUY + 0.5f - fSrcY); + + if (fdy <= fdx) + return; + + + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + const float fDet = detector + (0.5f + iSubT) / dims.iRaysPerDet /*- gC_angle_offset[angle]*/; + + const float fDetX = fDetSX + fDet * fDetUX; + const float fDetY = fDetSY + fDet * fDetUY; + + // ray: x = alpha * y + beta + const float alpha = (fSrcX - fDetX) / (fSrcY - fDetY); + const float beta = fSrcX - alpha * fSrcY; + + const float fDistCorr = sqrt(alpha*alpha+1) * outputScale / dims.iRaysPerDet; + + // intersect ray with first slice + + float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 1.5f; + float fY = startSlice + 1.5f; + + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolHeight) + endSlice = dims.iVolHeight; + + float fV = 0.0f; + + for (int slice = startSlice; slice < endSlice; ++slice) + { + fV += tex2D(gT_FanVolumeTexture, fX, fY); + fX -= alpha; + fY += 1.0f; + } + + fVal += fV * fDistCorr; + + } + + projData[angle*projPitch+detector+1] += fVal; +} + +bool FanFP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const SFanProjection* angles, + float outputScale) +{ + // TODO: load angles into constant memory in smaller blocks + assert(dims.iProjAngles <= g_MaxAngles); + + cudaArray* D_dataArray; + bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(SrcX); + TRANSFER_TO_CONSTANT(SrcY); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles + const unsigned int g_blockSliceSize = g_detBlockSize; + + std::list<cudaStream_t> streams; + + + unsigned int blockStart = 0; + unsigned int blockEnd = dims.iProjAngles; + + dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock, + (dims.iProjDets+g_blockSliceSize-1)/g_blockSliceSize); // angle blocks, regions + cudaStream_t stream1; + cudaStreamCreate(&stream1); + streams.push_back(stream1); + for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices) + FanFPhorizontal<<<dimGrid, dimBlock, 0, stream1>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); + + cudaStream_t stream2; + cudaStreamCreate(&stream2); + streams.push_back(stream2); + for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices) + FanFPvertical<<<dimGrid, dimBlock, 0, stream2>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); + + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); + + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + cudaFreeArray(D_dataArray); + + return true; +} + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_projData; + + SDimensions dims; + dims.iVolWidth = 128; + dims.iVolHeight = 128; + dims.iProjAngles = 180; + dims.iProjDets = 256; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + unsigned int volumePitch, projPitch; + + SFanProjection projs[180]; + + projs[0].fSrcX = 0.0f; + projs[0].fSrcY = 1536.0f; + projs[0].fDetSX = 128.0f; + projs[0].fDetSY = -512.0f; + projs[0].fDetUX = -1.0f; + projs[0].fDetUY = 0.0f; + +#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = projs[0].f##name##X * cos(alpha) - projs[0].f##name##Y * sin(alpha); projs[i].f##name##Y = projs[0].f##name##X * sin(alpha) + projs[0].f##name##Y * cos(alpha); } while(0) + + for (int i = 1; i < 180; ++i) { + ROTATE0(Src, i, i*2*M_PI/180); + ROTATE0(DetS, i, i*2*M_PI/180); + ROTATE0(DetU, i, i*2*M_PI/180); + } + +#undef ROTATE0 + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + printf("pitch: %u\n", projPitch); + + unsigned int y, x; + float* img = loadImage("phantom128.png", y, x); + + float* sino = new float[dims.iProjAngles * dims.iProjDets]; + + memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float)); + + copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + float* angle = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) + angle[i] = i*(M_PI/dims.iProjAngles); + + FanFP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs, 1.0f); + + delete[] angle; + + copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + float s = 0.0f; + for (unsigned int y = 0; y < dims.iProjAngles; ++y) + for (unsigned int x = 0; x < dims.iProjDets; ++x) + s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x]; + printf("cpu norm: %f\n", s); + + //zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + printf("gpu norm: %f\n", s); + + saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino); + + + return 0; +} +#endif diff --git a/cuda/2d/fan_fp.h b/cuda/2d/fan_fp.h new file mode 100644 index 0000000..0734f40 --- /dev/null +++ b/cuda/2d/fan_fp.h @@ -0,0 +1,41 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_FAN_FP_H +#define _CUDA_FAN_FP_H + +namespace astraCUDA { + +_AstraExport bool FanFP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const SFanProjection* angles, + float outputScale); + +} + +#endif diff --git a/cuda/2d/fbp_filters.h b/cuda/2d/fbp_filters.h new file mode 100644 index 0000000..1232f8e --- /dev/null +++ b/cuda/2d/fbp_filters.h @@ -0,0 +1,58 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef FBP_FILTERS_H +#define FBP_FILTERS_H + +enum E_FBPFILTER +{ + FILTER_NONE, //< no filter (regular BP) + FILTER_RAMLAK, //< default FBP filter + FILTER_SHEPPLOGAN, //< Shepp-Logan + FILTER_COSINE, //< Cosine + FILTER_HAMMING, //< Hamming filter + FILTER_HANN, //< Hann filter + FILTER_TUKEY, //< Tukey filter + FILTER_LANCZOS, //< Lanczos filter + FILTER_TRIANGULAR, //< Triangular filter + FILTER_GAUSSIAN, //< Gaussian filter + FILTER_BARTLETTHANN, //< Bartlett-Hann filter + FILTER_BLACKMAN, //< Blackman filter + FILTER_NUTTALL, //< Nuttall filter, continuous first derivative + FILTER_BLACKMANHARRIS, //< Blackman-Harris filter + FILTER_BLACKMANNUTTALL, //< Blackman-Nuttall filter + FILTER_FLATTOP, //< Flat top filter + FILTER_KAISER, //< Kaiser filter + FILTER_PARZEN, //< Parzen filter + FILTER_PROJECTION, //< all projection directions share one filter + FILTER_SINOGRAM, //< every projection direction has its own filter + FILTER_RPROJECTION, //< projection filter in real space (as opposed to fourier space) + FILTER_RSINOGRAM, //< sinogram filter in real space +}; + +#endif /* FBP_FILTERS_H */ diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu new file mode 100644 index 0000000..79e4be7 --- /dev/null +++ b/cuda/2d/fft.cu @@ -0,0 +1,873 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "fft.h" +#include "util.h" + +#include <cufft.h> +#include <iostream> +#include <cuda.h> +#include <fstream> + +#include "../../include/astra/Logger.h" + +using namespace astra; + +// TODO: evaluate what we want to do in these situations: + +#define CHECK_ERROR(errorMessage) do { \ + cudaError_t err = cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + CLogger::writeTerminalCUDAError(__FILE__, __LINE__, cudaGetErrorString( err)); \ + exit(EXIT_FAILURE); \ + } } while (0) + +#define SAFE_CALL( call) do { \ + cudaError err = call; \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + CLogger::writeTerminalCUDAError(__FILE__, __LINE__, cudaGetErrorString( err)); \ + exit(EXIT_FAILURE); \ + } \ + err = cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + CLogger::writeTerminalCUDAError(__FILE__, __LINE__, cudaGetErrorString( err)); \ + exit(EXIT_FAILURE); \ + } } while (0) + + +__global__ static void applyFilter_kernel(int _iProjectionCount, + int _iFreqBinCount, + cufftComplex * _pSinogram, + cufftComplex * _pFilter) +{ + int iIndex = threadIdx.x + blockIdx.x * blockDim.x; + int iProjectionIndex = iIndex / _iFreqBinCount; + + if(iProjectionIndex >= _iProjectionCount) + { + return; + } + + float fA = _pSinogram[iIndex].x; + float fB = _pSinogram[iIndex].y; + float fC = _pFilter[iIndex].x; + float fD = _pFilter[iIndex].y; + + _pSinogram[iIndex].x = fA * fC - fB * fD; + _pSinogram[iIndex].y = fA * fD + fC * fB; +} + +__global__ static void rescaleInverseFourier_kernel(int _iProjectionCount, + int _iDetectorCount, + float* _pfInFourierOutput) +{ + int iIndex = threadIdx.x + blockIdx.x * blockDim.x; + int iProjectionIndex = iIndex / _iDetectorCount; + int iDetectorIndex = iIndex % _iDetectorCount; + + if(iProjectionIndex >= _iProjectionCount) + { + return; + } + + _pfInFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex] /= (float)_iDetectorCount; +} + +static void rescaleInverseFourier(int _iProjectionCount, int _iDetectorCount, + float * _pfInFourierOutput) +{ + const int iBlockSize = 256; + int iElementCount = _iProjectionCount * _iDetectorCount; + int iBlockCount = (iElementCount + iBlockSize - 1) / iBlockSize; + + rescaleInverseFourier_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount, + _iDetectorCount, + _pfInFourierOutput); + CHECK_ERROR("rescaleInverseFourier_kernel failed"); +} + +void applyFilter(int _iProjectionCount, int _iFreqBinCount, + cufftComplex * _pSinogram, cufftComplex * _pFilter) +{ + const int iBlockSize = 256; + int iElementCount = _iProjectionCount * _iFreqBinCount; + int iBlockCount = (iElementCount + iBlockSize - 1) / iBlockSize; + + applyFilter_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount, + _iFreqBinCount, + _pSinogram, _pFilter); + CHECK_ERROR("applyFilter_kernel failed"); +} + +static bool invokeCudaFFT(int _iProjectionCount, int _iDetectorCount, + const float * _pfDevSource, + cufftComplex * _pDevTargetComplex) +{ + cufftHandle plan; + cufftResult result; + + result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_R2C, _iProjectionCount); + if(result != CUFFT_SUCCESS) + { + std::cerr << "Failed to plan 1d r2c fft" << std::endl; + return false; + } + + result = cufftExecR2C(plan, (cufftReal *)_pfDevSource, _pDevTargetComplex); + cufftDestroy(plan); + + if(result != CUFFT_SUCCESS) + { + std::cerr << "Failed to exec 1d r2c fft" << std::endl; + return false; + } + + return true; +} + +static bool invokeCudaIFFT(int _iProjectionCount, int _iDetectorCount, + const cufftComplex * _pDevSourceComplex, + float * _pfDevTarget) +{ + cufftHandle plan; + cufftResult result; + + result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_C2R, _iProjectionCount); + if(result != CUFFT_SUCCESS) + { + std::cerr << "Failed to plan 1d c2r fft" << std::endl; + return false; + } + + // todo: why do we have to get rid of the const qualifier? + result = cufftExecC2R(plan, (cufftComplex *)_pDevSourceComplex, + (cufftReal *)_pfDevTarget); + cufftDestroy(plan); + + if(result != CUFFT_SUCCESS) + { + std::cerr << "Failed to exec 1d c2r fft" << std::endl; + return false; + } + + return true; +} + +bool allocateComplexOnDevice(int _iProjectionCount, int _iDetectorCount, + cufftComplex ** _ppDevComplex) +{ + size_t bufferSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount; + SAFE_CALL(cudaMalloc((void **)_ppDevComplex, bufferSize)); + return true; +} + +bool freeComplexOnDevice(cufftComplex * _pDevComplex) +{ + SAFE_CALL(cudaFree(_pDevComplex)); + return true; +} + +bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount, + cufftComplex * _pHostComplexSource, + cufftComplex * _pDevComplexTarget) +{ + size_t memSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount; + SAFE_CALL(cudaMemcpy(_pDevComplexTarget, _pHostComplexSource, memSize, cudaMemcpyHostToDevice)); + + return true; +} + +bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource, + int _iSourcePitch, int _iSourcePadX, int _iProjDets, + int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount, + cufftComplex * _pDevTargetComplex) +{ + float * pfDevRealFFTSource = NULL; + size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount; + + SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTSource, bufferMemSize)); + SAFE_CALL(cudaMemset(pfDevRealFFTSource, 0, bufferMemSize)); + + for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) + { + const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch + _iSourcePadX; + float * pfTargetLocation = pfDevRealFFTSource + iProjectionIndex * _iFFTRealDetectorCount; + + SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice)); + } + + bool bResult = invokeCudaFFT(_iProjectionCount, _iFFTRealDetectorCount, + pfDevRealFFTSource, _pDevTargetComplex); + if(!bResult) + { + return false; + } + + SAFE_CALL(cudaFree(pfDevRealFFTSource)); + + return true; +} + +bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex, + float * _pfRealTarget, + int _iTargetPitch, int _iTargetPadX, int _iProjDets, + int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount) +{ + float * pfDevRealFFTTarget = NULL; + size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount; + + SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTTarget, bufferMemSize)); + + bool bResult = invokeCudaIFFT(_iProjectionCount, _iFFTRealDetectorCount, + _pDevSourceComplex, pfDevRealFFTTarget); + if(!bResult) + { + return false; + } + + rescaleInverseFourier(_iProjectionCount, _iFFTRealDetectorCount, + pfDevRealFFTTarget); + + SAFE_CALL(cudaMemset(_pfRealTarget, 0, sizeof(float) * _iProjectionCount * _iTargetPitch)); + + for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) + { + const float * pfSourceLocation = pfDevRealFFTTarget + iProjectionIndex * _iFFTRealDetectorCount; + float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch + _iTargetPadX; + + SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice)); + } + + SAFE_CALL(cudaFree(pfDevRealFFTTarget)); + + return true; +} + + +// Because the input is real, the Fourier transform is symmetric. +// CUFFT only outputs the first half (ignoring the redundant second half), +// and expects the same as input for the IFFT. +int calcFFTFourSize(int _iFFTRealSize) +{ + int iFFTFourSize = _iFFTRealSize / 2 + 1; + + return iFFTFourSize; +} + +void genIdenFilter(int _iProjectionCount, cufftComplex * _pFilter, + int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount) +{ + for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) + { + for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + int iIndex = iDetectorIndex + iProjectionIndex * _iFFTFourierDetectorCount; + _pFilter[iIndex].x = 1.0f; + _pFilter[iIndex].y = 0.0f; + } + } +} + +void genFilter(E_FBPFILTER _eFilter, float _fD, int _iProjectionCount, + cufftComplex * _pFilter, int _iFFTRealDetectorCount, + int _iFFTFourierDetectorCount, float _fParameter /* = -1.0f */) +{ + float * pfFilt = new float[_iFFTFourierDetectorCount]; + float * pfW = new float[_iFFTFourierDetectorCount]; + + for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fRelIndex = (float)iDetectorIndex / (float)_iFFTRealDetectorCount; + + // filt = 2*( 0:(order/2) )./order; + pfFilt[iDetectorIndex] = 2.0f * fRelIndex; + //pfFilt[iDetectorIndex] = 1.0f; + + // w = 2*pi*(0:size(filt,2)-1)/order + pfW[iDetectorIndex] = 3.1415f * 2.0f * fRelIndex; + } + + switch(_eFilter) + { + case FILTER_RAMLAK: + { + // do nothing + break; + } + case FILTER_SHEPPLOGAN: + { + // filt(2:end) = filt(2:end) .* (sin(w(2:end)/(2*d))./(w(2:end)/(2*d))) + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * (sinf(pfW[iDetectorIndex] / 2.0f / _fD) / (pfW[iDetectorIndex] / 2.0f / _fD)); + } + break; + } + case FILTER_COSINE: + { + // filt(2:end) = filt(2:end) .* cos(w(2:end)/(2*d)) + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * cosf(pfW[iDetectorIndex] / 2.0f / _fD); + } + break; + } + case FILTER_HAMMING: + { + // filt(2:end) = filt(2:end) .* (.54 + .46 * cos(w(2:end)/d)) + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * ( 0.54f + 0.46f * cosf(pfW[iDetectorIndex] / _fD)); + } + break; + } + case FILTER_HANN: + { + // filt(2:end) = filt(2:end) .*(1+cos(w(2:end)./d)) / 2 + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * (1.0f + cosf(pfW[iDetectorIndex] / _fD)) / 2.0f; + } + break; + } + case FILTER_TUKEY: + { + float fAlpha = _fParameter; + if(_fParameter < 0.0f) fAlpha = 0.5f; + float fN = (float)_iFFTFourierDetectorCount; + float fHalfN = fN / 2.0f; + float fEnumTerm = fAlpha * fHalfN; + float fDenom = (1.0f - fAlpha) * fHalfN; + float fBlockStart = fHalfN - fEnumTerm; + float fBlockEnd = fHalfN + fEnumTerm; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fAbsSmallN = fabs((float)iDetectorIndex); + float fStoredValue = 0.0f; + + if((fBlockStart <= fAbsSmallN) && (fAbsSmallN <= fBlockEnd)) + { + fStoredValue = 1.0f; + } + else + { + float fEnum = fAbsSmallN - fEnumTerm; + float fCosInput = M_PI * fEnum / fDenom; + fStoredValue = 0.5f * (1.0f + cosf(fCosInput)); + } + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_LANCZOS: + { + float fDenum = (float)(_iFFTFourierDetectorCount - 1); + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fX = 2.0f * fSmallN / fDenum - 1.0f; + float fSinInput = M_PI * fX; + float fStoredValue = 0.0f; + + if(fabsf(fSinInput) > 0.001f) + { + fStoredValue = sin(fSinInput)/fSinInput; + } + else + { + fStoredValue = 1.0f; + } + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_TRIANGULAR: + { + float fNMinusOne = (float)(_iFFTFourierDetectorCount - 1); + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fAbsInput = fSmallN - fNMinusOne / 2.0f; + float fParenInput = fNMinusOne / 2.0f - fabsf(fAbsInput); + float fStoredValue = 2.0f / fNMinusOne * fParenInput; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_GAUSSIAN: + { + float fSigma = _fParameter; + if(_fParameter < 0.0f) fSigma = 0.4f; + float fN = (float)_iFFTFourierDetectorCount; + float fQuotient = (fN - 1.0f) / 2.0f; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fEnum = fSmallN - fQuotient; + float fDenom = fSigma * fQuotient; + float fPower = -0.5f * (fEnum / fDenom) * (fEnum / fDenom); + float fStoredValue = expf(fPower); + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_BARTLETTHANN: + { + const float fA0 = 0.62f; + const float fA1 = 0.48f; + const float fA2 = 0.38f; + float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fAbsInput = fSmallN / fNMinusOne - 0.5f; + float fFirstTerm = fA1 * fabsf(fAbsInput); + float fCosInput = 2.0f * M_PI * fSmallN / fNMinusOne; + float fSecondTerm = fA2 * cosf(fCosInput); + float fStoredValue = fA0 - fFirstTerm - fSecondTerm; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_BLACKMAN: + { + float fAlpha = _fParameter; + if(_fParameter < 0.0f) fAlpha = 0.16f; + float fA0 = (1.0f - fAlpha) / 2.0f; + float fA1 = 0.5f; + float fA2 = fAlpha / 2.0f; + float fNMinusOne = (float)(_iFFTFourierDetectorCount - 1); + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fCosInput1 = 2.0f * M_PI * 0.5f * fSmallN / fNMinusOne; + float fCosInput2 = 4.0f * M_PI * 0.5f * fSmallN / fNMinusOne; + float fStoredValue = fA0 - fA1 * cosf(fCosInput1) + fA2 * cosf(fCosInput2); + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_NUTTALL: + { + const float fA0 = 0.355768f; + const float fA1 = 0.487396f; + const float fA2 = 0.144232f; + const float fA3 = 0.012604f; + float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fBaseCosInput = M_PI * fSmallN / fNMinusOne; + float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput); + float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput); + float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput); + float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_BLACKMANHARRIS: + { + const float fA0 = 0.35875f; + const float fA1 = 0.48829f; + const float fA2 = 0.14128f; + const float fA3 = 0.01168f; + float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fBaseCosInput = M_PI * fSmallN / fNMinusOne; + float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput); + float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput); + float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput); + float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_BLACKMANNUTTALL: + { + const float fA0 = 0.3635819f; + const float fA1 = 0.4891775f; + const float fA2 = 0.1365995f; + const float fA3 = 0.0106411f; + float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fBaseCosInput = M_PI * fSmallN / fNMinusOne; + float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput); + float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput); + float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput); + float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_FLATTOP: + { + const float fA0 = 1.0f; + const float fA1 = 1.93f; + const float fA2 = 1.29f; + const float fA3 = 0.388f; + const float fA4 = 0.032f; + float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fBaseCosInput = M_PI * fSmallN / fNMinusOne; + float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput); + float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput); + float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput); + float fFourthTerm = fA4 * cosf(8.0f * fBaseCosInput); + float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm + fFourthTerm; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_KAISER: + { + float fAlpha = _fParameter; + if(_fParameter < 0.0f) fAlpha = 3.0f; + float fPiTimesAlpha = M_PI * fAlpha; + float fNMinusOne = (float)(_iFFTFourierDetectorCount - 1); + float fDenom = (float)j0((double)fPiTimesAlpha); + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fSquareInput = 2.0f * fSmallN / fNMinusOne - 1; + float fSqrtInput = 1.0f - fSquareInput * fSquareInput; + float fBesselInput = fPiTimesAlpha * sqrt(fSqrtInput); + float fEnum = (float)j0((double)fBesselInput); + float fStoredValue = fEnum / fDenom; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_PARZEN: + { + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fQ = fSmallN / (float)(_iFFTFourierDetectorCount - 1); + float fStoredValue = 0.0f; + + if(fQ <= 0.5f) + { + fStoredValue = 1.0f - 6.0f * fQ * fQ * (1.0f - fQ); + } + else + { + float fCubedValue = 1.0f - fQ; + fStoredValue = 2.0f * fCubedValue * fCubedValue * fCubedValue; + } + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + default: + { + std::cerr << "Cannot serve requested filter" << std::endl; + } + } + + // filt(w>pi*d) = 0; + float fPiTimesD = M_PI * _fD; + for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fWValue = pfW[iDetectorIndex]; + + if(fWValue > fPiTimesD) + { + pfFilt[iDetectorIndex] = 0.0f; + } + } + + for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fFilterValue = pfFilt[iDetectorIndex]; + + for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) + { + int iIndex = iDetectorIndex + iProjectionIndex * _iFFTFourierDetectorCount; + _pFilter[iIndex].x = fFilterValue; + _pFilter[iIndex].y = 0.0f; + } + } + + delete[] pfFilt; + delete[] pfW; +} + +#ifdef STANDALONE + +__global__ static void doubleFourierOutput_kernel(int _iProjectionCount, + int _iDetectorCount, + cufftComplex* _pFourierOutput) +{ + int iIndex = threadIdx.x + blockIdx.x * blockDim.x; + int iProjectionIndex = iIndex / _iDetectorCount; + int iDetectorIndex = iIndex % _iDetectorCount; + + if(iProjectionIndex >= _iProjectionCount) + { + return; + } + + if(iDetectorIndex <= (_iDetectorCount / 2)) + { + return; + } + + int iOtherDetectorIndex = _iDetectorCount - iDetectorIndex; + + _pFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex].x = _pFourierOutput[iProjectionIndex * _iDetectorCount + iOtherDetectorIndex].x; + _pFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex].y = -_pFourierOutput[iProjectionIndex * _iDetectorCount + iOtherDetectorIndex].y; +} + +static void doubleFourierOutput(int _iProjectionCount, int _iDetectorCount, + cufftComplex * _pFourierOutput) +{ + const int iBlockSize = 256; + int iElementCount = _iProjectionCount * _iDetectorCount; + int iBlockCount = (iElementCount + iBlockSize - 1) / iBlockSize; + + doubleFourierOutput_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount, + _iDetectorCount, + _pFourierOutput); + CHECK_ERROR("doubleFourierOutput_kernel failed"); +} + + + +static void writeToMatlabFile(const char * _fileName, const float * _pfData, + int _iRowCount, int _iColumnCount) +{ + std::ofstream out(_fileName); + + for(int iRowIndex = 0; iRowIndex < _iRowCount; iRowIndex++) + { + for(int iColumnIndex = 0; iColumnIndex < _iColumnCount; iColumnIndex++) + { + out << _pfData[iColumnIndex + iRowIndex * _iColumnCount] << " "; + } + + out << std::endl; + } +} + +static void convertComplexToRealImg(const cufftComplex * _pComplex, + int _iElementCount, + float * _pfReal, float * _pfImaginary) +{ + for(int iIndex = 0; iIndex < _iElementCount; iIndex++) + { + _pfReal[iIndex] = _pComplex[iIndex].x; + _pfImaginary[iIndex] = _pComplex[iIndex].y; + } +} + +void testCudaFFT() +{ + const int iProjectionCount = 2; + const int iDetectorCount = 1024; + const int iTotalElementCount = iProjectionCount * iDetectorCount; + + float * pfHostProj = new float[iTotalElementCount]; + memset(pfHostProj, 0, sizeof(float) * iTotalElementCount); + + for(int iProjectionIndex = 0; iProjectionIndex < iProjectionCount; iProjectionIndex++) + { + for(int iDetectorIndex = 0; iDetectorIndex < iDetectorCount; iDetectorIndex++) + { +// int + +// pfHostProj[iIndex] = (float)rand() / (float)RAND_MAX; + } + } + + writeToMatlabFile("proj.mat", pfHostProj, iProjectionCount, iDetectorCount); + + float * pfDevProj = NULL; + SAFE_CALL(cudaMalloc((void **)&pfDevProj, sizeof(float) * iTotalElementCount)); + SAFE_CALL(cudaMemcpy(pfDevProj, pfHostProj, sizeof(float) * iTotalElementCount, cudaMemcpyHostToDevice)); + + cufftComplex * pDevFourProj = NULL; + SAFE_CALL(cudaMalloc((void **)&pDevFourProj, sizeof(cufftComplex) * iTotalElementCount)); + + cufftHandle plan; + cufftResult result; + + result = cufftPlan1d(&plan, iDetectorCount, CUFFT_R2C, iProjectionCount); + if(result != CUFFT_SUCCESS) + { + cerr << "Failed to plan 1d r2c fft" << endl; + } + + result = cufftExecR2C(plan, pfDevProj, pDevFourProj); + if(result != CUFFT_SUCCESS) + { + cerr << "Failed to exec 1d r2c fft" << endl; + } + + cufftDestroy(plan); + + doubleFourierOutput(iProjectionCount, iDetectorCount, pDevFourProj); + + cufftComplex * pHostFourProj = new cufftComplex[iTotalElementCount]; + SAFE_CALL(cudaMemcpy(pHostFourProj, pDevFourProj, sizeof(cufftComplex) * iTotalElementCount, cudaMemcpyDeviceToHost)); + + float * pfHostFourProjReal = new float[iTotalElementCount]; + float * pfHostFourProjImaginary = new float[iTotalElementCount]; + + convertComplexToRealImg(pHostFourProj, iTotalElementCount, pfHostFourProjReal, pfHostFourProjImaginary); + + writeToMatlabFile("proj_four_real.mat", pfHostFourProjReal, iProjectionCount, iDetectorCount); + writeToMatlabFile("proj_four_imaginary.mat", pfHostFourProjImaginary, iProjectionCount, iDetectorCount); + + float * pfDevInFourProj = NULL; + SAFE_CALL(cudaMalloc((void **)&pfDevInFourProj, sizeof(float) * iTotalElementCount)); + + result = cufftPlan1d(&plan, iDetectorCount, CUFFT_C2R, iProjectionCount); + if(result != CUFFT_SUCCESS) + { + cerr << "Failed to plan 1d c2r fft" << endl; + } + + result = cufftExecC2R(plan, pDevFourProj, pfDevInFourProj); + if(result != CUFFT_SUCCESS) + { + cerr << "Failed to exec 1d c2r fft" << endl; + } + + cufftDestroy(plan); + + rescaleInverseFourier(iProjectionCount, iDetectorCount, pfDevInFourProj); + + float * pfHostInFourProj = new float[iTotalElementCount]; + SAFE_CALL(cudaMemcpy(pfHostInFourProj, pfDevInFourProj, sizeof(float) * iTotalElementCount, cudaMemcpyDeviceToHost)); + + writeToMatlabFile("in_four.mat", pfHostInFourProj, iProjectionCount, iDetectorCount); + + SAFE_CALL(cudaFree(pDevFourProj)); + SAFE_CALL(cudaFree(pfDevProj)); + + delete [] pfHostInFourProj; + delete [] pfHostFourProjReal; + delete [] pfHostFourProjImaginary; + delete [] pfHostProj; + delete [] pHostFourProj; +} + +void downloadDebugFilterComplex(float * _pfHostSinogram, int _iProjectionCount, + int _iDetectorCount, + cufftComplex * _pDevFilter, + int _iFilterDetCount) +{ + cufftComplex * pHostFilter = NULL; + size_t complMemSize = sizeof(cufftComplex) * _iFilterDetCount * _iProjectionCount; + pHostFilter = (cufftComplex *)malloc(complMemSize); + SAFE_CALL(cudaMemcpy(pHostFilter, _pDevFilter, complMemSize, cudaMemcpyDeviceToHost)); + + for(int iTargetProjIndex = 0; iTargetProjIndex < _iProjectionCount; iTargetProjIndex++) + { + for(int iTargetDetIndex = 0; iTargetDetIndex < min(_iDetectorCount, _iFilterDetCount); iTargetDetIndex++) + { + cufftComplex source = pHostFilter[iTargetDetIndex + iTargetProjIndex * _iFilterDetCount]; + float fReadValue = sqrtf(source.x * source.x + source.y * source.y); + _pfHostSinogram[iTargetDetIndex + iTargetProjIndex * _iDetectorCount] = fReadValue; + } + } + + free(pHostFilter); +} + +void downloadDebugFilterReal(float * _pfHostSinogram, int _iProjectionCount, + int _iDetectorCount, float * _pfDevFilter, + int _iFilterDetCount) +{ + float * pfHostFilter = NULL; + size_t memSize = sizeof(float) * _iFilterDetCount * _iProjectionCount; + pfHostFilter = (float *)malloc(memSize); + SAFE_CALL(cudaMemcpy(pfHostFilter, _pfDevFilter, memSize, cudaMemcpyDeviceToHost)); + + for(int iTargetProjIndex = 0; iTargetProjIndex < _iProjectionCount; iTargetProjIndex++) + { + for(int iTargetDetIndex = 0; iTargetDetIndex < min(_iDetectorCount, _iFilterDetCount); iTargetDetIndex++) + { + float fSource = pfHostFilter[iTargetDetIndex + iTargetProjIndex * _iFilterDetCount]; + _pfHostSinogram[iTargetDetIndex + iTargetProjIndex * _iDetectorCount] = fSource; + } + } + + free(pfHostFilter); +} + + +#endif diff --git a/cuda/2d/fft.h b/cuda/2d/fft.h new file mode 100644 index 0000000..55324e5 --- /dev/null +++ b/cuda/2d/fft.h @@ -0,0 +1,69 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef FFT_H +#define FFT_H + +#include <cufft.h> +#include <cuda.h> + +#include "fbp_filters.h" + +bool allocateComplexOnDevice(int _iProjectionCount, + int _iDetectorCount, + cufftComplex ** _ppDevComplex); + +bool freeComplexOnDevice(cufftComplex * _pDevComplex); + +bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount, + cufftComplex * _pHostComplexSource, + cufftComplex * _pDevComplexTarget); + +bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource, + int _iSourcePitch, int _iSourcePadX, int _iProjDets, + int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount, + cufftComplex * _pDevTargetComplex); + +bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex, + float * _pfRealTarget, + int _iTargetPitch, int _iTargetPadX, int _iProjDets, + int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount); + +void applyFilter(int _iProjectionCount, int _iFreqBinCount, + cufftComplex * _pSinogram, cufftComplex * _pFilter); + +int calcFFTFourSize(int _iFFTRealSize); + +void genFilter(E_FBPFILTER _eFilter, float _fD, int _iProjectionCount, + cufftComplex * _pFilter, int _iFFTRealDetectorCount, + int _iFFTFourierDetectorCount, float _fParameter = -1.0f); + +void genIdenFilter(int _iProjectionCount, cufftComplex * _pFilter, + int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount); + +#endif /* FFT_H */ diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu new file mode 100644 index 0000000..1057879 --- /dev/null +++ b/cuda/2d/par_bp.cu @@ -0,0 +1,357 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> + +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +#define PIXELTRACE + + +typedef texture<float, 2, cudaReadModeElementType> texture2D; + +static texture2D gT_projTexture; + + +namespace astraCUDA { + +const unsigned int g_anglesPerBlock = 16; +const unsigned int g_blockSliceSize = 32; +const unsigned int g_blockSlices = 16; + +const unsigned int g_MaxAngles = 2560; + +__constant__ float gC_angle_sin[g_MaxAngles]; +__constant__ float gC_angle_cos[g_MaxAngles]; +__constant__ float gC_angle_offset[g_MaxAngles]; + +static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_projTexture.addressMode[0] = cudaAddressModeClamp; + gT_projTexture.addressMode[1] = cudaAddressModeClamp; + gT_projTexture.filterMode = cudaFilterModeLinear; + gT_projTexture.normalized = false; + + cudaBindTexture2D(0, gT_projTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch); + + // TODO: error value? + + return true; +} + +__global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims) +{ + const int relX = threadIdx.x; + const int relY = threadIdx.y; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + const int X = blockIdx.x * g_blockSlices + relX; + const int Y = blockIdx.y * g_blockSliceSize + relY; + + if (X >= dims.iVolWidth || Y >= dims.iVolHeight) + return; + + const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ) / dims.fDetScale; + const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ) / dims.fDetScale; + + float* volData = (float*)D_volData; + + float fVal = 0.0f; + float fA = startAngle + 0.5f; + const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f; + + if (offsets) { + + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float cos_theta = gC_angle_cos[angle]; + const float sin_theta = gC_angle_sin[angle]; + const float TOffset = gC_angle_offset[angle]; + + const float fT = fT_base + fX * cos_theta - fY * sin_theta + TOffset; + fVal += tex2D(gT_projTexture, fT, fA); + fA += 1.0f; + } + + } else { + + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float cos_theta = gC_angle_cos[angle]; + const float sin_theta = gC_angle_sin[angle]; + + const float fT = fT_base + fX * cos_theta - fY * sin_theta; + fVal += tex2D(gT_projTexture, fT, fA); + fA += 1.0f; + } + + } + + volData[(Y+1)*volPitch+X+1] += fVal; +} + +// supersampling version +__global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims) +{ + const int relX = threadIdx.x; + const int relY = threadIdx.y; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + const int X = blockIdx.x * g_blockSlices + relX; + const int Y = blockIdx.y * g_blockSliceSize + relY; + + if (X >= dims.iVolWidth || Y >= dims.iVolHeight) + return; + + const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim) / dims.fDetScale; + const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim) / dims.fDetScale; + + const float fSubStep = 1.0f/(dims.iRaysPerPixelDim * dims.fDetScale); + + float* volData = (float*)D_volData; + + float fVal = 0.0f; + float fA = startAngle + 0.5f; + const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f; + + if (offsets) { + + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float cos_theta = gC_angle_cos[angle]; + const float sin_theta = gC_angle_sin[angle]; + const float TOffset = gC_angle_offset[angle]; + + float fT = fT_base + fX * cos_theta - fY * sin_theta + TOffset; + + for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) { + float fTy = fT; + fT += fSubStep * cos_theta; + for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) { + fVal += tex2D(gT_projTexture, fTy, fA); + fTy -= fSubStep * sin_theta; + } + } + fA += 1.0f; + } + + } else { + + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float cos_theta = gC_angle_cos[angle]; + const float sin_theta = gC_angle_sin[angle]; + + float fT = fT_base + fX * cos_theta - fY * sin_theta; + + for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) { + float fTy = fT; + fT += fSubStep * cos_theta; + for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) { + fVal += tex2D(gT_projTexture, fTy, fA); + fTy -= fSubStep * sin_theta; + } + } + fA += 1.0f; + + } + + } + + volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim); +} + +__global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset, float angle_sin, float angle_cos, const SDimensions dims) +{ + const int relX = threadIdx.x; + const int relY = threadIdx.y; + + const int X = blockIdx.x * g_blockSlices + relX; + const int Y = blockIdx.y * g_blockSliceSize + relY; + + if (X >= dims.iVolWidth || Y >= dims.iVolHeight) + return; + + const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ) / dims.fDetScale; + const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ) / dims.fDetScale; + + const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f; + + const float fT = fT_base + fX * angle_cos - fY * angle_sin + offset; + const float fVal = tex2D(gT_projTexture, fT, 0.5f); + + D_volData[(Y+1)*volPitch+X+1] += fVal; +} + + +bool BP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, const float* TOffsets) +{ + // TODO: process angles block by block + assert(dims.iProjAngles <= g_MaxAngles); + + float* angle_sin = new float[dims.iProjAngles]; + float* angle_cos = new float[dims.iProjAngles]; + + bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) { + angle_sin[i] = sinf(angles[i]); + angle_cos[i] = cosf(angles[i]); + } + cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_sin, angle_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_cos, angle_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + assert(e1 == cudaSuccess); + assert(e2 == cudaSuccess); + + if (TOffsets) { + cudaError_t e3 = cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + assert(e3 == cudaSuccess); + } + + delete[] angle_sin; + delete[] angle_cos; + + dim3 dimBlock(g_blockSlices, g_blockSliceSize); + dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices, + (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) { + + if (dims.iRaysPerPixelDim > 1) + devBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims); + else + devBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims); + } + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + cudaStreamDestroy(stream); + + return true; +} + +bool BP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle, const SDimensions& dims, + const float* angles, const float* TOffsets) +{ + // only one angle + bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1); + + float angle_sin = sinf(angles[angle]); + float angle_cos = cosf(angles[angle]); + + float offset = 0.0f; + if (TOffsets) + offset = TOffsets[angle]; + + dim3 dimBlock(g_blockSlices, g_blockSliceSize); + dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices, + (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize); + + devBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, offset, angle_sin, angle_cos, dims); + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + return true; +} + + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_projData; + + SDimensions dims; + dims.iVolWidth = 1024; + dims.iVolHeight = 1024; + dims.iProjAngles = 512; + dims.iProjDets = 1536; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + + unsigned int volumePitch, projPitch; + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + printf("pitch: %u\n", projPitch); + + unsigned int y, x; + float* sino = loadImage("sino.png", y, x); + + float* img = new float[dims.iVolWidth*dims.iVolHeight]; + + memset(img, 0, dims.iVolWidth*dims.iVolHeight*sizeof(float)); + + copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + float* angle = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) + angle[i] = i*(M_PI/dims.iProjAngles); + + BP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0); + + delete[] angle; + + copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + + saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img); + + return 0; +} +#endif diff --git a/cuda/2d/par_bp.h b/cuda/2d/par_bp.h new file mode 100644 index 0000000..c6dbd59 --- /dev/null +++ b/cuda/2d/par_bp.h @@ -0,0 +1,48 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_PAR_BP_H +#define _CUDA_PAR_BP_H + +#include "dims.h" + +namespace astraCUDA { + +_AstraExport bool BP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets); + +_AstraExport bool BP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle, const SDimensions& dims, + const float* angles, const float* TOffsets); + +} + +#endif diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu new file mode 100644 index 0000000..585cb06 --- /dev/null +++ b/cuda/2d/par_fp.cu @@ -0,0 +1,704 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +#define PIXELTRACE + + +typedef texture<float, 2, cudaReadModeElementType> texture2D; + +static texture2D gT_volumeTexture; + + +namespace astraCUDA { + +static const unsigned g_MaxAngles = 2560; +__constant__ float gC_angle[g_MaxAngles]; +__constant__ float gC_angle_offset[g_MaxAngles]; + + +// optimization parameters +static const unsigned int g_anglesPerBlock = 16; +static const unsigned int g_detBlockSize = 32; +static const unsigned int g_blockSlices = 64; + +// fixed point scaling factor +#define fPREC_FACTOR 16.0f +#define iPREC_FACTOR 16 + + +// if necessary, a buffer of zeroes of size g_MaxAngles +static float* g_pfZeroes = 0; + + +static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + dataArray = 0; + cudaMallocArray(&dataArray, &channelDesc, width, height); + cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice); + + gT_volumeTexture.addressMode[0] = cudaAddressModeClamp; + gT_volumeTexture.addressMode[1] = cudaAddressModeClamp; + gT_volumeTexture.filterMode = cudaFilterModeLinear; + gT_volumeTexture.normalized = false; + + // TODO: For very small sizes (roughly <=512x128) with few angles (<=180) + // not using an array is more efficient. +// cudaBindTexture2D(0, gT_volumeTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch); + cudaBindTextureToArray(gT_volumeTexture, dataArray, channelDesc); + + // TODO: error value? + + return true; +} + +// projection for angles that are roughly horizontal +// theta between 45 and 135 degrees +__global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale) +{ + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + + if (angle >= endAngle) + return; + + const float theta = gC_angle[angle]; + const float cos_theta = __cosf(theta); + const float sin_theta = __sinf(theta); + + // compute start detector for this block/angle: + // (The same for all threadIdx.x) + // ------------------------------------- + + const int midSlice = startSlice + g_blockSlices / 2; + + // ASSUMPTION: fDetScale >= 1.0f + // problem: detector regions get skipped because slice blocks aren't large + // enough + const unsigned int g_blockSliceSize = g_detBlockSize; + + // project (midSlice,midRegion) on this thread's detector + + const float fBase = 0.5f*dims.iProjDets - 0.5f + + ( + (midSlice - 0.5f*dims.iVolWidth + 0.5f) * cos_theta + - (g_blockSliceSize/2 - 0.5f*dims.iVolHeight + 0.5f) * sin_theta + + gC_angle_offset[angle] + ) / dims.fDetScale; + int iBase = (int)floorf(fBase * fPREC_FACTOR); + int iInc = (int)floorf(g_blockSliceSize * sin_theta / dims.fDetScale * -fPREC_FACTOR); + + // ASSUMPTION: 16 > regionOffset / fDetScale + const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; + const int detPrevRegion = (iBase + (blockIdx.y - regionOffset - 1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; + + if (blockIdx.y > 0 && detRegion == detPrevRegion) + return; + + const int detector = detRegion * g_detBlockSize + relDet; + + // Now project the part of the ray to angle,detector through + // slices startSlice to startSlice+g_blockSlices-1 + + if (detector < 0 || detector >= dims.iProjDets) + return; + + const float fDetStep = -dims.fDetScale / sin_theta; + float fSliceStep = cos_theta / sin_theta; + float fDistCorr; + if (sin_theta > 0.0f) + fDistCorr = -fDetStep; + else + fDistCorr = fDetStep; + fDistCorr *= outputScale; + + float fVal = 0.0f; + // project detector on slice + float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f; + float fS = startSlice + 1.5f; + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolWidth) + endSlice = dims.iVolWidth; + + if (dims.iRaysPerDet > 1) { + + fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; + const float fSubDetStep = fDetStep / dims.iRaysPerDet; + fDistCorr /= dims.iRaysPerDet; + + fSliceStep -= dims.iRaysPerDet * fSubDetStep; + + for (int slice = startSlice; slice < endSlice; ++slice) + { + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + fVal += tex2D(gT_volumeTexture, fS, fP); + fP += fSubDetStep; + } + fP += fSliceStep; + fS += 1.0f; + } + + } else { + + for (int slice = startSlice; slice < endSlice; ++slice) + { + fVal += tex2D(gT_volumeTexture, fS, fP); + fP += fSliceStep; + fS += 1.0f; + } + + + } + + D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; +} + +// projection for angles that are roughly vertical +// theta between 0 and 45, or 135 and 180 degrees +__global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale) +{ + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + + if (angle >= endAngle) + return; + + const float theta = gC_angle[angle]; + const float cos_theta = __cosf(theta); + const float sin_theta = __sinf(theta); + + // compute start detector for this block/angle: + // (The same for all threadIdx.x) + // ------------------------------------- + + const int midSlice = startSlice + g_blockSlices / 2; + + // project (midSlice,midRegion) on this thread's detector + + // ASSUMPTION: fDetScale >= 1.0f + // problem: detector regions get skipped because slice blocks aren't large + // enough + const unsigned int g_blockSliceSize = g_detBlockSize; + + const float fBase = 0.5f*dims.iProjDets - 0.5f + + ( + (g_blockSliceSize/2 - 0.5f*dims.iVolWidth + 0.5f) * cos_theta + - (midSlice - 0.5f*dims.iVolHeight + 0.5f) * sin_theta + + gC_angle_offset[angle] + ) / dims.fDetScale; + int iBase = (int)floorf(fBase * fPREC_FACTOR); + int iInc = (int)floorf(g_blockSliceSize * cos_theta / dims.fDetScale * fPREC_FACTOR); + + // ASSUMPTION: 16 > regionOffset / fDetScale + const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; + const int detPrevRegion = (iBase + (blockIdx.y - regionOffset-1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; + + if (blockIdx.y > 0 && detRegion == detPrevRegion) + return; + + const int detector = detRegion * g_detBlockSize + relDet; + + // Now project the part of the ray to angle,detector through + // slices startSlice to startSlice+g_blockSlices-1 + + if (detector < 0 || detector >= dims.iProjDets) + return; + + const float fDetStep = dims.fDetScale / cos_theta; + float fSliceStep = sin_theta / cos_theta; + float fDistCorr; + if (cos_theta < 0.0f) + fDistCorr = -fDetStep; + else + fDistCorr = fDetStep; + fDistCorr *= outputScale; + + float fVal = 0.0f; + float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f; + float fS = startSlice+1.5f; + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolHeight) + endSlice = dims.iVolHeight; + + if (dims.iRaysPerDet > 1) { + + fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; + const float fSubDetStep = fDetStep / dims.iRaysPerDet; + fDistCorr /= dims.iRaysPerDet; + + fSliceStep -= dims.iRaysPerDet * fSubDetStep; + + for (int slice = startSlice; slice < endSlice; ++slice) + { + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + fVal += tex2D(gT_volumeTexture, fP, fS); + fP += fSubDetStep; + } + fP += fSliceStep; + fS += 1.0f; + } + + } else { + + for (int slice = startSlice; slice < endSlice; ++slice) + { + fVal += tex2D(gT_volumeTexture, fP, fS); + fP += fSliceStep; + fS += 1.0f; + } + + } + + D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; +} + +// projection for angles that are roughly horizontal +// (detector roughly vertical) +__global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) +{ + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + + if (angle >= endAngle) + return; + + const float theta = gC_angle[angle]; + const float cos_theta = __cosf(theta); + const float sin_theta = __sinf(theta); + + // compute start detector for this block/angle: + const int detRegion = blockIdx.y; + + const int detector = detRegion * g_detBlockSize + relDet; + + // Now project the part of the ray to angle,detector through + // slices startSlice to startSlice+g_blockSlices-1 + + if (detector < 0 || detector >= dims.iProjDets) + return; + + const float fDetStep = -dims.fDetScale / sin_theta; + float fSliceStep = cos_theta / sin_theta; + float fDistCorr; + if (sin_theta > 0.0f) + fDistCorr = -fDetStep; + else + fDistCorr = fDetStep; + fDistCorr *= outputScale; + + float fVal = 0.0f; + // project detector on slice + float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f; + float fS = startSlice + 1.5f; + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolWidth) + endSlice = dims.iVolWidth; + + if (dims.iRaysPerDet > 1) { + + fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; + const float fSubDetStep = fDetStep / dims.iRaysPerDet; + fDistCorr /= dims.iRaysPerDet; + + fSliceStep -= dims.iRaysPerDet * fSubDetStep; + + for (int slice = startSlice; slice < endSlice; ++slice) + { + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + fVal += tex2D(gT_volumeTexture, fS, fP); + fP += fSubDetStep; + } + fP += fSliceStep; + fS += 1.0f; + } + + } else { + + for (int slice = startSlice; slice < endSlice; ++slice) + { + fVal += tex2D(gT_volumeTexture, fS, fP); + fP += fSliceStep; + fS += 1.0f; + } + + + } + + D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; +} + + +// projection for angles that are roughly vertical +// (detector roughly horizontal) +__global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) +{ + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + + if (angle >= endAngle) + return; + + const float theta = gC_angle[angle]; + const float cos_theta = __cosf(theta); + const float sin_theta = __sinf(theta); + + // compute start detector for this block/angle: + const int detRegion = blockIdx.y; + + const int detector = detRegion * g_detBlockSize + relDet; + + // Now project the part of the ray to angle,detector through + // slices startSlice to startSlice+g_blockSlices-1 + + if (detector < 0 || detector >= dims.iProjDets) + return; + + const float fDetStep = dims.fDetScale / cos_theta; + float fSliceStep = sin_theta / cos_theta; + float fDistCorr; + if (cos_theta < 0.0f) + fDistCorr = -fDetStep; + else + fDistCorr = fDetStep; + fDistCorr *= outputScale; + + float fVal = 0.0f; + float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f; + float fS = startSlice+1.5f; + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolHeight) + endSlice = dims.iVolHeight; + + if (dims.iRaysPerDet > 1) { + + fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; + const float fSubDetStep = fDetStep / dims.iRaysPerDet; + fDistCorr /= dims.iRaysPerDet; + + fSliceStep -= dims.iRaysPerDet * fSubDetStep; + + for (int slice = startSlice; slice < endSlice; ++slice) + { + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + fVal += tex2D(gT_volumeTexture, fP, fS); + fP += fSubDetStep; + } + fP += fSliceStep; + fS += 1.0f; + } + + } else { + + for (int slice = startSlice; slice < endSlice; ++slice) + { + fVal += tex2D(gT_volumeTexture, fP, fS); + fP += fSliceStep; + fS += 1.0f; + } + + } + + D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; +} + + + +bool FP_simple(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, float outputScale) +{ + // TODO: load angles into constant memory in smaller blocks + assert(dims.iProjAngles <= g_MaxAngles); + + cudaArray* D_dataArray; + bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + + cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + + if (TOffsets) { + cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + } else { + if (!g_pfZeroes) { + g_pfZeroes = new float[g_MaxAngles]; + memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float)); + } + cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + } + + dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // detector block size, angles + + std::list<cudaStream_t> streams; + + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + // TODO: Check if it's worth it to store this info instead + // of recomputing it every FP. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + bool blockVertical = false; + for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { + bool vertical; + // TODO: Having <= instead of < below causes a 5% speedup. + // Maybe we should detect corner cases and put them in the optimal + // group of angles. + if (a != dims.iProjAngles) + vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a]))); + if (a == dims.iProjAngles || vertical != blockVertical) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock, + (dims.iProjDets+g_detBlockSize-1)/g_detBlockSize); // angle blocks, detector blocks + + // TODO: check if we can't immediately + // destroy the stream after use + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + //printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical); + if (!blockVertical) + for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices) + FPhorizontal_simple<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); + else + for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices) + FPvertical_simple<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); + } + blockVertical = vertical; + blockStart = a; + } + } + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) + cudaStreamDestroy(*iter); + + streams.clear(); + + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + cudaFreeArray(D_dataArray); + + + return true; +} + +bool FP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, float outputScale) +{ + return FP_simple(D_volumeData, volumePitch, D_projData, projPitch, + dims, angles, TOffsets, outputScale); + + // TODO: Fix bug in this non-simple FP with large detscale and TOffsets +#if 0 + + // TODO: load angles into constant memory in smaller blocks + assert(dims.iProjAngles <= g_MaxAngles); + + // TODO: compute region size dynamically to resolve these two assumptions + // ASSUMPTION: 16 > regionOffset / fDetScale + const unsigned int g_blockSliceSize = g_detBlockSize; + assert(16 > (g_blockSlices / g_blockSliceSize) / dims.fDetScale); + // ASSUMPTION: fDetScale >= 1.0f + assert(dims.fDetScale > 0.9999f); + + cudaArray* D_dataArray; + bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + + cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + + if (TOffsets) { + cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + } else { + if (!g_pfZeroes) { + g_pfZeroes = new float[g_MaxAngles]; + memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float)); + } + cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + } + + int regionOffset = g_blockSlices / g_blockSliceSize; + + dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles + + std::list<cudaStream_t> streams; + + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + // TODO: Check if it's worth it to store this info instead + // of recomputing it every FP. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + bool blockVertical = false; + for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { + bool vertical; + // TODO: Having <= instead of < below causes a 5% speedup. + // Maybe we should detect corner cases and put them in the optimal + // group of angles. + if (a != dims.iProjAngles) + vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a]))); + if (a == dims.iProjAngles || vertical != blockVertical) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + unsigned int length = dims.iVolHeight; + if (blockVertical) + length = dims.iVolWidth; + dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock, + (length+g_blockSliceSize-1)/g_blockSliceSize+2*regionOffset); // angle blocks, regions + // TODO: check if we can't immediately + // destroy the stream after use + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + //printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical); + if (!blockVertical) + for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices) + FPhorizontal<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale); + else + for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices) + FPvertical<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale); + } + blockVertical = vertical; + blockStart = a; + } + } + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) + cudaStreamDestroy(*iter); + + streams.clear(); + + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + cudaFreeArray(D_dataArray); + + + return true; +#endif +} + + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_projData; + + SDimensions dims; + dims.iVolWidth = 1024; + dims.iVolHeight = 1024; + dims.iProjAngles = 512; + dims.iProjDets = 1536; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + unsigned int volumePitch, projPitch; + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + printf("pitch: %u\n", projPitch); + + unsigned int y, x; + float* img = loadImage("phantom.png", y, x); + + float* sino = new float[dims.iProjAngles * dims.iProjDets]; + + memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float)); + + copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + float* angle = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) + angle[i] = i*(M_PI/dims.iProjAngles); + + FP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0, 1.0f); + + delete[] angle; + + copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + float s = 0.0f; + for (unsigned int y = 0; y < dims.iProjAngles; ++y) + for (unsigned int x = 0; x < dims.iProjDets; ++x) + s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x]; + printf("cpu norm: %f\n", s); + + //zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + printf("gpu norm: %f\n", s); + + saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino); + + + return 0; +} +#endif diff --git a/cuda/2d/par_fp.h b/cuda/2d/par_fp.h new file mode 100644 index 0000000..3213b14 --- /dev/null +++ b/cuda/2d/par_fp.h @@ -0,0 +1,41 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_PAR_FP_H +#define _CUDA_PAR_FP_H + +namespace astraCUDA { + +_AstraExport bool FP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, float fOutputScale); + +} + +#endif diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu new file mode 100644 index 0000000..a40176d --- /dev/null +++ b/cuda/2d/sart.cu @@ -0,0 +1,283 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "sart.h" +#include "util.h" +#include "arith.h" +#include "fan_fp.h" +#include "fan_bp.h" +#include "par_fp.h" +#include "par_bp.h" + +namespace astraCUDA { + + +__global__ void devMUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + // Copy result down and left one pixel. + pfOut[x + pitch] = pfOut[x + 1] * pfIn[x + 1]; +} + +void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, 1); + + devMUL_SART<<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width); + + cudaTextForceKernelsCompletion(); +} + + + +SART::SART() : ReconAlgo() +{ + D_projData = 0; + D_tmpData = 0; + + D_lineWeight = 0; + + projectionOrder = 0; + projectionCount = 0; + iteration = 0; + customOrder = false; +} + + +SART::~SART() +{ + reset(); +} + +void SART::reset() +{ + cudaFree(D_projData); + cudaFree(D_tmpData); + cudaFree(D_lineWeight); + + D_projData = 0; + D_tmpData = 0; + + D_lineWeight = 0; + + useVolumeMask = false; + useSinogramMask = false; + + if (projectionOrder != NULL) delete[] projectionOrder; + projectionOrder = 0; + projectionCount = 0; + iteration = 0; + customOrder = false; + + ReconAlgo::reset(); +} + +bool SART::init() +{ + if (useVolumeMask) { + allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + } + + // HACK: D_projData consists of two lines. The first is used padded, + // the second unpadded. This is to satisfy the alignment requirements + // of resp. FP and BP_SART. + allocateVolume(D_projData, dims.iProjDets+2, 2, projPitch); + zeroVolume(D_projData, projPitch, dims.iProjDets+2, 1); + + allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch); + zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); + + // We can't precompute lineWeights when using a mask + if (!useVolumeMask) + precomputeWeights(); + + // TODO: check if allocations succeeded + return true; +} + +bool SART::setProjectionOrder(int* _projectionOrder, int _projectionCount) +{ + customOrder = true; + projectionCount = _projectionCount; + projectionOrder = new int[projectionCount]; + for (int i = 0; i < projectionCount; i++) { + projectionOrder[i] = _projectionOrder[i]; + } + + return true; +} + + +bool SART::precomputeWeights() +{ + zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); + if (useVolumeMask) { + callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f); + } else { + // Allocate tmpData temporarily + allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + + + processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f); + + + cudaFree(D_tmpData); + D_tmpData = 0; + } + processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); + + return true; +} + +bool SART::iterate(unsigned int iterations) +{ + shouldAbort = false; + + if (useVolumeMask) + precomputeWeights(); + + // iteration + for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { + + int angle; + if (customOrder) { + angle = projectionOrder[iteration % projectionCount]; + } else { + angle = iteration % dims.iProjAngles; + } + + // copy one line of sinogram to projection data + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData + angle*sinoPitch, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), 1, cudaMemcpyDeviceToDevice); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle, -1.0f); + } else { + callFP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle, -1.0f); + } + + MUL_SART(D_projData, D_lineWeight + angle*linePitch, projPitch, dims.iProjDets); + + if (useVolumeMask) { + // BP, mask, and add back + // TODO: Try putting the masking directly in the BP + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + callBP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle); + processVol<opAddMul, VOL>(D_volumeData, D_maskData, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight); + } else { + callBP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle); + } + + if (useMinConstraint) + processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + if (useMaxConstraint) + processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + + iteration++; + + } + + return true; +} + +float SART::computeDiffNorm() +{ + unsigned int pPitch; + float *D_p; + allocateVolume(D_p, dims.iProjDets+2, dims.iProjAngles, pPitch); + zeroVolume(D_p, pPitch, dims.iProjDets+2, dims.iProjAngles); + + // copy sinogram to D_p + cudaMemcpy2D(D_p, sizeof(float)*pPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); + } else { + callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); + } + + + // compute norm of D_p + float s = dotProduct2D(D_p, pPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + + cudaFree(D_p); + + return sqrt(s); +} + +bool SART::callFP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle, float outputScale) +{ + SDimensions d = dims; + d.iProjAngles = 1; + if (angles) { + assert(!fanProjs); + return FP(D_volumeData, volumePitch, D_projData, projPitch, + d, &angles[angle], TOffsets, outputScale); + } else { + assert(fanProjs); + return FanFP(D_volumeData, volumePitch, D_projData, projPitch, + d, &fanProjs[angle], outputScale); + } +} + +bool SART::callBP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle) +{ + if (angles) { + assert(!fanProjs); + return BP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch, + angle, dims, angles, TOffsets); + } else { + assert(fanProjs); + return FanBP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch, + angle, dims, fanProjs); + } + +} + + +} + + diff --git a/cuda/2d/sart.h b/cuda/2d/sart.h new file mode 100644 index 0000000..ad80259 --- /dev/null +++ b/cuda/2d/sart.h @@ -0,0 +1,85 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_SART_H +#define _CUDA_SART_H + +#include "util.h" +#include "algo.h" + +namespace astraCUDA { + +class _AstraExport SART : public ReconAlgo { +public: + SART(); + ~SART(); + + // disable some features + virtual bool enableSinogramMask() { return false; } + + virtual bool init(); + + virtual bool setProjectionOrder(int* projectionOrder, int projectionCount); + + virtual bool iterate(unsigned int iterations); + + virtual float computeDiffNorm(); + +protected: + void reset(); + bool precomputeWeights(); + + bool callFP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle, float outputScale); + bool callBP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle); + + + // projection angle variables + bool customOrder; + int* projectionOrder; + int projectionCount; + int iteration; + + // Temporary buffers + float* D_projData; + unsigned int projPitch; + + float* D_tmpData; // Only used when there's a volume mask + unsigned int tmpPitch; + + // Geometry-specific precomputed data + float* D_lineWeight; + unsigned int linePitch; +}; + +} + +#endif diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu new file mode 100644 index 0000000..31954e4 --- /dev/null +++ b/cuda/2d/sirt.cu @@ -0,0 +1,342 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "sirt.h" +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +namespace astraCUDA { + +SIRT::SIRT() : ReconAlgo() +{ + D_projData = 0; + D_tmpData = 0; + + D_lineWeight = 0; + D_pixelWeight = 0; + + D_minMaskData = 0; + D_maxMaskData = 0; + + freeMinMaxMasks = false; +} + + +SIRT::~SIRT() +{ + reset(); +} + +void SIRT::reset() +{ + cudaFree(D_projData); + cudaFree(D_tmpData); + cudaFree(D_lineWeight); + cudaFree(D_pixelWeight); + if (freeMinMaxMasks) { + cudaFree(D_minMaskData); + cudaFree(D_maxMaskData); + } + + D_projData = 0; + D_tmpData = 0; + + D_lineWeight = 0; + D_pixelWeight = 0; + + freeMinMaxMasks = false; + D_minMaskData = 0; + D_maxMaskData = 0; + + useVolumeMask = false; + useSinogramMask = false; + + ReconAlgo::reset(); +} + +bool SIRT::init() +{ + allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch); + zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); + + allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + + allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch); + zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); + + // We can't precompute lineWeights and pixelWeights when using a mask + if (!useVolumeMask && !useSinogramMask) + precomputeWeights(); + + // TODO: check if allocations succeeded + return true; +} + +bool SIRT::precomputeWeights() +{ + zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); + if (useVolumeMask) { + callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f); + } else { + processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f); + } + processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); + + if (useSinogramMask) { + // scale line weights with sinogram mask to zero out masked sinogram pixels + processVol<opMul, SINO>(D_lineWeight, D_smaskData, linePitch, dims.iProjDets, dims.iProjAngles); + } + + + zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); + if (useSinogramMask) { + callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch); + } else { + processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles); + callBP(D_pixelWeight, pixelPitch, D_projData, projPitch); + } + processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); + + if (useVolumeMask) { + // scale pixel weights with mask to zero out masked pixels + processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight); + } + + return true; +} + +bool SIRT::setMinMaxMasks(float* D_minMaskData_, float* D_maxMaskData_, + unsigned int iPitch) +{ + D_minMaskData = D_minMaskData_; + D_maxMaskData = D_maxMaskData_; + minMaskPitch = iPitch; + maxMaskPitch = iPitch; + + freeMinMaxMasks = false; + return true; +} + +bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskData, + unsigned int iPitch) +{ + freeMinMaxMasks = true; + bool ok = true; + if (pfMinMaskData) { + allocateVolume(D_minMaskData, dims.iVolWidth+2, dims.iVolHeight+2, minMaskPitch); + ok = copyVolumeToDevice(pfMinMaskData, iPitch, + dims.iVolWidth, dims.iVolHeight, + D_minMaskData, minMaskPitch); + } + if (!ok) + return false; + + if (pfMaxMaskData) { + allocateVolume(D_maxMaskData, dims.iVolWidth+2, dims.iVolHeight+2, maxMaskPitch); + ok = copyVolumeToDevice(pfMaxMaskData, iPitch, + dims.iVolWidth, dims.iVolHeight, + D_maxMaskData, maxMaskPitch); + } + if (!ok) + return false; + + return true; +} + +bool SIRT::iterate(unsigned int iterations) +{ + shouldAbort = false; + + if (useVolumeMask || useSinogramMask) + precomputeWeights(); + + // iteration + for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { + + // copy sinogram to projection data + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); + } else { + callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); + } + + processVol<opMul, SINO>(D_projData, D_lineWeight, projPitch, dims.iProjDets, dims.iProjAngles); + + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + + callBP(D_tmpData, tmpPitch, D_projData, projPitch); + + processVol<opAddMul, VOL>(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight); + + if (useMinConstraint) + processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + if (useMaxConstraint) + processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + if (D_minMaskData) + processVol<opClampMinMask, VOL>(D_volumeData, D_minMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight); + if (D_maxMaskData) + processVol<opClampMaxMask, VOL>(D_volumeData, D_maxMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight); + } + + return true; +} + +float SIRT::computeDiffNorm() +{ + // copy sinogram to projection data + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); + } else { + callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); + } + + + // compute norm of D_projData + + float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + + return sqrt(s); +} + + +bool doSIRT(float* D_volumeData, unsigned int volumePitch, + float* D_sinoData, unsigned int sinoPitch, + float* D_maskData, unsigned int maskPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, unsigned int iterations) +{ + SIRT sirt; + bool ok = true; + + ok &= sirt.setGeometry(dims, angles); + if (D_maskData) + ok &= sirt.enableVolumeMask(); + if (TOffsets) + ok &= sirt.setTOffsets(TOffsets); + + if (!ok) + return false; + + ok = sirt.init(); + if (!ok) + return false; + + if (D_maskData) + ok &= sirt.setVolumeMask(D_maskData, maskPitch); + + ok &= sirt.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); + if (!ok) + return false; + + ok = sirt.iterate(iterations); + + return ok; +} + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_sinoData; + + SDimensions dims; + dims.iVolWidth = 1024; + dims.iVolHeight = 1024; + dims.iProjAngles = 512; + dims.iProjDets = 1536; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + unsigned int volumePitch, sinoPitch; + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + printf("pitch: %u\n", sinoPitch); + + unsigned int y, x; + float* sino = loadImage("sino.png", y, x); + + float* img = new float[dims.iVolWidth*dims.iVolHeight]; + + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch); + + float* angle = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) + angle[i] = i*(M_PI/dims.iProjAngles); + + SIRT sirt; + + sirt.setGeometry(dims, angle); + sirt.init(); + + sirt.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); + + sirt.iterate(25); + + + delete[] angle; + + copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + + saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img); + + return 0; +} +#endif + diff --git a/cuda/2d/sirt.h b/cuda/2d/sirt.h new file mode 100644 index 0000000..5592616 --- /dev/null +++ b/cuda/2d/sirt.h @@ -0,0 +1,90 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_SIRT_H +#define _CUDA_SIRT_H + +#include "util.h" +#include "algo.h" + +namespace astraCUDA { + +class _AstraExport SIRT : public ReconAlgo { +public: + SIRT(); + ~SIRT(); + + virtual bool init(); + + // Set min/max masks to existing GPU memory buffers + bool setMinMaxMasks(float* D_minMaskData, float* D_maxMaskData, + unsigned int pitch); + + // Set min/max masks from RAM buffers + bool uploadMinMaxMasks(const float* minMaskData, const float* maxMaskData, + unsigned int pitch); + + virtual bool iterate(unsigned int iterations); + + virtual float computeDiffNorm(); + +protected: + void reset(); + bool precomputeWeights(); + + // Temporary buffers + float* D_projData; + unsigned int projPitch; + + float* D_tmpData; + unsigned int tmpPitch; + + // Geometry-specific precomputed data + float* D_lineWeight; + unsigned int linePitch; + + float* D_pixelWeight; + unsigned int pixelPitch; + + // Masks + bool freeMinMaxMasks; + float* D_minMaskData; + unsigned int minMaskPitch; + float* D_maxMaskData; + unsigned int maxMaskPitch; +}; + +bool doSIRT(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + float* D_maskData, unsigned int maskPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, unsigned int iterations); + +} + +#endif diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu new file mode 100644 index 0000000..06f6714 --- /dev/null +++ b/cuda/2d/util.cu @@ -0,0 +1,244 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include "util.h" + +namespace astraCUDA { + +bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch, + unsigned int width, unsigned int height, + float* outD_data, unsigned int out_pitch) +{ + // TODO: a full memset isn't necessary. Only the edges. + cudaError_t err; + err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, sizeof(float)*(width+2), height+2); + ASTRA_CUDA_ASSERT(err); + err = cudaMemcpy2D(outD_data + out_pitch + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); + ASTRA_CUDA_ASSERT(err); + assert(err == cudaSuccess); + return true; +} + +bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch, + unsigned int width, unsigned int height, + float* inD_data, unsigned int in_pitch) +{ + cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + (in_pitch + 1), sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); + ASTRA_CUDA_ASSERT(err); + return true; +} + + +bool copySinogramFromDevice(float* out_data, unsigned int out_pitch, + unsigned int width, unsigned int height, + float* inD_data, unsigned int in_pitch) +{ + cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + 1, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); + ASTRA_CUDA_ASSERT(err); + return true; +} + +bool copySinogramToDevice(const float* in_data, unsigned int in_pitch, + unsigned int width, unsigned int height, + float* outD_data, unsigned int out_pitch) +{ + // TODO: a full memset isn't necessary. Only the edges. + cudaError_t err; + err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, (width+2)*sizeof(float), height); + ASTRA_CUDA_ASSERT(err); + err = cudaMemcpy2D(outD_data + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); + ASTRA_CUDA_ASSERT(err); + return true; +} + + +bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsigned int& pitch) +{ + size_t p; + cudaError_t ret = cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height); + if (ret != cudaSuccess) { + reportCudaError(ret); + fprintf(stderr, "Failed to allocate %dx%d GPU buffer\n", width, height); + return false; + } + + assert(p % sizeof(float) == 0); + + pitch = p / sizeof(float); + + return true; +} + +void zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height) +{ + cudaError_t err; + err = cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height); + ASTRA_CUDA_ASSERT(err); +} + + +template <unsigned int blockSize> +__global__ void reduce1D(float *g_idata, float *g_odata, unsigned int n) +{ + extern __shared__ float sdata[]; + unsigned int tid = threadIdx.x; + + unsigned int i = blockIdx.x*(blockSize*2) + tid; + unsigned int gridSize = blockSize*gridDim.x; + sdata[tid] = 0; + while (i < n) { sdata[tid] += g_idata[i]; i += gridSize; } + __syncthreads(); + if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); } + if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); } + if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); } + if (tid < 32) { + volatile float* smem = sdata; + if (blockSize >= 64) smem[tid] += smem[tid + 32]; + if (blockSize >= 32) smem[tid] += smem[tid + 16]; + if (blockSize >= 16) smem[tid] += smem[tid + 8]; + if (blockSize >= 8) smem[tid] += smem[tid + 4]; + if (blockSize >= 4) smem[tid] += smem[tid + 2]; + if (blockSize >= 2) smem[tid] += smem[tid + 1]; + } + if (tid == 0) g_odata[blockIdx.x] = sdata[0]; +} + +__global__ void reduce2D(float *g_idata, float *g_odata, + unsigned int pitch, + unsigned int nx, unsigned int ny, + unsigned int padX, unsigned int padY) +{ + extern __shared__ float sdata[]; + const unsigned int tidx = threadIdx.x; + const unsigned int tidy = threadIdx.y; + const unsigned int tid = tidy * 16 + tidx; + + unsigned int x = blockIdx.x*16 + tidx; + unsigned int y = blockIdx.y*16 + tidy; + + sdata[tid] = 0; + + if (x >= padX && x < padX + nx) { + + while (y < padY + ny) { + if (y >= padY) + sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]); + y += 16 * gridDim.y; + } + + } + + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + if (tid < 32) { // 32 is warp size + volatile float* smem = sdata; + smem[tid] += smem[tid + 32]; + smem[tid] += smem[tid + 16]; + smem[tid] += smem[tid + 8]; + smem[tid] += smem[tid + 4]; + smem[tid] += smem[tid + 2]; + smem[tid] += smem[tid + 1]; + } + + if (tid == 0) + g_odata[blockIdx.y * gridDim.x + blockIdx.x] = sdata[0]; +} + +float dotProduct2D(float* D_data, unsigned int pitch, + unsigned int width, unsigned int height, + unsigned int padX, unsigned int padY) +{ + unsigned int bx = ((width+padX) + 15) / 16; + unsigned int by = ((height+padY) + 127) / 128; + unsigned int shared_mem2 = sizeof(float) * 16 * 16; + + dim3 dimBlock2(16, 16); + dim3 dimGrid2(bx, by); + + float* D_buf; + cudaMalloc(&D_buf, sizeof(float) * (bx * by + 1) ); + + // Step 1: reduce 2D from image to a single vector, taking sum of squares + + reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height, padX, padY); + cudaTextForceKernelsCompletion(); + + // Step 2: reduce 1D: add up elements in vector + if (bx * by > 512) + reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_buf+(bx*by), bx*by); + else if (bx * by > 128) + reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_buf+(bx*by), bx*by); + else if (bx * by > 32) + reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_buf+(bx*by), bx*by); + else if (bx * by > 8) + reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_buf+(bx*by), bx*by); + else + reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_buf+(bx*by), bx*by); + + float x; + cudaMemcpy(&x, D_buf+(bx*by), 4, cudaMemcpyDeviceToHost); + + cudaTextForceKernelsCompletion(); + + cudaFree(D_buf); + + return x; +} + + +bool cudaTextForceKernelsCompletion() +{ + cudaError_t returnedCudaError = cudaThreadSynchronize(); + + if(returnedCudaError != cudaSuccess) { + fprintf(stderr, "Failed to force completion of cuda kernels: %d: %s.\n", returnedCudaError, cudaGetErrorString(returnedCudaError)); + return false; + } + + return true; +} + +void reportCudaError(cudaError_t err) +{ + if(err != cudaSuccess) + fprintf(stderr, "CUDA error %d: %s.\n", err, cudaGetErrorString(err)); +} + + + +} diff --git a/cuda/2d/util.h b/cuda/2d/util.h new file mode 100644 index 0000000..d31e2eb --- /dev/null +++ b/cuda/2d/util.h @@ -0,0 +1,90 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_UTIL_H +#define _CUDA_UTIL_H + +#include <cuda.h> +#include <driver_types.h> + +#ifdef _MSC_VER + +#ifdef DLL_EXPORTS +#define _AstraExport __declspec(dllexport) +#define EXPIMP_TEMPLATE +#else +#define _AstraExport __declspec(dllimport) +#define EXPIMP_TEMPLATE extern +#endif + +#else + +#define _AstraExport + +#endif + +#include "dims.h" + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +#define ASTRA_CUDA_ASSERT(err) do { if (err != cudaSuccess) { astraCUDA::reportCudaError(err); assert(err == cudaSuccess); } } while(0) + + +namespace astraCUDA { + +bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch, + unsigned int width, unsigned int height, + float* outD_data, unsigned int out_pitch); +bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch, + unsigned int width, unsigned int height, + float* inD_data, unsigned int in_pitch); +bool copySinogramFromDevice(float* out_data, unsigned int out_pitch, + unsigned int width, unsigned int height, + float* inD_data, unsigned int in_pitch); +bool copySinogramToDevice(const float* in_data, unsigned int in_pitch, + unsigned int width, unsigned int height, + float* outD_data, unsigned int out_pitch); + +bool allocateVolume(float*& D_ptr, unsigned int width, unsigned int height, unsigned int& pitch); + +void zeroVolume(float* D_data, unsigned int pitch, unsigned int width, unsigned int height); + +bool cudaTextForceKernelsCompletion(); +void reportCudaError(cudaError_t err); + + + +float dotProduct2D(float* D_data, unsigned int pitch, + unsigned int width, unsigned int height, + unsigned int padX, unsigned int padY); + +} + +#endif |