summaryrefslogtreecommitdiffstats
path: root/cuda/2d
diff options
context:
space:
mode:
Diffstat (limited to 'cuda/2d')
-rw-r--r--cuda/2d/algo.cu356
-rw-r--r--cuda/2d/algo.h155
-rw-r--r--cuda/2d/arith.cu893
-rw-r--r--cuda/2d/arith.h101
-rw-r--r--cuda/2d/astra.cu824
-rw-r--r--cuda/2d/astra.h205
-rw-r--r--cuda/2d/cgls.cu304
-rw-r--r--cuda/2d/cgls.h92
-rw-r--r--cuda/2d/darthelper.cu358
-rw-r--r--cuda/2d/darthelper.h44
-rw-r--r--cuda/2d/dataop.cu130
-rw-r--r--cuda/2d/dataop.h47
-rw-r--r--cuda/2d/dims.h68
-rw-r--r--cuda/2d/em.cu262
-rw-r--r--cuda/2d/em.h77
-rw-r--r--cuda/2d/fan_bp.cu374
-rw-r--r--cuda/2d/fan_bp.h45
-rw-r--r--cuda/2d/fan_fp.cu370
-rw-r--r--cuda/2d/fan_fp.h41
-rw-r--r--cuda/2d/fbp_filters.h58
-rw-r--r--cuda/2d/fft.cu873
-rw-r--r--cuda/2d/fft.h69
-rw-r--r--cuda/2d/par_bp.cu357
-rw-r--r--cuda/2d/par_bp.h48
-rw-r--r--cuda/2d/par_fp.cu704
-rw-r--r--cuda/2d/par_fp.h41
-rw-r--r--cuda/2d/sart.cu283
-rw-r--r--cuda/2d/sart.h85
-rw-r--r--cuda/2d/sirt.cu342
-rw-r--r--cuda/2d/sirt.h90
-rw-r--r--cuda/2d/util.cu244
-rw-r--r--cuda/2d/util.h90
32 files changed, 8030 insertions, 0 deletions
diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu
new file mode 100644
index 0000000..5ae5d08
--- /dev/null
+++ b/cuda/2d/algo.cu
@@ -0,0 +1,356 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cassert>
+
+#include "algo.h"
+#include "par_fp.h"
+#include "fan_fp.h"
+#include "par_bp.h"
+#include "fan_bp.h"
+#include "util.h"
+#include "arith.h"
+
+namespace astraCUDA {
+
+ReconAlgo::ReconAlgo()
+{
+ angles = 0;
+ TOffsets = 0;
+ fanProjs = 0;
+ shouldAbort = false;
+
+ useVolumeMask = false;
+ useSinogramMask = false;
+ D_maskData = 0;
+ D_smaskData = 0;
+
+ D_sinoData = 0;
+ D_volumeData = 0;
+
+ useMinConstraint = false;
+ useMaxConstraint = false;
+
+ freeGPUMemory = false;
+}
+
+ReconAlgo::~ReconAlgo()
+{
+ reset();
+}
+
+void ReconAlgo::reset()
+{
+ delete[] angles;
+ delete[] TOffsets;
+ delete[] fanProjs;
+
+ if (freeGPUMemory) {
+ cudaFree(D_maskData);
+ cudaFree(D_smaskData);
+ cudaFree(D_sinoData);
+ cudaFree(D_volumeData);
+ }
+
+ angles = 0;
+ TOffsets = 0;
+ fanProjs = 0;
+ shouldAbort = false;
+
+ useVolumeMask = false;
+ useSinogramMask = false;
+
+ D_maskData = 0;
+ D_smaskData = 0;
+
+ D_sinoData = 0;
+ D_volumeData = 0;
+
+ useMinConstraint = false;
+ useMaxConstraint = false;
+
+ freeGPUMemory = false;
+}
+
+bool ReconAlgo::setGPUIndex(int iGPUIndex)
+{
+ cudaSetDevice(iGPUIndex);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+
+ return true;
+}
+
+bool ReconAlgo::enableVolumeMask()
+{
+ useVolumeMask = true;
+ return true;
+}
+
+bool ReconAlgo::enableSinogramMask()
+{
+ useSinogramMask = true;
+ return true;
+}
+
+
+bool ReconAlgo::setGeometry(const SDimensions& _dims, const float* _angles)
+{
+ dims = _dims;
+
+ angles = new float[dims.iProjAngles];
+
+ memcpy(angles, _angles, sizeof(angles[0]) * dims.iProjAngles);
+
+ delete[] fanProjs;
+ fanProjs = 0;
+
+ return true;
+}
+
+bool ReconAlgo::setFanGeometry(const SDimensions& _dims,
+ const SFanProjection* _projs)
+{
+ dims = _dims;
+ fanProjs = new SFanProjection[dims.iProjAngles];
+
+ memcpy(fanProjs, _projs, sizeof(fanProjs[0]) * dims.iProjAngles);
+
+ delete[] angles;
+ angles = 0;
+
+ return true;
+}
+
+
+bool ReconAlgo::setTOffsets(const float* _TOffsets)
+{
+ // TODO: determine if they're all zero?
+ TOffsets = new float[dims.iProjAngles];
+ memcpy(TOffsets, _TOffsets, sizeof(angles[0]) * dims.iProjAngles);
+
+ return true;
+}
+
+
+
+bool ReconAlgo::setVolumeMask(float* _D_maskData, unsigned int _maskPitch)
+{
+ assert(useVolumeMask);
+
+ D_maskData = _D_maskData;
+ maskPitch = _maskPitch;
+
+ return true;
+}
+
+bool ReconAlgo::setSinogramMask(float* _D_smaskData, unsigned int _smaskPitch)
+{
+ assert(useSinogramMask);
+
+ D_smaskData = _D_smaskData;
+ smaskPitch = _smaskPitch;
+
+ return true;
+}
+
+bool ReconAlgo::setBuffers(float* _D_volumeData, unsigned int _volumePitch,
+ float* _D_projData, unsigned int _projPitch)
+{
+ D_volumeData = _D_volumeData;
+ volumePitch = _volumePitch;
+ D_sinoData = _D_projData;
+ sinoPitch = _projPitch;
+
+ return true;
+}
+
+bool ReconAlgo::setMinConstraint(float fMin)
+{
+ fMinConstraint = fMin;
+ useMinConstraint = true;
+ return true;
+}
+
+bool ReconAlgo::setMaxConstraint(float fMax)
+{
+ fMaxConstraint = fMax;
+ useMaxConstraint = true;
+ return true;
+}
+
+
+
+bool ReconAlgo::allocateBuffers()
+{
+ bool ok;
+ ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ if (!ok)
+ return false;
+
+ ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+ if (!ok) {
+ cudaFree(D_volumeData);
+ D_volumeData = 0;
+ return false;
+ }
+
+ if (useVolumeMask) {
+ ok = allocateVolume(D_maskData, dims.iVolWidth+2, dims.iVolHeight+2, maskPitch);
+ if (!ok) {
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+ D_volumeData = 0;
+ D_sinoData = 0;
+ return false;
+ }
+ }
+
+ if (useSinogramMask) {
+ ok = allocateVolume(D_smaskData, dims.iProjDets+2, dims.iProjAngles, smaskPitch);
+ if (!ok) {
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+ cudaFree(D_maskData);
+ D_volumeData = 0;
+ D_sinoData = 0;
+ D_maskData = 0;
+ return false;
+ }
+ }
+
+ freeGPUMemory = true;
+ return true;
+}
+
+bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale,
+ const float* pfReconstruction, unsigned int iReconstructionPitch,
+ const float* pfVolMask, unsigned int iVolMaskPitch,
+ const float* pfSinoMask, unsigned int iSinoMaskPitch)
+{
+ if (!pfSinogram)
+ return false;
+ if (!pfReconstruction)
+ return false;
+
+ bool ok = copySinogramToDevice(pfSinogram, iSinogramPitch,
+ dims.iProjDets,
+ dims.iProjAngles,
+ D_sinoData, sinoPitch);
+ if (!ok)
+ return false;
+
+ // rescale sinogram to adjust for pixel size
+ processVol<opMul,SINO>(D_sinoData, fSinogramScale,
+ //1.0f/(fPixelSize*fPixelSize),
+ sinoPitch,
+ dims.iProjDets, dims.iProjAngles);
+
+ ok = copyVolumeToDevice(pfReconstruction, iReconstructionPitch,
+ dims.iVolWidth, dims.iVolHeight,
+ D_volumeData, volumePitch);
+ if (!ok)
+ return false;
+
+
+
+ if (useVolumeMask) {
+ if (!pfVolMask)
+ return false;
+
+ ok = copyVolumeToDevice(pfVolMask, iVolMaskPitch,
+ dims.iVolWidth, dims.iVolHeight,
+ D_maskData, maskPitch);
+ if (!ok)
+ return false;
+ }
+
+ if (useSinogramMask) {
+ if (!pfSinoMask)
+ return false;
+
+ ok = copySinogramToDevice(pfSinoMask, iSinoMaskPitch,
+ dims.iProjDets, dims.iProjAngles,
+ D_smaskData, smaskPitch);
+ if (!ok)
+ return false;
+ }
+
+ return true;
+}
+
+bool ReconAlgo::getReconstruction(float* pfReconstruction,
+ unsigned int iReconstructionPitch) const
+{
+ bool ok = copyVolumeFromDevice(pfReconstruction, iReconstructionPitch,
+ dims.iVolWidth,
+ dims.iVolHeight,
+ D_volumeData, volumePitch);
+ if (!ok)
+ return false;
+
+ return true;
+}
+
+
+bool ReconAlgo::callFP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ float outputScale)
+{
+ if (angles) {
+ assert(!fanProjs);
+ return FP(D_volumeData, volumePitch, D_projData, projPitch,
+ dims, angles, TOffsets, outputScale);
+ } else {
+ assert(fanProjs);
+ return FanFP(D_volumeData, volumePitch, D_projData, projPitch,
+ dims, fanProjs, outputScale);
+ }
+}
+
+bool ReconAlgo::callBP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch)
+{
+ if (angles) {
+ assert(!fanProjs);
+ return BP(D_volumeData, volumePitch, D_projData, projPitch,
+ dims, angles, TOffsets);
+ } else {
+ assert(fanProjs);
+ return FanBP(D_volumeData, volumePitch, D_projData, projPitch,
+ dims, fanProjs);
+ }
+
+}
+
+
+
+}
diff --git a/cuda/2d/algo.h b/cuda/2d/algo.h
new file mode 100644
index 0000000..96195a3
--- /dev/null
+++ b/cuda/2d/algo.h
@@ -0,0 +1,155 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ALGO_H
+#define _CUDA_ALGO_H
+
+#include "util.h"
+
+namespace astraCUDA {
+
+class _AstraExport ReconAlgo {
+public:
+ ReconAlgo();
+ virtual ~ReconAlgo();
+
+ bool setGPUIndex(int iGPUIndex);
+
+ bool setGeometry(const SDimensions& dims, const float* angles);
+ bool setFanGeometry(const SDimensions& dims, const SFanProjection* projs);
+
+ // setTOffsets should (optionally) be called after setGeometry
+ bool setTOffsets(const float* TOffsets);
+
+ void signalAbort() { shouldAbort = true; }
+
+ virtual bool enableVolumeMask();
+ virtual bool enableSinogramMask();
+
+ // init should be called after setting all geometry
+ virtual bool init() = 0;
+
+ // setVolumeMask should be called after init and before iterate,
+ // but only if enableVolumeMask was called before init.
+ // It may be called again after iterate.
+ bool setVolumeMask(float* D_maskData, unsigned int maskPitch);
+
+ // setSinogramMask should be called after init and before iterate,
+ // but only if enableSinogramMask was called before init.
+ // It may be called again after iterate.
+ bool setSinogramMask(float* D_smaskData, unsigned int smaskPitch);
+
+
+ // setBuffers should be called after init and before iterate.
+ // It may be called again after iterate.
+ virtual bool setBuffers(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch);
+
+
+ // instead of calling setBuffers, you can also call allocateBuffers
+ // to let ReconAlgo manage its own GPU memory
+ virtual bool allocateBuffers();
+ virtual bool copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale,
+ const float* pfReconstruction, unsigned int iReconstructionPitch,
+ const float* pfVolMask, unsigned int iVolMaskPitch,
+ const float* pfSinoMask, unsigned int iSinoMaskPitch);
+
+
+
+ // set Min/Max constraints. They may be called at any time, and will affect
+ // any iterate() calls afterwards.
+ virtual bool setMinConstraint(float fMin);
+ virtual bool setMaxConstraint(float fMax);
+
+
+ // iterate should be called after init and setBuffers.
+ // It may be called multiple times.
+ virtual bool iterate(unsigned int iterations) = 0;
+
+ // Compute the norm of the difference of the FP of the current
+ // reconstruction and the sinogram. (This performs one FP.)
+ // It can be called after iterate.
+ virtual float computeDiffNorm() = 0;
+ // TODO: computeDiffNorm shouldn't be virtual, but for it to be
+ // implemented in ReconAlgo, it needs a way to get a suitable
+ // temporary sinogram buffer.
+
+ bool getReconstruction(float* pfReconstruction,
+ unsigned int iReconstructionPitch) const;
+
+
+
+protected:
+ void reset();
+
+ bool callFP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ float outputScale);
+ bool callBP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch);
+
+
+ SDimensions dims;
+ float* angles;
+ float* TOffsets;
+ SFanProjection* fanProjs;
+
+ volatile bool shouldAbort;
+
+ bool freeGPUMemory;
+
+ // Input/output
+ float* D_sinoData;
+ unsigned int sinoPitch;
+
+ float* D_volumeData;
+ unsigned int volumePitch;
+
+ // Masks
+ bool useVolumeMask;
+ bool useSinogramMask;
+
+ float* D_maskData;
+ unsigned int maskPitch;
+ float* D_smaskData;
+ unsigned int smaskPitch;
+
+ // Min/max
+ bool useMinConstraint;
+ bool useMaxConstraint;
+ float fMinConstraint;
+ float fMaxConstraint;
+
+
+};
+
+
+}
+
+#endif
+
diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu
new file mode 100644
index 0000000..1ee02ca
--- /dev/null
+++ b/cuda/2d/arith.cu
@@ -0,0 +1,893 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "util.h"
+#include "arith.h"
+#include <cassert>
+
+namespace astraCUDA {
+
+
+struct opAddScaled {
+ __device__ void operator()(float& out, const float in, const float inp) {
+ out += in * inp;
+ }
+};
+struct opScaleAndAdd {
+ __device__ void operator()(float& out, const float in, const float inp) {
+ out = in + out * inp;
+ }
+};
+struct opAddMulScaled {
+ __device__ void operator()(float& out, const float in1, const float in2, const float inp) {
+ out += in1 * in2 * inp;
+ }
+};
+struct opAddMul {
+ __device__ void operator()(float& out, const float in1, const float in2) {
+ out += in1 * in2;
+ }
+};
+struct opAdd {
+ __device__ void operator()(float& out, const float in) {
+ out += in;
+ }
+};
+struct opAdd2 {
+ __device__ void operator()(float& out, const float in1, const float in2) {
+ out += in1 + in2;
+ }
+};
+struct opMul {
+ __device__ void operator()(float& out, const float in) {
+ out *= in;
+ }
+};
+struct opMul2 {
+ __device__ void operator()(float& out, const float in1, const float in2) {
+ out *= in1 * in2;
+ }
+};
+struct opDividedBy {
+ __device__ void operator()(float& out, const float in) {
+ if (out > 0.000001f) // out is assumed to be positive
+ out = in / out;
+ else
+ out = 0.0f;
+ }
+};
+struct opInvert {
+ __device__ void operator()(float& out) {
+ if (out > 0.000001f) // out is assumed to be positive
+ out = 1 / out;
+ else
+ out = 0.0f;
+ }
+};
+struct opSet {
+ __device__ void operator()(float& out, const float inp) {
+ out = inp;
+ }
+};
+struct opClampMin {
+ __device__ void operator()(float& out, const float inp) {
+ if (out < inp)
+ out = inp;
+ }
+};
+struct opClampMax {
+ __device__ void operator()(float& out, const float inp) {
+ if (out > inp)
+ out = inp;
+ }
+};
+struct opClampMinMask {
+ __device__ void operator()(float& out, const float in) {
+ if (out < in)
+ out = in;
+ }
+};
+struct opClampMaxMask {
+ __device__ void operator()(float& out, const float in) {
+ if (out > in)
+ out = in;
+ }
+};
+struct opSetMaskedValues {
+ __device__ void operator()(float& out, const float in, const float inp) {
+ if (!in)
+ out = inp;
+ }
+};
+struct opSegmentAndMask {
+ __device__ void operator()(float& out1, float& out2, const float inp1, const float inp2) {
+ if (out1 >= inp1) {
+ out1 = inp2;
+ out2 = 0.0f;
+ }
+
+ }
+
+};
+struct opMulMask {
+ __device__ void operator()(float& out, const float mask, const float in) {
+ if (mask > 0.0f) {
+ out *= in;
+ }
+ }
+};
+
+
+
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
+ unsigned int off = (y+padY)*pitch+x+padX;
+ for (unsigned int i = 0; i < repeat; ++i) {
+ if (y >= height)
+ break;
+ op()(pfOut[off]);
+ off += pitch;
+ y++;
+ }
+}
+
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
+ unsigned int off = (y+padY)*pitch+x+padX;
+ for (unsigned int i = 0; i < repeat; ++i) {
+ if (y >= height)
+ break;
+ op()(pfOut[off], fParam);
+ off += pitch;
+ y++;
+ }
+}
+
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
+ unsigned int off = (y+padY)*pitch+x+padX;
+ for (unsigned int i = 0; i < repeat; ++i) {
+ if (y >= height)
+ break;
+ op()(pfOut1[off], pfOut2[off], fParam1, fParam2);
+ off += pitch;
+ y++;
+ }
+}
+
+
+
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
+ unsigned int off = (y+padY)*pitch+x+padX;
+ for (unsigned int i = 0; i < repeat; ++i) {
+ if (y >= height)
+ break;
+ op()(pfOut[off], pfIn[off]);
+ off += pitch;
+ y++;
+ }
+}
+
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
+ unsigned int off = (y+padY)*pitch+x+padX;
+ for (unsigned int i = 0; i < repeat; ++i) {
+ if (y >= height)
+ break;
+ op()(pfOut[off], pfIn[off], fParam);
+ off += pitch;
+ y++;
+ }
+}
+
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
+ unsigned int off = (y+padY)*pitch+x+padX;
+ for (unsigned int i = 0; i < repeat; ++i) {
+ if (y >= height)
+ break;
+ op()(pfOut[off], pfIn1[off], pfIn2[off]);
+ off += pitch;
+ y++;
+ }
+}
+
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
+ unsigned int off = (y+padY)*pitch+x+padX;
+ for (unsigned int i = 0; i < repeat; ++i) {
+ if (y >= height)
+ break;
+ op()(pfOut[off], pfIn1[off], pfIn2[off], fParam);
+ off += pitch;
+ y++;
+ }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+template<typename op, VolType t>
+void processVolCopy(float* out, unsigned int width, unsigned int height)
+{
+ float* D_out;
+
+ unsigned int pitch;
+ allocateVolume(D_out, width+2, height+2, pitch);
+ copyVolumeToDevice(out, width, width, height, D_out, pitch);
+
+ processVol<op, t>(D_out, pitch, width, height);
+
+ copyVolumeFromDevice(out, width, width, height, D_out, pitch);
+
+ cudaFree(D_out);
+}
+
+template<typename op, VolType t>
+void processVolCopy(float* out, float param, unsigned int width, unsigned int height)
+{
+ float* D_out;
+
+ unsigned int pitch;
+ allocateVolume(D_out, width+2, height+2, pitch);
+ copyVolumeToDevice(out, width, width, height, D_out, pitch);
+
+ processVol<op, t>(D_out, param, pitch, width, height);
+
+ copyVolumeFromDevice(out, width, width, height, D_out, pitch);
+
+ cudaFree(D_out);
+}
+
+template<typename op, VolType t>
+void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height)
+{
+ float* D_out1;
+ float* D_out2;
+
+ unsigned int pitch;
+ allocateVolume(D_out1, width+2, height+2, pitch);
+ copyVolumeToDevice(out1, width, width, height, D_out1, pitch);
+ allocateVolume(D_out2, width+2, height+2, pitch);
+ copyVolumeToDevice(out2, width, width, height, D_out2, pitch);
+
+ processVol<op, t>(D_out1, D_out2, param1, param2, pitch, width, height);
+
+ copyVolumeFromDevice(out1, width, width, height, D_out1, pitch);
+ copyVolumeFromDevice(out2, width, width, height, D_out2, pitch);
+
+ cudaFree(D_out1);
+ cudaFree(D_out2);
+}
+
+
+template<typename op, VolType t>
+void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height)
+{
+ float* D_out;
+ float* D_in;
+
+ unsigned int pitch;
+ allocateVolume(D_out, width+2, height+2, pitch);
+ copyVolumeToDevice(out, width, width, height, D_out, pitch);
+ allocateVolume(D_in, width+2, height+2, pitch);
+ copyVolumeToDevice(in, width, width, height, D_in, pitch);
+
+ processVol<op, t>(D_out, D_in, pitch, width, height);
+
+ copyVolumeFromDevice(out, width, width, height, D_out, pitch);
+
+ cudaFree(D_out);
+ cudaFree(D_in);
+}
+
+template<typename op, VolType t>
+void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height)
+{
+ float* D_out;
+ float* D_in;
+
+ unsigned int pitch;
+ allocateVolume(D_out, width+2, height+2, pitch);
+ copyVolumeToDevice(out, width, width, height, D_out, pitch);
+ allocateVolume(D_in, width+2, height+2, pitch);
+ copyVolumeToDevice(in, width, width, height, D_in, pitch);
+
+ processVol<op, t>(D_out, D_in, param, pitch, width, height);
+
+ copyVolumeFromDevice(out, width, width, height, D_out, pitch);
+
+ cudaFree(D_out);
+ cudaFree(D_in);
+}
+
+template<typename op, VolType t>
+void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height)
+{
+ float* D_out;
+ float* D_in1;
+ float* D_in2;
+
+ unsigned int pitch;
+ allocateVolume(D_out, width+2, height+2, pitch);
+ copyVolumeToDevice(out, width, width, height, D_out, pitch);
+ allocateVolume(D_in1, width+2, height+2, pitch);
+ copyVolumeToDevice(in1, width, width, height, D_in1, pitch);
+ allocateVolume(D_in2, width+2, height+2, pitch);
+ copyVolumeToDevice(in2, width, width, height, D_in2, pitch);
+
+ processVol<op, t>(D_out, D_in1, D_in2, pitch, width, height);
+
+ copyVolumeFromDevice(out, width, width, height, D_out, pitch);
+
+ cudaFree(D_out);
+ cudaFree(D_in1);
+ cudaFree(D_in2);
+}
+
+template<typename op, VolType t>
+void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height)
+{
+ float* D_out;
+ float* D_in1;
+ float* D_in2;
+
+ unsigned int pitch;
+ allocateVolume(D_out, width+2, height+2, pitch);
+ copyVolumeToDevice(out, width, width, height, D_out, pitch);
+ allocateVolume(D_in1, width+2, height+2, pitch);
+ copyVolumeToDevice(in1, width, width, height, D_in1, pitch);
+ allocateVolume(D_in2, width+2, height+2, pitch);
+ copyVolumeToDevice(in2, width, width, height, D_in2, pitch);
+
+ processVol<op, t>(D_out, D_in1, D_in2, param, pitch, width, height);
+
+ copyVolumeFromDevice(out, width, width, height, D_out, pitch);
+
+ cudaFree(D_out);
+ cudaFree(D_in1);
+ cudaFree(D_in2);
+}
+
+
+
+
+
+
+
+
+
+template<typename op, VolType t>
+void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+511)/512);
+
+ devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op, VolType t>
+void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+
+ devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op, VolType t>
+void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+
+ devFFtoDD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
+
+ cudaTextForceKernelsCompletion();
+}
+
+
+template<typename op, VolType t>
+void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+
+ devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op, VolType t>
+void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+
+ devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op, VolType t>
+void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+
+ devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op, VolType t>
+void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+
+ devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
+
+ cudaTextForceKernelsCompletion();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+ float *pfOut = (float*)out.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
+
+ for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+ devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ pfOut += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+ float *pfOut = (float*)out.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
+
+ for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+ devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ pfOut += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+ float *pfOut1 = (float*)out1.ptr;
+ float *pfOut2 = (float*)out2.ptr;
+ unsigned int step = out1.pitch/sizeof(float) * dims.iVolY;
+
+ for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+ devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ pfOut1 += step;
+ pfOut2 += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn = (float*)in.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
+
+ for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+ devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ pfOut += step;
+ pfIn += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn = (float*)in.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
+
+ for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+ devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ pfOut += step;
+ pfIn += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn1 = (float*)in1.ptr;
+ float *pfIn2 = (float*)in2.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
+
+ for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+ devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ pfOut += step;
+ pfIn1 += step;
+ pfIn2 += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn1 = (float*)in1.ptr;
+ float *pfIn2 = (float*)in2.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
+
+ for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+ devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ pfOut += step;
+ pfIn1 += step;
+ pfIn2 += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+ float *pfOut = (float*)out.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
+
+ for (unsigned int i = 0; i < dims.iProjV; ++i) {
+ devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ pfOut += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+ float *pfOut = (float*)out.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
+
+ for (unsigned int i = 0; i < dims.iProjV; ++i) {
+ devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ pfOut += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+ float *pfOut1 = (float*)out1.ptr;
+ float *pfOut2 = (float*)out2.ptr;
+ unsigned int step = out1.pitch/sizeof(float) * dims.iProjAngles;
+
+ for (unsigned int i = 0; i < dims.iProjV; ++i) {
+ devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ pfOut1 += step;
+ pfOut2 += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn = (float*)in.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
+
+ for (unsigned int i = 0; i < dims.iProjV; ++i) {
+ devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ pfOut += step;
+ pfIn += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn = (float*)in.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
+
+ for (unsigned int i = 0; i < dims.iProjV; ++i) {
+ devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ pfOut += step;
+ pfIn += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn1 = (float*)in1.ptr;
+ float *pfIn2 = (float*)in2.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
+
+ for (unsigned int i = 0; i < dims.iProjV; ++i) {
+ devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ pfOut += step;
+ pfIn1 += step;
+ pfIn2 += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn1 = (float*)in1.ptr;
+ float *pfIn2 = (float*)in2.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
+
+ for (unsigned int i = 0; i < dims.iProjV; ++i) {
+ devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ pfOut += step;
+ pfIn1 += step;
+ pfIn2 += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#define INST_DFtoD(name) \
+ template void processVolCopy<name, VOL>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
+ template void processVolCopy<name, SINO>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
+ template void processVol<name, VOL>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name, SINO>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \
+ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
+
+#define INST_DtoD(name) \
+ template void processVolCopy<name, VOL>(float* out, const float* in, unsigned int width, unsigned int height); \
+ template void processVolCopy<name, SINO>(float* out, const float* in, unsigned int width, unsigned int height); \
+ template void processVol<name, VOL>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name, SINO>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \
+ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
+
+#define INST_DDtoD(name) \
+ template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
+ template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
+ template void processVol<name, VOL>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name, SINO>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \
+ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
+
+#define INST_DDFtoD(name) \
+ template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
+ template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
+ template void processVol<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \
+ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
+
+
+#define INST_toD(name) \
+ template void processVolCopy<name, VOL>(float* out, unsigned int width, unsigned int height); \
+ template void processVolCopy<name, SINO>(float* out, unsigned int width, unsigned int height); \
+ template void processVol<name, VOL>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name, SINO>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \
+ template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims);
+
+#define INST_FtoD(name) \
+ template void processVolCopy<name, VOL>(float* out, float param, unsigned int width, unsigned int height); \
+ template void processVolCopy<name, SINO>(float* out, float param, unsigned int width, unsigned int height); \
+ template void processVol<name, VOL>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name, SINO>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); \
+ template void processSino3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims);
+
+#define INST_FFtoDD(name) \
+ template void processVolCopy<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
+ template void processVolCopy<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
+ template void processVol<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); \
+ template void processSino3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims);
+
+
+
+INST_DFtoD(opAddScaled)
+INST_DFtoD(opScaleAndAdd)
+INST_DDFtoD(opAddMulScaled)
+INST_DDtoD(opAddMul)
+INST_DDtoD(opMul2)
+INST_DDtoD(opAdd2)
+INST_DtoD(opMul)
+INST_DDtoD(opMulMask)
+INST_DtoD(opAdd)
+INST_DtoD(opDividedBy)
+INST_toD(opInvert)
+INST_FtoD(opSet)
+INST_FtoD(opMul)
+INST_DFtoD(opMulMask)
+INST_FtoD(opAdd)
+INST_FtoD(opClampMin)
+INST_FtoD(opClampMax)
+INST_DtoD(opClampMinMask)
+INST_DtoD(opClampMaxMask)
+
+// PDART-specific:
+INST_DFtoD(opSetMaskedValues)
+INST_FFtoDD(opSegmentAndMask)
+
+}
diff --git a/cuda/2d/arith.h b/cuda/2d/arith.h
new file mode 100644
index 0000000..c8c7b41
--- /dev/null
+++ b/cuda/2d/arith.h
@@ -0,0 +1,101 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ARITH_H
+#define _CUDA_ARITH_H
+
+#include <cuda.h>
+
+namespace astraCUDA {
+
+
+struct opAddScaled;
+struct opScaleAndAdd;
+struct opAddMulScaled;
+struct opAddMul;
+struct opAdd;
+struct opAdd2;
+struct opMul;
+struct opMul2;
+struct opDividedBy;
+struct opInvert;
+struct opSet;
+struct opClampMin;
+struct opClampMax;
+struct opClampMinMask;
+struct opClampMaxMask;
+struct opSegmentAndMask;
+struct opSetMaskedValues;
+
+struct opMulMask;
+
+
+
+enum VolType {
+ SINO = 0,
+ VOL = 1
+};
+
+
+template<typename op, VolType t> void processVolCopy(float* out, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVolCopy(float* out, float param, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height);
+
+template<typename op, VolType t> void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(float* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(float* out, const float* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height);
+
+template<typename op> void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
+
+template<typename op> void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
+
+
+
+}
+
+#endif
diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu
new file mode 100644
index 0000000..71ed025
--- /dev/null
+++ b/cuda/2d/astra.cu
@@ -0,0 +1,824 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "util.h"
+#include "par_fp.h"
+#include "fan_fp.h"
+#include "par_bp.h"
+#include "arith.h"
+#include "astra.h"
+
+#include "fft.h"
+
+#include <fstream>
+#include <cuda.h>
+
+#include "../../include/astra/Logger.h"
+
+using namespace astraCUDA;
+using namespace std;
+
+
+namespace astra {
+
+enum CUDAProjectionType {
+ PROJ_PARALLEL,
+ PROJ_FAN
+};
+
+
+class AstraFBP_internal {
+public:
+ SDimensions dims;
+ float* angles;
+ float* TOffsets;
+
+ float fPixelSize;
+
+ bool initialized;
+ bool setStartReconstruction;
+
+ float* D_sinoData;
+ unsigned int sinoPitch;
+
+ float* D_volumeData;
+ unsigned int volumePitch;
+
+ cufftComplex * m_pDevFilter;
+};
+
+AstraFBP::AstraFBP()
+{
+ pData = new AstraFBP_internal();
+
+ pData->angles = 0;
+ pData->D_sinoData = 0;
+ pData->D_volumeData = 0;
+
+ pData->dims.iVolWidth = 0;
+ pData->dims.iProjAngles = 0;
+ pData->dims.fDetScale = 1.0f;
+ pData->dims.iRaysPerDet = 1;
+ pData->dims.iRaysPerPixelDim = 1;
+
+ pData->initialized = false;
+ pData->setStartReconstruction = false;
+
+ pData->m_pDevFilter = NULL;
+}
+
+AstraFBP::~AstraFBP()
+{
+ delete[] pData->angles;
+ pData->angles = 0;
+
+ delete[] pData->TOffsets;
+ pData->TOffsets = 0;
+
+ cudaFree(pData->D_sinoData);
+ pData->D_sinoData = 0;
+
+ cudaFree(pData->D_volumeData);
+ pData->D_volumeData = 0;
+
+ if(pData->m_pDevFilter != NULL)
+ {
+ freeComplexOnDevice(pData->m_pDevFilter);
+ pData->m_pDevFilter = NULL;
+ }
+
+ delete pData;
+ pData = 0;
+}
+
+bool AstraFBP::setReconstructionGeometry(unsigned int iVolWidth,
+ unsigned int iVolHeight,
+ float fPixelSize)
+{
+ if (pData->initialized)
+ return false;
+
+ pData->dims.iVolWidth = iVolWidth;
+ pData->dims.iVolHeight = iVolHeight;
+
+ pData->fPixelSize = fPixelSize;
+
+ return (iVolWidth > 0 && iVolHeight > 0 && fPixelSize > 0.0f);
+}
+
+bool AstraFBP::setProjectionGeometry(unsigned int iProjAngles,
+ unsigned int iProjDets,
+ const float* pfAngles,
+ float fDetSize)
+{
+ if (pData->initialized)
+ return false;
+
+ pData->dims.iProjAngles = iProjAngles;
+ pData->dims.iProjDets = iProjDets;
+ pData->dims.fDetScale = fDetSize / pData->fPixelSize;
+
+ if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0)
+ return false;
+
+ pData->angles = new float[iProjAngles];
+ memcpy(pData->angles, pfAngles, iProjAngles * sizeof(pfAngles[0]));
+
+ return true;
+}
+
+bool AstraFBP::setPixelSuperSampling(unsigned int iPixelSuperSampling)
+{
+ if (pData->initialized)
+ return false;
+
+ if (iPixelSuperSampling == 0)
+ return false;
+
+ pData->dims.iRaysPerPixelDim = iPixelSuperSampling;
+
+ return true;
+}
+
+
+bool AstraFBP::setTOffsets(const float* pfTOffsets)
+{
+ if (pData->initialized)
+ return false;
+
+ if (pfTOffsets == 0)
+ return false;
+
+ pData->TOffsets = new float[pData->dims.iProjAngles];
+ memcpy(pData->TOffsets, pfTOffsets, pData->dims.iProjAngles * sizeof(pfTOffsets[0]));
+
+ return true;
+}
+
+bool AstraFBP::init(int iGPUIndex)
+{
+ if (pData->initialized)
+ {
+ return false;
+ }
+
+ if (pData->dims.iProjAngles == 0 || pData->dims.iVolWidth == 0)
+ {
+ return false;
+ }
+
+ cudaSetDevice(iGPUIndex);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ {
+ return false;
+ }
+
+ bool ok = allocateVolume(pData->D_volumeData, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2, pData->volumePitch);
+ if (!ok)
+ {
+ return false;
+ }
+
+ ok = allocateVolume(pData->D_sinoData, pData->dims.iProjDets+2, pData->dims.iProjAngles, pData->sinoPitch);
+ if (!ok)
+ {
+ cudaFree(pData->D_volumeData);
+ pData->D_volumeData = 0;
+ return false;
+ }
+
+ pData->initialized = true;
+
+ return true;
+}
+
+bool AstraFBP::setSinogram(const float* pfSinogram,
+ unsigned int iSinogramPitch)
+{
+ if (!pData->initialized)
+ return false;
+ if (!pfSinogram)
+ return false;
+
+ bool ok = copySinogramToDevice(pfSinogram, iSinogramPitch,
+ pData->dims.iProjDets,
+ pData->dims.iProjAngles,
+ pData->D_sinoData, pData->sinoPitch);
+ if (!ok)
+ return false;
+
+ // rescale sinogram to adjust for pixel size
+ processVol<opMul,SINO>(pData->D_sinoData,
+ 1.0f/(pData->fPixelSize*pData->fPixelSize),
+ pData->sinoPitch,
+ pData->dims.iProjDets, pData->dims.iProjAngles);
+
+ pData->setStartReconstruction = false;
+
+ return true;
+}
+
+static int calcNextPowerOfTwo(int _iValue)
+{
+ int iOutput = 1;
+
+ while(iOutput < _iValue)
+ {
+ iOutput *= 2;
+ }
+
+ return iOutput;
+}
+
+bool AstraFBP::run()
+{
+ if (!pData->initialized)
+ {
+ return false;
+ }
+
+ zeroVolume(pData->D_volumeData, pData->volumePitch, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2);
+
+ bool ok = false;
+
+ if (pData->m_pDevFilter) {
+
+ int iFFTRealDetCount = calcNextPowerOfTwo(2 * pData->dims.iProjDets);
+ int iFFTFourDetCount = calcFFTFourSize(iFFTRealDetCount);
+
+ cufftComplex * pDevComplexSinogram = NULL;
+
+ allocateComplexOnDevice(pData->dims.iProjAngles, iFFTFourDetCount, &pDevComplexSinogram);
+
+ runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram);
+
+ applyFilter(pData->dims.iProjAngles, iFFTFourDetCount, pDevComplexSinogram, pData->m_pDevFilter);
+
+ runCudaIFFT(pData->dims.iProjAngles, pDevComplexSinogram, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount);
+
+ freeComplexOnDevice(pDevComplexSinogram);
+
+ }
+
+ ok = BP(pData->D_volumeData, pData->volumePitch, pData->D_sinoData, pData->sinoPitch, pData->dims, pData->angles, pData->TOffsets);
+ if(!ok)
+ {
+ return false;
+ }
+
+ processVol<opMul,VOL>(pData->D_volumeData,
+ (M_PI / 2.0f) / (float)pData->dims.iProjAngles,
+ pData->volumePitch,
+ pData->dims.iVolWidth, pData->dims.iVolHeight);
+
+ return true;
+}
+
+bool AstraFBP::getReconstruction(float* pfReconstruction, unsigned int iReconstructionPitch) const
+{
+ if (!pData->initialized)
+ return false;
+
+ bool ok = copyVolumeFromDevice(pfReconstruction, iReconstructionPitch,
+ pData->dims.iVolWidth,
+ pData->dims.iVolHeight,
+ pData->D_volumeData, pData->volumePitch);
+ if (!ok)
+ return false;
+
+ return true;
+}
+
+int AstraFBP::calcFourierFilterSize(int _iDetectorCount)
+{
+ int iFFTRealDetCount = calcNextPowerOfTwo(2 * _iDetectorCount);
+ int iFreqBinCount = calcFFTFourSize(iFFTRealDetCount);
+
+ // CHECKME: Matlab makes this at least 64. Do we also need to?
+ return iFreqBinCount;
+}
+
+bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* = NULL */, int _iFilterWidth /* = 0 */, float _fD /* = 1.0f */, float _fFilterParameter /* = -1.0f */)
+{
+ if(pData->m_pDevFilter != 0)
+ {
+ freeComplexOnDevice(pData->m_pDevFilter);
+ pData->m_pDevFilter = 0;
+ }
+
+ if (_eFilter == FILTER_NONE)
+ return true; // leave pData->m_pDevFilter set to 0
+
+
+ int iFFTRealDetCount = calcNextPowerOfTwo(2 * pData->dims.iProjDets);
+ int iFreqBinCount = calcFFTFourSize(iFFTRealDetCount);
+
+ cufftComplex * pHostFilter = new cufftComplex[pData->dims.iProjAngles * iFreqBinCount];
+ memset(pHostFilter, 0, sizeof(cufftComplex) * pData->dims.iProjAngles * iFreqBinCount);
+
+ allocateComplexOnDevice(pData->dims.iProjAngles, iFreqBinCount, &(pData->m_pDevFilter));
+
+ switch(_eFilter)
+ {
+ case FILTER_NONE:
+ // handled above
+ break;
+ case FILTER_RAMLAK:
+ case FILTER_SHEPPLOGAN:
+ case FILTER_COSINE:
+ case FILTER_HAMMING:
+ case FILTER_HANN:
+ case FILTER_TUKEY:
+ case FILTER_LANCZOS:
+ case FILTER_TRIANGULAR:
+ case FILTER_GAUSSIAN:
+ case FILTER_BARTLETTHANN:
+ case FILTER_BLACKMAN:
+ case FILTER_NUTTALL:
+ case FILTER_BLACKMANHARRIS:
+ case FILTER_BLACKMANNUTTALL:
+ case FILTER_FLATTOP:
+ {
+ genFilter(_eFilter, _fD, pData->dims.iProjAngles, pHostFilter, iFFTRealDetCount, iFreqBinCount, _fFilterParameter);
+ uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter);
+
+ break;
+ }
+ case FILTER_PROJECTION:
+ {
+ // make sure the offered filter has the correct size
+ assert(_iFilterWidth == iFreqBinCount);
+
+ for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++)
+ {
+ float fValue = _pfHostFilter[iFreqBinIndex];
+
+ for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++)
+ {
+ pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue;
+ pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f;
+ }
+ }
+ uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter);
+ break;
+ }
+ case FILTER_SINOGRAM:
+ {
+ // make sure the offered filter has the correct size
+ assert(_iFilterWidth == iFreqBinCount);
+
+ for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++)
+ {
+ for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++)
+ {
+ float fValue = _pfHostFilter[iFreqBinIndex + iProjectionIndex * _iFilterWidth];
+
+ pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue;
+ pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f;
+ }
+ }
+ uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter);
+ break;
+ }
+ case FILTER_RPROJECTION:
+ {
+ int iProjectionCount = pData->dims.iProjAngles;
+ int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount;
+ float * pfHostRealFilter = new float[iRealFilterElementCount];
+ memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount);
+
+ int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount);
+ int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2;
+ int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth;
+
+ int iFilterShiftSize = _iFilterWidth / 2;
+
+ for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++)
+ {
+ int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount;
+ float fValue = _pfHostFilter[iDetectorIndex];
+
+ for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++)
+ {
+ pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue;
+ }
+ }
+
+ float* pfDevRealFilter = NULL;
+ cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors
+ cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice);
+ delete[] pfHostRealFilter;
+
+ runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);
+
+ cudaFree(pfDevRealFilter);
+
+ break;
+ }
+ case FILTER_RSINOGRAM:
+ {
+ int iProjectionCount = pData->dims.iProjAngles;
+ int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount;
+ float* pfHostRealFilter = new float[iRealFilterElementCount];
+ memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount);
+
+ int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount);
+ int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2;
+ int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth;
+
+ int iFilterShiftSize = _iFilterWidth / 2;
+
+ for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++)
+ {
+ int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount;
+
+ for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++)
+ {
+ float fValue = _pfHostFilter[iDetectorIndex + iProjectionIndex * _iFilterWidth];
+ pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue;
+ }
+ }
+
+ float* pfDevRealFilter = NULL;
+ cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors
+ cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice);
+ delete[] pfHostRealFilter;
+
+ runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);
+
+ cudaFree(pfDevRealFilter);
+
+ break;
+ }
+ default:
+ {
+ fprintf(stderr, "AstraFBP::setFilter: Weird filter type requested");
+ delete [] pHostFilter;
+ return false;
+ }
+ }
+
+ delete [] pHostFilter;
+
+ return true;
+}
+
+BPalgo::BPalgo()
+{
+
+}
+
+BPalgo::~BPalgo()
+{
+
+}
+
+bool BPalgo::init()
+{
+ return true;
+}
+
+bool BPalgo::iterate(unsigned int)
+{
+ // TODO: This zeroVolume makes an earlier memcpy of D_volumeData redundant
+ zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ callBP(D_volumeData, volumePitch, D_sinoData, sinoPitch);
+ return true;
+}
+
+float BPalgo::computeDiffNorm()
+{
+ float *D_projData;
+ unsigned int projPitch;
+
+ allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+
+ cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+ callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
+
+ float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+
+ cudaFree(D_projData);
+
+ return sqrt(s);
+}
+
+
+bool astraCudaFP(const float* pfVolume, float* pfSinogram,
+ unsigned int iVolWidth, unsigned int iVolHeight,
+ unsigned int iProjAngles, unsigned int iProjDets,
+ const float *pfAngles, const float *pfOffsets,
+ float fDetSize, unsigned int iDetSuperSampling,
+ int iGPUIndex)
+{
+ SDimensions dims;
+
+ if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0)
+ return false;
+
+ dims.iProjAngles = iProjAngles;
+ dims.iProjDets = iProjDets;
+ dims.fDetScale = fDetSize;
+
+ if (iDetSuperSampling == 0)
+ return false;
+
+ dims.iRaysPerDet = iDetSuperSampling;
+
+ if (iVolWidth <= 0 || iVolHeight <= 0)
+ return false;
+
+ dims.iVolWidth = iVolWidth;
+ dims.iVolHeight = iVolHeight;
+
+ cudaSetDevice(iGPUIndex);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+
+
+ bool ok;
+
+ float* D_volumeData;
+ unsigned int volumePitch;
+
+ ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ if (!ok)
+ return false;
+
+ float* D_sinoData;
+ unsigned int sinoPitch;
+
+ ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+ if (!ok) {
+ cudaFree(D_volumeData);
+ return false;
+ }
+
+ ok = copyVolumeToDevice(pfVolume, dims.iVolWidth,
+ dims.iVolWidth, dims.iVolHeight,
+ D_volumeData, volumePitch);
+ if (!ok) {
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+ return false;
+ }
+
+ zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+ ok = FP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pfAngles, pfOffsets, 1.0f);
+ if (!ok) {
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+ return false;
+ }
+
+ ok = copySinogramFromDevice(pfSinogram, dims.iProjDets,
+ dims.iProjDets,
+ dims.iProjAngles,
+ D_sinoData, sinoPitch);
+ if (!ok) {
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+ return false;
+ }
+
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+ return true;
+}
+
+bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
+ unsigned int iVolWidth, unsigned int iVolHeight,
+ unsigned int iProjAngles, unsigned int iProjDets,
+ const float *pfAngles, float fOriginSourceDistance,
+ float fOriginDetectorDistance, float fPixelSize,
+ float fDetSize,
+ unsigned int iDetSuperSampling,
+ int iGPUIndex)
+{
+ SDimensions dims;
+
+ if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0)
+ return false;
+
+ dims.iProjAngles = iProjAngles;
+ dims.iProjDets = iProjDets;
+
+ if (iDetSuperSampling == 0)
+ return false;
+
+ dims.iRaysPerDet = iDetSuperSampling;
+
+ if (iVolWidth <= 0 || iVolHeight <= 0)
+ return false;
+
+ dims.iVolWidth = iVolWidth;
+ dims.iVolHeight = iVolHeight;
+
+ cudaSetDevice(iGPUIndex);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+
+
+ bool ok;
+
+ float* D_volumeData;
+ unsigned int volumePitch;
+
+ ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ if (!ok)
+ return false;
+
+ float* D_sinoData;
+ unsigned int sinoPitch;
+
+ ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+ if (!ok) {
+ cudaFree(D_volumeData);
+ return false;
+ }
+
+ ok = copyVolumeToDevice(pfVolume, dims.iVolWidth,
+ dims.iVolWidth, dims.iVolHeight,
+ D_volumeData, volumePitch);
+ if (!ok) {
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+ return false;
+ }
+
+ zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+
+ // TODO: Turn this geometry conversion into a util function
+ SFanProjection* projs = new SFanProjection[dims.iProjAngles];
+
+ float fSrcX0 = 0.0f;
+ float fSrcY0 = -fOriginSourceDistance / fPixelSize;
+ float fDetUX0 = fDetSize / fPixelSize;
+ float fDetUY0 = 0.0f;
+ float fDetSX0 = dims.iProjDets * fDetUX0 / -2.0f;
+ float fDetSY0 = fOriginDetectorDistance / fPixelSize;
+
+#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = f##name##X0 * cos(alpha) - f##name##Y0 * sin(alpha); projs[i].f##name##Y = f##name##X0 * sin(alpha) + f##name##Y0 * cos(alpha); } while(0)
+ for (int i = 0; i < dims.iProjAngles; ++i) {
+ ROTATE0(Src, i, pfAngles[i]);
+ ROTATE0(DetS, i, pfAngles[i]);
+ ROTATE0(DetU, i, pfAngles[i]);
+ }
+
+#undef ROTATE0
+
+ ok = FanFP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, projs, 1.0f);
+ delete[] projs;
+
+ if (!ok) {
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+ return false;
+ }
+
+ ok = copySinogramFromDevice(pfSinogram, dims.iProjDets,
+ dims.iProjDets,
+ dims.iProjAngles,
+ D_sinoData, sinoPitch);
+ if (!ok) {
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+ return false;
+ }
+
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+
+ return true;
+
+}
+
+
+bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
+ unsigned int iVolWidth, unsigned int iVolHeight,
+ unsigned int iProjAngles, unsigned int iProjDets,
+ const SFanProjection *pAngles,
+ unsigned int iDetSuperSampling,
+ int iGPUIndex)
+{
+ SDimensions dims;
+
+ if (iProjAngles == 0 || iProjDets == 0 || pAngles == 0)
+ return false;
+
+ dims.iProjAngles = iProjAngles;
+ dims.iProjDets = iProjDets;
+ dims.fDetScale = 1.0f; // TODO?
+
+ if (iDetSuperSampling == 0)
+ return false;
+
+ dims.iRaysPerDet = iDetSuperSampling;
+
+ if (iVolWidth <= 0 || iVolHeight <= 0)
+ return false;
+
+ dims.iVolWidth = iVolWidth;
+ dims.iVolHeight = iVolHeight;
+
+ cudaSetDevice(iGPUIndex);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+
+
+ bool ok;
+
+ float* D_volumeData;
+ unsigned int volumePitch;
+
+ ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ if (!ok)
+ return false;
+
+ float* D_sinoData;
+ unsigned int sinoPitch;
+
+ ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+ if (!ok) {
+ cudaFree(D_volumeData);
+ return false;
+ }
+
+ ok = copyVolumeToDevice(pfVolume, dims.iVolWidth,
+ dims.iVolWidth, dims.iVolHeight,
+ D_volumeData, volumePitch);
+ if (!ok) {
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+ return false;
+ }
+
+ zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+
+ ok = FanFP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pAngles, 1.0f);
+
+ if (!ok) {
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+ return false;
+ }
+
+ ok = copySinogramFromDevice(pfSinogram, dims.iProjDets,
+ dims.iProjDets,
+ dims.iProjAngles,
+ D_sinoData, sinoPitch);
+ if (!ok) {
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+ return false;
+ }
+
+ cudaFree(D_volumeData);
+ cudaFree(D_sinoData);
+
+ return true;
+
+}
+
+
+}
diff --git a/cuda/2d/astra.h b/cuda/2d/astra.h
new file mode 100644
index 0000000..9e58301
--- /dev/null
+++ b/cuda/2d/astra.h
@@ -0,0 +1,205 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ASTRA_H
+#define _CUDA_ASTRA_H
+
+#include "fft.h"
+#include "fbp_filters.h"
+#include "dims.h"
+#include "algo.h"
+
+using astraCUDA::SFanProjection;
+
+namespace astra {
+
+enum Cuda2DProjectionKernel {
+ ker2d_default = 0
+};
+
+class AstraFBP_internal;
+
+class _AstraExport AstraFBP {
+public:
+ // Constructor
+ AstraFBP();
+
+ // Destructor
+ ~AstraFBP();
+
+ // Set the size of the reconstruction rectangle.
+ // Volume pixels are currently assumed to be 1x1 squares.
+ bool setReconstructionGeometry(unsigned int iVolWidth,
+ unsigned int iVolHeight,
+ float fPixelSize = 1.0f);
+
+ // Set the projection angles and number of detector pixels per angle.
+ // pfAngles must be a float array of length iProjAngles.
+ // fDetSize indicates the size of a detector pixel compared to a
+ // volume pixel edge.
+ //
+ // pfAngles will only be read from during this call.
+ bool setProjectionGeometry(unsigned int iProjAngles,
+ unsigned int iProjDets,
+ const float *pfAngles,
+ float fDetSize = 1.0f);
+
+ // Set linear supersampling factor for the BP.
+ // (The number of rays is the square of this)
+ //
+ // This may optionally be called before init().
+ bool setPixelSuperSampling(unsigned int iPixelSuperSampling);
+
+ // Set per-detector shifts.
+ //
+ // pfTOffsets will only be read from during this call.
+ bool setTOffsets(const float *pfTOffsets);
+
+ // Returns the required size of a filter in the fourier domain
+ // when multiplying it with the fft of the projection data.
+ // Its value is equal to the smallest power of two larger than
+ // or equal to twice the number of detectors in the spatial domain.
+ //
+ // _iDetectorCount is the number of detectors in the spatial domain.
+ static int calcFourierFilterSize(int _iDetectorCount);
+
+ // Sets the filter type. Some filter types require the user to supply an
+ // array containing the filter.
+ // The number of elements in a filter in the fourier domain should be equal
+ // to the value returned by calcFourierFilterSize().
+ // The following types require a filter:
+ //
+ // - FILTER_PROJECTION:
+ // The filter size should be equal to the output of
+ // calcFourierFilterSize(). The filtered sinogram is
+ // multiplied with the supplied filter.
+ //
+ // - FILTER_SINOGRAM:
+ // Same as FILTER_PROJECTION, but now the filter should contain a row for
+ // every projection direction.
+ //
+ // - FILTER_RPROJECTION:
+ // The filter should now contain one kernel (= ifft of filter), with the
+ // peak in the center. The filter width
+ // can be any value. If odd, the peak is assumed to be in the center, if
+ // even, it is assumed to be at floor(filter-width/2).
+ //
+ // - FILTER_RSINOGRAM
+ // Same as FILTER_RPROJECTION, but now the supplied filter should contain a
+ // row for every projection direction.
+ //
+ // A large number of other filters (FILTER_RAMLAK, FILTER_SHEPPLOGAN,
+ // FILTER_COSINE, FILTER_HAMMING, and FILTER_HANN)
+ // have a D variable, which gives the cutoff point in the frequency domain.
+ // Setting this value to 1.0 will include the whole filter
+ bool setFilter(E_FBPFILTER _eFilter,
+ const float * _pfHostFilter = NULL,
+ int _iFilterWidth = 0, float _fD = 1.0f, float _fFilterParameter = -1.0f);
+
+ // Initialize CUDA, allocate GPU buffers and
+ // precompute geometry-specific data.
+ //
+ // CUDA is set up to use GPU number iGPUIndex.
+ //
+ // This must be called after calling setReconstructionGeometry() and
+ // setProjectionGeometry().
+ bool init(int iGPUIndex = 0);
+
+ // Setup input sinogram for a slice.
+ // pfSinogram must be a float array of size iProjAngles*iSinogramPitch.
+ // NB: iSinogramPitch is measured in floats, not in bytes.
+ //
+ // This must be called after init(), and before iterate(). It may be
+ // called again after iterate()/getReconstruction() to start a new slice.
+ //
+ // pfSinogram will only be read from during this call.
+ bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch);
+
+ // Runs an FBP reconstruction.
+ // This must be called after setSinogram().
+ //
+ // run can be called before setFilter, but will then use the default Ram-Lak filter
+ bool run();
+
+ // Get the reconstructed slice.
+ // pfReconstruction must be a float array of size
+ // iVolHeight*iReconstructionPitch.
+ // NB: iReconstructionPitch is measured in floats, not in bytes.
+ //
+ // This may be called after run().
+ bool getReconstruction(float* pfReconstruction,
+ unsigned int iReconstructionPitch) const;
+
+private:
+ AstraFBP_internal* pData;
+};
+
+class _AstraExport BPalgo : public astraCUDA::ReconAlgo {
+public:
+ BPalgo();
+ ~BPalgo();
+
+ virtual bool init();
+
+ virtual bool iterate(unsigned int iterations);
+
+ virtual float computeDiffNorm();
+};
+
+
+
+
+// TODO: Clean up this interface to FP
+
+// Do a single forward projection
+_AstraExport bool astraCudaFP(const float* pfVolume, float* pfSinogram,
+ unsigned int iVolWidth, unsigned int iVolHeight,
+ unsigned int iProjAngles, unsigned int iProjDets,
+ const float *pfAngles, const float *pfOffsets,
+ float fDetSize = 1.0f, unsigned int iDetSuperSampling = 1,
+ int iGPUIndex = 0);
+
+// Do a single forward projection, fan beam
+_AstraExport bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
+ unsigned int iVolWidth, unsigned int iVolHeight,
+ unsigned int iProjAngles, unsigned int iProjDets,
+ const float *pfAngles, float fOriginSourceDistance,
+ float fOriginDetectorDistance, float fPixelSize = 1.0f,
+ float fDetSize = 1.0f,
+ unsigned int iDetSuperSampling = 1,
+ int iGPUIndex = 0);
+
+_AstraExport bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
+ unsigned int iVolWidth, unsigned int iVolHeight,
+ unsigned int iProjAngles, unsigned int iProjDets,
+ const SFanProjection *pAngles,
+ unsigned int iDetSuperSampling = 1,
+ int iGPUIndex = 0);
+
+}
+#endif
diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu
new file mode 100644
index 0000000..5b1cf46
--- /dev/null
+++ b/cuda/2d/cgls.cu
@@ -0,0 +1,304 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "cgls.h"
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+namespace astraCUDA {
+
+CGLS::CGLS() : ReconAlgo()
+{
+ D_z = 0;
+ D_p = 0;
+ D_r = 0;
+ D_w = 0;
+
+ sliceInitialized = false;
+}
+
+
+CGLS::~CGLS()
+{
+ reset();
+}
+
+void CGLS::reset()
+{
+ cudaFree(D_z);
+ cudaFree(D_p);
+ cudaFree(D_r);
+ cudaFree(D_w);
+
+ D_z = 0;
+ D_p = 0;
+ D_r = 0;
+ D_w = 0;
+
+ ReconAlgo::reset();
+}
+
+bool CGLS::init()
+{
+ // Lifetime of z: within an iteration
+ allocateVolume(D_z, dims.iVolWidth+2, dims.iVolHeight+2, zPitch);
+
+ // Lifetime of p: full algorithm
+ allocateVolume(D_p, dims.iVolWidth+2, dims.iVolHeight+2, pPitch);
+
+ // Lifetime of r: full algorithm
+ allocateVolume(D_r, dims.iProjDets+2, dims.iProjAngles, rPitch);
+
+ // Lifetime of w: within an iteration
+ allocateVolume(D_w, dims.iProjDets+2, dims.iProjAngles, wPitch);
+
+ // TODO: check if allocations succeeded
+ return true;
+}
+
+
+bool CGLS::setBuffers(float* _D_volumeData, unsigned int _volumePitch,
+ float* _D_projData, unsigned int _projPitch)
+{
+ bool ok = ReconAlgo::setBuffers(_D_volumeData, _volumePitch,
+ _D_projData, _projPitch);
+
+ if (!ok)
+ return false;
+
+ sliceInitialized = false;
+
+ return true;
+}
+
+bool CGLS::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale,
+ const float* pfReconstruction, unsigned int iReconstructionPitch,
+ const float* pfVolMask, unsigned int iVolMaskPitch,
+ const float* pfSinoMask, unsigned int iSinoMaskPitch)
+{
+ sliceInitialized = false;
+
+ return ReconAlgo::copyDataToGPU(pfSinogram, iSinogramPitch, fSinogramScale, pfReconstruction, iReconstructionPitch, pfVolMask, iVolMaskPitch, pfSinoMask, iSinoMaskPitch);
+}
+
+bool CGLS::iterate(unsigned int iterations)
+{
+ shouldAbort = false;
+
+ if (!sliceInitialized) {
+
+ // copy sinogram
+ cudaMemcpy2D(D_r, sizeof(float)*rPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+
+ // r = sino - A*x
+ if (useVolumeMask) {
+ // Use z as temporary storage here since it is unused
+ cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
+ processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
+ callFP(D_z, zPitch, D_r, rPitch, -1.0f);
+ } else {
+ callFP(D_volumeData, volumePitch, D_r, rPitch, -1.0f);
+ }
+
+
+ // p = A'*r
+ zeroVolume(D_p, pPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ callBP(D_p, pPitch, D_r, rPitch);
+ if (useVolumeMask)
+ processVol<opMul, VOL>(D_p, D_maskData, pPitch, dims.iVolWidth, dims.iVolHeight);
+
+
+ gamma = dotProduct2D(D_p, pPitch, dims.iVolWidth, dims.iVolHeight, 1, 1);
+
+ sliceInitialized = true;
+ }
+
+
+ // iteration
+ for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
+
+ // w = A*p
+ zeroVolume(D_w, wPitch, dims.iProjDets+2, dims.iProjAngles);
+ callFP(D_p, pPitch, D_w, wPitch, 1.0f);
+
+ // alpha = gamma / <w,w>
+ float ww = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+ float alpha = gamma / ww;
+
+ // x += alpha*p
+ processVol<opAddScaled, VOL>(D_volumeData, D_p, alpha, volumePitch, dims.iVolWidth, dims.iVolHeight);
+
+ // r -= alpha*w
+ processVol<opAddScaled, SINO>(D_r, D_w, -alpha, rPitch, dims.iProjDets, dims.iProjAngles);
+
+
+ // z = A'*r
+ zeroVolume(D_z, zPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ callBP(D_z, zPitch, D_r, rPitch);
+ if (useVolumeMask)
+ processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
+
+ float beta = 1.0f / gamma;
+ gamma = dotProduct2D(D_z, zPitch, dims.iVolWidth, dims.iVolHeight, 1, 1);
+ beta *= gamma;
+
+ // p = z + beta*p
+ processVol<opScaleAndAdd, VOL>(D_p, D_z, beta, pPitch, dims.iVolWidth, dims.iVolHeight);
+
+ }
+
+ return true;
+}
+
+
+float CGLS::computeDiffNorm()
+{
+ // We can use w and z as temporary storage here since they're not
+ // used outside of iterations.
+
+ // copy sinogram to w
+ cudaMemcpy2D(D_w, sizeof(float)*wPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+
+ // do FP, subtracting projection from sinogram
+ if (useVolumeMask) {
+ cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
+ processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
+ callFP(D_z, zPitch, D_w, wPitch, -1.0f);
+ } else {
+ callFP(D_volumeData, volumePitch, D_w, wPitch, -1.0f);
+ }
+
+ // compute norm of D_w
+
+ float s = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+
+ return sqrt(s);
+}
+
+bool doCGLS(float* D_volumeData, unsigned int volumePitch,
+ float* D_sinoData, unsigned int sinoPitch,
+ const SDimensions& dims, /*const SAugmentedData& augs,*/
+ const float* angles, const float* TOffsets, unsigned int iterations)
+{
+ CGLS cgls;
+ bool ok = true;
+
+ ok &= cgls.setGeometry(dims, angles);
+#if 0
+ if (D_maskData)
+ ok &= cgls.enableVolumeMask();
+#endif
+ if (TOffsets)
+ ok &= cgls.setTOffsets(TOffsets);
+
+ if (!ok)
+ return false;
+
+ ok = cgls.init();
+ if (!ok)
+ return false;
+
+#if 0
+ if (D_maskData)
+ ok &= cgls.setVolumeMask(D_maskData, maskPitch);
+#endif
+
+ ok &= cgls.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
+ if (!ok)
+ return false;
+
+ ok = cgls.iterate(iterations);
+
+ return ok;
+}
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+int main()
+{
+ float* D_volumeData;
+ float* D_sinoData;
+
+ SDimensions dims;
+ dims.iVolWidth = 1024;
+ dims.iVolHeight = 1024;
+ dims.iProjAngles = 512;
+ dims.iProjDets = 1536;
+ dims.fDetScale = 1.0f;
+ dims.iRaysPerDet = 1;
+ unsigned int volumePitch, sinoPitch;
+
+ allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ printf("pitch: %u\n", volumePitch);
+
+ allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+ zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+ printf("pitch: %u\n", sinoPitch);
+
+ unsigned int y, x;
+ float* sino = loadImage("sino.png", y, x);
+
+ float* img = new float[dims.iVolWidth*dims.iVolHeight];
+
+ copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch);
+
+ float* angle = new float[dims.iProjAngles];
+
+ for (unsigned int i = 0; i < dims.iProjAngles; ++i)
+ angle[i] = i*(M_PI/dims.iProjAngles);
+
+ CGLS cgls;
+
+ cgls.setGeometry(dims, angle);
+ cgls.init();
+
+ cgls.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
+
+ cgls.iterate(25);
+
+ delete[] angle;
+
+ copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+
+ saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
+
+ return 0;
+}
+#endif
diff --git a/cuda/2d/cgls.h b/cuda/2d/cgls.h
new file mode 100644
index 0000000..1013bf8
--- /dev/null
+++ b/cuda/2d/cgls.h
@@ -0,0 +1,92 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_CGLS_H
+#define _CUDA_CGLS_H
+
+#include "util.h"
+#include "algo.h"
+
+namespace astraCUDA {
+
+class _AstraExport CGLS : public ReconAlgo {
+public:
+ CGLS();
+ virtual ~CGLS();
+
+ // disable some features
+ virtual bool enableSinogramMask() { return false; }
+ virtual bool setMinConstraint(float) { return false; }
+ virtual bool setMaxConstraint(float) { return false; }
+
+ virtual bool init();
+
+ virtual bool setBuffers(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch);
+
+ virtual bool copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale,
+ const float* pfReconstruction, unsigned int iReconstructionPitch,
+ const float* pfVolMask, unsigned int iVolMaskPitch,
+ const float* pfSinoMask, unsigned int iSinoMaskPitch);
+
+
+ virtual bool iterate(unsigned int iterations);
+
+ virtual float computeDiffNorm();
+
+protected:
+ void reset();
+
+ bool sliceInitialized;
+
+ // Buffers
+ float* D_r;
+ unsigned int rPitch;
+
+ float* D_w;
+ unsigned int wPitch;
+
+ float* D_z;
+ unsigned int zPitch;
+
+ float* D_p;
+ unsigned int pPitch;
+
+
+ float gamma;
+};
+
+
+_AstraExport bool doCGLS(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const float* angles,
+ const float* TOffsets, unsigned int iterations);
+
+}
+
+#endif
diff --git a/cuda/2d/darthelper.cu b/cuda/2d/darthelper.cu
new file mode 100644
index 0000000..db0036e
--- /dev/null
+++ b/cuda/2d/darthelper.cu
@@ -0,0 +1,358 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "util.h"
+#include "darthelper.h"
+#include <cassert>
+
+namespace astraCUDA {
+
+// CUDA function for the selection of ROI
+__global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+ float x = (float)(threadIdx.x + 16*blockIdx.x);
+ float y = (float)(threadIdx.y + 16*blockIdx.y);
+
+ float w = (width-1.0f)*0.5f;
+ float h = (height-1.0f)*0.5f;
+
+ if ((x-w)*(x-w) + (y-h)*(y-h) > radius * radius * 0.25f)
+ {
+ float* d = (float*)in;
+ unsigned int o = (y+padY)*pitch+x+padX;
+ d[o] = 0.0f;
+ }
+}
+
+void roiSelect(float* out, float radius, unsigned int width, unsigned int height)
+{
+ float* D_data;
+
+ unsigned int pitch;
+ allocateVolume(D_data, width+2, height+2, pitch);
+ copyVolumeToDevice(out, width, width, height, D_data, pitch);
+
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+ devRoiSelect<<<gridSize, blockSize>>>(D_data, radius, pitch, width, height, 1, 1);
+
+ copyVolumeFromDevice(out, width, width, height, D_data, pitch);
+
+ cudaFree(D_data);
+}
+
+
+
+
+// CUDA function for the masking of DART with a radius == 1
+__global__ void devDartMask(float* mask, const float* in, unsigned int conn, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+ // Sacrifice the border pixels to simplify the implementation.
+ if (x > 0 && x < width - 1 && y > 0 && y < height - 1) {
+ float* d = (float*)in;
+ float* m = (float*)mask;
+
+ unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+ unsigned int o1 = o2 - pitch; // On previous row.
+ unsigned int o3 = o2 + pitch; // On next row.
+
+ if ((conn == 8 && // 8-connected
+ (d[o1 - 1] != d[o2] || d[o1] != d[o2] || d[o1 + 1] != d[o2] ||
+ d[o2 - 1] != d[o2] || d[o2 + 1] != d[o2] ||
+ d[o3 - 1] != d[o2] || d[o3] != d[o2] || d[o3 + 1] != d[o2] ))
+ ||
+ (conn == 4 && // 4-connected
+ ( d[o1] != d[o2] ||
+ d[o2 - 1] != d[o2] || d[o3 + 1] != d[o2] ||
+ d[o3] != d[o2] )))
+ {
+ m[o2] = 1.0f;
+ }
+ }
+}
+
+
+// CUDA function for the masking of DART with a radius > 1
+__global__ void devDartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+ // Sacrifice the border pixels to simplify the implementation.
+ if (x > radius-1 && x < width - radius && y > radius-1 && y < height - radius)
+ {
+ float* d = (float*)in;
+ float* m = (float*)mask;
+
+ int r = radius;
+
+ // o2: index of the current center pixel
+ int o2 = (y+padY)*pitch+x+padX;
+
+ if (conn == 8) // 8-connected
+ {
+ for (int row = -r; row <= r; row++)
+ {
+ int o1 = (y+padY+row)*pitch+x+padX;
+ for (int col = -r; col <= r; col++)
+ {
+ if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;}
+ }
+ }
+ }
+ else if (conn == 4) // 4-connected
+ {
+ // horizontal
+ unsigned int o1 = (y+padY)*pitch+x+padX;
+ for (int col = -r; col <= r; col++)
+ {
+ if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;}
+ }
+
+ // vertical
+ for (int row = -r; row <= r; row++)
+ {
+ unsigned int o1 = (y+padY+row)*pitch+x+padX;
+ if (d[o1] != d[o2]) {m[o2] = 1.0f; return;}
+ }
+ }
+ }
+}
+
+
+// CUDA function for the masking of ADART with a radius == 1
+__global__ void devADartMask(float* mask, const float* in, unsigned int conn, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+ // Sacrifice the border pixels to simplify the implementation.
+ if (x > 0 && x < width - 1 && y > 0 && y < height - 1) {
+ float* d = (float*)in;
+ float* m = (float*)mask;
+
+ unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+ unsigned int o1 = o2 - pitch; // On previous row.
+ unsigned int o3 = o2 + pitch; // On next row.
+
+ if (conn == 8)
+ {
+ if (d[o1 - 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ if (d[o1 ] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ if (d[o1 + 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ if (d[o2 - 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ if (d[o2 + 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ if (d[o3 - 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ if (d[o3 ] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ if (d[o3 + 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ }
+ else if (conn == 4)
+ {
+ if (d[o1 ] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ if (d[o2 - 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ if (d[o2 + 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ if (d[o3 ] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ }
+ }
+}
+
+
+// CUDA function for the masking of ADART with a radius > 1
+__global__ void devADartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+ // Sacrifice the border pixels to simplify the implementation.
+ if (x > radius-1 && x < width - radius && y > radius-1 && y < height - radius)
+ {
+ float* d = (float*)in;
+ float* m = (float*)mask;
+
+ int r = radius;
+
+ unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+
+ if (conn == 8)
+ {
+ for (int row = -r; row <= r; row++)
+ {
+ unsigned int o1 = (y+padY+row)*pitch+x+padX;
+ for (int col = -r; col <= r; col++)
+ {
+ if (d[o1+col] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ }
+ }
+ }
+ else if (conn == 4)
+ {
+ // horizontal
+ for (int col = -r; col <= r; col++)
+ {
+ if (d[o2+col] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ }
+
+ // vertical
+ for (int row = -r; row <= r; row++)
+ {
+ unsigned int o1 = (y+padY+row)*pitch+x+padX;
+ if (d[o1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+ }
+ }
+ }
+}
+
+
+void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int width, unsigned int height)
+{
+ float* D_segmentationData;
+ float* D_maskData;
+
+ unsigned int pitch;
+ allocateVolume(D_segmentationData, width+2, height+2, pitch);
+ copyVolumeToDevice(segmentation, width, width, height, D_segmentationData, pitch);
+
+ allocateVolume(D_maskData, width+2, height+2, pitch);
+ zeroVolume(D_maskData, pitch, width+2, height+2);
+
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+
+ if (threshold == 1 && radius == 1)
+ devDartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, pitch, width, height, 1, 1);
+ else if (threshold > 1 && radius == 1)
+ devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height, 1, 1);
+ else if (threshold == 1 && radius > 1)
+ devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height, 1, 1);
+ else
+ devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height, 1, 1);
+
+ copyVolumeFromDevice(mask, width, width, height, D_maskData, pitch);
+
+ cudaFree(D_segmentationData);
+ cudaFree(D_maskData);
+
+}
+
+
+__global__ void devDartSmoothingRadius(float* out, const float* in, float b, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+ // Sacrifice the border pixels to simplify the implementation.
+ if (x > radius-1 && x < width - radius && y > radius-1 && y < height - radius)
+ {
+ float* d = (float*)in;
+ float* m = (float*)out;
+
+ unsigned int o2 = (y+padY)*pitch+x+padX;
+ int r = radius;
+ float res = -d[o2];
+
+ for (int row = -r; row < r; row++)
+ {
+ unsigned int o1 = (y+padY+row)*pitch+x+padX;
+ for (int col = -r; col <= r; col++)
+ {
+ res += d[o1+col];
+ }
+ }
+
+ res *= b / 4*r*(r+1);
+ res += (1.0f-b) * d[o2];
+
+ m[o2] = res;
+ }
+}
+
+
+__global__ void devDartSmoothing(float* out, const float* in, float b, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+ // Sacrifice the border pixels to simplify the implementation.
+ if (x > 0 && x < width - 1 && y > 0 && y < height - 1) {
+ float* d = (float*)in;
+ float* m = (float*)out;
+
+ unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+ unsigned int o1 = o2 - pitch; // On previous row.
+ unsigned int o3 = o2 + pitch; // On next row.
+
+ m[o2] = (1.0f-b) * d[o2] + b * 0.125f * (d[o1 - 1] + d[o1] + d[o1 + 1] + d[o2 - 1] + d[o2 + 1] + d[o3 - 1] + d[o3] + d[o3 + 1]);
+ }
+}
+
+
+void dartSmoothing(float* out, const float* in, float b, unsigned int radius, unsigned int width, unsigned int height)
+{
+ float* D_inData;
+ float* D_outData;
+
+ unsigned int pitch;
+ allocateVolume(D_inData, width+2, height+2, pitch);
+ copyVolumeToDevice(in, width, width, height, D_inData, pitch);
+
+ allocateVolume(D_outData, width+2, height+2, pitch);
+ zeroVolume(D_outData, pitch, width+2, height+2);
+
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+ if (radius == 1)
+ devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, pitch, width, height, 1, 1);
+ else
+ devDartSmoothingRadius<<<gridSize, blockSize>>>(D_outData, D_inData, b, radius, pitch, width, height, 1, 1);
+
+ copyVolumeFromDevice(out, width, width, height, D_outData, pitch);
+
+ cudaFree(D_outData);
+ cudaFree(D_inData);
+
+}
+
+
+
+bool setGPUIndex(int iGPUIndex)
+{
+ cudaSetDevice(iGPUIndex);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+
+ return true;
+}
+
+
+}
diff --git a/cuda/2d/darthelper.h b/cuda/2d/darthelper.h
new file mode 100644
index 0000000..e05f01e
--- /dev/null
+++ b/cuda/2d/darthelper.h
@@ -0,0 +1,44 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ARITH2_H
+#define _CUDA_ARITH2_H
+
+#include <cuda.h>
+
+namespace astraCUDA {
+
+ void roiSelect(float* out, float radius, unsigned int width, unsigned int height);
+ void dartMask(float* out, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int width, unsigned int height);
+ void dartSmoothing(float* out, const float* in, float b, unsigned int radius, unsigned int width, unsigned int height);
+
+ bool setGPUIndex(int index);
+
+}
+
+#endif
diff --git a/cuda/2d/dataop.cu b/cuda/2d/dataop.cu
new file mode 100644
index 0000000..68573b2
--- /dev/null
+++ b/cuda/2d/dataop.cu
@@ -0,0 +1,130 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "util.h"
+#include "dataop.h"
+#include "arith.h"
+#include <cassert>
+
+namespace astraCUDA {
+
+void operationVolumeMult(float* data1, float* data2, unsigned int width, unsigned int height)
+{
+ float* D_data1;
+ float* D_data2;
+
+ unsigned int pitch;
+ allocateVolume(D_data1, width+2, height+2, pitch);
+ copyVolumeToDevice(data1, width, width, height, D_data1, pitch);
+
+ allocateVolume(D_data2, width+2, height+2, pitch);
+ copyVolumeToDevice(data2, width, width, height, D_data2, pitch);
+
+ processVol<opMul, VOL>(D_data1, D_data2, pitch, width, height);
+
+ copyVolumeFromDevice(data1, width, width, height, D_data1, pitch);
+
+ cudaFree(D_data1);
+ cudaFree(D_data2);
+}
+
+void operationVolumeMultScalarMask(float* data, float* mask, float scalar, unsigned int width, unsigned int height)
+{
+ float* D_data;
+ float* D_mask;
+
+ unsigned int pitch;
+ allocateVolume(D_data, width+2, height+2, pitch);
+ copyVolumeToDevice(data, width, width, height, D_data, pitch);
+
+ allocateVolume(D_mask, width+2, height+2, pitch);
+ copyVolumeToDevice(mask, width, width, height, D_mask, pitch);
+
+ processVol<opMulMask, VOL>(D_data, D_mask, scalar, pitch, width, height);
+
+ copyVolumeFromDevice(data, width, width, height, D_data, pitch);
+
+ cudaFree(D_data);
+ cudaFree(D_mask);
+}
+
+
+void operationVolumeMultScalar(float* data, float scalar, unsigned int width, unsigned int height)
+{
+ float* D_data;
+
+ unsigned int pitch;
+ allocateVolume(D_data, width+2, height+2, pitch);
+ copyVolumeToDevice(data, width, width, height, D_data, pitch);
+
+ processVol<opMul, VOL>(D_data, scalar, pitch, width, height);
+
+ copyVolumeFromDevice(data, width, width, height, D_data, pitch);
+
+ cudaFree(D_data);
+}
+
+
+void operationVolumeAdd(float* data1, float* data2, unsigned int width, unsigned int height)
+{
+ float* D_data1;
+ float* D_data2;
+
+ unsigned int pitch;
+ allocateVolume(D_data1, width+2, height+2, pitch);
+ copyVolumeToDevice(data1, width, width, height, D_data1, pitch);
+
+ allocateVolume(D_data2, width+2, height+2, pitch);
+ copyVolumeToDevice(data2, width, width, height, D_data2, pitch);
+
+ processVol<opAdd, VOL>(D_data1, D_data2, pitch, width, height);
+
+ copyVolumeFromDevice(data1, width, width, height, D_data1, pitch);
+
+ cudaFree(D_data1);
+ cudaFree(D_data2);
+}
+
+
+void operationVolumeAddScalar(float* data, float scalar, unsigned int width, unsigned int height)
+{
+ float* D_data;
+
+ unsigned int pitch;
+ allocateVolume(D_data, width+2, height+2, pitch);
+ copyVolumeToDevice(data, width, width, height, D_data, pitch);
+
+ processVol<opAdd, VOL>(D_data, scalar, pitch, width, height);
+
+ copyVolumeFromDevice(data, width, width, height, D_data, pitch);
+
+ cudaFree(D_data);
+}
+
+
+}
diff --git a/cuda/2d/dataop.h b/cuda/2d/dataop.h
new file mode 100644
index 0000000..3e9c7e2
--- /dev/null
+++ b/cuda/2d/dataop.h
@@ -0,0 +1,47 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_DATAOP_H
+#define _CUDA_DATAOP_H
+
+#include <cuda.h>
+
+namespace astraCUDA {
+
+ void operationVolumeMult(float* data1, float* data2, unsigned int width, unsigned int height);
+
+ void operationVolumeMultScalar(float* data, float scalar, unsigned int width, unsigned int height);
+ void operationVolumeMultScalarMask(float* data, float* mask, float scalar, unsigned int width, unsigned int height);
+
+ void operationVolumeAdd(float* data1, float* data2, unsigned int width, unsigned int height);
+
+ void operationVolumeAddScalar(float* data, float scalar, unsigned int width, unsigned int height);
+
+}
+
+#endif
diff --git a/cuda/2d/dims.h b/cuda/2d/dims.h
new file mode 100644
index 0000000..21ccb31
--- /dev/null
+++ b/cuda/2d/dims.h
@@ -0,0 +1,68 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_DIMS_H
+#define _CUDA_DIMS_H
+
+namespace astraCUDA {
+
+struct SFanProjection {
+ // the source
+ float fSrcX, fSrcY;
+
+ // the start of the (linear) detector
+ float fDetSX, fDetSY;
+
+ // the length of a single detector pixel
+ float fDetUX, fDetUY;
+};
+
+
+struct SDimensions {
+ unsigned int iVolWidth;
+ unsigned int iVolHeight;
+ unsigned int iProjAngles;
+ unsigned int iProjDets;
+ float fDetScale; // size of detector compared to volume pixels
+ unsigned int iRaysPerDet;
+ unsigned int iRaysPerPixelDim;
+};
+
+struct SDimensions3D {
+ unsigned int iVolX;
+ unsigned int iVolY;
+ unsigned int iVolZ;
+ unsigned int iProjAngles;
+ unsigned int iProjU; // number of detectors in the U direction
+ unsigned int iProjV; // number of detectors in the V direction
+};
+
+}
+
+#endif
+
diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu
new file mode 100644
index 0000000..74d1bbf
--- /dev/null
+++ b/cuda/2d/em.cu
@@ -0,0 +1,262 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "em.h"
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+namespace astraCUDA {
+
+
+// TODO: ensure non-negativity somewhere??
+
+
+EM::EM()
+{
+ D_projData = 0;
+ D_tmpData = 0;
+ D_pixelWeight = 0;
+
+}
+
+
+EM::~EM()
+{
+ reset();
+}
+
+void EM::reset()
+{
+ cudaFree(D_projData);
+ cudaFree(D_tmpData);
+ cudaFree(D_pixelWeight);
+
+ D_projData = 0;
+ D_tmpData = 0;
+ D_pixelWeight = 0;
+
+ ReconAlgo::reset();
+}
+
+
+bool EM::init()
+{
+ allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch);
+ zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+ allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+ allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+ zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+
+ // We can't precompute pixelWeights when using a volume mask
+#if 0
+ if (!useVolumeMask)
+#endif
+ precomputeWeights();
+
+ // TODO: check if allocations succeeded
+ return true;
+}
+
+bool EM::precomputeWeights()
+{
+ zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+#if 0
+ if (useSinogramMask) {
+ callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch);
+ } else
+#endif
+ {
+ processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles);
+ callBP(D_pixelWeight, pixelPitch, D_projData, projPitch);
+ }
+ processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+
+#if 0
+ if (useVolumeMask) {
+ // scale pixel weights with mask to zero out masked pixels
+ processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+ }
+#endif
+
+ return true;
+}
+
+bool EM::iterate(unsigned int iterations)
+{
+ shouldAbort = false;
+
+#if 0
+ if (useVolumeMask)
+ precomputeWeights();
+#endif
+
+ // iteration
+ for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
+
+ // Do FP of volumeData
+ zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+ callFP(D_volumeData, volumePitch, D_projData, projPitch, 1.0f);
+
+ // Divide sinogram by FP (into projData)
+ processVol<opDividedBy, SINO>(D_projData, D_sinoData, projPitch, dims.iProjDets, dims.iProjAngles);
+
+ // Do BP of projData into tmpData
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ callBP(D_tmpData, tmpPitch, D_projData, projPitch);
+
+ // Multiply volumeData with tmpData divided by pixel weights
+ processVol<opMul2, VOL>(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+
+ }
+
+ return true;
+}
+
+float EM::computeDiffNorm()
+{
+ // copy sinogram to projection data
+ cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+
+ // do FP, subtracting projection from sinogram
+ if (useVolumeMask) {
+ cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
+ processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
+ } else {
+ callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
+ }
+
+
+ // compute norm of D_projData
+
+ float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+
+ return sqrt(s);
+}
+
+
+bool doEM(float* D_volumeData, unsigned int volumePitch,
+ float* D_sinoData, unsigned int sinoPitch,
+ const SDimensions& dims, const float* angles,
+ const float* TOffsets, unsigned int iterations)
+{
+ EM em;
+ bool ok = true;
+
+ ok &= em.setGeometry(dims, angles);
+ if (TOffsets)
+ ok &= em.setTOffsets(TOffsets);
+
+ if (!ok)
+ return false;
+
+ ok = em.init();
+ if (!ok)
+ return false;
+
+ ok &= em.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
+ if (!ok)
+ return false;
+
+ ok = em.iterate(iterations);
+
+ return ok;
+}
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+int main()
+{
+ float* D_volumeData;
+ float* D_sinoData;
+
+ SDimensions dims;
+ dims.iVolWidth = 1024;
+ dims.iVolHeight = 1024;
+ dims.iProjAngles = 512;
+ dims.iProjDets = 1536;
+ dims.fDetScale = 1.0f;
+ dims.iRaysPerDet = 1;
+ unsigned int volumePitch, sinoPitch;
+
+ allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ printf("pitch: %u\n", volumePitch);
+
+ allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+ zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+ printf("pitch: %u\n", sinoPitch);
+
+ unsigned int y, x;
+ float* sino = loadImage("sino.png", y, x);
+
+ float* img = new float[dims.iVolWidth*dims.iVolHeight];
+
+ copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch);
+
+ float* angle = new float[dims.iProjAngles];
+
+ for (unsigned int i = 0; i < dims.iProjAngles; ++i)
+ angle[i] = i*(M_PI/dims.iProjAngles);
+
+ EM em;
+
+ em.setGeometry(dims, angle);
+ em.init();
+
+ // TODO: Initialize D_volumeData with an unfiltered backprojection
+
+ em.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
+
+ em.iterate(25);
+
+
+ delete[] angle;
+
+ copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+
+ saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
+
+ return 0;
+}
+
+#endif
diff --git a/cuda/2d/em.h b/cuda/2d/em.h
new file mode 100644
index 0000000..5a9ffed
--- /dev/null
+++ b/cuda/2d/em.h
@@ -0,0 +1,77 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_EM_H
+#define _CUDA_EM_H
+
+#include "util.h"
+#include "algo.h"
+
+namespace astraCUDA {
+
+class _AstraExport EM : public ReconAlgo {
+public:
+ EM();
+ virtual ~EM();
+
+ // disable some features
+ virtual bool enableSinogramMask() { return false; }
+ virtual bool enableVolumeMask() { return false; }
+ virtual bool setMinConstraint(float) { return false; }
+ virtual bool setMaxConstraint(float) { return false; }
+
+ virtual bool init();
+
+ virtual bool iterate(unsigned int iterations);
+
+ virtual float computeDiffNorm();
+
+protected:
+ void reset();
+ bool precomputeWeights();
+
+ // Temporary buffers
+ float* D_projData;
+ unsigned int projPitch;
+
+ float* D_tmpData;
+ unsigned int tmpPitch;
+
+ // Geometry-specific precomputed data
+ float* D_pixelWeight;
+ unsigned int pixelPitch;
+};
+
+_AstraExport bool doEM(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const float* angles,
+ const float* TOffsets, unsigned int iterations);
+
+}
+
+#endif
diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu
new file mode 100644
index 0000000..1edc6d9
--- /dev/null
+++ b/cuda/2d/fan_bp.cu
@@ -0,0 +1,374 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+#define PIXELTRACE
+
+
+typedef texture<float, 2, cudaReadModeElementType> texture2D;
+
+static texture2D gT_FanProjTexture;
+
+
+namespace astraCUDA {
+
+const unsigned int g_anglesPerBlock = 16;
+const unsigned int g_blockSliceSize = 32;
+const unsigned int g_blockSlices = 16;
+
+const unsigned int g_MaxAngles = 2560;
+
+__constant__ float gC_SrcX[g_MaxAngles];
+__constant__ float gC_SrcY[g_MaxAngles];
+__constant__ float gC_DetSX[g_MaxAngles];
+__constant__ float gC_DetSY[g_MaxAngles];
+__constant__ float gC_DetUX[g_MaxAngles];
+__constant__ float gC_DetUY[g_MaxAngles];
+
+
+static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+
+ gT_FanProjTexture.addressMode[0] = cudaAddressModeClamp;
+ gT_FanProjTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_FanProjTexture.filterMode = cudaFilterModeLinear;
+ gT_FanProjTexture.normalized = false;
+
+ cudaBindTexture2D(0, gT_FanProjTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch);
+
+ // TODO: error value?
+
+ return true;
+}
+
+__global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims)
+{
+ const int relX = threadIdx.x;
+ const int relY = threadIdx.y;
+
+ int endAngle = startAngle + g_anglesPerBlock;
+ if (endAngle > dims.iProjAngles)
+ endAngle = dims.iProjAngles;
+ const int X = blockIdx.x * g_blockSlices + relX;
+ const int Y = blockIdx.y * g_blockSliceSize + relY;
+
+ if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
+ return;
+
+ const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f );
+ const float fY = - ( Y - 0.5f*dims.iVolHeight + 0.5f );
+
+ float* volData = (float*)D_volData;
+
+ float fVal = 0.0f;
+ float fA = startAngle + 0.5f;
+
+ // TODO: Distance correction?
+
+ for (int angle = startAngle; angle < endAngle; ++angle)
+ {
+ const float fSrcX = gC_SrcX[angle];
+ const float fSrcY = gC_SrcY[angle];
+ const float fDetSX = gC_DetSX[angle];
+ const float fDetSY = gC_DetSY[angle];
+ const float fDetUX = gC_DetUX[angle];
+ const float fDetUY = gC_DetUY[angle];
+
+ const float fXD = fSrcX - fX;
+ const float fYD = fSrcY - fY;
+
+ const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
+ const float fDen = fDetUX * fYD - fDetUY * fXD;
+
+ const float fT = fNum / fDen + 1.0f;
+ fVal += tex2D(gT_FanProjTexture, fT, fA);
+ fA += 1.0f;
+ }
+
+ volData[(Y+1)*volPitch+X+1] += fVal;
+}
+
+// supersampling version
+__global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims)
+{
+ const int relX = threadIdx.x;
+ const int relY = threadIdx.y;
+
+ int endAngle = startAngle + g_anglesPerBlock;
+ if (endAngle > dims.iProjAngles)
+ endAngle = dims.iProjAngles;
+ const int X = blockIdx.x * g_blockSlices + relX;
+ const int Y = blockIdx.y * g_blockSliceSize + relY;
+
+ if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
+ return;
+
+ const float fXb = ( X - 0.5f*dims.iVolWidth + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim);
+ const float fYb = - ( Y - 0.5f*dims.iVolHeight + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim);
+
+ const float fSubStep = 1.0f/dims.iRaysPerPixelDim;
+
+ float* volData = (float*)D_volData;
+
+ float fVal = 0.0f;
+ float fA = startAngle + 0.5f;
+
+ // TODO: Distance correction?
+
+ for (int angle = startAngle; angle < endAngle; ++angle)
+ {
+ const float fSrcX = gC_SrcX[angle];
+ const float fSrcY = gC_SrcY[angle];
+ const float fDetSX = gC_DetSX[angle];
+ const float fDetSY = gC_DetSY[angle];
+ const float fDetUX = gC_DetUX[angle];
+ const float fDetUY = gC_DetUY[angle];
+
+ // TODO: Optimize these loops...
+ float fX = fXb;
+ for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) {
+ float fY = fYb;
+ for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) {
+ const float fXD = fSrcX - fX;
+ const float fYD = fSrcY - fY;
+
+ const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
+ const float fDen = fDetUX * fYD - fDetUY * fXD;
+
+ const float fT = fNum / fDen + 1.0f;
+ fVal += tex2D(gT_FanProjTexture, fT, fA);
+ fY -= fSubStep;
+ }
+ fX += fSubStep;
+ }
+ fA += 1.0f;
+ }
+
+ volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
+}
+
+
+// BP specifically for SART.
+// It includes (free) weighting with voxel weight.
+// It assumes the proj texture is set up _without_ padding, unlike regular BP.
+__global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDimensions dims)
+{
+ const int relX = threadIdx.x;
+ const int relY = threadIdx.y;
+
+ const int X = blockIdx.x * g_blockSlices + relX;
+ const int Y = blockIdx.y * g_blockSliceSize + relY;
+
+ if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
+ return;
+
+ const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f );
+ const float fY = - ( Y - 0.5f*dims.iVolHeight + 0.5f );
+
+ float* volData = (float*)D_volData;
+
+ // TODO: Distance correction?
+
+ // TODO: Constant memory vs parameters.
+ const float fSrcX = gC_SrcX[0];
+ const float fSrcY = gC_SrcY[0];
+ const float fDetSX = gC_DetSX[0];
+ const float fDetSY = gC_DetSY[0];
+ const float fDetUX = gC_DetUX[0];
+ const float fDetUY = gC_DetUY[0];
+
+ const float fXD = fSrcX - fX;
+ const float fYD = fSrcY - fY;
+
+ const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
+ const float fDen = fDetUX * fYD - fDetUY * fXD;
+
+ const float fT = fNum / fDen;
+ const float fVal = tex2D(gT_FanProjTexture, fT, 0.5f);
+
+ volData[(Y+1)*volPitch+X+1] += fVal;
+}
+
+
+bool FanBP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const SFanProjection* angles)
+{
+ // TODO: process angles block by block
+ assert(dims.iProjAngles <= g_MaxAngles);
+
+ bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+
+ // transfer angles to constant memory
+ float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+ TRANSFER_TO_CONSTANT(SrcX);
+ TRANSFER_TO_CONSTANT(SrcY);
+ TRANSFER_TO_CONSTANT(DetSX);
+ TRANSFER_TO_CONSTANT(DetSY);
+ TRANSFER_TO_CONSTANT(DetUX);
+ TRANSFER_TO_CONSTANT(DetUY);
+
+#undef TRANSFER_TO_CONSTANT
+
+ delete[] tmp;
+
+ dim3 dimBlock(g_blockSlices, g_blockSliceSize);
+ dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
+ (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
+
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+
+ for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
+ if (dims.iRaysPerPixelDim > 1)
+ devFanBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims);
+ else
+ devFanBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims);
+ }
+ cudaThreadSynchronize();
+
+ cudaTextForceKernelsCompletion();
+
+ cudaStreamDestroy(stream);
+
+ return true;
+}
+
+
+// D_projData is a pointer to one padded sinogram line
+bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ unsigned int angle,
+ const SDimensions& dims, const SFanProjection* angles)
+{
+ // only one angle
+ bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1);
+
+ // transfer angle to constant memory
+#define TRANSFER_TO_CONSTANT(name) do { cudaMemcpyToSymbol(gC_##name, &(angles[angle].f##name), sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+ TRANSFER_TO_CONSTANT(SrcX);
+ TRANSFER_TO_CONSTANT(SrcY);
+ TRANSFER_TO_CONSTANT(DetSX);
+ TRANSFER_TO_CONSTANT(DetSY);
+ TRANSFER_TO_CONSTANT(DetUX);
+ TRANSFER_TO_CONSTANT(DetUY);
+
+#undef TRANSFER_TO_CONSTANT
+
+ dim3 dimBlock(g_blockSlices, g_blockSliceSize);
+ dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
+ (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
+
+ devFanBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, dims);
+ cudaThreadSynchronize();
+
+ cudaTextForceKernelsCompletion();
+
+ return true;
+}
+
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+int main()
+{
+ float* D_volumeData;
+ float* D_projData;
+
+ SDimensions dims;
+ dims.iVolWidth = 128;
+ dims.iVolHeight = 128;
+ dims.iProjAngles = 180;
+ dims.iProjDets = 256;
+ dims.fDetScale = 1.0f;
+ dims.iRaysPerDet = 1;
+ unsigned int volumePitch, projPitch;
+
+ SFanProjection projs[180];
+
+ projs[0].fSrcX = 0.0f;
+ projs[0].fSrcY = 1536.0f;
+ projs[0].fDetSX = 128.0f;
+ projs[0].fDetSY = -512.0f;
+ projs[0].fDetUX = -1.0f;
+ projs[0].fDetUY = 0.0f;
+
+#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = projs[0].f##name##X * cos(alpha) - projs[0].f##name##Y * sin(alpha); projs[i].f##name##Y = projs[0].f##name##X * sin(alpha) + projs[0].f##name##Y * cos(alpha); } while(0)
+
+ for (int i = 1; i < 180; ++i) {
+ ROTATE0(Src, i, i*2*M_PI/180);
+ ROTATE0(DetS, i, i*2*M_PI/180);
+ ROTATE0(DetU, i, i*2*M_PI/180);
+ }
+
+#undef ROTATE0
+
+ allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ printf("pitch: %u\n", volumePitch);
+
+ allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+ printf("pitch: %u\n", projPitch);
+
+ unsigned int y, x;
+ float* sino = loadImage("sino.png", y, x);
+
+ float* img = new float[dims.iVolWidth*dims.iVolHeight];
+
+ memset(img, 0, dims.iVolWidth*dims.iVolHeight*sizeof(float));
+
+ copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+ copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+ FanBP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs);
+
+ copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+
+ saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
+
+ return 0;
+}
+#endif
diff --git a/cuda/2d/fan_bp.h b/cuda/2d/fan_bp.h
new file mode 100644
index 0000000..a4e62be
--- /dev/null
+++ b/cuda/2d/fan_bp.h
@@ -0,0 +1,45 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_FAN_BP_H
+#define _CUDA_FAN_BP_H
+
+namespace astraCUDA {
+
+_AstraExport bool FanBP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const SFanProjection* angles);
+
+_AstraExport bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ unsigned int angle,
+ const SDimensions& dims, const SFanProjection* angles);
+
+}
+
+#endif
diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu
new file mode 100644
index 0000000..cf9f352
--- /dev/null
+++ b/cuda/2d/fan_fp.cu
@@ -0,0 +1,370 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+
+typedef texture<float, 2, cudaReadModeElementType> texture2D;
+
+static texture2D gT_FanVolumeTexture;
+
+
+namespace astraCUDA {
+
+static const unsigned g_MaxAngles = 2560;
+__constant__ float gC_SrcX[g_MaxAngles];
+__constant__ float gC_SrcY[g_MaxAngles];
+__constant__ float gC_DetSX[g_MaxAngles];
+__constant__ float gC_DetSY[g_MaxAngles];
+__constant__ float gC_DetUX[g_MaxAngles];
+__constant__ float gC_DetUY[g_MaxAngles];
+
+
+// optimization parameters
+static const unsigned int g_anglesPerBlock = 16;
+static const unsigned int g_detBlockSize = 32;
+static const unsigned int g_blockSlices = 64;
+
+static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ dataArray = 0;
+ cudaMallocArray(&dataArray, &channelDesc, width, height);
+ cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice);
+
+ gT_FanVolumeTexture.addressMode[0] = cudaAddressModeClamp;
+ gT_FanVolumeTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_FanVolumeTexture.filterMode = cudaFilterModeLinear;
+ gT_FanVolumeTexture.normalized = false;
+
+ // TODO: For very small sizes (roughly <=512x128) with few angles (<=180)
+ // not using an array is more efficient.
+ //cudaBindTexture2D(0, gT_FanVolumeTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch);
+ cudaBindTextureToArray(gT_FanVolumeTexture, dataArray, channelDesc);
+
+ // TODO: error value?
+
+ return true;
+}
+
+// projection for angles that are roughly horizontal
+// (detector roughly vertical)
+__global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+{
+ float* projData = (float*)D_projData;
+ const int relDet = threadIdx.x;
+ const int relAngle = threadIdx.y;
+
+ const int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+ if (angle >= endAngle)
+ return;
+
+ const int detector = blockIdx.y * g_detBlockSize + relDet;
+
+ if (detector < 0 || detector >= dims.iProjDets)
+ return;
+
+ const float fSrcX = gC_SrcX[angle];
+ const float fSrcY = gC_SrcY[angle];
+ const float fDetSX = gC_DetSX[angle];
+ const float fDetSY = gC_DetSY[angle];
+ const float fDetUX = gC_DetUX[angle];
+ const float fDetUY = gC_DetUY[angle];
+
+ float fVal = 0.0f;
+
+ const float fdx = fabsf(fDetSX + detector*fDetUX + 0.5f - fSrcX);
+ const float fdy = fabsf(fDetSY + detector*fDetUY + 0.5f - fSrcY);
+
+ if (fdy > fdx)
+ return;
+
+
+ for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+ const float fDet = detector + (0.5f + iSubT) / dims.iRaysPerDet;
+
+ const float fDetX = fDetSX + fDet * fDetUX;
+ const float fDetY = fDetSY + fDet * fDetUY;
+
+ // ray: y = alpha * x + beta
+ const float alpha = (fSrcY - fDetY) / (fSrcX - fDetX);
+ const float beta = fSrcY - alpha * fSrcX;
+
+ const float fDistCorr = sqrt(alpha*alpha+1.0f) * outputScale / dims.iRaysPerDet;
+
+ // intersect ray with first slice
+
+ float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
+ float fX = startSlice + 1.5f;
+
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > dims.iVolWidth)
+ endSlice = dims.iVolWidth;
+
+ float fV = 0.0f;
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ fV += tex2D(gT_FanVolumeTexture, fX, fY);
+ fY -= alpha;
+ fX += 1.0f;
+ }
+
+ fVal += fV * fDistCorr;
+
+ }
+
+ projData[angle*projPitch+detector+1] += fVal;
+}
+
+
+// projection for angles that are roughly vertical
+// (detector roughly horizontal)
+__global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+{
+ const int relDet = threadIdx.x;
+ const int relAngle = threadIdx.y;
+
+ const int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+ if (angle >= endAngle)
+ return;
+
+ const int detector = blockIdx.y * g_detBlockSize + relDet;
+
+ if (detector < 0 || detector >= dims.iProjDets)
+ return;
+
+ float* projData = (float*)D_projData;
+
+ const float fSrcX = gC_SrcX[angle];
+ const float fSrcY = gC_SrcY[angle];
+ const float fDetSX = gC_DetSX[angle];
+ const float fDetSY = gC_DetSY[angle];
+ const float fDetUX = gC_DetUX[angle];
+ const float fDetUY = gC_DetUY[angle];
+
+ float fVal = 0.0f;
+
+ const float fdx = fabsf(fDetSX + detector*fDetUX + 0.5f - fSrcX);
+ const float fdy = fabsf(fDetSY + detector*fDetUY + 0.5f - fSrcY);
+
+ if (fdy <= fdx)
+ return;
+
+
+ for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+ const float fDet = detector + (0.5f + iSubT) / dims.iRaysPerDet /*- gC_angle_offset[angle]*/;
+
+ const float fDetX = fDetSX + fDet * fDetUX;
+ const float fDetY = fDetSY + fDet * fDetUY;
+
+ // ray: x = alpha * y + beta
+ const float alpha = (fSrcX - fDetX) / (fSrcY - fDetY);
+ const float beta = fSrcX - alpha * fSrcY;
+
+ const float fDistCorr = sqrt(alpha*alpha+1) * outputScale / dims.iRaysPerDet;
+
+ // intersect ray with first slice
+
+ float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
+ float fY = startSlice + 1.5f;
+
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > dims.iVolHeight)
+ endSlice = dims.iVolHeight;
+
+ float fV = 0.0f;
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ fV += tex2D(gT_FanVolumeTexture, fX, fY);
+ fX -= alpha;
+ fY += 1.0f;
+ }
+
+ fVal += fV * fDistCorr;
+
+ }
+
+ projData[angle*projPitch+detector+1] += fVal;
+}
+
+bool FanFP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const SFanProjection* angles,
+ float outputScale)
+{
+ // TODO: load angles into constant memory in smaller blocks
+ assert(dims.iProjAngles <= g_MaxAngles);
+
+ cudaArray* D_dataArray;
+ bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+ // transfer angles to constant memory
+ float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+ TRANSFER_TO_CONSTANT(SrcX);
+ TRANSFER_TO_CONSTANT(SrcY);
+ TRANSFER_TO_CONSTANT(DetSX);
+ TRANSFER_TO_CONSTANT(DetSY);
+ TRANSFER_TO_CONSTANT(DetUX);
+ TRANSFER_TO_CONSTANT(DetUY);
+
+#undef TRANSFER_TO_CONSTANT
+
+ delete[] tmp;
+
+ dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles
+ const unsigned int g_blockSliceSize = g_detBlockSize;
+
+ std::list<cudaStream_t> streams;
+
+
+ unsigned int blockStart = 0;
+ unsigned int blockEnd = dims.iProjAngles;
+
+ dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
+ (dims.iProjDets+g_blockSliceSize-1)/g_blockSliceSize); // angle blocks, regions
+ cudaStream_t stream1;
+ cudaStreamCreate(&stream1);
+ streams.push_back(stream1);
+ for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices)
+ FanFPhorizontal<<<dimGrid, dimBlock, 0, stream1>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
+
+ cudaStream_t stream2;
+ cudaStreamCreate(&stream2);
+ streams.push_back(stream2);
+ for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
+ FanFPvertical<<<dimGrid, dimBlock, 0, stream2>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
+
+ cudaStreamDestroy(stream1);
+ cudaStreamDestroy(stream2);
+
+ cudaThreadSynchronize();
+
+ cudaTextForceKernelsCompletion();
+
+ cudaFreeArray(D_dataArray);
+
+ return true;
+}
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+int main()
+{
+ float* D_volumeData;
+ float* D_projData;
+
+ SDimensions dims;
+ dims.iVolWidth = 128;
+ dims.iVolHeight = 128;
+ dims.iProjAngles = 180;
+ dims.iProjDets = 256;
+ dims.fDetScale = 1.0f;
+ dims.iRaysPerDet = 1;
+ unsigned int volumePitch, projPitch;
+
+ SFanProjection projs[180];
+
+ projs[0].fSrcX = 0.0f;
+ projs[0].fSrcY = 1536.0f;
+ projs[0].fDetSX = 128.0f;
+ projs[0].fDetSY = -512.0f;
+ projs[0].fDetUX = -1.0f;
+ projs[0].fDetUY = 0.0f;
+
+#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = projs[0].f##name##X * cos(alpha) - projs[0].f##name##Y * sin(alpha); projs[i].f##name##Y = projs[0].f##name##X * sin(alpha) + projs[0].f##name##Y * cos(alpha); } while(0)
+
+ for (int i = 1; i < 180; ++i) {
+ ROTATE0(Src, i, i*2*M_PI/180);
+ ROTATE0(DetS, i, i*2*M_PI/180);
+ ROTATE0(DetU, i, i*2*M_PI/180);
+ }
+
+#undef ROTATE0
+
+ allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ printf("pitch: %u\n", volumePitch);
+
+ allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+ printf("pitch: %u\n", projPitch);
+
+ unsigned int y, x;
+ float* img = loadImage("phantom128.png", y, x);
+
+ float* sino = new float[dims.iProjAngles * dims.iProjDets];
+
+ memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float));
+
+ copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+ copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+ float* angle = new float[dims.iProjAngles];
+
+ for (unsigned int i = 0; i < dims.iProjAngles; ++i)
+ angle[i] = i*(M_PI/dims.iProjAngles);
+
+ FanFP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs, 1.0f);
+
+ delete[] angle;
+
+ copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+ float s = 0.0f;
+ for (unsigned int y = 0; y < dims.iProjAngles; ++y)
+ for (unsigned int x = 0; x < dims.iProjDets; ++x)
+ s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];
+ printf("cpu norm: %f\n", s);
+
+ //zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+ s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+ printf("gpu norm: %f\n", s);
+
+ saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino);
+
+
+ return 0;
+}
+#endif
diff --git a/cuda/2d/fan_fp.h b/cuda/2d/fan_fp.h
new file mode 100644
index 0000000..0734f40
--- /dev/null
+++ b/cuda/2d/fan_fp.h
@@ -0,0 +1,41 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_FAN_FP_H
+#define _CUDA_FAN_FP_H
+
+namespace astraCUDA {
+
+_AstraExport bool FanFP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const SFanProjection* angles,
+ float outputScale);
+
+}
+
+#endif
diff --git a/cuda/2d/fbp_filters.h b/cuda/2d/fbp_filters.h
new file mode 100644
index 0000000..1232f8e
--- /dev/null
+++ b/cuda/2d/fbp_filters.h
@@ -0,0 +1,58 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef FBP_FILTERS_H
+#define FBP_FILTERS_H
+
+enum E_FBPFILTER
+{
+ FILTER_NONE, //< no filter (regular BP)
+ FILTER_RAMLAK, //< default FBP filter
+ FILTER_SHEPPLOGAN, //< Shepp-Logan
+ FILTER_COSINE, //< Cosine
+ FILTER_HAMMING, //< Hamming filter
+ FILTER_HANN, //< Hann filter
+ FILTER_TUKEY, //< Tukey filter
+ FILTER_LANCZOS, //< Lanczos filter
+ FILTER_TRIANGULAR, //< Triangular filter
+ FILTER_GAUSSIAN, //< Gaussian filter
+ FILTER_BARTLETTHANN, //< Bartlett-Hann filter
+ FILTER_BLACKMAN, //< Blackman filter
+ FILTER_NUTTALL, //< Nuttall filter, continuous first derivative
+ FILTER_BLACKMANHARRIS, //< Blackman-Harris filter
+ FILTER_BLACKMANNUTTALL, //< Blackman-Nuttall filter
+ FILTER_FLATTOP, //< Flat top filter
+ FILTER_KAISER, //< Kaiser filter
+ FILTER_PARZEN, //< Parzen filter
+ FILTER_PROJECTION, //< all projection directions share one filter
+ FILTER_SINOGRAM, //< every projection direction has its own filter
+ FILTER_RPROJECTION, //< projection filter in real space (as opposed to fourier space)
+ FILTER_RSINOGRAM, //< sinogram filter in real space
+};
+
+#endif /* FBP_FILTERS_H */
diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu
new file mode 100644
index 0000000..79e4be7
--- /dev/null
+++ b/cuda/2d/fft.cu
@@ -0,0 +1,873 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "fft.h"
+#include "util.h"
+
+#include <cufft.h>
+#include <iostream>
+#include <cuda.h>
+#include <fstream>
+
+#include "../../include/astra/Logger.h"
+
+using namespace astra;
+
+// TODO: evaluate what we want to do in these situations:
+
+#define CHECK_ERROR(errorMessage) do { \
+ cudaError_t err = cudaThreadSynchronize(); \
+ if( cudaSuccess != err) { \
+ fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
+ errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
+ CLogger::writeTerminalCUDAError(__FILE__, __LINE__, cudaGetErrorString( err)); \
+ exit(EXIT_FAILURE); \
+ } } while (0)
+
+#define SAFE_CALL( call) do { \
+ cudaError err = call; \
+ if( cudaSuccess != err) { \
+ fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
+ __FILE__, __LINE__, cudaGetErrorString( err) ); \
+ CLogger::writeTerminalCUDAError(__FILE__, __LINE__, cudaGetErrorString( err)); \
+ exit(EXIT_FAILURE); \
+ } \
+ err = cudaThreadSynchronize(); \
+ if( cudaSuccess != err) { \
+ fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
+ __FILE__, __LINE__, cudaGetErrorString( err) ); \
+ CLogger::writeTerminalCUDAError(__FILE__, __LINE__, cudaGetErrorString( err)); \
+ exit(EXIT_FAILURE); \
+ } } while (0)
+
+
+__global__ static void applyFilter_kernel(int _iProjectionCount,
+ int _iFreqBinCount,
+ cufftComplex * _pSinogram,
+ cufftComplex * _pFilter)
+{
+ int iIndex = threadIdx.x + blockIdx.x * blockDim.x;
+ int iProjectionIndex = iIndex / _iFreqBinCount;
+
+ if(iProjectionIndex >= _iProjectionCount)
+ {
+ return;
+ }
+
+ float fA = _pSinogram[iIndex].x;
+ float fB = _pSinogram[iIndex].y;
+ float fC = _pFilter[iIndex].x;
+ float fD = _pFilter[iIndex].y;
+
+ _pSinogram[iIndex].x = fA * fC - fB * fD;
+ _pSinogram[iIndex].y = fA * fD + fC * fB;
+}
+
+__global__ static void rescaleInverseFourier_kernel(int _iProjectionCount,
+ int _iDetectorCount,
+ float* _pfInFourierOutput)
+{
+ int iIndex = threadIdx.x + blockIdx.x * blockDim.x;
+ int iProjectionIndex = iIndex / _iDetectorCount;
+ int iDetectorIndex = iIndex % _iDetectorCount;
+
+ if(iProjectionIndex >= _iProjectionCount)
+ {
+ return;
+ }
+
+ _pfInFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex] /= (float)_iDetectorCount;
+}
+
+static void rescaleInverseFourier(int _iProjectionCount, int _iDetectorCount,
+ float * _pfInFourierOutput)
+{
+ const int iBlockSize = 256;
+ int iElementCount = _iProjectionCount * _iDetectorCount;
+ int iBlockCount = (iElementCount + iBlockSize - 1) / iBlockSize;
+
+ rescaleInverseFourier_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount,
+ _iDetectorCount,
+ _pfInFourierOutput);
+ CHECK_ERROR("rescaleInverseFourier_kernel failed");
+}
+
+void applyFilter(int _iProjectionCount, int _iFreqBinCount,
+ cufftComplex * _pSinogram, cufftComplex * _pFilter)
+{
+ const int iBlockSize = 256;
+ int iElementCount = _iProjectionCount * _iFreqBinCount;
+ int iBlockCount = (iElementCount + iBlockSize - 1) / iBlockSize;
+
+ applyFilter_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount,
+ _iFreqBinCount,
+ _pSinogram, _pFilter);
+ CHECK_ERROR("applyFilter_kernel failed");
+}
+
+static bool invokeCudaFFT(int _iProjectionCount, int _iDetectorCount,
+ const float * _pfDevSource,
+ cufftComplex * _pDevTargetComplex)
+{
+ cufftHandle plan;
+ cufftResult result;
+
+ result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_R2C, _iProjectionCount);
+ if(result != CUFFT_SUCCESS)
+ {
+ std::cerr << "Failed to plan 1d r2c fft" << std::endl;
+ return false;
+ }
+
+ result = cufftExecR2C(plan, (cufftReal *)_pfDevSource, _pDevTargetComplex);
+ cufftDestroy(plan);
+
+ if(result != CUFFT_SUCCESS)
+ {
+ std::cerr << "Failed to exec 1d r2c fft" << std::endl;
+ return false;
+ }
+
+ return true;
+}
+
+static bool invokeCudaIFFT(int _iProjectionCount, int _iDetectorCount,
+ const cufftComplex * _pDevSourceComplex,
+ float * _pfDevTarget)
+{
+ cufftHandle plan;
+ cufftResult result;
+
+ result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_C2R, _iProjectionCount);
+ if(result != CUFFT_SUCCESS)
+ {
+ std::cerr << "Failed to plan 1d c2r fft" << std::endl;
+ return false;
+ }
+
+ // todo: why do we have to get rid of the const qualifier?
+ result = cufftExecC2R(plan, (cufftComplex *)_pDevSourceComplex,
+ (cufftReal *)_pfDevTarget);
+ cufftDestroy(plan);
+
+ if(result != CUFFT_SUCCESS)
+ {
+ std::cerr << "Failed to exec 1d c2r fft" << std::endl;
+ return false;
+ }
+
+ return true;
+}
+
+bool allocateComplexOnDevice(int _iProjectionCount, int _iDetectorCount,
+ cufftComplex ** _ppDevComplex)
+{
+ size_t bufferSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount;
+ SAFE_CALL(cudaMalloc((void **)_ppDevComplex, bufferSize));
+ return true;
+}
+
+bool freeComplexOnDevice(cufftComplex * _pDevComplex)
+{
+ SAFE_CALL(cudaFree(_pDevComplex));
+ return true;
+}
+
+bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,
+ cufftComplex * _pHostComplexSource,
+ cufftComplex * _pDevComplexTarget)
+{
+ size_t memSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount;
+ SAFE_CALL(cudaMemcpy(_pDevComplexTarget, _pHostComplexSource, memSize, cudaMemcpyHostToDevice));
+
+ return true;
+}
+
+bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
+ int _iSourcePitch, int _iSourcePadX, int _iProjDets,
+ int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount,
+ cufftComplex * _pDevTargetComplex)
+{
+ float * pfDevRealFFTSource = NULL;
+ size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount;
+
+ SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTSource, bufferMemSize));
+ SAFE_CALL(cudaMemset(pfDevRealFFTSource, 0, bufferMemSize));
+
+ for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
+ {
+ const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch + _iSourcePadX;
+ float * pfTargetLocation = pfDevRealFFTSource + iProjectionIndex * _iFFTRealDetectorCount;
+
+ SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice));
+ }
+
+ bool bResult = invokeCudaFFT(_iProjectionCount, _iFFTRealDetectorCount,
+ pfDevRealFFTSource, _pDevTargetComplex);
+ if(!bResult)
+ {
+ return false;
+ }
+
+ SAFE_CALL(cudaFree(pfDevRealFFTSource));
+
+ return true;
+}
+
+bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
+ float * _pfRealTarget,
+ int _iTargetPitch, int _iTargetPadX, int _iProjDets,
+ int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount)
+{
+ float * pfDevRealFFTTarget = NULL;
+ size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount;
+
+ SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTTarget, bufferMemSize));
+
+ bool bResult = invokeCudaIFFT(_iProjectionCount, _iFFTRealDetectorCount,
+ _pDevSourceComplex, pfDevRealFFTTarget);
+ if(!bResult)
+ {
+ return false;
+ }
+
+ rescaleInverseFourier(_iProjectionCount, _iFFTRealDetectorCount,
+ pfDevRealFFTTarget);
+
+ SAFE_CALL(cudaMemset(_pfRealTarget, 0, sizeof(float) * _iProjectionCount * _iTargetPitch));
+
+ for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
+ {
+ const float * pfSourceLocation = pfDevRealFFTTarget + iProjectionIndex * _iFFTRealDetectorCount;
+ float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch + _iTargetPadX;
+
+ SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice));
+ }
+
+ SAFE_CALL(cudaFree(pfDevRealFFTTarget));
+
+ return true;
+}
+
+
+// Because the input is real, the Fourier transform is symmetric.
+// CUFFT only outputs the first half (ignoring the redundant second half),
+// and expects the same as input for the IFFT.
+int calcFFTFourSize(int _iFFTRealSize)
+{
+ int iFFTFourSize = _iFFTRealSize / 2 + 1;
+
+ return iFFTFourSize;
+}
+
+void genIdenFilter(int _iProjectionCount, cufftComplex * _pFilter,
+ int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount)
+{
+ for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
+ {
+ for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ int iIndex = iDetectorIndex + iProjectionIndex * _iFFTFourierDetectorCount;
+ _pFilter[iIndex].x = 1.0f;
+ _pFilter[iIndex].y = 0.0f;
+ }
+ }
+}
+
+void genFilter(E_FBPFILTER _eFilter, float _fD, int _iProjectionCount,
+ cufftComplex * _pFilter, int _iFFTRealDetectorCount,
+ int _iFFTFourierDetectorCount, float _fParameter /* = -1.0f */)
+{
+ float * pfFilt = new float[_iFFTFourierDetectorCount];
+ float * pfW = new float[_iFFTFourierDetectorCount];
+
+ for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fRelIndex = (float)iDetectorIndex / (float)_iFFTRealDetectorCount;
+
+ // filt = 2*( 0:(order/2) )./order;
+ pfFilt[iDetectorIndex] = 2.0f * fRelIndex;
+ //pfFilt[iDetectorIndex] = 1.0f;
+
+ // w = 2*pi*(0:size(filt,2)-1)/order
+ pfW[iDetectorIndex] = 3.1415f * 2.0f * fRelIndex;
+ }
+
+ switch(_eFilter)
+ {
+ case FILTER_RAMLAK:
+ {
+ // do nothing
+ break;
+ }
+ case FILTER_SHEPPLOGAN:
+ {
+ // filt(2:end) = filt(2:end) .* (sin(w(2:end)/(2*d))./(w(2:end)/(2*d)))
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * (sinf(pfW[iDetectorIndex] / 2.0f / _fD) / (pfW[iDetectorIndex] / 2.0f / _fD));
+ }
+ break;
+ }
+ case FILTER_COSINE:
+ {
+ // filt(2:end) = filt(2:end) .* cos(w(2:end)/(2*d))
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * cosf(pfW[iDetectorIndex] / 2.0f / _fD);
+ }
+ break;
+ }
+ case FILTER_HAMMING:
+ {
+ // filt(2:end) = filt(2:end) .* (.54 + .46 * cos(w(2:end)/d))
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * ( 0.54f + 0.46f * cosf(pfW[iDetectorIndex] / _fD));
+ }
+ break;
+ }
+ case FILTER_HANN:
+ {
+ // filt(2:end) = filt(2:end) .*(1+cos(w(2:end)./d)) / 2
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * (1.0f + cosf(pfW[iDetectorIndex] / _fD)) / 2.0f;
+ }
+ break;
+ }
+ case FILTER_TUKEY:
+ {
+ float fAlpha = _fParameter;
+ if(_fParameter < 0.0f) fAlpha = 0.5f;
+ float fN = (float)_iFFTFourierDetectorCount;
+ float fHalfN = fN / 2.0f;
+ float fEnumTerm = fAlpha * fHalfN;
+ float fDenom = (1.0f - fAlpha) * fHalfN;
+ float fBlockStart = fHalfN - fEnumTerm;
+ float fBlockEnd = fHalfN + fEnumTerm;
+
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fAbsSmallN = fabs((float)iDetectorIndex);
+ float fStoredValue = 0.0f;
+
+ if((fBlockStart <= fAbsSmallN) && (fAbsSmallN <= fBlockEnd))
+ {
+ fStoredValue = 1.0f;
+ }
+ else
+ {
+ float fEnum = fAbsSmallN - fEnumTerm;
+ float fCosInput = M_PI * fEnum / fDenom;
+ fStoredValue = 0.5f * (1.0f + cosf(fCosInput));
+ }
+
+ pfFilt[iDetectorIndex] *= fStoredValue;
+ }
+
+ break;
+ }
+ case FILTER_LANCZOS:
+ {
+ float fDenum = (float)(_iFFTFourierDetectorCount - 1);
+
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fSmallN = (float)iDetectorIndex;
+ float fX = 2.0f * fSmallN / fDenum - 1.0f;
+ float fSinInput = M_PI * fX;
+ float fStoredValue = 0.0f;
+
+ if(fabsf(fSinInput) > 0.001f)
+ {
+ fStoredValue = sin(fSinInput)/fSinInput;
+ }
+ else
+ {
+ fStoredValue = 1.0f;
+ }
+
+ pfFilt[iDetectorIndex] *= fStoredValue;
+ }
+
+ break;
+ }
+ case FILTER_TRIANGULAR:
+ {
+ float fNMinusOne = (float)(_iFFTFourierDetectorCount - 1);
+
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fSmallN = (float)iDetectorIndex;
+ float fAbsInput = fSmallN - fNMinusOne / 2.0f;
+ float fParenInput = fNMinusOne / 2.0f - fabsf(fAbsInput);
+ float fStoredValue = 2.0f / fNMinusOne * fParenInput;
+
+ pfFilt[iDetectorIndex] *= fStoredValue;
+ }
+
+ break;
+ }
+ case FILTER_GAUSSIAN:
+ {
+ float fSigma = _fParameter;
+ if(_fParameter < 0.0f) fSigma = 0.4f;
+ float fN = (float)_iFFTFourierDetectorCount;
+ float fQuotient = (fN - 1.0f) / 2.0f;
+
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fSmallN = (float)iDetectorIndex;
+ float fEnum = fSmallN - fQuotient;
+ float fDenom = fSigma * fQuotient;
+ float fPower = -0.5f * (fEnum / fDenom) * (fEnum / fDenom);
+ float fStoredValue = expf(fPower);
+
+ pfFilt[iDetectorIndex] *= fStoredValue;
+ }
+
+ break;
+ }
+ case FILTER_BARTLETTHANN:
+ {
+ const float fA0 = 0.62f;
+ const float fA1 = 0.48f;
+ const float fA2 = 0.38f;
+ float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f;
+
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fSmallN = (float)iDetectorIndex;
+ float fAbsInput = fSmallN / fNMinusOne - 0.5f;
+ float fFirstTerm = fA1 * fabsf(fAbsInput);
+ float fCosInput = 2.0f * M_PI * fSmallN / fNMinusOne;
+ float fSecondTerm = fA2 * cosf(fCosInput);
+ float fStoredValue = fA0 - fFirstTerm - fSecondTerm;
+
+ pfFilt[iDetectorIndex] *= fStoredValue;
+ }
+
+ break;
+ }
+ case FILTER_BLACKMAN:
+ {
+ float fAlpha = _fParameter;
+ if(_fParameter < 0.0f) fAlpha = 0.16f;
+ float fA0 = (1.0f - fAlpha) / 2.0f;
+ float fA1 = 0.5f;
+ float fA2 = fAlpha / 2.0f;
+ float fNMinusOne = (float)(_iFFTFourierDetectorCount - 1);
+
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fSmallN = (float)iDetectorIndex;
+ float fCosInput1 = 2.0f * M_PI * 0.5f * fSmallN / fNMinusOne;
+ float fCosInput2 = 4.0f * M_PI * 0.5f * fSmallN / fNMinusOne;
+ float fStoredValue = fA0 - fA1 * cosf(fCosInput1) + fA2 * cosf(fCosInput2);
+
+ pfFilt[iDetectorIndex] *= fStoredValue;
+ }
+
+ break;
+ }
+ case FILTER_NUTTALL:
+ {
+ const float fA0 = 0.355768f;
+ const float fA1 = 0.487396f;
+ const float fA2 = 0.144232f;
+ const float fA3 = 0.012604f;
+ float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f;
+
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fSmallN = (float)iDetectorIndex;
+ float fBaseCosInput = M_PI * fSmallN / fNMinusOne;
+ float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput);
+ float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput);
+ float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput);
+ float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm;
+
+ pfFilt[iDetectorIndex] *= fStoredValue;
+ }
+
+ break;
+ }
+ case FILTER_BLACKMANHARRIS:
+ {
+ const float fA0 = 0.35875f;
+ const float fA1 = 0.48829f;
+ const float fA2 = 0.14128f;
+ const float fA3 = 0.01168f;
+ float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f;
+
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fSmallN = (float)iDetectorIndex;
+ float fBaseCosInput = M_PI * fSmallN / fNMinusOne;
+ float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput);
+ float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput);
+ float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput);
+ float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm;
+
+ pfFilt[iDetectorIndex] *= fStoredValue;
+ }
+
+ break;
+ }
+ case FILTER_BLACKMANNUTTALL:
+ {
+ const float fA0 = 0.3635819f;
+ const float fA1 = 0.4891775f;
+ const float fA2 = 0.1365995f;
+ const float fA3 = 0.0106411f;
+ float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f;
+
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fSmallN = (float)iDetectorIndex;
+ float fBaseCosInput = M_PI * fSmallN / fNMinusOne;
+ float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput);
+ float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput);
+ float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput);
+ float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm;
+
+ pfFilt[iDetectorIndex] *= fStoredValue;
+ }
+
+ break;
+ }
+ case FILTER_FLATTOP:
+ {
+ const float fA0 = 1.0f;
+ const float fA1 = 1.93f;
+ const float fA2 = 1.29f;
+ const float fA3 = 0.388f;
+ const float fA4 = 0.032f;
+ float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f;
+
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fSmallN = (float)iDetectorIndex;
+ float fBaseCosInput = M_PI * fSmallN / fNMinusOne;
+ float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput);
+ float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput);
+ float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput);
+ float fFourthTerm = fA4 * cosf(8.0f * fBaseCosInput);
+ float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm + fFourthTerm;
+
+ pfFilt[iDetectorIndex] *= fStoredValue;
+ }
+
+ break;
+ }
+ case FILTER_KAISER:
+ {
+ float fAlpha = _fParameter;
+ if(_fParameter < 0.0f) fAlpha = 3.0f;
+ float fPiTimesAlpha = M_PI * fAlpha;
+ float fNMinusOne = (float)(_iFFTFourierDetectorCount - 1);
+ float fDenom = (float)j0((double)fPiTimesAlpha);
+
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fSmallN = (float)iDetectorIndex;
+ float fSquareInput = 2.0f * fSmallN / fNMinusOne - 1;
+ float fSqrtInput = 1.0f - fSquareInput * fSquareInput;
+ float fBesselInput = fPiTimesAlpha * sqrt(fSqrtInput);
+ float fEnum = (float)j0((double)fBesselInput);
+ float fStoredValue = fEnum / fDenom;
+
+ pfFilt[iDetectorIndex] *= fStoredValue;
+ }
+
+ break;
+ }
+ case FILTER_PARZEN:
+ {
+ for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fSmallN = (float)iDetectorIndex;
+ float fQ = fSmallN / (float)(_iFFTFourierDetectorCount - 1);
+ float fStoredValue = 0.0f;
+
+ if(fQ <= 0.5f)
+ {
+ fStoredValue = 1.0f - 6.0f * fQ * fQ * (1.0f - fQ);
+ }
+ else
+ {
+ float fCubedValue = 1.0f - fQ;
+ fStoredValue = 2.0f * fCubedValue * fCubedValue * fCubedValue;
+ }
+
+ pfFilt[iDetectorIndex] *= fStoredValue;
+ }
+
+ break;
+ }
+ default:
+ {
+ std::cerr << "Cannot serve requested filter" << std::endl;
+ }
+ }
+
+ // filt(w>pi*d) = 0;
+ float fPiTimesD = M_PI * _fD;
+ for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fWValue = pfW[iDetectorIndex];
+
+ if(fWValue > fPiTimesD)
+ {
+ pfFilt[iDetectorIndex] = 0.0f;
+ }
+ }
+
+ for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+ {
+ float fFilterValue = pfFilt[iDetectorIndex];
+
+ for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
+ {
+ int iIndex = iDetectorIndex + iProjectionIndex * _iFFTFourierDetectorCount;
+ _pFilter[iIndex].x = fFilterValue;
+ _pFilter[iIndex].y = 0.0f;
+ }
+ }
+
+ delete[] pfFilt;
+ delete[] pfW;
+}
+
+#ifdef STANDALONE
+
+__global__ static void doubleFourierOutput_kernel(int _iProjectionCount,
+ int _iDetectorCount,
+ cufftComplex* _pFourierOutput)
+{
+ int iIndex = threadIdx.x + blockIdx.x * blockDim.x;
+ int iProjectionIndex = iIndex / _iDetectorCount;
+ int iDetectorIndex = iIndex % _iDetectorCount;
+
+ if(iProjectionIndex >= _iProjectionCount)
+ {
+ return;
+ }
+
+ if(iDetectorIndex <= (_iDetectorCount / 2))
+ {
+ return;
+ }
+
+ int iOtherDetectorIndex = _iDetectorCount - iDetectorIndex;
+
+ _pFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex].x = _pFourierOutput[iProjectionIndex * _iDetectorCount + iOtherDetectorIndex].x;
+ _pFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex].y = -_pFourierOutput[iProjectionIndex * _iDetectorCount + iOtherDetectorIndex].y;
+}
+
+static void doubleFourierOutput(int _iProjectionCount, int _iDetectorCount,
+ cufftComplex * _pFourierOutput)
+{
+ const int iBlockSize = 256;
+ int iElementCount = _iProjectionCount * _iDetectorCount;
+ int iBlockCount = (iElementCount + iBlockSize - 1) / iBlockSize;
+
+ doubleFourierOutput_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount,
+ _iDetectorCount,
+ _pFourierOutput);
+ CHECK_ERROR("doubleFourierOutput_kernel failed");
+}
+
+
+
+static void writeToMatlabFile(const char * _fileName, const float * _pfData,
+ int _iRowCount, int _iColumnCount)
+{
+ std::ofstream out(_fileName);
+
+ for(int iRowIndex = 0; iRowIndex < _iRowCount; iRowIndex++)
+ {
+ for(int iColumnIndex = 0; iColumnIndex < _iColumnCount; iColumnIndex++)
+ {
+ out << _pfData[iColumnIndex + iRowIndex * _iColumnCount] << " ";
+ }
+
+ out << std::endl;
+ }
+}
+
+static void convertComplexToRealImg(const cufftComplex * _pComplex,
+ int _iElementCount,
+ float * _pfReal, float * _pfImaginary)
+{
+ for(int iIndex = 0; iIndex < _iElementCount; iIndex++)
+ {
+ _pfReal[iIndex] = _pComplex[iIndex].x;
+ _pfImaginary[iIndex] = _pComplex[iIndex].y;
+ }
+}
+
+void testCudaFFT()
+{
+ const int iProjectionCount = 2;
+ const int iDetectorCount = 1024;
+ const int iTotalElementCount = iProjectionCount * iDetectorCount;
+
+ float * pfHostProj = new float[iTotalElementCount];
+ memset(pfHostProj, 0, sizeof(float) * iTotalElementCount);
+
+ for(int iProjectionIndex = 0; iProjectionIndex < iProjectionCount; iProjectionIndex++)
+ {
+ for(int iDetectorIndex = 0; iDetectorIndex < iDetectorCount; iDetectorIndex++)
+ {
+// int
+
+// pfHostProj[iIndex] = (float)rand() / (float)RAND_MAX;
+ }
+ }
+
+ writeToMatlabFile("proj.mat", pfHostProj, iProjectionCount, iDetectorCount);
+
+ float * pfDevProj = NULL;
+ SAFE_CALL(cudaMalloc((void **)&pfDevProj, sizeof(float) * iTotalElementCount));
+ SAFE_CALL(cudaMemcpy(pfDevProj, pfHostProj, sizeof(float) * iTotalElementCount, cudaMemcpyHostToDevice));
+
+ cufftComplex * pDevFourProj = NULL;
+ SAFE_CALL(cudaMalloc((void **)&pDevFourProj, sizeof(cufftComplex) * iTotalElementCount));
+
+ cufftHandle plan;
+ cufftResult result;
+
+ result = cufftPlan1d(&plan, iDetectorCount, CUFFT_R2C, iProjectionCount);
+ if(result != CUFFT_SUCCESS)
+ {
+ cerr << "Failed to plan 1d r2c fft" << endl;
+ }
+
+ result = cufftExecR2C(plan, pfDevProj, pDevFourProj);
+ if(result != CUFFT_SUCCESS)
+ {
+ cerr << "Failed to exec 1d r2c fft" << endl;
+ }
+
+ cufftDestroy(plan);
+
+ doubleFourierOutput(iProjectionCount, iDetectorCount, pDevFourProj);
+
+ cufftComplex * pHostFourProj = new cufftComplex[iTotalElementCount];
+ SAFE_CALL(cudaMemcpy(pHostFourProj, pDevFourProj, sizeof(cufftComplex) * iTotalElementCount, cudaMemcpyDeviceToHost));
+
+ float * pfHostFourProjReal = new float[iTotalElementCount];
+ float * pfHostFourProjImaginary = new float[iTotalElementCount];
+
+ convertComplexToRealImg(pHostFourProj, iTotalElementCount, pfHostFourProjReal, pfHostFourProjImaginary);
+
+ writeToMatlabFile("proj_four_real.mat", pfHostFourProjReal, iProjectionCount, iDetectorCount);
+ writeToMatlabFile("proj_four_imaginary.mat", pfHostFourProjImaginary, iProjectionCount, iDetectorCount);
+
+ float * pfDevInFourProj = NULL;
+ SAFE_CALL(cudaMalloc((void **)&pfDevInFourProj, sizeof(float) * iTotalElementCount));
+
+ result = cufftPlan1d(&plan, iDetectorCount, CUFFT_C2R, iProjectionCount);
+ if(result != CUFFT_SUCCESS)
+ {
+ cerr << "Failed to plan 1d c2r fft" << endl;
+ }
+
+ result = cufftExecC2R(plan, pDevFourProj, pfDevInFourProj);
+ if(result != CUFFT_SUCCESS)
+ {
+ cerr << "Failed to exec 1d c2r fft" << endl;
+ }
+
+ cufftDestroy(plan);
+
+ rescaleInverseFourier(iProjectionCount, iDetectorCount, pfDevInFourProj);
+
+ float * pfHostInFourProj = new float[iTotalElementCount];
+ SAFE_CALL(cudaMemcpy(pfHostInFourProj, pfDevInFourProj, sizeof(float) * iTotalElementCount, cudaMemcpyDeviceToHost));
+
+ writeToMatlabFile("in_four.mat", pfHostInFourProj, iProjectionCount, iDetectorCount);
+
+ SAFE_CALL(cudaFree(pDevFourProj));
+ SAFE_CALL(cudaFree(pfDevProj));
+
+ delete [] pfHostInFourProj;
+ delete [] pfHostFourProjReal;
+ delete [] pfHostFourProjImaginary;
+ delete [] pfHostProj;
+ delete [] pHostFourProj;
+}
+
+void downloadDebugFilterComplex(float * _pfHostSinogram, int _iProjectionCount,
+ int _iDetectorCount,
+ cufftComplex * _pDevFilter,
+ int _iFilterDetCount)
+{
+ cufftComplex * pHostFilter = NULL;
+ size_t complMemSize = sizeof(cufftComplex) * _iFilterDetCount * _iProjectionCount;
+ pHostFilter = (cufftComplex *)malloc(complMemSize);
+ SAFE_CALL(cudaMemcpy(pHostFilter, _pDevFilter, complMemSize, cudaMemcpyDeviceToHost));
+
+ for(int iTargetProjIndex = 0; iTargetProjIndex < _iProjectionCount; iTargetProjIndex++)
+ {
+ for(int iTargetDetIndex = 0; iTargetDetIndex < min(_iDetectorCount, _iFilterDetCount); iTargetDetIndex++)
+ {
+ cufftComplex source = pHostFilter[iTargetDetIndex + iTargetProjIndex * _iFilterDetCount];
+ float fReadValue = sqrtf(source.x * source.x + source.y * source.y);
+ _pfHostSinogram[iTargetDetIndex + iTargetProjIndex * _iDetectorCount] = fReadValue;
+ }
+ }
+
+ free(pHostFilter);
+}
+
+void downloadDebugFilterReal(float * _pfHostSinogram, int _iProjectionCount,
+ int _iDetectorCount, float * _pfDevFilter,
+ int _iFilterDetCount)
+{
+ float * pfHostFilter = NULL;
+ size_t memSize = sizeof(float) * _iFilterDetCount * _iProjectionCount;
+ pfHostFilter = (float *)malloc(memSize);
+ SAFE_CALL(cudaMemcpy(pfHostFilter, _pfDevFilter, memSize, cudaMemcpyDeviceToHost));
+
+ for(int iTargetProjIndex = 0; iTargetProjIndex < _iProjectionCount; iTargetProjIndex++)
+ {
+ for(int iTargetDetIndex = 0; iTargetDetIndex < min(_iDetectorCount, _iFilterDetCount); iTargetDetIndex++)
+ {
+ float fSource = pfHostFilter[iTargetDetIndex + iTargetProjIndex * _iFilterDetCount];
+ _pfHostSinogram[iTargetDetIndex + iTargetProjIndex * _iDetectorCount] = fSource;
+ }
+ }
+
+ free(pfHostFilter);
+}
+
+
+#endif
diff --git a/cuda/2d/fft.h b/cuda/2d/fft.h
new file mode 100644
index 0000000..55324e5
--- /dev/null
+++ b/cuda/2d/fft.h
@@ -0,0 +1,69 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef FFT_H
+#define FFT_H
+
+#include <cufft.h>
+#include <cuda.h>
+
+#include "fbp_filters.h"
+
+bool allocateComplexOnDevice(int _iProjectionCount,
+ int _iDetectorCount,
+ cufftComplex ** _ppDevComplex);
+
+bool freeComplexOnDevice(cufftComplex * _pDevComplex);
+
+bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,
+ cufftComplex * _pHostComplexSource,
+ cufftComplex * _pDevComplexTarget);
+
+bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
+ int _iSourcePitch, int _iSourcePadX, int _iProjDets,
+ int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount,
+ cufftComplex * _pDevTargetComplex);
+
+bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
+ float * _pfRealTarget,
+ int _iTargetPitch, int _iTargetPadX, int _iProjDets,
+ int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount);
+
+void applyFilter(int _iProjectionCount, int _iFreqBinCount,
+ cufftComplex * _pSinogram, cufftComplex * _pFilter);
+
+int calcFFTFourSize(int _iFFTRealSize);
+
+void genFilter(E_FBPFILTER _eFilter, float _fD, int _iProjectionCount,
+ cufftComplex * _pFilter, int _iFFTRealDetectorCount,
+ int _iFFTFourierDetectorCount, float _fParameter = -1.0f);
+
+void genIdenFilter(int _iProjectionCount, cufftComplex * _pFilter,
+ int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount);
+
+#endif /* FFT_H */
diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu
new file mode 100644
index 0000000..1057879
--- /dev/null
+++ b/cuda/2d/par_bp.cu
@@ -0,0 +1,357 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+#define PIXELTRACE
+
+
+typedef texture<float, 2, cudaReadModeElementType> texture2D;
+
+static texture2D gT_projTexture;
+
+
+namespace astraCUDA {
+
+const unsigned int g_anglesPerBlock = 16;
+const unsigned int g_blockSliceSize = 32;
+const unsigned int g_blockSlices = 16;
+
+const unsigned int g_MaxAngles = 2560;
+
+__constant__ float gC_angle_sin[g_MaxAngles];
+__constant__ float gC_angle_cos[g_MaxAngles];
+__constant__ float gC_angle_offset[g_MaxAngles];
+
+static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+
+ gT_projTexture.addressMode[0] = cudaAddressModeClamp;
+ gT_projTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_projTexture.filterMode = cudaFilterModeLinear;
+ gT_projTexture.normalized = false;
+
+ cudaBindTexture2D(0, gT_projTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch);
+
+ // TODO: error value?
+
+ return true;
+}
+
+__global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims)
+{
+ const int relX = threadIdx.x;
+ const int relY = threadIdx.y;
+
+ int endAngle = startAngle + g_anglesPerBlock;
+ if (endAngle > dims.iProjAngles)
+ endAngle = dims.iProjAngles;
+ const int X = blockIdx.x * g_blockSlices + relX;
+ const int Y = blockIdx.y * g_blockSliceSize + relY;
+
+ if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
+ return;
+
+ const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ) / dims.fDetScale;
+ const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ) / dims.fDetScale;
+
+ float* volData = (float*)D_volData;
+
+ float fVal = 0.0f;
+ float fA = startAngle + 0.5f;
+ const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f;
+
+ if (offsets) {
+
+ for (int angle = startAngle; angle < endAngle; ++angle)
+ {
+ const float cos_theta = gC_angle_cos[angle];
+ const float sin_theta = gC_angle_sin[angle];
+ const float TOffset = gC_angle_offset[angle];
+
+ const float fT = fT_base + fX * cos_theta - fY * sin_theta + TOffset;
+ fVal += tex2D(gT_projTexture, fT, fA);
+ fA += 1.0f;
+ }
+
+ } else {
+
+ for (int angle = startAngle; angle < endAngle; ++angle)
+ {
+ const float cos_theta = gC_angle_cos[angle];
+ const float sin_theta = gC_angle_sin[angle];
+
+ const float fT = fT_base + fX * cos_theta - fY * sin_theta;
+ fVal += tex2D(gT_projTexture, fT, fA);
+ fA += 1.0f;
+ }
+
+ }
+
+ volData[(Y+1)*volPitch+X+1] += fVal;
+}
+
+// supersampling version
+__global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims)
+{
+ const int relX = threadIdx.x;
+ const int relY = threadIdx.y;
+
+ int endAngle = startAngle + g_anglesPerBlock;
+ if (endAngle > dims.iProjAngles)
+ endAngle = dims.iProjAngles;
+ const int X = blockIdx.x * g_blockSlices + relX;
+ const int Y = blockIdx.y * g_blockSliceSize + relY;
+
+ if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
+ return;
+
+ const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim) / dims.fDetScale;
+ const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim) / dims.fDetScale;
+
+ const float fSubStep = 1.0f/(dims.iRaysPerPixelDim * dims.fDetScale);
+
+ float* volData = (float*)D_volData;
+
+ float fVal = 0.0f;
+ float fA = startAngle + 0.5f;
+ const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f;
+
+ if (offsets) {
+
+ for (int angle = startAngle; angle < endAngle; ++angle)
+ {
+ const float cos_theta = gC_angle_cos[angle];
+ const float sin_theta = gC_angle_sin[angle];
+ const float TOffset = gC_angle_offset[angle];
+
+ float fT = fT_base + fX * cos_theta - fY * sin_theta + TOffset;
+
+ for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) {
+ float fTy = fT;
+ fT += fSubStep * cos_theta;
+ for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) {
+ fVal += tex2D(gT_projTexture, fTy, fA);
+ fTy -= fSubStep * sin_theta;
+ }
+ }
+ fA += 1.0f;
+ }
+
+ } else {
+
+ for (int angle = startAngle; angle < endAngle; ++angle)
+ {
+ const float cos_theta = gC_angle_cos[angle];
+ const float sin_theta = gC_angle_sin[angle];
+
+ float fT = fT_base + fX * cos_theta - fY * sin_theta;
+
+ for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) {
+ float fTy = fT;
+ fT += fSubStep * cos_theta;
+ for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) {
+ fVal += tex2D(gT_projTexture, fTy, fA);
+ fTy -= fSubStep * sin_theta;
+ }
+ }
+ fA += 1.0f;
+
+ }
+
+ }
+
+ volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
+}
+
+__global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset, float angle_sin, float angle_cos, const SDimensions dims)
+{
+ const int relX = threadIdx.x;
+ const int relY = threadIdx.y;
+
+ const int X = blockIdx.x * g_blockSlices + relX;
+ const int Y = blockIdx.y * g_blockSliceSize + relY;
+
+ if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
+ return;
+
+ const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ) / dims.fDetScale;
+ const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ) / dims.fDetScale;
+
+ const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f;
+
+ const float fT = fT_base + fX * angle_cos - fY * angle_sin + offset;
+ const float fVal = tex2D(gT_projTexture, fT, 0.5f);
+
+ D_volData[(Y+1)*volPitch+X+1] += fVal;
+}
+
+
+bool BP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const float* angles, const float* TOffsets)
+{
+ // TODO: process angles block by block
+ assert(dims.iProjAngles <= g_MaxAngles);
+
+ float* angle_sin = new float[dims.iProjAngles];
+ float* angle_cos = new float[dims.iProjAngles];
+
+ bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+
+ for (unsigned int i = 0; i < dims.iProjAngles; ++i) {
+ angle_sin[i] = sinf(angles[i]);
+ angle_cos[i] = cosf(angles[i]);
+ }
+ cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_sin, angle_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+ cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_cos, angle_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+ assert(e1 == cudaSuccess);
+ assert(e2 == cudaSuccess);
+
+ if (TOffsets) {
+ cudaError_t e3 = cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+ assert(e3 == cudaSuccess);
+ }
+
+ delete[] angle_sin;
+ delete[] angle_cos;
+
+ dim3 dimBlock(g_blockSlices, g_blockSliceSize);
+ dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
+ (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
+
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+
+ for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
+
+ if (dims.iRaysPerPixelDim > 1)
+ devBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims);
+ else
+ devBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims);
+ }
+ cudaThreadSynchronize();
+
+ cudaTextForceKernelsCompletion();
+
+ cudaStreamDestroy(stream);
+
+ return true;
+}
+
+bool BP_SART(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ unsigned int angle, const SDimensions& dims,
+ const float* angles, const float* TOffsets)
+{
+ // only one angle
+ bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1);
+
+ float angle_sin = sinf(angles[angle]);
+ float angle_cos = cosf(angles[angle]);
+
+ float offset = 0.0f;
+ if (TOffsets)
+ offset = TOffsets[angle];
+
+ dim3 dimBlock(g_blockSlices, g_blockSliceSize);
+ dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
+ (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
+
+ devBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, offset, angle_sin, angle_cos, dims);
+ cudaThreadSynchronize();
+
+ cudaTextForceKernelsCompletion();
+
+ return true;
+}
+
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+int main()
+{
+ float* D_volumeData;
+ float* D_projData;
+
+ SDimensions dims;
+ dims.iVolWidth = 1024;
+ dims.iVolHeight = 1024;
+ dims.iProjAngles = 512;
+ dims.iProjDets = 1536;
+ dims.fDetScale = 1.0f;
+ dims.iRaysPerDet = 1;
+
+ unsigned int volumePitch, projPitch;
+
+ allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ printf("pitch: %u\n", volumePitch);
+
+ allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+ printf("pitch: %u\n", projPitch);
+
+ unsigned int y, x;
+ float* sino = loadImage("sino.png", y, x);
+
+ float* img = new float[dims.iVolWidth*dims.iVolHeight];
+
+ memset(img, 0, dims.iVolWidth*dims.iVolHeight*sizeof(float));
+
+ copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+ copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+ float* angle = new float[dims.iProjAngles];
+
+ for (unsigned int i = 0; i < dims.iProjAngles; ++i)
+ angle[i] = i*(M_PI/dims.iProjAngles);
+
+ BP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0);
+
+ delete[] angle;
+
+ copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+
+ saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
+
+ return 0;
+}
+#endif
diff --git a/cuda/2d/par_bp.h b/cuda/2d/par_bp.h
new file mode 100644
index 0000000..c6dbd59
--- /dev/null
+++ b/cuda/2d/par_bp.h
@@ -0,0 +1,48 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_PAR_BP_H
+#define _CUDA_PAR_BP_H
+
+#include "dims.h"
+
+namespace astraCUDA {
+
+_AstraExport bool BP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const float* angles,
+ const float* TOffsets);
+
+_AstraExport bool BP_SART(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ unsigned int angle, const SDimensions& dims,
+ const float* angles, const float* TOffsets);
+
+}
+
+#endif
diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu
new file mode 100644
index 0000000..585cb06
--- /dev/null
+++ b/cuda/2d/par_fp.cu
@@ -0,0 +1,704 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+#define PIXELTRACE
+
+
+typedef texture<float, 2, cudaReadModeElementType> texture2D;
+
+static texture2D gT_volumeTexture;
+
+
+namespace astraCUDA {
+
+static const unsigned g_MaxAngles = 2560;
+__constant__ float gC_angle[g_MaxAngles];
+__constant__ float gC_angle_offset[g_MaxAngles];
+
+
+// optimization parameters
+static const unsigned int g_anglesPerBlock = 16;
+static const unsigned int g_detBlockSize = 32;
+static const unsigned int g_blockSlices = 64;
+
+// fixed point scaling factor
+#define fPREC_FACTOR 16.0f
+#define iPREC_FACTOR 16
+
+
+// if necessary, a buffer of zeroes of size g_MaxAngles
+static float* g_pfZeroes = 0;
+
+
+static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ dataArray = 0;
+ cudaMallocArray(&dataArray, &channelDesc, width, height);
+ cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice);
+
+ gT_volumeTexture.addressMode[0] = cudaAddressModeClamp;
+ gT_volumeTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_volumeTexture.filterMode = cudaFilterModeLinear;
+ gT_volumeTexture.normalized = false;
+
+ // TODO: For very small sizes (roughly <=512x128) with few angles (<=180)
+ // not using an array is more efficient.
+// cudaBindTexture2D(0, gT_volumeTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch);
+ cudaBindTextureToArray(gT_volumeTexture, dataArray, channelDesc);
+
+ // TODO: error value?
+
+ return true;
+}
+
+// projection for angles that are roughly horizontal
+// theta between 45 and 135 degrees
+__global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale)
+{
+ const int relDet = threadIdx.x;
+ const int relAngle = threadIdx.y;
+
+ int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+ if (angle >= endAngle)
+ return;
+
+ const float theta = gC_angle[angle];
+ const float cos_theta = __cosf(theta);
+ const float sin_theta = __sinf(theta);
+
+ // compute start detector for this block/angle:
+ // (The same for all threadIdx.x)
+ // -------------------------------------
+
+ const int midSlice = startSlice + g_blockSlices / 2;
+
+ // ASSUMPTION: fDetScale >= 1.0f
+ // problem: detector regions get skipped because slice blocks aren't large
+ // enough
+ const unsigned int g_blockSliceSize = g_detBlockSize;
+
+ // project (midSlice,midRegion) on this thread's detector
+
+ const float fBase = 0.5f*dims.iProjDets - 0.5f +
+ (
+ (midSlice - 0.5f*dims.iVolWidth + 0.5f) * cos_theta
+ - (g_blockSliceSize/2 - 0.5f*dims.iVolHeight + 0.5f) * sin_theta
+ + gC_angle_offset[angle]
+ ) / dims.fDetScale;
+ int iBase = (int)floorf(fBase * fPREC_FACTOR);
+ int iInc = (int)floorf(g_blockSliceSize * sin_theta / dims.fDetScale * -fPREC_FACTOR);
+
+ // ASSUMPTION: 16 > regionOffset / fDetScale
+ const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
+ const int detPrevRegion = (iBase + (blockIdx.y - regionOffset - 1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
+
+ if (blockIdx.y > 0 && detRegion == detPrevRegion)
+ return;
+
+ const int detector = detRegion * g_detBlockSize + relDet;
+
+ // Now project the part of the ray to angle,detector through
+ // slices startSlice to startSlice+g_blockSlices-1
+
+ if (detector < 0 || detector >= dims.iProjDets)
+ return;
+
+ const float fDetStep = -dims.fDetScale / sin_theta;
+ float fSliceStep = cos_theta / sin_theta;
+ float fDistCorr;
+ if (sin_theta > 0.0f)
+ fDistCorr = -fDetStep;
+ else
+ fDistCorr = fDetStep;
+ fDistCorr *= outputScale;
+
+ float fVal = 0.0f;
+ // project detector on slice
+ float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
+ float fS = startSlice + 1.5f;
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > dims.iVolWidth)
+ endSlice = dims.iVolWidth;
+
+ if (dims.iRaysPerDet > 1) {
+
+ fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
+ const float fSubDetStep = fDetStep / dims.iRaysPerDet;
+ fDistCorr /= dims.iRaysPerDet;
+
+ fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+ fVal += tex2D(gT_volumeTexture, fS, fP);
+ fP += fSubDetStep;
+ }
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+ } else {
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ fVal += tex2D(gT_volumeTexture, fS, fP);
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+
+ }
+
+ D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+}
+
+// projection for angles that are roughly vertical
+// theta between 0 and 45, or 135 and 180 degrees
+__global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale)
+{
+ const int relDet = threadIdx.x;
+ const int relAngle = threadIdx.y;
+
+ int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+ if (angle >= endAngle)
+ return;
+
+ const float theta = gC_angle[angle];
+ const float cos_theta = __cosf(theta);
+ const float sin_theta = __sinf(theta);
+
+ // compute start detector for this block/angle:
+ // (The same for all threadIdx.x)
+ // -------------------------------------
+
+ const int midSlice = startSlice + g_blockSlices / 2;
+
+ // project (midSlice,midRegion) on this thread's detector
+
+ // ASSUMPTION: fDetScale >= 1.0f
+ // problem: detector regions get skipped because slice blocks aren't large
+ // enough
+ const unsigned int g_blockSliceSize = g_detBlockSize;
+
+ const float fBase = 0.5f*dims.iProjDets - 0.5f +
+ (
+ (g_blockSliceSize/2 - 0.5f*dims.iVolWidth + 0.5f) * cos_theta
+ - (midSlice - 0.5f*dims.iVolHeight + 0.5f) * sin_theta
+ + gC_angle_offset[angle]
+ ) / dims.fDetScale;
+ int iBase = (int)floorf(fBase * fPREC_FACTOR);
+ int iInc = (int)floorf(g_blockSliceSize * cos_theta / dims.fDetScale * fPREC_FACTOR);
+
+ // ASSUMPTION: 16 > regionOffset / fDetScale
+ const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
+ const int detPrevRegion = (iBase + (blockIdx.y - regionOffset-1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
+
+ if (blockIdx.y > 0 && detRegion == detPrevRegion)
+ return;
+
+ const int detector = detRegion * g_detBlockSize + relDet;
+
+ // Now project the part of the ray to angle,detector through
+ // slices startSlice to startSlice+g_blockSlices-1
+
+ if (detector < 0 || detector >= dims.iProjDets)
+ return;
+
+ const float fDetStep = dims.fDetScale / cos_theta;
+ float fSliceStep = sin_theta / cos_theta;
+ float fDistCorr;
+ if (cos_theta < 0.0f)
+ fDistCorr = -fDetStep;
+ else
+ fDistCorr = fDetStep;
+ fDistCorr *= outputScale;
+
+ float fVal = 0.0f;
+ float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
+ float fS = startSlice+1.5f;
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > dims.iVolHeight)
+ endSlice = dims.iVolHeight;
+
+ if (dims.iRaysPerDet > 1) {
+
+ fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
+ const float fSubDetStep = fDetStep / dims.iRaysPerDet;
+ fDistCorr /= dims.iRaysPerDet;
+
+ fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+ fVal += tex2D(gT_volumeTexture, fP, fS);
+ fP += fSubDetStep;
+ }
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+ } else {
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ fVal += tex2D(gT_volumeTexture, fP, fS);
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+ }
+
+ D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+}
+
+// projection for angles that are roughly horizontal
+// (detector roughly vertical)
+__global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+{
+ const int relDet = threadIdx.x;
+ const int relAngle = threadIdx.y;
+
+ int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+ if (angle >= endAngle)
+ return;
+
+ const float theta = gC_angle[angle];
+ const float cos_theta = __cosf(theta);
+ const float sin_theta = __sinf(theta);
+
+ // compute start detector for this block/angle:
+ const int detRegion = blockIdx.y;
+
+ const int detector = detRegion * g_detBlockSize + relDet;
+
+ // Now project the part of the ray to angle,detector through
+ // slices startSlice to startSlice+g_blockSlices-1
+
+ if (detector < 0 || detector >= dims.iProjDets)
+ return;
+
+ const float fDetStep = -dims.fDetScale / sin_theta;
+ float fSliceStep = cos_theta / sin_theta;
+ float fDistCorr;
+ if (sin_theta > 0.0f)
+ fDistCorr = -fDetStep;
+ else
+ fDistCorr = fDetStep;
+ fDistCorr *= outputScale;
+
+ float fVal = 0.0f;
+ // project detector on slice
+ float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
+ float fS = startSlice + 1.5f;
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > dims.iVolWidth)
+ endSlice = dims.iVolWidth;
+
+ if (dims.iRaysPerDet > 1) {
+
+ fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
+ const float fSubDetStep = fDetStep / dims.iRaysPerDet;
+ fDistCorr /= dims.iRaysPerDet;
+
+ fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+ fVal += tex2D(gT_volumeTexture, fS, fP);
+ fP += fSubDetStep;
+ }
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+ } else {
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ fVal += tex2D(gT_volumeTexture, fS, fP);
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+
+ }
+
+ D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+}
+
+
+// projection for angles that are roughly vertical
+// (detector roughly horizontal)
+__global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+{
+ const int relDet = threadIdx.x;
+ const int relAngle = threadIdx.y;
+
+ int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+ if (angle >= endAngle)
+ return;
+
+ const float theta = gC_angle[angle];
+ const float cos_theta = __cosf(theta);
+ const float sin_theta = __sinf(theta);
+
+ // compute start detector for this block/angle:
+ const int detRegion = blockIdx.y;
+
+ const int detector = detRegion * g_detBlockSize + relDet;
+
+ // Now project the part of the ray to angle,detector through
+ // slices startSlice to startSlice+g_blockSlices-1
+
+ if (detector < 0 || detector >= dims.iProjDets)
+ return;
+
+ const float fDetStep = dims.fDetScale / cos_theta;
+ float fSliceStep = sin_theta / cos_theta;
+ float fDistCorr;
+ if (cos_theta < 0.0f)
+ fDistCorr = -fDetStep;
+ else
+ fDistCorr = fDetStep;
+ fDistCorr *= outputScale;
+
+ float fVal = 0.0f;
+ float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
+ float fS = startSlice+1.5f;
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > dims.iVolHeight)
+ endSlice = dims.iVolHeight;
+
+ if (dims.iRaysPerDet > 1) {
+
+ fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
+ const float fSubDetStep = fDetStep / dims.iRaysPerDet;
+ fDistCorr /= dims.iRaysPerDet;
+
+ fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+ fVal += tex2D(gT_volumeTexture, fP, fS);
+ fP += fSubDetStep;
+ }
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+ } else {
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ fVal += tex2D(gT_volumeTexture, fP, fS);
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+ }
+
+ D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+}
+
+
+
+bool FP_simple(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const float* angles,
+ const float* TOffsets, float outputScale)
+{
+ // TODO: load angles into constant memory in smaller blocks
+ assert(dims.iProjAngles <= g_MaxAngles);
+
+ cudaArray* D_dataArray;
+ bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+ cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+ if (TOffsets) {
+ cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+ } else {
+ if (!g_pfZeroes) {
+ g_pfZeroes = new float[g_MaxAngles];
+ memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float));
+ }
+ cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+ }
+
+ dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // detector block size, angles
+
+ std::list<cudaStream_t> streams;
+
+
+ // Run over all angles, grouping them into groups of the same
+ // orientation (roughly horizontal vs. roughly vertical).
+ // Start a stream of grids for each such group.
+
+ // TODO: Check if it's worth it to store this info instead
+ // of recomputing it every FP.
+
+ unsigned int blockStart = 0;
+ unsigned int blockEnd = 0;
+ bool blockVertical = false;
+ for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+ bool vertical;
+ // TODO: Having <= instead of < below causes a 5% speedup.
+ // Maybe we should detect corner cases and put them in the optimal
+ // group of angles.
+ if (a != dims.iProjAngles)
+ vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a])));
+ if (a == dims.iProjAngles || vertical != blockVertical) {
+ // block done
+
+ blockEnd = a;
+ if (blockStart != blockEnd) {
+ dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
+ (dims.iProjDets+g_detBlockSize-1)/g_detBlockSize); // angle blocks, detector blocks
+
+ // TODO: check if we can't immediately
+ // destroy the stream after use
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+ streams.push_back(stream);
+ //printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical);
+ if (!blockVertical)
+ for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices)
+ FPhorizontal_simple<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
+ else
+ for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
+ FPvertical_simple<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
+ }
+ blockVertical = vertical;
+ blockStart = a;
+ }
+ }
+
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+ cudaStreamDestroy(*iter);
+
+ streams.clear();
+
+ cudaThreadSynchronize();
+
+ cudaTextForceKernelsCompletion();
+
+ cudaFreeArray(D_dataArray);
+
+
+ return true;
+}
+
+bool FP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const float* angles,
+ const float* TOffsets, float outputScale)
+{
+ return FP_simple(D_volumeData, volumePitch, D_projData, projPitch,
+ dims, angles, TOffsets, outputScale);
+
+ // TODO: Fix bug in this non-simple FP with large detscale and TOffsets
+#if 0
+
+ // TODO: load angles into constant memory in smaller blocks
+ assert(dims.iProjAngles <= g_MaxAngles);
+
+ // TODO: compute region size dynamically to resolve these two assumptions
+ // ASSUMPTION: 16 > regionOffset / fDetScale
+ const unsigned int g_blockSliceSize = g_detBlockSize;
+ assert(16 > (g_blockSlices / g_blockSliceSize) / dims.fDetScale);
+ // ASSUMPTION: fDetScale >= 1.0f
+ assert(dims.fDetScale > 0.9999f);
+
+ cudaArray* D_dataArray;
+ bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+ cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+ if (TOffsets) {
+ cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+ } else {
+ if (!g_pfZeroes) {
+ g_pfZeroes = new float[g_MaxAngles];
+ memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float));
+ }
+ cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+ }
+
+ int regionOffset = g_blockSlices / g_blockSliceSize;
+
+ dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles
+
+ std::list<cudaStream_t> streams;
+
+
+ // Run over all angles, grouping them into groups of the same
+ // orientation (roughly horizontal vs. roughly vertical).
+ // Start a stream of grids for each such group.
+
+ // TODO: Check if it's worth it to store this info instead
+ // of recomputing it every FP.
+
+ unsigned int blockStart = 0;
+ unsigned int blockEnd = 0;
+ bool blockVertical = false;
+ for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+ bool vertical;
+ // TODO: Having <= instead of < below causes a 5% speedup.
+ // Maybe we should detect corner cases and put them in the optimal
+ // group of angles.
+ if (a != dims.iProjAngles)
+ vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a])));
+ if (a == dims.iProjAngles || vertical != blockVertical) {
+ // block done
+
+ blockEnd = a;
+ if (blockStart != blockEnd) {
+ unsigned int length = dims.iVolHeight;
+ if (blockVertical)
+ length = dims.iVolWidth;
+ dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
+ (length+g_blockSliceSize-1)/g_blockSliceSize+2*regionOffset); // angle blocks, regions
+ // TODO: check if we can't immediately
+ // destroy the stream after use
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+ streams.push_back(stream);
+ //printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical);
+ if (!blockVertical)
+ for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices)
+ FPhorizontal<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale);
+ else
+ for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
+ FPvertical<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale);
+ }
+ blockVertical = vertical;
+ blockStart = a;
+ }
+ }
+
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+ cudaStreamDestroy(*iter);
+
+ streams.clear();
+
+ cudaThreadSynchronize();
+
+ cudaTextForceKernelsCompletion();
+
+ cudaFreeArray(D_dataArray);
+
+
+ return true;
+#endif
+}
+
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+int main()
+{
+ float* D_volumeData;
+ float* D_projData;
+
+ SDimensions dims;
+ dims.iVolWidth = 1024;
+ dims.iVolHeight = 1024;
+ dims.iProjAngles = 512;
+ dims.iProjDets = 1536;
+ dims.fDetScale = 1.0f;
+ dims.iRaysPerDet = 1;
+ unsigned int volumePitch, projPitch;
+
+ allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ printf("pitch: %u\n", volumePitch);
+
+ allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+ printf("pitch: %u\n", projPitch);
+
+ unsigned int y, x;
+ float* img = loadImage("phantom.png", y, x);
+
+ float* sino = new float[dims.iProjAngles * dims.iProjDets];
+
+ memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float));
+
+ copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+ copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+ float* angle = new float[dims.iProjAngles];
+
+ for (unsigned int i = 0; i < dims.iProjAngles; ++i)
+ angle[i] = i*(M_PI/dims.iProjAngles);
+
+ FP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0, 1.0f);
+
+ delete[] angle;
+
+ copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+ float s = 0.0f;
+ for (unsigned int y = 0; y < dims.iProjAngles; ++y)
+ for (unsigned int x = 0; x < dims.iProjDets; ++x)
+ s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];
+ printf("cpu norm: %f\n", s);
+
+ //zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+ s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+ printf("gpu norm: %f\n", s);
+
+ saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino);
+
+
+ return 0;
+}
+#endif
diff --git a/cuda/2d/par_fp.h b/cuda/2d/par_fp.h
new file mode 100644
index 0000000..3213b14
--- /dev/null
+++ b/cuda/2d/par_fp.h
@@ -0,0 +1,41 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_PAR_FP_H
+#define _CUDA_PAR_FP_H
+
+namespace astraCUDA {
+
+_AstraExport bool FP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const float* angles,
+ const float* TOffsets, float fOutputScale);
+
+}
+
+#endif
diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu
new file mode 100644
index 0000000..a40176d
--- /dev/null
+++ b/cuda/2d/sart.cu
@@ -0,0 +1,283 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "sart.h"
+#include "util.h"
+#include "arith.h"
+#include "fan_fp.h"
+#include "fan_bp.h"
+#include "par_fp.h"
+#include "par_bp.h"
+
+namespace astraCUDA {
+
+
+__global__ void devMUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ // Copy result down and left one pixel.
+ pfOut[x + pitch] = pfOut[x + 1] * pfIn[x + 1];
+}
+
+void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, 1);
+
+ devMUL_SART<<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width);
+
+ cudaTextForceKernelsCompletion();
+}
+
+
+
+SART::SART() : ReconAlgo()
+{
+ D_projData = 0;
+ D_tmpData = 0;
+
+ D_lineWeight = 0;
+
+ projectionOrder = 0;
+ projectionCount = 0;
+ iteration = 0;
+ customOrder = false;
+}
+
+
+SART::~SART()
+{
+ reset();
+}
+
+void SART::reset()
+{
+ cudaFree(D_projData);
+ cudaFree(D_tmpData);
+ cudaFree(D_lineWeight);
+
+ D_projData = 0;
+ D_tmpData = 0;
+
+ D_lineWeight = 0;
+
+ useVolumeMask = false;
+ useSinogramMask = false;
+
+ if (projectionOrder != NULL) delete[] projectionOrder;
+ projectionOrder = 0;
+ projectionCount = 0;
+ iteration = 0;
+ customOrder = false;
+
+ ReconAlgo::reset();
+}
+
+bool SART::init()
+{
+ if (useVolumeMask) {
+ allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ }
+
+ // HACK: D_projData consists of two lines. The first is used padded,
+ // the second unpadded. This is to satisfy the alignment requirements
+ // of resp. FP and BP_SART.
+ allocateVolume(D_projData, dims.iProjDets+2, 2, projPitch);
+ zeroVolume(D_projData, projPitch, dims.iProjDets+2, 1);
+
+ allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch);
+ zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+
+ // We can't precompute lineWeights when using a mask
+ if (!useVolumeMask)
+ precomputeWeights();
+
+ // TODO: check if allocations succeeded
+ return true;
+}
+
+bool SART::setProjectionOrder(int* _projectionOrder, int _projectionCount)
+{
+ customOrder = true;
+ projectionCount = _projectionCount;
+ projectionOrder = new int[projectionCount];
+ for (int i = 0; i < projectionCount; i++) {
+ projectionOrder[i] = _projectionOrder[i];
+ }
+
+ return true;
+}
+
+
+bool SART::precomputeWeights()
+{
+ zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+ if (useVolumeMask) {
+ callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f);
+ } else {
+ // Allocate tmpData temporarily
+ allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+
+ processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f);
+
+
+ cudaFree(D_tmpData);
+ D_tmpData = 0;
+ }
+ processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
+
+ return true;
+}
+
+bool SART::iterate(unsigned int iterations)
+{
+ shouldAbort = false;
+
+ if (useVolumeMask)
+ precomputeWeights();
+
+ // iteration
+ for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
+
+ int angle;
+ if (customOrder) {
+ angle = projectionOrder[iteration % projectionCount];
+ } else {
+ angle = iteration % dims.iProjAngles;
+ }
+
+ // copy one line of sinogram to projection data
+ cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData + angle*sinoPitch, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), 1, cudaMemcpyDeviceToDevice);
+
+ // do FP, subtracting projection from sinogram
+ if (useVolumeMask) {
+ cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
+ processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ callFP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle, -1.0f);
+ } else {
+ callFP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle, -1.0f);
+ }
+
+ MUL_SART(D_projData, D_lineWeight + angle*linePitch, projPitch, dims.iProjDets);
+
+ if (useVolumeMask) {
+ // BP, mask, and add back
+ // TODO: Try putting the masking directly in the BP
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ callBP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle);
+ processVol<opAddMul, VOL>(D_volumeData, D_maskData, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ } else {
+ callBP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle);
+ }
+
+ if (useMinConstraint)
+ processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ if (useMaxConstraint)
+ processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+
+ iteration++;
+
+ }
+
+ return true;
+}
+
+float SART::computeDiffNorm()
+{
+ unsigned int pPitch;
+ float *D_p;
+ allocateVolume(D_p, dims.iProjDets+2, dims.iProjAngles, pPitch);
+ zeroVolume(D_p, pPitch, dims.iProjDets+2, dims.iProjAngles);
+
+ // copy sinogram to D_p
+ cudaMemcpy2D(D_p, sizeof(float)*pPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+
+ // do FP, subtracting projection from sinogram
+ if (useVolumeMask) {
+ cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
+ processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
+ } else {
+ callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
+ }
+
+
+ // compute norm of D_p
+ float s = dotProduct2D(D_p, pPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+
+ cudaFree(D_p);
+
+ return sqrt(s);
+}
+
+bool SART::callFP_SART(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ unsigned int angle, float outputScale)
+{
+ SDimensions d = dims;
+ d.iProjAngles = 1;
+ if (angles) {
+ assert(!fanProjs);
+ return FP(D_volumeData, volumePitch, D_projData, projPitch,
+ d, &angles[angle], TOffsets, outputScale);
+ } else {
+ assert(fanProjs);
+ return FanFP(D_volumeData, volumePitch, D_projData, projPitch,
+ d, &fanProjs[angle], outputScale);
+ }
+}
+
+bool SART::callBP_SART(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ unsigned int angle)
+{
+ if (angles) {
+ assert(!fanProjs);
+ return BP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch,
+ angle, dims, angles, TOffsets);
+ } else {
+ assert(fanProjs);
+ return FanBP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch,
+ angle, dims, fanProjs);
+ }
+
+}
+
+
+}
+
+
diff --git a/cuda/2d/sart.h b/cuda/2d/sart.h
new file mode 100644
index 0000000..ad80259
--- /dev/null
+++ b/cuda/2d/sart.h
@@ -0,0 +1,85 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_SART_H
+#define _CUDA_SART_H
+
+#include "util.h"
+#include "algo.h"
+
+namespace astraCUDA {
+
+class _AstraExport SART : public ReconAlgo {
+public:
+ SART();
+ ~SART();
+
+ // disable some features
+ virtual bool enableSinogramMask() { return false; }
+
+ virtual bool init();
+
+ virtual bool setProjectionOrder(int* projectionOrder, int projectionCount);
+
+ virtual bool iterate(unsigned int iterations);
+
+ virtual float computeDiffNorm();
+
+protected:
+ void reset();
+ bool precomputeWeights();
+
+ bool callFP_SART(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ unsigned int angle, float outputScale);
+ bool callBP_SART(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ unsigned int angle);
+
+
+ // projection angle variables
+ bool customOrder;
+ int* projectionOrder;
+ int projectionCount;
+ int iteration;
+
+ // Temporary buffers
+ float* D_projData;
+ unsigned int projPitch;
+
+ float* D_tmpData; // Only used when there's a volume mask
+ unsigned int tmpPitch;
+
+ // Geometry-specific precomputed data
+ float* D_lineWeight;
+ unsigned int linePitch;
+};
+
+}
+
+#endif
diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu
new file mode 100644
index 0000000..31954e4
--- /dev/null
+++ b/cuda/2d/sirt.cu
@@ -0,0 +1,342 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "sirt.h"
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+namespace astraCUDA {
+
+SIRT::SIRT() : ReconAlgo()
+{
+ D_projData = 0;
+ D_tmpData = 0;
+
+ D_lineWeight = 0;
+ D_pixelWeight = 0;
+
+ D_minMaskData = 0;
+ D_maxMaskData = 0;
+
+ freeMinMaxMasks = false;
+}
+
+
+SIRT::~SIRT()
+{
+ reset();
+}
+
+void SIRT::reset()
+{
+ cudaFree(D_projData);
+ cudaFree(D_tmpData);
+ cudaFree(D_lineWeight);
+ cudaFree(D_pixelWeight);
+ if (freeMinMaxMasks) {
+ cudaFree(D_minMaskData);
+ cudaFree(D_maxMaskData);
+ }
+
+ D_projData = 0;
+ D_tmpData = 0;
+
+ D_lineWeight = 0;
+ D_pixelWeight = 0;
+
+ freeMinMaxMasks = false;
+ D_minMaskData = 0;
+ D_maxMaskData = 0;
+
+ useVolumeMask = false;
+ useSinogramMask = false;
+
+ ReconAlgo::reset();
+}
+
+bool SIRT::init()
+{
+ allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch);
+ zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+ allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+ allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+ zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+
+ allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch);
+ zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+
+ // We can't precompute lineWeights and pixelWeights when using a mask
+ if (!useVolumeMask && !useSinogramMask)
+ precomputeWeights();
+
+ // TODO: check if allocations succeeded
+ return true;
+}
+
+bool SIRT::precomputeWeights()
+{
+ zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+ if (useVolumeMask) {
+ callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f);
+ } else {
+ processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f);
+ }
+ processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
+
+ if (useSinogramMask) {
+ // scale line weights with sinogram mask to zero out masked sinogram pixels
+ processVol<opMul, SINO>(D_lineWeight, D_smaskData, linePitch, dims.iProjDets, dims.iProjAngles);
+ }
+
+
+ zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ if (useSinogramMask) {
+ callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch);
+ } else {
+ processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles);
+ callBP(D_pixelWeight, pixelPitch, D_projData, projPitch);
+ }
+ processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+
+ if (useVolumeMask) {
+ // scale pixel weights with mask to zero out masked pixels
+ processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+ }
+
+ return true;
+}
+
+bool SIRT::setMinMaxMasks(float* D_minMaskData_, float* D_maxMaskData_,
+ unsigned int iPitch)
+{
+ D_minMaskData = D_minMaskData_;
+ D_maxMaskData = D_maxMaskData_;
+ minMaskPitch = iPitch;
+ maxMaskPitch = iPitch;
+
+ freeMinMaxMasks = false;
+ return true;
+}
+
+bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskData,
+ unsigned int iPitch)
+{
+ freeMinMaxMasks = true;
+ bool ok = true;
+ if (pfMinMaskData) {
+ allocateVolume(D_minMaskData, dims.iVolWidth+2, dims.iVolHeight+2, minMaskPitch);
+ ok = copyVolumeToDevice(pfMinMaskData, iPitch,
+ dims.iVolWidth, dims.iVolHeight,
+ D_minMaskData, minMaskPitch);
+ }
+ if (!ok)
+ return false;
+
+ if (pfMaxMaskData) {
+ allocateVolume(D_maxMaskData, dims.iVolWidth+2, dims.iVolHeight+2, maxMaskPitch);
+ ok = copyVolumeToDevice(pfMaxMaskData, iPitch,
+ dims.iVolWidth, dims.iVolHeight,
+ D_maxMaskData, maxMaskPitch);
+ }
+ if (!ok)
+ return false;
+
+ return true;
+}
+
+bool SIRT::iterate(unsigned int iterations)
+{
+ shouldAbort = false;
+
+ if (useVolumeMask || useSinogramMask)
+ precomputeWeights();
+
+ // iteration
+ for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
+
+ // copy sinogram to projection data
+ cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+
+ // do FP, subtracting projection from sinogram
+ if (useVolumeMask) {
+ cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
+ processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
+ } else {
+ callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
+ }
+
+ processVol<opMul, SINO>(D_projData, D_lineWeight, projPitch, dims.iProjDets, dims.iProjAngles);
+
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+ callBP(D_tmpData, tmpPitch, D_projData, projPitch);
+
+ processVol<opAddMul, VOL>(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+
+ if (useMinConstraint)
+ processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ if (useMaxConstraint)
+ processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ if (D_minMaskData)
+ processVol<opClampMinMask, VOL>(D_volumeData, D_minMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ if (D_maxMaskData)
+ processVol<opClampMaxMask, VOL>(D_volumeData, D_maxMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ }
+
+ return true;
+}
+
+float SIRT::computeDiffNorm()
+{
+ // copy sinogram to projection data
+ cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+
+ // do FP, subtracting projection from sinogram
+ if (useVolumeMask) {
+ cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
+ processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
+ } else {
+ callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
+ }
+
+
+ // compute norm of D_projData
+
+ float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+
+ return sqrt(s);
+}
+
+
+bool doSIRT(float* D_volumeData, unsigned int volumePitch,
+ float* D_sinoData, unsigned int sinoPitch,
+ float* D_maskData, unsigned int maskPitch,
+ const SDimensions& dims, const float* angles,
+ const float* TOffsets, unsigned int iterations)
+{
+ SIRT sirt;
+ bool ok = true;
+
+ ok &= sirt.setGeometry(dims, angles);
+ if (D_maskData)
+ ok &= sirt.enableVolumeMask();
+ if (TOffsets)
+ ok &= sirt.setTOffsets(TOffsets);
+
+ if (!ok)
+ return false;
+
+ ok = sirt.init();
+ if (!ok)
+ return false;
+
+ if (D_maskData)
+ ok &= sirt.setVolumeMask(D_maskData, maskPitch);
+
+ ok &= sirt.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
+ if (!ok)
+ return false;
+
+ ok = sirt.iterate(iterations);
+
+ return ok;
+}
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+int main()
+{
+ float* D_volumeData;
+ float* D_sinoData;
+
+ SDimensions dims;
+ dims.iVolWidth = 1024;
+ dims.iVolHeight = 1024;
+ dims.iProjAngles = 512;
+ dims.iProjDets = 1536;
+ dims.fDetScale = 1.0f;
+ dims.iRaysPerDet = 1;
+ unsigned int volumePitch, sinoPitch;
+
+ allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ printf("pitch: %u\n", volumePitch);
+
+ allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+ zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+ printf("pitch: %u\n", sinoPitch);
+
+ unsigned int y, x;
+ float* sino = loadImage("sino.png", y, x);
+
+ float* img = new float[dims.iVolWidth*dims.iVolHeight];
+
+ copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch);
+
+ float* angle = new float[dims.iProjAngles];
+
+ for (unsigned int i = 0; i < dims.iProjAngles; ++i)
+ angle[i] = i*(M_PI/dims.iProjAngles);
+
+ SIRT sirt;
+
+ sirt.setGeometry(dims, angle);
+ sirt.init();
+
+ sirt.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
+
+ sirt.iterate(25);
+
+
+ delete[] angle;
+
+ copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+
+ saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
+
+ return 0;
+}
+#endif
+
diff --git a/cuda/2d/sirt.h b/cuda/2d/sirt.h
new file mode 100644
index 0000000..5592616
--- /dev/null
+++ b/cuda/2d/sirt.h
@@ -0,0 +1,90 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_SIRT_H
+#define _CUDA_SIRT_H
+
+#include "util.h"
+#include "algo.h"
+
+namespace astraCUDA {
+
+class _AstraExport SIRT : public ReconAlgo {
+public:
+ SIRT();
+ ~SIRT();
+
+ virtual bool init();
+
+ // Set min/max masks to existing GPU memory buffers
+ bool setMinMaxMasks(float* D_minMaskData, float* D_maxMaskData,
+ unsigned int pitch);
+
+ // Set min/max masks from RAM buffers
+ bool uploadMinMaxMasks(const float* minMaskData, const float* maxMaskData,
+ unsigned int pitch);
+
+ virtual bool iterate(unsigned int iterations);
+
+ virtual float computeDiffNorm();
+
+protected:
+ void reset();
+ bool precomputeWeights();
+
+ // Temporary buffers
+ float* D_projData;
+ unsigned int projPitch;
+
+ float* D_tmpData;
+ unsigned int tmpPitch;
+
+ // Geometry-specific precomputed data
+ float* D_lineWeight;
+ unsigned int linePitch;
+
+ float* D_pixelWeight;
+ unsigned int pixelPitch;
+
+ // Masks
+ bool freeMinMaxMasks;
+ float* D_minMaskData;
+ unsigned int minMaskPitch;
+ float* D_maxMaskData;
+ unsigned int maxMaskPitch;
+};
+
+bool doSIRT(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ float* D_maskData, unsigned int maskPitch,
+ const SDimensions& dims, const float* angles,
+ const float* TOffsets, unsigned int iterations);
+
+}
+
+#endif
diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu
new file mode 100644
index 0000000..06f6714
--- /dev/null
+++ b/cuda/2d/util.cu
@@ -0,0 +1,244 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include "util.h"
+
+namespace astraCUDA {
+
+bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch,
+ unsigned int width, unsigned int height,
+ float* outD_data, unsigned int out_pitch)
+{
+ // TODO: a full memset isn't necessary. Only the edges.
+ cudaError_t err;
+ err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, sizeof(float)*(width+2), height+2);
+ ASTRA_CUDA_ASSERT(err);
+ err = cudaMemcpy2D(outD_data + out_pitch + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
+ ASTRA_CUDA_ASSERT(err);
+ assert(err == cudaSuccess);
+ return true;
+}
+
+bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch,
+ unsigned int width, unsigned int height,
+ float* inD_data, unsigned int in_pitch)
+{
+ cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + (in_pitch + 1), sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
+ ASTRA_CUDA_ASSERT(err);
+ return true;
+}
+
+
+bool copySinogramFromDevice(float* out_data, unsigned int out_pitch,
+ unsigned int width, unsigned int height,
+ float* inD_data, unsigned int in_pitch)
+{
+ cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + 1, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
+ ASTRA_CUDA_ASSERT(err);
+ return true;
+}
+
+bool copySinogramToDevice(const float* in_data, unsigned int in_pitch,
+ unsigned int width, unsigned int height,
+ float* outD_data, unsigned int out_pitch)
+{
+ // TODO: a full memset isn't necessary. Only the edges.
+ cudaError_t err;
+ err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, (width+2)*sizeof(float), height);
+ ASTRA_CUDA_ASSERT(err);
+ err = cudaMemcpy2D(outD_data + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
+ ASTRA_CUDA_ASSERT(err);
+ return true;
+}
+
+
+bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsigned int& pitch)
+{
+ size_t p;
+ cudaError_t ret = cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height);
+ if (ret != cudaSuccess) {
+ reportCudaError(ret);
+ fprintf(stderr, "Failed to allocate %dx%d GPU buffer\n", width, height);
+ return false;
+ }
+
+ assert(p % sizeof(float) == 0);
+
+ pitch = p / sizeof(float);
+
+ return true;
+}
+
+void zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ cudaError_t err;
+ err = cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height);
+ ASTRA_CUDA_ASSERT(err);
+}
+
+
+template <unsigned int blockSize>
+__global__ void reduce1D(float *g_idata, float *g_odata, unsigned int n)
+{
+ extern __shared__ float sdata[];
+ unsigned int tid = threadIdx.x;
+
+ unsigned int i = blockIdx.x*(blockSize*2) + tid;
+ unsigned int gridSize = blockSize*gridDim.x;
+ sdata[tid] = 0;
+ while (i < n) { sdata[tid] += g_idata[i]; i += gridSize; }
+ __syncthreads();
+ if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
+ if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
+ if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
+ if (tid < 32) {
+ volatile float* smem = sdata;
+ if (blockSize >= 64) smem[tid] += smem[tid + 32];
+ if (blockSize >= 32) smem[tid] += smem[tid + 16];
+ if (blockSize >= 16) smem[tid] += smem[tid + 8];
+ if (blockSize >= 8) smem[tid] += smem[tid + 4];
+ if (blockSize >= 4) smem[tid] += smem[tid + 2];
+ if (blockSize >= 2) smem[tid] += smem[tid + 1];
+ }
+ if (tid == 0) g_odata[blockIdx.x] = sdata[0];
+}
+
+__global__ void reduce2D(float *g_idata, float *g_odata,
+ unsigned int pitch,
+ unsigned int nx, unsigned int ny,
+ unsigned int padX, unsigned int padY)
+{
+ extern __shared__ float sdata[];
+ const unsigned int tidx = threadIdx.x;
+ const unsigned int tidy = threadIdx.y;
+ const unsigned int tid = tidy * 16 + tidx;
+
+ unsigned int x = blockIdx.x*16 + tidx;
+ unsigned int y = blockIdx.y*16 + tidy;
+
+ sdata[tid] = 0;
+
+ if (x >= padX && x < padX + nx) {
+
+ while (y < padY + ny) {
+ if (y >= padY)
+ sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]);
+ y += 16 * gridDim.y;
+ }
+
+ }
+
+ __syncthreads();
+
+ if (tid < 128)
+ sdata[tid] += sdata[tid + 128];
+ __syncthreads();
+
+ if (tid < 64)
+ sdata[tid] += sdata[tid + 64];
+ __syncthreads();
+
+ if (tid < 32) { // 32 is warp size
+ volatile float* smem = sdata;
+ smem[tid] += smem[tid + 32];
+ smem[tid] += smem[tid + 16];
+ smem[tid] += smem[tid + 8];
+ smem[tid] += smem[tid + 4];
+ smem[tid] += smem[tid + 2];
+ smem[tid] += smem[tid + 1];
+ }
+
+ if (tid == 0)
+ g_odata[blockIdx.y * gridDim.x + blockIdx.x] = sdata[0];
+}
+
+float dotProduct2D(float* D_data, unsigned int pitch,
+ unsigned int width, unsigned int height,
+ unsigned int padX, unsigned int padY)
+{
+ unsigned int bx = ((width+padX) + 15) / 16;
+ unsigned int by = ((height+padY) + 127) / 128;
+ unsigned int shared_mem2 = sizeof(float) * 16 * 16;
+
+ dim3 dimBlock2(16, 16);
+ dim3 dimGrid2(bx, by);
+
+ float* D_buf;
+ cudaMalloc(&D_buf, sizeof(float) * (bx * by + 1) );
+
+ // Step 1: reduce 2D from image to a single vector, taking sum of squares
+
+ reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height, padX, padY);
+ cudaTextForceKernelsCompletion();
+
+ // Step 2: reduce 1D: add up elements in vector
+ if (bx * by > 512)
+ reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_buf+(bx*by), bx*by);
+ else if (bx * by > 128)
+ reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_buf+(bx*by), bx*by);
+ else if (bx * by > 32)
+ reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_buf+(bx*by), bx*by);
+ else if (bx * by > 8)
+ reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_buf+(bx*by), bx*by);
+ else
+ reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_buf+(bx*by), bx*by);
+
+ float x;
+ cudaMemcpy(&x, D_buf+(bx*by), 4, cudaMemcpyDeviceToHost);
+
+ cudaTextForceKernelsCompletion();
+
+ cudaFree(D_buf);
+
+ return x;
+}
+
+
+bool cudaTextForceKernelsCompletion()
+{
+ cudaError_t returnedCudaError = cudaThreadSynchronize();
+
+ if(returnedCudaError != cudaSuccess) {
+ fprintf(stderr, "Failed to force completion of cuda kernels: %d: %s.\n", returnedCudaError, cudaGetErrorString(returnedCudaError));
+ return false;
+ }
+
+ return true;
+}
+
+void reportCudaError(cudaError_t err)
+{
+ if(err != cudaSuccess)
+ fprintf(stderr, "CUDA error %d: %s.\n", err, cudaGetErrorString(err));
+}
+
+
+
+}
diff --git a/cuda/2d/util.h b/cuda/2d/util.h
new file mode 100644
index 0000000..d31e2eb
--- /dev/null
+++ b/cuda/2d/util.h
@@ -0,0 +1,90 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_UTIL_H
+#define _CUDA_UTIL_H
+
+#include <cuda.h>
+#include <driver_types.h>
+
+#ifdef _MSC_VER
+
+#ifdef DLL_EXPORTS
+#define _AstraExport __declspec(dllexport)
+#define EXPIMP_TEMPLATE
+#else
+#define _AstraExport __declspec(dllimport)
+#define EXPIMP_TEMPLATE extern
+#endif
+
+#else
+
+#define _AstraExport
+
+#endif
+
+#include "dims.h"
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+#define ASTRA_CUDA_ASSERT(err) do { if (err != cudaSuccess) { astraCUDA::reportCudaError(err); assert(err == cudaSuccess); } } while(0)
+
+
+namespace astraCUDA {
+
+bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch,
+ unsigned int width, unsigned int height,
+ float* outD_data, unsigned int out_pitch);
+bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch,
+ unsigned int width, unsigned int height,
+ float* inD_data, unsigned int in_pitch);
+bool copySinogramFromDevice(float* out_data, unsigned int out_pitch,
+ unsigned int width, unsigned int height,
+ float* inD_data, unsigned int in_pitch);
+bool copySinogramToDevice(const float* in_data, unsigned int in_pitch,
+ unsigned int width, unsigned int height,
+ float* outD_data, unsigned int out_pitch);
+
+bool allocateVolume(float*& D_ptr, unsigned int width, unsigned int height, unsigned int& pitch);
+
+void zeroVolume(float* D_data, unsigned int pitch, unsigned int width, unsigned int height);
+
+bool cudaTextForceKernelsCompletion();
+void reportCudaError(cudaError_t err);
+
+
+
+float dotProduct2D(float* D_data, unsigned int pitch,
+ unsigned int width, unsigned int height,
+ unsigned int padX, unsigned int padY);
+
+}
+
+#endif