summaryrefslogtreecommitdiffstats
path: root/cuda/2d/fan_fp.cu
diff options
context:
space:
mode:
Diffstat (limited to 'cuda/2d/fan_fp.cu')
-rw-r--r--cuda/2d/fan_fp.cu370
1 files changed, 370 insertions, 0 deletions
diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu
new file mode 100644
index 0000000..cf9f352
--- /dev/null
+++ b/cuda/2d/fan_fp.cu
@@ -0,0 +1,370 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+
+typedef texture<float, 2, cudaReadModeElementType> texture2D;
+
+static texture2D gT_FanVolumeTexture;
+
+
+namespace astraCUDA {
+
+static const unsigned g_MaxAngles = 2560;
+__constant__ float gC_SrcX[g_MaxAngles];
+__constant__ float gC_SrcY[g_MaxAngles];
+__constant__ float gC_DetSX[g_MaxAngles];
+__constant__ float gC_DetSY[g_MaxAngles];
+__constant__ float gC_DetUX[g_MaxAngles];
+__constant__ float gC_DetUY[g_MaxAngles];
+
+
+// optimization parameters
+static const unsigned int g_anglesPerBlock = 16;
+static const unsigned int g_detBlockSize = 32;
+static const unsigned int g_blockSlices = 64;
+
+static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ dataArray = 0;
+ cudaMallocArray(&dataArray, &channelDesc, width, height);
+ cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice);
+
+ gT_FanVolumeTexture.addressMode[0] = cudaAddressModeClamp;
+ gT_FanVolumeTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_FanVolumeTexture.filterMode = cudaFilterModeLinear;
+ gT_FanVolumeTexture.normalized = false;
+
+ // TODO: For very small sizes (roughly <=512x128) with few angles (<=180)
+ // not using an array is more efficient.
+ //cudaBindTexture2D(0, gT_FanVolumeTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch);
+ cudaBindTextureToArray(gT_FanVolumeTexture, dataArray, channelDesc);
+
+ // TODO: error value?
+
+ return true;
+}
+
+// projection for angles that are roughly horizontal
+// (detector roughly vertical)
+__global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+{
+ float* projData = (float*)D_projData;
+ const int relDet = threadIdx.x;
+ const int relAngle = threadIdx.y;
+
+ const int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+ if (angle >= endAngle)
+ return;
+
+ const int detector = blockIdx.y * g_detBlockSize + relDet;
+
+ if (detector < 0 || detector >= dims.iProjDets)
+ return;
+
+ const float fSrcX = gC_SrcX[angle];
+ const float fSrcY = gC_SrcY[angle];
+ const float fDetSX = gC_DetSX[angle];
+ const float fDetSY = gC_DetSY[angle];
+ const float fDetUX = gC_DetUX[angle];
+ const float fDetUY = gC_DetUY[angle];
+
+ float fVal = 0.0f;
+
+ const float fdx = fabsf(fDetSX + detector*fDetUX + 0.5f - fSrcX);
+ const float fdy = fabsf(fDetSY + detector*fDetUY + 0.5f - fSrcY);
+
+ if (fdy > fdx)
+ return;
+
+
+ for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+ const float fDet = detector + (0.5f + iSubT) / dims.iRaysPerDet;
+
+ const float fDetX = fDetSX + fDet * fDetUX;
+ const float fDetY = fDetSY + fDet * fDetUY;
+
+ // ray: y = alpha * x + beta
+ const float alpha = (fSrcY - fDetY) / (fSrcX - fDetX);
+ const float beta = fSrcY - alpha * fSrcX;
+
+ const float fDistCorr = sqrt(alpha*alpha+1.0f) * outputScale / dims.iRaysPerDet;
+
+ // intersect ray with first slice
+
+ float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
+ float fX = startSlice + 1.5f;
+
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > dims.iVolWidth)
+ endSlice = dims.iVolWidth;
+
+ float fV = 0.0f;
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ fV += tex2D(gT_FanVolumeTexture, fX, fY);
+ fY -= alpha;
+ fX += 1.0f;
+ }
+
+ fVal += fV * fDistCorr;
+
+ }
+
+ projData[angle*projPitch+detector+1] += fVal;
+}
+
+
+// projection for angles that are roughly vertical
+// (detector roughly horizontal)
+__global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+{
+ const int relDet = threadIdx.x;
+ const int relAngle = threadIdx.y;
+
+ const int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+ if (angle >= endAngle)
+ return;
+
+ const int detector = blockIdx.y * g_detBlockSize + relDet;
+
+ if (detector < 0 || detector >= dims.iProjDets)
+ return;
+
+ float* projData = (float*)D_projData;
+
+ const float fSrcX = gC_SrcX[angle];
+ const float fSrcY = gC_SrcY[angle];
+ const float fDetSX = gC_DetSX[angle];
+ const float fDetSY = gC_DetSY[angle];
+ const float fDetUX = gC_DetUX[angle];
+ const float fDetUY = gC_DetUY[angle];
+
+ float fVal = 0.0f;
+
+ const float fdx = fabsf(fDetSX + detector*fDetUX + 0.5f - fSrcX);
+ const float fdy = fabsf(fDetSY + detector*fDetUY + 0.5f - fSrcY);
+
+ if (fdy <= fdx)
+ return;
+
+
+ for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+ const float fDet = detector + (0.5f + iSubT) / dims.iRaysPerDet /*- gC_angle_offset[angle]*/;
+
+ const float fDetX = fDetSX + fDet * fDetUX;
+ const float fDetY = fDetSY + fDet * fDetUY;
+
+ // ray: x = alpha * y + beta
+ const float alpha = (fSrcX - fDetX) / (fSrcY - fDetY);
+ const float beta = fSrcX - alpha * fSrcY;
+
+ const float fDistCorr = sqrt(alpha*alpha+1) * outputScale / dims.iRaysPerDet;
+
+ // intersect ray with first slice
+
+ float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
+ float fY = startSlice + 1.5f;
+
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > dims.iVolHeight)
+ endSlice = dims.iVolHeight;
+
+ float fV = 0.0f;
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ fV += tex2D(gT_FanVolumeTexture, fX, fY);
+ fX -= alpha;
+ fY += 1.0f;
+ }
+
+ fVal += fV * fDistCorr;
+
+ }
+
+ projData[angle*projPitch+detector+1] += fVal;
+}
+
+bool FanFP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const SFanProjection* angles,
+ float outputScale)
+{
+ // TODO: load angles into constant memory in smaller blocks
+ assert(dims.iProjAngles <= g_MaxAngles);
+
+ cudaArray* D_dataArray;
+ bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+ // transfer angles to constant memory
+ float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+ TRANSFER_TO_CONSTANT(SrcX);
+ TRANSFER_TO_CONSTANT(SrcY);
+ TRANSFER_TO_CONSTANT(DetSX);
+ TRANSFER_TO_CONSTANT(DetSY);
+ TRANSFER_TO_CONSTANT(DetUX);
+ TRANSFER_TO_CONSTANT(DetUY);
+
+#undef TRANSFER_TO_CONSTANT
+
+ delete[] tmp;
+
+ dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles
+ const unsigned int g_blockSliceSize = g_detBlockSize;
+
+ std::list<cudaStream_t> streams;
+
+
+ unsigned int blockStart = 0;
+ unsigned int blockEnd = dims.iProjAngles;
+
+ dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
+ (dims.iProjDets+g_blockSliceSize-1)/g_blockSliceSize); // angle blocks, regions
+ cudaStream_t stream1;
+ cudaStreamCreate(&stream1);
+ streams.push_back(stream1);
+ for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices)
+ FanFPhorizontal<<<dimGrid, dimBlock, 0, stream1>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
+
+ cudaStream_t stream2;
+ cudaStreamCreate(&stream2);
+ streams.push_back(stream2);
+ for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
+ FanFPvertical<<<dimGrid, dimBlock, 0, stream2>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
+
+ cudaStreamDestroy(stream1);
+ cudaStreamDestroy(stream2);
+
+ cudaThreadSynchronize();
+
+ cudaTextForceKernelsCompletion();
+
+ cudaFreeArray(D_dataArray);
+
+ return true;
+}
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+int main()
+{
+ float* D_volumeData;
+ float* D_projData;
+
+ SDimensions dims;
+ dims.iVolWidth = 128;
+ dims.iVolHeight = 128;
+ dims.iProjAngles = 180;
+ dims.iProjDets = 256;
+ dims.fDetScale = 1.0f;
+ dims.iRaysPerDet = 1;
+ unsigned int volumePitch, projPitch;
+
+ SFanProjection projs[180];
+
+ projs[0].fSrcX = 0.0f;
+ projs[0].fSrcY = 1536.0f;
+ projs[0].fDetSX = 128.0f;
+ projs[0].fDetSY = -512.0f;
+ projs[0].fDetUX = -1.0f;
+ projs[0].fDetUY = 0.0f;
+
+#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = projs[0].f##name##X * cos(alpha) - projs[0].f##name##Y * sin(alpha); projs[i].f##name##Y = projs[0].f##name##X * sin(alpha) + projs[0].f##name##Y * cos(alpha); } while(0)
+
+ for (int i = 1; i < 180; ++i) {
+ ROTATE0(Src, i, i*2*M_PI/180);
+ ROTATE0(DetS, i, i*2*M_PI/180);
+ ROTATE0(DetU, i, i*2*M_PI/180);
+ }
+
+#undef ROTATE0
+
+ allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ printf("pitch: %u\n", volumePitch);
+
+ allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+ printf("pitch: %u\n", projPitch);
+
+ unsigned int y, x;
+ float* img = loadImage("phantom128.png", y, x);
+
+ float* sino = new float[dims.iProjAngles * dims.iProjDets];
+
+ memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float));
+
+ copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+ copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+ float* angle = new float[dims.iProjAngles];
+
+ for (unsigned int i = 0; i < dims.iProjAngles; ++i)
+ angle[i] = i*(M_PI/dims.iProjAngles);
+
+ FanFP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs, 1.0f);
+
+ delete[] angle;
+
+ copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+ float s = 0.0f;
+ for (unsigned int y = 0; y < dims.iProjAngles; ++y)
+ for (unsigned int x = 0; x < dims.iProjDets; ++x)
+ s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];
+ printf("cpu norm: %f\n", s);
+
+ //zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+ s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+ printf("gpu norm: %f\n", s);
+
+ saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino);
+
+
+ return 0;
+}
+#endif