diff options
Diffstat (limited to 'cuda/2d/fan_fp.cu')
-rw-r--r-- | cuda/2d/fan_fp.cu | 370 |
1 files changed, 370 insertions, 0 deletions
diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu new file mode 100644 index 0000000..cf9f352 --- /dev/null +++ b/cuda/2d/fan_fp.cu @@ -0,0 +1,370 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + + +typedef texture<float, 2, cudaReadModeElementType> texture2D; + +static texture2D gT_FanVolumeTexture; + + +namespace astraCUDA { + +static const unsigned g_MaxAngles = 2560; +__constant__ float gC_SrcX[g_MaxAngles]; +__constant__ float gC_SrcY[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; + + +// optimization parameters +static const unsigned int g_anglesPerBlock = 16; +static const unsigned int g_detBlockSize = 32; +static const unsigned int g_blockSlices = 64; + +static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + dataArray = 0; + cudaMallocArray(&dataArray, &channelDesc, width, height); + cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice); + + gT_FanVolumeTexture.addressMode[0] = cudaAddressModeClamp; + gT_FanVolumeTexture.addressMode[1] = cudaAddressModeClamp; + gT_FanVolumeTexture.filterMode = cudaFilterModeLinear; + gT_FanVolumeTexture.normalized = false; + + // TODO: For very small sizes (roughly <=512x128) with few angles (<=180) + // not using an array is more efficient. + //cudaBindTexture2D(0, gT_FanVolumeTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch); + cudaBindTextureToArray(gT_FanVolumeTexture, dataArray, channelDesc); + + // TODO: error value? + + return true; +} + +// projection for angles that are roughly horizontal +// (detector roughly vertical) +__global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) +{ + float* projData = (float*)D_projData; + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + const int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + if (angle >= endAngle) + return; + + const int detector = blockIdx.y * g_detBlockSize + relDet; + + if (detector < 0 || detector >= dims.iProjDets) + return; + + const float fSrcX = gC_SrcX[angle]; + const float fSrcY = gC_SrcY[angle]; + const float fDetSX = gC_DetSX[angle]; + const float fDetSY = gC_DetSY[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + + float fVal = 0.0f; + + const float fdx = fabsf(fDetSX + detector*fDetUX + 0.5f - fSrcX); + const float fdy = fabsf(fDetSY + detector*fDetUY + 0.5f - fSrcY); + + if (fdy > fdx) + return; + + + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + const float fDet = detector + (0.5f + iSubT) / dims.iRaysPerDet; + + const float fDetX = fDetSX + fDet * fDetUX; + const float fDetY = fDetSY + fDet * fDetUY; + + // ray: y = alpha * x + beta + const float alpha = (fSrcY - fDetY) / (fSrcX - fDetX); + const float beta = fSrcY - alpha * fSrcX; + + const float fDistCorr = sqrt(alpha*alpha+1.0f) * outputScale / dims.iRaysPerDet; + + // intersect ray with first slice + + float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 1.5f; + float fX = startSlice + 1.5f; + + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolWidth) + endSlice = dims.iVolWidth; + + float fV = 0.0f; + for (int slice = startSlice; slice < endSlice; ++slice) + { + fV += tex2D(gT_FanVolumeTexture, fX, fY); + fY -= alpha; + fX += 1.0f; + } + + fVal += fV * fDistCorr; + + } + + projData[angle*projPitch+detector+1] += fVal; +} + + +// projection for angles that are roughly vertical +// (detector roughly horizontal) +__global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) +{ + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + const int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + + if (angle >= endAngle) + return; + + const int detector = blockIdx.y * g_detBlockSize + relDet; + + if (detector < 0 || detector >= dims.iProjDets) + return; + + float* projData = (float*)D_projData; + + const float fSrcX = gC_SrcX[angle]; + const float fSrcY = gC_SrcY[angle]; + const float fDetSX = gC_DetSX[angle]; + const float fDetSY = gC_DetSY[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + + float fVal = 0.0f; + + const float fdx = fabsf(fDetSX + detector*fDetUX + 0.5f - fSrcX); + const float fdy = fabsf(fDetSY + detector*fDetUY + 0.5f - fSrcY); + + if (fdy <= fdx) + return; + + + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + const float fDet = detector + (0.5f + iSubT) / dims.iRaysPerDet /*- gC_angle_offset[angle]*/; + + const float fDetX = fDetSX + fDet * fDetUX; + const float fDetY = fDetSY + fDet * fDetUY; + + // ray: x = alpha * y + beta + const float alpha = (fSrcX - fDetX) / (fSrcY - fDetY); + const float beta = fSrcX - alpha * fSrcY; + + const float fDistCorr = sqrt(alpha*alpha+1) * outputScale / dims.iRaysPerDet; + + // intersect ray with first slice + + float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 1.5f; + float fY = startSlice + 1.5f; + + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolHeight) + endSlice = dims.iVolHeight; + + float fV = 0.0f; + + for (int slice = startSlice; slice < endSlice; ++slice) + { + fV += tex2D(gT_FanVolumeTexture, fX, fY); + fX -= alpha; + fY += 1.0f; + } + + fVal += fV * fDistCorr; + + } + + projData[angle*projPitch+detector+1] += fVal; +} + +bool FanFP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const SFanProjection* angles, + float outputScale) +{ + // TODO: load angles into constant memory in smaller blocks + assert(dims.iProjAngles <= g_MaxAngles); + + cudaArray* D_dataArray; + bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(SrcX); + TRANSFER_TO_CONSTANT(SrcY); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles + const unsigned int g_blockSliceSize = g_detBlockSize; + + std::list<cudaStream_t> streams; + + + unsigned int blockStart = 0; + unsigned int blockEnd = dims.iProjAngles; + + dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock, + (dims.iProjDets+g_blockSliceSize-1)/g_blockSliceSize); // angle blocks, regions + cudaStream_t stream1; + cudaStreamCreate(&stream1); + streams.push_back(stream1); + for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices) + FanFPhorizontal<<<dimGrid, dimBlock, 0, stream1>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); + + cudaStream_t stream2; + cudaStreamCreate(&stream2); + streams.push_back(stream2); + for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices) + FanFPvertical<<<dimGrid, dimBlock, 0, stream2>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); + + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); + + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + cudaFreeArray(D_dataArray); + + return true; +} + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_projData; + + SDimensions dims; + dims.iVolWidth = 128; + dims.iVolHeight = 128; + dims.iProjAngles = 180; + dims.iProjDets = 256; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + unsigned int volumePitch, projPitch; + + SFanProjection projs[180]; + + projs[0].fSrcX = 0.0f; + projs[0].fSrcY = 1536.0f; + projs[0].fDetSX = 128.0f; + projs[0].fDetSY = -512.0f; + projs[0].fDetUX = -1.0f; + projs[0].fDetUY = 0.0f; + +#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = projs[0].f##name##X * cos(alpha) - projs[0].f##name##Y * sin(alpha); projs[i].f##name##Y = projs[0].f##name##X * sin(alpha) + projs[0].f##name##Y * cos(alpha); } while(0) + + for (int i = 1; i < 180; ++i) { + ROTATE0(Src, i, i*2*M_PI/180); + ROTATE0(DetS, i, i*2*M_PI/180); + ROTATE0(DetU, i, i*2*M_PI/180); + } + +#undef ROTATE0 + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + printf("pitch: %u\n", projPitch); + + unsigned int y, x; + float* img = loadImage("phantom128.png", y, x); + + float* sino = new float[dims.iProjAngles * dims.iProjDets]; + + memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float)); + + copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + float* angle = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) + angle[i] = i*(M_PI/dims.iProjAngles); + + FanFP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs, 1.0f); + + delete[] angle; + + copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + float s = 0.0f; + for (unsigned int y = 0; y < dims.iProjAngles; ++y) + for (unsigned int x = 0; x < dims.iProjDets; ++x) + s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x]; + printf("cpu norm: %f\n", s); + + //zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + printf("gpu norm: %f\n", s); + + saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino); + + + return 0; +} +#endif |