From b1ffc11d930c19bd73af9837a08bc8dde9fe8e72 Mon Sep 17 00:00:00 2001 From: Willem Jan Palenstijn Date: Fri, 29 Jul 2016 12:03:38 +0200 Subject: Add CUDA parvec support --- cuda/2d/algo.cu | 62 ++--- cuda/2d/algo.h | 22 +- cuda/2d/astra.cu | 726 ++++++++------------------------------------------ cuda/2d/astra.h | 156 ++--------- cuda/2d/cgls.cu | 36 --- cuda/2d/dims.h | 5 +- cuda/2d/em.cu | 28 -- cuda/2d/fbp.cu | 347 ++++++++++++++++++++++++ cuda/2d/fbp.h | 98 +++++++ cuda/2d/fbp_filters.h | 4 + cuda/2d/fft.cu | 12 +- cuda/2d/fft.h | 8 +- cuda/2d/par_bp.cu | 166 +++++------- cuda/2d/par_bp.h | 6 +- cuda/2d/par_fp.cu | 360 ++++--------------------- cuda/2d/par_fp.h | 4 +- cuda/2d/sart.cu | 8 +- cuda/2d/sirt.cu | 46 +--- cuda/3d/fdk.cu | 22 +- 19 files changed, 784 insertions(+), 1332 deletions(-) create mode 100644 cuda/2d/fbp.cu create mode 100644 cuda/2d/fbp.h (limited to 'cuda') diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu index dc74e51..bde4f42 100644 --- a/cuda/2d/algo.cu +++ b/cuda/2d/algo.cu @@ -35,13 +35,13 @@ $Id$ #include "fan_bp.h" #include "util.h" #include "arith.h" +#include "astra.h" namespace astraCUDA { ReconAlgo::ReconAlgo() { - angles = 0; - TOffsets = 0; + parProjs = 0; fanProjs = 0; shouldAbort = false; @@ -66,8 +66,7 @@ ReconAlgo::~ReconAlgo() void ReconAlgo::reset() { - delete[] angles; - delete[] TOffsets; + delete[] parProjs; delete[] fanProjs; if (freeGPUMemory) { @@ -77,8 +76,7 @@ void ReconAlgo::reset() cudaFree(D_volumeData); } - angles = 0; - TOffsets = 0; + parProjs = 0; fanProjs = 0; shouldAbort = false; @@ -124,46 +122,40 @@ bool ReconAlgo::enableSinogramMask() } -bool ReconAlgo::setGeometry(const SDimensions& _dims, const float* _angles) +bool ReconAlgo::setGeometry(const astra::CVolumeGeometry2D* pVolGeom, + const astra::CProjectionGeometry2D* pProjGeom) { - dims = _dims; + bool ok; - angles = new float[dims.iProjAngles]; + ok = convertAstraGeometry_dims(pVolGeom, pProjGeom, dims); - memcpy(angles, _angles, sizeof(angles[0]) * dims.iProjAngles); + if (!ok) + return false; + delete[] parProjs; + parProjs = 0; delete[] fanProjs; fanProjs = 0; - return true; -} - -bool ReconAlgo::setFanGeometry(const SDimensions& _dims, - const SFanProjection* _projs) -{ - dims = _dims; - fanProjs = new SFanProjection[dims.iProjAngles]; - - memcpy(fanProjs, _projs, sizeof(fanProjs[0]) * dims.iProjAngles); - - delete[] angles; - angles = 0; + fOutputScale = 1.0f; + ok = convertAstraGeometry(pVolGeom, pProjGeom, parProjs, fanProjs, fOutputScale); + if (!ok) + return false; return true; } - -bool ReconAlgo::setTOffsets(const float* _TOffsets) +bool ReconAlgo::setSuperSampling(int raysPerDet, int raysPerPixelDim) { - // TODO: determine if they're all zero? - TOffsets = new float[dims.iProjAngles]; - memcpy(TOffsets, _TOffsets, sizeof(angles[0]) * dims.iProjAngles); + if (raysPerDet <= 0 || raysPerPixelDim <= 0) + return false; + + dims.iRaysPerDet = raysPerDet; + dims.iRaysPerPixelDim = raysPerPixelDim; return true; } - - bool ReconAlgo::setVolumeMask(float* _D_maskData, unsigned int _maskPitch) { assert(useVolumeMask); @@ -324,14 +316,14 @@ bool ReconAlgo::callFP(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, float outputScale) { - if (angles) { + if (parProjs) { assert(!fanProjs); return FP(D_volumeData, volumePitch, D_projData, projPitch, - dims, angles, TOffsets, outputScale); + dims, parProjs, fOutputScale * outputScale); } else { assert(fanProjs); return FanFP(D_volumeData, volumePitch, D_projData, projPitch, - dims, fanProjs, outputScale); + dims, fanProjs, fOutputScale * outputScale); } } @@ -339,10 +331,10 @@ bool ReconAlgo::callBP(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, float outputScale) { - if (angles) { + if (parProjs) { assert(!fanProjs); return BP(D_volumeData, volumePitch, D_projData, projPitch, - dims, angles, TOffsets, outputScale); + dims, parProjs, outputScale); } else { assert(fanProjs); return FanBP(D_volumeData, volumePitch, D_projData, projPitch, diff --git a/cuda/2d/algo.h b/cuda/2d/algo.h index 99959c8..0b6a066 100644 --- a/cuda/2d/algo.h +++ b/cuda/2d/algo.h @@ -31,6 +31,17 @@ $Id$ #include "util.h" +namespace astra { + +class CParallelProjectionGeometry2D; +class CParallelVecProjectionGeometry2D; +class CFanFlatProjectionGeometry2D; +class CFanFlatVecProjectionGeometry2D; +class CVolumeGeometry2D; +class CProjectionGeometry2D; + +} + namespace astraCUDA { class _AstraExport ReconAlgo { @@ -40,11 +51,10 @@ public: bool setGPUIndex(int iGPUIndex); - bool setGeometry(const SDimensions& dims, const float* angles); - bool setFanGeometry(const SDimensions& dims, const SFanProjection* projs); + bool setGeometry(const astra::CVolumeGeometry2D* pVolGeom, + const astra::CProjectionGeometry2D* pProjGeom); - // setTOffsets should (optionally) be called after setGeometry - bool setTOffsets(const float* TOffsets); + bool setSuperSampling(int raysPerDet, int raysPerPixelDim); void signalAbort() { shouldAbort = true; } @@ -123,9 +133,9 @@ protected: SDimensions dims; - float* angles; - float* TOffsets; + SParProjection* parProjs; SFanProjection* fanProjs; + float fOutputScale; volatile bool shouldAbort; diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu index c1e6566..b39e0a9 100644 --- a/cuda/2d/astra.cu +++ b/cuda/2d/astra.cu @@ -42,8 +42,10 @@ $Id$ #include #include +#include "../../include/astra/GeometryUtil2D.h" #include "../../include/astra/VolumeGeometry2D.h" #include "../../include/astra/ParallelProjectionGeometry2D.h" +#include "../../include/astra/ParallelVecProjectionGeometry2D.h" #include "../../include/astra/FanFlatProjectionGeometry2D.h" #include "../../include/astra/FanFlatVecProjectionGeometry2D.h" @@ -64,515 +66,6 @@ enum CUDAProjectionType { }; -class AstraFBP_internal { -public: - SDimensions dims; - float* angles; - float* TOffsets; - astraCUDA::SFanProjection* fanProjections; - - float fOriginSourceDistance; - float fOriginDetectorDistance; - - float fPixelSize; - - bool bFanBeam; - bool bShortScan; - - bool initialized; - bool setStartReconstruction; - - float* D_sinoData; - unsigned int sinoPitch; - - float* D_volumeData; - unsigned int volumePitch; - - cufftComplex * m_pDevFilter; -}; - -AstraFBP::AstraFBP() -{ - pData = new AstraFBP_internal(); - - pData->angles = 0; - pData->fanProjections = 0; - pData->TOffsets = 0; - pData->D_sinoData = 0; - pData->D_volumeData = 0; - - pData->dims.iVolWidth = 0; - pData->dims.iProjAngles = 0; - pData->dims.fDetScale = 1.0f; - pData->dims.iRaysPerDet = 1; - pData->dims.iRaysPerPixelDim = 1; - - pData->initialized = false; - pData->setStartReconstruction = false; - - pData->m_pDevFilter = NULL; -} - -AstraFBP::~AstraFBP() -{ - delete[] pData->angles; - pData->angles = 0; - - delete[] pData->TOffsets; - pData->TOffsets = 0; - - delete[] pData->fanProjections; - pData->fanProjections = 0; - - cudaFree(pData->D_sinoData); - pData->D_sinoData = 0; - - cudaFree(pData->D_volumeData); - pData->D_volumeData = 0; - - if(pData->m_pDevFilter != NULL) - { - freeComplexOnDevice(pData->m_pDevFilter); - pData->m_pDevFilter = NULL; - } - - delete pData; - pData = 0; -} - -bool AstraFBP::setReconstructionGeometry(unsigned int iVolWidth, - unsigned int iVolHeight, - float fPixelSize) -{ - if (pData->initialized) - return false; - - pData->dims.iVolWidth = iVolWidth; - pData->dims.iVolHeight = iVolHeight; - - pData->fPixelSize = fPixelSize; - - return (iVolWidth > 0 && iVolHeight > 0 && fPixelSize > 0.0f); -} - -bool AstraFBP::setProjectionGeometry(unsigned int iProjAngles, - unsigned int iProjDets, - const float* pfAngles, - float fDetSize) -{ - if (pData->initialized) - return false; - - pData->dims.iProjAngles = iProjAngles; - pData->dims.iProjDets = iProjDets; - pData->dims.fDetScale = fDetSize / pData->fPixelSize; - - if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0) - return false; - - pData->angles = new float[iProjAngles]; - memcpy(pData->angles, pfAngles, iProjAngles * sizeof(pfAngles[0])); - - pData->bFanBeam = false; - - return true; -} - -bool AstraFBP::setFanGeometry(unsigned int iProjAngles, - unsigned int iProjDets, - const astraCUDA::SFanProjection *fanProjs, - const float* pfAngles, - float fOriginSourceDistance, - float fOriginDetectorDistance, - float fDetSize, - bool bShortScan) -{ - // Slightly abusing setProjectionGeometry for this... - if (!setProjectionGeometry(iProjAngles, iProjDets, pfAngles, fDetSize)) - return false; - - pData->fOriginSourceDistance = fOriginSourceDistance; - pData->fOriginDetectorDistance = fOriginDetectorDistance; - - pData->fanProjections = new astraCUDA::SFanProjection[iProjAngles]; - memcpy(pData->fanProjections, fanProjs, iProjAngles * sizeof(fanProjs[0])); - - pData->bFanBeam = true; - pData->bShortScan = bShortScan; - - return true; -} - - -bool AstraFBP::setPixelSuperSampling(unsigned int iPixelSuperSampling) -{ - if (pData->initialized) - return false; - - if (iPixelSuperSampling == 0) - return false; - - pData->dims.iRaysPerPixelDim = iPixelSuperSampling; - - return true; -} - - -bool AstraFBP::setTOffsets(const float* pfTOffsets) -{ - if (pData->initialized) - return false; - - if (pfTOffsets == 0) - return false; - - pData->TOffsets = new float[pData->dims.iProjAngles]; - memcpy(pData->TOffsets, pfTOffsets, pData->dims.iProjAngles * sizeof(pfTOffsets[0])); - - return true; -} - -bool AstraFBP::init(int iGPUIndex) -{ - if (pData->initialized) - { - return false; - } - - if (pData->dims.iProjAngles == 0 || pData->dims.iVolWidth == 0) - { - return false; - } - - if (iGPUIndex != -1) { - cudaSetDevice(iGPUIndex); - cudaError_t err = cudaGetLastError(); - - // Ignore errors caused by calling cudaSetDevice multiple times - if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) - { - return false; - } - } - - bool ok = allocateVolumeData(pData->D_volumeData, pData->volumePitch, pData->dims); - if (!ok) - { - return false; - } - - ok = allocateProjectionData(pData->D_sinoData, pData->sinoPitch, pData->dims); - if (!ok) - { - cudaFree(pData->D_volumeData); - pData->D_volumeData = 0; - return false; - } - - pData->initialized = true; - - return true; -} - -bool AstraFBP::setSinogram(const float* pfSinogram, - unsigned int iSinogramPitch) -{ - if (!pData->initialized) - return false; - if (!pfSinogram) - return false; - - bool ok = copySinogramToDevice(pfSinogram, iSinogramPitch, - pData->dims, - pData->D_sinoData, pData->sinoPitch); - if (!ok) - return false; - - // rescale sinogram to adjust for pixel size - processSino(pData->D_sinoData, - 1.0f/(pData->fPixelSize*pData->fPixelSize), - pData->sinoPitch, pData->dims); - - pData->setStartReconstruction = false; - - return true; -} - -static int calcNextPowerOfTwo(int _iValue) -{ - int iOutput = 1; - - while(iOutput < _iValue) - { - iOutput *= 2; - } - - return iOutput; -} - -bool AstraFBP::run() -{ - if (!pData->initialized) - { - return false; - } - - zeroVolumeData(pData->D_volumeData, pData->volumePitch, pData->dims); - - bool ok = false; - - if (pData->bFanBeam) { - // Call FDK_PreWeight to handle fan beam geometry. We treat - // this as a cone beam setup of a single slice: - - // TODO: TOffsets affects this preweighting... - - // We create a fake cudaPitchedPtr - cudaPitchedPtr tmp; - tmp.ptr = pData->D_sinoData; - tmp.pitch = pData->sinoPitch * sizeof(float); - tmp.xsize = pData->dims.iProjDets; - tmp.ysize = pData->dims.iProjAngles; - // and a fake Dimensions3D - astraCUDA3d::SDimensions3D dims3d; - dims3d.iVolX = pData->dims.iVolWidth; - dims3d.iVolY = pData->dims.iVolHeight; - dims3d.iVolZ = 1; - dims3d.iProjAngles = pData->dims.iProjAngles; - dims3d.iProjU = pData->dims.iProjDets; - dims3d.iProjV = 1; - dims3d.iRaysPerDetDim = dims3d.iRaysPerVoxelDim = 1; - - astraCUDA3d::FDK_PreWeight(tmp, pData->fOriginSourceDistance, - pData->fOriginDetectorDistance, 0.0f, 0.0f, - pData->dims.fDetScale, 1.0f, // TODO: Are these correct? - pData->bShortScan, dims3d, pData->angles); - } - - if (pData->m_pDevFilter) { - - int iFFTRealDetCount = calcNextPowerOfTwo(2 * pData->dims.iProjDets); - int iFFTFourDetCount = calcFFTFourSize(iFFTRealDetCount); - - cufftComplex * pDevComplexSinogram = NULL; - - allocateComplexOnDevice(pData->dims.iProjAngles, iFFTFourDetCount, &pDevComplexSinogram); - - runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram); - - applyFilter(pData->dims.iProjAngles, iFFTFourDetCount, pDevComplexSinogram, pData->m_pDevFilter); - - runCudaIFFT(pData->dims.iProjAngles, pDevComplexSinogram, pData->D_sinoData, pData->sinoPitch, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount); - - freeComplexOnDevice(pDevComplexSinogram); - - } - - float fOutputScale = (M_PI / 2.0f) / (float)pData->dims.iProjAngles; - - if (pData->bFanBeam) { - ok = FanBP_FBPWeighted(pData->D_volumeData, pData->volumePitch, pData->D_sinoData, pData->sinoPitch, pData->dims, pData->fanProjections, fOutputScale); - - } else { - ok = BP(pData->D_volumeData, pData->volumePitch, pData->D_sinoData, pData->sinoPitch, pData->dims, pData->angles, pData->TOffsets, fOutputScale); - } - if(!ok) - { - return false; - } - - return true; -} - -bool AstraFBP::getReconstruction(float* pfReconstruction, unsigned int iReconstructionPitch) const -{ - if (!pData->initialized) - return false; - - bool ok = copyVolumeFromDevice(pfReconstruction, iReconstructionPitch, - pData->dims, - pData->D_volumeData, pData->volumePitch); - if (!ok) - return false; - - return true; -} - -int AstraFBP::calcFourierFilterSize(int _iDetectorCount) -{ - int iFFTRealDetCount = calcNextPowerOfTwo(2 * _iDetectorCount); - int iFreqBinCount = calcFFTFourSize(iFFTRealDetCount); - - // CHECKME: Matlab makes this at least 64. Do we also need to? - return iFreqBinCount; -} - -bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* = NULL */, int _iFilterWidth /* = 0 */, float _fD /* = 1.0f */, float _fFilterParameter /* = -1.0f */) -{ - if(pData->m_pDevFilter != 0) - { - freeComplexOnDevice(pData->m_pDevFilter); - pData->m_pDevFilter = 0; - } - - if (_eFilter == FILTER_NONE) - return true; // leave pData->m_pDevFilter set to 0 - - - int iFFTRealDetCount = calcNextPowerOfTwo(2 * pData->dims.iProjDets); - int iFreqBinCount = calcFFTFourSize(iFFTRealDetCount); - - cufftComplex * pHostFilter = new cufftComplex[pData->dims.iProjAngles * iFreqBinCount]; - memset(pHostFilter, 0, sizeof(cufftComplex) * pData->dims.iProjAngles * iFreqBinCount); - - allocateComplexOnDevice(pData->dims.iProjAngles, iFreqBinCount, &(pData->m_pDevFilter)); - - switch(_eFilter) - { - case FILTER_NONE: - // handled above - break; - case FILTER_RAMLAK: - case FILTER_SHEPPLOGAN: - case FILTER_COSINE: - case FILTER_HAMMING: - case FILTER_HANN: - case FILTER_TUKEY: - case FILTER_LANCZOS: - case FILTER_TRIANGULAR: - case FILTER_GAUSSIAN: - case FILTER_BARTLETTHANN: - case FILTER_BLACKMAN: - case FILTER_NUTTALL: - case FILTER_BLACKMANHARRIS: - case FILTER_BLACKMANNUTTALL: - case FILTER_FLATTOP: - case FILTER_PARZEN: - { - genFilter(_eFilter, _fD, pData->dims.iProjAngles, pHostFilter, iFFTRealDetCount, iFreqBinCount, _fFilterParameter); - uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter); - - break; - } - case FILTER_PROJECTION: - { - // make sure the offered filter has the correct size - assert(_iFilterWidth == iFreqBinCount); - - for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++) - { - float fValue = _pfHostFilter[iFreqBinIndex]; - - for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++) - { - pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue; - pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f; - } - } - uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter); - break; - } - case FILTER_SINOGRAM: - { - // make sure the offered filter has the correct size - assert(_iFilterWidth == iFreqBinCount); - - for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++) - { - for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++) - { - float fValue = _pfHostFilter[iFreqBinIndex + iProjectionIndex * _iFilterWidth]; - - pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue; - pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f; - } - } - uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter); - break; - } - case FILTER_RPROJECTION: - { - int iProjectionCount = pData->dims.iProjAngles; - int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount; - float * pfHostRealFilter = new float[iRealFilterElementCount]; - memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount); - - int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount); - int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2; - int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth; - - int iFilterShiftSize = _iFilterWidth / 2; - - for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++) - { - int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount; - float fValue = _pfHostFilter[iDetectorIndex]; - - for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++) - { - pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue; - } - } - - float* pfDevRealFilter = NULL; - cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors - cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice); - delete[] pfHostRealFilter; - - runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter); - - cudaFree(pfDevRealFilter); - - break; - } - case FILTER_RSINOGRAM: - { - int iProjectionCount = pData->dims.iProjAngles; - int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount; - float* pfHostRealFilter = new float[iRealFilterElementCount]; - memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount); - - int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount); - int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2; - int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth; - - int iFilterShiftSize = _iFilterWidth / 2; - - for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++) - { - int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount; - - for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++) - { - float fValue = _pfHostFilter[iDetectorIndex + iProjectionIndex * _iFilterWidth]; - pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue; - } - } - - float* pfDevRealFilter = NULL; - cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors - cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice); - delete[] pfHostRealFilter; - - runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter); - - cudaFree(pfDevRealFilter); - - break; - } - default: - { - ASTRA_ERROR("AstraFBP::setFilter: Unknown filter type requested"); - delete [] pHostFilter; - return false; - } - } - - delete [] pHostFilter; - - return true; -} - BPalgo::BPalgo() { @@ -617,18 +110,17 @@ float BPalgo::computeDiffNorm() bool astraCudaFP(const float* pfVolume, float* pfSinogram, unsigned int iVolWidth, unsigned int iVolHeight, unsigned int iProjAngles, unsigned int iProjDets, - const float *pfAngles, const float *pfOffsets, - float fDetSize, unsigned int iDetSuperSampling, + const SParProjection *pAngles, + unsigned int iDetSuperSampling, float fOutputScale, int iGPUIndex) { SDimensions dims; - if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0) + if (iProjAngles == 0 || iProjDets == 0 || pAngles == 0) return false; dims.iProjAngles = iProjAngles; dims.iProjDets = iProjDets; - dims.fDetScale = fDetSize; if (iDetSuperSampling == 0) return false; @@ -678,7 +170,7 @@ bool astraCudaFP(const float* pfVolume, float* pfSinogram, } zeroProjectionData(D_sinoData, sinoPitch, dims); - ok = FP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pfAngles, pfOffsets, fOutputScale); + ok = FP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pAngles, fOutputScale); if (!ok) { cudaFree(D_volumeData); cudaFree(D_sinoData); @@ -713,7 +205,6 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, dims.iProjAngles = iProjAngles; dims.iProjDets = iProjDets; - dims.fDetScale = 1.0f; // TODO? if (iDetSuperSampling == 0) return false; @@ -789,118 +280,99 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, } -bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, - const CParallelProjectionGeometry2D* pProjGeom, - float*& detectorOffsets, float*& projectionAngles, - float& detSize, float& outputScale) +// adjust pProjs to normalize volume geometry +template +static bool convertAstraGeometry_internal(const CVolumeGeometry2D* pVolGeom, + unsigned int iProjectionAngleCount, + ProjectionT*& pProjs, + float& fOutputScale) { - assert(pVolGeom); - assert(pProjGeom); - assert(pProjGeom->getProjectionAngles()); - + // TODO: Make EPS relative const float EPS = 0.00001f; - int nth = pProjGeom->getProjectionAngleCount(); - // Check if pixels are square if (abs(pVolGeom->getPixelLengthX() - pVolGeom->getPixelLengthY()) > EPS) return false; + float dx = -(pVolGeom->getWindowMinX() + pVolGeom->getWindowMaxX()) / 2; + float dy = -(pVolGeom->getWindowMinY() + pVolGeom->getWindowMaxY()) / 2; - // Scale volume pixels to 1x1 - detSize = pProjGeom->getDetectorWidth() / pVolGeom->getPixelLengthX(); + float factor = 1.0f / pVolGeom->getPixelLengthX(); - // Copy angles - float *angles = new float[nth]; - for (int i = 0; i < nth; ++i) - angles[i] = pProjGeom->getProjectionAngles()[i]; - projectionAngles = angles; - - // Check if we need to translate - bool offCenter = false; - if (abs(pVolGeom->getWindowMinX() + pVolGeom->getWindowMaxX()) > EPS || - abs(pVolGeom->getWindowMinY() + pVolGeom->getWindowMaxY()) > EPS) - { - offCenter = true; + for (int i = 0; i < iProjectionAngleCount; ++i) { + // CHECKME: Order of scaling and translation + pProjs[i].translate(dx, dy); + pProjs[i].scale(factor); } + // CHECKME: Check factor + fOutputScale *= pVolGeom->getPixelLengthX() * pVolGeom->getPixelLengthY(); - // If there are existing detector offsets, or if we need to translate, - // we need to return offsets - if (offCenter) - { - float* offset = new float[nth]; + return true; +} - for (int i = 0; i < nth; ++i) - offset[i] = 0.0f; - if (offCenter) { - float dx = (pVolGeom->getWindowMinX() + pVolGeom->getWindowMaxX()) / 2; - float dy = (pVolGeom->getWindowMinY() + pVolGeom->getWindowMaxY()) / 2; - // CHECKME: Is d in pixels or in units? +bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, + const CParallelProjectionGeometry2D* pProjGeom, + SParProjection*& pProjs, + float& fOutputScale) +{ + assert(pVolGeom); + assert(pProjGeom); + assert(pProjGeom->getProjectionAngles()); - for (int i = 0; i < nth; ++i) { - float d = dx * cos(angles[i]) + dy * sin(angles[i]); - offset[i] += d; - } - } + int nth = pProjGeom->getProjectionAngleCount(); - // CHECKME: Order of scaling and translation + pProjs = genParProjections(nth, + pProjGeom->getDetectorCount(), + pProjGeom->getDetectorWidth(), + pProjGeom->getProjectionAngles(), 0); - // Scale volume pixels to 1x1 - for (int i = 0; i < nth; ++i) { - //offset[i] /= pVolGeom->getPixelLengthX(); - //offset[i] *= detSize; - } + bool ok; + fOutputScale = 1.0f; + ok = convertAstraGeometry_internal(pVolGeom, nth, pProjs, fOutputScale); - detectorOffsets = offset; - } else { - detectorOffsets = 0; + if (!ok) { + delete[] pProjs; + pProjs = 0; } - outputScale = pVolGeom->getPixelLengthX(); - outputScale *= outputScale; - - return true; + return ok; } -static void convertAstraGeometry_internal(const CVolumeGeometry2D* pVolGeom, - unsigned int iProjectionAngleCount, - astraCUDA::SFanProjection*& pProjs, - float& outputScale) +bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, + const CParallelVecProjectionGeometry2D* pProjGeom, + SParProjection*& pProjs, + float& fOutputScale) { - // Translate - float dx = (pVolGeom->getWindowMinX() + pVolGeom->getWindowMaxX()) / 2; - float dy = (pVolGeom->getWindowMinY() + pVolGeom->getWindowMaxY()) / 2; + assert(pVolGeom); + assert(pProjGeom); + assert(pProjGeom->getProjectionVectors()); - for (int i = 0; i < iProjectionAngleCount; ++i) { - pProjs[i].fSrcX -= dx; - pProjs[i].fSrcY -= dy; - pProjs[i].fDetSX -= dx; - pProjs[i].fDetSY -= dy; + int nth = pProjGeom->getProjectionAngleCount(); + + pProjs = new SParProjection[nth]; + + for (int i = 0; i < nth; ++i) { + pProjs[i] = pProjGeom->getProjectionVectors()[i]; } - // CHECKME: Order of scaling and translation + bool ok; + fOutputScale = 1.0f; - // Scale - float factor = 1.0f / pVolGeom->getPixelLengthX(); - for (int i = 0; i < iProjectionAngleCount; ++i) { - pProjs[i].fSrcX *= factor; - pProjs[i].fSrcY *= factor; - pProjs[i].fDetSX *= factor; - pProjs[i].fDetSY *= factor; - pProjs[i].fDetUX *= factor; - pProjs[i].fDetUY *= factor; + ok = convertAstraGeometry_internal(pVolGeom, nth, pProjs, fOutputScale); + if (!ok) { + delete[] pProjs; + pProjs = 0; } - // CHECKME: Check factor - outputScale = pVolGeom->getPixelLengthX(); -// outputScale *= outputScale; + return ok; } + bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, const CFanFlatProjectionGeometry2D* pProjGeom, astraCUDA::SFanProjection*& pProjs, @@ -910,6 +382,7 @@ bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, assert(pProjGeom); assert(pProjGeom->getProjectionAngles()); + // TODO: Make EPS relative const float EPS = 0.00001f; int nth = pProjGeom->getProjectionAngleCount(); @@ -928,23 +401,9 @@ bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, float fDetSize = pProjGeom->getDetectorWidth(); const float *pfAngles = pProjGeom->getProjectionAngles(); - pProjs = new SFanProjection[nth]; - - float fSrcX0 = 0.0f; - float fSrcY0 = -fOriginSourceDistance; - float fDetUX0 = fDetSize; - float fDetUY0 = 0.0f; - float fDetSX0 = pProjGeom->getDetectorCount() * fDetUX0 / -2.0f; - float fDetSY0 = fOriginDetectorDistance; - -#define ROTATE0(name,i,alpha) do { pProjs[i].f##name##X = f##name##X0 * cos(alpha) - f##name##Y0 * sin(alpha); pProjs[i].f##name##Y = f##name##X0 * sin(alpha) + f##name##Y0 * cos(alpha); } while(0) - for (int i = 0; i < nth; ++i) { - ROTATE0(Src, i, pfAngles[i]); - ROTATE0(DetS, i, pfAngles[i]); - ROTATE0(DetU, i, pfAngles[i]); - } - -#undef ROTATE0 + pProjs = genFanProjections(nth, pProjGeom->getDetectorCount(), + fOriginSourceDistance, fOriginDetectorDistance, + fDetSize, pfAngles); convertAstraGeometry_internal(pVolGeom, nth, pProjs, outputScale); @@ -961,6 +420,7 @@ bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, assert(pProjGeom); assert(pProjGeom->getProjectionVectors()); + // TODO: Make EPS relative const float EPS = 0.00001f; int nx = pVolGeom->getGridColCount(); @@ -983,6 +443,52 @@ bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, } +bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, + const CProjectionGeometry2D* pProjGeom, + astraCUDA::SParProjection*& pParProjs, + astraCUDA::SFanProjection*& pFanProjs, + float& outputScale) +{ + const CParallelProjectionGeometry2D* parProjGeom = dynamic_cast(pProjGeom); + const CParallelVecProjectionGeometry2D* parVecProjGeom = dynamic_cast(pProjGeom); + const CFanFlatProjectionGeometry2D* fanProjGeom = dynamic_cast(pProjGeom); + const CFanFlatVecProjectionGeometry2D* fanVecProjGeom = dynamic_cast(pProjGeom); + + bool ok = false; + + if (parProjGeom) { + ok = convertAstraGeometry(pVolGeom, parProjGeom, pParProjs, outputScale); + } else if (parVecProjGeom) { + ok = convertAstraGeometry(pVolGeom, parVecProjGeom, pParProjs, outputScale); + } else if (fanProjGeom) { + ok = convertAstraGeometry(pVolGeom, fanProjGeom, pFanProjs, outputScale); + } else if (fanVecProjGeom) { + ok = convertAstraGeometry(pVolGeom, fanVecProjGeom, pFanProjs, outputScale); + } else { + ok = false; + } + + return ok; +} + +bool convertAstraGeometry_dims(const CVolumeGeometry2D* pVolGeom, + const CProjectionGeometry2D* pProjGeom, + SDimensions& dims) +{ + dims.iVolWidth = pVolGeom->getGridColCount(); + dims.iVolHeight = pVolGeom->getGridRowCount(); + + dims.iProjAngles = pProjGeom->getProjectionAngleCount(); + dims.iProjDets = pProjGeom->getDetectorCount(); + + dims.iRaysPerDet = 1; + dims.iRaysPerPixelDim = 1; + + return true; +} + + + } diff --git a/cuda/2d/astra.h b/cuda/2d/astra.h index 617883b..8e91feb 100644 --- a/cuda/2d/astra.h +++ b/cuda/2d/astra.h @@ -29,7 +29,6 @@ $Id$ #ifndef _CUDA_ASTRA_H #define _CUDA_ASTRA_H -#include "fft.h" #include "fbp_filters.h" #include "dims.h" #include "algo.h" @@ -43,140 +42,12 @@ enum Cuda2DProjectionKernel { }; class CParallelProjectionGeometry2D; +class CParallelVecProjectionGeometry2D; class CFanFlatProjectionGeometry2D; class CFanFlatVecProjectionGeometry2D; class CVolumeGeometry2D; +class CProjectionGeometry2D; -class AstraFBP_internal; - -class _AstraExport AstraFBP { -public: - // Constructor - AstraFBP(); - - // Destructor - ~AstraFBP(); - - // Set the size of the reconstruction rectangle. - // Volume pixels are currently assumed to be 1x1 squares. - bool setReconstructionGeometry(unsigned int iVolWidth, - unsigned int iVolHeight, - float fPixelSize = 1.0f); - - // Set the projection angles and number of detector pixels per angle. - // pfAngles must be a float array of length iProjAngles. - // fDetSize indicates the size of a detector pixel compared to a - // volume pixel edge. - // - // pfAngles will only be read from during this call. - bool setProjectionGeometry(unsigned int iProjAngles, - unsigned int iProjDets, - const float *pfAngles, - float fDetSize = 1.0f); - // Set the projection angles and number of detector pixels per angle. - // pfAngles must be a float array of length iProjAngles. - // fDetSize indicates the size of a detector pixel compared to a - // volume pixel edge. - // - // pfAngles, fanProjs will only be read from during this call. - bool setFanGeometry(unsigned int iProjAngles, - unsigned int iProjDets, - const astraCUDA::SFanProjection *fanProjs, - const float *pfAngles, - float fOriginSourceDistance, - float fOriginDetectorDistance, - float fDetSize = 1.0f, - bool bShortScan = false); - - // Set linear supersampling factor for the BP. - // (The number of rays is the square of this) - // - // This may optionally be called before init(). - bool setPixelSuperSampling(unsigned int iPixelSuperSampling); - - // Set per-detector shifts. - // - // pfTOffsets will only be read from during this call. - bool setTOffsets(const float *pfTOffsets); - - // Returns the required size of a filter in the fourier domain - // when multiplying it with the fft of the projection data. - // Its value is equal to the smallest power of two larger than - // or equal to twice the number of detectors in the spatial domain. - // - // _iDetectorCount is the number of detectors in the spatial domain. - static int calcFourierFilterSize(int _iDetectorCount); - - // Sets the filter type. Some filter types require the user to supply an - // array containing the filter. - // The number of elements in a filter in the fourier domain should be equal - // to the value returned by calcFourierFilterSize(). - // The following types require a filter: - // - // - FILTER_PROJECTION: - // The filter size should be equal to the output of - // calcFourierFilterSize(). The filtered sinogram is - // multiplied with the supplied filter. - // - // - FILTER_SINOGRAM: - // Same as FILTER_PROJECTION, but now the filter should contain a row for - // every projection direction. - // - // - FILTER_RPROJECTION: - // The filter should now contain one kernel (= ifft of filter), with the - // peak in the center. The filter width - // can be any value. If odd, the peak is assumed to be in the center, if - // even, it is assumed to be at floor(filter-width/2). - // - // - FILTER_RSINOGRAM - // Same as FILTER_RPROJECTION, but now the supplied filter should contain a - // row for every projection direction. - // - // A large number of other filters (FILTER_RAMLAK, FILTER_SHEPPLOGAN, - // FILTER_COSINE, FILTER_HAMMING, and FILTER_HANN) - // have a D variable, which gives the cutoff point in the frequency domain. - // Setting this value to 1.0 will include the whole filter - bool setFilter(E_FBPFILTER _eFilter, - const float * _pfHostFilter = NULL, - int _iFilterWidth = 0, float _fD = 1.0f, float _fFilterParameter = -1.0f); - - // Initialize CUDA, allocate GPU buffers and - // precompute geometry-specific data. - // - // CUDA is set up to use GPU number iGPUIndex. - // - // This must be called after calling setReconstructionGeometry() and - // setProjectionGeometry(). - bool init(int iGPUIndex = 0); - - // Setup input sinogram for a slice. - // pfSinogram must be a float array of size iProjAngles*iSinogramPitch. - // NB: iSinogramPitch is measured in floats, not in bytes. - // - // This must be called after init(), and before iterate(). It may be - // called again after iterate()/getReconstruction() to start a new slice. - // - // pfSinogram will only be read from during this call. - bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch); - - // Runs an FBP reconstruction. - // This must be called after setSinogram(). - // - // run can be called before setFilter, but will then use the default Ram-Lak filter - bool run(); - - // Get the reconstructed slice. - // pfReconstruction must be a float array of size - // iVolHeight*iReconstructionPitch. - // NB: iReconstructionPitch is measured in floats, not in bytes. - // - // This may be called after run(). - bool getReconstruction(float* pfReconstruction, - unsigned int iReconstructionPitch) const; - -private: - AstraFBP_internal* pData; -}; class _AstraExport BPalgo : public astraCUDA::ReconAlgo { public: @@ -199,8 +70,8 @@ public: _AstraExport bool astraCudaFP(const float* pfVolume, float* pfSinogram, unsigned int iVolWidth, unsigned int iVolHeight, unsigned int iProjAngles, unsigned int iProjDets, - const float *pfAngles, const float *pfOffsets, - float fDetSize = 1.0f, unsigned int iDetSuperSampling = 1, + const SParProjection *pAngles, + unsigned int iDetSuperSampling = 1, float fOutputScale = 1.0f, int iGPUIndex = 0); _AstraExport bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, @@ -213,8 +84,14 @@ _AstraExport bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, _AstraExport bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, const CParallelProjectionGeometry2D* pProjGeom, - float*& pfDetectorOffsets, float*& pfProjectionAngles, - float& fDetSize, float& fOutputScale); + astraCUDA::SParProjection*& pProjs, + float& fOutputScale); + +_AstraExport bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, + const CParallelVecProjectionGeometry2D* pProjGeom, + astraCUDA::SParProjection*& pProjs, + float& fOutputScale); + _AstraExport bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, const CFanFlatProjectionGeometry2D* pProjGeom, @@ -226,6 +103,15 @@ _AstraExport bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, astraCUDA::SFanProjection*& pProjs, float& outputScale); +_AstraExport bool convertAstraGeometry_dims(const CVolumeGeometry2D* pVolGeom, + const CProjectionGeometry2D* pProjGeom, + astraCUDA::SDimensions& dims); + +_AstraExport bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom, + const CProjectionGeometry2D* pProjGeom, + astraCUDA::SParProjection*& pParProjs, + astraCUDA::SFanProjection*& pFanProjs, + float& outputScale); } #endif diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu index f402914..3f06581 100644 --- a/cuda/2d/cgls.cu +++ b/cuda/2d/cgls.cu @@ -207,42 +207,6 @@ float CGLS::computeDiffNorm() return sqrt(s); } -bool doCGLS(float* D_volumeData, unsigned int volumePitch, - float* D_sinoData, unsigned int sinoPitch, - const SDimensions& dims, /*const SAugmentedData& augs,*/ - const float* angles, const float* TOffsets, unsigned int iterations) -{ - CGLS cgls; - bool ok = true; - - ok &= cgls.setGeometry(dims, angles); -#if 0 - if (D_maskData) - ok &= cgls.enableVolumeMask(); -#endif - if (TOffsets) - ok &= cgls.setTOffsets(TOffsets); - - if (!ok) - return false; - - ok = cgls.init(); - if (!ok) - return false; - -#if 0 - if (D_maskData) - ok &= cgls.setVolumeMask(D_maskData, maskPitch); -#endif - - ok &= cgls.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); - if (!ok) - return false; - - ok = cgls.iterate(iterations); - - return ok; -} } diff --git a/cuda/2d/dims.h b/cuda/2d/dims.h index e870da5..f74d0e4 100644 --- a/cuda/2d/dims.h +++ b/cuda/2d/dims.h @@ -34,9 +34,11 @@ $Id$ namespace astraCUDA { +using astra::SParProjection; using astra::SFanProjection; + struct SDimensions { // Width, height of reconstruction volume unsigned int iVolWidth; @@ -48,9 +50,6 @@ struct SDimensions { // Number of detector pixels unsigned int iProjDets; - // size of detector compared to volume pixels - float fDetScale; - // in FP, number of rays to trace per detector pixel. // This should usually be set to 1. // If fDetScale > 1, this should be set to an integer of roughly diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu index 8593b08..47ec548 100644 --- a/cuda/2d/em.cu +++ b/cuda/2d/em.cu @@ -170,34 +170,6 @@ float EM::computeDiffNorm() } -bool doEM(float* D_volumeData, unsigned int volumePitch, - float* D_sinoData, unsigned int sinoPitch, - const SDimensions& dims, const float* angles, - const float* TOffsets, unsigned int iterations) -{ - EM em; - bool ok = true; - - ok &= em.setGeometry(dims, angles); - if (TOffsets) - ok &= em.setTOffsets(TOffsets); - - if (!ok) - return false; - - ok = em.init(); - if (!ok) - return false; - - ok &= em.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); - if (!ok) - return false; - - ok = em.iterate(iterations); - - return ok; -} - } #ifdef STANDALONE diff --git a/cuda/2d/fbp.cu b/cuda/2d/fbp.cu new file mode 100644 index 0000000..2feac7d --- /dev/null +++ b/cuda/2d/fbp.cu @@ -0,0 +1,347 @@ +/* +----------------------------------------------------------------------- +Copyright: 2010-2015, iMinds-Vision Lab, University of Antwerp + 2014-2015, CWI, Amsterdam + +Contact: astra@uantwerpen.be +Website: http://sf.net/projects/astra-toolbox + +This file is part of the ASTRA Toolbox. + + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see . + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "fbp.h" +#include "fft.h" +#include "par_bp.h" +#include "fan_bp.h" + +// For fan-beam preweighting +#include "../3d/fdk.h" + +#include "astra/Logging.h" + +#include + +namespace astraCUDA { + + + +static int calcNextPowerOfTwo(int n) +{ + int x = 1; + while (x < n) + x *= 2; + + return x; +} + +// static +int FBP::calcFourierFilterSize(int _iDetectorCount) +{ + int iFFTRealDetCount = calcNextPowerOfTwo(2 * _iDetectorCount); + int iFreqBinCount = calcFFTFourierSize(iFFTRealDetCount); + + // CHECKME: Matlab makes this at least 64. Do we also need to? + return iFreqBinCount; +} + + + + +FBP::FBP() : ReconAlgo() +{ + D_filter = 0; + +} + +FBP::~FBP() +{ + reset(); +} + +void FBP::reset() +{ + if (D_filter) { + freeComplexOnDevice((cufftComplex *)D_filter); + D_filter = 0; + } +} + +bool FBP::init() +{ + return true; +} + +bool FBP::setFilter(astra::E_FBPFILTER _eFilter, const float * _pfHostFilter /* = NULL */, int _iFilterWidth /* = 0 */, float _fD /* = 1.0f */, float _fFilterParameter /* = -1.0f */) +{ + if (D_filter) + { + freeComplexOnDevice((cufftComplex*)D_filter); + D_filter = 0; + } + + if (_eFilter == astra::FILTER_NONE) + return true; // leave D_filter set to 0 + + + int iFFTRealDetCount = calcNextPowerOfTwo(2 * dims.iProjDets); + int iFreqBinCount = calcFFTFourierSize(iFFTRealDetCount); + + cufftComplex * pHostFilter = new cufftComplex[dims.iProjAngles * iFreqBinCount]; + memset(pHostFilter, 0, sizeof(cufftComplex) * dims.iProjAngles * iFreqBinCount); + + allocateComplexOnDevice(dims.iProjAngles, iFreqBinCount, (cufftComplex**)&D_filter); + + switch(_eFilter) + { + case astra::FILTER_NONE: + // handled above + break; + case astra::FILTER_RAMLAK: + case astra::FILTER_SHEPPLOGAN: + case astra::FILTER_COSINE: + case astra::FILTER_HAMMING: + case astra::FILTER_HANN: + case astra::FILTER_TUKEY: + case astra::FILTER_LANCZOS: + case astra::FILTER_TRIANGULAR: + case astra::FILTER_GAUSSIAN: + case astra::FILTER_BARTLETTHANN: + case astra::FILTER_BLACKMAN: + case astra::FILTER_NUTTALL: + case astra::FILTER_BLACKMANHARRIS: + case astra::FILTER_BLACKMANNUTTALL: + case astra::FILTER_FLATTOP: + case astra::FILTER_PARZEN: + { + genFilter(_eFilter, _fD, dims.iProjAngles, pHostFilter, iFFTRealDetCount, iFreqBinCount, _fFilterParameter); + uploadComplexArrayToDevice(dims.iProjAngles, iFreqBinCount, pHostFilter, (cufftComplex*)D_filter); + + break; + } + case astra::FILTER_PROJECTION: + { + // make sure the offered filter has the correct size + assert(_iFilterWidth == iFreqBinCount); + + for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++) + { + float fValue = _pfHostFilter[iFreqBinIndex]; + + for(int iProjectionIndex = 0; iProjectionIndex < (int)dims.iProjAngles; iProjectionIndex++) + { + pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue; + pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f; + } + } + uploadComplexArrayToDevice(dims.iProjAngles, iFreqBinCount, pHostFilter, (cufftComplex*)D_filter); + break; + } + case astra::FILTER_SINOGRAM: + { + // make sure the offered filter has the correct size + assert(_iFilterWidth == iFreqBinCount); + + for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++) + { + for(int iProjectionIndex = 0; iProjectionIndex < (int)dims.iProjAngles; iProjectionIndex++) + { + float fValue = _pfHostFilter[iFreqBinIndex + iProjectionIndex * _iFilterWidth]; + + pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue; + pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f; + } + } + uploadComplexArrayToDevice(dims.iProjAngles, iFreqBinCount, pHostFilter, (cufftComplex*)D_filter); + break; + } + case astra::FILTER_RPROJECTION: + { + int iProjectionCount = dims.iProjAngles; + int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount; + float * pfHostRealFilter = new float[iRealFilterElementCount]; + memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount); + + int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount); + int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2; + int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth; + + int iFilterShiftSize = _iFilterWidth / 2; + + for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++) + { + int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount; + float fValue = _pfHostFilter[iDetectorIndex]; + + for(int iProjectionIndex = 0; iProjectionIndex < (int)dims.iProjAngles; iProjectionIndex++) + { + pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue; + } + } + + float* pfDevRealFilter = NULL; + cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors + cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice); + delete[] pfHostRealFilter; + + runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, (cufftComplex*)D_filter); + + cudaFree(pfDevRealFilter); + + break; + } + case astra::FILTER_RSINOGRAM: + { + int iProjectionCount = dims.iProjAngles; + int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount; + float* pfHostRealFilter = new float[iRealFilterElementCount]; + memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount); + + int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount); + int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2; + int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth; + + int iFilterShiftSize = _iFilterWidth / 2; + + for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++) + { + int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount; + + for(int iProjectionIndex = 0; iProjectionIndex < (int)dims.iProjAngles; iProjectionIndex++) + { + float fValue = _pfHostFilter[iDetectorIndex + iProjectionIndex * _iFilterWidth]; + pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue; + } + } + + float* pfDevRealFilter = NULL; + cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors + cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice); + delete[] pfHostRealFilter; + + runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, (cufftComplex*)D_filter); + + cudaFree(pfDevRealFilter); + + break; + } + default: + { + ASTRA_ERROR("FBP::setFilter: Unknown filter type requested"); + delete [] pHostFilter; + return false; + } + } + + delete [] pHostFilter; + + return true; +} + +bool FBP::iterate(unsigned int iterations) +{ + zeroVolumeData(D_volumeData, volumePitch, dims); + + bool ok = false; + + if (fanProjs) { + // Call FDK_PreWeight to handle fan beam geometry. We treat + // this as a cone beam setup of a single slice: + + // TODO: TOffsets affects this preweighting... + + // TODO: We take the fan parameters from the last projection here + // without checking if they're the same in all projections + + float *pfAngles = new float[dims.iProjAngles]; + + float fOriginSource, fOriginDetector, fDetSize, fOffset; + for (unsigned int i = 0; i < dims.iProjAngles; ++i) { + bool ok = astra::getFanParameters(fanProjs[i], dims.iProjDets, + pfAngles[i], + fOriginSource, fOriginDetector, + fDetSize, fOffset); + if (!ok) { + ASTRA_ERROR("FBP_CUDA: Failed to extract circular fan beam parameters from fan beam geometry"); + return false; + } + } + + // We create a fake cudaPitchedPtr + cudaPitchedPtr tmp; + tmp.ptr = D_sinoData; + tmp.pitch = sinoPitch * sizeof(float); + tmp.xsize = dims.iProjDets; + tmp.ysize = dims.iProjAngles; + // and a fake Dimensions3D + astraCUDA3d::SDimensions3D dims3d; + dims3d.iVolX = dims.iVolWidth; + dims3d.iVolY = dims.iVolHeight; + dims3d.iVolZ = 1; + dims3d.iProjAngles = dims.iProjAngles; + dims3d.iProjU = dims.iProjDets; + dims3d.iProjV = 1; + dims3d.iRaysPerDetDim = dims3d.iRaysPerVoxelDim = 1; + + astraCUDA3d::FDK_PreWeight(tmp, fOriginSource, + fOriginDetector, 0.0f, 0.0f, + fDetSize, 1.0f, + m_bShortScan, dims3d, pfAngles); + } else { + // TODO: How should different detector pixel size in different + // projections be handled? + } + + if (D_filter) { + + int iFFTRealDetCount = calcNextPowerOfTwo(2 * dims.iProjDets); + int iFFTFourDetCount = calcFFTFourierSize(iFFTRealDetCount); + + cufftComplex * pDevComplexSinogram = NULL; + + allocateComplexOnDevice(dims.iProjAngles, iFFTFourDetCount, &pDevComplexSinogram); + + runCudaFFT(dims.iProjAngles, D_sinoData, sinoPitch, dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram); + + applyFilter(dims.iProjAngles, iFFTFourDetCount, pDevComplexSinogram, (cufftComplex*)D_filter); + + runCudaIFFT(dims.iProjAngles, pDevComplexSinogram, D_sinoData, sinoPitch, dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount); + + freeComplexOnDevice(pDevComplexSinogram); + + } + + float fOutputScale = (M_PI / 2.0f) / (float)dims.iProjAngles; + + if (fanProjs) { + ok = FanBP_FBPWeighted(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, fanProjs, fOutputScale); + + } else { + ok = BP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, parProjs, fOutputScale); + } + if(!ok) + { + return false; + } + + return true; +} + + +} diff --git a/cuda/2d/fbp.h b/cuda/2d/fbp.h new file mode 100644 index 0000000..8b4d64d --- /dev/null +++ b/cuda/2d/fbp.h @@ -0,0 +1,98 @@ +/* +----------------------------------------------------------------------- +Copyright: 2010-2015, iMinds-Vision Lab, University of Antwerp + 2014-2015, CWI, Amsterdam + +Contact: astra@uantwerpen.be +Website: http://sf.net/projects/astra-toolbox + +This file is part of the ASTRA Toolbox. + + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see . + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "algo.h" +#include "fbp_filters.h" + +namespace astraCUDA { + +class _AstraExport FBP : public ReconAlgo { +public: + FBP(); + ~FBP(); + + virtual bool useSinogramMask() { return false; } + virtual bool useVolumeMask() { return false; } + + // Returns the required size of a filter in the fourier domain + // when multiplying it with the fft of the projection data. + // Its value is equal to the smallest power of two larger than + // or equal to twice the number of detectors in the spatial domain. + // + // _iDetectorCount is the number of detectors in the spatial domain. + static int calcFourierFilterSize(int _iDetectorCount); + + // Sets the filter type. Some filter types require the user to supply an + // array containing the filter. + // The number of elements in a filter in the fourier domain should be equal + // to the value returned by calcFourierFilterSize(). + // The following types require a filter: + // + // - FILTER_PROJECTION: + // The filter size should be equal to the output of + // calcFourierFilterSize(). The filtered sinogram is + // multiplied with the supplied filter. + // + // - FILTER_SINOGRAM: + // Same as FILTER_PROJECTION, but now the filter should contain a row for + // every projection direction. + // + // - FILTER_RPROJECTION: + // The filter should now contain one kernel (= ifft of filter), with the + // peak in the center. The filter width + // can be any value. If odd, the peak is assumed to be in the center, if + // even, it is assumed to be at floor(filter-width/2). + // + // - FILTER_RSINOGRAM + // Same as FILTER_RPROJECTION, but now the supplied filter should contain a + // row for every projection direction. + // + // A large number of other filters (FILTER_RAMLAK, FILTER_SHEPPLOGAN, + // FILTER_COSINE, FILTER_HAMMING, and FILTER_HANN) + // have a D variable, which gives the cutoff point in the frequency domain. + // Setting this value to 1.0 will include the whole filter + bool setFilter(astra::E_FBPFILTER _eFilter, + const float * _pfHostFilter = NULL, + int _iFilterWidth = 0, float _fD = 1.0f, float _fFilterParameter = -1.0f); + + bool setShortScan(bool ss) { m_bShortScan = ss; return true; } + + virtual bool init(); + + virtual bool iterate(unsigned int iterations); + + virtual float computeDiffNorm() { return 0.0f; } // TODO + +protected: + void reset(); + + void* D_filter; // cufftComplex* + bool m_bShortScan; +}; + +} diff --git a/cuda/2d/fbp_filters.h b/cuda/2d/fbp_filters.h index b206a5d..71c7572 100644 --- a/cuda/2d/fbp_filters.h +++ b/cuda/2d/fbp_filters.h @@ -29,6 +29,8 @@ $Id$ #ifndef FBP_FILTERS_H #define FBP_FILTERS_H +namespace astra { + enum E_FBPFILTER { FILTER_NONE, //< no filter (regular BP) @@ -55,4 +57,6 @@ enum E_FBPFILTER FILTER_RSINOGRAM, //< sinogram filter in real space }; +} + #endif /* FBP_FILTERS_H */ diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu index 2d259a9..8a7cc10 100644 --- a/cuda/2d/fft.cu +++ b/cuda/2d/fft.cu @@ -64,6 +64,8 @@ using namespace astra; } } while (0) +namespace astraCUDA { + __global__ static void applyFilter_kernel(int _iProjectionCount, int _iFreqBinCount, cufftComplex * _pSinogram, @@ -276,11 +278,11 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex, // Because the input is real, the Fourier transform is symmetric. // CUFFT only outputs the first half (ignoring the redundant second half), // and expects the same as input for the IFFT. -int calcFFTFourSize(int _iFFTRealSize) +int calcFFTFourierSize(int _iFFTRealSize) { - int iFFTFourSize = _iFFTRealSize / 2 + 1; + int iFFTFourierSize = _iFFTRealSize / 2 + 1; - return iFFTFourSize; + return iFFTFourierSize; } void genIdenFilter(int _iProjectionCount, cufftComplex * _pFilter, @@ -695,6 +697,10 @@ void genFilter(E_FBPFILTER _eFilter, float _fD, int _iProjectionCount, delete[] pfW; } + +} + + #ifdef STANDALONE __global__ static void doubleFourierOutput_kernel(int _iProjectionCount, diff --git a/cuda/2d/fft.h b/cuda/2d/fft.h index e985fc6..b29f17a 100644 --- a/cuda/2d/fft.h +++ b/cuda/2d/fft.h @@ -34,6 +34,8 @@ $Id$ #include "fbp_filters.h" +namespace astraCUDA { + bool allocateComplexOnDevice(int _iProjectionCount, int _iDetectorCount, cufftComplex ** _ppDevComplex); @@ -57,13 +59,15 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex, void applyFilter(int _iProjectionCount, int _iFreqBinCount, cufftComplex * _pSinogram, cufftComplex * _pFilter); -int calcFFTFourSize(int _iFFTRealSize); +int calcFFTFourierSize(int _iFFTRealSize); -void genFilter(E_FBPFILTER _eFilter, float _fD, int _iProjectionCount, +void genFilter(astra::E_FBPFILTER _eFilter, float _fD, int _iProjectionCount, cufftComplex * _pFilter, int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount, float _fParameter = -1.0f); void genIdenFilter(int _iProjectionCount, cufftComplex * _pFilter, int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount); +} + #endif /* FFT_H */ diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu index d9f7325..cf0a684 100644 --- a/cuda/2d/par_bp.cu +++ b/cuda/2d/par_bp.cu @@ -53,8 +53,8 @@ const unsigned int g_blockSlices = 16; const unsigned int g_MaxAngles = 2560; -__constant__ float gC_angle_sin[g_MaxAngles]; -__constant__ float gC_angle_cos[g_MaxAngles]; +__constant__ float gC_angle_scaled_sin[g_MaxAngles]; +__constant__ float gC_angle_scaled_cos[g_MaxAngles]; __constant__ float gC_angle_offset[g_MaxAngles]; static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder) @@ -73,7 +73,7 @@ static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int wi return true; } -__global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims, float fOutputScale) +__global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims, float fOutputScale) { const int relX = threadIdx.x; const int relY = threadIdx.y; @@ -87,47 +87,30 @@ __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int star if (X >= dims.iVolWidth || Y >= dims.iVolHeight) return; - const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ) / dims.fDetScale; - const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ) / dims.fDetScale; + const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ); + const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ); float* volData = (float*)D_volData; float fVal = 0.0f; float fA = startAngle + 0.5f; - const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f; - if (offsets) { - - for (int angle = startAngle; angle < endAngle; ++angle) - { - const float cos_theta = gC_angle_cos[angle]; - const float sin_theta = gC_angle_sin[angle]; - const float TOffset = gC_angle_offset[angle]; - - const float fT = fT_base + fX * cos_theta - fY * sin_theta + TOffset; - fVal += tex2D(gT_projTexture, fT, fA); - fA += 1.0f; - } - - } else { - - for (int angle = startAngle; angle < endAngle; ++angle) - { - const float cos_theta = gC_angle_cos[angle]; - const float sin_theta = gC_angle_sin[angle]; - - const float fT = fT_base + fX * cos_theta - fY * sin_theta; - fVal += tex2D(gT_projTexture, fT, fA); - fA += 1.0f; - } + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float scaled_cos_theta = gC_angle_scaled_cos[angle]; + const float scaled_sin_theta = gC_angle_scaled_sin[angle]; + const float TOffset = gC_angle_offset[angle]; + const float fT = fX * scaled_cos_theta - fY * scaled_sin_theta + TOffset; + fVal += tex2D(gT_projTexture, fT, fA); + fA += 1.0f; } volData[Y*volPitch+X] += fVal * fOutputScale; } // supersampling version -__global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims, float fOutputScale) +__global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims, float fOutputScale) { const int relX = threadIdx.x; const int relY = threadIdx.y; @@ -141,61 +124,35 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s if (X >= dims.iVolWidth || Y >= dims.iVolHeight) return; - const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim) / dims.fDetScale; - const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim) / dims.fDetScale; + const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim); + const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim); - const float fSubStep = 1.0f/(dims.iRaysPerPixelDim * dims.fDetScale); + const float fSubStep = 1.0f/(dims.iRaysPerPixelDim); // * dims.fDetScale); float* volData = (float*)D_volData; float fVal = 0.0f; float fA = startAngle + 0.5f; - const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f; fOutputScale /= (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim); - if (offsets) { - - for (int angle = startAngle; angle < endAngle; ++angle) - { - const float cos_theta = gC_angle_cos[angle]; - const float sin_theta = gC_angle_sin[angle]; - const float TOffset = gC_angle_offset[angle]; - - float fT = fT_base + fX * cos_theta - fY * sin_theta + TOffset; - - for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) { - float fTy = fT; - fT += fSubStep * cos_theta; - for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) { - fVal += tex2D(gT_projTexture, fTy, fA); - fTy -= fSubStep * sin_theta; - } - } - fA += 1.0f; - } - - } else { + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float cos_theta = gC_angle_scaled_cos[angle]; + const float sin_theta = gC_angle_scaled_sin[angle]; + const float TOffset = gC_angle_offset[angle]; - for (int angle = startAngle; angle < endAngle; ++angle) - { - const float cos_theta = gC_angle_cos[angle]; - const float sin_theta = gC_angle_sin[angle]; + float fT = fX * cos_theta - fY * sin_theta + TOffset; - float fT = fT_base + fX * cos_theta - fY * sin_theta; - - for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) { - float fTy = fT; - fT += fSubStep * cos_theta; - for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) { - fVal += tex2D(gT_projTexture, fTy, fA); - fTy -= fSubStep * sin_theta; - } + for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) { + float fTy = fT; + fT += fSubStep * cos_theta; + for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) { + fVal += tex2D(gT_projTexture, fTy, fA); + fTy -= fSubStep * sin_theta; } - fA += 1.0f; - } - + fA += 1.0f; } volData[Y*volPitch+X] += fVal * fOutputScale; @@ -212,12 +169,10 @@ __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset if (X >= dims.iVolWidth || Y >= dims.iVolHeight) return; - const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ) / dims.fDetScale; - const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ) / dims.fDetScale; - - const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f; + const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ); + const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ); - const float fT = fT_base + fX * angle_cos - fY * angle_sin + offset; + const float fT = fX * angle_cos - fY * angle_sin + offset; const float fVal = tex2D(gT_projTexture, fT, 0.5f); D_volData[Y*volPitch+X] += fVal * fOutputScale; @@ -226,32 +181,35 @@ __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset bool BP_internal(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, - const SDimensions& dims, const float* angles, const float* TOffsets, float fOutputScale) + const SDimensions& dims, const SParProjection* angles, + float fOutputScale) { - // TODO: process angles block by block assert(dims.iProjAngles <= g_MaxAngles); - float* angle_sin = new float[dims.iProjAngles]; - float* angle_cos = new float[dims.iProjAngles]; + float* angle_scaled_sin = new float[dims.iProjAngles]; + float* angle_scaled_cos = new float[dims.iProjAngles]; + float* angle_offset = new float[dims.iProjAngles]; bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); for (unsigned int i = 0; i < dims.iProjAngles; ++i) { - angle_sin[i] = sinf(angles[i]); - angle_cos[i] = cosf(angles[i]); + double d = angles[i].fDetUX * angles[i].fRayY - angles[i].fDetUY * angles[i].fRayX; + angle_scaled_cos[i] = angles[i].fRayY / d; + angle_scaled_sin[i] = -angles[i].fRayX / d; // TODO: Check signs + angle_offset[i] = (angles[i].fDetSY * angles[i].fRayX - angles[i].fDetSX * angles[i].fRayY) / d; } - cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_sin, angle_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); - cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_cos, angle_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + + cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_scaled_sin, angle_scaled_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_scaled_cos, angle_scaled_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + cudaError_t e3 = cudaMemcpyToSymbol(gC_angle_offset, angle_offset, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); assert(e1 == cudaSuccess); assert(e2 == cudaSuccess); + assert(e3 == cudaSuccess); - if (TOffsets) { - cudaError_t e3 = cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); - assert(e3 == cudaSuccess); - } - delete[] angle_sin; - delete[] angle_cos; + delete[] angle_scaled_sin; + delete[] angle_scaled_cos; + delete[] angle_offset; dim3 dimBlock(g_blockSlices, g_blockSliceSize); dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices, @@ -263,9 +221,9 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch, for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) { if (dims.iRaysPerPixelDim > 1) - devBP_SS<<>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims, fOutputScale); + devBP_SS<<>>(D_volumeData, volumePitch, i, dims, fOutputScale); else - devBP<<>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims, fOutputScale); + devBP<<>>(D_volumeData, volumePitch, i, dims, fOutputScale); } cudaThreadSynchronize(); @@ -278,7 +236,7 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch, bool BP(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, - const SDimensions& dims, const float* angles, const float* TOffsets, float fOutputScale) + const SDimensions& dims, const SParProjection* angles, float fOutputScale) { for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) { SDimensions subdims = dims; @@ -290,9 +248,7 @@ bool BP(float* D_volumeData, unsigned int volumePitch, bool ret; ret = BP_internal(D_volumeData, volumePitch, D_projData + iAngle * projPitch, projPitch, - subdims, angles + iAngle, - TOffsets ? TOffsets + iAngle : 0, - fOutputScale); + subdims, angles + iAngle, fOutputScale); if (!ret) return false; } @@ -303,25 +259,23 @@ bool BP(float* D_volumeData, unsigned int volumePitch, bool BP_SART(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, unsigned int angle, const SDimensions& dims, - const float* angles, const float* TOffsets, float fOutputScale) + const SParProjection* angles, float fOutputScale) { // Only one angle. // We need to Clamp to the border pixels instead of to zero, because // SART weights with ray length. bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1, cudaAddressModeClamp); - float angle_sin = sinf(angles[angle]); - float angle_cos = cosf(angles[angle]); - - float offset = 0.0f; - if (TOffsets) - offset = TOffsets[angle]; + double d = angles[angle].fDetUX * angles[angle].fRayY - angles[angle].fDetUY * angles[angle].fRayX; + float angle_scaled_cos = angles[angle].fRayY / d; + float angle_scaled_sin = -angles[angle].fRayX / d; // TODO: Check signs + float angle_offset = (angles[angle].fDetSY * angles[angle].fRayX - angles[angle].fDetSX * angles[angle].fRayY) / d; dim3 dimBlock(g_blockSlices, g_blockSliceSize); dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices, (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize); - devBP_SART<<>>(D_volumeData, volumePitch, offset, angle_sin, angle_cos, dims, fOutputScale); + devBP_SART<<>>(D_volumeData, volumePitch, angle_offset, angle_scaled_sin, angle_scaled_cos, dims, fOutputScale); cudaThreadSynchronize(); cudaTextForceKernelsCompletion(); diff --git a/cuda/2d/par_bp.h b/cuda/2d/par_bp.h index 64bcd34..102cb2b 100644 --- a/cuda/2d/par_bp.h +++ b/cuda/2d/par_bp.h @@ -35,13 +35,13 @@ namespace astraCUDA { _AstraExport bool BP(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, - const SDimensions& dims, const float* angles, - const float* TOffsets, float fOutputScale); + const SDimensions& dims, const SParProjection* angles, + float fOutputScale); _AstraExport bool BP_SART(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, unsigned int angle, const SDimensions& dims, - const float* angles, const float* TOffsets, float fOutputScale); + const SParProjection* angles, float fOutputScale); } diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu index bb8b909..3c010b3 100644 --- a/cuda/2d/par_fp.cu +++ b/cuda/2d/par_fp.cu @@ -30,6 +30,7 @@ $Id$ #include #include #include +#include #include "util.h" #include "arith.h" @@ -51,6 +52,7 @@ namespace astraCUDA { static const unsigned g_MaxAngles = 2560; __constant__ float gC_angle[g_MaxAngles]; __constant__ float gC_angle_offset[g_MaxAngles]; +__constant__ float gC_angle_detsize[g_MaxAngles]; // optimization parameters @@ -63,10 +65,6 @@ static const unsigned int g_blockSlices = 64; #define iPREC_FACTOR 16 -// if necessary, a buffer of zeroes of size g_MaxAngles -static float* g_pfZeroes = 0; - - static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height) { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); @@ -89,9 +87,10 @@ static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned i return true; } + // projection for angles that are roughly horizontal -// theta between 45 and 135 degrees -__global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale) +// (detector roughly vertical) +__global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) { const int relDet = threadIdx.x; const int relAngle = threadIdx.y; @@ -106,33 +105,7 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned const float sin_theta = __sinf(theta); // compute start detector for this block/angle: - // (The same for all threadIdx.x) - // ------------------------------------- - - const int midSlice = startSlice + g_blockSlices / 2; - - // ASSUMPTION: fDetScale >= 1.0f - // problem: detector regions get skipped because slice blocks aren't large - // enough - const unsigned int g_blockSliceSize = g_detBlockSize; - - // project (midSlice,midRegion) on this thread's detector - - const float fBase = 0.5f*dims.iProjDets - 0.5f + - ( - (midSlice - 0.5f*dims.iVolWidth + 0.5f) * cos_theta - - (g_blockSliceSize/2 - 0.5f*dims.iVolHeight + 0.5f) * sin_theta - + gC_angle_offset[angle] - ) / dims.fDetScale; - int iBase = (int)floorf(fBase * fPREC_FACTOR); - int iInc = (int)floorf(g_blockSliceSize * sin_theta / dims.fDetScale * -fPREC_FACTOR); - - // ASSUMPTION: 16 > regionOffset / fDetScale - const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; - const int detPrevRegion = (iBase + (blockIdx.y - regionOffset - 1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; - - if (blockIdx.y > 0 && detRegion == detPrevRegion) - return; + const int detRegion = blockIdx.y; const int detector = detRegion * g_detBlockSize + relDet; @@ -142,7 +115,7 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned if (detector < 0 || detector >= dims.iProjDets) return; - const float fDetStep = -dims.fDetScale / sin_theta; + const float fDetStep = -gC_angle_detsize[angle] / sin_theta; float fSliceStep = cos_theta / sin_theta; float fDistCorr; if (sin_theta > 0.0f) @@ -192,9 +165,10 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned D_projData[angle*projPitch+detector] += fVal * fDistCorr; } + // projection for angles that are roughly vertical -// theta between 0 and 45, or 135 and 180 degrees -__global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale) +// (detector roughly horizontal) +__global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) { const int relDet = threadIdx.x; const int relAngle = threadIdx.y; @@ -209,33 +183,7 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i const float sin_theta = __sinf(theta); // compute start detector for this block/angle: - // (The same for all threadIdx.x) - // ------------------------------------- - - const int midSlice = startSlice + g_blockSlices / 2; - - // project (midSlice,midRegion) on this thread's detector - - // ASSUMPTION: fDetScale >= 1.0f - // problem: detector regions get skipped because slice blocks aren't large - // enough - const unsigned int g_blockSliceSize = g_detBlockSize; - - const float fBase = 0.5f*dims.iProjDets - 0.5f + - ( - (g_blockSliceSize/2 - 0.5f*dims.iVolWidth + 0.5f) * cos_theta - - (midSlice - 0.5f*dims.iVolHeight + 0.5f) * sin_theta - + gC_angle_offset[angle] - ) / dims.fDetScale; - int iBase = (int)floorf(fBase * fPREC_FACTOR); - int iInc = (int)floorf(g_blockSliceSize * cos_theta / dims.fDetScale * fPREC_FACTOR); - - // ASSUMPTION: 16 > regionOffset / fDetScale - const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; - const int detPrevRegion = (iBase + (blockIdx.y - regionOffset-1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; - - if (blockIdx.y > 0 && detRegion == detPrevRegion) - return; + const int detRegion = blockIdx.y; const int detector = detRegion * g_detBlockSize + relDet; @@ -245,7 +193,7 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i if (detector < 0 || detector >= dims.iProjDets) return; - const float fDetStep = dims.fDetScale / cos_theta; + const float fDetStep = gC_angle_detsize[angle] / cos_theta; float fSliceStep = sin_theta / cos_theta; float fDistCorr; if (cos_theta < 0.0f) @@ -293,183 +241,67 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i D_projData[angle*projPitch+detector] += fVal * fDistCorr; } -// projection for angles that are roughly horizontal -// (detector roughly vertical) -__global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) -{ - const int relDet = threadIdx.x; - const int relAngle = threadIdx.y; - - int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; - - if (angle >= endAngle) - return; - - const float theta = gC_angle[angle]; - const float cos_theta = __cosf(theta); - const float sin_theta = __sinf(theta); - - // compute start detector for this block/angle: - const int detRegion = blockIdx.y; - - const int detector = detRegion * g_detBlockSize + relDet; - - // Now project the part of the ray to angle,detector through - // slices startSlice to startSlice+g_blockSlices-1 - - if (detector < 0 || detector >= dims.iProjDets) - return; - - const float fDetStep = -dims.fDetScale / sin_theta; - float fSliceStep = cos_theta / sin_theta; - float fDistCorr; - if (sin_theta > 0.0f) - fDistCorr = -fDetStep; - else - fDistCorr = fDetStep; - fDistCorr *= outputScale; - - float fVal = 0.0f; - // project detector on slice - float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 0.5f; - float fS = startSlice + 0.5f; - int endSlice = startSlice + g_blockSlices; - if (endSlice > dims.iVolWidth) - endSlice = dims.iVolWidth; - if (dims.iRaysPerDet > 1) { - fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; - const float fSubDetStep = fDetStep / dims.iRaysPerDet; - fDistCorr /= dims.iRaysPerDet; - fSliceStep -= dims.iRaysPerDet * fSubDetStep; +// Coordinates of center of detector pixel number t: +// x = (t - 0.5*nDets + 0.5 - fOffset) * fSize * cos(fAngle) +// y = - (t - 0.5*nDets + 0.5 - fOffset) * fSize * sin(fAngle) - for (int slice = startSlice; slice < endSlice; ++slice) - { - for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { - fVal += tex2D(gT_volumeTexture, fS, fP); - fP += fSubDetStep; - } - fP += fSliceStep; - fS += 1.0f; - } - - } else { - for (int slice = startSlice; slice < endSlice; ++slice) - { - fVal += tex2D(gT_volumeTexture, fS, fP); - fP += fSliceStep; - fS += 1.0f; - } - - - } - - D_projData[angle*projPitch+detector] += fVal * fDistCorr; -} - - -// projection for angles that are roughly vertical -// (detector roughly horizontal) -__global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) +static void convertAndUploadAngles(const SParProjection *projs, unsigned int nth, unsigned int ndets) { - const int relDet = threadIdx.x; - const int relAngle = threadIdx.y; - - int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; - - if (angle >= endAngle) - return; - - const float theta = gC_angle[angle]; - const float cos_theta = __cosf(theta); - const float sin_theta = __sinf(theta); - - // compute start detector for this block/angle: - const int detRegion = blockIdx.y; - - const int detector = detRegion * g_detBlockSize + relDet; + float *angles = new float[nth]; + float *offset = new float[nth]; + float *detsizes = new float[nth]; - // Now project the part of the ray to angle,detector through - // slices startSlice to startSlice+g_blockSlices-1 + for (int i = 0; i < nth; ++i) { - if (detector < 0 || detector >= dims.iProjDets) - return; +#warning TODO: Replace by getParParamaters - const float fDetStep = dims.fDetScale / cos_theta; - float fSliceStep = sin_theta / cos_theta; - float fDistCorr; - if (cos_theta < 0.0f) - fDistCorr = -fDetStep; - else - fDistCorr = fDetStep; - fDistCorr *= outputScale; + // Take part of DetU orthogonal to Ray + double ux = projs[i].fDetUX; + double uy = projs[i].fDetUY; - float fVal = 0.0f; - float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f; - float fS = startSlice+0.5f; - int endSlice = startSlice + g_blockSlices; - if (endSlice > dims.iVolHeight) - endSlice = dims.iVolHeight; + double t = (ux * projs[i].fRayX + uy * projs[i].fRayY) / (projs[i].fRayX * projs[i].fRayX + projs[i].fRayY * projs[i].fRayY); - if (dims.iRaysPerDet > 1) { + ux -= t * projs[i].fRayX; + uy -= t * projs[i].fRayY; - fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; - const float fSubDetStep = fDetStep / dims.iRaysPerDet; - fDistCorr /= dims.iRaysPerDet; + double angle = atan2(uy, ux); - fSliceStep -= dims.iRaysPerDet * fSubDetStep; + angles[i] = angle; - for (int slice = startSlice; slice < endSlice; ++slice) - { - for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { - fVal += tex2D(gT_volumeTexture, fP, fS); - fP += fSubDetStep; - } - fP += fSliceStep; - fS += 1.0f; - } + double norm2 = uy * uy + ux * ux; - } else { - - for (int slice = startSlice; slice < endSlice; ++slice) - { - fVal += tex2D(gT_volumeTexture, fP, fS); - fP += fSliceStep; - fS += 1.0f; - } + detsizes[i] = sqrt(norm2); + // CHECKME: SIGNS? + offset[i] = -0.5*ndets - (projs[i].fDetSY*uy + projs[i].fDetSX*ux) / norm2; } - D_projData[angle*projPitch+detector] += fVal * fDistCorr; + cudaMemcpyToSymbol(gC_angle, angles, nth*sizeof(float), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(gC_angle_offset, offset, nth*sizeof(float), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(gC_angle_detsize, detsizes, nth*sizeof(float), 0, cudaMemcpyHostToDevice); } bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, - const SDimensions& dims, const float* angles, - const float* TOffsets, float outputScale) + const SDimensions& dims, const SParProjection* angles, + float outputScale) { - // TODO: load angles into constant memory in smaller blocks assert(dims.iProjAngles <= g_MaxAngles); + assert(angles); + cudaArray* D_dataArray; bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight); - cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); - if (TOffsets) { - cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); - } else { - if (!g_pfZeroes) { - g_pfZeroes = new float[g_MaxAngles]; - memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float)); - } - cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); - } + convertAndUploadAngles(angles, dims.iProjAngles, dims.iProjDets); + dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // detector block size, angles @@ -492,7 +324,7 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch, // Maybe we should detect corner cases and put them in the optimal // group of angles. if (a != dims.iProjAngles) - vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a]))); + vertical = (fabsf(angles[a].fRayX) <= fabsf(angles[a].fRayY)); if (a == dims.iProjAngles || vertical != blockVertical) { // block done @@ -536,8 +368,8 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch, bool FP_simple(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, - const SDimensions& dims, const float* angles, - const float* TOffsets, float outputScale) + const SDimensions& dims, const SParProjection* angles, + float outputScale) { for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) { SDimensions subdims = dims; @@ -550,7 +382,7 @@ bool FP_simple(float* D_volumeData, unsigned int volumePitch, ret = FP_simple_internal(D_volumeData, volumePitch, D_projData + iAngle * projPitch, projPitch, subdims, angles + iAngle, - TOffsets ? TOffsets + iAngle : 0, outputScale); + outputScale); if (!ret) return false; } @@ -559,106 +391,12 @@ bool FP_simple(float* D_volumeData, unsigned int volumePitch, bool FP(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, - const SDimensions& dims, const float* angles, - const float* TOffsets, float outputScale) + const SDimensions& dims, const SParProjection* angles, + float outputScale) { return FP_simple(D_volumeData, volumePitch, D_projData, projPitch, - dims, angles, TOffsets, outputScale); - - // TODO: Fix bug in this non-simple FP with large detscale and TOffsets -#if 0 - - // TODO: load angles into constant memory in smaller blocks - assert(dims.iProjAngles <= g_MaxAngles); - - // TODO: compute region size dynamically to resolve these two assumptions - // ASSUMPTION: 16 > regionOffset / fDetScale - const unsigned int g_blockSliceSize = g_detBlockSize; - assert(16 > (g_blockSlices / g_blockSliceSize) / dims.fDetScale); - // ASSUMPTION: fDetScale >= 1.0f - assert(dims.fDetScale > 0.9999f); - - cudaArray* D_dataArray; - bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight); - - cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); - - if (TOffsets) { - cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); - } else { - if (!g_pfZeroes) { - g_pfZeroes = new float[g_MaxAngles]; - memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float)); - } - cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); - } - - int regionOffset = g_blockSlices / g_blockSliceSize; - - dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles - - std::list streams; - + dims, angles, outputScale); - // Run over all angles, grouping them into groups of the same - // orientation (roughly horizontal vs. roughly vertical). - // Start a stream of grids for each such group. - - // TODO: Check if it's worth it to store this info instead - // of recomputing it every FP. - - unsigned int blockStart = 0; - unsigned int blockEnd = 0; - bool blockVertical = false; - for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { - bool vertical; - // TODO: Having <= instead of < below causes a 5% speedup. - // Maybe we should detect corner cases and put them in the optimal - // group of angles. - if (a != dims.iProjAngles) - vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a]))); - if (a == dims.iProjAngles || vertical != blockVertical) { - // block done - - blockEnd = a; - if (blockStart != blockEnd) { - unsigned int length = dims.iVolHeight; - if (blockVertical) - length = dims.iVolWidth; - dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock, - (length+g_blockSliceSize-1)/g_blockSliceSize+2*regionOffset); // angle blocks, regions - // TODO: check if we can't immediately - // destroy the stream after use - cudaStream_t stream; - cudaStreamCreate(&stream); - streams.push_back(stream); - //printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical); - if (!blockVertical) - for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices) - FPhorizontal<<>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale); - else - for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices) - FPvertical<<>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale); - } - blockVertical = vertical; - blockStart = a; - } - } - - for (std::list::iterator iter = streams.begin(); iter != streams.end(); ++iter) - cudaStreamDestroy(*iter); - - streams.clear(); - - cudaThreadSynchronize(); - - cudaTextForceKernelsCompletion(); - - cudaFreeArray(D_dataArray); - - - return true; -#endif } diff --git a/cuda/2d/par_fp.h b/cuda/2d/par_fp.h index 010d064..5fd593c 100644 --- a/cuda/2d/par_fp.h +++ b/cuda/2d/par_fp.h @@ -33,8 +33,8 @@ namespace astraCUDA { _AstraExport bool FP(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, - const SDimensions& dims, const float* angles, - const float* TOffsets, float fOutputScale); + const SDimensions& dims, const SParProjection* angles, + float fOutputScale); } diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu index c8608a3..a1085cd 100644 --- a/cuda/2d/sart.cu +++ b/cuda/2d/sart.cu @@ -254,10 +254,10 @@ bool SART::callFP_SART(float* D_volumeData, unsigned int volumePitch, { SDimensions d = dims; d.iProjAngles = 1; - if (angles) { + if (parProjs) { assert(!fanProjs); return FP(D_volumeData, volumePitch, D_projData, projPitch, - d, &angles[angle], TOffsets, outputScale); + d, &parProjs[angle], outputScale); } else { assert(fanProjs); return FanFP(D_volumeData, volumePitch, D_projData, projPitch, @@ -269,10 +269,10 @@ bool SART::callBP_SART(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, unsigned int angle, float outputScale) { - if (angles) { + if (parProjs) { assert(!fanProjs); return BP_SART(D_volumeData, volumePitch, D_projData, projPitch, - angle, dims, angles, TOffsets, outputScale); + angle, dims, parProjs, outputScale); } else { assert(fanProjs); return FanBP_SART(D_volumeData, volumePitch, D_projData, projPitch, diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu index 4baaccb..9dc4f64 100644 --- a/cuda/2d/sirt.cu +++ b/cuda/2d/sirt.cu @@ -152,7 +152,7 @@ bool SIRT::precomputeWeights() bool SIRT::doSlabCorrections() { // This function compensates for effectively infinitely large slab-like - // objects of finite thickness 1. + // objects of finite thickness 1 in a parallel beam geometry. // Each ray through the object has an intersection of length d/cos(alpha). // The length of the ray actually intersecting the reconstruction volume is @@ -170,6 +170,10 @@ bool SIRT::doSlabCorrections() if (useVolumeMask || useSinogramMask) return false; + // Parallel-beam only + if (!parProjs) + return false; + // multiply by line weights processSino(D_sinoData, D_lineWeight, projPitch, dims); @@ -181,7 +185,10 @@ bool SIRT::doSlabCorrections() float bound = cosf(1.3963f); float* t = (float*)D_sinoData; for (int i = 0; i < dims.iProjAngles; ++i) { - float f = fabs(cosf(angles[i])); + // TODO: Checkme + // TODO: Replace by getParParameters + double angle = atan2(parProjs[i].fRayX, -parProjs[i].fRayY); + float f = fabs(cosf(angle)); if (f < bound) f = bound; @@ -189,7 +196,6 @@ bool SIRT::doSlabCorrections() processSino(t, f, sinoPitch, subdims); t += sinoPitch; } - return true; } @@ -299,40 +305,6 @@ float SIRT::computeDiffNorm() } -bool doSIRT(float* D_volumeData, unsigned int volumePitch, - float* D_sinoData, unsigned int sinoPitch, - float* D_maskData, unsigned int maskPitch, - const SDimensions& dims, const float* angles, - const float* TOffsets, unsigned int iterations) -{ - SIRT sirt; - bool ok = true; - - ok &= sirt.setGeometry(dims, angles); - if (D_maskData) - ok &= sirt.enableVolumeMask(); - if (TOffsets) - ok &= sirt.setTOffsets(TOffsets); - - if (!ok) - return false; - - ok = sirt.init(); - if (!ok) - return false; - - if (D_maskData) - ok &= sirt.setVolumeMask(D_maskData, maskPitch); - - ok &= sirt.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); - if (!ok) - return false; - - ok = sirt.iterate(iterations); - - return ok; -} - } #ifdef STANDALONE diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu index 0e13be1..e7418a2 100644 --- a/cuda/3d/fdk.cu +++ b/cuda/3d/fdk.cu @@ -353,7 +353,7 @@ bool FDK_Filter(cudaPitchedPtr D_projData, // The filtering is a regular ramp filter per detector line. int iPaddedDetCount = calcNextPowerOfTwo(2 * dims.iProjU); - int iHalfFFTSize = calcFFTFourSize(iPaddedDetCount); + int iHalfFFTSize = astraCUDA::calcFFTFourierSize(iPaddedDetCount); int projPitch = D_projData.pitch/sizeof(float); @@ -361,22 +361,22 @@ bool FDK_Filter(cudaPitchedPtr D_projData, float* D_sinoData = (float*)D_projData.ptr; cufftComplex * D_sinoFFT = NULL; - allocateComplexOnDevice(dims.iProjAngles, iHalfFFTSize, &D_sinoFFT); + astraCUDA::allocateComplexOnDevice(dims.iProjAngles, iHalfFFTSize, &D_sinoFFT); bool ok = true; for (int v = 0; v < dims.iProjV; ++v) { - ok = runCudaFFT(dims.iProjAngles, D_sinoData, projPitch, + ok = astraCUDA::runCudaFFT(dims.iProjAngles, D_sinoData, projPitch, dims.iProjU, iPaddedDetCount, iHalfFFTSize, D_sinoFFT); if (!ok) break; - applyFilter(dims.iProjAngles, iHalfFFTSize, D_sinoFFT, D_filter); + astraCUDA::applyFilter(dims.iProjAngles, iHalfFFTSize, D_sinoFFT, D_filter); - ok = runCudaIFFT(dims.iProjAngles, D_sinoFFT, D_sinoData, projPitch, + ok = astraCUDA::runCudaIFFT(dims.iProjAngles, D_sinoFFT, D_sinoData, projPitch, dims.iProjU, iPaddedDetCount, iHalfFFTSize); if (!ok) break; @@ -384,7 +384,7 @@ bool FDK_Filter(cudaPitchedPtr D_projData, D_sinoData += (dims.iProjAngles * projPitch); } - freeComplexOnDevice(D_sinoFFT); + astraCUDA::freeComplexOnDevice(D_sinoFFT); return ok; } @@ -401,7 +401,7 @@ bool FDK(cudaPitchedPtr D_volumeData, // TODO: Check errors cufftComplex * D_filter; int iPaddedDetCount = calcNextPowerOfTwo(2 * dims.iProjU); - int iHalfFFTSize = calcFFTFourSize(iPaddedDetCount); + int iHalfFFTSize = astraCUDA::calcFFTFourierSize(iPaddedDetCount); ok = FDK_PreWeight(D_projData, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, fDetUSize, fDetVSize, @@ -412,11 +412,11 @@ bool FDK(cudaPitchedPtr D_volumeData, cufftComplex *pHostFilter = new cufftComplex[dims.iProjAngles * iHalfFFTSize]; memset(pHostFilter, 0, sizeof(cufftComplex) * dims.iProjAngles * iHalfFFTSize); - genFilter(FILTER_RAMLAK, 1.0f, dims.iProjAngles, pHostFilter, iPaddedDetCount, iHalfFFTSize); + astraCUDA::genFilter(astra::FILTER_RAMLAK, 1.0f, dims.iProjAngles, pHostFilter, iPaddedDetCount, iHalfFFTSize); - allocateComplexOnDevice(dims.iProjAngles, iHalfFFTSize, &D_filter); - uploadComplexArrayToDevice(dims.iProjAngles, iHalfFFTSize, pHostFilter, D_filter); + astraCUDA::allocateComplexOnDevice(dims.iProjAngles, iHalfFFTSize, &D_filter); + astraCUDA::uploadComplexArrayToDevice(dims.iProjAngles, iHalfFFTSize, pHostFilter, D_filter); delete [] pHostFilter; @@ -430,7 +430,7 @@ bool FDK(cudaPitchedPtr D_volumeData, bShortScan, dims, angles); // Clean up filter - freeComplexOnDevice(D_filter); + astraCUDA::freeComplexOnDevice(D_filter); if (!ok) -- cgit v1.2.3