diff options
Diffstat (limited to 'cuda/3d/par3d_fp.cu')
-rw-r--r-- | cuda/3d/par3d_fp.cu | 814 |
1 files changed, 814 insertions, 0 deletions
diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu new file mode 100644 index 0000000..6bf9037 --- /dev/null +++ b/cuda/3d/par3d_fp.cu @@ -0,0 +1,814 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> +#include "util3d.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +#include "dims3d.h" + +typedef texture<float, 3, cudaReadModeElementType> texture3D; + +static texture3D gT_par3DVolumeTexture; + +namespace astraCUDA3d { + +static const unsigned int g_anglesPerBlock = 4; + +// thickness of the slices we're splitting the volume up into +static const unsigned int g_blockSlices = 64; +static const unsigned int g_detBlockU = 32; +static const unsigned int g_detBlockV = 32; + +static const unsigned g_MaxAngles = 1024; +__constant__ float gC_RayX[g_MaxAngles]; +__constant__ float gC_RayY[g_MaxAngles]; +__constant__ float gC_RayZ[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetSZ[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; +__constant__ float gC_DetUZ[g_MaxAngles]; +__constant__ float gC_DetVX[g_MaxAngles]; +__constant__ float gC_DetVY[g_MaxAngles]; +__constant__ float gC_DetVZ[g_MaxAngles]; + + +static bool bindVolumeDataTexture(const cudaArray* array) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_par3DVolumeTexture.addressMode[0] = cudaAddressModeClamp; + gT_par3DVolumeTexture.addressMode[1] = cudaAddressModeClamp; + gT_par3DVolumeTexture.addressMode[2] = cudaAddressModeClamp; + gT_par3DVolumeTexture.filterMode = cudaFilterModeLinear; + gT_par3DVolumeTexture.normalized = false; + + cudaBindTextureToArray(gT_par3DVolumeTexture, array, channelDesc); + + // TODO: error value? + + return true; +} + + + +// threadIdx: x = u detector +// y = relative angle +// blockIdx: x = u/v detector +// y = angle block + +#define PAR3D_FP_BODY(c0,c1,c2) \ + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \ + if (angle >= endAngle) \ + return; \ + \ + const float fRayX = gC_RayX[angle]; \ + const float fRayY = gC_RayY[angle]; \ + const float fRayZ = gC_RayZ[angle]; \ + const float fDetUX = gC_DetUX[angle]; \ + const float fDetUY = gC_DetUY[angle]; \ + const float fDetUZ = gC_DetUZ[angle]; \ + const float fDetVX = gC_DetVX[angle]; \ + const float fDetVY = gC_DetVY[angle]; \ + const float fDetVZ = gC_DetVZ[angle]; \ + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \ + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \ + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \ + \ + \ + \ + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \ + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \ + int endDetectorV = startDetectorV + g_detBlockV; \ + if (endDetectorV > dims.iProjV) \ + endDetectorV = dims.iProjV; \ + \ + int endSlice = startSlice + g_blockSlices; \ + if (endSlice > dims.iVol##c0) \ + endSlice = dims.iVol##c0; \ + \ + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \ + { \ + /* Trace ray in direction Ray to (detectorU,detectorV) from */ \ + /* X = startSlice to X = endSlice */ \ + \ + const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \ + const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \ + const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \ + \ + /* (x) ( 1) ( 0) */ \ + /* ray: (y) = (ay) * x + (by) */ \ + /* (z) (az) (bz) */ \ + \ + const float a##c1 = fRay##c1 / fRay##c0; \ + const float a##c2 = fRay##c2 / fRay##c0; \ + const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \ + const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \ + \ + const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \ + \ + float fVal = 0.0f; \ + \ + float f##c0 = startSlice + 1.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\ + \ + for (int s = startSlice; s < endSlice; ++s) \ + { \ + fVal += tex3D(gT_par3DVolumeTexture, fX, fY, fZ); \ + f##c0 += 1.0f; \ + f##c1 += a##c1; \ + f##c2 += a##c2; \ + } \ + \ + fVal *= fDistCorr; \ + \ + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \ + } + + + +// Supersampling version +#define PAR3D_FP_SS_BODY(c0,c1,c2) \ + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \ + if (angle >= endAngle) \ + return; \ + \ + const float fRayX = gC_RayX[angle]; \ + const float fRayY = gC_RayY[angle]; \ + const float fRayZ = gC_RayZ[angle]; \ + const float fDetUX = gC_DetUX[angle]; \ + const float fDetUY = gC_DetUY[angle]; \ + const float fDetUZ = gC_DetUZ[angle]; \ + const float fDetVX = gC_DetVX[angle]; \ + const float fDetVY = gC_DetVY[angle]; \ + const float fDetVZ = gC_DetVZ[angle]; \ + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \ + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \ + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \ + \ + \ + \ + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \ + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \ + int endDetectorV = startDetectorV + g_detBlockV; \ + if (endDetectorV > dims.iProjV) \ + endDetectorV = dims.iProjV; \ + \ + int endSlice = startSlice + g_blockSlices; \ + if (endSlice > dims.iVol##c0) \ + endSlice = dims.iVol##c0; \ + \ + const float fSubStep = 1.0f/dims.iRaysPerDetDim; \ + \ + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \ + { \ + \ + float fV = 0.0f; \ + \ + float fdU = detectorU - 0.5f + 0.5f*fSubStep; \ + for (int iSubU = 0; iSubU < dims.iRaysPerDetDim; ++iSubU, fdU+=fSubStep) { \ + float fdV = detectorV - 0.5f + 0.5f*fSubStep; \ + for (int iSubV = 0; iSubV < dims.iRaysPerDetDim; ++iSubV, fdV+=fSubStep) { \ + \ + /* Trace ray in direction Ray to (detectorU,detectorV) from */ \ + /* X = startSlice to X = endSlice */ \ + \ + const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX; \ + const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY; \ + const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ; \ + \ + /* (x) ( 1) ( 0) */ \ + /* ray: (y) = (ay) * x + (by) */ \ + /* (z) (az) (bz) */ \ + \ + const float a##c1 = fRay##c1 / fRay##c0; \ + const float a##c2 = fRay##c2 / fRay##c0; \ + const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \ + const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \ + \ + const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \ + \ + float fVal = 0.0f; \ + \ + float f##c0 = startSlice + 1.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\ + \ + for (int s = startSlice; s < endSlice; ++s) \ + { \ + fVal += tex3D(gT_par3DVolumeTexture, fX, fY, fZ); \ + f##c0 += 1.0f; \ + f##c1 += a##c1; \ + f##c2 += a##c2; \ + } \ + \ + fVal *= fDistCorr; \ + fV += fVal; \ + \ + } \ + } \ + \ + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (dims.iRaysPerDetDim * dims.iRaysPerDetDim);\ + } + + + +__global__ void par3D_FP_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_BODY(X,Y,Z) +} + +__global__ void par3D_FP_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_BODY(Y,X,Z) +} + +__global__ void par3D_FP_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_BODY(Z,X,Y) +} + +__global__ void par3D_FP_SS_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SS_BODY(X,Y,Z) +} + +__global__ void par3D_FP_SS_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SS_BODY(Y,X,Z) +} + +__global__ void par3D_FP_SS_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SS_BODY(Z,X,Y) +} + + +__device__ float dirWeights(float fX, float fN) { + if (fX <= 0.5f) // outside image on left + return 0.0f; + if (fX <= 1.5f) // half outside image on left + return (fX - 0.5f) * (fX - 0.5f); + if (fX <= fN + 0.5f) { // inside image + float t = fX - 0.5f - floorf(fX - 0.5f); + return t*t + (1-t)*(1-t); + } + if (fX <= fN + 1.5f) // half outside image on right + return (fN + 1.5f - fX) * (fN + 1.5f - fX); + return 0.0f; // outside image on right +} + +#define PAR3D_FP_SUMSQW_BODY(c0,c1,c2) \ + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \ + if (angle >= endAngle) \ + return; \ + \ + const float fRayX = gC_RayX[angle]; \ + const float fRayY = gC_RayY[angle]; \ + const float fRayZ = gC_RayZ[angle]; \ + const float fDetUX = gC_DetUX[angle]; \ + const float fDetUY = gC_DetUY[angle]; \ + const float fDetUZ = gC_DetUZ[angle]; \ + const float fDetVX = gC_DetVX[angle]; \ + const float fDetVY = gC_DetVY[angle]; \ + const float fDetVZ = gC_DetVZ[angle]; \ + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \ + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \ + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \ + \ + \ + \ + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \ + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \ + int endDetectorV = startDetectorV + g_detBlockV; \ + if (endDetectorV > dims.iProjV) \ + endDetectorV = dims.iProjV; \ + \ + int endSlice = startSlice + g_blockSlices; \ + if (endSlice > dims.iVol##c0) \ + endSlice = dims.iVol##c0; \ + \ + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \ + { \ + /* Trace ray in direction Ray to (detectorU,detectorV) from */ \ + /* X = startSlice to X = endSlice */ \ + \ + const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \ + const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \ + const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \ + \ + /* (x) ( 1) ( 0) */ \ + /* ray: (y) = (ay) * x + (by) */ \ + /* (z) (az) (bz) */ \ + \ + const float a##c1 = fRay##c1 / fRay##c0; \ + const float a##c2 = fRay##c2 / fRay##c0; \ + const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \ + const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \ + \ + const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \ + \ + float fVal = 0.0f; \ + \ + float f##c0 = startSlice + 1.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\ + \ + for (int s = startSlice; s < endSlice; ++s) \ + { \ + fVal += dirWeights(f##c1, dims.iVol##c1) * dirWeights(f##c2, dims.iVol##c2) * fDistCorr * fDistCorr; \ + f##c0 += 1.0f; \ + f##c1 += a##c1; \ + f##c2 += a##c2; \ + } \ + \ + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \ + } + +// Supersampling version +// TODO + + +__global__ void par3D_FP_SumSqW_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SUMSQW_BODY(X,Y,Z) +} + +__global__ void par3D_FP_SumSqW_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SUMSQW_BODY(Y,X,Z) +} + +__global__ void par3D_FP_SumSqW_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SUMSQW_BODY(Z,X,Y) +} + + + +bool Par3DFP_Array(cudaArray *D_volArray, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale) +{ + + bindVolumeDataTexture(D_volArray); + + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(RayX); + TRANSFER_TO_CONSTANT(RayY); + TRANSFER_TO_CONSTANT(RayZ); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetSZ); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + TRANSFER_TO_CONSTANT(DetUZ); + TRANSFER_TO_CONSTANT(DetVX); + TRANSFER_TO_CONSTANT(DetVY); + TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + std::list<cudaStream_t> streams; + dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + int blockDirection = 0; + + // timeval t; + // tic(t); + + for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { + int dir; + if (a != dims.iProjAngles) { + float dX = fabsf(angles[a].fRayX); + float dY = fabsf(angles[a].fRayY); + float dZ = fabsf(angles[a].fRayZ); + + if (dX >= dY && dX >= dZ) + dir = 0; + else if (dY >= dX && dY >= dZ) + dir = 1; + else + dir = 2; + } + + if (a == dims.iProjAngles || dir != blockDirection) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + + dim3 dimGrid( + ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); + // TODO: check if we can't immediately + // destroy the stream after use + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + + // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + + if (blockDirection == 0) { + for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + par3D_FP_SS_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } else if (blockDirection == 1) { + for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + par3D_FP_SS_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } else if (blockDirection == 2) { + for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + par3D_FP_SS_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } + + } + + blockDirection = dir; + blockStart = a; + } + } + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) + cudaStreamDestroy(*iter); + + streams.clear(); + + cudaTextForceKernelsCompletion(); + + + // printf("%f\n", toc(t)); + + return true; +} + +bool Par3DFP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale) +{ + // transfer volume to array + cudaArray* cuArray = allocateVolumeArray(dims); + transferVolumeToArray(D_volumeData, cuArray, dims); + + bool ret = Par3DFP_Array(cuArray, D_projData, dims, angles, fOutputScale); + + cudaFreeArray(cuArray); + + return ret; +} + + + +bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale) +{ + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(RayX); + TRANSFER_TO_CONSTANT(RayY); + TRANSFER_TO_CONSTANT(RayZ); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetSZ); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + TRANSFER_TO_CONSTANT(DetUZ); + TRANSFER_TO_CONSTANT(DetVX); + TRANSFER_TO_CONSTANT(DetVY); + TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + std::list<cudaStream_t> streams; + dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + int blockDirection = 0; + + // timeval t; + // tic(t); + + for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { + int dir; + if (a != dims.iProjAngles) { + float dX = fabsf(angles[a].fRayX); + float dY = fabsf(angles[a].fRayY); + float dZ = fabsf(angles[a].fRayZ); + + if (dX >= dY && dX >= dZ) + dir = 0; + else if (dY >= dX && dY >= dZ) + dir = 1; + else + dir = 2; + } + + if (a == dims.iProjAngles || dir != blockDirection) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + + dim3 dimGrid( + ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); + // TODO: check if we can't immediately + // destroy the stream after use + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + + // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + + if (blockDirection == 0) { + for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else +#if 0 + par3D_FP_SS_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else + assert(false); +#endif + } else if (blockDirection == 1) { + for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else +#if 0 + par3D_FP_SS_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else + assert(false); +#endif + } else if (blockDirection == 2) { + for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else +#if 0 + par3D_FP_SS_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else + assert(false); +#endif + } + + } + + blockDirection = dir; + blockStart = a; + } + } + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) + cudaStreamDestroy(*iter); + + streams.clear(); + + cudaTextForceKernelsCompletion(); + + + // printf("%f\n", toc(t)); + + return true; +} + + + + + + + +} + +#ifdef STANDALONE + +using namespace astraCUDA3d; + +int main() +{ + cudaSetDevice(1); + + + SDimensions3D dims; + dims.iVolX = 500; + dims.iVolY = 500; + dims.iVolZ = 81; + dims.iProjAngles = 241; + dims.iProjU = 600; + dims.iProjV = 100; + dims.iRaysPerDet = 1; + + SPar3DProjection base; + base.fRayX = 1.0f; + base.fRayY = 0.0f; + base.fRayZ = 0.1f; + + base.fDetSX = 0.0f; + base.fDetSY = -300.0f; + base.fDetSZ = -50.0f; + + base.fDetUX = 0.0f; + base.fDetUY = 1.0f; + base.fDetUZ = 0.0f; + + base.fDetVX = 0.0f; + base.fDetVY = 0.0f; + base.fDetVZ = 1.0f; + + SPar3DProjection angle[dims.iProjAngles]; + + cudaPitchedPtr volData; // pitch, ptr, xsize, ysize + + volData = allocateVolumeData(dims); + + cudaPitchedPtr projData; // pitch, ptr, xsize, ysize + + projData = allocateProjectionData(dims); + + unsigned int ix = 500,iy = 500; + + float* buf = new float[dims.iProjU*dims.iProjV]; + + float* slice = new float[dims.iVolX*dims.iVolY]; + for (int i = 0; i < dims.iVolX*dims.iVolY; ++i) + slice[i] = 1.0f; + + for (unsigned int a = 0; a < 241; a += dims.iProjAngles) { + + zeroProjectionData(projData, dims); + + for (int y = 0; y < iy; y += dims.iVolY) { + for (int x = 0; x < ix; x += dims.iVolX) { + + timeval st; + tic(st); + + for (int z = 0; z < dims.iVolZ; ++z) { +// char sfn[256]; +// sprintf(sfn, "/home/wpalenst/projects/cone_simulation/phantom_4096/mouse_fem_phantom_%04d.png", 30+z); +// float* slice = loadSubImage(sfn, x, y, dims.iVolX, dims.iVolY); + + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = dims.iVolX*sizeof(float); + ptr.xsize = dims.iVolX*sizeof(float); + ptr.ysize = dims.iVolY; + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, z }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaError err = cudaMemcpy3D(&p); + assert(!err); +// delete[] slice; + } + + printf("Load: %f\n", toc(st)); + +#if 0 + + cudaPos zp = { 0, 0, 0 }; + + cudaPitchedPtr t; + t.ptr = new float[1024*1024]; + t.pitch = 1024*4; + t.xsize = 1024*4; + t.ysize = 1024; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = volData; + p.extent = extentS; + p.dstArray = 0; + p.dstPtr = t; + p.dstPos = zp; + p.kind = cudaMemcpyDeviceToHost; + cudaError err = cudaMemcpy3D(&p); + assert(!err); + + char fn[32]; + sprintf(fn, "t%d%d.png", x / dims.iVolX, y / dims.iVolY); + saveImage(fn, 1024, 1024, (float*)t.ptr); + saveImage("s.png", 4096, 4096, slice); + delete[] (float*)t.ptr; +#endif + + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = base.f##name##X * cos(alpha) - base.f##name##Y * sin(alpha); angle[i].f##name##Y = base.f##name##X * sin(alpha) + base.f##name##Y * cos(alpha); angle[i].f##name##Z = base.f##name##Z; } while(0) +#define SHIFT(name,i,x,y) do { angle[i].f##name##X += x; angle[i].f##name##Y += y; } while(0) + for (int i = 0; i < dims.iProjAngles; ++i) { + ROTATE0(Ray, i, (a+i)*.8*M_PI/180); + ROTATE0(DetS, i, (a+i)*.8*M_PI/180); + ROTATE0(DetU, i, (a+i)*.8*M_PI/180); + ROTATE0(DetV, i, (a+i)*.8*M_PI/180); + + +// SHIFT(Src, i, (-x+1536), (-y+1536)); +// SHIFT(DetS, i, (-x+1536), (-y+1536)); + } +#undef ROTATE0 +#undef SHIFT + tic(st); + + astraCUDA3d::Par3DFP(volData, projData, dims, angle, 1.0f); + + printf("FP: %f\n", toc(st)); + + } + } + for (unsigned int aa = 0; aa < dims.iProjAngles; ++aa) { + for (unsigned int v = 0; v < dims.iProjV; ++v) + cudaMemcpy(buf+v*dims.iProjU, ((float*)projData.ptr)+(v*dims.iProjAngles+aa)*(projData.pitch/sizeof(float)), dims.iProjU*sizeof(float), cudaMemcpyDeviceToHost); + + char fname[32]; + sprintf(fname, "proj%03d.png", a+aa); + saveImage(fname, dims.iProjV, dims.iProjU, buf, 0.0f, 1000.0f); + } + } + + delete[] buf; + +} +#endif |