diff options
-rw-r--r-- | cuda/2d/algo.cu | 12 | ||||
-rw-r--r-- | cuda/2d/fan_bp.cu | 62 | ||||
-rw-r--r-- | cuda/2d/par_bp.cu | 20 | ||||
-rw-r--r-- | cuda/3d/cone_bp.cu | 6 | ||||
-rw-r--r-- | src/CudaReconstructionAlgorithm2D.cpp | 3 | ||||
-rw-r--r-- | tests/python/test_line2d.py | 59 |
6 files changed, 132 insertions, 30 deletions
diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu index b4c2864..11422ff 100644 --- a/cuda/2d/algo.cu +++ b/cuda/2d/algo.cu @@ -258,10 +258,10 @@ bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPit if (!ok) return false; - // rescale sinogram to adjust for pixel size - processSino<opMul>(D_sinoData, fSinogramScale, - //1.0f/(fPixelSize*fPixelSize), - sinoPitch, dims); + // rescale sinogram + if (fSinogramScale != 1.0f) + processSino<opMul>(D_sinoData, fSinogramScale, + sinoPitch, dims); ok = copyVolumeToDevice(pfReconstruction, iReconstructionPitch, dims, @@ -331,11 +331,11 @@ bool ReconAlgo::callBP(float* D_volumeData, unsigned int volumePitch, if (parProjs) { assert(!fanProjs); return BP(D_volumeData, volumePitch, D_projData, projPitch, - dims, parProjs, outputScale); + dims, parProjs, fOutputScale * outputScale); } else { assert(fanProjs); return FanBP(D_volumeData, volumePitch, D_projData, projPitch, - dims, fanProjs, outputScale); + dims, fanProjs, fOutputScale * outputScale); } } diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu index dac3ac2..2072cd4 100644 --- a/cuda/2d/fan_bp.cu +++ b/cuda/2d/fan_bp.cu @@ -48,7 +48,7 @@ const unsigned int g_anglesPerBlock = 16; const unsigned int g_blockSliceSize = 32; const unsigned int g_blockSlices = 16; -const unsigned int g_MaxAngles = 2560; +const unsigned int g_MaxAngles = 2240; __constant__ float gC_SrcX[g_MaxAngles]; __constant__ float gC_SrcY[g_MaxAngles]; @@ -56,6 +56,7 @@ __constant__ float gC_DetSX[g_MaxAngles]; __constant__ float gC_DetSY[g_MaxAngles]; __constant__ float gC_DetUX[g_MaxAngles]; __constant__ float gC_DetUY[g_MaxAngles]; +__constant__ float gC_Scale[g_MaxAngles]; static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder) @@ -96,8 +97,6 @@ __global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int s float fVal = 0.0f; float fA = startAngle + 0.5f; - // TODO: Distance correction? - for (int angle = startAngle; angle < endAngle; ++angle) { const float fSrcX = gC_SrcX[angle]; @@ -106,15 +105,24 @@ __global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int s const float fDetSY = gC_DetSY[angle]; const float fDetUX = gC_DetUX[angle]; const float fDetUY = gC_DetUY[angle]; + const float fScale = gC_Scale[angle]; const float fXD = fSrcX - fX; const float fYD = fSrcY - fY; const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX; const float fDen = fDetUX * fYD - fDetUY * fXD; - - const float fT = fNum / fDen; - fVal += tex2D(gT_FanProjTexture, fT, fA); + + // fDen = || u (x-s) || (2x2 determinant) + + // Scale factor is the approximate number of rays traversing this pixel, + // given by the inverse size of a detector pixel scaled by the magnification + // factor of this pixel. + // Magnification factor is || u (d-s) || / || u (x-s) || + + const float fr = __fdividef(1.0f, fDen); + const float fT = fNum * fr; + fVal += tex2D(gT_FanProjTexture, fT, fA) * fScale * fr; fA += 1.0f; } @@ -148,8 +156,6 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in float fVal = 0.0f; float fA = startAngle + 0.5f; - // TODO: Distance correction? - for (int angle = startAngle; angle < endAngle; ++angle) { const float fSrcX = gC_SrcX[angle]; @@ -158,6 +164,7 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in const float fDetSY = gC_DetSY[angle]; const float fDetUX = gC_DetUX[angle]; const float fDetUY = gC_DetUY[angle]; + const float fScale = gC_Scale[angle]; // TODO: Optimize these loops... float fX = fXb; @@ -169,9 +176,10 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX; const float fDen = fDetUX * fYD - fDetUY * fXD; - - const float fT = fNum / fDen; - fVal += tex2D(gT_FanProjTexture, fT, fA); + const float fr = __fdividef(1.0f, fDen); + + const float fT = fNum * fr; + fVal += tex2D(gT_FanProjTexture, fT, fA) * fScale * fr; fY -= fSubStep; } fX += fSubStep; @@ -202,8 +210,6 @@ __global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDi float* volData = (float*)D_volData; - // TODO: Distance correction? - // TODO: Constant memory vs parameters. const float fSrcX = gC_SrcX[0]; const float fSrcY = gC_SrcY[0]; @@ -211,15 +217,17 @@ __global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDi const float fDetSY = gC_DetSY[0]; const float fDetUX = gC_DetUX[0]; const float fDetUY = gC_DetUY[0]; + const float fScale = gC_Scale[0]; const float fXD = fSrcX - fX; const float fYD = fSrcY - fY; const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX; const float fDen = fDetUX * fYD - fDetUY * fXD; - - const float fT = fNum / fDen; - const float fVal = tex2D(gT_FanProjTexture, fT, 0.5f); + const float fr = __fdividef(1.0f, fDen); + + const float fT = fNum * fr; + const float fVal = tex2D(gT_FanProjTexture, fT, 0.5f) * fScale * fr; volData[Y*volPitch+X] += fVal * fOutputScale; } @@ -248,7 +256,7 @@ __global__ void devFanBP_FBPWeighted(float* D_volData, unsigned int volPitch, un float fVal = 0.0f; float fA = startAngle + 0.5f; - // TODO: Distance correction? + // TODO: Update for new projection scaling for (int angle = startAngle; angle < endAngle; ++angle) { @@ -299,6 +307,13 @@ bool FanBP_internal(float* D_volumeData, unsigned int volumePitch, #undef TRANSFER_TO_CONSTANT + for (unsigned int i = 0; i < dims.iProjAngles; ++i) { + double detsize = sqrt(angles[i].fDetUX * angles[i].fDetUX + angles[i].fDetUY * angles[i].fDetUY); + double scale = (angles[i].fDetUX * (angles[i].fSrcY - angles[i].fDetSY) - angles[i].fDetUY * (angles[i].fSrcX - angles[i].fDetSX)) / detsize; + tmp[i] = (float)scale; + } + cudaMemcpyToSymbol(gC_Scale, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + delete[] tmp; dim3 dimBlock(g_blockSlices, g_blockSliceSize); @@ -346,6 +361,14 @@ bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch, #undef TRANSFER_TO_CONSTANT + for (unsigned int i = 0; i < dims.iProjAngles; ++i) { + double detsize = sqrt(angles[i].fDetUX * angles[i].fDetUX + angles[i].fDetUY * angles[i].fDetUY); + double scale = (angles[i].fDetUX * (angles[i].fSrcY - angles[i].fDetSY) - angles[i].fDetUY * (angles[i].fSrcX - angles[i].fDetSX)) / detsize; + tmp[i] = (float)scale; + } + cudaMemcpyToSymbol(gC_Scale, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + + delete[] tmp; dim3 dimBlock(g_blockSlices, g_blockSliceSize); @@ -389,6 +412,11 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch, #undef TRANSFER_TO_CONSTANT + double detsize = sqrt(angles[angle].fDetUX * angles[angle].fDetUX + angles[angle].fDetUY * angles[angle].fDetUY); + double scale = (angles[angle].fDetUX * (angles[angle].fSrcY - angles[angle].fDetSY) - angles[angle].fDetUY * (angles[angle].fSrcX - angles[angle].fDetSX)) / detsize; + float tmp = (float)scale; + cudaMemcpyToSymbol(gC_Scale, &tmp, sizeof(float), 0, cudaMemcpyHostToDevice); + dim3 dimBlock(g_blockSlices, g_blockSliceSize); dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices, (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize); diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu index 09a6554..21484b1 100644 --- a/cuda/2d/par_bp.cu +++ b/cuda/2d/par_bp.cu @@ -53,6 +53,7 @@ const unsigned int g_MaxAngles = 2560; __constant__ float gC_angle_scaled_sin[g_MaxAngles]; __constant__ float gC_angle_scaled_cos[g_MaxAngles]; __constant__ float gC_angle_offset[g_MaxAngles]; +__constant__ float gC_angle_scale[g_MaxAngles]; static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder) { @@ -70,6 +71,7 @@ static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int wi return true; } +// TODO: Templated version with/without scale? (Or only the global outputscale) __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims, float fOutputScale) { const int relX = threadIdx.x; @@ -97,9 +99,10 @@ __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int star const float scaled_cos_theta = gC_angle_scaled_cos[angle]; const float scaled_sin_theta = gC_angle_scaled_sin[angle]; const float TOffset = gC_angle_offset[angle]; + const float scale = gC_angle_scale[angle]; const float fT = fX * scaled_cos_theta - fY * scaled_sin_theta + TOffset; - fVal += tex2D(gT_projTexture, fT, fA); + fVal += tex2D(gT_projTexture, fT, fA) * scale; fA += 1.0f; } @@ -138,6 +141,7 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s const float cos_theta = gC_angle_scaled_cos[angle]; const float sin_theta = gC_angle_scaled_sin[angle]; const float TOffset = gC_angle_offset[angle]; + const float scale = gC_angle_scale[angle]; float fT = fX * cos_theta - fY * sin_theta + TOffset; @@ -145,7 +149,7 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s float fTy = fT; fT += fSubStep * cos_theta; for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) { - fVal += tex2D(gT_projTexture, fTy, fA); + fVal += tex2D(gT_projTexture, fTy, fA) * scale; fTy -= fSubStep * sin_theta; } } @@ -172,6 +176,8 @@ __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset const float fT = fX * angle_cos - fY * angle_sin + offset; const float fVal = tex2D(gT_projTexture, fT, 0.5f); + // NB: The 'scale' constant in devBP has been folded into fOutputScale by the caller here + D_volData[Y*volPitch+X] += fVal * fOutputScale; } @@ -186,27 +192,34 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch, float* angle_scaled_sin = new float[dims.iProjAngles]; float* angle_scaled_cos = new float[dims.iProjAngles]; float* angle_offset = new float[dims.iProjAngles]; + float* angle_scale = new float[dims.iProjAngles]; bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); for (unsigned int i = 0; i < dims.iProjAngles; ++i) { double d = angles[i].fDetUX * angles[i].fRayY - angles[i].fDetUY * angles[i].fRayX; angle_scaled_cos[i] = angles[i].fRayY / d; - angle_scaled_sin[i] = -angles[i].fRayX / d; // TODO: Check signs + angle_scaled_sin[i] = -angles[i].fRayX / d; angle_offset[i] = (angles[i].fDetSY * angles[i].fRayX - angles[i].fDetSX * angles[i].fRayY) / d; + angle_scale[i] = sqrt(angles[i].fRayX * angles[i].fRayX + angles[i].fRayY * angles[i].fRayY) / abs(d); } + //fprintf(stderr, "outputscale in BP_internal: %f, %f\n", fOutputScale, angle_scale[0]); + //fprintf(stderr, "ray in BP_internal: %f,%f (length %f)\n", angles[0].fRayX, angles[0].fRayY, sqrt(angles[0].fRayX * angles[0].fRayX + angles[0].fRayY * angles[0].fRayY)); cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_scaled_sin, angle_scaled_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_scaled_cos, angle_scaled_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); cudaError_t e3 = cudaMemcpyToSymbol(gC_angle_offset, angle_offset, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + cudaError_t e4 = cudaMemcpyToSymbol(gC_angle_scale, angle_scale, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); assert(e1 == cudaSuccess); assert(e2 == cudaSuccess); assert(e3 == cudaSuccess); + assert(e4 == cudaSuccess); delete[] angle_scaled_sin; delete[] angle_scaled_cos; delete[] angle_offset; + delete[] angle_scale; dim3 dimBlock(g_blockSlices, g_blockSliceSize); dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices, @@ -267,6 +280,7 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch, float angle_scaled_cos = angles[angle].fRayY / d; float angle_scaled_sin = -angles[angle].fRayX / d; // TODO: Check signs float angle_offset = (angles[angle].fDetSY * angles[angle].fRayX - angles[angle].fDetSX * angles[angle].fRayY) / d; + fOutputScale *= sqrt(angles[angle].fRayX * angles[angle].fRayX + angles[angle].fRayY * angles[angle].fRayY) / abs(d); dim3 dimBlock(g_blockSlices, g_blockSliceSize); dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices, diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu index feebda2..2a7ec80 100644 --- a/cuda/3d/cone_bp.cu +++ b/cuda/3d/cone_bp.cu @@ -140,8 +140,10 @@ __global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, int startAng if (FDKWEIGHT) { // The correct factor here is this one: // Z[idx] += (fr*fCd.w)*(fr*fCd.w)*fVal; - // This is the square of the inverse magnification factor - // from fX,fY,fZ to the detector. + // This is the square of the magnification factor + // from fX,fY,fZ to the virtual detector, where the + // virtual detector is the plane through the origin + // parallel to the detector, so spanned by u,v. // Since we are assuming we have a circular cone // beam trajectory, fCd.w is constant, and we instead diff --git a/src/CudaReconstructionAlgorithm2D.cpp b/src/CudaReconstructionAlgorithm2D.cpp index 1e81390..939a026 100644 --- a/src/CudaReconstructionAlgorithm2D.cpp +++ b/src/CudaReconstructionAlgorithm2D.cpp @@ -309,8 +309,7 @@ void CCudaReconstructionAlgorithm2D::run(int _iNrIterations) m_bAlgoInit = true; } - float fPixelSize = volgeom.getPixelLengthX(); - float fSinogramScale = 1.0f/(fPixelSize*fPixelSize); + float fSinogramScale = 1.0f; ok = m_pAlgo->copyDataToGPU(m_pSinogram->getDataConst(), m_pSinogram->getGeometry()->getDetectorCount(), fSinogramScale, m_pReconstruction->getDataConst(), volgeom.getGridColCount(), diff --git a/tests/python/test_line2d.py b/tests/python/test_line2d.py index 4c3d174..8b6e200 100644 --- a/tests/python/test_line2d.py +++ b/tests/python/test_line2d.py @@ -586,10 +586,66 @@ class Test2DKernel(unittest.TestCase): else: raise RuntimeError("Unsupported projector") + def single_test_adjoint(self, type, proj_type): + shape = np.random.randint(*range2d, size=2) + if FLEXVOL: + if not NONSQUARE: + pixsize = np.array([0.5, 0.5]) + np.random.random() + else: + pixsize = 0.5 + np.random.random(size=2) + origin = 10 * np.random.random(size=2) + else: + pixsize = (1.,1.) + origin = (0.,0.) + vg = astra.create_vol_geom(shape[1], shape[0], + origin[0] - 0.5 * shape[0] * pixsize[0], + origin[0] + 0.5 * shape[0] * pixsize[0], + origin[1] - 0.5 * shape[1] * pixsize[1], + origin[1] + 0.5 * shape[1] * pixsize[1]) + + if type == 'parallel': + pg = gen_random_geometry_parallel() + projector_id = astra.create_projector(proj_type, pg, vg) + elif type == 'parallel_vec': + pg = gen_random_geometry_parallel_vec() + projector_id = astra.create_projector(proj_type, pg, vg) + elif type == 'fanflat': + pg = gen_random_geometry_fanflat() + projector_id = astra.create_projector(proj_type_to_fan(proj_type), pg, vg) + elif type == 'fanflat_vec': + pg = gen_random_geometry_fanflat_vec() + projector_id = astra.create_projector(proj_type_to_fan(proj_type), pg, vg) + + for i in range(5): + X = np.random.random((shape[1], shape[0])) + Y = np.random.random(astra.geom_size(pg)) + + sinogram_id, fX = astra.create_sino(X, projector_id) + bp_id, fTY = astra.create_backprojection(Y, projector_id) + + astra.data2d.delete(sinogram_id) + astra.data2d.delete(bp_id) + + da = np.dot(fX.ravel(), Y.ravel()) + db = np.dot(X.ravel(), fTY.ravel()) + m = np.abs(da - db) + TOL = 1e-3 if 'cuda' not in proj_type else 1e-1 + if m / da >= TOL: + print(vg) + print(pg) + print(m/da, da/db, da, db) + self.assertTrue(m / da < TOL) + astra.projector.delete(projector_id) + + def multi_test(self, type, proj_type): np.random.seed(seed) for _ in range(nloops): self.single_test(type, proj_type) + def multi_test_adjoint(self, type, proj_type): + np.random.seed(seed) + for _ in range(nloops): + self.single_test_adjoint(type, proj_type) __combinations = { 'parallel': [ 'line', 'linear', 'distance_driven', 'strip', 'cuda' ], 'parallel_vec': [ 'line', 'linear', 'distance_driven', 'strip', 'cuda' ], @@ -600,7 +656,10 @@ for k, l in __combinations.items(): for v in l: def f(k,v): return lambda self: self.multi_test(k, v) + def f_adj(k,v): + return lambda self: self.multi_test_adjoint(k, v) setattr(Test2DKernel, 'test_' + k + '_' + v, f(k,v)) + setattr(Test2DKernel, 'test_' + k + '_' + v + '_adjoint', f_adj(k,v)) if __name__ == '__main__': unittest.main() |