Merge branch 'consistent_scaling'

author: Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl> 2019-09-27 15:16:26 +0200
committer: Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl> 2019-09-27 15:16:26 +0200
commit: 54af7e8e22a3f1c9d90b13291b28d39778c05564 (patch)
tree: 260310b16d624261bb80f82979af27750022259b /cuda/2d
parent: 1fec36f7ccadd5f7dcf2bb59b0654dc53653b0f3 (diff)
parent: b629db207bb263495bfff2e61ce189ccac27b4b9 (diff)
download: astra-54af7e8e22a3f1c9d90b13291b28d39778c05564.tar.gz
astra-54af7e8e22a3f1c9d90b13291b28d39778c05564.tar.bz2
astra-54af7e8e22a3f1c9d90b13291b28d39778c05564.tar.xz
astra-54af7e8e22a3f1c9d90b13291b28d39778c05564.zip
12 files changed, 175 insertions, 839 deletions
diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu
index b4c2864..be15b25 100644
--- a/cuda/2d/algo.cu
+++ b/cuda/2d/algo.cu
@@ -134,8 +134,8 @@ bool ReconAlgo::setGeometry(const astra::CVolumeGeometry2D* pVolGeom,
 	delete[] fanProjs;
 	fanProjs = 0;
 
-	fOutputScale = 1.0f;
-	ok = convertAstraGeometry(pVolGeom, pProjGeom, parProjs, fanProjs, fOutputScale);
+	fProjectorScale = 1.0f;
+	ok = convertAstraGeometry(pVolGeom, pProjGeom, parProjs, fanProjs, fProjectorScale);
 	if (!ok)
 		return false;
 
@@ -242,7 +242,7 @@ bool ReconAlgo::allocateBuffers()
 	return true;
 }
 
-bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale,
+bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch,
                               const float* pfReconstruction, unsigned int iReconstructionPitch,
                               const float* pfVolMask, unsigned int iVolMaskPitch,
                               const float* pfSinoMask, unsigned int iSinoMaskPitch)
@@ -258,11 +258,6 @@ bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPit
 	if (!ok)
 		return false;
 
-	// rescale sinogram to adjust for pixel size
-	processSino<opMul>(D_sinoData, fSinogramScale,
-	                       //1.0f/(fPixelSize*fPixelSize),
-	                       sinoPitch, dims);
-
 	ok = copyVolumeToDevice(pfReconstruction, iReconstructionPitch,
 	                        dims,
 	                        D_volumeData, volumePitch);
@@ -316,11 +311,11 @@ bool ReconAlgo::callFP(float* D_volumeData, unsigned int volumePitch,
 	if (parProjs) {
 		assert(!fanProjs);
 		return FP(D_volumeData, volumePitch, D_projData, projPitch,
-		          dims, parProjs, fOutputScale * outputScale);
+		          dims, parProjs, fProjectorScale * outputScale);
 	} else {
 		assert(fanProjs);
 		return FanFP(D_volumeData, volumePitch, D_projData, projPitch,
-		             dims, fanProjs, fOutputScale * outputScale);
+		             dims, fanProjs, fProjectorScale * outputScale);
 	}
 }
 
@@ -331,11 +326,11 @@ bool ReconAlgo::callBP(float* D_volumeData, unsigned int volumePitch,
 	if (parProjs) {
 		assert(!fanProjs);
 		return BP(D_volumeData, volumePitch, D_projData, projPitch,
-		          dims, parProjs, outputScale);
+		          dims, parProjs, fProjectorScale * outputScale);
 	} else {
 		assert(fanProjs);
 		return FanBP(D_volumeData, volumePitch, D_projData, projPitch,
-		             dims, fanProjs, outputScale);
+		             dims, fanProjs, fProjectorScale * outputScale);
 	}
 
 }
diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu
index ec03517..7ff1c95 100644
--- a/cuda/2d/astra.cu
+++ b/cuda/2d/astra.cu
@@ -302,7 +302,8 @@ static bool convertAstraGeometry_internal(const CVolumeGeometry2D* pVolGeom,
 		pProjs[i].scale(factor);
 	}
 	// CHECKME: Check factor
-	fOutputScale *= pVolGeom->getPixelLengthX() * pVolGeom->getPixelLengthY();
+	// NB: Only valid for square pixels
+	fOutputScale *= pVolGeom->getPixelLengthX();
 
 	return true;
 }
diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu
index b6a9fae..e7238b9 100644
--- a/cuda/2d/cgls.cu
+++ b/cuda/2d/cgls.cu
@@ -29,10 +29,6 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 #include "astra/cuda/2d/util.h"
 #include "astra/cuda/2d/arith.h"
 
-#ifdef STANDALONE
-#include "testutil.h"
-#endif
-
 #include <cstdio>
 #include <cassert>
 
@@ -102,14 +98,14 @@ bool CGLS::setBuffers(float* _D_volumeData, unsigned int _volumePitch,
 	return true;
 }
 
-bool CGLS::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale,
+bool CGLS::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch,
                          const float* pfReconstruction, unsigned int iReconstructionPitch,
                          const float* pfVolMask, unsigned int iVolMaskPitch,
                          const float* pfSinoMask, unsigned int iSinoMaskPitch)
 {
 	sliceInitialized = false;
 
-	return ReconAlgo::copyDataToGPU(pfSinogram, iSinogramPitch, fSinogramScale, pfReconstruction, iReconstructionPitch, pfVolMask, iVolMaskPitch, pfSinoMask, iSinoMaskPitch);
+	return ReconAlgo::copyDataToGPU(pfSinogram, iSinogramPitch, pfReconstruction, iReconstructionPitch, pfVolMask, iVolMaskPitch, pfSinoMask, iSinoMaskPitch);
 }
 
 bool CGLS::iterate(unsigned int iterations)
@@ -206,60 +202,3 @@ float CGLS::computeDiffNorm()
 
 
 }
-
-#ifdef STANDALONE
-
-using namespace astraCUDA;
-
-int main()
-{
-	float* D_volumeData;
-	float* D_sinoData;
-
-	SDimensions dims;
-	dims.iVolWidth = 1024;
-	dims.iVolHeight = 1024;
-	dims.iProjAngles = 512;
-	dims.iProjDets = 1536;
-	dims.fDetScale = 1.0f;
-	dims.iRaysPerDet = 1;
-	unsigned int volumePitch, sinoPitch;
-
-	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
-	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);
-	printf("pitch: %u\n", volumePitch);
-
-	allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
-	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
-	printf("pitch: %u\n", sinoPitch);
-	
-	unsigned int y, x;
-	float* sino = loadImage("sino.png", y, x);
-
-	float* img = new float[dims.iVolWidth*dims.iVolHeight];
-
-	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch);
-
-	float* angle = new float[dims.iProjAngles];
-
-	for (unsigned int i = 0; i < dims.iProjAngles; ++i)
-		angle[i] = i*(M_PI/dims.iProjAngles);
-
-	CGLS cgls;
-
-	cgls.setGeometry(dims, angle);
-	cgls.init();
-
-	cgls.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
-
-	cgls.iterate(25);
-
-	delete[] angle;
-
-	copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
-
-	saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
-
-	return 0;
-}
-#endif
diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu
index aa272d8..df140ec 100644
--- a/cuda/2d/em.cu
+++ b/cuda/2d/em.cu
@@ -29,10 +29,6 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 #include "astra/cuda/2d/util.h"
 #include "astra/cuda/2d/arith.h"
 
-#ifdef STANDALONE
-#include "testutil.h"
-#endif
-
 #include <cstdio>
 #include <cassert>
 
@@ -168,64 +164,3 @@ float EM::computeDiffNorm()
 
 
 }
-
-#ifdef STANDALONE
-
-using namespace astraCUDA;
-
-int main()
-{
-	float* D_volumeData;
-	float* D_sinoData;
-
-	SDimensions dims;
-	dims.iVolWidth = 1024;
-	dims.iVolHeight = 1024;
-	dims.iProjAngles = 512;
-	dims.iProjDets = 1536;
-	dims.fDetScale = 1.0f;
-	dims.iRaysPerDet = 1;
-	unsigned int volumePitch, sinoPitch;
-
-	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
-	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);
-	printf("pitch: %u\n", volumePitch);
-
-	allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
-	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
-	printf("pitch: %u\n", sinoPitch);
-	
-	unsigned int y, x;
-	float* sino = loadImage("sino.png", y, x);
-
-	float* img = new float[dims.iVolWidth*dims.iVolHeight];
-
-	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch);
-
-	float* angle = new float[dims.iProjAngles];
-
-	for (unsigned int i = 0; i < dims.iProjAngles; ++i)
-		angle[i] = i*(M_PI/dims.iProjAngles);
-
-	EM em;
-
-	em.setGeometry(dims, angle);
-	em.init();
-
-	// TODO: Initialize D_volumeData with an unfiltered backprojection
-
-	em.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
-
-	em.iterate(25);
-
-
-	delete[] angle;
-
-	copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
-
-	saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
-
-	return 0;
-}
-
-#endif
diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu
index dac3ac2..76d2fb9 100644
--- a/cuda/2d/fan_bp.cu
+++ b/cuda/2d/fan_bp.cu
@@ -28,10 +28,6 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 #include "astra/cuda/2d/util.h"
 #include "astra/cuda/2d/arith.h"
 
-#ifdef STANDALONE
-#include "testutil.h"
-#endif
-
 #include <cstdio>
 #include <cassert>
 #include <iostream>
@@ -50,12 +46,16 @@ const unsigned int g_blockSlices = 16;
 
 const unsigned int g_MaxAngles = 2560;
 
-__constant__ float gC_SrcX[g_MaxAngles];
-__constant__ float gC_SrcY[g_MaxAngles];
-__constant__ float gC_DetSX[g_MaxAngles];
-__constant__ float gC_DetSY[g_MaxAngles];
-__constant__ float gC_DetUX[g_MaxAngles];
-__constant__ float gC_DetUY[g_MaxAngles];
+struct DevFanParams {
+	float fNumC;
+	float fNumX;
+	float fNumY;
+	float fDenC;
+	float fDenX;
+	float fDenY;
+};
+
+__constant__ DevFanParams gC_C[g_MaxAngles];
 
 
 static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder)
@@ -74,6 +74,7 @@ static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int wi
 	return true;
 }
 
+template<bool FBPWEIGHT>
 __global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims, float fOutputScale)
 {
 	const int relX = threadIdx.x;
@@ -96,25 +97,25 @@ __global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int s
 	float fVal = 0.0f;
 	float fA = startAngle + 0.5f;
 
-	// TODO: Distance correction?
-
 	for (int angle = startAngle; angle < endAngle; ++angle)
 	{
-		const float fSrcX = gC_SrcX[angle];
-		const float fSrcY = gC_SrcY[angle];
-		const float fDetSX = gC_DetSX[angle];
-		const float fDetSY = gC_DetSY[angle];
-		const float fDetUX = gC_DetUX[angle];
-		const float fDetUY = gC_DetUY[angle];
-
-		const float fXD = fSrcX - fX;
-		const float fYD = fSrcY - fY;
-
-		const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
-		const float fDen = fDetUX * fYD - fDetUY * fXD;
-		
-		const float fT = fNum / fDen;
-		fVal += tex2D(gT_FanProjTexture, fT, fA);
+		const float fNumC = gC_C[angle].fNumC;
+		const float fNumX = gC_C[angle].fNumX;
+		const float fNumY = gC_C[angle].fNumY;
+		const float fDenX = gC_C[angle].fDenX;
+		const float fDenY = gC_C[angle].fDenY;
+
+		const float fNum = fNumC + fNumX * fX + fNumY * fY;
+		const float fDen = (FBPWEIGHT ? 1.0 : gC_C[angle].fDenC) + fDenX * fX + fDenY * fY;
+
+		// Scale factor is the approximate number of rays traversing this pixel,
+		// given by the inverse size of a detector pixel scaled by the magnification
+		// factor of this pixel.
+		// Magnification factor is || u (d-s) || / || u (x-s) ||
+
+		const float fr = __fdividef(1.0f, fDen);
+		const float fT = fNum * fr;
+		fVal += tex2D(gT_FanProjTexture, fT, fA) * (FBPWEIGHT ? fr * fr : fr);
 		fA += 1.0f;
 	}
 
@@ -148,30 +149,27 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in
 	float fVal = 0.0f;
 	float fA = startAngle + 0.5f;
 
-	// TODO: Distance correction?
-
 	for (int angle = startAngle; angle < endAngle; ++angle)
 	{
-		const float fSrcX = gC_SrcX[angle];
-		const float fSrcY = gC_SrcY[angle];
-		const float fDetSX = gC_DetSX[angle];
-		const float fDetSY = gC_DetSY[angle];
-		const float fDetUX = gC_DetUX[angle];
-		const float fDetUY = gC_DetUY[angle];
+		const float fNumC = gC_C[angle].fNumC;
+		const float fNumX = gC_C[angle].fNumX;
+		const float fNumY = gC_C[angle].fNumY;
+		const float fDenC = gC_C[angle].fDenC;
+		const float fDenX = gC_C[angle].fDenX;
+		const float fDenY = gC_C[angle].fDenY;
 
 		// TODO: Optimize these loops...
 		float fX = fXb;
 		for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) {
 			float fY = fYb;
 			for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) {
-				const float fXD = fSrcX - fX;
-				const float fYD = fSrcY - fY;
-
-				const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
-				const float fDen = fDetUX * fYD - fDetUY * fXD;
-		
-				const float fT = fNum / fDen;
-				fVal += tex2D(gT_FanProjTexture, fT, fA);
+
+				const float fNum = fNumC + fNumX * fX + fNumY * fY;
+				const float fDen = fDenC + fDenX * fX + fDenY * fY;
+				const float fr = __fdividef(1.0f, fDen);
+
+				const float fT = fNum * fr;
+				fVal += tex2D(gT_FanProjTexture, fT, fA) * fr;
 				fY -= fSubStep;
 			}
 			fX += fSubStep;
@@ -202,77 +200,97 @@ __global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDi
 
 	float* volData = (float*)D_volData;
 
-	// TODO: Distance correction?
-
-	// TODO: Constant memory vs parameters.
-	const float fSrcX = gC_SrcX[0];
-	const float fSrcY = gC_SrcY[0];
-	const float fDetSX = gC_DetSX[0];
-	const float fDetSY = gC_DetSY[0];
-	const float fDetUX = gC_DetUX[0];
-	const float fDetUY = gC_DetUY[0];
+	const float fNumC = gC_C[0].fNumC;
+	const float fNumX = gC_C[0].fNumX;
+	const float fNumY = gC_C[0].fNumY;
+	const float fDenC = gC_C[0].fDenC;
+	const float fDenX = gC_C[0].fDenX;
+	const float fDenY = gC_C[0].fDenY;
 
-	const float fXD = fSrcX - fX;
-	const float fYD = fSrcY - fY;
+	const float fNum = fNumC + fNumX * fX + fNumY * fY;
+	const float fDen = fDenC + fDenX * fX + fDenY * fY;
 
-	const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
-	const float fDen = fDetUX * fYD - fDetUY * fXD;
-		
-	const float fT = fNum / fDen;
+	const float fr = __fdividef(1.0f, fDen);
+	const float fT = fNum * fr;
+	// NB: The scale constant in devBP is cancelled out by the SART weighting
 	const float fVal = tex2D(gT_FanProjTexture, fT, 0.5f);
 
 	volData[Y*volPitch+X] += fVal * fOutputScale;
 }
 
-// Weighted BP for use in fan beam FBP
-// Each pixel/ray is weighted by 1/L^2 where L is the distance to the source.
-__global__ void devFanBP_FBPWeighted(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims, float fOutputScale)
+struct Vec2 {
+	double x;
+	double y;
+	Vec2(double x_, double y_) : x(x_), y(y_) { }
+	Vec2 operator+(const Vec2 &b) const {
+		return Vec2(x + b.x, y + b.y);
+	}
+	Vec2 operator-(const Vec2 &b) const {
+		return Vec2(x - b.x, y - b.y);
+	}
+	Vec2 operator-() const {
+		return Vec2(-x, -y);
+	}
+	double norm() const {
+		return sqrt(x*x + y*y);
+	}
+};
+
+double det2(const Vec2 &a, const Vec2 &b) {
+	return a.x * b.y - a.y * b.x;
+}
+
+
+bool transferConstants(const SFanProjection* angles, unsigned int iProjAngles, bool FBP)
 {
-	const int relX = threadIdx.x;
-	const int relY = threadIdx.y;
+	DevFanParams *p = new DevFanParams[iProjAngles];
 
-	int endAngle = startAngle + g_anglesPerBlock;
-	if (endAngle > dims.iProjAngles)
-		endAngle = dims.iProjAngles;
-	const int X = blockIdx.x * g_blockSlices + relX;
-	const int Y = blockIdx.y * g_blockSliceSize + relY;
+	// We need three values in the kernel:
+	// projected coordinates of pixels on the detector:
+	// || x (s-d) || + ||s d|| / || u (s-x) ||
 
-	if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
-		return;
+	// ray density weighting factor for the adjoint
+	// || u (s-d) || / ( |u| * || u (s-x) || )
 
-	const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f );
-	const float fY = - ( Y - 0.5f*dims.iVolHeight + 0.5f );
+	// fan-beam FBP weighting factor
+	// ( || u s || / || u (s-x) || ) ^ 2
 
-	float* volData = (float*)D_volData;
 
-	float fVal = 0.0f;
-	float fA = startAngle + 0.5f;
 
-	// TODO: Distance correction?
+	for (unsigned int i = 0; i < iProjAngles; ++i) {
+		Vec2 u(angles[i].fDetUX, angles[i].fDetUY);
+		Vec2 s(angles[i].fSrcX, angles[i].fSrcY);
+		Vec2 d(angles[i].fDetSX, angles[i].fDetSY);
 
-	for (int angle = startAngle; angle < endAngle; ++angle)
-	{
-		const float fSrcX = gC_SrcX[angle];
-		const float fSrcY = gC_SrcY[angle];
-		const float fDetSX = gC_DetSX[angle];
-		const float fDetSY = gC_DetSY[angle];
-		const float fDetUX = gC_DetUX[angle];
-		const float fDetUY = gC_DetUY[angle];
-
-		const float fXD = fSrcX - fX;
-		const float fYD = fSrcY - fY;
-
-		const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
-		const float fDen = fDetUX * fYD - fDetUY * fXD;
-
-		const float fWeight = fXD*fXD + fYD*fYD;
-		
-		const float fT = fNum / fDen;
-		fVal += tex2D(gT_FanProjTexture, fT, fA) / fWeight;
-		fA += 1.0f;
+
+
+		double fScale;
+		if (!FBP) {
+			// goal: 1/fDen = || u (s-d) || / ( |u| * || u (s-x) || )
+			// fDen = ( |u| * || u (s-x) || ) / || u (s-d) ||
+			// i.e. scale = |u| /  || u (s-d) ||
+
+			fScale = u.norm() / det2(u, s-d);
+		} else {
+			// goal: 1/fDen = || u s || / || u (s-x) ||
+			// fDen = || u (s-x) || / || u s ||
+			// i.e., scale = 1 / || u s ||
+
+			fScale = 1.0 / det2(u, s);
+		}
+
+		p[i].fNumC = fScale * det2(s,d);
+		p[i].fNumX = fScale * (s-d).y;
+		p[i].fNumY = -fScale * (s-d).x;
+		p[i].fDenC = fScale * det2(u, s); // == 1.0 for FBP
+		p[i].fDenX = fScale * u.y;
+		p[i].fDenY = -fScale * u.x;
 	}
 
-	volData[Y*volPitch+X] += fVal * fOutputScale;
+	// TODO: Check for errors
+	cudaMemcpyToSymbol(gC_C, p, iProjAngles*sizeof(DevFanParams), 0, cudaMemcpyHostToDevice);
+
+	return true;
 }
 
 
@@ -285,21 +303,9 @@ bool FanBP_internal(float* D_volumeData, unsigned int volumePitch,
 
 	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 
-	// transfer angles to constant memory
-	float* tmp = new float[dims.iProjAngles];
-
-#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
-
-	TRANSFER_TO_CONSTANT(SrcX);
-	TRANSFER_TO_CONSTANT(SrcY);
-	TRANSFER_TO_CONSTANT(DetSX);
-	TRANSFER_TO_CONSTANT(DetSY);
-	TRANSFER_TO_CONSTANT(DetUX);
-	TRANSFER_TO_CONSTANT(DetUY);
-
-#undef TRANSFER_TO_CONSTANT
-
-	delete[] tmp;
+	bool ok = transferConstants(angles, dims.iProjAngles, false);
+	if (!ok)
+		return false;
 
 	dim3 dimBlock(g_blockSlices, g_blockSliceSize);
 	dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
@@ -312,7 +318,7 @@ bool FanBP_internal(float* D_volumeData, unsigned int volumePitch,
 		if (dims.iRaysPerPixelDim > 1)
 			devFanBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 		else
-			devFanBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
+			devFanBP<false><<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
 	cudaThreadSynchronize();
 
@@ -332,21 +338,9 @@ bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,
 
 	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 
-	// transfer angles to constant memory
-	float* tmp = new float[dims.iProjAngles];
-
-#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
-
-	TRANSFER_TO_CONSTANT(SrcX);
-	TRANSFER_TO_CONSTANT(SrcY);
-	TRANSFER_TO_CONSTANT(DetSX);
-	TRANSFER_TO_CONSTANT(DetSY);
-	TRANSFER_TO_CONSTANT(DetUX);
-	TRANSFER_TO_CONSTANT(DetUY);
-
-#undef TRANSFER_TO_CONSTANT
-
-	delete[] tmp;
+	bool ok = transferConstants(angles, dims.iProjAngles, true);
+	if (!ok)
+		return false;
 
 	dim3 dimBlock(g_blockSlices, g_blockSliceSize);
 	dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
@@ -356,7 +350,7 @@ bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,
 	cudaStreamCreate(&stream);
 
 	for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
-		devFanBP_FBPWeighted<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
+		devFanBP<true><<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
 	cudaThreadSynchronize();
 
@@ -377,17 +371,9 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
 	// only one angle
 	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1, cudaAddressModeClamp);
 
-	// transfer angle to constant memory
-#define TRANSFER_TO_CONSTANT(name) do { cudaMemcpyToSymbol(gC_##name, &(angles[angle].f##name), sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
-
-	TRANSFER_TO_CONSTANT(SrcX);
-	TRANSFER_TO_CONSTANT(SrcY);
-	TRANSFER_TO_CONSTANT(DetSX);
-	TRANSFER_TO_CONSTANT(DetSY);
-	TRANSFER_TO_CONSTANT(DetUX);
-	TRANSFER_TO_CONSTANT(DetUY);
-
-#undef TRANSFER_TO_CONSTANT
+	bool ok = transferConstants(angles + angle, 1, false);
+	if (!ok)
+		return false;
 
 	dim3 dimBlock(g_blockSlices, g_blockSliceSize);
 	dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
@@ -448,66 +434,3 @@ bool FanBP_FBPWeighted(float* D_volumeData, unsigned int volumePitch,
 
 
 }
-
-#ifdef STANDALONE
-
-using namespace astraCUDA;
-
-int main()
-{
-	float* D_volumeData;
-	float* D_projData;
-
-	SDimensions dims;
-	dims.iVolWidth = 128;
-	dims.iVolHeight = 128;
-	dims.iProjAngles = 180;
-	dims.iProjDets = 256;
-	dims.fDetScale = 1.0f;
-	dims.iRaysPerDet = 1;
-	unsigned int volumePitch, projPitch;
-
-	SFanProjection projs[180];
-
-	projs[0].fSrcX = 0.0f;
-	projs[0].fSrcY = 1536.0f;
-	projs[0].fDetSX = 128.0f;
-	projs[0].fDetSY = -512.0f;
-	projs[0].fDetUX = -1.0f;
-	projs[0].fDetUY = 0.0f;
-
-#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = projs[0].f##name##X * cos(alpha) - projs[0].f##name##Y * sin(alpha); projs[i].f##name##Y = projs[0].f##name##X * sin(alpha) + projs[0].f##name##Y * cos(alpha); } while(0)
-
-	for (int i = 1; i < 180; ++i) {
-		ROTATE0(Src, i, i*2*M_PI/180);
-		ROTATE0(DetS, i, i*2*M_PI/180);
-		ROTATE0(DetU, i, i*2*M_PI/180);
-	}
-
-#undef ROTATE0
-
-	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
-	printf("pitch: %u\n", volumePitch);
-
-	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
-	printf("pitch: %u\n", projPitch);
-
-	unsigned int y, x;
-	float* sino = loadImage("sino.png", y, x);
-
-	float* img = new float[dims.iVolWidth*dims.iVolHeight];
-
-	memset(img, 0, dims.iVolWidth*dims.iVolHeight*sizeof(float));
-
-	copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
-	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
-
-	FanBP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs, 1.0f);
-
-	copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
-
-	saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
-
-	return 0;
-}
-#endif
diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu
index 3479650..60c02f8 100644
--- a/cuda/2d/fan_fp.cu
+++ b/cuda/2d/fan_fp.cu
@@ -28,10 +28,6 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 #include "astra/cuda/2d/util.h"
 #include "astra/cuda/2d/arith.h"
 
-#ifdef STANDALONE
-#include "testutil.h"
-#endif
-
 #include <cstdio>
 #include <cassert>
 #include <iostream>
@@ -308,84 +304,3 @@ bool FanFP(float* D_volumeData, unsigned int volumePitch,
 }
 
 }
-
-#ifdef STANDALONE
-
-using namespace astraCUDA;
-
-int main()
-{
-	float* D_volumeData;
-	float* D_projData;
-
-	SDimensions dims;
-	dims.iVolWidth = 128;
-	dims.iVolHeight = 128;
-	dims.iProjAngles = 180;
-	dims.iProjDets = 256;
-	dims.fDetScale = 1.0f;
-	dims.iRaysPerDet = 1;
-	unsigned int volumePitch, projPitch;
-
-	SFanProjection projs[180];
-
-	projs[0].fSrcX = 0.0f;
-	projs[0].fSrcY = 1536.0f;
-	projs[0].fDetSX = 128.0f;
-	projs[0].fDetSY = -512.0f;
-	projs[0].fDetUX = -1.0f;
-	projs[0].fDetUY = 0.0f;
-
-#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = projs[0].f##name##X * cos(alpha) - projs[0].f##name##Y * sin(alpha); projs[i].f##name##Y = projs[0].f##name##X * sin(alpha) + projs[0].f##name##Y * cos(alpha); } while(0)
-
-	for (int i = 1; i < 180; ++i) {
-		ROTATE0(Src, i, i*2*M_PI/180);
-		ROTATE0(DetS, i, i*2*M_PI/180);
-		ROTATE0(DetU, i, i*2*M_PI/180);
-	}
-
-#undef ROTATE0
-
-	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
-	printf("pitch: %u\n", volumePitch);
-
-	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
-	printf("pitch: %u\n", projPitch);
-
-	unsigned int y, x;
-	float* img = loadImage("phantom128.png", y, x);
-
-	float* sino = new float[dims.iProjAngles * dims.iProjDets];
-
-	memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float));
-
-	copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
-	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
-
-	float* angle = new float[dims.iProjAngles];
-
-	for (unsigned int i = 0; i < dims.iProjAngles; ++i)
-		angle[i] = i*(M_PI/dims.iProjAngles);
-
-	FanFP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs, 1.0f);
-
-	delete[] angle;
-
-	copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
-
-	float s = 0.0f;
-	for (unsigned int y = 0; y < dims.iProjAngles; ++y)
-		for (unsigned int x = 0; x < dims.iProjDets; ++x)
-			s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];
-	printf("cpu norm: %f\n", s);
-
-	//zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
-	s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
-	printf("gpu norm: %f\n", s);
-
-	saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino);
-
-
-	return 0;
-}
-#endif
diff --git a/cuda/2d/fbp.cu b/cuda/2d/fbp.cu
index a5b8a7a..4fc3983 100644
--- a/cuda/2d/fbp.cu
+++ b/cuda/2d/fbp.cu
@@ -58,7 +58,8 @@ int FBP::calcFourierFilterSize(int _iDetectorCount)
 FBP::FBP() : ReconAlgo()
 {
 	D_filter = 0;
-
+	m_bShortScan = false;
+	fReconstructionScale = 1.0f;
 }
 
 FBP::~FBP()
@@ -72,6 +73,8 @@ void FBP::reset()
 		freeComplexOnDevice((cufftComplex *)D_filter);
 		D_filter = 0;
 	}
+	m_bShortScan = false;
+	fReconstructionScale = 1.0f;
 }
 
 bool FBP::init()
@@ -79,6 +82,12 @@ bool FBP::init()
 	return true;
 }
 
+bool FBP::setReconstructionScale(float fScale)
+{
+	fReconstructionScale = fScale;
+	return true;
+}
+
 bool FBP::setFilter(const astra::SFilterConfig &_cfg)
 {
 	if (D_filter)
@@ -292,7 +301,7 @@ bool FBP::iterate(unsigned int iterations)
 
 		astraCUDA3d::FDK_PreWeight(tmp, fOriginSource,
 		              fOriginDetector, 0.0f,
-		              fFanDetSize, 1.0f, /* fPixelSize */ 1.0f,
+		              fFanDetSize, 1.0f,
 		              m_bShortScan, dims3d, pfAngles);
 	} else {
 		// TODO: How should different detector pixel size in different
@@ -319,17 +328,14 @@ bool FBP::iterate(unsigned int iterations)
 	}
 
 	if (fanProjs) {
-		float fOutputScale = 1.0 / (/*fPixelSize * fPixelSize * fPixelSize * */ fFanDetSize * fFanDetSize);
-
-		ok = FanBP_FBPWeighted(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, fanProjs, fOutputScale);
+		ok = FanBP_FBPWeighted(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, fanProjs, fProjectorScale * fReconstructionScale);
 
 	} else {
 		// scale by number of angles. For the fan-beam case, this is already
 		// handled by FDK_PreWeight
 		float fOutputScale = (M_PI / 2.0f) / (float)dims.iProjAngles;
-		//fOutputScale /= fDetSize * fDetSize;
 
-		ok = BP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, parProjs, fOutputScale);
+		ok = BP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, parProjs, fOutputScale * fProjectorScale * fReconstructionScale);
 	}
 	if(!ok)
 	{
diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu
index 2e94b79..8361ad2 100644
--- a/cuda/2d/fft.cu
+++ b/cuda/2d/fft.cu
@@ -314,210 +314,3 @@ void genCuFFTFilter(const SFilterConfig &_cfg, int _iProjectionCount,
 
 
 }
-
-
-#ifdef STANDALONE
-
-__global__ static void doubleFourierOutput_kernel(int _iProjectionCount,
-                                                  int _iDetectorCount,
-                                                  cufftComplex* _pFourierOutput)
-{
-	int iIndex = threadIdx.x + blockIdx.x * blockDim.x;
-	int iProjectionIndex = iIndex / _iDetectorCount;
-	int iDetectorIndex = iIndex % _iDetectorCount;
-
-	if(iProjectionIndex >= _iProjectionCount)
-	{
-		return;
-	}
-
-	if(iDetectorIndex <= (_iDetectorCount / 2))
-	{
-		return;
-	}
-
-	int iOtherDetectorIndex = _iDetectorCount - iDetectorIndex;
-
-	_pFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex].x = _pFourierOutput[iProjectionIndex * _iDetectorCount + iOtherDetectorIndex].x;
-	_pFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex].y = -_pFourierOutput[iProjectionIndex * _iDetectorCount + iOtherDetectorIndex].y;
-}
-
-static void doubleFourierOutput(int _iProjectionCount, int _iDetectorCount,
-                                cufftComplex * _pFourierOutput)
-{
-	const int iBlockSize = 256;
-	int iElementCount = _iProjectionCount * _iDetectorCount;
-	int iBlockCount = (iElementCount + iBlockSize - 1) / iBlockSize;
-
-	doubleFourierOutput_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount,
-	                                                          _iDetectorCount,
-	                                                          _pFourierOutput);
-	CHECK_ERROR("doubleFourierOutput_kernel failed");
-}
-
-
-
-static void writeToMatlabFile(const char * _fileName, const float * _pfData,
-                              int _iRowCount, int _iColumnCount)
-{
-	std::ofstream out(_fileName);
-
-	for(int iRowIndex = 0; iRowIndex < _iRowCount; iRowIndex++)
-	{
-		for(int iColumnIndex = 0; iColumnIndex < _iColumnCount; iColumnIndex++)
-		{
-			out << _pfData[iColumnIndex + iRowIndex * _iColumnCount] << " ";
-		}
-
-		out << std::endl;
-	}
-}
-
-static void convertComplexToRealImg(const cufftComplex * _pComplex,
-                                    int _iElementCount,
-                                    float * _pfReal, float * _pfImaginary)
-{
-	for(int iIndex = 0; iIndex < _iElementCount; iIndex++)
-	{
-		_pfReal[iIndex] = _pComplex[iIndex].x;
-		_pfImaginary[iIndex] = _pComplex[iIndex].y;
-	}
-}
-
-void testCudaFFT()
-{
-	const int iProjectionCount = 2;
-	const int iDetectorCount = 1024;
-	const int iTotalElementCount = iProjectionCount * iDetectorCount;
-
-	float * pfHostProj = new float[iTotalElementCount];
-	memset(pfHostProj, 0, sizeof(float) * iTotalElementCount);
-
-	for(int iProjectionIndex = 0; iProjectionIndex < iProjectionCount; iProjectionIndex++)
-	{
-		for(int iDetectorIndex = 0; iDetectorIndex < iDetectorCount; iDetectorIndex++)
-		{
-//			int
-
-//			pfHostProj[iIndex] = (float)rand() / (float)RAND_MAX;
-		}
-	}
-
-	writeToMatlabFile("proj.mat", pfHostProj, iProjectionCount, iDetectorCount);
-
-	float * pfDevProj = NULL;
-	SAFE_CALL(cudaMalloc((void **)&pfDevProj, sizeof(float) * iTotalElementCount));
-	SAFE_CALL(cudaMemcpy(pfDevProj, pfHostProj, sizeof(float) * iTotalElementCount, cudaMemcpyHostToDevice));
-
-	cufftComplex * pDevFourProj = NULL;
-	SAFE_CALL(cudaMalloc((void **)&pDevFourProj, sizeof(cufftComplex) * iTotalElementCount));
-
-	cufftHandle plan;
-	cufftResult result;
-
-	result = cufftPlan1d(&plan, iDetectorCount, CUFFT_R2C, iProjectionCount);
-	if(result != CUFFT_SUCCESS)
-	{
-		ASTRA_ERROR("Failed to plan 1d r2c fft");
-	}
-
-	result = cufftExecR2C(plan, pfDevProj, pDevFourProj);
-	if(result != CUFFT_SUCCESS)
-	{
-		ASTRA_ERROR("Failed to exec 1d r2c fft");
-	}
-
-	cufftDestroy(plan);
-
-	doubleFourierOutput(iProjectionCount, iDetectorCount, pDevFourProj);
-
-	cufftComplex * pHostFourProj = new cufftComplex[iTotalElementCount];
-	SAFE_CALL(cudaMemcpy(pHostFourProj, pDevFourProj, sizeof(cufftComplex) * iTotalElementCount, cudaMemcpyDeviceToHost));
-
-	float * pfHostFourProjReal = new float[iTotalElementCount];
-	float * pfHostFourProjImaginary = new float[iTotalElementCount];
-
-	convertComplexToRealImg(pHostFourProj, iTotalElementCount, pfHostFourProjReal, pfHostFourProjImaginary);
-
-	writeToMatlabFile("proj_four_real.mat", pfHostFourProjReal, iProjectionCount, iDetectorCount);
-	writeToMatlabFile("proj_four_imaginary.mat", pfHostFourProjImaginary, iProjectionCount, iDetectorCount);
-
-	float * pfDevInFourProj = NULL;
-	SAFE_CALL(cudaMalloc((void **)&pfDevInFourProj, sizeof(float) * iTotalElementCount));
-
-	result = cufftPlan1d(&plan, iDetectorCount, CUFFT_C2R, iProjectionCount);
-	if(result != CUFFT_SUCCESS)
-	{
-		ASTRA_ERROR("Failed to plan 1d c2r fft");
-	}
-
-	result = cufftExecC2R(plan, pDevFourProj, pfDevInFourProj);
-	if(result != CUFFT_SUCCESS)
-	{
-		ASTRA_ERROR("Failed to exec 1d c2r fft");
-	}
-
-	cufftDestroy(plan);
-
-	rescaleInverseFourier(iProjectionCount, iDetectorCount, pfDevInFourProj);
-
-	float * pfHostInFourProj = new float[iTotalElementCount];
-	SAFE_CALL(cudaMemcpy(pfHostInFourProj, pfDevInFourProj, sizeof(float) * iTotalElementCount, cudaMemcpyDeviceToHost));
-
-	writeToMatlabFile("in_four.mat", pfHostInFourProj, iProjectionCount, iDetectorCount);
-
-	SAFE_CALL(cudaFree(pDevFourProj));
-	SAFE_CALL(cudaFree(pfDevProj));
-
-	delete [] pfHostInFourProj;
-	delete [] pfHostFourProjReal;
-	delete [] pfHostFourProjImaginary;
-	delete [] pfHostProj;
-	delete [] pHostFourProj;
-}
-
-void downloadDebugFilterComplex(float * _pfHostSinogram, int _iProjectionCount,
-                                int _iDetectorCount,
-                                cufftComplex * _pDevFilter,
-                                int _iFilterDetCount)
-{
-	cufftComplex * pHostFilter = NULL;
-	size_t complMemSize = sizeof(cufftComplex) * _iFilterDetCount * _iProjectionCount;
-	pHostFilter = (cufftComplex *)malloc(complMemSize);
-	SAFE_CALL(cudaMemcpy(pHostFilter, _pDevFilter, complMemSize, cudaMemcpyDeviceToHost));
-
-	for(int iTargetProjIndex = 0; iTargetProjIndex < _iProjectionCount; iTargetProjIndex++)
-	{
-		for(int iTargetDetIndex = 0; iTargetDetIndex < min(_iDetectorCount, _iFilterDetCount); iTargetDetIndex++)
-		{
-			cufftComplex source = pHostFilter[iTargetDetIndex + iTargetProjIndex * _iFilterDetCount];
-			float fReadValue = sqrtf(source.x * source.x + source.y * source.y);
-			_pfHostSinogram[iTargetDetIndex + iTargetProjIndex * _iDetectorCount] = fReadValue;
-		}
-	}
-
-	free(pHostFilter);
-}
-
-void downloadDebugFilterReal(float * _pfHostSinogram, int _iProjectionCount,
-                             int _iDetectorCount, float * _pfDevFilter,
-                             int _iFilterDetCount)
-{
-	float * pfHostFilter = NULL;
-	size_t memSize = sizeof(float) * _iFilterDetCount * _iProjectionCount;
-	pfHostFilter = (float *)malloc(memSize);
-	SAFE_CALL(cudaMemcpy(pfHostFilter, _pfDevFilter, memSize, cudaMemcpyDeviceToHost));
-
-	for(int iTargetProjIndex = 0; iTargetProjIndex < _iProjectionCount; iTargetProjIndex++)
-	{
-		for(int iTargetDetIndex = 0; iTargetDetIndex < min(_iDetectorCount, _iFilterDetCount); iTargetDetIndex++)
-		{
-			float fSource = pfHostFilter[iTargetDetIndex + iTargetProjIndex * _iFilterDetCount];
-			_pfHostSinogram[iTargetDetIndex + iTargetProjIndex * _iDetectorCount] = fSource;
-		}
-	}
-
-	free(pfHostFilter);
-}
-
-#endif
diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu
index 09a6554..f080abb 100644
--- a/cuda/2d/par_bp.cu
+++ b/cuda/2d/par_bp.cu
@@ -28,10 +28,6 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 #include "astra/cuda/2d/util.h"
 #include "astra/cuda/2d/arith.h"
 
-#ifdef STANDALONE
-#include "testutil.h"
-#endif
-
 #include <cstdio>
 #include <cassert>
 #include <iostream>
@@ -53,6 +49,7 @@ const unsigned int g_MaxAngles = 2560;
 __constant__ float gC_angle_scaled_sin[g_MaxAngles];
 __constant__ float gC_angle_scaled_cos[g_MaxAngles];
 __constant__ float gC_angle_offset[g_MaxAngles];
+__constant__ float gC_angle_scale[g_MaxAngles];
 
 static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder)
 {
@@ -70,6 +67,7 @@ static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int wi
 	return true;
 }
 
+// TODO: Templated version with/without scale? (Or only the global outputscale)
 __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims, float fOutputScale)
 {
 	const int relX = threadIdx.x;
@@ -97,9 +95,10 @@ __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int star
 		const float scaled_cos_theta = gC_angle_scaled_cos[angle];
 		const float scaled_sin_theta = gC_angle_scaled_sin[angle];
 		const float TOffset = gC_angle_offset[angle];
+		const float scale = gC_angle_scale[angle];
 
 		const float fT = fX * scaled_cos_theta - fY * scaled_sin_theta + TOffset;
-		fVal += tex2D(gT_projTexture, fT, fA);
+		fVal += tex2D(gT_projTexture, fT, fA) * scale;
 		fA += 1.0f;
 	}
 
@@ -138,6 +137,7 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s
 		const float cos_theta = gC_angle_scaled_cos[angle];
 		const float sin_theta = gC_angle_scaled_sin[angle];
 		const float TOffset = gC_angle_offset[angle];
+		const float scale = gC_angle_scale[angle];
 
 		float fT = fX * cos_theta - fY * sin_theta + TOffset;
 
@@ -145,7 +145,7 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s
 			float fTy = fT;
 			fT += fSubStep * cos_theta;
 			for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) {
-				fVal += tex2D(gT_projTexture, fTy, fA);
+				fVal += tex2D(gT_projTexture, fTy, fA) * scale;
 				fTy -= fSubStep * sin_theta;
 			}
 		}
@@ -172,6 +172,8 @@ __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset
 	const float fT = fX * angle_cos - fY * angle_sin + offset;
 	const float fVal = tex2D(gT_projTexture, fT, 0.5f);
 
+	// NB: The 'scale' constant in devBP is cancelled out by the SART weighting
+
 	D_volData[Y*volPitch+X] += fVal * fOutputScale;
 }
 
@@ -186,27 +188,34 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch,
 	float* angle_scaled_sin = new float[dims.iProjAngles];
 	float* angle_scaled_cos = new float[dims.iProjAngles];
 	float* angle_offset = new float[dims.iProjAngles];
+	float* angle_scale = new float[dims.iProjAngles];
 
 	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 
 	for (unsigned int i = 0; i < dims.iProjAngles; ++i) {
 		double d = angles[i].fDetUX * angles[i].fRayY - angles[i].fDetUY * angles[i].fRayX;
 		angle_scaled_cos[i] = angles[i].fRayY / d;
-		angle_scaled_sin[i] = -angles[i].fRayX / d; // TODO: Check signs
+		angle_scaled_sin[i] = -angles[i].fRayX / d;
 		angle_offset[i] = (angles[i].fDetSY * angles[i].fRayX - angles[i].fDetSX * angles[i].fRayY) / d;
+		angle_scale[i] = sqrt(angles[i].fRayX * angles[i].fRayX + angles[i].fRayY * angles[i].fRayY) / abs(d);
 	}
+	//fprintf(stderr, "outputscale in BP_internal: %f, %f\n", fOutputScale, angle_scale[0]);
+	//fprintf(stderr, "ray in BP_internal: %f,%f (length %f)\n", angles[0].fRayX, angles[0].fRayY, sqrt(angles[0].fRayX * angles[0].fRayX + angles[0].fRayY * angles[0].fRayY));
 
 	cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_scaled_sin, angle_scaled_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
 	cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_scaled_cos, angle_scaled_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
 	cudaError_t e3 = cudaMemcpyToSymbol(gC_angle_offset, angle_offset, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+	cudaError_t e4 = cudaMemcpyToSymbol(gC_angle_scale, angle_scale, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
 	assert(e1 == cudaSuccess);
 	assert(e2 == cudaSuccess);
 	assert(e3 == cudaSuccess);
+	assert(e4 == cudaSuccess);
 
 
 	delete[] angle_scaled_sin;
 	delete[] angle_scaled_cos;
 	delete[] angle_offset;
+	delete[] angle_scale;
 
 	dim3 dimBlock(g_blockSlices, g_blockSliceSize);
 	dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
@@ -267,6 +276,8 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch,
 	float angle_scaled_cos = angles[angle].fRayY / d;
 	float angle_scaled_sin = -angles[angle].fRayX / d; // TODO: Check signs
 	float angle_offset = (angles[angle].fDetSY * angles[angle].fRayX - angles[angle].fDetSX * angles[angle].fRayY) / d;
+	// NB: The adjoint scaling factor from regular BP is cancelled out by the SART weighting
+	//fOutputScale *= sqrt(angles[angle].fRayX * angles[angle].fRayX + angles[angle].fRayY * angles[angle].fRayY) / abs(d);
 
 	dim3 dimBlock(g_blockSlices, g_blockSliceSize);
 	dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
@@ -282,55 +293,3 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch,
 
 
 }
-
-#ifdef STANDALONE
-
-using namespace astraCUDA;
-
-int main()
-{
-	float* D_volumeData;
-	float* D_projData;
-
-	SDimensions dims;
-	dims.iVolWidth = 1024;
-	dims.iVolHeight = 1024;
-	dims.iProjAngles = 512;
-	dims.iProjDets = 1536;
-	dims.fDetScale = 1.0f;
-	dims.iRaysPerDet = 1;
-
-	unsigned int volumePitch, projPitch;
-
-	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
-	printf("pitch: %u\n", volumePitch);
-
-	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
-	printf("pitch: %u\n", projPitch);
-
-	unsigned int y, x;
-	float* sino = loadImage("sino.png", y, x);
-
-	float* img = new float[dims.iVolWidth*dims.iVolHeight];
-
-	memset(img, 0, dims.iVolWidth*dims.iVolHeight*sizeof(float));
-
-	copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
-	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
-
-	float* angle = new float[dims.iProjAngles];
-
-	for (unsigned int i = 0; i < dims.iProjAngles; ++i)
-		angle[i] = i*(M_PI/dims.iProjAngles);
-
-	BP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0, 1.0f);
-
-	delete[] angle;
-
-	copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
-
-	saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
-
-	return 0;
-}
-#endif
diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu
index 0835301..ea436c3 100644
--- a/cuda/2d/par_fp.cu
+++ b/cuda/2d/par_fp.cu
@@ -28,10 +28,6 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 #include "astra/cuda/2d/util.h"
 #include "astra/cuda/2d/arith.h"
 
-#ifdef STANDALONE
-#include "testutil.h"
-#endif
-
 #include <cstdio>
 #include <cassert>
 #include <iostream>
@@ -115,10 +111,9 @@ __global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, u
 	float fSliceStep = cos_theta / sin_theta;
 	float fDistCorr;
 	if (sin_theta > 0.0f)
-		fDistCorr = -fDetStep;
+		fDistCorr = outputScale / sin_theta;
 	else
-		fDistCorr = fDetStep;
-	fDistCorr *= outputScale;
+		fDistCorr = -outputScale / sin_theta;
 
 	float fVal = 0.0f;
 	// project detector on slice
@@ -193,10 +188,9 @@ __global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, uns
 	float fSliceStep = sin_theta / cos_theta;
 	float fDistCorr;
 	if (cos_theta < 0.0f)
-		fDistCorr = -fDetStep;
+		fDistCorr = -outputScale / cos_theta; 
 	else
-		fDistCorr = fDetStep;
-	fDistCorr *= outputScale;
+		fDistCorr = outputScale / cos_theta;
 
 	float fVal = 0.0f;
 	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f;
@@ -375,65 +369,3 @@ bool FP(float* D_volumeData, unsigned int volumePitch,
 
 
 }
-
-#ifdef STANDALONE
-
-using namespace astraCUDA;
-
-int main()
-{
-	float* D_volumeData;
-	float* D_projData;
-
-	SDimensions dims;
-	dims.iVolWidth = 1024;
-	dims.iVolHeight = 1024;
-	dims.iProjAngles = 512;
-	dims.iProjDets = 1536;
-	dims.fDetScale = 1.0f;
-	dims.iRaysPerDet = 1;
-	unsigned int volumePitch, projPitch;
-
-	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
-	printf("pitch: %u\n", volumePitch);
-
-	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
-	printf("pitch: %u\n", projPitch);
-
-	unsigned int y, x;
-	float* img = loadImage("phantom.png", y, x);
-
-	float* sino = new float[dims.iProjAngles * dims.iProjDets];
-
-	memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float));
-
-	copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
-	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
-
-	float* angle = new float[dims.iProjAngles];
-
-	for (unsigned int i = 0; i < dims.iProjAngles; ++i)
-		angle[i] = i*(M_PI/dims.iProjAngles);
-
-	FP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0, 1.0f);
-
-	delete[] angle;
-
-	copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
-
-	float s = 0.0f;
-	for (unsigned int y = 0; y < dims.iProjAngles; ++y)
-		for (unsigned int x = 0; x < dims.iProjDets; ++x)
-			s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];
-	printf("cpu norm: %f\n", s);
-
-	//zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
-	s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
-	printf("gpu norm: %f\n", s);
-
-	saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino);
-
-
-	return 0;
-}
-#endif
diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu
index 64973ba..12ad6df 100644
--- a/cuda/2d/sart.cu
+++ b/cuda/2d/sart.cu
@@ -254,11 +254,11 @@ bool SART::callFP_SART(float* D_volumeData, unsigned int volumePitch,
 	if (parProjs) {
 		assert(!fanProjs);
 		return FP(D_volumeData, volumePitch, D_projData, projPitch,
-		          d, &parProjs[angle], outputScale);
+		          d, &parProjs[angle], outputScale * fProjectorScale);
 	} else {
 		assert(fanProjs);
 		return FanFP(D_volumeData, volumePitch, D_projData, projPitch,
-		             d, &fanProjs[angle], outputScale);
+		             d, &fanProjs[angle], outputScale * fProjectorScale);
 	}
 }
 
@@ -266,6 +266,7 @@ bool SART::callBP_SART(float* D_volumeData, unsigned int volumePitch,
                        float* D_projData, unsigned int projPitch,
                        unsigned int angle, float outputScale)
 {
+	// NB: No fProjectorScale here, as that it is cancelled out in the SART weighting
 	if (parProjs) {
 		assert(!fanProjs);
 		return BP_SART(D_volumeData, volumePitch, D_projData, projPitch,
diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu
index 2621490..2c5fdc9 100644
--- a/cuda/2d/sirt.cu
+++ b/cuda/2d/sirt.cu
@@ -29,10 +29,6 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 #include "astra/cuda/2d/util.h"
 #include "astra/cuda/2d/arith.h"
 
-#ifdef STANDALONE
-#include "testutil.h"
-#endif
-
 #include <cstdio>
 #include <cassert>
 
@@ -302,62 +298,3 @@ float SIRT::computeDiffNorm()
 
 
 }
-
-#ifdef STANDALONE
-
-using namespace astraCUDA;
-
-int main()
-{
-	float* D_volumeData;
-	float* D_sinoData;
-
-	SDimensions dims;
-	dims.iVolWidth = 1024;
-	dims.iVolHeight = 1024;
-	dims.iProjAngles = 512;
-	dims.iProjDets = 1536;
-	dims.fDetScale = 1.0f;
-	dims.iRaysPerDet = 1;
-	unsigned int volumePitch, sinoPitch;
-
-	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
-	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);
-	printf("pitch: %u\n", volumePitch);
-
-	allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
-	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
-	printf("pitch: %u\n", sinoPitch);
-	
-	unsigned int y, x;
-	float* sino = loadImage("sino.png", y, x);
-
-	float* img = new float[dims.iVolWidth*dims.iVolHeight];
-
-	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch);
-
-	float* angle = new float[dims.iProjAngles];
-
-	for (unsigned int i = 0; i < dims.iProjAngles; ++i)
-		angle[i] = i*(M_PI/dims.iProjAngles);
-
-	SIRT sirt;
-
-	sirt.setGeometry(dims, angle);
-	sirt.init();
-
-	sirt.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
-
-	sirt.iterate(25);
-
-
-	delete[] angle;
-
-	copyVolumeFromDevice(img, dims.iVolWidth, dims, D_volumeData, volumePitch);
-
-	saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
-
-	return 0;
-}
-#endif
-
author	Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl>	2019-09-27 15:16:26 +0200
committer	Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl>	2019-09-27 15:16:26 +0200
commit	54af7e8e22a3f1c9d90b13291b28d39778c05564 (patch)
tree	260310b16d624261bb80f82979af27750022259b /cuda/2d
parent	1fec36f7ccadd5f7dcf2bb59b0654dc53653b0f3 (diff)
parent	b629db207bb263495bfff2e61ce189ccac27b4b9 (diff)
download	astra-54af7e8e22a3f1c9d90b13291b28d39778c05564.tar.gz astra-54af7e8e22a3f1c9d90b13291b28d39778c05564.tar.bz2 astra-54af7e8e22a3f1c9d90b13291b28d39778c05564.tar.xz astra-54af7e8e22a3f1c9d90b13291b28d39778c05564.zip