From 9458268a8b9192af98fc1b88bf0a5fbbc7696a77 Mon Sep 17 00:00:00 2001
From: Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl>
Date: Thu, 12 Mar 2015 14:28:03 +0100
Subject: Add outputScale argument to 2D CUDA BP

---
 cuda/2d/algo.cu   |  7 ++++---
 cuda/2d/algo.h    |  3 ++-
 cuda/2d/astra.cu  | 12 +++++-------
 cuda/2d/cgls.cu   |  4 ++--
 cuda/2d/em.cu     |  4 ++--
 cuda/2d/fan_bp.cu | 47 +++++++++++++++++++++++++++--------------------
 cuda/2d/fan_bp.h  |  9 ++++++---
 cuda/2d/par_bp.cu | 31 +++++++++++++++++--------------
 cuda/2d/par_bp.h  |  4 ++--
 cuda/2d/sart.cu   | 10 +++++-----
 cuda/2d/sart.h    |  2 +-
 cuda/2d/sirt.cu   |  6 +++---
 12 files changed, 76 insertions(+), 63 deletions(-)

(limited to 'cuda')

diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu
index 144fabd..dc74e51 100644
--- a/cuda/2d/algo.cu
+++ b/cuda/2d/algo.cu
@@ -336,16 +336,17 @@ bool ReconAlgo::callFP(float* D_volumeData, unsigned int volumePitch,
 }
 
 bool ReconAlgo::callBP(float* D_volumeData, unsigned int volumePitch,
-                       float* D_projData, unsigned int projPitch)
+                       float* D_projData, unsigned int projPitch,
+                       float outputScale)
 {
 	if (angles) {
 		assert(!fanProjs);
 		return BP(D_volumeData, volumePitch, D_projData, projPitch,
-		          dims, angles, TOffsets);
+		          dims, angles, TOffsets, outputScale);
 	} else {
 		assert(fanProjs);
 		return FanBP(D_volumeData, volumePitch, D_projData, projPitch,
-		             dims, fanProjs);
+		             dims, fanProjs, outputScale);
 	}
 
 }
diff --git a/cuda/2d/algo.h b/cuda/2d/algo.h
index a75905e..99959c8 100644
--- a/cuda/2d/algo.h
+++ b/cuda/2d/algo.h
@@ -118,7 +118,8 @@ protected:
 	            float* D_projData, unsigned int projPitch,
 	            float outputScale);
 	bool callBP(float* D_volumeData, unsigned int volumePitch,
-	            float* D_projData, unsigned int projPitch);
+	            float* D_projData, unsigned int projPitch,
+	            float outputScale);
 
 
 	SDimensions dims;
diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu
index 0b5be06..5e2a07a 100644
--- a/cuda/2d/astra.cu
+++ b/cuda/2d/astra.cu
@@ -367,21 +367,19 @@ bool AstraFBP::run()
 
 	}
 
+	float fOutputScale = (M_PI / 2.0f) / (float)pData->dims.iProjAngles;
+
 	if (pData->bFanBeam) {
-		ok = FanBP_FBPWeighted(pData->D_volumeData, pData->volumePitch, pData->D_sinoData, pData->sinoPitch, pData->dims, pData->fanProjections);
+		ok = FanBP_FBPWeighted(pData->D_volumeData, pData->volumePitch, pData->D_sinoData, pData->sinoPitch, pData->dims, pData->fanProjections, fOutputScale);
 
 	} else {
-		ok = BP(pData->D_volumeData, pData->volumePitch, pData->D_sinoData, pData->sinoPitch, pData->dims, pData->angles, pData->TOffsets);
+		ok = BP(pData->D_volumeData, pData->volumePitch, pData->D_sinoData, pData->sinoPitch, pData->dims, pData->angles, pData->TOffsets, fOutputScale);
 	}
 	if(!ok)
 	{
 		return false;
 	}
 
-	processVol<opMul>(pData->D_volumeData,
-	                      (M_PI / 2.0f) / (float)pData->dims.iProjAngles,
-	                      pData->volumePitch, pData->dims);
-
 	return true;
 }
 
@@ -593,7 +591,7 @@ bool BPalgo::iterate(unsigned int)
 {
 	// TODO: This zeroVolume makes an earlier memcpy of D_volumeData redundant
 	zeroVolumeData(D_volumeData, volumePitch, dims);
-	callBP(D_volumeData, volumePitch, D_sinoData, sinoPitch);
+	callBP(D_volumeData, volumePitch, D_sinoData, sinoPitch, 1.0f);
 	return true;
 }
 
diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu
index 9ead563..f402914 100644
--- a/cuda/2d/cgls.cu
+++ b/cuda/2d/cgls.cu
@@ -135,7 +135,7 @@ bool CGLS::iterate(unsigned int iterations)
 
 		// p = A'*r
 		zeroVolumeData(D_p, pPitch, dims);
-		callBP(D_p, pPitch, D_r, rPitch);
+		callBP(D_p, pPitch, D_r, rPitch, 1.0f);
 		if (useVolumeMask)
 			processVol<opMul>(D_p, D_maskData, pPitch, dims);
 
@@ -166,7 +166,7 @@ bool CGLS::iterate(unsigned int iterations)
 
 		// z = A'*r
 		zeroVolumeData(D_z, zPitch, dims);
-		callBP(D_z, zPitch, D_r, rPitch);
+		callBP(D_z, zPitch, D_r, rPitch, 1.0f);
 		if (useVolumeMask)
 			processVol<opMul>(D_z, D_maskData, zPitch, dims);
 
diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu
index 00127c0..8593b08 100644
--- a/cuda/2d/em.cu
+++ b/cuda/2d/em.cu
@@ -102,7 +102,7 @@ bool EM::precomputeWeights()
 #endif
 	{
 		processSino<opSet>(D_projData, 1.0f, projPitch, dims);
-		callBP(D_pixelWeight, pixelPitch, D_projData, projPitch);
+		callBP(D_pixelWeight, pixelPitch, D_projData, projPitch, 1.0f);
 	}
 	processVol<opInvert>(D_pixelWeight, pixelPitch, dims);
 
@@ -137,7 +137,7 @@ bool EM::iterate(unsigned int iterations)
 
 		// Do BP of projData into tmpData
 		zeroVolumeData(D_tmpData, tmpPitch, dims);
-		callBP(D_tmpData, tmpPitch, D_projData, projPitch);
+		callBP(D_tmpData, tmpPitch, D_projData, projPitch, 1.0f);
 
 		// Multiply volumeData with tmpData divided by pixel weights
 		processVol<opMul2>(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims);
diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu
index 74e8b12..b4321ba 100644
--- a/cuda/2d/fan_bp.cu
+++ b/cuda/2d/fan_bp.cu
@@ -77,7 +77,7 @@ static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int wi
 	return true;
 }
 
-__global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims)
+__global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims, float fOutputScale)
 {
 	const int relX = threadIdx.x;
 	const int relY = threadIdx.y;
@@ -121,11 +121,11 @@ __global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int s
 		fA += 1.0f;
 	}
 
-	volData[Y*volPitch+X] += fVal;
+	volData[Y*volPitch+X] += fVal * fOutputScale;
 }
 
 // supersampling version
-__global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims)
+__global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims, float fOutputScale)
 {
 	const int relX = threadIdx.x;
 	const int relY = threadIdx.y;
@@ -146,6 +146,8 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in
 
 	float* volData = (float*)D_volData;
 
+	fOutputScale /= (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
+
 	float fVal = 0.0f;
 	float fA = startAngle + 0.5f;
 
@@ -180,14 +182,14 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in
 		fA += 1.0f;
 	}
 
-	volData[Y*volPitch+X] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
+	volData[Y*volPitch+X] += fVal * fOutputScale;
 }
 
 
 // BP specifically for SART.
 // It includes (free) weighting with voxel weight.
 // It assumes the proj texture is set up _without_ padding, unlike regular BP.
-__global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDimensions dims)
+__global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDimensions dims, float fOutputScale)
 {
 	const int relX = threadIdx.x;
 	const int relY = threadIdx.y;
@@ -222,12 +224,12 @@ __global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDi
 	const float fT = fNum / fDen;
 	const float fVal = tex2D(gT_FanProjTexture, fT, 0.5f);
 
-	volData[Y*volPitch+X] += fVal;
+	volData[Y*volPitch+X] += fVal * fOutputScale;
 }
 
 // Weighted BP for use in fan beam FBP
 // Each pixel/ray is weighted by 1/L^2 where L is the distance to the source.
-__global__ void devFanBP_FBPWeighted(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims)
+__global__ void devFanBP_FBPWeighted(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims, float fOutputScale)
 {
 	const int relX = threadIdx.x;
 	const int relY = threadIdx.y;
@@ -273,13 +275,14 @@ __global__ void devFanBP_FBPWeighted(float* D_volData, unsigned int volPitch, un
 		fA += 1.0f;
 	}
 
-	volData[Y*volPitch+X] += fVal;
+	volData[Y*volPitch+X] += fVal * fOutputScale;
 }
 
 
 bool FanBP_internal(float* D_volumeData, unsigned int volumePitch,
            float* D_projData, unsigned int projPitch,
-           const SDimensions& dims, const SFanProjection* angles)
+           const SDimensions& dims, const SFanProjection* angles,
+           float fOutputScale)
 {
 	assert(dims.iProjAngles <= g_MaxAngles);
 
@@ -310,9 +313,9 @@ bool FanBP_internal(float* D_volumeData, unsigned int volumePitch,
 
 	for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
 		if (dims.iRaysPerPixelDim > 1)
-			devFanBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims);
+			devFanBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 		else
-			devFanBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims);
+			devFanBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
 	cudaThreadSynchronize();
 
@@ -325,7 +328,8 @@ bool FanBP_internal(float* D_volumeData, unsigned int volumePitch,
 
 bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,
            float* D_projData, unsigned int projPitch,
-           const SDimensions& dims, const SFanProjection* angles)
+           const SDimensions& dims, const SFanProjection* angles,
+           float fOutputScale)
 {
 	assert(dims.iProjAngles <= g_MaxAngles);
 
@@ -355,7 +359,7 @@ bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,
 	cudaStreamCreate(&stream);
 
 	for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
-		devFanBP_FBPWeighted<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims);
+		devFanBP_FBPWeighted<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
 	cudaThreadSynchronize();
 
@@ -370,7 +374,8 @@ bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,
 bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
                 float* D_projData, unsigned int projPitch,
                 unsigned int angle,
-                const SDimensions& dims, const SFanProjection* angles)
+                const SDimensions& dims, const SFanProjection* angles,
+                float fOutputScale)
 {
 	// only one angle
 	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1, cudaAddressModeClamp);
@@ -391,7 +396,7 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
 	dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
 	             (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
 
-	devFanBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, dims);
+	devFanBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, dims, fOutputScale);
 	cudaThreadSynchronize();
 
 	cudaTextForceKernelsCompletion();
@@ -401,7 +406,8 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
 
 bool FanBP(float* D_volumeData, unsigned int volumePitch,
            float* D_projData, unsigned int projPitch,
-           const SDimensions& dims, const SFanProjection* angles)
+           const SDimensions& dims, const SFanProjection* angles,
+           float fOutputScale)
 {
 	for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) {
 		SDimensions subdims = dims;
@@ -413,7 +419,7 @@ bool FanBP(float* D_volumeData, unsigned int volumePitch,
 		bool ret;
 		ret = FanBP_internal(D_volumeData, volumePitch,
 		                  D_projData + iAngle * projPitch, projPitch,
-		                  subdims, angles + iAngle);
+		                  subdims, angles + iAngle, fOutputScale);
 		if (!ret)
 			return false;
 	}
@@ -422,7 +428,8 @@ bool FanBP(float* D_volumeData, unsigned int volumePitch,
 
 bool FanBP_FBPWeighted(float* D_volumeData, unsigned int volumePitch,
            float* D_projData, unsigned int projPitch,
-           const SDimensions& dims, const SFanProjection* angles)
+           const SDimensions& dims, const SFanProjection* angles,
+           float fOutputScale)
 {
 	for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) {
 		SDimensions subdims = dims;
@@ -434,7 +441,7 @@ bool FanBP_FBPWeighted(float* D_volumeData, unsigned int volumePitch,
 		bool ret;
 		ret = FanBP_FBPWeighted_internal(D_volumeData, volumePitch,
 		                  D_projData + iAngle * projPitch, projPitch,
-		                  subdims, angles + iAngle);
+		                  subdims, angles + iAngle, fOutputScale);
 
 		if (!ret)
 			return false;
@@ -498,7 +505,7 @@ int main()
 	copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
 	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
 
-	FanBP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs);
+	FanBP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs, 1.0f);
 
 	copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
 
diff --git a/cuda/2d/fan_bp.h b/cuda/2d/fan_bp.h
index e4e69b0..3ebe1e8 100644
--- a/cuda/2d/fan_bp.h
+++ b/cuda/2d/fan_bp.h
@@ -33,16 +33,19 @@ namespace astraCUDA {
 
 _AstraExport bool FanBP(float* D_volumeData, unsigned int volumePitch,
            float* D_projData, unsigned int projPitch,
-           const SDimensions& dims, const SFanProjection* angles);
+           const SDimensions& dims, const SFanProjection* angles,
+           float fOutputScale);
 
 _AstraExport bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
                 float* D_projData, unsigned int projPitch,
                 unsigned int angle,
-                const SDimensions& dims, const SFanProjection* angles);
+                const SDimensions& dims, const SFanProjection* angles,
+                float fOutputScale);
 
 _AstraExport bool FanBP_FBPWeighted(float* D_volumeData, unsigned int volumePitch,
            float* D_projData, unsigned int projPitch,
-           const SDimensions& dims, const SFanProjection* angles);
+           const SDimensions& dims, const SFanProjection* angles,
+           float fOutputScale);
 
 
 }
diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu
index 635200f..d9f7325 100644
--- a/cuda/2d/par_bp.cu
+++ b/cuda/2d/par_bp.cu
@@ -73,7 +73,7 @@ static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int wi
 	return true;
 }
 
-__global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims)
+__global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims, float fOutputScale)
 {
 	const int relX = threadIdx.x;
 	const int relY = threadIdx.y;
@@ -123,11 +123,11 @@ __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int star
 
 	}
 
-	volData[Y*volPitch+X] += fVal;
+	volData[Y*volPitch+X] += fVal * fOutputScale;
 }
 
 // supersampling version
-__global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims)
+__global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims, float fOutputScale)
 {
 	const int relX = threadIdx.x;
 	const int relY = threadIdx.y;
@@ -152,6 +152,8 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s
 	float fA = startAngle + 0.5f;
 	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f;
 
+	fOutputScale /= (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
+
 	if (offsets) {
 
 		for (int angle = startAngle; angle < endAngle; ++angle)
@@ -196,10 +198,10 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s
 
 	}
 
-	volData[Y*volPitch+X] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
+	volData[Y*volPitch+X] += fVal * fOutputScale;
 }
 
-__global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset, float angle_sin, float angle_cos, const SDimensions dims)
+__global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset, float angle_sin, float angle_cos, const SDimensions dims, float fOutputScale)
 {
 	const int relX = threadIdx.x;
 	const int relY = threadIdx.y;
@@ -218,13 +220,13 @@ __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset
 	const float fT = fT_base + fX * angle_cos - fY * angle_sin + offset;
 	const float fVal = tex2D(gT_projTexture, fT, 0.5f);
 
-	D_volData[Y*volPitch+X] += fVal;
+	D_volData[Y*volPitch+X] += fVal * fOutputScale;
 }
 
 
 bool BP_internal(float* D_volumeData, unsigned int volumePitch,
         float* D_projData, unsigned int projPitch,
-        const SDimensions& dims, const float* angles, const float* TOffsets)
+        const SDimensions& dims, const float* angles, const float* TOffsets, float fOutputScale)
 {
 	// TODO: process angles block by block
 	assert(dims.iProjAngles <= g_MaxAngles);
@@ -261,9 +263,9 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch,
 	for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
 
 		if (dims.iRaysPerPixelDim > 1)
-			devBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims);
+			devBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims, fOutputScale);
 		else
-			devBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims);
+			devBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims, fOutputScale);
 	}
 	cudaThreadSynchronize();
 
@@ -276,7 +278,7 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch,
 
 bool BP(float* D_volumeData, unsigned int volumePitch,
         float* D_projData, unsigned int projPitch,
-        const SDimensions& dims, const float* angles, const float* TOffsets)
+        const SDimensions& dims, const float* angles, const float* TOffsets, float fOutputScale)
 {
 	for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) {
 		SDimensions subdims = dims;
@@ -289,7 +291,8 @@ bool BP(float* D_volumeData, unsigned int volumePitch,
 		ret = BP_internal(D_volumeData, volumePitch,
 		                  D_projData + iAngle * projPitch, projPitch,
 		                  subdims, angles + iAngle,
-		                  TOffsets ? TOffsets + iAngle : 0);
+		                  TOffsets ? TOffsets + iAngle : 0,
+		                  fOutputScale);
 		if (!ret)
 			return false;
 	}
@@ -300,7 +303,7 @@ bool BP(float* D_volumeData, unsigned int volumePitch,
 bool BP_SART(float* D_volumeData, unsigned int volumePitch,
              float* D_projData, unsigned int projPitch,
              unsigned int angle, const SDimensions& dims,
-             const float* angles, const float* TOffsets)
+             const float* angles, const float* TOffsets, float fOutputScale)
 {
 	// Only one angle.
 	// We need to Clamp to the border pixels instead of to zero, because
@@ -318,7 +321,7 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch,
 	dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
 	             (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
 
-	devBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, offset, angle_sin, angle_cos, dims);
+	devBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, offset, angle_sin, angle_cos, dims, fOutputScale);
 	cudaThreadSynchronize();
 
 	cudaTextForceKernelsCompletion();
@@ -369,7 +372,7 @@ int main()
 	for (unsigned int i = 0; i < dims.iProjAngles; ++i)
 		angle[i] = i*(M_PI/dims.iProjAngles);
 
-	BP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0);
+	BP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0, 1.0f);
 
 	delete[] angle;
 
diff --git a/cuda/2d/par_bp.h b/cuda/2d/par_bp.h
index eaeafd8..64bcd34 100644
--- a/cuda/2d/par_bp.h
+++ b/cuda/2d/par_bp.h
@@ -36,12 +36,12 @@ namespace astraCUDA {
 _AstraExport bool BP(float* D_volumeData, unsigned int volumePitch,
         float* D_projData, unsigned int projPitch,
         const SDimensions& dims, const float* angles,
-        const float* TOffsets);
+        const float* TOffsets, float fOutputScale);
 
 _AstraExport bool BP_SART(float* D_volumeData, unsigned int volumePitch,
              float* D_projData, unsigned int projPitch,
              unsigned int angle, const SDimensions& dims,
-             const float* angles, const float* TOffsets);
+             const float* angles, const float* TOffsets, float fOutputScale);
 
 }
 
diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu
index 29670c3..e5cb5bb 100644
--- a/cuda/2d/sart.cu
+++ b/cuda/2d/sart.cu
@@ -200,10 +200,10 @@ bool SART::iterate(unsigned int iterations)
 			// BP, mask, and add back
 			// TODO: Try putting the masking directly in the BP
 			zeroVolumeData(D_tmpData, tmpPitch, dims);
-			callBP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle);
+			callBP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle, 1.0f);
 			processVol<opAddMul>(D_volumeData, D_maskData, D_tmpData, volumePitch, dims);
 		} else {
-			callBP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle);
+			callBP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle, 1.0f);
 		}
 
 		if (useMinConstraint)
@@ -264,16 +264,16 @@ bool SART::callFP_SART(float* D_volumeData, unsigned int volumePitch,
 
 bool SART::callBP_SART(float* D_volumeData, unsigned int volumePitch,
                        float* D_projData, unsigned int projPitch,
-                       unsigned int angle)
+                       unsigned int angle, float outputScale)
 {
 	if (angles) {
 		assert(!fanProjs);
 		return BP_SART(D_volumeData, volumePitch, D_projData, projPitch,
-		               angle, dims, angles, TOffsets);
+		               angle, dims, angles, TOffsets, outputScale);
 	} else {
 		assert(fanProjs);
 		return FanBP_SART(D_volumeData, volumePitch, D_projData, projPitch,
-		                  angle, dims, fanProjs);
+		                  angle, dims, fanProjs, outputScale);
 	}
 
 }
diff --git a/cuda/2d/sart.h b/cuda/2d/sart.h
index 6574a6f..7dcd641 100644
--- a/cuda/2d/sart.h
+++ b/cuda/2d/sart.h
@@ -59,7 +59,7 @@ protected:
 	                 unsigned int angle, float outputScale);
 	bool callBP_SART(float* D_volumeData, unsigned int volumePitch,
 	                 float* D_projData, unsigned int projPitch,
-	                 unsigned int angle);
+	                 unsigned int angle, float outputScale);
 
 
 	// projection angle variables
diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu
index a6194a5..162ee77 100644
--- a/cuda/2d/sirt.cu
+++ b/cuda/2d/sirt.cu
@@ -127,10 +127,10 @@ bool SIRT::precomputeWeights()
 
 	zeroVolumeData(D_pixelWeight, pixelPitch, dims);
 	if (useSinogramMask) {
-		callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch);
+		callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch, 1.0f);
 	} else {
 		processSino<opSet>(D_projData, 1.0f, projPitch, dims);
-		callBP(D_pixelWeight, pixelPitch, D_projData, projPitch);
+		callBP(D_pixelWeight, pixelPitch, D_projData, projPitch, 1.0f);
 	}
 	processVol<opInvert>(D_pixelWeight, pixelPitch, dims);
 
@@ -251,7 +251,7 @@ bool SIRT::iterate(unsigned int iterations)
 
 		zeroVolumeData(D_tmpData, tmpPitch, dims);
 
-		callBP(D_tmpData, tmpPitch, D_projData, projPitch);
+		callBP(D_tmpData, tmpPitch, D_projData, projPitch, 1.0f);
 
 		processVol<opAddMul>(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims);
 
-- 
cgit v1.2.3