1 files changed, 814 insertions, 0 deletions
diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu
new file mode 100644
index 0000000..6bf9037
--- /dev/null
+++ b/cuda/3d/par3d_fp.cu
@@ -0,0 +1,814 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+#include "util3d.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+#include "dims3d.h"
+
+typedef texture<float, 3, cudaReadModeElementType> texture3D;
+
+static texture3D gT_par3DVolumeTexture;
+
+namespace astraCUDA3d {
+
+static const unsigned int g_anglesPerBlock = 4;
+
+// thickness of the slices we're splitting the volume up into
+static const unsigned int g_blockSlices = 64;
+static const unsigned int g_detBlockU = 32;
+static const unsigned int g_detBlockV = 32;
+
+static const unsigned g_MaxAngles = 1024;
+__constant__ float gC_RayX[g_MaxAngles];
+__constant__ float gC_RayY[g_MaxAngles];
+__constant__ float gC_RayZ[g_MaxAngles];
+__constant__ float gC_DetSX[g_MaxAngles];
+__constant__ float gC_DetSY[g_MaxAngles];
+__constant__ float gC_DetSZ[g_MaxAngles];
+__constant__ float gC_DetUX[g_MaxAngles];
+__constant__ float gC_DetUY[g_MaxAngles];
+__constant__ float gC_DetUZ[g_MaxAngles];
+__constant__ float gC_DetVX[g_MaxAngles];
+__constant__ float gC_DetVY[g_MaxAngles];
+__constant__ float gC_DetVZ[g_MaxAngles];
+
+
+static bool bindVolumeDataTexture(const cudaArray* array)
+{
+	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+
+	gT_par3DVolumeTexture.addressMode[0] = cudaAddressModeClamp;
+	gT_par3DVolumeTexture.addressMode[1] = cudaAddressModeClamp;
+	gT_par3DVolumeTexture.addressMode[2] = cudaAddressModeClamp;
+	gT_par3DVolumeTexture.filterMode = cudaFilterModeLinear;
+	gT_par3DVolumeTexture.normalized = false;
+
+	cudaBindTextureToArray(gT_par3DVolumeTexture, array, channelDesc);
+
+	// TODO: error value?
+
+	return true;
+}
+
+
+
+// threadIdx: x = u detector
+//            y = relative angle
+// blockIdx:  x = u/v detector
+//            y = angle block
+
+#define PAR3D_FP_BODY(c0,c1,c2) \
+	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y;                                        \
+	if (angle >= endAngle)                                                                                       \
+		return;                                                                                                  \
+                                                                                                                 \
+	const float fRayX = gC_RayX[angle];                                                                          \
+	const float fRayY = gC_RayY[angle];                                                                          \
+	const float fRayZ = gC_RayZ[angle];                                                                          \
+	const float fDetUX = gC_DetUX[angle];                                                                        \
+	const float fDetUY = gC_DetUY[angle];                                                                        \
+	const float fDetUZ = gC_DetUZ[angle];                                                                        \
+	const float fDetVX = gC_DetVX[angle];                                                                        \
+	const float fDetVY = gC_DetVY[angle];                                                                        \
+	const float fDetVZ = gC_DetVZ[angle];                                                                        \
+	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX;                                        \
+	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY;                                        \
+	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;                                        \
+                                                                                                                 \
+                                                                                                                 \
+                                                                                                                 \
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;    \
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;             \
+	int endDetectorV = startDetectorV + g_detBlockV;                                                             \
+	if (endDetectorV > dims.iProjV)                                                                              \
+		endDetectorV = dims.iProjV;                                                                              \
+                                                                                                                 \
+	int endSlice = startSlice + g_blockSlices;                                                                   \
+	if (endSlice > dims.iVol##c0)                                                                                \
+		endSlice = dims.iVol##c0;                                                                                \
+                                                                                                                 \
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)                                  \
+	{                                                                                                            \
+		/* Trace ray in direction Ray to (detectorU,detectorV) from  */                                          \
+		/* X = startSlice to X = endSlice                            */                                          \
+                                                                                                                 \
+		const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX;                                        \
+		const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY;                                        \
+		const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ;                                        \
+                                                                                                                 \
+		/*        (x)   ( 1)       ( 0)    */                                                                    \
+		/* ray:   (y) = (ay) * x + (by)    */                                                                    \
+		/*        (z)   (az)       (bz)    */                                                                    \
+                                                                                                                 \
+		const float a##c1 = fRay##c1 / fRay##c0;                                                                 \
+		const float a##c2 = fRay##c2 / fRay##c0;                                                                 \
+		const float b##c1 = fDet##c1 - a##c1 * fDet##c0;                                                         \
+		const float b##c2 = fDet##c2 - a##c2 * fDet##c0;                                                         \
+                                                                                                                 \
+		const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale;                               \
+                                                                                                                 \
+		float fVal = 0.0f;                                                                                       \
+                                                                                                                 \
+		float f##c0 = startSlice + 1.5f;                                                                         \
+		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\
+		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\
+                                                                                                                 \
+		for (int s = startSlice; s < endSlice; ++s)                                                              \
+		{                                                                                                        \
+			fVal += tex3D(gT_par3DVolumeTexture, fX, fY, fZ);                                                    \
+			f##c0 += 1.0f;                                                                                       \
+			f##c1 += a##c1;                                                                                      \
+			f##c2 += a##c2;                                                                                      \
+		}                                                                                                        \
+                                                                                                                 \
+		fVal *= fDistCorr;                                                                                       \
+                                                                                                                 \
+		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal;                              \
+	}
+
+
+
+// Supersampling version
+#define PAR3D_FP_SS_BODY(c0,c1,c2) \
+	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y;                                        \
+	if (angle >= endAngle)                                                                                       \
+		return;                                                                                                  \
+                                                                                                                 \
+	const float fRayX = gC_RayX[angle];                                                                          \
+	const float fRayY = gC_RayY[angle];                                                                          \
+	const float fRayZ = gC_RayZ[angle];                                                                          \
+	const float fDetUX = gC_DetUX[angle];                                                                        \
+	const float fDetUY = gC_DetUY[angle];                                                                        \
+	const float fDetUZ = gC_DetUZ[angle];                                                                        \
+	const float fDetVX = gC_DetVX[angle];                                                                        \
+	const float fDetVY = gC_DetVY[angle];                                                                        \
+	const float fDetVZ = gC_DetVZ[angle];                                                                        \
+	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX;                                        \
+	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY;                                        \
+	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;                                        \
+                                                                                                                 \
+                                                                                                                 \
+                                                                                                                 \
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;    \
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;             \
+	int endDetectorV = startDetectorV + g_detBlockV;                                                             \
+	if (endDetectorV > dims.iProjV)                                                                              \
+		endDetectorV = dims.iProjV;                                                                              \
+                                                                                                                 \
+	int endSlice = startSlice + g_blockSlices;                                                                   \
+	if (endSlice > dims.iVol##c0)                                                                                \
+		endSlice = dims.iVol##c0;                                                                                \
+                                                                                                                 \
+	const float fSubStep = 1.0f/dims.iRaysPerDetDim;                                                             \
+                                                                                                                 \
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)                                  \
+	{                                                                                                            \
+                                                                                                                 \
+		float fV = 0.0f;                                                                                         \
+                                                                                                                 \
+		float fdU = detectorU - 0.5f + 0.5f*fSubStep;                                                            \
+		for (int iSubU = 0; iSubU < dims.iRaysPerDetDim; ++iSubU, fdU+=fSubStep) {                               \
+		float fdV = detectorV - 0.5f + 0.5f*fSubStep;                                                            \
+		for (int iSubV = 0; iSubV < dims.iRaysPerDetDim; ++iSubV, fdV+=fSubStep) {                               \
+                                                                                                                 \
+		/* Trace ray in direction Ray to (detectorU,detectorV) from  */                                          \
+		/* X = startSlice to X = endSlice                            */                                          \
+                                                                                                                 \
+		const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX;                                                    \
+		const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY;                                                    \
+		const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ;                                                    \
+                                                                                                                 \
+		/*        (x)   ( 1)       ( 0)    */                                                                    \
+		/* ray:   (y) = (ay) * x + (by)    */                                                                    \
+		/*        (z)   (az)       (bz)    */                                                                    \
+                                                                                                                 \
+		const float a##c1 = fRay##c1 / fRay##c0;                                                                 \
+		const float a##c2 = fRay##c2 / fRay##c0;                                                                 \
+		const float b##c1 = fDet##c1 - a##c1 * fDet##c0;                                                         \
+		const float b##c2 = fDet##c2 - a##c2 * fDet##c0;                                                         \
+                                                                                                                 \
+		const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale;                               \
+                                                                                                                 \
+		float fVal = 0.0f;                                                                                       \
+                                                                                                                 \
+		float f##c0 = startSlice + 1.5f;                                                                         \
+		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\
+		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\
+                                                                                                                 \
+		for (int s = startSlice; s < endSlice; ++s)                                                              \
+		{                                                                                                        \
+			fVal += tex3D(gT_par3DVolumeTexture, fX, fY, fZ);                                                    \
+			f##c0 += 1.0f;                                                                                       \
+			f##c1 += a##c1;                                                                                      \
+			f##c2 += a##c2;                                                                                      \
+		}                                                                                                        \
+                                                                                                                 \
+		fVal *= fDistCorr;                                                                                       \
+		fV += fVal;                                                                                              \
+                                                                                                                 \
+		}                                                                                                        \
+		}                                                                                                        \
+                                                                                                                 \
+		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (dims.iRaysPerDetDim * dims.iRaysPerDetDim);\
+	}
+
+
+
+__global__ void par3D_FP_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_BODY(X,Y,Z)
+}
+
+__global__ void par3D_FP_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_BODY(Y,X,Z)
+}
+
+__global__ void par3D_FP_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_BODY(Z,X,Y)
+}
+
+__global__ void par3D_FP_SS_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SS_BODY(X,Y,Z)
+}
+
+__global__ void par3D_FP_SS_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SS_BODY(Y,X,Z)
+}
+
+__global__ void par3D_FP_SS_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SS_BODY(Z,X,Y)
+}
+
+
+__device__ float dirWeights(float fX, float fN) {
+	if (fX <= 0.5f) // outside image on left
+		return 0.0f;
+	if (fX <= 1.5f) // half outside image on left
+		return (fX - 0.5f) * (fX - 0.5f);
+	if (fX <= fN + 0.5f) { // inside image
+		float t = fX - 0.5f - floorf(fX - 0.5f);
+		return t*t + (1-t)*(1-t);
+	}
+	if (fX <= fN + 1.5f) // half outside image on right
+		return (fN + 1.5f - fX) * (fN + 1.5f - fX);
+	return 0.0f; // outside image on right
+}
+
+#define PAR3D_FP_SUMSQW_BODY(c0,c1,c2) \
+	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y;                                        \
+	if (angle >= endAngle)                                                                                       \
+		return;                                                                                                  \
+                                                                                                                 \
+	const float fRayX = gC_RayX[angle];                                                                          \
+	const float fRayY = gC_RayY[angle];                                                                          \
+	const float fRayZ = gC_RayZ[angle];                                                                          \
+	const float fDetUX = gC_DetUX[angle];                                                                        \
+	const float fDetUY = gC_DetUY[angle];                                                                        \
+	const float fDetUZ = gC_DetUZ[angle];                                                                        \
+	const float fDetVX = gC_DetVX[angle];                                                                        \
+	const float fDetVY = gC_DetVY[angle];                                                                        \
+	const float fDetVZ = gC_DetVZ[angle];                                                                        \
+	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX;                                        \
+	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY;                                        \
+	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;                                        \
+                                                                                                                 \
+                                                                                                                 \
+                                                                                                                 \
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;    \
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;             \
+	int endDetectorV = startDetectorV + g_detBlockV;                                                             \
+	if (endDetectorV > dims.iProjV)                                                                              \
+		endDetectorV = dims.iProjV;                                                                              \
+                                                                                                                 \
+	int endSlice = startSlice + g_blockSlices;                                                                   \
+	if (endSlice > dims.iVol##c0)                                                                                \
+		endSlice = dims.iVol##c0;                                                                                \
+                                                                                                                 \
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)                                  \
+	{                                                                                                            \
+		/* Trace ray in direction Ray to (detectorU,detectorV) from  */                                          \
+		/* X = startSlice to X = endSlice                            */                                          \
+                                                                                                                 \
+		const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX;                                        \
+		const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY;                                        \
+		const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ;                                        \
+                                                                                                                 \
+		/*        (x)   ( 1)       ( 0)    */                                                                    \
+		/* ray:   (y) = (ay) * x + (by)    */                                                                    \
+		/*        (z)   (az)       (bz)    */                                                                    \
+                                                                                                                 \
+		const float a##c1 = fRay##c1 / fRay##c0;                                                                 \
+		const float a##c2 = fRay##c2 / fRay##c0;                                                                 \
+		const float b##c1 = fDet##c1 - a##c1 * fDet##c0;                                                         \
+		const float b##c2 = fDet##c2 - a##c2 * fDet##c0;                                                         \
+                                                                                                                 \
+		const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale;                               \
+                                                                                                                 \
+		float fVal = 0.0f;                                                                                       \
+                                                                                                                 \
+		float f##c0 = startSlice + 1.5f;                                                                         \
+		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\
+		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\
+                                                                                                                 \
+		for (int s = startSlice; s < endSlice; ++s)                                                              \
+		{                                                                                                        \
+			fVal += dirWeights(f##c1, dims.iVol##c1) * dirWeights(f##c2, dims.iVol##c2) * fDistCorr * fDistCorr; \
+			f##c0 += 1.0f;                                                                                       \
+			f##c1 += a##c1;                                                                                      \
+			f##c2 += a##c2;                                                                                      \
+		}                                                                                                        \
+                                                                                                                 \
+		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal;                              \
+	}
+
+// Supersampling version
+// TODO
+
+
+__global__ void par3D_FP_SumSqW_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SUMSQW_BODY(X,Y,Z)
+}
+
+__global__ void par3D_FP_SumSqW_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SUMSQW_BODY(Y,X,Z)
+}
+
+__global__ void par3D_FP_SumSqW_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SUMSQW_BODY(Z,X,Y)
+}
+
+
+
+bool Par3DFP_Array(cudaArray *D_volArray,
+                   cudaPitchedPtr D_projData,
+                   const SDimensions3D& dims, const SPar3DProjection* angles,
+                   float fOutputScale)
+{
+
+	bindVolumeDataTexture(D_volArray);
+
+
+	// transfer angles to constant memory
+	float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+	TRANSFER_TO_CONSTANT(RayX);
+	TRANSFER_TO_CONSTANT(RayY);
+	TRANSFER_TO_CONSTANT(RayZ);
+	TRANSFER_TO_CONSTANT(DetSX);
+	TRANSFER_TO_CONSTANT(DetSY);
+	TRANSFER_TO_CONSTANT(DetSZ);
+	TRANSFER_TO_CONSTANT(DetUX);
+	TRANSFER_TO_CONSTANT(DetUY);
+	TRANSFER_TO_CONSTANT(DetUZ);
+	TRANSFER_TO_CONSTANT(DetVX);
+	TRANSFER_TO_CONSTANT(DetVY);
+	TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+	delete[] tmp;
+
+	std::list<cudaStream_t> streams;
+	dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+	// Run over all angles, grouping them into groups of the same
+	// orientation (roughly horizontal vs. roughly vertical).
+	// Start a stream of grids for each such group.
+
+	unsigned int blockStart = 0;
+	unsigned int blockEnd = 0;
+	int blockDirection = 0;
+
+	// timeval t;
+	// tic(t);
+
+	for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+		int dir;
+		if (a != dims.iProjAngles) {
+			float dX = fabsf(angles[a].fRayX);
+			float dY = fabsf(angles[a].fRayY);
+			float dZ = fabsf(angles[a].fRayZ);
+
+			if (dX >= dY && dX >= dZ)
+				dir = 0;
+			else if (dY >= dX && dY >= dZ)
+				dir = 1;
+			else
+				dir = 2;
+		}
+
+		if (a == dims.iProjAngles || dir != blockDirection) {
+			// block done
+
+			blockEnd = a;
+			if (blockStart != blockEnd) {
+
+				dim3 dimGrid(
+				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+				// TODO: check if we can't immediately
+				//       destroy the stream after use
+				cudaStream_t stream;
+				cudaStreamCreate(&stream);
+				streams.push_back(stream);
+
+				// printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+				if (blockDirection == 0) {
+					for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices)
+						if (dims.iRaysPerDetDim == 1)
+							par3D_FP_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+						else
+							par3D_FP_SS_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+				} else if (blockDirection == 1) {
+					for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices)
+						if (dims.iRaysPerDetDim == 1)
+							par3D_FP_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+						else
+							par3D_FP_SS_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+				} else if (blockDirection == 2) {
+					for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices)
+						if (dims.iRaysPerDetDim == 1)
+							par3D_FP_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+						else
+							par3D_FP_SS_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+				}
+
+			}
+
+			blockDirection = dir;
+			blockStart = a;
+		}
+	}
+
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+		cudaStreamDestroy(*iter);
+
+	streams.clear();
+
+	cudaTextForceKernelsCompletion();
+
+
+	// printf("%f\n", toc(t));
+
+	return true;
+}
+
+bool Par3DFP(cudaPitchedPtr D_volumeData,
+             cudaPitchedPtr D_projData,
+             const SDimensions3D& dims, const SPar3DProjection* angles,
+             float fOutputScale)
+{
+	// transfer volume to array
+	cudaArray* cuArray = allocateVolumeArray(dims);
+	transferVolumeToArray(D_volumeData, cuArray, dims);
+
+	bool ret = Par3DFP_Array(cuArray, D_projData, dims, angles, fOutputScale);
+
+	cudaFreeArray(cuArray);
+
+	return ret;
+}
+
+
+
+bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,
+                    cudaPitchedPtr D_projData,
+                    const SDimensions3D& dims, const SPar3DProjection* angles,
+                    float fOutputScale)
+{
+	// transfer angles to constant memory
+	float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+	TRANSFER_TO_CONSTANT(RayX);
+	TRANSFER_TO_CONSTANT(RayY);
+	TRANSFER_TO_CONSTANT(RayZ);
+	TRANSFER_TO_CONSTANT(DetSX);
+	TRANSFER_TO_CONSTANT(DetSY);
+	TRANSFER_TO_CONSTANT(DetSZ);
+	TRANSFER_TO_CONSTANT(DetUX);
+	TRANSFER_TO_CONSTANT(DetUY);
+	TRANSFER_TO_CONSTANT(DetUZ);
+	TRANSFER_TO_CONSTANT(DetVX);
+	TRANSFER_TO_CONSTANT(DetVY);
+	TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+	delete[] tmp;
+
+	std::list<cudaStream_t> streams;
+	dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+	// Run over all angles, grouping them into groups of the same
+	// orientation (roughly horizontal vs. roughly vertical).
+	// Start a stream of grids for each such group.
+
+	unsigned int blockStart = 0;
+	unsigned int blockEnd = 0;
+	int blockDirection = 0;
+
+	// timeval t;
+	// tic(t);
+
+	for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+		int dir;
+		if (a != dims.iProjAngles) {
+			float dX = fabsf(angles[a].fRayX);
+			float dY = fabsf(angles[a].fRayY);
+			float dZ = fabsf(angles[a].fRayZ);
+
+			if (dX >= dY && dX >= dZ)
+				dir = 0;
+			else if (dY >= dX && dY >= dZ)
+				dir = 1;
+			else
+				dir = 2;
+		}
+
+		if (a == dims.iProjAngles || dir != blockDirection) {
+			// block done
+
+			blockEnd = a;
+			if (blockStart != blockEnd) {
+
+				dim3 dimGrid(
+				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+				// TODO: check if we can't immediately
+				//       destroy the stream after use
+				cudaStream_t stream;
+				cudaStreamCreate(&stream);
+				streams.push_back(stream);
+
+				// printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+				if (blockDirection == 0) {
+					for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices)
+						if (dims.iRaysPerDetDim == 1)
+							par3D_FP_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+						else
+#if 0
+							par3D_FP_SS_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+#else
+							assert(false);
+#endif
+				} else if (blockDirection == 1) {
+					for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices)
+						if (dims.iRaysPerDetDim == 1)
+							par3D_FP_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+						else
+#if 0
+							par3D_FP_SS_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+#else
+							assert(false);
+#endif
+				} else if (blockDirection == 2) {
+					for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices)
+						if (dims.iRaysPerDetDim == 1)
+							par3D_FP_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+						else
+#if 0
+							par3D_FP_SS_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+#else
+							assert(false);
+#endif
+				}
+
+			}
+
+			blockDirection = dir;
+			blockStart = a;
+		}
+	}
+
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+		cudaStreamDestroy(*iter);
+
+	streams.clear();
+
+	cudaTextForceKernelsCompletion();
+
+
+	// printf("%f\n", toc(t));
+
+	return true;
+}
+
+
+
+
+
+
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA3d;
+
+int main()
+{
+	cudaSetDevice(1);
+
+
+	SDimensions3D dims;
+	dims.iVolX = 500;
+	dims.iVolY = 500;
+	dims.iVolZ = 81;
+	dims.iProjAngles = 241;
+	dims.iProjU = 600;
+	dims.iProjV = 100;
+	dims.iRaysPerDet = 1;
+
+	SPar3DProjection base;
+	base.fRayX = 1.0f;
+	base.fRayY = 0.0f;
+	base.fRayZ = 0.1f;
+
+	base.fDetSX = 0.0f;
+	base.fDetSY = -300.0f;
+	base.fDetSZ = -50.0f;
+
+	base.fDetUX = 0.0f;
+	base.fDetUY = 1.0f;
+	base.fDetUZ = 0.0f;
+
+	base.fDetVX = 0.0f;
+	base.fDetVY = 0.0f;
+	base.fDetVZ = 1.0f;
+
+	SPar3DProjection angle[dims.iProjAngles];
+
+	cudaPitchedPtr volData; // pitch, ptr, xsize, ysize
+
+	volData = allocateVolumeData(dims);
+
+	cudaPitchedPtr projData; // pitch, ptr, xsize, ysize
+
+	projData = allocateProjectionData(dims);
+
+	unsigned int ix = 500,iy = 500;
+
+	float* buf = new float[dims.iProjU*dims.iProjV];
+
+	float* slice = new float[dims.iVolX*dims.iVolY];
+	for (int i = 0; i < dims.iVolX*dims.iVolY; ++i)
+		slice[i] = 1.0f;
+
+	for (unsigned int a = 0; a < 241; a += dims.iProjAngles) {
+
+		zeroProjectionData(projData, dims);
+
+		for (int y = 0; y < iy; y += dims.iVolY) {
+			for (int x = 0; x < ix; x += dims.iVolX) { 
+
+				timeval st;
+				tic(st);
+
+				for (int z = 0; z < dims.iVolZ; ++z) {
+//					char sfn[256];
+//					sprintf(sfn, "/home/wpalenst/projects/cone_simulation/phantom_4096/mouse_fem_phantom_%04d.png", 30+z);
+//					float* slice = loadSubImage(sfn, x, y, dims.iVolX, dims.iVolY);
+
+					cudaPitchedPtr ptr;
+					ptr.ptr = slice;
+					ptr.pitch = dims.iVolX*sizeof(float);
+					ptr.xsize = dims.iVolX*sizeof(float);
+					ptr.ysize = dims.iVolY;
+					cudaExtent extentS;
+					extentS.width = dims.iVolX*sizeof(float);
+					extentS.height = dims.iVolY;
+					extentS.depth = 1;
+
+					cudaPos sp = { 0, 0, 0 };
+					cudaPos dp = { 0, 0, z };
+					cudaMemcpy3DParms p;
+					p.srcArray = 0;
+					p.srcPos = sp;
+					p.srcPtr = ptr;
+					p.dstArray = 0;
+					p.dstPos = dp;
+					p.dstPtr = volData;
+					p.extent = extentS;
+					p.kind = cudaMemcpyHostToDevice;
+					cudaError err = cudaMemcpy3D(&p);
+					assert(!err);
+//					delete[] slice;
+				}
+
+				printf("Load: %f\n", toc(st));
+
+#if 0
+
+	cudaPos zp = { 0, 0, 0 };
+
+	cudaPitchedPtr t;
+	t.ptr = new float[1024*1024];
+	t.pitch = 1024*4;
+	t.xsize = 1024*4;
+	t.ysize = 1024;
+
+	cudaMemcpy3DParms p;
+	p.srcArray = 0;
+	p.srcPos = zp;
+	p.srcPtr = volData;
+	p.extent = extentS;
+	p.dstArray = 0;
+	p.dstPtr = t;
+	p.dstPos = zp;
+	p.kind = cudaMemcpyDeviceToHost;
+	cudaError err = cudaMemcpy3D(&p);
+	assert(!err);
+
+	char fn[32];
+	sprintf(fn, "t%d%d.png", x / dims.iVolX, y / dims.iVolY);
+	saveImage(fn, 1024, 1024, (float*)t.ptr);
+	saveImage("s.png", 4096, 4096, slice);
+	delete[] (float*)t.ptr;
+#endif
+
+
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = base.f##name##X * cos(alpha) - base.f##name##Y * sin(alpha); angle[i].f##name##Y = base.f##name##X * sin(alpha) + base.f##name##Y * cos(alpha); angle[i].f##name##Z = base.f##name##Z; } while(0)
+#define SHIFT(name,i,x,y) do { angle[i].f##name##X += x; angle[i].f##name##Y += y; } while(0)
+				for (int i = 0; i < dims.iProjAngles; ++i) {
+					ROTATE0(Ray, i, (a+i)*.8*M_PI/180);
+					ROTATE0(DetS, i, (a+i)*.8*M_PI/180);
+					ROTATE0(DetU, i, (a+i)*.8*M_PI/180);
+					ROTATE0(DetV, i, (a+i)*.8*M_PI/180);
+
+
+//					SHIFT(Src, i, (-x+1536), (-y+1536));
+//					SHIFT(DetS, i, (-x+1536), (-y+1536));
+				}
+#undef ROTATE0
+#undef SHIFT
+				tic(st);
+
+				astraCUDA3d::Par3DFP(volData, projData, dims, angle, 1.0f);
+
+				printf("FP: %f\n", toc(st));
+
+			}
+		}
+		for (unsigned int aa = 0; aa < dims.iProjAngles; ++aa) {
+			for (unsigned int v = 0; v < dims.iProjV; ++v)
+				cudaMemcpy(buf+v*dims.iProjU, ((float*)projData.ptr)+(v*dims.iProjAngles+aa)*(projData.pitch/sizeof(float)), dims.iProjU*sizeof(float), cudaMemcpyDeviceToHost);
+
+			char fname[32];
+			sprintf(fname, "proj%03d.png", a+aa);
+			saveImage(fname, dims.iProjV, dims.iProjU, buf, 0.0f, 1000.0f);
+		}
+	}
+
+	delete[] buf;
+
+}
+#endif