Merge branch 'cufft'

This makes FFT/FDK more robust and faster by better sync handling, and cleans up error logging/handling.
author: Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl> 2021-11-16 14:10:21 +0100
committer: Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl> 2021-11-16 14:10:21 +0100
commit: ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6 (patch)
tree: 13a82cdad2602b8fd8ce5e861c5133a4c791e6ac /cuda
parent: c6e203411abf3dad3e677aaa1186b927086f8ba7 (diff)
parent: 063c97d04a757e3c288dcf156a63bf1e0ffd074e (diff)
download: astra-ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6.tar.gz
astra-ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6.tar.bz2
astra-ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6.tar.xz
astra-ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6.zip
1 files changed, 58 insertions, 66 deletions
diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu
index 2cdb7c3..08acfd4 100644
--- a/cuda/2d/fft.cu
+++ b/cuda/2d/fft.cu
@@ -40,33 +40,18 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 
 using namespace astra;
 
-// TODO: evaluate what we want to do in these situations:
-
-#define CHECK_ERROR(errorMessage) do {                                     \
-  cudaError_t err = cudaThreadSynchronize();                               \
-  if( cudaSuccess != err) {                                                \
-      ASTRA_ERROR("Cuda error %s : %s",                                    \
-              errorMessage,cudaGetErrorString( err));                      \
-      exit(EXIT_FAILURE);                                                  \
-  } } while (0)
-
-#define SAFE_CALL( call) do {                                              \
-  cudaError err = call;                                                    \
-  if( cudaSuccess != err) {                                                \
-      ASTRA_ERROR("Cuda error: %s ",                                       \
-              cudaGetErrorString( err));                                   \
-      exit(EXIT_FAILURE);                                                  \
-  }                                                                        \
-  err = cudaThreadSynchronize();                                           \
-  if( cudaSuccess != err) {                                                \
-      ASTRA_ERROR("Cuda error: %s : ",                                     \
-              cudaGetErrorString( err));                                   \
-      exit(EXIT_FAILURE);                                                  \
-  } } while (0)
-
-
 namespace astraCUDA {
 
+bool checkCufft(cufftResult err, const char *msg)
+{
+	if (err != CUFFT_SUCCESS) {
+		ASTRA_ERROR("%s: CUFFT error %d.", msg, err);
+		return false;
+	} else {
+		return true;
+	}
+}
+
 __global__ static void applyFilter_kernel(int _iProjectionCount,
                                           int _iFreqBinCount,
                                           cufftComplex * _pSinogram,
@@ -115,7 +100,8 @@ static void rescaleInverseFourier(int _iProjectionCount, int _iDetectorCount,
 	rescaleInverseFourier_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount,
 	                                                            _iDetectorCount,
 	                                                            _pfInFourierOutput);
-	CHECK_ERROR("rescaleInverseFourier_kernel failed");
+
+	checkCuda(cudaThreadSynchronize(), "rescaleInverseFourier");
 }
 
 void applyFilter(int _iProjectionCount, int _iFreqBinCount,
@@ -128,7 +114,8 @@ void applyFilter(int _iProjectionCount, int _iFreqBinCount,
 	applyFilter_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount,
 	                                                  _iFreqBinCount,
 	                                                  _pSinogram, _pFilter);
-	CHECK_ERROR("applyFilter_kernel failed");
+
+	checkCuda(cudaThreadSynchronize(), "applyFilter");
 }
 
 static bool invokeCudaFFT(int _iProjectionCount, int _iDetectorCount,
@@ -136,24 +123,22 @@ static bool invokeCudaFFT(int _iProjectionCount, int _iDetectorCount,
                           cufftComplex * _pDevTargetComplex)
 {
 	cufftHandle plan;
-	cufftResult result;
 
-	result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_R2C, _iProjectionCount);
-	if(result != CUFFT_SUCCESS)
-	{
-		ASTRA_ERROR("Failed to plan 1d r2c fft");
+	if (!checkCufft(cufftPlan1d(&plan, _iDetectorCount, CUFFT_R2C, _iProjectionCount), "invokeCudaFFT plan")) {
 		return false;
 	}
 
-	result = cufftExecR2C(plan, (cufftReal *)_pfDevSource, _pDevTargetComplex);
-	cufftDestroy(plan);
+	if (!checkCufft(cufftExecR2C(plan, (cufftReal *)_pfDevSource, _pDevTargetComplex), "invokeCudaFFT exec")) {
+		cufftDestroy(plan);
+		return false;
+	}
 
-	if(result != CUFFT_SUCCESS)
-	{
-		ASTRA_ERROR("Failed to exec 1d r2c fft");
+	if (!checkCuda(cudaDeviceSynchronize(), "invokeCudaFFT sync")) {
+		cufftDestroy(plan);
 		return false;
 	}
 
+	cufftDestroy(plan);
 	return true;
 }
 
@@ -162,26 +147,25 @@ static bool invokeCudaIFFT(int _iProjectionCount, int _iDetectorCount,
                            float * _pfDevTarget)
 {
 	cufftHandle plan;
-	cufftResult result;
 
-	result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_C2R, _iProjectionCount);
-	if(result != CUFFT_SUCCESS)
-	{
-		ASTRA_ERROR("Failed to plan 1d c2r fft");
+	if (!checkCufft(cufftPlan1d(&plan, _iDetectorCount, CUFFT_C2R, _iProjectionCount), "invokeCudaIFFT plan")) {
 		return false;
 	}
 
-	// todo: why do we have to get rid of the const qualifier?
-	result = cufftExecC2R(plan, (cufftComplex *)_pDevSourceComplex,
-	                      (cufftReal *)_pfDevTarget);
-	cufftDestroy(plan);
-
-	if(result != CUFFT_SUCCESS)
+	// Getting rid of the const qualifier is due to cufft API issue?
+	if (!checkCufft(cufftExecC2R(plan, (cufftComplex *)_pDevSourceComplex,
+	                      (cufftReal *)_pfDevTarget), "invokeCudaIFFT exec"))
 	{
-		ASTRA_ERROR("Failed to exec 1d c2r fft");
+		cufftDestroy(plan);
 		return false;
 	}
 
+	if (!checkCuda(cudaDeviceSynchronize(), "invokeCudaIFFT sync")) {
+		cufftDestroy(plan);
+		return false;
+	}
+
+	cufftDestroy(plan);
 	return true;
 }
 
@@ -189,14 +173,12 @@ bool allocateComplexOnDevice(int _iProjectionCount, int _iDetectorCount,
                              cufftComplex ** _ppDevComplex)
 {
 	size_t bufferSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount;
-	SAFE_CALL(cudaMalloc((void **)_ppDevComplex, bufferSize));
-	return true;
+	return checkCuda(cudaMalloc((void **)_ppDevComplex, bufferSize), "fft allocateComplexOnDevice");
 }
 
 bool freeComplexOnDevice(cufftComplex * _pDevComplex)
 {
-	SAFE_CALL(cudaFree(_pDevComplex));
-	return true;
+	return checkCuda(cudaFree(_pDevComplex), "fft freeComplexOnDevice");
 }
 
 bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,
@@ -204,9 +186,7 @@ bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,
                                 cufftComplex * _pDevComplexTarget)
 {
 	size_t memSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount;
-	SAFE_CALL(cudaMemcpy(_pDevComplexTarget, _pHostComplexSource, memSize, cudaMemcpyHostToDevice));
-
-	return true;
+	return checkCuda(cudaMemcpy(_pDevComplexTarget, _pHostComplexSource, memSize, cudaMemcpyHostToDevice), "fft uploadComplexArrayToDevice");
 }
 
 bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
@@ -217,25 +197,30 @@ bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
 	float * pfDevRealFFTSource = NULL;
 	size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount;
 
-	SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTSource, bufferMemSize));
-	SAFE_CALL(cudaMemset(pfDevRealFFTSource, 0, bufferMemSize));
+	if (!checkCuda(cudaMalloc((void **)&pfDevRealFFTSource, bufferMemSize), "runCudaFFT malloc"))
+		return false;
+	if (!checkCuda(cudaMemset(pfDevRealFFTSource, 0, bufferMemSize), "runCudaFFT memset")) {
+		cudaFree(pfDevRealFFTSource);
+		return false;
+	}
 
 	for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
 	{
 		const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch;
 		float * pfTargetLocation = pfDevRealFFTSource + iProjectionIndex * _iFFTRealDetectorCount;
 
-		SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice));
+		if (!checkCuda(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice), "runCudaFFT memcpy")) {
+			cudaFree(pfDevRealFFTSource);
+			return false;
+		}
 	}
 
 	bool bResult = invokeCudaFFT(_iProjectionCount, _iFFTRealDetectorCount,
 	                             pfDevRealFFTSource, _pDevTargetComplex);
 	if(!bResult)
-	{
 		return false;
-	}
 
-	SAFE_CALL(cudaFree(pfDevRealFFTSource));
+	cudaFree(pfDevRealFFTSource);
 
 	return true;
 }
@@ -248,7 +233,8 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
 	float * pfDevRealFFTTarget = NULL;
 	size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount;
 
-	SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTTarget, bufferMemSize));
+	if (!checkCuda(cudaMalloc((void **)&pfDevRealFFTTarget, bufferMemSize), "runCudaIFFT malloc"))
+		return false;
 
 	bool bResult = invokeCudaIFFT(_iProjectionCount, _iFFTRealDetectorCount,
 	                              _pDevSourceComplex, pfDevRealFFTTarget);
@@ -260,17 +246,23 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
 	rescaleInverseFourier(_iProjectionCount, _iFFTRealDetectorCount,
 	                      pfDevRealFFTTarget);
 
-	SAFE_CALL(cudaMemset(_pfRealTarget, 0, sizeof(float) * _iProjectionCount * _iTargetPitch));
+	if (!checkCuda(cudaMemset(_pfRealTarget, 0, sizeof(float) * _iProjectionCount * _iTargetPitch), "runCudaIFFT memset")) {
+		cudaFree(pfDevRealFFTTarget);
+		return false;
+	}
 
 	for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
 	{
 		const float * pfSourceLocation = pfDevRealFFTTarget + iProjectionIndex * _iFFTRealDetectorCount;
 		float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch;
 
-		SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice));
+		if (!checkCuda(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice), "runCudaIFFT memcpy")) {
+			cudaFree(pfDevRealFFTTarget);
+			return false;
+		}
 	}
 
-	SAFE_CALL(cudaFree(pfDevRealFFTTarget));
+	cudaFree(pfDevRealFFTTarget);
 
 	return true;
 }
author	Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl>	2021-11-16 14:10:21 +0100
committer	Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl>	2021-11-16 14:10:21 +0100
commit	ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6 (patch)
tree	13a82cdad2602b8fd8ce5e861c5133a4c791e6ac /cuda
parent	c6e203411abf3dad3e677aaa1186b927086f8ba7 (diff)
parent	063c97d04a757e3c288dcf156a63bf1e0ffd074e (diff)
download	astra-ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6.tar.gz astra-ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6.tar.bz2 astra-ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6.tar.xz astra-ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6.zip