diff options
| -rw-r--r-- | cuda/2d/arith.cu | 14 | ||||
| -rw-r--r-- | cuda/2d/fan_bp.cu | 15 | ||||
| -rw-r--r-- | cuda/2d/fan_fp.cu | 11 | ||||
| -rw-r--r-- | cuda/2d/par_bp.cu | 10 | ||||
| -rw-r--r-- | cuda/2d/par_fp.cu | 19 | ||||
| -rw-r--r-- | cuda/2d/sart.cu | 2 | ||||
| -rw-r--r-- | cuda/2d/util.cu | 69 | ||||
| -rw-r--r-- | cuda/3d/arith3d.cu | 36 | ||||
| -rw-r--r-- | cuda/3d/cone_bp.cu | 4 | ||||
| -rw-r--r-- | cuda/3d/cone_fp.cu | 17 | ||||
| -rw-r--r-- | cuda/3d/fdk.cu | 6 | ||||
| -rw-r--r-- | cuda/3d/par3d_bp.cu | 4 | ||||
| -rw-r--r-- | cuda/3d/par3d_fp.cu | 30 | ||||
| -rw-r--r-- | cuda/3d/util3d.cu | 103 | ||||
| -rw-r--r-- | include/astra/cuda/2d/util.h | 16 | ||||
| -rw-r--r-- | include/astra/cuda/3d/util3d.h | 4 | 
16 files changed, 133 insertions, 227 deletions
diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu index aa0edae..45622d0 100644 --- a/cuda/2d/arith.cu +++ b/cuda/2d/arith.cu @@ -451,7 +451,7 @@ void processData(float* pfOut, unsigned int pitch, unsigned int width, unsigned  	devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -462,7 +462,7 @@ void processData(float* pfOut, float fParam, unsigned int pitch, unsigned int wi  	devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -473,7 +473,7 @@ void processData(float* pfOut1, float* pfOut2, float fParam1, float fParam2, uns  	devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  } @@ -485,7 +485,7 @@ void processData(float* pfOut, const float* pfIn, unsigned int pitch, unsigned i  	devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -496,7 +496,7 @@ void processData(float* pfOut, const float* pfIn, float fParam, unsigned int pit  	devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -507,7 +507,7 @@ void processData(float* pfOut, const float* pfIn1, const float* pfIn2, float fPa  	devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -518,7 +518,7 @@ void processData(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned  	devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  } diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu index fc42456..2068d03 100644 --- a/cuda/2d/fan_bp.cu +++ b/cuda/2d/fan_bp.cu @@ -322,13 +322,12 @@ bool FanBP_internal(float* D_volumeData, unsigned int volumePitch,  		else  			devFanBP<false><<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);  	} -	cudaThreadSynchronize(); -	cudaTextForceKernelsCompletion(); +	ok = checkCuda(cudaStreamSynchronize(stream), "FanBP");  	cudaStreamDestroy(stream); -	return true; +	return ok;  }  bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch, @@ -354,13 +353,12 @@ bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,  	for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {  		devFanBP<true><<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);  	} -	cudaThreadSynchronize(); -	cudaTextForceKernelsCompletion(); +	ok = checkCuda(cudaStreamSynchronize(stream), "FanBP_FBPWeighted");  	cudaStreamDestroy(stream); -	return true; +	return ok;  }  // D_projData is a pointer to one padded sinogram line @@ -382,11 +380,8 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,  	             (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);  	devFanBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, dims, fOutputScale); -	cudaThreadSynchronize(); -	cudaTextForceKernelsCompletion(); - -	return true; +	return checkCuda(cudaThreadSynchronize(), "FanBP_SART");  }  bool FanBP(float* D_volumeData, unsigned int volumePitch, diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu index 9bbb401..342ca4c 100644 --- a/cuda/2d/fan_fp.cu +++ b/cuda/2d/fan_fp.cu @@ -268,16 +268,17 @@ bool FanFP_internal(float* D_volumeData, unsigned int volumePitch,  	for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)  		FanFPvertical<<<dimGrid, dimBlock, 0, stream2>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); -	cudaStreamDestroy(stream1); -	cudaStreamDestroy(stream2); +	bool ok = true; -	cudaThreadSynchronize(); +	ok &= checkCuda(cudaStreamSynchronize(stream1), "fan_fp hor"); +	cudaStreamDestroy(stream1); -	cudaTextForceKernelsCompletion(); +	ok &= checkCuda(cudaStreamSynchronize(stream2), "fan_fp ver"); +	cudaStreamDestroy(stream2);  	cudaFreeArray(D_dataArray); -	return true; +	return ok;  }  bool FanFP(float* D_volumeData, unsigned int volumePitch, diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu index b50b5a5..d7c3ab0 100644 --- a/cuda/2d/par_bp.cu +++ b/cuda/2d/par_bp.cu @@ -231,13 +231,12 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch,  		else  			devBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);  	} -	cudaThreadSynchronize(); -	cudaTextForceKernelsCompletion(); +	bool ok = checkCuda(cudaStreamSynchronize(stream), "par_bp");  	cudaStreamDestroy(stream); -	return true; +	return ok;  }  bool BP(float* D_volumeData, unsigned int volumePitch, @@ -284,11 +283,8 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch,  	             (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);  	devBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, angle_offset, angle_scaled_sin, angle_scaled_cos, dims, fOutputScale); -	cudaThreadSynchronize(); - -	cudaTextForceKernelsCompletion(); -	return true; +	return checkCuda(cudaThreadSynchronize(), "BP_SART");  } diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu index 8c48280..e947428 100644 --- a/cuda/2d/par_fp.cu +++ b/cuda/2d/par_fp.cu @@ -305,8 +305,8 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,  				dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,  				             (dims.iProjDets+g_detBlockSize-1)/g_detBlockSize); // angle blocks, detector blocks -				// TODO: check if we can't immediately -				//       destroy the stream after use +				// TODO: consider limiting number of handle (chaotic) geoms +				//       with many alternating directions  				cudaStream_t stream;  				cudaStreamCreate(&stream);  				streams.push_back(stream); @@ -323,19 +323,16 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,  		}  	} -	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) -		cudaStreamDestroy(*iter); - -	streams.clear(); - -	cudaThreadSynchronize(); +	bool ok = true; -	cudaTextForceKernelsCompletion(); +	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) { +		ok &= checkCuda(cudaStreamSynchronize(*iter), "par_fp"); +		cudaStreamDestroy(*iter); +	}  	cudaFreeArray(D_dataArray); -		 -	return true; +	return ok;  }  bool FP_simple(float* D_volumeData, unsigned int volumePitch, diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu index 29f5b43..89d58c2 100644 --- a/cuda/2d/sart.cu +++ b/cuda/2d/sart.cu @@ -54,7 +54,7 @@ void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int  	devMUL_SART<<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), "MUL_SART");  } diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu index 2a47472..ac360f0 100644 --- a/cuda/2d/util.cu +++ b/cuda/2d/util.cu @@ -40,12 +40,8 @@ bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch,  {  	size_t width = dims.iVolWidth;  	size_t height = dims.iVolHeight; -	// TODO: memory order -	cudaError_t err; -	err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); -	ASTRA_CUDA_ASSERT(err); -	assert(err == cudaSuccess); -	return true; + +	return checkCuda(cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice), "copyVolumeToDevice");  }  bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch, @@ -54,10 +50,8 @@ bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch,  {  	size_t width = dims.iVolWidth;  	size_t height = dims.iVolHeight; -	// TODO: memory order -	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); -	ASTRA_CUDA_ASSERT(err); -	return true; + +	return checkCuda(cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost), "copyVolumeFromDevice");  } @@ -67,10 +61,8 @@ bool copySinogramFromDevice(float* out_data, unsigned int out_pitch,  {  	size_t width = dims.iProjDets;  	size_t height = dims.iProjAngles; -	// TODO: memory order -	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); -	ASTRA_CUDA_ASSERT(err); -	return true; + +	return checkCuda(cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost), "copySinogramFromDevice");  }  bool copySinogramToDevice(const float* in_data, unsigned int in_pitch, @@ -79,20 +71,15 @@ bool copySinogramToDevice(const float* in_data, unsigned int in_pitch,  {  	size_t width = dims.iProjDets;  	size_t height = dims.iProjAngles; -	// TODO: memory order -	cudaError_t err; -	err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); -	ASTRA_CUDA_ASSERT(err); -	return true; + +	return checkCuda(cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice), "copySinogramToDevice");  }  bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsigned int& pitch)  {  	size_t p; -	cudaError_t ret = cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height); -	if (ret != cudaSuccess) { -		reportCudaError(ret); +	if (!checkCuda(cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height), "allocateVolume")) {  		ASTRA_ERROR("Failed to allocate %dx%d GPU buffer", width, height);  		return false;  	} @@ -104,11 +91,9 @@ bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsign  	return true;  } -void zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height) +bool zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height)  { -	cudaError_t err; -	err = cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height); -	ASTRA_CUDA_ASSERT(err); +	return checkCuda(cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height), "zeroVolume");  }  bool allocateVolumeData(float*& D_ptr, unsigned int& pitch, const SDimensions& dims) @@ -121,14 +106,14 @@ bool allocateProjectionData(float*& D_ptr, unsigned int& pitch, const SDimension  	return allocateVolume(D_ptr, dims.iProjDets, dims.iProjAngles, pitch);  } -void zeroVolumeData(float* D_ptr, unsigned int pitch, const SDimensions& dims) +bool zeroVolumeData(float* D_ptr, unsigned int pitch, const SDimensions& dims)  { -	zeroVolume(D_ptr, pitch, dims.iVolWidth, dims.iVolHeight); +	return zeroVolume(D_ptr, pitch, dims.iVolWidth, dims.iVolHeight);  } -void zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dims) +bool zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dims)  { -	zeroVolume(D_ptr, pitch, dims.iProjDets, dims.iProjAngles); +	return zeroVolume(D_ptr, pitch, dims.iProjDets, dims.iProjAngles);  }  void duplicateVolumeData(float* D_dst, float* D_src, unsigned int pitch, const SDimensions& dims) @@ -231,7 +216,7 @@ float dotProduct2D(float* D_data, unsigned int pitch,  	// Step 1: reduce 2D from image to a single vector, taking sum of squares  	reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), "dotProduct2D reduce2D");  	// Step 2: reduce 1D: add up elements in vector  	if (bx * by > 512) @@ -248,31 +233,21 @@ float dotProduct2D(float* D_data, unsigned int pitch,  	float x;  	cudaMemcpy(&x, D_res, 4, cudaMemcpyDeviceToHost); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), "dotProduct2D");  	cudaFree(D_buf);  	return x;  } - -bool cudaTextForceKernelsCompletion() +bool checkCuda(cudaError_t err, const char *msg)  { -	cudaError_t returnedCudaError = cudaThreadSynchronize(); - -	if(returnedCudaError != cudaSuccess) { -		ASTRA_ERROR("Failed to force completion of cuda kernels: %d: %s.", returnedCudaError, cudaGetErrorString(returnedCudaError)); +	if (err != cudaSuccess) { +		ASTRA_ERROR("%s: CUDA error %d: %s.", msg, err, cudaGetErrorString(err));  		return false; +	} else { +		return true;  	} - -	return true;  } -void reportCudaError(cudaError_t err) -{ -	if(err != cudaSuccess) -		ASTRA_ERROR("CUDA error %d: %s.", err, cudaGetErrorString(err)); -} - -  } diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu index fbaa50c..b495f22 100644 --- a/cuda/3d/arith3d.cu +++ b/cuda/3d/arith3d.cu @@ -225,7 +225,7 @@ void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsign  	devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -238,7 +238,7 @@ void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int  	devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -252,7 +252,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, uns  	devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -266,7 +266,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned  	devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -281,7 +281,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2  	devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -296,7 +296,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2  	devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height); -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  } @@ -328,7 +328,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)  		pfOut += step;  	} -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -344,7 +344,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)  		pfOut += step;  	} -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -362,7 +362,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio  		pfIn += step;  	} -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -380,7 +380,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c  		pfIn += step;  	} -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -400,7 +400,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc  		pfIn2 += step;  	} -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -420,7 +420,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc  		pfIn2 += step;  	} -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  } @@ -448,7 +448,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)  		pfOut += step;  	} -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -464,7 +464,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)  		pfOut += step;  	} -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -482,7 +482,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi  		pfIn += step;  	} -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -500,7 +500,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,  		pfIn += step;  	} -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -520,7 +520,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit  		pfIn2 += step;  	} -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  }  template<typename op> @@ -540,7 +540,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit  		pfIn2 += step;  	} -	cudaTextForceKernelsCompletion(); +	checkCuda(cudaThreadSynchronize(), __FUNCTION__);  } diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu index 7c3fc8d..e265304 100644 --- a/cuda/3d/cone_bp.cu +++ b/cuda/3d/cone_bp.cu @@ -357,7 +357,9 @@ bool ConeBP_Array(cudaPitchedPtr D_volumeData,  				dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);  		} -		cudaTextForceKernelsCompletion(); +		// TODO: Consider not synchronizing here, if possible. +		if (!checkCuda(cudaThreadSynchronize(), "cone_bp")) +			return false;  		angles = angles + angleCount;  		// printf("%f\n", toc(t)); diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu index 4937d24..fede53b 100644 --- a/cuda/3d/cone_fp.cu +++ b/cuda/3d/cone_fp.cu @@ -402,8 +402,9 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,  				dim3 dimGrid(  				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),  (blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); -				// TODO: check if we can't immediately -				//       destroy the stream after use + +				// TODO: consider limiting number of handle (chaotic) geoms +				//       with many alternating directions  				cudaStream_t stream;  				cudaStreamCreate(&stream);  				streams.push_back(stream); @@ -446,16 +447,16 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,  		}  	} -	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) -		cudaStreamDestroy(*iter); - -	streams.clear(); +	bool ok = true; -	cudaTextForceKernelsCompletion(); +	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) { +		ok &= checkCuda(cudaStreamSynchronize(*iter), "cone_fp"); +		cudaStreamDestroy(*iter); +	}  	// printf("%f\n", toc(t)); -	return true; +	return ok;  } diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu index 7b36c93..0b8d2ab 100644 --- a/cuda/3d/fdk.cu +++ b/cuda/3d/fdk.cu @@ -176,7 +176,8 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,  	devFDK_preweight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fZShift, fDetUSize, fDetVSize, dims); -	cudaTextForceKernelsCompletion(); +	if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight")) +		return false;  	if (bShortScan && dims.iProjAngles > 1) {  		ASTRA_DEBUG("Doing Parker weighting"); @@ -225,9 +226,10 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,  		devFDK_ParkerWeight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fDetUSize, fCentralFanAngle, dims); +		if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight ParkerWeight")) +			return false;  	} -	cudaTextForceKernelsCompletion();  	return true;  } diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu index d356b9f..1dc75ce 100644 --- a/cuda/3d/par3d_bp.cu +++ b/cuda/3d/par3d_bp.cu @@ -291,7 +291,9 @@ bool Par3DBP_Array(cudaPitchedPtr D_volumeData,  				dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);  		} -		cudaTextForceKernelsCompletion(); +		// TODO: Consider not synchronizing here, if possible. +		if (!checkCuda(cudaThreadSynchronize(), "cone_bp")) +			return false;  		angles = angles + angleCount;  		// printf("%f\n", toc(t)); diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu index 1f58516..cf8336c 100644 --- a/cuda/3d/par3d_fp.cu +++ b/cuda/3d/par3d_fp.cu @@ -501,8 +501,8 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,  				dim3 dimGrid(  				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),  (blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); -				// TODO: check if we can't immediately -				//       destroy the stream after use +				// TODO: consider limiting number of handle (chaotic) geoms +				//       with many alternating directions  				cudaStream_t stream;  				cudaStreamCreate(&stream);  				streams.push_back(stream); @@ -545,17 +545,16 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,  		}  	} -	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) -		cudaStreamDestroy(*iter); - -	streams.clear(); - -	cudaTextForceKernelsCompletion(); +	bool ok = true; +	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) { +		ok &= checkCuda(cudaStreamSynchronize(*iter), "par3d_fp"); +		cudaStreamDestroy(*iter); +	}  	// printf("%f\n", toc(t)); -	return true; +	return ok;  }  bool Par3DFP(cudaPitchedPtr D_volumeData, @@ -726,17 +725,16 @@ bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,  		}  	} -	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) -		cudaStreamDestroy(*iter); - -	streams.clear(); - -	cudaTextForceKernelsCompletion(); +	bool ok = true; +	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) { +		ok = ok &= checkCuda(cudaStreamSynchronize(*iter), "Par3DFP_SumSqW"); +		cudaStreamDestroy(*iter); +	}  	// printf("%f\n", toc(t)); -	return true; +	return ok;  } diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu index 844b880..71b5668 100644 --- a/cuda/3d/util3d.cu +++ b/cuda/3d/util3d.cu @@ -46,12 +46,9 @@ cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims)  	cudaPitchedPtr volData; -	cudaError err = cudaMalloc3D(&volData, extentV); -	if (err != cudaSuccess) { -		astraCUDA::reportCudaError(err); +	if (!checkCuda(cudaMalloc3D(&volData, extentV), "allocateVolumeData 3D")) {  		ASTRA_ERROR("Failed to allocate %dx%dx%d GPU buffer", dims.iVolX, dims.iVolY, dims.iVolZ);  		volData.ptr = 0; -		// TODO: return 0 somehow?  	}  	return volData; @@ -65,12 +62,9 @@ cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims)  	cudaPitchedPtr projData; -	cudaError err = cudaMalloc3D(&projData, extentP); -	if (err != cudaSuccess) { -		astraCUDA::reportCudaError(err); +	if (!checkCuda(cudaMalloc3D(&projData, extentP), "allocateProjectionData 3D")) {  		ASTRA_ERROR("Failed to allocate %dx%dx%d GPU buffer", dims.iProjU, dims.iProjAngles, dims.iProjV);  		projData.ptr = 0; -		// TODO: return 0 somehow?  	}  	return projData; @@ -78,11 +72,11 @@ cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims)  bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims)  {  	char* t = (char*)D_data.ptr; -	cudaError err;  	for (unsigned int z = 0; z < dims.iVolZ; ++z) { -		err = cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY); -		ASTRA_CUDA_ASSERT(err); +		if (!checkCuda(cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY), "zeroVolumeData 3D")) { +			return false; +		}  		t += D_data.pitch * dims.iVolY;  	}  	return true; @@ -90,11 +84,11 @@ bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims)  bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims)  {  	char* t = (char*)D_data.ptr; -	cudaError err;  	for (unsigned int z = 0; z < dims.iProjV; ++z) { -		err = cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles); -		ASTRA_CUDA_ASSERT(err); +		if (!checkCuda(cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles), "zeroProjectionData 3D")) { +			return false; +		}  		t += D_data.pitch * dims.iProjAngles;  	} @@ -128,11 +122,7 @@ bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimens  	p.extent = extentV;  	p.kind = cudaMemcpyHostToDevice; -	cudaError err; -	err = cudaMemcpy3D(&p); -	ASTRA_CUDA_ASSERT(err); - -	return err == cudaSuccess; +	return checkCuda(cudaMemcpy3D(&p), "copyVolumeToDevice 3D");  }  bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) @@ -163,11 +153,7 @@ bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SD  	p.extent = extentV;  	p.kind = cudaMemcpyHostToDevice; -	cudaError err; -	err = cudaMemcpy3D(&p); -	ASTRA_CUDA_ASSERT(err); - -	return err == cudaSuccess; +	return checkCuda(cudaMemcpy3D(&p), "copyProjectionsToDevice 3D");  }  bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) @@ -198,12 +184,9 @@ bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDime  	p.extent = extentV;  	p.kind = cudaMemcpyDeviceToHost; -	cudaError err; -	err = cudaMemcpy3D(&p); -	ASTRA_CUDA_ASSERT(err); - -	return err == cudaSuccess; +	return checkCuda(cudaMemcpy3D(&p), "copyVolumeFromDevice 3D");  } +  bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)  {  	if (!pitch) @@ -232,11 +215,7 @@ bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const  	p.extent = extentV;  	p.kind = cudaMemcpyDeviceToHost; -	cudaError err; -	err = cudaMemcpy3D(&p); -	ASTRA_CUDA_ASSERT(err); - -	return err == cudaSuccess; +	return checkCuda(cudaMemcpy3D(&p), "copyProjectionsFromDevice 3D");  }  bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims) @@ -258,12 +237,9 @@ bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, con  	p.extent = extentV;  	p.kind = cudaMemcpyDeviceToDevice; -	cudaError err; -	err = cudaMemcpy3D(&p); -	ASTRA_CUDA_ASSERT(err); - -	return err == cudaSuccess; +	return checkCuda(cudaMemcpy3D(&p), "duplicateVolumeData 3D");  } +  bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims)  {  	cudaExtent extentV; @@ -283,11 +259,7 @@ bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src,  	p.extent = extentV;  	p.kind = cudaMemcpyDeviceToDevice; -	cudaError err; -	err = cudaMemcpy3D(&p); -	ASTRA_CUDA_ASSERT(err); - -	return err == cudaSuccess; +	return checkCuda(cudaMemcpy3D(&p), "duplicateProjectionData 3D");  } @@ -303,9 +275,8 @@ cudaArray* allocateVolumeArray(const SDimensions3D& dims)  	extentA.width = dims.iVolX;  	extentA.height = dims.iVolY;  	extentA.depth = dims.iVolZ; -	cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA); -	if (err != cudaSuccess) { -		astraCUDA::reportCudaError(err); + +	if (!checkCuda(cudaMalloc3DArray(&cuArray, &channelDesc, extentA), "allocateVolumeArray 3D")) {  		ASTRA_ERROR("Failed to allocate %dx%dx%d GPU array", dims.iVolX, dims.iVolY, dims.iVolZ);  		return 0;  	} @@ -320,10 +291,8 @@ cudaArray* allocateProjectionArray(const SDimensions3D& dims)  	extentA.width = dims.iProjU;  	extentA.height = dims.iProjAngles;  	extentA.depth = dims.iProjV; -	cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA); -	if (err != cudaSuccess) { -		astraCUDA::reportCudaError(err); +	if (!checkCuda(cudaMalloc3DArray(&cuArray, &channelDesc, extentA), "allocateProjectionArray 3D")) {  		ASTRA_ERROR("Failed to allocate %dx%dx%d GPU array", dims.iProjU, dims.iProjAngles, dims.iProjV);  		return 0;  	} @@ -352,12 +321,9 @@ bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const  	p.extent = extentA;  	p.kind = cudaMemcpyDeviceToDevice; -	cudaError err = cudaMemcpy3D(&p); -	ASTRA_CUDA_ASSERT(err); -	// TODO: check errors - -	return true; +	return checkCuda(cudaMemcpy3D(&p), "transferVolumeToArray 3D");  } +  bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims)  {  	cudaExtent extentA; @@ -379,13 +345,9 @@ bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, con  	p.extent = extentA;  	p.kind = cudaMemcpyDeviceToDevice; -	cudaError err = cudaMemcpy3D(&p); -	ASTRA_CUDA_ASSERT(err); - -	// TODO: check errors - -	return true; +	return checkCuda(cudaMemcpy3D(&p), "transferProjectionsToArray 3D");  } +  bool transferHostProjectionsToArray(const float *projData, cudaArray* array, const SDimensions3D& dims)  {  	cudaExtent extentA; @@ -413,12 +375,7 @@ bool transferHostProjectionsToArray(const float *projData, cudaArray* array, con  	p.extent = extentA;  	p.kind = cudaMemcpyHostToDevice; -	cudaError err = cudaMemcpy3D(&p); -	ASTRA_CUDA_ASSERT(err); - -	// TODO: check errors - -	return true; +	return checkCuda(cudaMemcpy3D(&p), "transferHostProjectionsToArray 3D");  } @@ -430,18 +387,6 @@ float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y,  } -bool cudaTextForceKernelsCompletion() -{ -	cudaError_t returnedCudaError = cudaThreadSynchronize(); - -	if(returnedCudaError != cudaSuccess) { -		ASTRA_ERROR("Failed to force completion of cuda kernels: %d: %s.", returnedCudaError, cudaGetErrorString(returnedCudaError)); -		return false; -	} - -	return true; -} -  int calcNextPowerOfTwo(int _iValue)  {  	int iOutput = 1; diff --git a/include/astra/cuda/2d/util.h b/include/astra/cuda/2d/util.h index 49079ef..0fab9b1 100644 --- a/include/astra/cuda/2d/util.h +++ b/include/astra/cuda/2d/util.h @@ -40,9 +40,6 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.  #define M_PI 3.14159265358979323846  #endif -#define ASTRA_CUDA_ASSERT(err) do {  if (err != cudaSuccess) { astraCUDA::reportCudaError(err); assert(err == cudaSuccess); } } while(0) - -  namespace astraCUDA {  bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch, @@ -59,22 +56,17 @@ bool copySinogramToDevice(const float* in_data, unsigned int in_pitch,  		float* outD_data, unsigned int out_pitch);  bool allocateVolume(float*& D_ptr, unsigned int width, unsigned int height, unsigned int& pitch); -void zeroVolume(float* D_data, unsigned int pitch, unsigned int width, unsigned int height); +bool zeroVolume(float* D_data, unsigned int pitch, unsigned int width, unsigned int height);  bool allocateVolumeData(float*& D_ptr, unsigned int& pitch, const SDimensions& dims);  bool allocateProjectionData(float*& D_ptr, unsigned int& pitch, const SDimensions& dims); -void zeroVolumeData(float* D_ptr, unsigned int pitch, const SDimensions& dims); -void zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dims); +bool zeroVolumeData(float* D_ptr, unsigned int pitch, const SDimensions& dims); +bool zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dims);  void duplicateVolumeData(float* D_dst, float* D_src, unsigned int pitch, const SDimensions& dims);  void duplicateProjectionData(float* D_dst, float* D_src, unsigned int pitch, const SDimensions& dims); - - -bool cudaTextForceKernelsCompletion(); -void reportCudaError(cudaError_t err); - - +bool checkCuda(cudaError_t err, const char *msg);  float dotProduct2D(float* D_data, unsigned int pitch,                     unsigned int width, unsigned int height); diff --git a/include/astra/cuda/3d/util3d.h b/include/astra/cuda/3d/util3d.h index 5f805df..e147e42 100644 --- a/include/astra/cuda/3d/util3d.h +++ b/include/astra/cuda/3d/util3d.h @@ -38,6 +38,8 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.  namespace astraCUDA3d { +using astraCUDA::checkCuda; +  cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims);  cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims);  bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims); @@ -58,8 +60,6 @@ bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims);  cudaArray* allocateProjectionArray(const SDimensions3D& dims);  cudaArray* allocateVolumeArray(const SDimensions3D& dims); -bool cudaTextForceKernelsCompletion(); -  float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y, unsigned int z);  int calcNextPowerOfTwo(int _iValue);  | 
