| Field | Value | Date |
|---|---|---|
| author | Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl> | 2021-11-16 13:44:13 +0100 |
| committer | Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl> | 2021-11-16 14:06:30 +0100 |
| commit | 39582115bc93b5435d25e56891815ae7cb1898fd | |
| tree | 6c728e1125961fc04ba6f77bf9af637925825f5b /cuda | |
| parent | b492e3d049e300132d2f22eee7922ff308342a84 | |
| download | astra-39582115bc93b5435d25e56891815ae7cb1898fd (.tar.gz, .tar.bz2, .tar.xz, .zip) | |
Remove cudaTextForceKernelsCompletion
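This commit drops the `cudaTextForceKernelsCompletion()` helper (previously defined in both `cuda/2d/util.cu` and `cuda/3d/util3d.cu`) and instead calls the existing `checkCuda()` wrapper directly after kernel launches, passing the caller's name for error reporting, and it propagates the resulting status instead of returning `true` unconditionally. Forward projectors that fan work out over several CUDA streams now synchronize each stream with `cudaStreamSynchronize()` before destroying it. The sketch below illustrates the pattern only: `checkCuda()`'s body is only partially visible in this diff, so the implementation shown is an assumed reconstruction, and `fprintf` stands in for ASTRA's `ASTRA_ERROR` logging macro. Note also that `cudaThreadSynchronize()` is deprecated in newer CUDA releases in favour of `cudaDeviceSynchronize()`; the commit keeps the call the surrounding code already used.

```cpp
// Minimal sketch of the replacement pattern (not the actual ASTRA sources).
#include <cuda_runtime.h>
#include <cstdio>

// checkCuda() already exists in cuda/2d/util.cu; this body is an assumed
// reconstruction for illustration only.
static bool checkCuda(cudaError_t err, const char *msg)
{
	if (err != cudaSuccess) {
		// ASTRA logs via ASTRA_ERROR; fprintf stands in here.
		std::fprintf(stderr, "CUDA error in %s: %d: %s\n",
		             msg, static_cast<int>(err), cudaGetErrorString(err));
		return false;
	}
	return true;
}

// Old pattern (removed by this commit):
//     someKernel<<<grid, block>>>(...);
//     cudaTextForceKernelsCompletion();   // result usually ignored
//     return true;
//
// New pattern:
//     someKernel<<<grid, block>>>(...);
//     return checkCuda(cudaThreadSynchronize(), "caller name");
```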
Diffstat (limited to 'cuda')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | cuda/2d/arith.cu | 14 |
| -rw-r--r-- | cuda/2d/fan_bp.cu | 15 |
| -rw-r--r-- | cuda/2d/fan_fp.cu | 11 |
| -rw-r--r-- | cuda/2d/par_bp.cu | 10 |
| -rw-r--r-- | cuda/2d/par_fp.cu | 19 |
| -rw-r--r-- | cuda/2d/sart.cu | 2 |
| -rw-r--r-- | cuda/2d/util.cu | 17 |
| -rw-r--r-- | cuda/3d/arith3d.cu | 36 |
| -rw-r--r-- | cuda/3d/cone_bp.cu | 4 |
| -rw-r--r-- | cuda/3d/cone_fp.cu | 17 |
| -rw-r--r-- | cuda/3d/fdk.cu | 6 |
| -rw-r--r-- | cuda/3d/par3d_bp.cu | 4 |
| -rw-r--r-- | cuda/3d/par3d_fp.cu | 30 |
| -rw-r--r-- | cuda/3d/util3d.cu | 12 |
14 files changed, 83 insertions, 114 deletions
diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu
index aa0edae..45622d0 100644
--- a/cuda/2d/arith.cu
+++ b/cuda/2d/arith.cu
@@ -451,7 +451,7 @@ void processData(float* pfOut, unsigned int pitch, unsigned int width, unsigned
 	devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -462,7 +462,7 @@ void processData(float* pfOut, float fParam, unsigned int pitch, unsigned int wi
 	devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -473,7 +473,7 @@ void processData(float* pfOut1, float* pfOut2, float fParam1, float fParam2, uns
 	devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
@@ -485,7 +485,7 @@ void processData(float* pfOut, const float* pfIn, unsigned int pitch, unsigned i
 	devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -496,7 +496,7 @@ void processData(float* pfOut, const float* pfIn, float fParam, unsigned int pit
 	devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -507,7 +507,7 @@ void processData(float* pfOut, const float* pfIn1, const float* pfIn2, float fPa
 	devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -518,7 +518,7 @@ void processData(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned
 	devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu
index fc42456..2068d03 100644
--- a/cuda/2d/fan_bp.cu
+++ b/cuda/2d/fan_bp.cu
@@ -322,13 +322,12 @@ bool FanBP_internal(float* D_volumeData, unsigned int volumePitch,
 		else
 			devFanBP<false><<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
-	cudaThreadSynchronize();
-	cudaTextForceKernelsCompletion();
+	ok = checkCuda(cudaStreamSynchronize(stream), "FanBP");
 	cudaStreamDestroy(stream);
-	return true;
+	return ok;
 }
 bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,
@@ -354,13 +353,12 @@ bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,
 	for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
 		devFanBP<true><<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
-	cudaThreadSynchronize();
-	cudaTextForceKernelsCompletion();
+	ok = checkCuda(cudaStreamSynchronize(stream), "FanBP_FBPWeighted");
 	cudaStreamDestroy(stream);
-	return true;
+	return ok;
 }
 // D_projData is a pointer to one padded sinogram line
@@ -382,11 +380,8 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
 		(dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
 	devFanBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, dims, fOutputScale);
-	cudaThreadSynchronize();
-	cudaTextForceKernelsCompletion();
-
-	return true;
+	return checkCuda(cudaThreadSynchronize(), "FanBP_SART");
 }
 bool FanBP(float* D_volumeData, unsigned int volumePitch,
diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu
index 9bbb401..342ca4c 100644
--- a/cuda/2d/fan_fp.cu
+++ b/cuda/2d/fan_fp.cu
@@ -268,16 +268,17 @@ bool FanFP_internal(float* D_volumeData, unsigned int volumePitch,
 	for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
 		FanFPvertical<<<dimGrid, dimBlock, 0, stream2>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
-	cudaStreamDestroy(stream1);
-	cudaStreamDestroy(stream2);
+	bool ok = true;
-	cudaThreadSynchronize();
+	ok &= checkCuda(cudaStreamSynchronize(stream1), "fan_fp hor");
+	cudaStreamDestroy(stream1);
-	cudaTextForceKernelsCompletion();
+	ok &= checkCuda(cudaStreamSynchronize(stream2), "fan_fp ver");
+	cudaStreamDestroy(stream2);
 	cudaFreeArray(D_dataArray);
-	return true;
+	return ok;
 }
 bool FanFP(float* D_volumeData, unsigned int volumePitch,
diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu
index b50b5a5..d7c3ab0 100644
--- a/cuda/2d/par_bp.cu
+++ b/cuda/2d/par_bp.cu
@@ -231,13 +231,12 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch,
 		else
 			devBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
-	cudaThreadSynchronize();
-	cudaTextForceKernelsCompletion();
+	bool ok = checkCuda(cudaStreamSynchronize(stream), "par_bp");
 	cudaStreamDestroy(stream);
-	return true;
+	return ok;
 }
 bool BP(float* D_volumeData, unsigned int volumePitch,
@@ -284,11 +283,8 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch,
 		(dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
 	devBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, angle_offset, angle_scaled_sin, angle_scaled_cos, dims, fOutputScale);
-	cudaThreadSynchronize();
-
-	cudaTextForceKernelsCompletion();
-	return true;
+	return checkCuda(cudaThreadSynchronize(), "BP_SART");
 }
diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu
index 8c48280..e947428 100644
--- a/cuda/2d/par_fp.cu
+++ b/cuda/2d/par_fp.cu
@@ -305,8 +305,8 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
 		dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
 		             (dims.iProjDets+g_detBlockSize-1)/g_detBlockSize); // angle blocks, detector blocks
-		// TODO: check if we can't immediately
-		// destroy the stream after use
+		// TODO: consider limiting number of handle (chaotic) geoms
+		// with many alternating directions
 		cudaStream_t stream;
 		cudaStreamCreate(&stream);
 		streams.push_back(stream);
@@ -323,19 +323,16 @@
 		}
 	}
-	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
-		cudaStreamDestroy(*iter);
-
-	streams.clear();
-
-	cudaThreadSynchronize();
+	bool ok = true;
-	cudaTextForceKernelsCompletion();
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok &= checkCuda(cudaStreamSynchronize(*iter), "par_fp");
+		cudaStreamDestroy(*iter);
+	}
 	cudaFreeArray(D_dataArray);
-
-	return true;
+	return ok;
 }
 bool FP_simple(float* D_volumeData, unsigned int volumePitch,
diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu
index 29f5b43..89d58c2 100644
--- a/cuda/2d/sart.cu
+++ b/cuda/2d/sart.cu
@@ -54,7 +54,7 @@ void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int
 	devMUL_SART<<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), "MUL_SART");
 }
diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu
index a75e5ab..ac360f0 100644
--- a/cuda/2d/util.cu
+++ b/cuda/2d/util.cu
@@ -216,7 +216,7 @@ float dotProduct2D(float* D_data, unsigned int pitch,
 	// Step 1: reduce 2D from image to a single vector, taking sum of squares
 	reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), "dotProduct2D reduce2D");
 	// Step 2: reduce 1D: add up elements in vector
 	if (bx * by > 512)
@@ -233,26 +233,13 @@ float dotProduct2D(float* D_data, unsigned int pitch,
 	float x;
 	cudaMemcpy(&x, D_res, 4, cudaMemcpyDeviceToHost);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), "dotProduct2D");
 	cudaFree(D_buf);
 	return x;
 }
-
-bool cudaTextForceKernelsCompletion()
-{
-	cudaError_t returnedCudaError = cudaThreadSynchronize();
-
-	if(returnedCudaError != cudaSuccess) {
-		ASTRA_ERROR("Failed to force completion of cuda kernels: %d: %s.", returnedCudaError, cudaGetErrorString(returnedCudaError));
-		return false;
-	}
-
-	return true;
-}
-
 bool checkCuda(cudaError_t err, const char *msg)
 {
 	if (err != cudaSuccess) {
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu
index fbaa50c..b495f22 100644
--- a/cuda/3d/arith3d.cu
+++ b/cuda/3d/arith3d.cu
@@ -225,7 +225,7 @@ void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsign
 	devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -238,7 +238,7 @@ void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int
 	devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -252,7 +252,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, uns
 	devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -266,7 +266,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned
 	devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -281,7 +281,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
 	devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -296,7 +296,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
 	devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
@@ -328,7 +328,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
 		pfOut += step;
 	}
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -344,7 +344,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
 		pfOut += step;
 	}
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -362,7 +362,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio
 		pfIn += step;
 	}
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -380,7 +380,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c
 		pfIn += step;
 	}
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -400,7 +400,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
 		pfIn2 += step;
 	}
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -420,7 +420,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
 		pfIn2 += step;
 	}
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
@@ -448,7 +448,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
 		pfOut += step;
 	}
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -464,7 +464,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
 		pfOut += step;
 	}
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -482,7 +482,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi
 		pfIn += step;
 	}
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -500,7 +500,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,
 		pfIn += step;
 	}
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -520,7 +520,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 		pfIn2 += step;
 	}
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 template<typename op>
@@ -540,7 +540,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 		pfIn2 += step;
 	}
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu
index 7c3fc8d..e265304 100644
--- a/cuda/3d/cone_bp.cu
+++ b/cuda/3d/cone_bp.cu
@@ -357,7 +357,9 @@ bool ConeBP_Array(cudaPitchedPtr D_volumeData,
 				dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
 		}
-		cudaTextForceKernelsCompletion();
+		// TODO: Consider not synchronizing here, if possible.
+		if (!checkCuda(cudaThreadSynchronize(), "cone_bp"))
+			return false;
 		angles = angles + angleCount;
 		// printf("%f\n", toc(t));
diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu
index 4937d24..fede53b 100644
--- a/cuda/3d/cone_fp.cu
+++ b/cuda/3d/cone_fp.cu
@@ -402,8 +402,9 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
 		dim3 dimGrid(
 		             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
 		             (blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
-		// TODO: check if we can't immediately
-		// destroy the stream after use
+
+		// TODO: consider limiting number of handle (chaotic) geoms
+		// with many alternating directions
 		cudaStream_t stream;
 		cudaStreamCreate(&stream);
 		streams.push_back(stream);
@@ -446,16 +447,16 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
 		}
 	}
-	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
-		cudaStreamDestroy(*iter);
-
-	streams.clear();
+	bool ok = true;
-	cudaTextForceKernelsCompletion();
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok &= checkCuda(cudaStreamSynchronize(*iter), "cone_fp");
+		cudaStreamDestroy(*iter);
+	}
 	// printf("%f\n", toc(t));
-	return true;
+	return ok;
 }
diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu
index 7b36c93..0b8d2ab 100644
--- a/cuda/3d/fdk.cu
+++ b/cuda/3d/fdk.cu
@@ -176,7 +176,8 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,
 	devFDK_preweight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fZShift, fDetUSize, fDetVSize, dims);
-	cudaTextForceKernelsCompletion();
+	if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight"))
+		return false;
 	if (bShortScan && dims.iProjAngles > 1) {
 		ASTRA_DEBUG("Doing Parker weighting");
@@ -225,9 +226,10 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,
 		devFDK_ParkerWeight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fDetUSize, fCentralFanAngle, dims);
+		if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight ParkerWeight"))
+			return false;
 	}
-	cudaTextForceKernelsCompletion();
 	return true;
 }
diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu
index d356b9f..1dc75ce 100644
--- a/cuda/3d/par3d_bp.cu
+++ b/cuda/3d/par3d_bp.cu
@@ -291,7 +291,9 @@ bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
 				dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
 		}
-		cudaTextForceKernelsCompletion();
+		// TODO: Consider not synchronizing here, if possible.
+		if (!checkCuda(cudaThreadSynchronize(), "cone_bp"))
+			return false;
 		angles = angles + angleCount;
 		// printf("%f\n", toc(t));
diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu
index 1f58516..cf8336c 100644
--- a/cuda/3d/par3d_fp.cu
+++ b/cuda/3d/par3d_fp.cu
@@ -501,8 +501,8 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
 		dim3 dimGrid(
 		             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
 		             (blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
-		// TODO: check if we can't immediately
-		// destroy the stream after use
+		// TODO: consider limiting number of handle (chaotic) geoms
+		// with many alternating directions
 		cudaStream_t stream;
 		cudaStreamCreate(&stream);
 		streams.push_back(stream);
@@ -545,17 +545,16 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
 		}
 	}
-	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
-		cudaStreamDestroy(*iter);
-
-	streams.clear();
-
-	cudaTextForceKernelsCompletion();
+	bool ok = true;
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok &= checkCuda(cudaStreamSynchronize(*iter), "par3d_fp");
+		cudaStreamDestroy(*iter);
+	}
 	// printf("%f\n", toc(t));
-	return true;
+	return ok;
 }
 bool Par3DFP(cudaPitchedPtr D_volumeData,
@@ -726,17 +725,16 @@ bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,
 		}
 	}
-	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
-		cudaStreamDestroy(*iter);
-
-	streams.clear();
-
-	cudaTextForceKernelsCompletion();
+	bool ok = true;
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok = ok &= checkCuda(cudaStreamSynchronize(*iter), "Par3DFP_SumSqW");
+		cudaStreamDestroy(*iter);
+	}
 	// printf("%f\n", toc(t));
-	return true;
+	return ok;
 }
diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu
index 4f5d134..71b5668 100644
--- a/cuda/3d/util3d.cu
+++ b/cuda/3d/util3d.cu
@@ -387,18 +387,6 @@ float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y,
 }
-bool cudaTextForceKernelsCompletion()
-{
-	cudaError_t returnedCudaError = cudaThreadSynchronize();
-
-	if(returnedCudaError != cudaSuccess) {
-		ASTRA_ERROR("Failed to force completion of cuda kernels: %d: %s.", returnedCudaError, cudaGetErrorString(returnedCudaError));
-		return false;
-	}
-
-	return true;
-}
-
 int calcNextPowerOfTwo(int _iValue)
 {
 	int iOutput = 1;
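For reference, the stream-based forward projectors touched above (fan_fp.cu, par_fp.cu, cone_fp.cu, par3d_fp.cu) all converge on the same cleanup shape: synchronize each stream, fold the result into one status flag, then destroy the stream. A stand-alone helper capturing that shape could look like the sketch below; syncAndDestroyAll() is a hypothetical name for illustration and is not a function in the ASTRA code.

```cpp
#include <list>
#include <cuda_runtime.h>

// Defined in ASTRA's cuda/2d/util.cu (see the diff above); declared here so the
// sketch is self-contained.
bool checkCuda(cudaError_t err, const char *msg);

// Hypothetical helper mirroring the per-stream cleanup pattern used above:
// synchronize each stream, accumulate success, then destroy it.
bool syncAndDestroyAll(std::list<cudaStream_t>& streams, const char *msg)
{
	bool ok = true;
	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
		ok &= checkCuda(cudaStreamSynchronize(*iter), msg);
		cudaStreamDestroy(*iter);
	}
	streams.clear();
	return ok;
}
```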