28 files changed, 338 insertions, 330 deletions
diff --git a/Core/regularisers_GPU/Diffus_4thO_GPU_core.cu b/Core/regularisers_GPU/Diffus_4thO_GPU_core.cu
index fd586ef..a4dbe70 100644
--- a/Core/regularisers_GPU/Diffus_4thO_GPU_core.cu
+++ b/Core/regularisers_GPU/Diffus_4thO_GPU_core.cu
@@ -18,6 +18,7 @@ limitations under the License.
 */ 
 
 #include "Diffus_4thO_GPU_core.h"
+#include "shared.h"
 
 /* CUDA implementation of fourth-order diffusion scheme [1] for piecewise-smooth recovery (2D/3D case)
  * The minimisation is performed using explicit scheme. 
@@ -36,18 +37,6 @@ limitations under the License.
  * [1] Hajiaboli, M.R., 2011. An anisotropic fourth-order diffusion filter for image noise removal. International Journal of Computer Vision, 92(2), pp.177-191.
  */
 
-#define CHECK(call)                                                            \
-{                                                                              \
-    const cudaError_t error = call;                                            \
-    if (error != cudaSuccess)                                                  \
-    {                                                                          \
-        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
-        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
-                cudaGetErrorString(error));                                    \
-        exit(1);                                                               \
-    }                                                                          \
-}
-    
 #define BLKXSIZE 8
 #define BLKYSIZE 8
 #define BLKZSIZE 8
@@ -228,7 +217,7 @@ __global__ void Diffusion_update_step3D_kernel(float *Output, float *Input, floa
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
 /********************* MAIN HOST FUNCTION ******************/
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
-extern "C" void Diffus4th_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int N, int M, int Z)
+extern "C" int Diffus4th_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int N, int M, int Z)
 {
 		int dimTotal, dev = 0;
 		CHECK(cudaSetDevice(dev));
@@ -242,7 +231,7 @@ extern "C" void Diffus4th_GPU_main(float *Input, float *Output, float lambdaPar,
         CHECK(cudaMalloc((void**)&d_W_Lapl,dimTotal*sizeof(float)));
                 
         CHECK(cudaMemcpy(d_input,Input,dimTotal*sizeof(float),cudaMemcpyHostToDevice));
-        CHECK(cudaMemcpy(d_output,Input,dimTotal*sizeof(float),cudaMemcpyHostToDevice));      
+        CHECK(cudaMemcpy(d_output,Input,dimTotal*sizeof(float),cudaMemcpyHostToDevice));
         
 	if (Z == 1) {
 	     /*2D case */
@@ -275,4 +264,5 @@ extern "C" void Diffus4th_GPU_main(float *Input, float *Output, float lambdaPar,
         CHECK(cudaFree(d_input));
         CHECK(cudaFree(d_output));
         CHECK(cudaFree(d_W_Lapl));
+        return 0;
 }
diff --git a/Core/regularisers_GPU/Diffus_4thO_GPU_core.h b/Core/regularisers_GPU/Diffus_4thO_GPU_core.h
index 6314c1f..77d5d79 100644
--- a/Core/regularisers_GPU/Diffus_4thO_GPU_core.h
+++ b/Core/regularisers_GPU/Diffus_4thO_GPU_core.h
@@ -3,6 +3,6 @@
 #include "CCPiDefines.h"
 #include <stdio.h>
 
-extern "C" CCPI_EXPORT void Diffus4th_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int N, int M, int Z);
+extern "C" CCPI_EXPORT int Diffus4th_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int N, int M, int Z);
 
 #endif 
diff --git a/Core/regularisers_GPU/LLT_ROF_GPU_core.cu b/Core/regularisers_GPU/LLT_ROF_GPU_core.cu
index 0228bf0..87871be 100644
--- a/Core/regularisers_GPU/LLT_ROF_GPU_core.cu
+++ b/Core/regularisers_GPU/LLT_ROF_GPU_core.cu
@@ -18,6 +18,7 @@ limitations under the License.
 */ 
 
 #include "LLT_ROF_GPU_core.h"
+#include "shared.h"
 
 /* CUDA implementation of Lysaker, Lundervold and Tai (LLT) model [1] combined with Rudin-Osher-Fatemi [2] TV regularisation penalty.
  * 
@@ -40,18 +41,6 @@ limitations under the License.
 * [2] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms"
 */
 
-#define CHECK(call)                                                            \
-{                                                                              \
-    const cudaError_t error = call;                                            \
-    if (error != cudaSuccess)                                                  \
-    {                                                                          \
-        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
-        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
-                cudaGetErrorString(error));                                    \
-        exit(1);                                                               \
-    }                                                                          \
-}
-    
 #define BLKXSIZE 8
 #define BLKYSIZE 8
 #define BLKZSIZE 8
@@ -403,7 +392,7 @@ __global__ void Update3D_LLT_ROF_kernel(float *U0, float *U, float *D1_LLT, floa
 /************************ HOST FUNCTION ****************************/
 /*******************************************************************/
 
-extern "C" void LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z)
+extern "C" int LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z)
 {
 	    // set up device
 		int dev = 0;
@@ -480,4 +469,5 @@ extern "C" void LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, f
         CHECK(cudaFree(D1_ROF));
         CHECK(cudaFree(D2_ROF));
         CHECK(cudaFree(D3_ROF));
+        return 0;
 }
diff --git a/Core/regularisers_GPU/LLT_ROF_GPU_core.h b/Core/regularisers_GPU/LLT_ROF_GPU_core.h
index 4a19d09..a6bfcc7 100644
--- a/Core/regularisers_GPU/LLT_ROF_GPU_core.h
+++ b/Core/regularisers_GPU/LLT_ROF_GPU_core.h
@@ -3,6 +3,6 @@
 #include "CCPiDefines.h"
 #include <stdio.h>
 
-extern "C" CCPI_EXPORT void LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z);
+extern "C" CCPI_EXPORT int LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z);
 
 #endif 
diff --git a/Core/regularisers_GPU/NonlDiff_GPU_core.cu b/Core/regularisers_GPU/NonlDiff_GPU_core.cu
index 8048830..ff7ce4d 100644
--- a/Core/regularisers_GPU/NonlDiff_GPU_core.cu
+++ b/Core/regularisers_GPU/NonlDiff_GPU_core.cu
@@ -18,6 +18,7 @@ limitations under the License.
 */ 
 
 #include "NonlDiff_GPU_core.h"
+#include "shared.h"
 
 /* CUDA implementation of linear and nonlinear diffusion with the regularisation model [1,2] (2D/3D case)
  * The minimisation is performed using explicit scheme. 
@@ -38,18 +39,7 @@ limitations under the License.
  * [2] Black, M.J., Sapiro, G., Marimont, D.H. and Heeger, D., 1998. Robust anisotropic diffusion. IEEE Transactions on image processing, 7(3), pp.421-432.
  */
 
-#define CHECK(call)                                                            \
-{                                                                              \
-    const cudaError_t error = call;                                            \
-    if (error != cudaSuccess)                                                  \
-    {                                                                          \
-        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
-        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
-                cudaGetErrorString(error));                                    \
-        exit(1);                                                               \
-    }                                                                          \
-}
-    
+
 #define BLKXSIZE 8
 #define BLKYSIZE 8
 #define BLKZSIZE 8
@@ -295,7 +285,7 @@ __global__ void NonLinearDiff3D_kernel(float *Input, float *Output, float lambda
 
 /////////////////////////////////////////////////
 // HOST FUNCTION
-extern "C" void NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z)
+extern "C" int NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z)
 {
 	    // set up device
 		int dev = 0;
@@ -350,5 +340,6 @@ extern "C" void NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar,
         CHECK(cudaMemcpy(Output,d_output,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost));
         CHECK(cudaFree(d_input));
         CHECK(cudaFree(d_output));
-        //cudaDeviceReset(); 
+        //cudaDeviceReset();
+        return 0;
 }
diff --git a/Core/regularisers_GPU/NonlDiff_GPU_core.h b/Core/regularisers_GPU/NonlDiff_GPU_core.h
index afd712b..5fe457e 100644
--- a/Core/regularisers_GPU/NonlDiff_GPU_core.h
+++ b/Core/regularisers_GPU/NonlDiff_GPU_core.h
@@ -3,6 +3,6 @@
 #include "CCPiDefines.h"
 #include <stdio.h>
 
-extern "C" CCPI_EXPORT void NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z);
+extern "C" CCPI_EXPORT int NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z);
 
 #endif 
diff --git a/Core/regularisers_GPU/PatchSelect_GPU_core.cu b/Core/regularisers_GPU/PatchSelect_GPU_core.cu
index f558b0f..d173124 100644
--- a/Core/regularisers_GPU/PatchSelect_GPU_core.cu
+++ b/Core/regularisers_GPU/PatchSelect_GPU_core.cu
@@ -19,7 +19,8 @@
  */
 
 #include "PatchSelect_GPU_core.h"
-
+#include "shared.h"
+
 /* CUDA implementation of non-local weight pre-calculation for non-local priors
  * Weights and associated indices are stored into pre-allocated arrays and passed
  * to the regulariser
@@ -36,32 +37,20 @@
  * 1. AR_i - indeces of i neighbours
  * 2. AR_j - indeces of j neighbours
  * 3. Weights_ij - associated weights
- */
-
-// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
-#define checkCudaErrors(err)           __checkCudaErrors (err, __FILE__, __LINE__)
-
-inline void __checkCudaErrors(cudaError err, const char *file, const int line)
-{
-    if (cudaSuccess != err)
-    {
-        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
-                file, line, (int)err, cudaGetErrorString(err));
-        exit(EXIT_FAILURE);
-    }
-}
-
-#define BLKXSIZE 16
-#define BLKYSIZE 16
-#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
-#define M_PI 3.14159265358979323846
-#define EPS 1.0e-8
+ */
+
+
+#define BLKXSIZE 16
+#define BLKYSIZE 16
+#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
+#define M_PI 3.14159265358979323846
+#define EPS 1.0e-8
 #define CONSTVECSIZE5 121
 #define CONSTVECSIZE7 225
 #define CONSTVECSIZE9 361
 #define CONSTVECSIZE11 529
 #define CONSTVECSIZE13 729
-
+
 __device__ void swap(float *xp, float *yp) 
 {
     float temp = *xp; 
@@ -75,9 +64,9 @@ __device__ void swapUS(unsigned short *xp, unsigned short *yp)
     *yp = temp; 
 }
 
-/********************************************************************************/
-__global__ void IndexSelect2D_5_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
-{          
+/********************************************************************************/
+__global__ void IndexSelect2D_5_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
+{          
 
     long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
     float normsum;
@@ -85,10 +74,10 @@ __global__ void IndexSelect2D_5_kernel(float *Ad, unsigned short *H_i_d, unsigne
     float Weight_Vec[CONSTVECSIZE5];
     unsigned short ind_i[CONSTVECSIZE5];
     unsigned short ind_j[CONSTVECSIZE5];
-
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    
     long index = i*M+j;      
     
     counter = 0;
@@ -139,10 +128,10 @@ __global__ void IndexSelect2D_5_kernel(float *Ad, unsigned short *H_i_d, unsigne
         H_j_d[index2] = ind_j[x];
         Weights_d[index2] = Weight_Vec[x];
     }
-} 
-/********************************************************************************/
-__global__ void IndexSelect2D_7_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
-{          
+} 
+/********************************************************************************/
+__global__ void IndexSelect2D_7_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
+{          
 
     long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
     float normsum;
@@ -150,10 +139,10 @@ __global__ void IndexSelect2D_7_kernel(float *Ad, unsigned short *H_i_d, unsigne
     float Weight_Vec[CONSTVECSIZE7];
     unsigned short ind_i[CONSTVECSIZE7];
     unsigned short ind_j[CONSTVECSIZE7];
-
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    
     long index = i*M+j;      
     
     counter = 0;
@@ -204,9 +193,9 @@ __global__ void IndexSelect2D_7_kernel(float *Ad, unsigned short *H_i_d, unsigne
         H_j_d[index2] = ind_j[x];
         Weights_d[index2] = Weight_Vec[x];
     }
-}
-__global__ void IndexSelect2D_9_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
-{          
+}
+__global__ void IndexSelect2D_9_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
+{          
 
     long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
     float normsum;
@@ -214,10 +203,10 @@ __global__ void IndexSelect2D_9_kernel(float *Ad, unsigned short *H_i_d, unsigne
     float Weight_Vec[CONSTVECSIZE9];
     unsigned short ind_i[CONSTVECSIZE9];
     unsigned short ind_j[CONSTVECSIZE9];
-
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    
     long index = i*M+j;      
     
     counter = 0;
@@ -269,8 +258,8 @@ __global__ void IndexSelect2D_9_kernel(float *Ad, unsigned short *H_i_d, unsigne
         Weights_d[index2] = Weight_Vec[x];
     }                     
 }
-__global__ void IndexSelect2D_11_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
-{          
+__global__ void IndexSelect2D_11_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
+{          
 
     long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
     float normsum;
@@ -278,10 +267,10 @@ __global__ void IndexSelect2D_11_kernel(float *Ad, unsigned short *H_i_d, unsign
     float Weight_Vec[CONSTVECSIZE11];
     unsigned short ind_i[CONSTVECSIZE11];
     unsigned short ind_j[CONSTVECSIZE11];
-
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    
     long index = i*M+j;      
     
     counter = 0;
@@ -333,8 +322,8 @@ __global__ void IndexSelect2D_11_kernel(float *Ad, unsigned short *H_i_d, unsign
         Weights_d[index2] = Weight_Vec[x];
     }
 } 
-__global__ void IndexSelect2D_13_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
-{          
+__global__ void IndexSelect2D_13_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
+{          
 
     long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
     float normsum;
@@ -342,10 +331,10 @@ __global__ void IndexSelect2D_13_kernel(float *Ad, unsigned short *H_i_d, unsign
     float Weight_Vec[CONSTVECSIZE13];
     unsigned short ind_i[CONSTVECSIZE13];
     unsigned short ind_j[CONSTVECSIZE13];
-
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    
     long index = i*M+j;      
     
     counter = 0;
@@ -398,29 +387,29 @@ __global__ void IndexSelect2D_13_kernel(float *Ad, unsigned short *H_i_d, unsign
     }
 } 
 
-   
+   
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
 /********************* MAIN HOST FUNCTION ******************/
-/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
-extern "C" void PatchSelect_GPU_main(float *A, unsigned short *H_i, unsigned short *H_j, float *Weights, int N, int M, int SearchWindow, int SimilarWin, int NumNeighb, float h)
-{
-    int deviceCount = -1; // number of devices
-    cudaGetDeviceCount(&deviceCount);
-    if (deviceCount == 0) {
-        fprintf(stderr, "No CUDA devices found\n");
-        return;
+/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
+extern "C" int PatchSelect_GPU_main(float *A, unsigned short *H_i, unsigned short *H_j, float *Weights, int N, int M, int SearchWindow, int SimilarWin, int NumNeighb, float h)
+{
+    int deviceCount = -1; // number of devices
+    cudaGetDeviceCount(&deviceCount);
+    if (deviceCount == 0) {
+        fprintf(stderr, "No CUDA devices found\n");
+        return -1;
     }  
-      
-    int SearchW_full, SimilW_full, counterG, i, j;
+      
+    int SearchW_full, SimilW_full, counterG, i, j;
     float *Ad, *Weights_d, h2, *Eucl_Vec, *Eucl_Vec_d;    
-    unsigned short *H_i_d, *H_j_d;
+    unsigned short *H_i_d, *H_j_d;
     h2 = h*h;
-    
-    dim3 dimBlock(BLKXSIZE,BLKYSIZE);
-    dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE));    
-       
-    SearchW_full = (2*SearchWindow + 1)*(2*SearchWindow + 1); /* the full searching window  size */
-    SimilW_full = (2*SimilarWin + 1)*(2*SimilarWin + 1);   /* the full similarity window  size */
+    
+    dim3 dimBlock(BLKXSIZE,BLKYSIZE);
+    dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE));    
+       
+    SearchW_full = (2*SearchWindow + 1)*(2*SearchWindow + 1); /* the full searching window  size */
+    SimilW_full = (2*SimilarWin + 1)*(2*SimilarWin + 1);   /* the full similarity window  size */
     
     /* generate a 2D Gaussian kernel for NLM procedure */
     Eucl_Vec = (float*) calloc (SimilW_full,sizeof(float));
@@ -432,16 +421,16 @@ extern "C" void PatchSelect_GPU_main(float *A, unsigned short *H_i, unsigned sho
     }} /*main neighb loop */
     
     
-    /*allocate space on the device*/
-    checkCudaErrors( cudaMalloc((void**)&Ad, N*M*sizeof(float)) );
+    /*allocate space on the device*/
+    checkCudaErrors( cudaMalloc((void**)&Ad, N*M*sizeof(float)) );
     checkCudaErrors( cudaMalloc((void**)&H_i_d, N*M*NumNeighb*sizeof(unsigned short)) );
     checkCudaErrors( cudaMalloc((void**)&H_j_d, N*M*NumNeighb*sizeof(unsigned short)) );
     checkCudaErrors( cudaMalloc((void**)&Weights_d, N*M*NumNeighb*sizeof(float)) );
-    checkCudaErrors( cudaMalloc((void**)&Eucl_Vec_d, SimilW_full*sizeof(float)) );
-
-    /* copy data from the host to the device */
+    checkCudaErrors( cudaMalloc((void**)&Eucl_Vec_d, SimilW_full*sizeof(float)) );
+
+    /* copy data from the host to the device */
     checkCudaErrors( cudaMemcpy(Ad,A,N*M*sizeof(float),cudaMemcpyHostToDevice) );
-    checkCudaErrors( cudaMemcpy(Eucl_Vec_d,Eucl_Vec,SimilW_full*sizeof(float),cudaMemcpyHostToDevice) );    
+    checkCudaErrors( cudaMemcpy(Eucl_Vec_d,Eucl_Vec,SimilW_full*sizeof(float),cudaMemcpyHostToDevice) );    
  
     /********************** Run CUDA kernel here ********************/
     if (SearchWindow == 5)  IndexSelect2D_5_kernel<<<dimGrid,dimBlock>>>(Ad, H_i_d, H_j_d, Weights_d, Eucl_Vec_d, N, M, SearchWindow, SearchW_full, SimilarWin, NumNeighb, h2);
@@ -450,19 +439,20 @@ extern "C" void PatchSelect_GPU_main(float *A, unsigned short *H_i, unsigned sho
     else if (SearchWindow == 11)  IndexSelect2D_11_kernel<<<dimGrid,dimBlock>>>(Ad, H_i_d, H_j_d, Weights_d, Eucl_Vec_d, N, M, SearchWindow, SearchW_full, SimilarWin, NumNeighb, h2);
     else if (SearchWindow == 13)  IndexSelect2D_13_kernel<<<dimGrid,dimBlock>>>(Ad, H_i_d, H_j_d, Weights_d, Eucl_Vec_d, N, M, SearchWindow, SearchW_full, SimilarWin, NumNeighb, h2);
     else {
-    fprintf(stderr, "Select the searching window size from 5, 7, 9, 11 or 13\n");
-        return;}    
-    checkCudaErrors(cudaPeekAtLastError() );        
-    checkCudaErrors(cudaDeviceSynchronize());   
-    /***************************************************************/    
-        
+    fprintf(stderr, "Select the searching window size from 5, 7, 9, 11 or 13\n");
+        return -1;}    
+    checkCudaErrors(cudaPeekAtLastError() );        
+    checkCudaErrors(cudaDeviceSynchronize());   
+    /***************************************************************/    
+        
     checkCudaErrors(cudaMemcpy(H_i, H_i_d, N*M*NumNeighb*sizeof(unsigned short),cudaMemcpyDeviceToHost) );
     checkCudaErrors(cudaMemcpy(H_j, H_j_d, N*M*NumNeighb*sizeof(unsigned short),cudaMemcpyDeviceToHost) );  
     checkCudaErrors(cudaMemcpy(Weights, Weights_d, N*M*NumNeighb*sizeof(float),cudaMemcpyDeviceToHost) );   
-    
+    
     cudaFree(Ad); 
     cudaFree(H_i_d); 
     cudaFree(H_j_d);    
     cudaFree(Weights_d);    
-    cudaFree(Eucl_Vec_d);
+    cudaFree(Eucl_Vec_d);
+    return 0;
 }
diff --git a/Core/regularisers_GPU/PatchSelect_GPU_core.h b/Core/regularisers_GPU/PatchSelect_GPU_core.h
index d20fe9f..8c124d3 100644
--- a/Core/regularisers_GPU/PatchSelect_GPU_core.h
+++ b/Core/regularisers_GPU/PatchSelect_GPU_core.h
@@ -3,6 +3,6 @@
 #include "CCPiDefines.h"
 #include <stdio.h>
 
-extern "C" CCPI_EXPORT void PatchSelect_GPU_main(float *A, unsigned short *H_i, unsigned short *H_j, float *Weights, int N, int M, int SearchWindow, int SimilarWin, int NumNeighb, float h);
+extern "C" CCPI_EXPORT int PatchSelect_GPU_main(float *A, unsigned short *H_i, unsigned short *H_j, float *Weights, int N, int M, int SearchWindow, int SimilarWin, int NumNeighb, float h);
 
 #endif 
diff --git a/Core/regularisers_GPU/TGV_GPU_core.cu b/Core/regularisers_GPU/TGV_GPU_core.cu
index 3081011..73232a9 100644
--- a/Core/regularisers_GPU/TGV_GPU_core.cu
+++ b/Core/regularisers_GPU/TGV_GPU_core.cu
@@ -18,6 +18,7 @@ limitations under the License.
 */ 
 
 #include "TGV_GPU_core.h"
+#include "shared.h"
 
 /* CUDA implementation of Primal-Dual denoising method for 
  * Total Generilized Variation (TGV)-L2 model [1] (2D case only)
@@ -36,19 +37,6 @@ limitations under the License.
  * References:
  * [1] K. Bredies "Total Generalized Variation"
  */
-
-#define CHECK(call)                                                            \
-{                                                                              \
-    const cudaError_t error = call;                                            \
-    if (error != cudaSuccess)                                                  \
-    {                                                                          \
-        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
-        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
-                cudaGetErrorString(error));                                    \
-        exit(1);                                                               \
-    }                                                                          \
-}
-    
     
 #define BLKXSIZE2D 16
 #define BLKYSIZE2D 16
@@ -239,7 +227,7 @@ __global__ void newU_kernel(float *U, float *U_old, int N, int M, int num_total)
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
 /********************* MAIN HOST FUNCTION ******************/
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
-extern "C" void TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY)
+extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY)
 {
 		int dimTotal, dev = 0;
 		CHECK(cudaSetDevice(dev));
@@ -320,4 +308,5 @@ extern "C" void TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, fl
         CHECK(cudaFree(V2));
         CHECK(cudaFree(V1_old));
         CHECK(cudaFree(V2_old));
+        return 0;
 }
diff --git a/Core/regularisers_GPU/TGV_GPU_core.h b/Core/regularisers_GPU/TGV_GPU_core.h
index 663378f..5a4eb76 100644
--- a/Core/regularisers_GPU/TGV_GPU_core.h
+++ b/Core/regularisers_GPU/TGV_GPU_core.h
@@ -3,6 +3,6 @@
 #include "CCPiDefines.h"
 #include <stdio.h>
 
-extern "C" CCPI_EXPORT void TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY);
+extern "C" CCPI_EXPORT int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY);
 
 #endif 
diff --git a/Core/regularisers_GPU/TV_FGP_GPU_core.cu b/Core/regularisers_GPU/TV_FGP_GPU_core.cu
index eab7a44..b371c5d 100755
--- a/Core/regularisers_GPU/TV_FGP_GPU_core.cu
+++ b/Core/regularisers_GPU/TV_FGP_GPU_core.cu
@@ -18,6 +18,7 @@ limitations under the License.
 */ 
 
 #include "TV_FGP_GPU_core.h"
+#include "shared.h"
 #include <thrust/device_vector.h>
 #include <thrust/transform_reduce.h>
 
@@ -39,18 +40,6 @@ limitations under the License.
  * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
  */
 
-// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
-#define checkCudaErrors(err)           __checkCudaErrors (err, __FILE__, __LINE__)
-
-inline void __checkCudaErrors(cudaError err, const char *file, const int line)
-{
-    if (cudaSuccess != err)
-    {
-        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
-                file, line, (int)err, cudaGetErrorString(err));
-        exit(EXIT_FAILURE);
-    }
-}
 
 #define BLKXSIZE2D 16
 #define BLKYSIZE2D 16
@@ -354,13 +343,13 @@ __global__ void FGPResidCalc3D_kernel(float *Input1, float *Input2, float* Outpu
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
 
 ////////////MAIN HOST FUNCTION ///////////////
-extern "C" void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
 {
     int deviceCount = -1; // number of devices
     cudaGetDeviceCount(&deviceCount);
     if (deviceCount == 0) {
         fprintf(stderr, "No CUDA devices found\n");
-        return;
+        return -1;
     }
     
     int count = 0, i;
@@ -570,5 +559,6 @@ extern "C" void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, in
             cudaFree(R2);        
             cudaFree(R3);        
     } 
-    //cudaDeviceReset(); 
+    //cudaDeviceReset();
+    return 0;
 }
diff --git a/Core/regularisers_GPU/TV_FGP_GPU_core.h b/Core/regularisers_GPU/TV_FGP_GPU_core.h
index 107d243..b28cdf3 100755
--- a/Core/regularisers_GPU/TV_FGP_GPU_core.h
+++ b/Core/regularisers_GPU/TV_FGP_GPU_core.h
@@ -5,6 +5,6 @@
 #ifndef _TV_FGP_GPU_
 #define _TV_FGP_GPU_
 
-extern "C" void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); 
+extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
 
 #endif 
diff --git a/Core/regularisers_GPU/TV_ROF_GPU_core.cu b/Core/regularisers_GPU/TV_ROF_GPU_core.cu
index 57de63d..76f5be9 100755
--- a/Core/regularisers_GPU/TV_ROF_GPU_core.cu
+++ b/Core/regularisers_GPU/TV_ROF_GPU_core.cu
@@ -35,18 +35,7 @@ limitations under the License.
 *
 * D. Kazantsev, 2016-18
 */
-
-#define CHECK(call)                                                            \
-{                                                                              \
-    const cudaError_t error = call;                                            \
-    if (error != cudaSuccess)                                                  \
-    {                                                                          \
-        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
-        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
-                cudaGetErrorString(error));                                    \
-        exit(1);                                                               \
-    }                                                                          \
-}
+#include "shared.h"
     
 #define BLKXSIZE 8
 #define BLKYSIZE 8
@@ -304,7 +293,7 @@ __host__ __device__ int sign (float x)
 
 /////////////////////////////////////////////////
 // HOST FUNCTION
-extern "C" void TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z)
+extern "C" int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z)
 {
 	    // set up device
 		int dev = 0;
@@ -364,5 +353,6 @@ extern "C" void TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, in
         CHECK(cudaFree(d_update));
         CHECK(cudaFree(d_D1));
         CHECK(cudaFree(d_D2));        
-        //cudaDeviceReset(); 
+        //cudaDeviceReset();
+        return 0;
 }
diff --git a/Core/regularisers_GPU/TV_ROF_GPU_core.h b/Core/regularisers_GPU/TV_ROF_GPU_core.h
index d772aba..3a09296 100755
--- a/Core/regularisers_GPU/TV_ROF_GPU_core.h
+++ b/Core/regularisers_GPU/TV_ROF_GPU_core.h
@@ -3,6 +3,6 @@
 #include "CCPiDefines.h"
 #include <stdio.h>
 
-extern "C" CCPI_EXPORT void TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
+extern "C" CCPI_EXPORT int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
 
 #endif 
diff --git a/Core/regularisers_GPU/TV_SB_GPU_core.cu b/Core/regularisers_GPU/TV_SB_GPU_core.cu
index 68b9221..1f494ee 100755
--- a/Core/regularisers_GPU/TV_SB_GPU_core.cu
+++ b/Core/regularisers_GPU/TV_SB_GPU_core.cu
@@ -18,6 +18,7 @@ limitations under the License.
 */ 
 
 #include "TV_SB_GPU_core.h"
+#include "shared.h"
 #include <thrust/device_vector.h>
 #include <thrust/transform_reduce.h>
 
@@ -39,17 +40,6 @@ limitations under the License.
 */
 
 // This will output the proper CUDA error strings in the event that a CUDA host call returns an error
-#define checkCudaErrors(err)           __checkCudaErrors (err, __FILE__, __LINE__)
-
-inline void __checkCudaErrors(cudaError err, const char *file, const int line)
-{
-    if (cudaSuccess != err)
-    {
-        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
-                file, line, (int)err, cudaGetErrorString(err));
-        exit(EXIT_FAILURE);
-    }
-}
 
 #define BLKXSIZE2D 16
 #define BLKYSIZE2D 16
@@ -363,13 +353,13 @@ __global__ void SBResidCalc3D_kernel(float *Input1, float *Input2, float* Output
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
 /********************* MAIN HOST FUNCTION ******************/
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
-extern "C" void TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ)
+extern "C" int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ)
 {
     int deviceCount = -1; // number of devices
     cudaGetDeviceCount(&deviceCount);
     if (deviceCount == 0) {
         fprintf(stderr, "No CUDA devices found\n");
-        return;
+        return -1;
     }
     
 	int ll, DimTotal;
@@ -557,5 +547,6 @@ extern "C" void TV_SB_GPU_main(float *Input, float *Output, float mu, int iter,
             cudaFree(By);
             cudaFree(Bz);
     } 
-    //cudaDeviceReset(); 
+    //cudaDeviceReset();
+    return 0;
 }
diff --git a/Core/regularisers_GPU/TV_SB_GPU_core.h b/Core/regularisers_GPU/TV_SB_GPU_core.h
index bdc9219..d44ab77 100755
--- a/Core/regularisers_GPU/TV_SB_GPU_core.h
+++ b/Core/regularisers_GPU/TV_SB_GPU_core.h
@@ -5,6 +5,6 @@
 #ifndef _SB_TV_GPU_
 #define _SB_TV_GPU_
 
-extern "C" void TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ);
+extern "C" int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ);
 
 #endif 
diff --git a/Core/regularisers_GPU/dTV_FGP_GPU_core.cu b/Core/regularisers_GPU/dTV_FGP_GPU_core.cu
index 80a78da..7503ec7 100644
--- a/Core/regularisers_GPU/dTV_FGP_GPU_core.cu
+++ b/Core/regularisers_GPU/dTV_FGP_GPU_core.cu
@@ -16,7 +16,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */ 
-
+#include "shared.h"
 #include "dTV_FGP_GPU_core.h"
 #include <thrust/device_vector.h>
 #include <thrust/transform_reduce.h>
@@ -45,19 +45,6 @@ limitations under the License.
  */
  
 
-// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
-#define checkCudaErrors(err)           __checkCudaErrors (err, __FILE__, __LINE__)
-
-inline void __checkCudaErrors(cudaError err, const char *file, const int line)
-{
-    if (cudaSuccess != err)
-    {
-        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
-                file, line, (int)err, cudaGetErrorString(err));
-        exit(EXIT_FAILURE);
-    }
-}
-
 #define BLKXSIZE2D 16
 #define BLKYSIZE2D 16
 
@@ -468,13 +455,13 @@ __global__ void dTVnonneg3D_kernel(float* Output, int N, int M, int Z, int num_t
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
 
 ////////////MAIN HOST FUNCTION ///////////////
-extern "C" void dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iter, float epsil, float eta, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iter, float epsil, float eta, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
 {
     int deviceCount = -1; // number of devices
     cudaGetDeviceCount(&deviceCount);
     if (deviceCount == 0) {
         fprintf(stderr, "No CUDA devices found\n");
-        return;
+        return -1;
     }
     
     int count = 0, i;
@@ -748,6 +735,7 @@ extern "C" void dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, f
             cudaFree(InputRef_y);
             cudaFree(InputRef_z);
             cudaFree(d_InputRef);
-    } 
-    //cudaDeviceReset(); 
+    }
+    //cudaDeviceReset();
+    return 0;
 }
diff --git a/Core/regularisers_GPU/dTV_FGP_GPU_core.h b/Core/regularisers_GPU/dTV_FGP_GPU_core.h
index b906636..9020b1a 100644
--- a/Core/regularisers_GPU/dTV_FGP_GPU_core.h
+++ b/Core/regularisers_GPU/dTV_FGP_GPU_core.h
@@ -5,6 +5,6 @@
 #ifndef _dTV_FGP_GPU_
 #define _dTV_FGP_GPU_
 
-extern "C" void dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iter, float epsil, float eta, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
+extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iter, float epsil, float eta, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
 
 #endif 
diff --git a/Core/regularisers_GPU/shared.h b/Core/regularisers_GPU/shared.h
new file mode 100644
index 0000000..fe98cd6
--- /dev/null
+++ b/Core/regularisers_GPU/shared.h
@@ -0,0 +1,42 @@
+/*shared macros*/
+
+
+/*checks CUDA call, should be used in functions returning <int> value
+if error happens, writes to standard error and explicitly returns -1*/
+#define CHECK(call)                                                            \
+{                                                                              \
+    const cudaError_t error = call;                                            \
+    if (error != cudaSuccess)                                                  \
+    {                                                                          \
+        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
+        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
+                cudaGetErrorString(error));                                    \
+        return -1;                                                             \
+    }                                                                          \
+}
+
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#define checkCudaErrors(call)                                                            \
+{                                                                              \
+    const cudaError_t error = call;                                            \
+    if (error != cudaSuccess)                                                  \
+    {                                                                          \
+        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
+        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
+                cudaGetErrorString(error));                                    \
+        return -1;                                                                \
+    }                                                                          \
+}
+/*#define checkCudaErrors(err)           __checkCudaErrors (err, __FILE__, __LINE__)
+
+inline void __checkCudaErrors(cudaError err, const char *file, const int line)
+{
+    if (cudaSuccess != err)
+    {
+        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
+                file, line, (int)err, cudaGetErrorString(err));
+        return;
+    }
+}
+*/
+
diff --git a/Readme.md b/Readme.md
index 089c9fe..cdf823d 100644
--- a/Readme.md
+++ b/Readme.md
@@ -69,7 +69,6 @@ Here an example of build on Linux (see also `run.sh` for additional info):
 
 ```bash
 git clone https://github.com/vais-ral/CCPi-Regularisation-Toolkit.git
-mkdir build
 cd build
 cmake .. -DCONDA_BUILD=OFF -DBUILD_MATLAB_WRAPPER=ON -DBUILD_PYTHON_WRAPPER=ON -DBUILD_CUDA=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./install
 make install
@@ -88,7 +87,7 @@ conda install ccpi-regulariser -c ccpi -c conda-forge
 
 #### Python (conda-build)
 ```
-	export CIL_VERSION=0.10.2
+	export CIL_VERSION=0.10.3
 	conda build Wrappers/Python/conda-recipe --numpy 1.12 --python 3.5 
 	conda install ccpi-regulariser=${CIL_VERSION} --use-local --force
 	cd demos/
@@ -124,7 +123,7 @@ On Windows the `dll` and the mex modules must reside in the same directory. It i
 addpath(/path/to/library);
 ```
 
-#### Legacy Matlab installation
+#### Legacy Matlab installation (partly supported, please use Cmake)
 ```
 	
 	cd /Wrappers/Matlab/mex_compile
diff --git a/Wrappers/Matlab/mex_compile/compileGPU_mex.m b/Wrappers/Matlab/mex_compile/compileGPU_mex.m
index e0311ea..dd1475c 100644
--- a/Wrappers/Matlab/mex_compile/compileGPU_mex.m
+++ b/Wrappers/Matlab/mex_compile/compileGPU_mex.m
@@ -7,11 +7,10 @@
 % In the code bellow we provide a full explicit path to nvcc compiler 
 % ! paths to matlab and CUDA sdk can be different, modify accordingly !
 
-% Tested on Ubuntu 16.04/MATLAB 2016b/cuda7.5/gcc4.9
-
-% Installation HAS NOT been tested on Windows, please contact me if you'll be able to
-% install software on Windows and I gratefully include it into the master release. 
+% Tested on Ubuntu 18.04/MATLAB 2016b/cuda10.0/gcc7.3
 
+% Installation HAS NOT been tested on Windows, please you Cmake build or
+% modify the code bellow accordingly
 fsep = '/';
 
 pathcopyFrom = sprintf(['..' fsep '..' fsep '..' fsep 'Core' fsep 'regularisers_GPU'], 1i);
@@ -28,44 +27,45 @@ fprintf('%s \n', '<<<<<<<<<<<Compiling GPU regularisers (CUDA)>>>>>>>>>>>>>');
 
 fprintf('%s \n', 'Compiling ROF-TV...');
 !/usr/local/cuda/bin/nvcc -O0 -c TV_ROF_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/
-mex -g -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart -lcufft -lmwgpu ROF_TV_GPU.cpp TV_ROF_GPU_core.o
+mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu ROF_TV_GPU.cpp TV_ROF_GPU_core.o
 movefile('ROF_TV_GPU.mex*',Pathmove);
 
 fprintf('%s \n', 'Compiling FGP-TV...');
 !/usr/local/cuda/bin/nvcc -O0 -c TV_FGP_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/
-mex -g -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart -lcufft -lmwgpu FGP_TV_GPU.cpp TV_FGP_GPU_core.o
+mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu FGP_TV_GPU.cpp TV_FGP_GPU_core.o
 movefile('FGP_TV_GPU.mex*',Pathmove);
 
 fprintf('%s \n', 'Compiling SB-TV...');
 !/usr/local/cuda/bin/nvcc -O0 -c TV_SB_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/
-mex -g -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart -lcufft -lmwgpu SB_TV_GPU.cpp TV_SB_GPU_core.o
+mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu SB_TV_GPU.cpp TV_SB_GPU_core.o
 movefile('SB_TV_GPU.mex*',Pathmove);
 
 fprintf('%s \n', 'Compiling TGV...');
 !/usr/local/cuda/bin/nvcc -O0 -c TGV_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/
-mex -g -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart -lcufft -lmwgpu TGV_GPU.cpp TGV_GPU_core.o
+mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu TGV_GPU.cpp TGV_GPU_core.o
 movefile('TGV_GPU.mex*',Pathmove);
 
 fprintf('%s \n', 'Compiling dFGP-TV...');
 !/usr/local/cuda/bin/nvcc -O0 -c dTV_FGP_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/
-mex -g -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart -lcufft -lmwgpu FGP_dTV_GPU.cpp dTV_FGP_GPU_core.o
+mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu FGP_dTV_GPU.cpp dTV_FGP_GPU_core.o
 movefile('FGP_dTV_GPU.mex*',Pathmove);
 
 fprintf('%s \n', 'Compiling NonLinear Diffusion...');
 !/usr/local/cuda/bin/nvcc -O0 -c NonlDiff_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/
-mex -g -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart -lcufft -lmwgpu NonlDiff_GPU.cpp NonlDiff_GPU_core.o
+mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu NonlDiff_GPU.cpp NonlDiff_GPU_core.o
 movefile('NonlDiff_GPU.mex*',Pathmove);
 
 fprintf('%s \n', 'Compiling Anisotropic diffusion of higher order...');
 !/usr/local/cuda/bin/nvcc -O0 -c Diffus_4thO_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/
-mex -g -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart -lcufft -lmwgpu Diffusion_4thO_GPU.cpp Diffus_4thO_GPU_core.o
+mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu Diffusion_4thO_GPU.cpp Diffus_4thO_GPU_core.o
 movefile('Diffusion_4thO_GPU.mex*',Pathmove);
 
 fprintf('%s \n', 'Compiling ROF-LLT...');
 !/usr/local/cuda/bin/nvcc -O0 -c LLT_ROF_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/
-mex -g -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart -lcufft -lmwgpu LLT_ROF_GPU.cpp LLT_ROF_GPU_core.o
+mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu LLT_ROF_GPU.cpp LLT_ROF_GPU_core.o
 movefile('LLT_ROF_GPU.mex*',Pathmove);
 
+
 delete TV_ROF_GPU_core* TV_FGP_GPU_core* TV_SB_GPU_core* dTV_FGP_GPU_core* NonlDiff_GPU_core* Diffus_4thO_GPU_core* TGV_GPU_core* LLT_ROF_GPU_core* CCPiDefines.h
 fprintf('%s \n', 'All successfully compiled!');
 
diff --git a/Wrappers/Python/conda-recipe/build.sh b/Wrappers/Python/conda-recipe/build.sh
index 54bc8e2..eec7c2f 100644
--- a/Wrappers/Python/conda-recipe/build.sh
+++ b/Wrappers/Python/conda-recipe/build.sh
@@ -4,7 +4,7 @@ cp -rv "$RECIPE_DIR/../.." "$SRC_DIR/ccpi"
 cp -rv "$RECIPE_DIR/../../../Core" "$SRC_DIR/Core"
 
 cd $SRC_DIR
-
+##cuda=off
 cmake -G "Unix Makefiles" $RECIPE_DIR/../../../ -DBUILD_PYTHON_WRAPPER=ON -DCONDA_BUILD=ON -DBUILD_CUDA=ON -DCMAKE_BUILD_TYPE="Release" -DLIBRARY_LIB=$CONDA_PREFIX/lib -DLIBRARY_INC=$CONDA_PREFIX -DCMAKE_INSTALL_PREFIX=$PREFIX
 
 
diff --git a/Wrappers/Python/conda-recipe/meta.yaml b/Wrappers/Python/conda-recipe/meta.yaml
index ed73165..808493e 100644
--- a/Wrappers/Python/conda-recipe/meta.yaml
+++ b/Wrappers/Python/conda-recipe/meta.yaml
@@ -1,6 +1,6 @@
 package:
   name: ccpi-regulariser
-  version: 0.10.2
+  version: 0.10.3
 
 
 build:
diff --git a/Wrappers/Python/conda-recipe/run_test.py b/Wrappers/Python/conda-recipe/run_test.py
index 499ae7f..cfb3f53 100755
--- a/Wrappers/Python/conda-recipe/run_test.py
+++ b/Wrappers/Python/conda-recipe/run_test.py
@@ -2,7 +2,7 @@ import unittest
 import numpy as np
 import os
 import timeit
-from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, TGV, LLT_ROF, FGP_dTV, NDF, DIFF4th
+from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, TGV, LLT_ROF, FGP_dTV, NDF, DIFF4th
 from PIL import Image
 
 class TiffReader(object):
@@ -37,6 +37,8 @@ class TestRegularisers(unittest.TestCase):
     
 
     def test_ROF_TV_CPU_vs_GPU(self):
+        #print ("tomas debug test function")
+        print(__name__)
         filename = os.path.join("lena_gray_512.tif")
         plt = TiffReader()
         # read image
@@ -63,11 +65,11 @@ class TestRegularisers(unittest.TestCase):
         
         # set parameters
         pars = {'algorithm': ROF_TV, \
-                'input' : u0,\
-                'regularisation_parameter':0.04,\
-                'number_of_iterations': 1000,\
-                'time_marching_parameter': 0.0001
-                }
+        'input' : u0,\
+        'regularisation_parameter':0.04,\
+        'number_of_iterations': 2500,\
+        'time_marching_parameter': 0.00002
+        }
         print ("#############ROF TV CPU####################")
         start_time = timeit.default_timer()
         rof_cpu = ROF_TV(pars['input'],
@@ -88,8 +90,8 @@ class TestRegularisers(unittest.TestCase):
                              pars['number_of_iterations'], 
                              pars['time_marching_parameter'],'gpu')
         except ValueError as ve:
-            self.assertTrue(True)
-            return
+            self.skipTest("Results not comparable. GPU computing error.")
+
         rms = rmse(Im, rof_gpu)
         pars['rmse'] = rms
         pars['algorithm'] = ROF_TV
@@ -101,10 +103,10 @@ class TestRegularisers(unittest.TestCase):
         diff_im = np.zeros(np.shape(rof_cpu))
         diff_im = abs(rof_cpu - rof_gpu)
         diff_im[diff_im > tolerance] = 1
-
         self.assertLessEqual(diff_im.sum() , 1)
         
     def test_FGP_TV_CPU_vs_GPU(self):
+        print(__name__)
         filename = os.path.join("lena_gray_512.tif")
         plt = TiffReader()
         # read image
@@ -169,10 +171,10 @@ class TestRegularisers(unittest.TestCase):
                       pars['methodTV'],
                       pars['nonneg'],
                       pars['printingOut'],'gpu')
-                                           
+
         except ValueError as ve:
-            self.assertTrue(True)
-            return
+            self.skipTest("Results not comparable. GPU computing error.")
+
         rms = rmse(Im, fgp_gpu)
         pars['rmse'] = rms
         pars['algorithm'] = FGP_TV
@@ -189,6 +191,7 @@ class TestRegularisers(unittest.TestCase):
         self.assertLessEqual(diff_im.sum() , 1)
 
     def test_SB_TV_CPU_vs_GPU(self):
+        print(__name__)
         filename = os.path.join("lena_gray_512.tif")
         plt = TiffReader()
         # read image
@@ -251,10 +254,10 @@ class TestRegularisers(unittest.TestCase):
                       pars['tolerance_constant'], 
                       pars['methodTV'],
                       pars['printingOut'],'gpu')
-                                           
+
         except ValueError as ve:
-            self.assertTrue(True)
-            return
+            self.skipTest("Results not comparable. GPU computing error.")
+
         rms = rmse(Im, sb_gpu)
         pars['rmse'] = rms
         pars['algorithm'] = SB_TV
@@ -269,6 +272,7 @@ class TestRegularisers(unittest.TestCase):
         self.assertLessEqual(diff_im.sum(), 1)
 
     def test_TGV_CPU_vs_GPU(self):
+        print(__name__)
         filename = os.path.join("lena_gray_512.tif")
         plt = TiffReader()
         # read image
@@ -329,10 +333,10 @@ class TestRegularisers(unittest.TestCase):
                       pars['alpha0'],
                       pars['number_of_iterations'],
                       pars['LipshitzConstant'],'gpu')
-                                           
+
         except ValueError as ve:
-            self.assertTrue(True)
-            return
+            self.skipTest("Results not comparable. GPU computing error.")
+
         rms = rmse(Im, tgv_gpu)
         pars['rmse'] = rms
         pars['algorithm'] = TGV
@@ -347,6 +351,7 @@ class TestRegularisers(unittest.TestCase):
         self.assertLessEqual(diff_im.sum() , 1)
 
     def test_LLT_ROF_CPU_vs_GPU(self):
+        print(__name__)
         filename = os.path.join("lena_gray_512.tif")
         plt = TiffReader()
         # read image
@@ -405,8 +410,8 @@ class TestRegularisers(unittest.TestCase):
                       pars['time_marching_parameter'],'gpu')
         
         except ValueError as ve:
-            self.assertTrue(True)
-            return
+            self.skipTest("Results not comparable. GPU computing error.")
+
         rms = rmse(Im, lltrof_gpu)
         pars['rmse'] = rms
         pars['algorithm'] = LLT_ROF
@@ -421,6 +426,7 @@ class TestRegularisers(unittest.TestCase):
         self.assertLessEqual(diff_im.sum(), 1)
 
     def test_NDF_CPU_vs_GPU(self):
+        print(__name__)
         filename = os.path.join("lena_gray_512.tif")
         plt = TiffReader()
         # read image
@@ -483,8 +489,7 @@ class TestRegularisers(unittest.TestCase):
                       pars['penalty_type'],'gpu')
                      
         except ValueError as ve:
-            self.assertTrue(True)
-            return
+            self.skipTest("Results not comparable. GPU computing error.")
         rms = rmse(Im, ndf_gpu)
         pars['rmse'] = rms
         pars['algorithm'] = NDF
@@ -557,8 +562,7 @@ class TestRegularisers(unittest.TestCase):
                       pars['time_marching_parameter'], 'gpu')
                      
         except ValueError as ve:
-            self.assertTrue(True)
-            return
+            self.skipTest("Results not comparable. GPU computing error.")
         rms = rmse(Im, diff4th_gpu)
         pars['rmse'] = rms
         pars['algorithm'] = DIFF4th
@@ -603,8 +607,8 @@ class TestRegularisers(unittest.TestCase):
                 'input' : u0,\
                 'refdata' : u_ref,\
                 'regularisation_parameter':0.04, \
-                'number_of_iterations' :2000 ,\
-                'tolerance_constant':1e-06,\
+                'number_of_iterations' :1000 ,\
+                'tolerance_constant':1e-07,\
                 'eta_const':0.2,\
                 'methodTV': 0 ,\
                 'nonneg': 0 ,\
@@ -643,8 +647,7 @@ class TestRegularisers(unittest.TestCase):
                       pars['nonneg'],
                       pars['printingOut'],'gpu')
         except ValueError as ve:
-            self.assertTrue(True)
-            return
+            self.skipTest("Results not comparable. GPU computing error.")
         rms = rmse(Im, fgp_dtv_gpu)
         pars['rmse'] = rms
         pars['algorithm'] = FGP_dTV
@@ -765,8 +768,8 @@ class TestRegularisers(unittest.TestCase):
              pars_rof_tv['number_of_iterations'],
              pars_rof_tv['time_marching_parameter'],'gpu')
         except ValueError as ve:
-            self.assertTrue(True)
-            return
+            self.skipTest("Results not comparable. GPU computing error.")
+
         rms_rof = rmse(Im, rof_gpu)
         # now compare obtained rms with the expected value
         self.assertLess(abs(rms_rof-rms_rof_exp) , tolerance)
@@ -806,10 +809,10 @@ class TestRegularisers(unittest.TestCase):
               pars_fgp_tv['nonneg'],
               pars_fgp_tv['printingOut'],'gpu')  
         except ValueError as ve:
-            self.assertTrue(True)
-            return
+            self.skipTest("Results not comparable. GPU computing error.")
         rms_fgp = rmse(Im, fgp_gpu)
         # now compare obtained rms with the expected value
+
         self.assertLess(abs(rms_fgp-rms_fgp_exp) , tolerance)
 
 if __name__ == '__main__':
diff --git a/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py b/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py
index 616eab0..6529b5c 100644
--- a/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py
+++ b/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py
@@ -656,8 +656,8 @@ pars = {'algorithm' : FGP_dTV, \
         'input' : u0,\
         'refdata' : u_ref,\
         'regularisation_parameter':0.04, \
-        'number_of_iterations' :2000 ,\
-        'tolerance_constant':1e-06,\
+        'number_of_iterations' :1000 ,\
+        'tolerance_constant':1e-07,\
         'eta_const':0.2,\
         'methodTV': 0 ,\
         'nonneg': 0 ,\
diff --git a/Wrappers/Python/src/gpu_regularisers.pyx b/Wrappers/Python/src/gpu_regularisers.pyx
index 302727e..2b97865 100644
--- a/Wrappers/Python/src/gpu_regularisers.pyx
+++ b/Wrappers/Python/src/gpu_regularisers.pyx
@@ -18,15 +18,17 @@ import cython
 import numpy as np
 cimport numpy as np
 
-cdef extern void TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
-cdef extern void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
-cdef extern void TV_SB_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int printM, int N, int M, int Z);
-cdef extern void TGV_GPU_main(float *Input, float *Output, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY);
-cdef extern void LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z);
-cdef extern void NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z);
-cdef extern void dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int printM, int N, int M, int Z);
-cdef extern void Diffus4th_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int N, int M, int Z);
-cdef extern void PatchSelect_GPU_main(float *Input, unsigned short *H_i, unsigned short *H_j, float *Weights, int N, int M, int SearchWindow, int SimilarWin, int NumNeighb, float h);
+CUDAErrorMessage = 'CUDA error'
+
+cdef extern int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
+cdef extern int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
+cdef extern int TV_SB_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int printM, int N, int M, int Z);
+cdef extern int TGV_GPU_main(float *Input, float *Output, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY);
+cdef extern int LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z);
+cdef extern int NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z);
+cdef extern int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int printM, int N, int M, int Z);
+cdef extern int Diffus4th_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int N, int M, int Z);
+cdef extern int PatchSelect_GPU_main(float *Input, unsigned short *H_i, unsigned short *H_j, float *Weights, int N, int M, int SearchWindow, int SimilarWin, int NumNeighb, float h);
 
 # Total-variation Rudin-Osher-Fatemi (ROF)
 def TV_ROF_GPU(inputData,
@@ -186,15 +188,16 @@ def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
     cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \
 		    np.zeros([dims[0],dims[1]], dtype='float32')
           
-    # Running CUDA code here    
-    TV_ROF_GPU_main(            
+    # Running CUDA code here
+    if (TV_ROF_GPU_main(
             &inputData[0,0], &outputData[0,0], 
                        regularisation_parameter,
                        iterations , 
                        time_marching_parameter, 
-                       dims[1], dims[0], 1);   
-     
-    return outputData
+                       dims[1], dims[0], 1)==0):
+        return outputData;
+    else:
+        raise ValueError(CUDAErrorMessage);
     
 def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
                      float regularisation_parameter,
@@ -210,14 +213,15 @@ def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
           
     # Running CUDA code here    
-    TV_ROF_GPU_main(            
+    if (TV_ROF_GPU_main(
             &inputData[0,0,0], &outputData[0,0,0], 
                        regularisation_parameter,
                        iterations , 
                        time_marching_parameter, 
-                       dims[2], dims[1], dims[0]);   
-     
-    return outputData
+                       dims[2], dims[1], dims[0])==0):
+        return outputData;
+    else:
+        raise ValueError(CUDAErrorMessage);
 #****************************************************************#
 #********************** Total-variation FGP *********************#
 #****************************************************************#
@@ -238,16 +242,18 @@ def FGPTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1]], dtype='float32')
           
     # Running CUDA code here    
-    TV_FGP_GPU_main(&inputData[0,0], &outputData[0,0],                        
+    if (TV_FGP_GPU_main(&inputData[0,0], &outputData[0,0],
                        regularisation_parameter, 
                        iterations, 
                        tolerance_param,
                        methodTV,
                        nonneg,
                        printM,
-                       dims[1], dims[0], 1);   
-     
-    return outputData
+                       dims[1], dims[0], 1)==0):
+        return outputData;
+    else:
+        raise ValueError(CUDAErrorMessage);
+
     
 def FGPTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
                      float regularisation_parameter,
@@ -266,16 +272,18 @@ def FGPTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
           
     # Running CUDA code here    
-    TV_FGP_GPU_main(&inputData[0,0,0], &outputData[0,0,0], 
+    if (TV_FGP_GPU_main(&inputData[0,0,0], &outputData[0,0,0],
                        regularisation_parameter , 
                        iterations, 
                        tolerance_param,
                        methodTV,
                        nonneg,
                        printM,
-                       dims[2], dims[1], dims[0]);   
-     
-    return outputData 
+                       dims[2], dims[1], dims[0])==0):
+        return outputData;
+    else:
+        raise ValueError(CUDAErrorMessage);
+
 #***************************************************************#
 #********************** Total-variation SB *********************#
 #***************************************************************#
@@ -295,15 +303,17 @@ def SBTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1]], dtype='float32')
           
     # Running CUDA code here    
-    TV_SB_GPU_main(&inputData[0,0], &outputData[0,0],                        
+    if (TV_SB_GPU_main(&inputData[0,0], &outputData[0,0],
                        regularisation_parameter, 
                        iterations, 
                        tolerance_param,
                        methodTV,
                        printM,
-                       dims[1], dims[0], 1);   
-     
-    return outputData
+                       dims[1], dims[0], 1)==0):
+        return outputData;
+    else:
+        raise ValueError(CUDAErrorMessage);
+
     
 def SBTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
                      float regularisation_parameter,
@@ -321,15 +331,17 @@ def SBTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
           
     # Running CUDA code here    
-    TV_SB_GPU_main(&inputData[0,0,0], &outputData[0,0,0], 
+    if (TV_SB_GPU_main(&inputData[0,0,0], &outputData[0,0,0],
                        regularisation_parameter , 
                        iterations, 
                        tolerance_param,
                        methodTV,
                        printM,
-                       dims[2], dims[1], dims[0]);
-     
-    return outputData 
+                       dims[2], dims[1], dims[0])==0):
+        return outputData;
+    else:
+        raise ValueError(CUDAErrorMessage);
+
 
 #***************************************************************#
 #************************ LLT-ROF model ************************#
@@ -349,8 +361,11 @@ def LLT_ROF_GPU2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1]], dtype='float32')
           
     # Running CUDA code here    
-    LLT_ROF_GPU_main(&inputData[0,0], &outputData[0,0],regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, dims[1],dims[0],1);
-    return outputData
+    if (LLT_ROF_GPU_main(&inputData[0,0], &outputData[0,0],regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, dims[1],dims[0],1)==0):
+        return outputData;
+    else:
+        raise ValueError(CUDAErrorMessage);
+
     
 def LLT_ROF_GPU3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
                      float regularisation_parameterROF,
@@ -367,8 +382,11 @@ def LLT_ROF_GPU3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
           
     # Running CUDA code here    
-    LLT_ROF_GPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, dims[2], dims[1], dims[0]);
-    return outputData 
+    if (LLT_ROF_GPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, dims[2], dims[1], dims[0])==0):
+        return outputData;
+    else:
+        raise ValueError(CUDAErrorMessage);
+
 
 
 #***************************************************************#
@@ -389,13 +407,16 @@ def TGV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
             np.zeros([dims[0],dims[1]], dtype='float32')
                    
     #/* Run TGV iterations for 2D data */
-    TGV_GPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, 
+    if (TGV_GPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter,
                        alpha1,
                        alpha0,
                        iterationsNumb, 
                        LipshitzConst,
-                       dims[1],dims[0])
-    return outputData
+                       dims[1],dims[0])==0):
+        return outputData
+    else:
+        raise ValueError(CUDAErrorMessage);
+
 
 #****************************************************************#
 #**************Directional Total-variation FGP ******************#
@@ -419,7 +440,7 @@ def FGPdTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1]], dtype='float32')
           
     # Running CUDA code here    
-    dTV_FGP_GPU_main(&inputData[0,0], &refdata[0,0], &outputData[0,0],                        
+    if (dTV_FGP_GPU_main(&inputData[0,0], &refdata[0,0], &outputData[0,0],
                        regularisation_parameter, 
                        iterations, 
                        tolerance_param,
@@ -427,9 +448,11 @@ def FGPdTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
                        methodTV,
                        nonneg,
                        printM,
-                       dims[1], dims[0], 1);   
-     
-    return outputData
+                       dims[1], dims[0], 1)==0):
+        return outputData
+    else:
+        raise ValueError(CUDAErrorMessage);
+
     
 def FGPdTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
              np.ndarray[np.float32_t, ndim=3, mode="c"] refdata, 
@@ -450,7 +473,7 @@ def FGPdTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
           
     # Running CUDA code here    
-    dTV_FGP_GPU_main(&inputData[0,0,0], &refdata[0,0,0], &outputData[0,0,0], 
+    if (dTV_FGP_GPU_main(&inputData[0,0,0], &refdata[0,0,0], &outputData[0,0,0],
                        regularisation_parameter , 
                        iterations, 
                        tolerance_param,
@@ -458,8 +481,11 @@ def FGPdTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
                        methodTV,
                        nonneg,
                        printM,
-                       dims[2], dims[1], dims[0]);
-    return outputData 
+                       dims[2], dims[1], dims[0])==0):
+        return outputData;
+    else:
+        raise ValueError(CUDAErrorMessage);
+
 
 #****************************************************************#
 #***************Nonlinear (Isotropic) Diffusion******************#
@@ -483,8 +509,11 @@ def NDF_GPU_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
     
     # Run Nonlinear Diffusion iterations for 2D data 
     # Running CUDA code here  
-    NonlDiff_GPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[1], dims[0], 1)
-    return outputData
+    if (NonlDiff_GPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[1], dims[0], 1)==0):
+        return outputData;
+    else:
+        raise ValueError(CUDAErrorMessage);
+
             
 def NDF_GPU_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
                      float regularisation_parameter,
@@ -502,9 +531,11 @@ def NDF_GPU_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
        
     # Run Nonlinear Diffusion iterations for  3D data 
     # Running CUDA code here  
-    NonlDiff_GPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[2], dims[1], dims[0])
+    if (NonlDiff_GPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[2], dims[1], dims[0])==0):
+        return outputData;
+    else:
+        raise ValueError(CUDAErrorMessage);
 
-    return outputData
 #****************************************************************#
 #************Anisotropic Fourth-Order diffusion******************#
 #****************************************************************#
@@ -522,8 +553,11 @@ def Diff4th_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
     
     # Run Anisotropic Fourth-Order diffusion for 2D data 
     # Running CUDA code here  
-    Diffus4th_GPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, dims[1], dims[0], 1)
-    return outputData
+    if (Diffus4th_GPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, dims[1], dims[0], 1)==0):
+        return outputData
+    else:
+        raise ValueError(CUDAErrorMessage);
+
             
 def Diff4th_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
                      float regularisation_parameter,
@@ -540,9 +574,11 @@ def Diff4th_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
        
     # Run Anisotropic Fourth-Order diffusion for  3D data 
     # Running CUDA code here  
-    Diffus4th_GPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, dims[2], dims[1], dims[0])
+    if (Diffus4th_GPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, dims[2], dims[1], dims[0])==0):
+        return outputData;
+    else:
+        raise ValueError(CUDAErrorMessage);
 
-    return outputData
 #****************************************************************#
 #************Patch-based weights pre-selection******************#
 #****************************************************************#
@@ -571,6 +607,8 @@ def PatchSel_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
             np.zeros([dims[0], dims[1],dims[2]], dtype='uint16')
 
     # Run patch-based weight selection function
-    PatchSelect_GPU_main(&inputData[0,0], &H_j[0,0,0], &H_i[0,0,0], &Weights[0,0,0], dims[2], dims[1], searchwindow, patchwindow,  neighbours,  edge_parameter)
-    
-    return H_i, H_j, Weights      
+    if (PatchSelect_GPU_main(&inputData[0,0], &H_j[0,0,0], &H_i[0,0,0], &Weights[0,0,0], dims[2], dims[1], searchwindow, patchwindow,  neighbours,  edge_parameter)==0):
+        return H_i, H_j, Weights;
+    else:
+        raise ValueError(CUDAErrorMessage);
+
diff --git a/build/jenkins-build.sh b/build/jenkins-build.sh
index 04f8da6..0c397b1 100755
--- a/build/jenkins-build.sh
+++ b/build/jenkins-build.sh
@@ -1,12 +1,29 @@
 #!/usr/bin/env bash
 # Script to builds source code in Jenkins environment
+module try-load conda
 
-module avail
-module load conda
-# it expects that git clone is done before this script launch
+# install miniconda if the module is not present
+if hash conda 2>/dev/null; then
+  echo using conda
+else
+  if [ ! -f Miniconda3-latest-Linux-x86_64.sh ]; then
+    wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+    chmod +x Miniconda3-latest-Linux-x86_64.sh
+  fi
+  ./Miniconda3-latest-Linux-x86_64.sh -u -b -p .
+  PATH=$PATH:./bin
+fi
+
+# presume that git clone is done before this script launch
 # git clone https://github.com/vais-ral/CCPi-Regularisation-Toolkit
-conda install conda-build
+conda install -y conda-build
 #export CIL_VERSION=0.10.2
-export CIL_VERSION=0.10.2
-cd CCPi-Regularisation-Toolkit
+if [[ -n ${CIL_VERSION} ]]
+then
+  echo Using defined version: $CIL_VERSION
+else
+  export CIL_VERSION=0.10.3
+  echo Defining version: $CIL_VERSION
+fi
+#cd CCPi-Regularisation-Toolkit # already there by jenkins
 conda build Wrappers/Python/conda-recipe
diff --git a/run.sh b/run.sh
index 98b792e..a8e5555 100644
--- a/run.sh
+++ b/run.sh
@@ -3,7 +3,7 @@ echo "Building CCPi-regularisation Toolkit using CMake"
 # rm -r build
 # Requires Cython, install it first: 
 # pip install cython
-mkdir build
+# mkdir build
 cd build/
 make clean
 # install Python modules only without CUDA