23 files changed, 1409 insertions, 49 deletions
diff --git a/Core/CMakeLists.txt b/Core/CMakeLists.txt
index 4142ed9..61986dc 100644
--- a/Core/CMakeLists.txt
+++ b/Core/CMakeLists.txt
@@ -85,6 +85,7 @@ message("Adding regularisers as a shared library")
 add_library(cilreg SHARED
 	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_TV_core.c
 	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/SB_TV_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffusion_core.c
 	    #${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/LLT_model_core.c
 	    #${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PatchBased_Regul_core.c
 	    #${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TGV_PD_core.c
@@ -133,6 +134,7 @@ if (CUDA_FOUND)
 	${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_FGP_GPU_core.cu
 	${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_SB_GPU_core.cu
 	${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/dTV_FGP_GPU_core.cu
+	${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/NonlDiff_GPU_core.cu
   )
   if (UNIX)
     message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib")
diff --git a/Core/regularisers_CPU/Diffusion_core.c b/Core/regularisers_CPU/Diffusion_core.c
new file mode 100644
index 0000000..51d0a57
--- /dev/null
+++ b/Core/regularisers_CPU/Diffusion_core.c
@@ -0,0 +1,307 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Diffusion_core.h"
+#include "utils.h"
+
+#define EPS 1.0e-5
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+/* Sign of x as an int: 1 for x > 0, -1 for x < 0, 0 otherwise (zero or NaN) */
+int signNDFc(float x) {
+    return (x < 0) ? -1 : (x > 0);
+}
+
+/* C-OMP implementation of linear and nonlinear diffusion with the regularisation model [1,2] (2D/3D case)
+ * The minimisation is performed using an explicit scheme.
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume 
+ * 2. lambda - regularization parameter
+ * 3. Edge-preserving parameter (sigma); when sigma equals zero, linear diffusion is run instead of nonlinear diffusion
+ * 4. Number of iterations, for explicit scheme >= 150 is recommended 
+ * 5. tau - time-marching step for explicit scheme
+ * 6. Penalty type: 1 - Huber, 2 - Perona-Malik, 3 - Tukey Biweight (only read when sigma is non-zero)
+ *
+ * Output:
+ * [1] Regularized image/volume (written to Output; the function itself returns Output[0])
+ *
+ * This function is based on the papers by
+ * [1] Perona, P. and Malik, J., 1990. Scale-space and edge detection using anisotropic diffusion. IEEE Transactions on pattern analysis and machine intelligence, 12(7), pp.629-639.
+ * [2] Black, M.J., Sapiro, G., Marimont, D.H. and Heeger, D., 1998. Robust anisotropic diffusion. IEEE Transactions on image processing, 7(3), pp.421-432.
+ */
+
+float Diffusion_CPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int dimX, int dimY, int dimZ)
+{
+    int i;
+    /* the nonlinear routines are given sigma/sqrt(2) rather than sigma itself */
+    float sigmaPar2 = sigmaPar/sqrt(2.0f);
+    /* copy into output */
+    copyIm(Input, Output, dimX, dimY, dimZ);
+    /* an unknown penalty never updates Output, so report it once here and return the plain copy */
+    if ((sigmaPar != 0.0f) && ((penaltytype < 1) || (penaltytype > 3))) {
+        printf("%s \n", "No penalty function selected! Use 1,2 or 3.");
+        return *Output;
+    }
+    
+    for(i=0; i < iterationsNumb; i++) {
+        if (dimZ == 1) { /* 2D diffusion step */
+            if (sigmaPar == 0.0f) LinearDiff2D(Input, Output, lambdaPar, tau, dimX, dimY); /* linear diffusion (heat equation) */
+            else NonLinearDiff2D(Input, Output, lambdaPar, sigmaPar2, tau, penaltytype, dimX, dimY); /* nonlinear diffusion */
+        }
+        else { /* 3D diffusion step */
+            if (sigmaPar == 0.0f) LinearDiff3D(Input, Output, lambdaPar, tau, dimX, dimY, dimZ);
+            else NonLinearDiff3D(Input, Output, lambdaPar, sigmaPar2, tau, penaltytype, dimX, dimY, dimZ);
+        }
+    }
+    return *Output;
+}
+
+
+/********************************************************************/
+/***************************2D Functions*****************************/
+/********************************************************************/
+/* One explicit step of linear diffusion (heat equation) on a dimX x dimY image addressed as index = j*dimX+i; Output is updated in place and Output[0] is returned */
+float LinearDiff2D(float *Input, float *Output, float lambdaPar, float tau, int dimX, int dimY)
+{
+	int i,j,i1,i2,j1,j2,index;
+	float e,w,n,s,e1,w1,n1,s1;
+	/* NOTE(review): Output is both read (neighbours) and written inside the parallel loop, so a neighbour may already hold this step's value - confirm this is intended */
+#pragma omp parallel for shared(Input) private(index,i,j,i1,i2,j1,j2,e,w,n,s,e1,w1,n1,s1)
+    for(i=0; i<dimX; i++) {
+        /* symmetric boundary conditions (Neumann): an out-of-range neighbour index is mirrored back inside */
+        i1 = i+1; if (i1 == dimX) i1 = i-1;
+        i2 = i-1; if (i2 < 0) i2 = i+1;
+        for(j=0; j<dimY; j++) {
+            /* symmetric boundary conditions (Neumann) */
+            j1 = j+1; if (j1 == dimY) j1 = j-1;
+            j2 = j-1; if (j2 < 0) j2 = j+1;
+            index = j*dimX+i;
+            
+                e = Output[j*dimX+i1];
+                w = Output[j*dimX+i2];
+                n = Output[j1*dimX+i];
+                s = Output[j2*dimX+i];
+                
+                e1 = e - Output[index];
+                w1 = w - Output[index];
+                n1 = n - Output[index];
+                s1 = s - Output[index];
+                /* explicit update: tau * (lambda * sum of the four neighbour differences - (Output - Input)) */
+                Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1) - (Output[index] - Input[index]));  
+		}}
+	return *Output;
+}
+
+/* One explicit step of nonlinear diffusion on a dimX x dimY image (index = j*dimX+i); penaltytype 1/2/3 selects Huber / Perona-Malik / Tukey Biweight. Output is updated in place and Output[0] is returned */
+float NonLinearDiff2D(float *Input, float *Output, float lambdaPar, float sigmaPar, float tau, int penaltytype, int dimX, int dimY)
+{
+	int i,j,i1,i2,j1,j2,index;
+	float e,w,n,s,e1,w1,n1,s1;
+	/* NOTE(review): Output is both read (neighbours) and written inside the parallel loop, so a neighbour may already hold this step's value - confirm this is intended */
+#pragma omp parallel for shared(Input) private(index,i,j,i1,i2,j1,j2,e,w,n,s,e1,w1,n1,s1)
+    for(i=0; i<dimX; i++) {
+        /* symmetric boundary conditions (Neumann): an out-of-range neighbour index is mirrored back inside */
+        i1 = i+1; if (i1 == dimX) i1 = i-1;
+        i2 = i-1; if (i2 < 0) i2 = i+1;
+        for(j=0; j<dimY; j++) {
+            /* symmetric boundary conditions (Neumann) */
+            j1 = j+1; if (j1 == dimY) j1 = j-1;
+            j2 = j-1; if (j2 < 0) j2 = j+1;
+            index = j*dimX+i;
+            
+                e = Output[j*dimX+i1];
+                w = Output[j*dimX+i2];
+                n = Output[j1*dimX+i];
+                s = Output[j2*dimX+i];
+                
+                e1 = e - Output[index];
+                w1 = w - Output[index];
+                n1 = n - Output[index];
+                s1 = s - Output[index];
+                /* each neighbour difference d is now replaced by the selected penalty's response */
+            if (penaltytype == 1){
+            /* Huber penalty: sign(d) when |d| > sigma, d/sigma otherwise */
+            if (fabs(e1) > sigmaPar) e1 =  signNDFc(e1);
+            else e1 = e1/sigmaPar;
+            
+            if (fabs(w1) > sigmaPar) w1 =  signNDFc(w1);
+            else w1 = w1/sigmaPar;
+            
+            if (fabs(n1) > sigmaPar) n1 =  signNDFc(n1);
+            else n1 = n1/sigmaPar;
+            
+            if (fabs(s1) > sigmaPar) s1 =  signNDFc(s1);
+            else s1 = s1/sigmaPar;
+            }
+            else if (penaltytype == 2) {
+            /* Perona-Malik: d / (1 + (d/sigma)^2) */
+            e1 = (e1)/(1.0f + powf((e1/sigmaPar),2));
+            w1 = (w1)/(1.0f + powf((w1/sigmaPar),2));
+            n1 = (n1)/(1.0f + powf((n1/sigmaPar),2));
+            s1 = (s1)/(1.0f + powf((s1/sigmaPar),2));
+            }
+            else if (penaltytype == 3) {
+            /* Tukey Biweight: d*(1 - (d/sigma)^2)^2 when |d| <= sigma, 0 otherwise */
+            if (fabs(e1) <= sigmaPar) e1 =  e1*powf((1.0f - powf((e1/sigmaPar),2)), 2);
+            else e1 = 0.0f;
+            if (fabs(w1) <= sigmaPar) w1 =  w1*powf((1.0f - powf((w1/sigmaPar),2)), 2);
+            else w1 = 0.0f;
+            if (fabs(n1) <= sigmaPar) n1 =  n1*powf((1.0f - powf((n1/sigmaPar),2)), 2);
+            else n1 = 0.0f;
+            if (fabs(s1) <= sigmaPar) s1 =  s1*powf((1.0f - powf((s1/sigmaPar),2)), 2);
+            else s1 = 0.0f;
+            }
+            else {
+				printf("%s \n", "No penalty function selected! Use 1,2 or 3.");
+				break; /* leaves only the inner j loop, before any update; the message repeats for every i */
+				}
+           Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1) - (Output[index] - Input[index]));  
+		}}
+	return *Output;
+}
+/********************************************************************/
+/***************************3D Functions*****************************/
+/********************************************************************/
+/* One explicit step of linear diffusion (heat equation) on a dimX x dimY x dimZ volume addressed as index = (dimX*dimY)*k + j*dimX+i; Output is updated in place and Output[0] is returned */
+float LinearDiff3D(float *Input, float *Output, float lambdaPar, float tau, int dimX, int dimY, int dimZ)
+{
+	int i,j,k,i1,i2,j1,j2,k1,k2,index;
+	float e,w,n,s,u,d,e1,w1,n1,s1,u1,d1;
+	/* NOTE(review): Output is both read (neighbours) and written inside the parallel loop, so a neighbour may already hold this step's value - confirm this is intended */
+#pragma omp parallel for shared(Input) private(index,i,j,i1,i2,j1,j2,e,w,n,s,e1,w1,n1,s1,k,k1,k2,u1,d1,u,d)
+for(k=0; k<dimZ; k++) {
+	k1 = k+1; if (k1 == dimZ) k1 = k-1;
+    k2 = k-1; if (k2 < 0) k2 = k+1;
+    for(i=0; i<dimX; i++) {
+        /* symmetric boundary conditions (Neumann): an out-of-range neighbour index is mirrored back inside */
+        i1 = i+1; if (i1 == dimX) i1 = i-1;
+        i2 = i-1; if (i2 < 0) i2 = i+1;
+        for(j=0; j<dimY; j++) {
+            /* symmetric boundary conditions (Neumann) */
+            j1 = j+1; if (j1 == dimY) j1 = j-1;
+            j2 = j-1; if (j2 < 0) j2 = j+1;
+            index = (dimX*dimY)*k + j*dimX+i;
+            
+                e = Output[(dimX*dimY)*k + j*dimX+i1];
+                w = Output[(dimX*dimY)*k + j*dimX+i2];
+                n = Output[(dimX*dimY)*k + j1*dimX+i];
+                s = Output[(dimX*dimY)*k + j2*dimX+i];
+                u = Output[(dimX*dimY)*k1 + j*dimX+i];
+                d = Output[(dimX*dimY)*k2 + j*dimX+i];
+                
+                e1 = e - Output[index];
+                w1 = w - Output[index];
+                n1 = n - Output[index];
+                s1 = s - Output[index];
+                u1 = u - Output[index];
+                d1 = d - Output[index];
+                /* explicit update: tau * (lambda * sum of the six neighbour differences - (Output - Input)) */
+                Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1 + u1 + d1) - (Output[index] - Input[index]));  
+		}}}
+	return *Output;
+}
+
+float NonLinearDiff3D(float *Input, float *Output, float lambdaPar, float sigmaPar, float tau, int penaltytype, int dimX, int dimY, int dimZ)
+{
+	int i,j,k,i1,i2,j1,j2,k1,k2,index;
+	float e,w,n,s,u,d,e1,w1,n1,s1,u1,d1;
+	/* One explicit step of nonlinear diffusion on a dimX x dimY x dimZ volume; penaltytype 1/2/3 selects Huber / Perona-Malik / Tukey Biweight. Output is updated in place and Output[0] is returned */
+#pragma omp parallel for shared(Input) private(index,i,j,i1,i2,j1,j2,e,w,n,s,e1,w1,n1,s1,k,k1,k2,u1,d1,u,d)
+for(k=0; k<dimZ; k++) {
+	k1 = k+1; if (k1 == dimZ) k1 = k-1;
+    k2 = k-1; if (k2 < 0) k2 = k+1;
+    for(i=0; i<dimX; i++) {
+        /* symmetric boundary conditions (Neumann): an out-of-range neighbour index is mirrored back inside */
+        i1 = i+1; if (i1 == dimX) i1 = i-1;
+        i2 = i-1; if (i2 < 0) i2 = i+1;
+        for(j=0; j<dimY; j++) {
+            /* symmetric boundary conditions (Neumann) */
+            j1 = j+1; if (j1 == dimY) j1 = j-1;
+            j2 = j-1; if (j2 < 0) j2 = j+1;
+            index = (dimX*dimY)*k + j*dimX+i;
+            /* NOTE(review): Output is both read (neighbours) and written inside the parallel loop, so a neighbour may already hold this step's value - confirm this is intended */
+                e = Output[(dimX*dimY)*k + j*dimX+i1];
+                w = Output[(dimX*dimY)*k + j*dimX+i2];
+                n = Output[(dimX*dimY)*k + j1*dimX+i];
+                s = Output[(dimX*dimY)*k + j2*dimX+i];
+                u = Output[(dimX*dimY)*k1 + j*dimX+i];
+                d = Output[(dimX*dimY)*k2 + j*dimX+i];
+                
+                e1 = e - Output[index];
+                w1 = w - Output[index];
+                n1 = n - Output[index];
+                s1 = s - Output[index];
+                u1 = u - Output[index];
+                d1 = d - Output[index];
+                /* each neighbour difference is now replaced by the selected penalty's response */
+             if (penaltytype == 1){
+            /* Huber penalty: sign(diff) when |diff| > sigma, diff/sigma otherwise */
+            if (fabs(e1) > sigmaPar) e1 =  signNDFc(e1);
+            else e1 = e1/sigmaPar;
+            
+            if (fabs(w1) > sigmaPar) w1 =  signNDFc(w1);
+            else w1 = w1/sigmaPar;
+            
+            if (fabs(n1) > sigmaPar) n1 =  signNDFc(n1);
+            else n1 = n1/sigmaPar;
+            
+            if (fabs(s1) > sigmaPar) s1 =  signNDFc(s1);
+            else s1 = s1/sigmaPar;
+            
+            if (fabs(u1) > sigmaPar) u1 =  signNDFc(u1);
+            else u1 = u1/sigmaPar;
+            
+            if (fabs(d1) > sigmaPar) d1 =  signNDFc(d1);
+            else d1 = d1/sigmaPar;            
+            }
+            else if (penaltytype == 2) {
+            /* Perona-Malik: diff / (1 + (diff/sigma)^2) */
+            e1 = (e1)/(1.0f + powf((e1/sigmaPar),2));
+            w1 = (w1)/(1.0f + powf((w1/sigmaPar),2));
+            n1 = (n1)/(1.0f + powf((n1/sigmaPar),2));
+            s1 = (s1)/(1.0f + powf((s1/sigmaPar),2));
+            u1 = (u1)/(1.0f + powf((u1/sigmaPar),2));
+            d1 = (d1)/(1.0f + powf((d1/sigmaPar),2));
+            }
+            else if (penaltytype == 3) {
+            /* Tukey Biweight: diff*(1 - (diff/sigma)^2)^2 when |diff| <= sigma, 0 otherwise */
+            if (fabs(e1) <= sigmaPar) e1 =  e1*powf((1.0f - powf((e1/sigmaPar),2)), 2);
+            else e1 = 0.0f;
+            if (fabs(w1) <= sigmaPar) w1 =  w1*powf((1.0f - powf((w1/sigmaPar),2)), 2);
+            else w1 = 0.0f;
+            if (fabs(n1) <= sigmaPar) n1 =  n1*powf((1.0f - powf((n1/sigmaPar),2)), 2);
+            else n1 = 0.0f;
+            if (fabs(s1) <= sigmaPar) s1 =  s1*powf((1.0f - powf((s1/sigmaPar),2)), 2);
+            else s1 = 0.0f;
+            if (fabs(u1) <= sigmaPar) u1 =  u1*powf((1.0f - powf((u1/sigmaPar),2)), 2);
+            else u1 = 0.0f;
+            if (fabs(d1) <= sigmaPar) d1 =  d1*powf((1.0f - powf((d1/sigmaPar),2)), 2);
+            else d1 = 0.0f;
+            }
+            else {
+				printf("%s \n", "No penalty function selected! Use 1,2 or 3.");
+				break; /* leaves only the inner j loop, before any update; the message repeats for every (k, i) */
+				}
+
+                Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1 + u1 + d1) - (Output[index] - Input[index]));  
+		}}}
+	return *Output;
+}
diff --git a/Core/regularisers_CPU/Diffusion_core.h b/Core/regularisers_CPU/Diffusion_core.h
new file mode 100644
index 0000000..0b4149a
--- /dev/null
+++ b/Core/regularisers_CPU/Diffusion_core.h
@@ -0,0 +1,59 @@
+/*
+This work is part of the Core Imaging Library developed by
+Visual Analytics and Imaging System Group of the Science Technology
+Facilities Council, STFC
+
+Copyright 2017 Daniil Kazantsev
+Copyright 2017 Srikanth Nagella, Edoardo Pasca
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <stdio.h>
+#include "omp.h"
+#include "utils.h"
+#include "CCPiDefines.h"
+
+
+/* C-OMP implementation of linear and nonlinear diffusion with the regularisation model [1,2] (2D/3D case)
+ * The minimisation is performed using explicit scheme. 
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume 
+ * 2. lambda - regularization parameter
+ * 3. Edge-preserving parameter (sigma); when sigma equals zero, linear diffusion is used instead of nonlinear diffusion
+ * 4. Number of iterations, for explicit scheme >= 150 is recommended 
+ * 5. tau - time-marching step for explicit scheme
+ * 6. Penalty type: 1 - Huber, 2 - Perona-Malik, 3 - Tukey Biweight
+ *
+ * Output:
+ * [1] Regularized image/volume 
+ *
+ * This function is based on the paper by
+ * [1] Perona, P. and Malik, J., 1990. Scale-space and edge detection using anisotropic diffusion. IEEE Transactions on pattern analysis and machine intelligence, 12(7), pp.629-639.
+ * [2] Black, M.J., Sapiro, G., Marimont, D.H. and Heeger, D., 1998. Robust anisotropic diffusion. IEEE Transactions on image processing, 7(3), pp.421-432.
+ */
+
+ 
+#ifdef __cplusplus
+extern "C" {
+#endif
+CCPI_EXPORT float Diffusion_CPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb,  float tau, int penaltytype, int dimX, int dimY, int dimZ);
+CCPI_EXPORT float LinearDiff2D(float *Input, float *Output, float lambdaPar, float tau, int dimX, int dimY);
+CCPI_EXPORT float NonLinearDiff2D(float *Input, float *Output, float lambdaPar, float sigmaPar, float tau, int penaltytype, int dimX, int dimY);
+CCPI_EXPORT float LinearDiff3D(float *Input, float *Output, float lambdaPar, float tau, int dimX, int dimY, int dimZ);
+CCPI_EXPORT float NonLinearDiff3D(float *Input, float *Output, float lambdaPar, float sigmaPar, float tau, int penaltytype, int dimX, int dimY, int dimZ);
+#ifdef __cplusplus
+}
+#endif
diff --git a/Core/regularisers_CPU/ROF_TV_core.c b/Core/regularisers_CPU/ROF_TV_core.c
index 9ffb905..213645d 100644
--- a/Core/regularisers_CPU/ROF_TV_core.c
+++ b/Core/regularisers_CPU/ROF_TV_core.c
@@ -23,10 +23,12 @@
 #define MAX(x, y) (((x) > (y)) ? (x) : (y))
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 
+/*sign function*/
 int sign(float x) {
     return (x > 0) - (x < 0);
 }
 
+
 /* C-OMP implementation of ROF-TV denoising/regularization model [1] (2D/3D case)
  *
  * 
@@ -41,8 +43,6 @@ int sign(float x) {
  *
  * This function is based on the paper by
  * [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms"
- *
- * D. Kazantsev, 2016-18
  */
 
 /* Running iterations of TV-ROF function */
@@ -90,8 +90,7 @@ float D1_func(float *A, float *D1, int dimX, int dimY, int dimZ)
                     j1 = j + 1; if (j1 >= dimY) j1 = j-1;
                     j2 = j - 1; if (j2 < 0) j2 = j+1;
                     k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
-                    k2 = k - 1; if (k2 < 0) k2 = k+1;
-                    /*B[(dimX*dimY)*k + i*dimY+j] = 0.25*(A[(dimX*dimY)*k + (i1)*dimY + j] + A[(dimX*dimY)*k + (i2)*dimY + j] + A[(dimX*dimY)*k + (i)*dimY + j1] + A[(dimX*dimY)*k + (i)*dimY + j2]) -  A[(dimX*dimY)*k + i*dimY + j];*/
+                    k2 = k - 1; if (k2 < 0) k2 = k+1;                    
                     
                     /* Forward-backward differences */
                     NOMx_1 = A[(dimX*dimY)*k + j1*dimX + i] - A[index]; /* x+ */
@@ -104,9 +103,9 @@ float D1_func(float *A, float *D1, int dimX, int dimY, int dimZ)
                     
                     
                     denom1 = NOMx_1*NOMx_1;
-                    denom2 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(fabs(NOMy_1),fabs(NOMy_0)));
+                    denom2 = 0.5f*(sign(NOMy_1) + sign(NOMy_0))*(MIN(fabs(NOMy_1),fabs(NOMy_0)));
                     denom2 = denom2*denom2;
-                    denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(fabs(NOMz_1),fabs(NOMz_0)));
+                    denom3 = 0.5f*(sign(NOMz_1) + sign(NOMz_0))*(MIN(fabs(NOMz_1),fabs(NOMz_0)));
                     denom3 = denom3*denom3;
                     T1 = sqrt(denom1 + denom2 + denom3 + EPS);
                     D1[index] = NOMx_1/T1;
diff --git a/Core/regularisers_CPU/TNV_core.h b/Core/regularisers_CPU/TNV_core.h
index 8178181..1559b83 100644
--- a/Core/regularisers_CPU/TNV_core.h
+++ b/Core/regularisers_CPU/TNV_core.h
@@ -12,12 +12,6 @@
 #define MAX(i,j) ((i)<(j) ? (j):(i))
 #define MIN(i,j) ((i)<(j) ? (i):(j))
 
-static inline int8_t SIGN(int val) {
- if (val < 0) return -1;
- if (val==0) return 0;
- return 1;
-}
-
 /*
 This work is part of the Core Imaging Library developed by
 Visual Analytics and Imaging System Group of the Science Technology
diff --git a/Core/regularisers_CPU/utils.c b/Core/regularisers_CPU/utils.c
index 0c02c45..a141cf4 100644
--- a/Core/regularisers_CPU/utils.c
+++ b/Core/regularisers_CPU/utils.c
@@ -29,6 +29,13 @@ float copyIm(float *A, float *U, int dimX, int dimY, int dimZ)
 	return *U;
 }
 
+/*static inline int8_t SIGN(int val) {
+ if (val < 0) return -1;
+ if (val==0) return 0;
+ return 1;
+}
+*/
+
 /* function that calculates TV energy (ROF model) 
  * min||\nabla u|| + 0.5*lambda*||u -u0||^2
  * */
diff --git a/Core/regularisers_GPU/NonlDiff_GPU_core.cu b/Core/regularisers_GPU/NonlDiff_GPU_core.cu
new file mode 100644
index 0000000..be9f5f1
--- /dev/null
+++ b/Core/regularisers_GPU/NonlDiff_GPU_core.cu
@@ -0,0 +1,354 @@
+ /*
+This work is part of the Core Imaging Library developed by
+Visual Analytics and Imaging System Group of the Science Technology
+Facilities Council, STFC
+
+Copyright 2017 Daniil Kazantsev
+Copyright 2017 Srikanth Nagella, Edoardo Pasca
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ 
+
+#include "NonlDiff_GPU_core.h"
+
+/* CUDA implementation of linear and nonlinear diffusion with the regularisation model [1,2] (2D/3D case)
+ * The minimisation is performed using explicit scheme. 
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume 
+ * 2. lambda - regularization parameter
+ * 3. Edge-preserving parameter (sigma); when sigma equals zero, linear diffusion is used instead of nonlinear diffusion
+ * 4. Number of iterations, for explicit scheme >= 150 is recommended 
+ * 5. tau - time-marching step for explicit scheme
+ * 6. Penalty type: 1 - Huber, 2 - Perona-Malik, 3 - Tukey Biweight
+ *
+ * Output:
+ * [1] Regularized image/volume 
+ *
+ * This function is based on the paper by
+ * [1] Perona, P. and Malik, J., 1990. Scale-space and edge detection using anisotropic diffusion. IEEE Transactions on pattern analysis and machine intelligence, 12(7), pp.629-639.
+ * [2] Black, M.J., Sapiro, G., Marimont, D.H. and Heeger, D., 1998. Robust anisotropic diffusion. IEEE Transactions on image processing, 7(3), pp.421-432.
+ */
+
+#define CHECK(call)                                                            \
+{                                                                              \
+    const cudaError_t error = call;                                            \
+    if (error != cudaSuccess)                                                  \
+    {                                                                          \
+        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
+        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
+                cudaGetErrorString(error));                                    \
+        exit(1);                                                               \
+    }                                                                          \
+}
+    
+#define BLKXSIZE 8
+#define BLKYSIZE 8
+#define BLKZSIZE 8
+    
+#define BLKXSIZE2D 16
+#define BLKYSIZE2D 16
+#define EPS 1.0e-5
+    
+#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
+
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+__host__ __device__ int signNDF (float x)
+{       /* sign of x as an int: 1 for x > 0, -1 for x < 0, 0 otherwise (zero or NaN) */
+        return (x < 0) ? -1 : (x > 0);
+}        
+   
+/********************************************************************/
+/***************************2D Functions*****************************/
+/********************************************************************/
+__global__ void LinearDiff2D_kernel(float *Input, float *Output, float lambdaPar, float tau, int N, int M)
+    { /* one explicit linear-diffusion (heat equation) update of Output at the pixel (i, j) owned by this thread */
+		int i1,i2,j1,j2;
+		float e,w,n,s,e1,w1,n1,s1;
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        /* i runs along the N-sized axis and is the fastest-varying index */
+        int index = i + N*j;
+        
+        if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) {
+            
+            /* boundary conditions (Neumann reflections) */
+			i1 = i+1; if (i1 == N) i1 = i-1;
+			i2 = i-1; if (i2 < 0) i2 = i+1;
+            j1 = j+1; if (j1 == M) j1 = j-1;
+            j2 = j-1; if (j2 < 0) j2 = j+1;
+            
+		        e = Output[j*N+i1];
+                w = Output[j*N+i2];
+                n = Output[j1*N+i];
+                s = Output[j2*N+i];
+                
+                e1 = e - Output[index];
+                w1 = w - Output[index];
+                n1 = n - Output[index];
+                s1 = s - Output[index];
+                /* NOTE(review): neighbours are read from the same Output array other threads write in this launch - confirm the resulting update order is acceptable */
+                Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1) - (Output[index] - Input[index])); 
+		}
+	} 
+    
+ __global__ void NonLinearDiff2D_kernel(float *Input, float *Output, float lambdaPar, float sigmaPar, float tau, int penaltytype, int N, int M)
+    { /* one explicit nonlinear-diffusion update of Output at the pixel (i, j) owned by this thread; penaltytype 1/2/3 = Huber / Perona-Malik / Tukey Biweight */
+		int i1,i2,j1,j2;
+		float e,w,n,s,e1,w1,n1,s1;
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        
+        int index = i + N*j;
+        
+        if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) {
+            
+            /* boundary conditions (Neumann reflections) */
+			i1 = i+1; if (i1 == N) i1 = i-1;
+			i2 = i-1; if (i2 < 0) i2 = i+1;
+            j1 = j+1; if (j1 == M) j1 = j-1;
+            j2 = j-1; if (j2 < 0) j2 = j+1;
+            
+		        e = Output[j*N+i1];
+                w = Output[j*N+i2];
+                n = Output[j1*N+i];
+                s = Output[j2*N+i];
+                
+                e1 = e - Output[index];
+                w1 = w - Output[index];
+                n1 = n - Output[index];
+                s1 = s - Output[index];
+                
+            if (penaltytype == 1){
+            /* Huber penalty: sign(d) when |d| > sigma, d/sigma otherwise */
+            if (abs(e1) > sigmaPar) e1 =  signNDF(e1);
+            else e1 = e1/sigmaPar;
+            
+            if (abs(w1) > sigmaPar) w1 =  signNDF(w1);
+            else w1 = w1/sigmaPar;
+            
+            if (abs(n1) > sigmaPar) n1 =  signNDF(n1);
+            else n1 = n1/sigmaPar;
+            
+            if (abs(s1) > sigmaPar) s1 =  signNDF(s1);
+            else s1 = s1/sigmaPar;
+            }
+            else if (penaltytype == 2) {
+            /* Perona-Malik: d / (1 + (d/sigma)^2) */
+            e1 = (e1)/(1.0f + pow((e1/sigmaPar),2));
+            w1 = (w1)/(1.0f + pow((w1/sigmaPar),2));
+            n1 = (n1)/(1.0f + pow((n1/sigmaPar),2));
+            s1 = (s1)/(1.0f + pow((s1/sigmaPar),2));
+            }
+            else if (penaltytype == 3) {
+            /* Tukey Biweight: d*(1 - (d/sigma)^2)^2 when |d| <= sigma, 0 otherwise */
+            if (abs(e1) <= sigmaPar) e1 =  e1*pow((1.0f - pow((e1/sigmaPar),2)), 2);
+            else e1 = 0.0f;
+            if (abs(w1) <= sigmaPar) w1 =  w1*pow((1.0f - pow((w1/sigmaPar),2)), 2);
+            else w1 = 0.0f;
+            if (abs(n1) <= sigmaPar) n1 =  n1*pow((1.0f - pow((n1/sigmaPar),2)), 2);
+            else n1 = 0.0f;
+            if (abs(s1) <= sigmaPar) s1 =  s1*pow((1.0f - pow((s1/sigmaPar),2)), 2);
+            else s1 = 0.0f;
+            }
+            else { printf("%s \n", "No penalty function selected! Use 1,2 or 3."); return; } /* skip the update, as the CPU routine does, instead of applying the raw differences */
+            /* NOTE(review): neighbours are read from the same Output array other threads write in this launch - confirm the resulting update order is acceptable */
+            Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1) - (Output[index] - Input[index])); 
+		}
+	} 
+/********************************************************************/
+/***************************3D Functions*****************************/
+/********************************************************************/
+
+__global__ void LinearDiff3D_kernel(float *Input, float *Output, float lambdaPar, float tau, int N, int M, int Z)
+    { /* one explicit linear-diffusion (heat equation) update of Output at the voxel (i, j, k) owned by this thread */
+		int i1,i2,j1,j2,k1,k2;
+		float e,w,n,s,u,d,e1,w1,n1,s1,u1,d1;
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+		int j = blockDim.y * blockIdx.y + threadIdx.y;
+		int k = blockDim.z * blockIdx.z + threadIdx.z;
+		/* i is the fastest-varying index, then j (stride N), then k (stride N*M) */
+		int index = (N*M)*k + i + N*j;        
+        
+        if ((i >= 0) && (i < N) && (j >= 0) && (j < M) && (k >= 0) && (k < Z)) {
+            
+            /* boundary conditions (Neumann reflections) */
+			i1 = i+1; if (i1 == N) i1 = i-1;
+			i2 = i-1; if (i2 < 0) i2 = i+1;
+            j1 = j+1; if (j1 == M) j1 = j-1;
+            j2 = j-1; if (j2 < 0) j2 = j+1;
+			k1 = k+1; if (k1 == Z) k1 = k-1;
+			k2 = k-1; if (k2 < 0) k2 = k+1;
+            
+		        e = Output[(N*M)*k + i1 + N*j];
+                w = Output[(N*M)*k + i2 + N*j];
+                n = Output[(N*M)*k + i + N*j1];
+                s = Output[(N*M)*k + i + N*j2];
+                u = Output[(N*M)*k1 + i + N*j];
+                d = Output[(N*M)*k2 + i + N*j];
+                
+                e1 = e - Output[index];
+                w1 = w - Output[index];
+                n1 = n - Output[index];
+                s1 = s - Output[index];
+                u1 = u - Output[index];
+                d1 = d - Output[index];
+                /* NOTE(review): neighbours are read from the same Output array other threads write in this launch - confirm the resulting update order is acceptable */
+                Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1 + u1 + d1) - (Output[index] - Input[index])); 
+		}
+	} 
+
+__global__ void NonLinearDiff3D_kernel(float *Input, float *Output, float lambdaPar, float sigmaPar, float tau, int penaltytype, int N, int M, int Z)
+    { /* one explicit nonlinear-diffusion update of Output at the voxel (i, j, k) owned by this thread; penaltytype 1/2/3 = Huber / Perona-Malik / Tukey Biweight */
+		int i1,i2,j1,j2,k1,k2;
+		float e,w,n,s,u,d,e1,w1,n1,s1,u1,d1;
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+		int j = blockDim.y * blockIdx.y + threadIdx.y;
+		int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+		int index = (N*M)*k + i + N*j;        
+        
+        if ((i >= 0) && (i < N) && (j >= 0) && (j < M) && (k >= 0) && (k < Z)) {
+            
+            /* boundary conditions (Neumann reflections) */
+			i1 = i+1; if (i1 == N) i1 = i-1;
+			i2 = i-1; if (i2 < 0) i2 = i+1;
+            j1 = j+1; if (j1 == M) j1 = j-1;
+            j2 = j-1; if (j2 < 0) j2 = j+1;
+			k1 = k+1; if (k1 == Z) k1 = k-1;
+			k2 = k-1; if (k2 < 0) k2 = k+1;
+            
+		        e = Output[(N*M)*k + i1 + N*j];
+                w = Output[(N*M)*k + i2 + N*j];
+                n = Output[(N*M)*k + i + N*j1];
+                s = Output[(N*M)*k + i + N*j2];
+                u = Output[(N*M)*k1 + i + N*j];
+                d = Output[(N*M)*k2 + i + N*j];
+                
+                e1 = e - Output[index];
+                w1 = w - Output[index];
+                n1 = n - Output[index];
+                s1 = s - Output[index];
+                u1 = u - Output[index];
+                d1 = d - Output[index];
+                
+                /* each neighbour difference is now replaced by the selected penalty's response */
+            if (penaltytype == 1){
+            /* Huber penalty: sign(diff) when |diff| > sigma, diff/sigma otherwise */
+            if (abs(e1) > sigmaPar) e1 =  signNDF(e1);
+            else e1 = e1/sigmaPar;
+            
+            if (abs(w1) > sigmaPar) w1 =  signNDF(w1);
+            else w1 = w1/sigmaPar;
+            
+            if (abs(n1) > sigmaPar) n1 =  signNDF(n1);
+            else n1 = n1/sigmaPar;
+            
+            if (abs(s1) > sigmaPar) s1 =  signNDF(s1);
+            else s1 = s1/sigmaPar;
+            
+            if (abs(u1) > sigmaPar) u1 =  signNDF(u1);
+            else u1 = u1/sigmaPar;
+            
+            if (abs(d1) > sigmaPar) d1 =  signNDF(d1);
+            else d1 = d1/sigmaPar;            
+            }
+            else if (penaltytype == 2) {
+            /* Perona-Malik: diff / (1 + (diff/sigma)^2) */
+            e1 = (e1)/(1.0f + pow((e1/sigmaPar),2));
+            w1 = (w1)/(1.0f + pow((w1/sigmaPar),2));
+            n1 = (n1)/(1.0f + pow((n1/sigmaPar),2));
+            s1 = (s1)/(1.0f + pow((s1/sigmaPar),2));
+            u1 = (u1)/(1.0f + pow((u1/sigmaPar),2));
+            d1 = (d1)/(1.0f + pow((d1/sigmaPar),2));
+            }
+            else if (penaltytype == 3) {
+            /* Tukey Biweight: diff*(1 - (diff/sigma)^2)^2 when |diff| <= sigma, 0 otherwise */
+            if (abs(e1) <= sigmaPar) e1 =  e1*pow((1.0f - pow((e1/sigmaPar),2)), 2);
+            else e1 = 0.0f;
+            if (abs(w1) <= sigmaPar) w1 =  w1*pow((1.0f - pow((w1/sigmaPar),2)), 2);
+            else w1 = 0.0f;
+            if (abs(n1) <= sigmaPar) n1 =  n1*pow((1.0f - pow((n1/sigmaPar),2)), 2);
+            else n1 = 0.0f;
+            if (abs(s1) <= sigmaPar) s1 =  s1*pow((1.0f - pow((s1/sigmaPar),2)), 2);
+            else s1 = 0.0f;
+            if (abs(u1) <= sigmaPar) u1 =  u1*pow((1.0f - pow((u1/sigmaPar),2)), 2);
+            else u1 = 0.0f;
+            if (abs(d1) <= sigmaPar) d1 =  d1*pow((1.0f - pow((d1/sigmaPar),2)), 2);
+            else d1 = 0.0f;
+            }
+            else { printf("%s \n", "No penalty function selected! Use 1,2 or 3."); return; } /* skip the update, as the CPU routine does, instead of applying the raw differences */
+            /* NOTE(review): neighbours are read from the same Output array other threads write in this launch - confirm the resulting update order is acceptable */
+            Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1 + u1 + d1) - (Output[index] - Input[index])); 
+		}
+	} 
+
+// HOST FUNCTION: copies Input to device 0, runs iterationsNumb diffusion steps (2D kernels when Z == 1, 3D otherwise)
+// and copies the result into Output. Linear diffusion is used when sigmaPar == 0, the nonlinear kernels otherwise.
+extern "C" void NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z)
+{
+	    // set up device
+		int dev = 0;
+		CHECK(cudaSetDevice(dev));
+        float *d_input, *d_output;
+        float sigmaPar2 = sigmaPar/sqrt(2.0f); // the nonlinear kernels are given sigma/sqrt(2) rather than sigma itself
+        // buffer size computed once in size_t, so the byte count is not built from an int product (the kernels still index with int)
+        const size_t nbytes = (size_t)N*(size_t)M*(size_t)Z*sizeof(float);
+        CHECK(cudaMalloc((void**)&d_input,nbytes));
+        CHECK(cudaMalloc((void**)&d_output,nbytes));
+        CHECK(cudaMemcpy(d_input,Input,nbytes,cudaMemcpyHostToDevice));
+        CHECK(cudaMemcpy(d_output,Input,nbytes,cudaMemcpyHostToDevice));      
+        // unknown penalty: report it once and run no iterations, so Output receives the unmodified copy of Input (as on the CPU)
+        if ((sigmaPar != 0.0f) && ((penaltytype < 1) || (penaltytype > 3))) { printf("%s \n", "No penalty function selected! Use 1,2 or 3."); iterationsNumb = 0; }
+	if (Z == 1) {
+	     /*2D case */ 
+        
+        dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
+        dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D));
+             
+        for(int n=0; n < iterationsNumb; n++) {
+				if (sigmaPar == 0.0f) {
+				/* linear diffusion (heat equation) */
+				LinearDiff2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_output, lambdaPar, tau, N, M);
+				CHECK(cudaDeviceSynchronize());
+				}
+				else {
+				/* nonlinear diffusion */
+				NonLinearDiff2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_output, lambdaPar, sigmaPar2, tau, penaltytype, N, M);
+				CHECK(cudaDeviceSynchronize());
+				}
+        }
+	}
+	else {
+		/*3D case*/
+        dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
+        dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKZSIZE));
+			for(int n=0; n < iterationsNumb; n++) {
+				if (sigmaPar == 0.0f) {
+				/* linear diffusion (heat equation) */
+				LinearDiff3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_output, lambdaPar, tau, N, M, Z);
+				CHECK(cudaDeviceSynchronize());
+				}
+				else {
+				/* nonlinear diffusion */
+				NonLinearDiff3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_output, lambdaPar, sigmaPar2, tau, penaltytype, N, M, Z);
+				CHECK(cudaDeviceSynchronize());
+				}
+			}
+        
+		}        
+        CHECK(cudaMemcpy(Output,d_output,nbytes,cudaMemcpyDeviceToHost));
+        CHECK(cudaFree(d_input));
+        CHECK(cudaFree(d_output));
+        cudaDeviceReset(); // NOTE(review): this resets the whole device for the calling process, not just these two buffers - confirm no other CUDA user in the process is affected
+}
diff --git a/Core/regularisers_GPU/NonlDiff_GPU_core.h b/Core/regularisers_GPU/NonlDiff_GPU_core.h
new file mode 100644
index 0000000..afd712b
--- /dev/null
+++ b/Core/regularisers_GPU/NonlDiff_GPU_core.h
@@ -0,0 +1,8 @@
+#ifndef __NonlDiffGPU_H__
+#define __NonlDiffGPU_H__
+#include "CCPiDefines.h"
+#include <stdio.h>
+/* C-linkage, CCPI_EXPORT-ed entry point; the MATLAB GPU wrapper calls it with (dimX, dimY, dimZ) as (N, M, Z), dimZ = 1 for 2D input */
+extern "C" CCPI_EXPORT void NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z);
+
+#endif 
diff --git a/Readme.md b/Readme.md
index 60c38ab..e73b4fb 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,8 +1,9 @@
 # CCPi-Regularisation Toolkit (CCPi-RGL)
 
 **Iterative image reconstruction (IIR) methods normally require regularisation to stabilise the convergence and make the reconstruction problem more well-posed. 
-CCPi-RGL software consist of 2D/3D regularisation modules for single-channel and multi-channel reconstruction problems. The modules especially suited for IIR, however,
-can also be used as image denoising iterative filters. The core modules are written in C-OMP and CUDA languages and wrappers for Matlab and Python are provided.** 
+CCPi-RGL software consists of 2D/3D regularisation modules for single-channel and multi-channel reconstruction problems. The regularisation modules are well-suited for 
+[splitting algorithms](https://en.wikipedia.org/wiki/Augmented_Lagrangian_method#Alternating_direction_method_of_multipliers), of ADMM or FISTA type. Furthermore, 
+the toolkit can be used independently to solve image denoising problems. The core modules are written in C-OMP and CUDA languages and wrappers for Matlab and Python are provided.** 
 
 <div align="center">
   <img src="docs/images/probl.png" height="225"><br>  
@@ -10,7 +11,7 @@ can also be used as image denoising iterative filters. The core modules are writ
 
 ## Prerequisites: 
 
- * MATLAB (www.mathworks.com/products/matlab/) OR
+ * [MATLAB](https://www.mathworks.com/products/matlab/) OR
  * Python (tested ver. 3.5); Cython
  * C compilers
  * nvcc (CUDA SDK) compilers
@@ -18,13 +19,14 @@ can also be used as image denoising iterative filters. The core modules are writ
 ## Package modules (regularisers):
 
 ### Single-channel
-1. Rudin-Osher-Fatemi (ROF) Total Variation (explicit PDE minimisation scheme) [2D/3D CPU/GPU]; (Ref. 1)
-2. Fast-Gradient-Projection (FGP) Total Variation [2D/3D CPU/GPU]; (Ref. 2)
-3. Split-Bregman (SB) Total Variation [2D/3D CPU/GPU]; (Ref. 4)
+1. Rudin-Osher-Fatemi (ROF) Total Variation (explicit PDE minimisation scheme) **2D/3D CPU/GPU** (Ref. *1*)
+2. Fast-Gradient-Projection (FGP) Total Variation **2D/3D CPU/GPU** (Ref. *2*)
+3. Split-Bregman (SB) Total Variation **2D/3D CPU/GPU** (Ref. *4*)
+4. Linear and nonlinear diffusion (explicit PDE minimisation scheme) **2D/3D CPU/GPU** (Ref. *6*)
 
 ### Multi-channel
-1. Fast-Gradient-Projection (FGP) Directional Total Variation [2D/3D CPU/GPU]; (Ref. 3,2)
-2. Total Nuclear Variation (TNV) penalty [2D+channels CPU]; (Ref. 5)
+1. Fast-Gradient-Projection (FGP) Directional Total Variation **2D/3D CPU/GPU** (Ref. *3,2*)
+2. Total Nuclear Variation (TNV) penalty **2D+channels CPU** (Ref. *5*)
 
 ## Installation:
 
@@ -48,11 +50,17 @@ can also be used as image denoising iterative filters. The core modules are writ
 ```
 
 ### References:
-1. Rudin, L.I., Osher, S. and Fatemi, E., 1992. Nonlinear total variation based noise removal algorithms. Physica D: nonlinear phenomena, 60(1-4), pp.259-268.
-2. Beck, A. and Teboulle, M., 2009. Fast gradient-based algorithms for constrained total variation image denoising and deblurring problems. IEEE Transactions on Image Processing, 18(11), pp.2419-2434.
-3. Ehrhardt, M.J. and Betcke, M.M., 2016. Multicontrast MRI reconstruction with structure-guided total variation. SIAM Journal on Imaging Sciences, 9(3), pp.1084-1106.
-4. Goldstein, T. and Osher, S., 2009. The split Bregman method for L1-regularized problems. SIAM journal on imaging sciences, 2(2), pp.323-343.
-5. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
+*1. Rudin, L.I., Osher, S. and Fatemi, E., 1992. Nonlinear total variation based noise removal algorithms. Physica D: nonlinear phenomena, 60(1-4), pp.259-268.*
+
+*2. Beck, A. and Teboulle, M., 2009. Fast gradient-based algorithms for constrained total variation image denoising and deblurring problems. IEEE Transactions on Image Processing, 18(11), pp.2419-2434.*
+
+*3. Ehrhardt, M.J. and Betcke, M.M., 2016. Multicontrast MRI reconstruction with structure-guided total variation. SIAM Journal on Imaging Sciences, 9(3), pp.1084-1106.*
+
+*4. Goldstein, T. and Osher, S., 2009. The split Bregman method for L1-regularized problems. SIAM journal on imaging sciences, 2(2), pp.323-343.*
+
+*5. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.*
+
+*6. Black, M.J., Sapiro, G., Marimont, D.H. and Heeger, D., 1998. Robust anisotropic diffusion. IEEE Transactions on image processing, 7(3), pp.421-432.*
 
 ### License:
 [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0)
diff --git a/Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m b/Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m
index fb55097..973d060 100644
--- a/Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m
+++ b/Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m
@@ -53,6 +53,23 @@ figure; imshow(u_sb(:,:,15), [0 1]); title('SB-TV denoised volume (CPU)');
 % tic; u_sbG = SB_TV_GPU(single(vol3D), lambda_reg, iter_sb, epsil_tol); toc; 
 % figure; imshow(u_sbG(:,:,15), [0 1]); title('SB-TV denoised volume (GPU)');
 %%
+%%
+fprintf('Denoise a volume using Nonlinear-Diffusion model (CPU) \n');
+iter_diff = 300; % number of diffusion iterations
+lambda_regDiff = 0.06; % regularisation for the diffusivity 
+sigmaPar = 0.04; % edge-preserving parameter
+tau_param = 0.025; % time-marching constant 
+tic; u_diff = NonlDiff(single(vol3D), lambda_regDiff, sigmaPar, iter_diff, tau_param, 'Huber'); toc; 
+figure; imshow(u_diff(:,:,15), [0 1]); title('Diffusion denoised volume (CPU)');
+%%
+% fprintf('Denoise a volume using Nonlinear-Diffusion model (GPU) \n');
+% iter_diff = 300; % number of diffusion iterations
+% lambda_regDiff = 0.06; % regularisation for the diffusivity 
+% sigmaPar = 0.04; % edge-preserving parameter
+% tau_param = 0.025; % time-marching constant 
+% tic; u_diff_g = NonlDiff_GPU(single(vol3D), lambda_regDiff, sigmaPar, iter_diff, tau_param, 'Huber'); toc; 
+% figure; imshow(u_diff_g(:,:,15), [0 1]); title('Diffusion denoised volume (GPU)');
+%%
 
 %>>>>>>>>>>>>>> MULTI-CHANNEL priors <<<<<<<<<<<<<<< %
 fprintf('Denoise a volume using the FGP-dTV model (CPU) \n');
diff --git a/Wrappers/Matlab/demos/demoMatlab_denoise.m b/Wrappers/Matlab/demos/demoMatlab_denoise.m
index dab98dc..4a0a19a 100644
--- a/Wrappers/Matlab/demos/demoMatlab_denoise.m
+++ b/Wrappers/Matlab/demos/demoMatlab_denoise.m
@@ -46,6 +46,22 @@ figure; imshow(u_sb, [0 1]); title('SB-TV denoised image (CPU)');
 % tic; u_sbG = SB_TV_GPU(single(u0), lambda_reg, iter_sb, epsil_tol); toc; 
 % figure; imshow(u_sbG, [0 1]); title('SB-TV denoised image (GPU)');
 %%
+fprintf('Denoise using Nonlinear-Diffusion model (CPU) \n');
+iter_diff = 800; % number of diffusion iterations
+lambda_regDiff = 0.06; % regularisation for the diffusivity 
+sigmaPar = 0.04; % edge-preserving parameter
+tau_param = 0.025; % time-marching constant 
+tic; u_diff = NonlDiff(single(u0), lambda_regDiff, sigmaPar, iter_diff, tau_param, 'Huber'); toc; 
+figure; imshow(u_diff, [0 1]); title('Diffusion denoised image (CPU)');
+%%
+% fprintf('Denoise using Nonlinear-Diffusion model (GPU) \n');
+% iter_diff = 800; % number of diffusion iterations
+% lambda_regDiff = 0.06; % regularisation for the diffusivity 
+% sigmaPar = 0.04; % edge-preserving parameter
+% tau_param = 0.025; % time-marching constant 
+% tic; u_diff_g = NonlDiff_GPU(single(u0), lambda_regDiff, sigmaPar, iter_diff, tau_param, 'Huber'); toc; 
+% figure; imshow(u_diff_g, [0 1]); title('Diffusion denoised image (GPU)');
+%%
 %>>>>>>>>>>>>>> MULTI-CHANNEL priors <<<<<<<<<<<<<<< %
 
 fprintf('Denoise using the FGP-dTV model (CPU) \n');
diff --git a/Wrappers/Matlab/mex_compile/compileCPU_mex.m b/Wrappers/Matlab/mex_compile/compileCPU_mex.m
index 9892d73..ec799bd 100644
--- a/Wrappers/Matlab/mex_compile/compileCPU_mex.m
+++ b/Wrappers/Matlab/mex_compile/compileCPU_mex.m
@@ -20,7 +20,10 @@ movefile FGP_dTV.mex* ../installed/
 mex TNV.c TNV_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
 movefile TNV.mex* ../installed/
 
-delete SB_TV_core* ROF_TV_core* FGP_TV_core* FGP_dTV_core* TNV_core* utils* CCPiDefines.h
+mex NonlDiff.c Diffusion_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
+movefile NonlDiff.mex* ../installed/
+
+delete SB_TV_core* ROF_TV_core* FGP_TV_core* FGP_dTV_core* TNV_core* utils* Diffusion_core* CCPiDefines.h
 
 fprintf('%s \n', 'All successfully compiled!');
 
diff --git a/Wrappers/Matlab/mex_compile/compileGPU_mex.m b/Wrappers/Matlab/mex_compile/compileGPU_mex.m
index 3dbeb8a..55b51eb 100644
--- a/Wrappers/Matlab/mex_compile/compileGPU_mex.m
+++ b/Wrappers/Matlab/mex_compile/compileGPU_mex.m
@@ -31,7 +31,11 @@ movefile SB_TV_GPU.mex* ../installed/
 mex -g -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart -lcufft -lmwgpu FGP_dTV_GPU.cpp dTV_FGP_GPU_core.o
 movefile FGP_dTV_GPU.mex* ../installed/
 
-delete TV_ROF_GPU_core* TV_FGP_GPU_core* TV_SB_GPU_core* dTV_FGP_GPU_core* CCPiDefines.h
+!/usr/local/cuda/bin/nvcc -O0 -c NonlDiff_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/
+mex -g -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart -lcufft -lmwgpu NonlDiff_GPU.cpp NonlDiff_GPU_core.o
+movefile NonlDiff_GPU.mex* ../installed/
+
+delete TV_ROF_GPU_core* TV_FGP_GPU_core* TV_SB_GPU_core* dTV_FGP_GPU_core* NonlDiff_GPU_core* CCPiDefines.h
 fprintf('%s \n', 'All successfully compiled!');
 
 cd ../../
diff --git a/Wrappers/Matlab/mex_compile/regularisers_CPU/NonlDiff.c b/Wrappers/Matlab/mex_compile/regularisers_CPU/NonlDiff.c
new file mode 100644
index 0000000..e05f5d4
--- /dev/null
+++ b/Wrappers/Matlab/mex_compile/regularisers_CPU/NonlDiff.c
@@ -0,0 +1,87 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "matrix.h"
+#include "mex.h"
+#include "Diffusion_core.h"
+
+/* C-OMP implementation of linear and nonlinear diffusion with the regularisation model [1] (2D/3D case)
+ * The minimisation is performed using explicit scheme.
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume
+ * 2. lambda - regularization parameter
+ * 3. Edge-preserving parameter (sigma); when sigma equals zero, nonlinear diffusion reduces to linear diffusion
+ * 4. Number of iterations, for explicit scheme >= 150 is recommended (default 300) [OPTIONAL parameter]
+ * 5. tau - time-marching step for explicit scheme (default 0.025) [OPTIONAL parameter]
+ * 6. Penalty type, given as a string: 'Huber' (default), 'PM' (Perona-Malik) or 'Tukey' (Tukey Biweight) [OPTIONAL parameter]
+ *
+ * Output:
+ * [1] Regularized image/volume
+ *
+ * This function is based on the paper by
+ * [1] Perona, P. and Malik, J., 1990. Scale-space and edge detection using anisotropic diffusion. IEEE Transactions on pattern analysis and machine intelligence, 12(7), pp.629-639.
+ */
+
+void mexFunction(
+        int nlhs, mxArray *plhs[],
+        int nrhs, const mxArray *prhs[])
+{
+    /* MEX gateway for linear/nonlinear diffusion (CPU): checks the MATLAB arguments,
+     * creates the single-precision output array and calls Diffusion_CPU_main.
+     * prhs: image/volume (single), lambda, sigma [, iterations [, tau [, 'Huber'|'PM'|'Tukey']]] */
+    int number_of_dims, iter_numb, dimX, dimY, dimZ, penaltytype;
+    const mwSize *dim_array; /* mxGetDimensions returns mwSize*, which is not int* on large-array builds */
+    float *Input, *Output=NULL, lambda, tau, sigma;
+
+    /* validate the argument count and the input class BEFORE reading prhs[0..2] */
+    if ((nrhs < 3) || (nrhs > 6)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter, Edge-preserving parameter, iterations number, time-marching constant, penalty type - Huber, PM or Tukey");
+    if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); }
+
+    dim_array = mxGetDimensions(prhs[0]);
+    number_of_dims = (int) mxGetNumberOfDimensions(prhs[0]);
+    if ((number_of_dims != 2) && (number_of_dims != 3)) mexErrMsgTxt("The input must be a 2D image or a 3D volume");
+
+    /*Handling Matlab input data*/
+    Input  = (float *) mxGetData(prhs[0]);
+    lambda =  (float) mxGetScalar(prhs[1]); /* regularization parameter */
+    sigma = (float) mxGetScalar(prhs[2]); /* Edge-preserving parameter */
+    iter_numb = 300; /* iterations number (default) */
+    tau = 0.025; /* marching step parameter (default) */
+    penaltytype = 1; /* Huber penalty by default */
+
+    if (nrhs >= 4)  iter_numb = (int) mxGetScalar(prhs[3]); /* iterations number */
+    if (nrhs >= 5)  tau =  (float) mxGetScalar(prhs[4]); /* marching step parameter */
+    if (nrhs == 6)  {
+        char *penalty_type = mxArrayToString(prhs[5]); /* NULL when prhs[5] is not a char array */
+        if ((penalty_type == NULL) || ((strcmp(penalty_type, "Huber") != 0) && (strcmp(penalty_type, "PM") != 0) && (strcmp(penalty_type, "Tukey") != 0))) mexErrMsgTxt("Choose penalty: 'Huber', 'PM' or 'Tukey',");
+        if (strcmp(penalty_type, "Huber") == 0)  penaltytype = 1;  /* enable 'Huber' penalty */
+        if (strcmp(penalty_type, "PM") == 0)  penaltytype = 2;  /* enable Perona-Malik penalty */
+        if (strcmp(penalty_type, "Tukey") == 0)  penaltytype = 3;  /* enable Tukey Biweight penalty */
+        mxFree(penalty_type);
+    }
+
+    /* dim_array holds only number_of_dims entries: never read dim_array[2] for a 2D image */
+    dimX = (int) dim_array[0]; dimY = (int) dim_array[1];
+    dimZ = (number_of_dims == 3) ? (int) dim_array[2] : 1;
+
+    /* output image/volume, same size and class as the input */
+    Output = (float*) mxGetData(plhs[0] = mxCreateNumericArray(number_of_dims, dim_array, mxSINGLE_CLASS, mxREAL));
+
+    Diffusion_CPU_main(Input, Output, lambda, sigma, iter_numb, tau, penaltytype, dimX, dimY, dimZ);
+}
+\ No newline at end of file
diff --git a/Wrappers/Matlab/mex_compile/regularisers_GPU/NonlDiff_GPU.cpp b/Wrappers/Matlab/mex_compile/regularisers_GPU/NonlDiff_GPU.cpp
new file mode 100644
index 0000000..bfba9ea
--- /dev/null
+++ b/Wrappers/Matlab/mex_compile/regularisers_GPU/NonlDiff_GPU.cpp
@@ -0,0 +1,90 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "matrix.h"
+#include "mex.h"
+#include <stdio.h>
+#include <string.h>
+#include "NonlDiff_GPU_core.h"
+
+/* CUDA implementation of linear and nonlinear diffusion with the regularisation model [1,2] (2D/3D case)
+ * The minimisation is performed using explicit scheme. 
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume 
+ * 2. lambda - regularization parameter
+ * 3. Edge-preserving parameter (sigma); when sigma equals zero, nonlinear diffusion reduces to linear diffusion
+ * 4. Number of iterations, for explicit scheme >= 150 is recommended (default 300) [OPTIONAL parameter]
+ * 5. tau - time-marching step for explicit scheme (default 0.025) [OPTIONAL parameter]
+ * 6. Penalty type, given as a string: 'Huber' (default), 'PM' (Perona-Malik) or 'Tukey' (Tukey Biweight) [OPTIONAL parameter]
+ *
+ * Output:
+ * [1] Regularized image/volume 
+ *
+ * This function is based on the paper by
+ * [1] Perona, P. and Malik, J., 1990. Scale-space and edge detection using anisotropic diffusion. IEEE Transactions on pattern analysis and machine intelligence, 12(7), pp.629-639.
+ * [2] Black, M.J., Sapiro, G., Marimont, D.H. and Heeger, D., 1998. Robust anisotropic diffusion. IEEE Transactions on image processing, 7(3), pp.421-432.
+ */
+
+void mexFunction(
+        int nlhs, mxArray *plhs[],
+        int nrhs, const mxArray *prhs[])
+{
+    /* MEX gateway for linear/nonlinear diffusion (GPU): checks the MATLAB arguments,
+     * creates the single-precision output array and calls NonlDiff_GPU_main.
+     * prhs: image/volume (single), lambda, sigma [, iterations [, tau [, 'Huber'|'PM'|'Tukey']]] */
+    int number_of_dims, iter_numb, dimX, dimY, dimZ, penaltytype;
+    const mwSize *dim_array; /* mxGetDimensions returns mwSize*; assigning it to int* does not compile in C++ when mwSize != int */
+    float *Input, *Output=NULL, lambda, tau, sigma;
+
+    /* validate the argument count and the input class BEFORE reading prhs[0..2] */
+    if ((nrhs < 3) || (nrhs > 6)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter, Edge-preserving parameter, iterations number, time-marching constant, penalty type - Huber, PM or Tukey");
+    if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); }
+
+    dim_array = mxGetDimensions(prhs[0]);
+    number_of_dims = (int) mxGetNumberOfDimensions(prhs[0]);
+    if ((number_of_dims != 2) && (number_of_dims != 3)) mexErrMsgTxt("The input must be a 2D image or a 3D volume");
+
+    /*Handling Matlab input data*/
+    Input  = (float *) mxGetData(prhs[0]);
+    lambda =  (float) mxGetScalar(prhs[1]); /* regularization parameter */
+    sigma = (float) mxGetScalar(prhs[2]); /* Edge-preserving parameter */
+    iter_numb = 300; /* iterations number (default) */
+    tau = 0.025; /* marching step parameter (default) */
+    penaltytype = 1; /* Huber penalty by default */
+
+    if (nrhs >= 4)  iter_numb = (int) mxGetScalar(prhs[3]); /* iterations number */
+    if (nrhs >= 5)  tau =  (float) mxGetScalar(prhs[4]); /* marching step parameter */
+    if (nrhs == 6)  {
+        char *penalty_type = mxArrayToString(prhs[5]); /* NULL when prhs[5] is not a char array */
+        if ((penalty_type == NULL) || ((strcmp(penalty_type, "Huber") != 0) && (strcmp(penalty_type, "PM") != 0) && (strcmp(penalty_type, "Tukey") != 0))) mexErrMsgTxt("Choose penalty: 'Huber', 'PM' or 'Tukey',");
+        if (strcmp(penalty_type, "Huber") == 0)  penaltytype = 1;  /* enable 'Huber' penalty */
+        if (strcmp(penalty_type, "PM") == 0)  penaltytype = 2;  /* enable Perona-Malik penalty */
+        if (strcmp(penalty_type, "Tukey") == 0)  penaltytype = 3;  /* enable Tukey Biweight penalty */
+        mxFree(penalty_type);
+    }
+
+    /* dim_array holds only number_of_dims entries: never read dim_array[2] for a 2D image */
+    dimX = (int) dim_array[0]; dimY = (int) dim_array[1];
+    dimZ = (number_of_dims == 3) ? (int) dim_array[2] : 1;
+
+    /* output image/volume, same size and class as the input */
+    Output = (float*) mxGetData(plhs[0] = mxCreateNumericArray(number_of_dims, dim_array, mxSINGLE_CLASS, mxREAL));
+
+    NonlDiff_GPU_main(Input, Output, lambda, sigma, iter_numb, tau, penaltytype, dimX, dimY, dimZ);
+}
+\ No newline at end of file
diff --git a/Wrappers/Matlab/mex_compile/regularisers_GPU/ROF_TV_GPU.cpp b/Wrappers/Matlab/mex_compile/regularisers_GPU/ROF_TV_GPU.cpp
index 7bbe3af..f60ba7b 100644
--- a/Wrappers/Matlab/mex_compile/regularisers_GPU/ROF_TV_GPU.cpp
+++ b/Wrappers/Matlab/mex_compile/regularisers_GPU/ROF_TV_GPU.cpp
@@ -37,7 +37,6 @@
  *
  * D. Kazantsev, 2016-18
  */
-
 void mexFunction(
         int nlhs, mxArray *plhs[],
         int nrhs, const mxArray *prhs[])
diff --git a/Wrappers/Python/ccpi/filters/regularisers.py b/Wrappers/Python/ccpi/filters/regularisers.py
index e6814e8..eec8c4d 100644
--- a/Wrappers/Python/ccpi/filters/regularisers.py
+++ b/Wrappers/Python/ccpi/filters/regularisers.py
@@ -2,8 +2,8 @@
 script which assigns a proper device core function based on a flag ('cpu' or 'gpu')
 """
 
-from ccpi.filters.cpu_regularisers_cython import TV_ROF_CPU, TV_FGP_CPU, TV_SB_CPU, dTV_FGP_CPU, TNV_CPU 
-from ccpi.filters.gpu_regularisers import TV_ROF_GPU, TV_FGP_GPU, TV_SB_GPU, dTV_FGP_GPU
+from ccpi.filters.cpu_regularisers_cython import TV_ROF_CPU, TV_FGP_CPU, TV_SB_CPU, dTV_FGP_CPU, TNV_CPU, NDF_CPU
+from ccpi.filters.gpu_regularisers import TV_ROF_GPU, TV_FGP_GPU, TV_SB_GPU, dTV_FGP_GPU, NDF_GPU
 
 def ROF_TV(inputData, regularisation_parameter, iterations,
                      time_marching_parameter,device='cpu'):
@@ -91,3 +91,22 @@ def TNV(inputData, regularisation_parameter, iterations, tolerance_param):
                      regularisation_parameter,
                      iterations, 
                      tolerance_param)
+def NDF(inputData, regularisation_parameter, edge_parameter, iterations,
+                     time_marching_parameter, penalty_type, device='cpu'):
+    if device == 'cpu':
+        return NDF_CPU(inputData,
+                     regularisation_parameter,
+                     edge_parameter,
+                     iterations, 
+                     time_marching_parameter,
+                     penalty_type)
+    elif device == 'gpu':
+        return NDF_GPU(inputData,
+                     regularisation_parameter,
+                     edge_parameter,
+                     iterations, 
+                     time_marching_parameter,
+                     penalty_type)
+    else:
+        raise ValueError('Unknown device {0}. Expecting gpu or cpu'\
+                         .format(device))
diff --git a/Wrappers/Python/demos/demo_cpu_regularisers.py b/Wrappers/Python/demos/demo_cpu_regularisers.py
index 7443b83..3567f91 100644
--- a/Wrappers/Python/demos/demo_cpu_regularisers.py
+++ b/Wrappers/Python/demos/demo_cpu_regularisers.py
@@ -12,7 +12,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import os
 import timeit
-from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, FGP_dTV, TNV
+from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, FGP_dTV, TNV, NDF
 from qualitymetrics import rmse
 ###############################################################################
 def printParametersToString(pars):
@@ -190,11 +190,58 @@ plt.title('{}'.format('CPU results'))
 
 
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
-print ("_____________FGP-dTV (2D)__________________")
+print ("________________NDF (2D)___________________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
 fig = plt.figure(4)
+plt.suptitle('Performance of NDF regulariser using the CPU')
+a=fig.add_subplot(1,2,1)
+a.set_title('Noisy Image')
+imgplot = plt.imshow(u0,cmap="gray")
+
+# set parameters
+pars = {'algorithm' : NDF, \
+        'input' : u0,\
+        'regularisation_parameter':0.06, \
+        'edge_parameter':0.04,\
+        'number_of_iterations' :1000 ,\
+        'time_marching_parameter':0.025,\
+        'penalty_type':1
+        }
+        
+print ("#############NDF CPU################")
+start_time = timeit.default_timer()
+ndf_cpu = NDF(pars['input'], 
+              pars['regularisation_parameter'],
+              pars['edge_parameter'], 
+              pars['number_of_iterations'],
+              pars['time_marching_parameter'], 
+              pars['penalty_type'],'cpu')  
+             
+rms = rmse(Im, ndf_cpu)
+pars['rmse'] = rms
+
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(1,2,2)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place a text box in upper left in axes coords
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(ndf_cpu, cmap="gray")
+plt.title('{}'.format('CPU results'))
+
+
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("_____________FGP-dTV (2D)__________________")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+
+## plot 
+fig = plt.figure(5)
 plt.suptitle('Performance of FGP-dTV regulariser using the CPU')
 a=fig.add_subplot(1,2,1)
 a.set_title('Noisy Image')
@@ -247,7 +294,7 @@ print ("__________Total nuclear Variation__________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
-fig = plt.figure(5)
+fig = plt.figure(6)
 plt.suptitle('Performance of TNV regulariser using the CPU')
 a=fig.add_subplot(1,2,1)
 a.set_title('Noisy Image')
@@ -321,7 +368,7 @@ print ("_______________ROF-TV (3D)_________________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
-fig = plt.figure(6)
+fig = plt.figure(7)
 plt.suptitle('Performance of ROF-TV regulariser using the CPU')
 a=fig.add_subplot(1,2,1)
 a.set_title('Noisy 15th slice of a volume')
@@ -361,7 +408,7 @@ print ("_______________FGP-TV (3D)__________________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
-fig = plt.figure(7)
+fig = plt.figure(8)
 plt.suptitle('Performance of FGP-TV regulariser using the CPU')
 a=fig.add_subplot(1,2,1)
 a.set_title('Noisy Image')
@@ -410,7 +457,7 @@ print ("_______________SB-TV (3D)_________________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
-fig = plt.figure(8)
+fig = plt.figure(9)
 plt.suptitle('Performance of SB-TV regulariser using the CPU')
 a=fig.add_subplot(1,2,1)
 a.set_title('Noisy Image')
@@ -451,13 +498,58 @@ a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
 imgplot = plt.imshow(sb_cpu3D[10,:,:], cmap="gray")
 plt.title('{}'.format('Recovered volume on the CPU using SB-TV'))
 
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("________________NDF (3D)___________________")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+
+## plot 
+fig = plt.figure(10)
+plt.suptitle('Performance of NDF regulariser using the CPU')
+a=fig.add_subplot(1,2,1)
+a.set_title('Noisy volume')
+imgplot = plt.imshow(noisyVol[10,:,:],cmap="gray")
+
+# set parameters
+pars = {'algorithm' : NDF, \
+        'input' : noisyVol,\
+        'regularisation_parameter':0.06, \
+        'edge_parameter':0.04,\
+        'number_of_iterations' :1000 ,\
+        'time_marching_parameter':0.025,\
+        'penalty_type':  1
+        }
+        
+print ("#############NDF CPU################")
+start_time = timeit.default_timer()
+ndf_cpu3D = NDF(pars['input'], 
+              pars['regularisation_parameter'],
+              pars['edge_parameter'], 
+              pars['number_of_iterations'],
+              pars['time_marching_parameter'], 
+              pars['penalty_type'])  
+             
+rms = rmse(idealVol, ndf_cpu3D)
+pars['rmse'] = rms
+
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(1,2,2)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place a text box in upper left in axes coords
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(ndf_cpu3D[10,:,:], cmap="gray")
+plt.title('{}'.format('Recovered volume on the CPU using NDF iterations'))
 
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 print ("_______________FGP-dTV (3D)__________________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
-fig = plt.figure(9)
+fig = plt.figure(11)
 plt.suptitle('Performance of FGP-dTV regulariser using the CPU')
 a=fig.add_subplot(1,2,1)
 a.set_title('Noisy Image')
diff --git a/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py b/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py
index d8e2da7..05db23e 100644
--- a/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py
+++ b/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py
@@ -12,7 +12,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import os
 import timeit
-from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, FGP_dTV
+from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, FGP_dTV, NDF
 from qualitymetrics import rmse
 ###############################################################################
 def printParametersToString(pars):
@@ -306,11 +306,98 @@ else:
 
 
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
-print ("____________FGP-dTV bench___________________")
+print ("_______________NDF bench___________________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
 fig = plt.figure(4)
+plt.suptitle('Comparison of NDF regulariser using CPU and GPU implementations')
+a=fig.add_subplot(1,4,1)
+a.set_title('Noisy Image')
+imgplot = plt.imshow(u0,cmap="gray")
+
+# set parameters
+pars = {'algorithm' : NDF, \
+        'input' : u0,\
+        'regularisation_parameter':0.06, \
+        'edge_parameter':0.04,\
+        'number_of_iterations' :1000 ,\
+        'time_marching_parameter':0.025,\
+        'penalty_type':  1
+        }
+        
+print ("#############NDF CPU####################")
+start_time = timeit.default_timer()
+ndf_cpu = NDF(pars['input'], 
+              pars['regularisation_parameter'],
+              pars['edge_parameter'], 
+              pars['number_of_iterations'],
+              pars['time_marching_parameter'], 
+              pars['penalty_type'],'cpu')
+             
+rms = rmse(Im, ndf_cpu)
+pars['rmse'] = rms
+
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(1,4,2)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place a text box in upper left in axes coords
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(ndf_cpu, cmap="gray")
+plt.title('{}'.format('CPU results'))
+
+
+print ("##############NDF GPU##################")
+start_time = timeit.default_timer()
+ndf_gpu = NDF(pars['input'], 
+              pars['regularisation_parameter'],
+              pars['edge_parameter'], 
+              pars['number_of_iterations'],
+              pars['time_marching_parameter'], 
+              pars['penalty_type'],'gpu')
+             
+rms = rmse(Im, ndf_gpu)
+pars['rmse'] = rms
+pars['algorithm'] = NDF
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(1,4,3)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place a text box in upper left in axes coords
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(ndf_gpu, cmap="gray")
+plt.title('{}'.format('GPU results'))
+
+print ("--------Compare the results--------")
+tolerance = 1e-05
+diff_im = np.zeros(np.shape(rof_cpu))
+diff_im = abs(ndf_cpu - ndf_gpu)
+diff_im[diff_im > tolerance] = 1
+a=fig.add_subplot(1,4,4)
+imgplot = plt.imshow(diff_im, vmin=0, vmax=1, cmap="gray")
+plt.title('{}'.format('Pixels larger threshold difference'))
+if (diff_im.sum() > 1):
+    print ("Arrays do not match!")
+else:
+    print ("Arrays match")
+
+
+
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("____________FGP-dTV bench___________________")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+
+## plot 
+fig = plt.figure(5)
 plt.suptitle('Comparison of FGP-dTV regulariser using CPU and GPU implementations')
 a=fig.add_subplot(1,4,1)
 a.set_title('Noisy Image')
diff --git a/Wrappers/Python/demos/demo_gpu_regularisers.py b/Wrappers/Python/demos/demo_gpu_regularisers.py
index 25d8d85..b873700 100644
--- a/Wrappers/Python/demos/demo_gpu_regularisers.py
+++ b/Wrappers/Python/demos/demo_gpu_regularisers.py
@@ -12,7 +12,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import os
 import timeit
-from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, FGP_dTV
+from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, FGP_dTV, NDF
 from qualitymetrics import rmse
 ###############################################################################
 def printParametersToString(pars):
@@ -50,7 +50,7 @@ u0 = u0.astype('float32')
 u_ref = u_ref.astype('float32')
 
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
-print ("____________ROF-TV bench___________________")
+print ("____________ROF-TV regulariser_____________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
@@ -92,7 +92,7 @@ plt.title('{}'.format('GPU results'))
 
 
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
-print ("____________FGP-TV bench___________________")
+print ("____________FGP-TV regulariser_____________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
@@ -141,7 +141,7 @@ plt.title('{}'.format('GPU results'))
 
 
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
-print ("____________SB-TV bench___________________")
+print ("____________SB-TV regulariser______________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
@@ -186,12 +186,60 @@ a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
 imgplot = plt.imshow(sb_gpu, cmap="gray")
 plt.title('{}'.format('GPU results'))
 
+
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
-print ("____________FGP-dTV bench___________________")
+print ("_______________NDF regulariser_____________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
 fig = plt.figure(4)
+plt.suptitle('Performance of the NDF regulariser using the GPU')
+a=fig.add_subplot(1,2,1)
+a.set_title('Noisy Image')
+imgplot = plt.imshow(u0,cmap="gray")
+
+# set parameters
+pars = {'algorithm' : NDF, \
+        'input' : u0,\
+        'regularisation_parameter':0.06, \
+        'edge_parameter':0.04,\
+        'number_of_iterations' :1000 ,\
+        'time_marching_parameter':0.025,\
+        'penalty_type':  1
+        }
+
+print ("##############NDF GPU##################")
+start_time = timeit.default_timer()
+ndf_gpu = NDF(pars['input'], 
+              pars['regularisation_parameter'],
+              pars['edge_parameter'], 
+              pars['number_of_iterations'],
+              pars['time_marching_parameter'], 
+              pars['penalty_type'],'gpu')  
+             
+rms = rmse(Im, ndf_gpu)
+pars['rmse'] = rms
+pars['algorithm'] = NDF
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(1,2,2)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place the text box at (0.15, 0.25) in axes coords (lower-left region, anchored at its top edge)
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(ndf_gpu, cmap="gray")
+plt.title('{}'.format('GPU results'))
+
+
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("____________FGP-dTV bench___________________")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+
+## plot 
+fig = plt.figure(5)
 plt.suptitle('Performance of the FGP-dTV regulariser using the GPU')
 a=fig.add_subplot(1,2,1)
 a.set_title('Noisy Image')
@@ -266,7 +314,7 @@ print ("_______________ROF-TV (3D)_________________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
-fig = plt.figure(5)
+fig = plt.figure(6)
 plt.suptitle('Performance of ROF-TV regulariser using the GPU')
 a=fig.add_subplot(1,2,1)
 a.set_title('Noisy 15th slice of a volume')
@@ -306,7 +354,7 @@ print ("_______________FGP-TV (3D)__________________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
-fig = plt.figure(6)
+fig = plt.figure(7)
 plt.suptitle('Performance of FGP-TV regulariser using the GPU')
 a=fig.add_subplot(1,2,1)
 a.set_title('Noisy Image')
@@ -354,7 +402,7 @@ print ("_______________SB-TV (3D)__________________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
-fig = plt.figure(7)
+fig = plt.figure(8)
 plt.suptitle('Performance of SB-TV regulariser using the GPU')
 a=fig.add_subplot(1,2,1)
 a.set_title('Noisy Image')
@@ -395,12 +443,60 @@ a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
 imgplot = plt.imshow(sb_gpu3D[10,:,:], cmap="gray")
 plt.title('{}'.format('Recovered volume on the GPU using SB-TV'))
 
+
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("_______________NDF (3D)____________________")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+
+## plot 
+fig = plt.figure(9)
+plt.suptitle('Performance of NDF regulariser using the GPU')
+a=fig.add_subplot(1,2,1)
+a.set_title('Noisy Image')
+imgplot = plt.imshow(noisyVol[10,:,:],cmap="gray")
+
+# set parameters
+pars = {'algorithm' : NDF, \
+        'input' : noisyVol,\
+        'regularisation_parameter':0.06, \
+        'edge_parameter':0.04,\
+        'number_of_iterations' :1000 ,\
+        'time_marching_parameter':0.025,\
+        'penalty_type':  1
+        }
+
+print ("#############NDF GPU####################")
+start_time = timeit.default_timer()
+ndf_gpu3D = NDF(pars['input'], 
+              pars['regularisation_parameter'],
+              pars['edge_parameter'], 
+              pars['number_of_iterations'],
+              pars['time_marching_parameter'], 
+              pars['penalty_type'],'gpu')
+
+rms = rmse(idealVol, ndf_gpu3D)
+pars['rmse'] = rms
+
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(1,2,2)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place the text box at (0.15, 0.25) in axes coords (lower-left region, anchored at its top edge)
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(ndf_gpu3D[10,:,:], cmap="gray")
+plt.title('{}'.format('Recovered volume on the GPU using NDF'))
+
+
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 print ("_______________FGP-dTV (3D)________________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
 ## plot 
-fig = plt.figure(8)
+fig = plt.figure(10)
 plt.suptitle('Performance of FGP-dTV regulariser using the GPU')
 a=fig.add_subplot(1,2,1)
 a.set_title('Noisy Image')
diff --git a/Wrappers/Python/setup-regularisers.py.in b/Wrappers/Python/setup-regularisers.py.in
index 0681cc4..b900efe 100644
--- a/Wrappers/Python/setup-regularisers.py.in
+++ b/Wrappers/Python/setup-regularisers.py.in
@@ -37,6 +37,7 @@ extra_include_dirs += [os.path.join(".." , ".." , "Core"),
                        os.path.join(".." , ".." , "Core",  "regularisers_GPU" , "TV_FGP" ) , 
                        os.path.join(".." , ".." , "Core",  "regularisers_GPU" , "TV_ROF" ) , 
                        os.path.join(".." , ".." , "Core",  "regularisers_GPU" , "TV_SB" ) ,
+                       os.path.join(".." , ".." , "Core",  "regularisers_GPU" , "NDF" ) ,
                        os.path.join(".." , ".." , "Core",  "regularisers_GPU" , "dTV_FGP" ) , 
 						   "."]
 
diff --git a/Wrappers/Python/src/cpu_regularisers.pyx b/Wrappers/Python/src/cpu_regularisers.pyx
index abbf3b0..7ed8fa1 100644
--- a/Wrappers/Python/src/cpu_regularisers.pyx
+++ b/Wrappers/Python/src/cpu_regularisers.pyx
@@ -21,10 +21,10 @@ cimport numpy as np
 cdef extern float TV_ROF_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ);
 cdef extern float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
 cdef extern float SB_TV_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ);
+cdef extern float Diffusion_CPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int dimX, int dimY, int dimZ);
 cdef extern float TNV_CPU_main(float *Input, float *u, float lambdaPar, int maxIter, float tol, int dimX, int dimY, int dimZ);
 cdef extern float dTV_FGP_CPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
 
-
 #****************************************************************#
 #********************** Total-variation ROF *********************#
 #****************************************************************#
@@ -275,3 +275,47 @@ def TNV_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
     # Run TNV iterations for 3D (X,Y,Channels) data 
     TNV_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, iterationsNumb, tolerance_param, dims[2], dims[1], dims[0])
     return outputData
+#****************************************************************#
+#***************Nonlinear (Isotropic) Diffusion******************#
+#****************************************************************#
+def NDF_CPU(inputData, regularisation_parameter, edge_parameter, iterationsNumb,time_marching_parameter, penalty_type):
+    # Pick the CPU wrapper by dimensionality; any ndim other than 2 or 3 returns None.
+    cpu_wrapper = {2: NDF_2D, 3: NDF_3D}.get(inputData.ndim)
+    if cpu_wrapper is not None:
+        return cpu_wrapper(inputData, regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type)
+
+def NDF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
+                     float regularisation_parameter,
+                     float edge_parameter,
+                     int iterationsNumb,                     
+                     float time_marching_parameter,
+                     int penalty_type):
+    cdef long dims[2]
+    dims[0] = inputData.shape[0]
+    dims[1] = inputData.shape[1]
+    # 2D CPU wrapper: allocate a zero-filled float32 output of the input's shape, run the C routine, return the output
+    cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \
+            np.zeros([dims[0],dims[1]], dtype='float32')   
+    # NOTE(review): the first axis (dims[0]) is passed as dimX here, whereas NDF_3D passes its last axis as dimX - confirm for non-square images
+    # Run Nonlinear Diffusion iterations for 2D data (the trailing 1 is dimZ)
+    Diffusion_CPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[0], dims[1], 1)    
+    return outputData
+            
+def NDF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
+                     float regularisation_parameter,
+                     float edge_parameter,
+                     int iterationsNumb,                     
+                     float time_marching_parameter,
+                     int penalty_type):
+    cdef long dims[3]
+    dims[0] = inputData.shape[0]
+    dims[1] = inputData.shape[1]
+    dims[2] = inputData.shape[2]
+    # 3D CPU wrapper: allocate a zero-filled float32 output of the input's shape, run the C routine, return the output
+    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \
+            np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
+    # The array axes are handed over in reverse order: dimX=dims[2], dimY=dims[1], dimZ=dims[0]
+    # Run Nonlinear Diffusion iterations for 3D data
+    Diffusion_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[2], dims[1], dims[0])
+
+    return outputData
diff --git a/Wrappers/Python/src/gpu_regularisers.pyx b/Wrappers/Python/src/gpu_regularisers.pyx
index 36eec95..b0775054 100644
--- a/Wrappers/Python/src/gpu_regularisers.pyx
+++ b/Wrappers/Python/src/gpu_regularisers.pyx
@@ -21,6 +21,7 @@ cimport numpy as np
 cdef extern void TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
 cdef extern void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
 cdef extern void TV_SB_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int printM, int N, int M, int Z);
+cdef extern void NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z);
 cdef extern void dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int printM, int N, int M, int Z);
 
 # Total-variation Rudin-Osher-Fatemi (ROF)
@@ -114,6 +115,27 @@ def dTV_FGP_GPU(inputData,
                      methodTV,
                      nonneg,
                      printM)
+# Nonlinear (Isotropic) Diffusion (NDF)
+def NDF_GPU(inputData,
+                     regularisation_parameter,
+                     edge_parameter,
+                     iterations, 
+                     time_marching_parameter,
+                     penalty_type):
+    # Dispatch on the dimensionality of inputData:
+    #   ndim == 2 -> NDF_GPU_2D
+    #   ndim == 3 -> NDF_GPU_3D
+    # For any other ndim no wrapper is called and None is returned.
+    number_of_dims = inputData.ndim
+    if number_of_dims == 2:
+        gpu_wrapper = NDF_GPU_2D
+    elif number_of_dims == 3:
+        gpu_wrapper = NDF_GPU_3D
+    else:
+        return None
+    return gpu_wrapper(inputData, regularisation_parameter,
+                       edge_parameter, iterations,
+                       time_marching_parameter, penalty_type)
 #****************************************************************#
 #********************** Total-variation ROF *********************#
 #****************************************************************#
@@ -336,3 +358,48 @@ def FGPdTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
                        printM,
                        dims[2], dims[1], dims[0]);
     return outputData 
+
+#****************************************************************#
+#***************Nonlinear (Isotropic) Diffusion******************#
+#****************************************************************#
+def NDF_GPU_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
+                     float regularisation_parameter,
+                     float edge_parameter,
+                     int iterationsNumb,                     
+                     float time_marching_parameter,
+                     int penalty_type):
+    cdef long dims[2]
+    dims[0] = inputData.shape[0]
+    dims[1] = inputData.shape[1]
+    # 2D GPU wrapper: allocate a zero-filled float32 output of the input's shape, run the CUDA routine, return the output
+    cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \
+            np.zeros([dims[0],dims[1]], dtype='float32')
+    # NOTE(review): the first axis (dims[0]) is passed as N here, whereas NDF_GPU_3D passes its last axis as N - confirm for non-square images
+    # NOTE(review): penalty_type is not validated before the CUDA call. The range check that was
+    # sketched here is disabled; its condition `penalty_type < 1 and penalty_type > 3` can never be true.
+    # Intended message: 'Choose penalty type as 1 for Huber, 2 - Perona-Malik, 3 - Tukey Biweight'
+    # 
+    # Run Nonlinear Diffusion iterations for 2D data
+    # Running CUDA code here (the trailing 1 is the Z dimension)
+    NonlDiff_GPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[0], dims[1], 1)    
+    return outputData
+            
+def NDF_GPU_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
+                     float regularisation_parameter,
+                     float edge_parameter,
+                     int iterationsNumb,                     
+                     float time_marching_parameter,
+                     int penalty_type):
+    cdef long dims[3]
+    dims[0] = inputData.shape[0]
+    dims[1] = inputData.shape[1]
+    dims[2] = inputData.shape[2]
+    # 3D GPU wrapper: allocate a zero-filled float32 output of the input's shape, run the CUDA routine, return the output
+    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \
+            np.zeros([dims[0],dims[1],dims[2]], dtype='float32')    
+    # The array axes are handed over in reverse order: N=dims[2], M=dims[1], Z=dims[0]
+    # Run Nonlinear Diffusion iterations for 3D data
+    # Running CUDA code here
+    NonlDiff_GPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[2], dims[1], dims[0])
+
+    return outputData