diff options
Diffstat (limited to 'src')
60 files changed, 3597 insertions, 2651 deletions
diff --git a/src/Core/regularisers_CPU/Diffus4th_order_core.c b/src/Core/regularisers_CPU/Diffus4th_order_core.c index 01f4f64..28ac8a9 100644 --- a/src/Core/regularisers_CPU/Diffus4th_order_core.c +++ b/src/Core/regularisers_CPU/Diffus4th_order_core.c @@ -23,61 +23,85 @@ #define EPS 1.0e-7 /* C-OMP implementation of fourth-order diffusion scheme [1] for piecewise-smooth recovery (2D/3D case) - * The minimisation is performed using explicit scheme. + * The minimisation is performed using explicit scheme. * * Input Parameters: - * 1. Noisy image/volume + * 1. Noisy image/volume * 2. lambda - regularization parameter * 3. Edge-preserving parameter (sigma) - * 4. Number of iterations, for explicit scheme >= 150 is recommended + * 4. Number of iterations, for explicit scheme >= 150 is recommended * 5. tau - time-marching step for the explicit scheme + * 6. eplsilon: tolerance constant * * Output: - * [1] Regularized image/volume + * [1] Regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Hajiaboli, M.R., 2011. An anisotropic fourth-order diffusion filter for image noise removal. International Journal of Computer Vision, 92(2), pp.177-191. */ -float Diffus4th_CPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ) +float Diffus4th_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ) { - int i,DimTotal; - float sigmaPar2; - float *W_Lapl=NULL; + int i,DimTotal,j,count; + float sigmaPar2, re, re1; + re = 0.0f; re1 = 0.0f; + count = 0; + float *W_Lapl=NULL, *Output_prev=NULL; sigmaPar2 = sigmaPar*sigmaPar; DimTotal = dimX*dimY*dimZ; - + W_Lapl = calloc(DimTotal, sizeof(float)); - + + if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float)); + /* copy into output */ copyIm(Input, Output, (long)(dimX), (long)(dimY), (long)(dimZ)); - - if (dimZ == 1) { - /* running 2D diffusion iterations */ + for(i=0; i < iterationsNumb; i++) { + if ((epsil != 0.0f) && (i % 5 == 0)) copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); + + if (dimZ == 1) { + /* running 2D diffusion iterations */ /* Calculating weighted Laplacian */ Weighted_Laplc2D(W_Lapl, Output, sigmaPar2, dimX, dimY); /* Perform iteration step */ Diffusion_update_step2D(Output, Input, W_Lapl, lambdaPar, sigmaPar2, tau, (long)(dimX), (long)(dimY)); - } - } - else { - /* running 3D diffusion iterations */ - for(i=0; i < iterationsNumb; i++) { - /* Calculating weighted Laplacian */ + } + else { + /* running 3D diffusion iterations */ + /* Calculating weighted Laplacian */ Weighted_Laplc3D(W_Lapl, Output, sigmaPar2, dimX, dimY, dimZ); /* Perform iteration step */ Diffusion_update_step3D(Output, Input, W_Lapl, lambdaPar, sigmaPar2, tau, (long)(dimX), (long)(dimY), (long)(dimZ)); - } - } - free(W_Lapl); - return *Output; + } + + /* check early stopping criteria */ + if ((epsil != 0.0f) && (i % 5 == 0)) { + re = 0.0f; re1 = 0.0f; + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); + if (re < epsil) count++; + if (count > 3) break; + } + } + free(W_Lapl); + + if (epsil != 0.0f) free(Output_prev); + /*adding info into info_vector */ + infovector[0] = (float)(i); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; } /********************************************************************/ /***************************2D Functions*****************************/ /********************************************************************/ float Weighted_Laplc2D(float *W_Lapl, float *U0, float sigma, long dimX, long dimY) -{ +{ long i,j,i1,i2,j1,j2,index; float gradX, gradX_sq, gradY, gradY_sq, gradXX, gradYY, gradXY, xy_2, denom, V_norm, V_orth, c, c_sq; @@ -90,35 +114,35 @@ float Weighted_Laplc2D(float *W_Lapl, float *U0, float sigma, long dimX, long di /* symmetric boundary conditions */ j1 = j+1; if (j1 == dimY) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; - + index = j*dimX+i; - + gradX = 0.5f*(U0[j*dimX+i2] - U0[j*dimX+i1]); gradX_sq = pow(gradX,2); - + gradY = 0.5f*(U0[j2*dimX+i] - U0[j1*dimX+i]); gradY_sq = pow(gradY,2); - + gradXX = U0[j*dimX+i2] + U0[j*dimX+i1] - 2*U0[index]; gradYY = U0[j2*dimX+i] + U0[j1*dimX+i] - 2*U0[index]; - + gradXY = 0.25f*(U0[j2*dimX+i2] + U0[j1*dimX+i1] - U0[j1*dimX+i2] - U0[j2*dimX+i1]); xy_2 = 2.0f*gradX*gradY*gradXY; - + denom = gradX_sq + gradY_sq; - + if (denom <= EPS) { V_norm = (gradXX*gradX_sq + xy_2 + gradYY*gradY_sq)/EPS; - V_orth = (gradXX*gradY_sq - xy_2 + gradYY*gradX_sq)/EPS; + V_orth = (gradXX*gradY_sq - xy_2 + gradYY*gradX_sq)/EPS; } else { V_norm = (gradXX*gradX_sq + xy_2 + gradYY*gradY_sq)/denom; - V_orth = (gradXX*gradY_sq - xy_2 + gradYY*gradX_sq)/denom; + V_orth = (gradXX*gradY_sq - xy_2 + gradYY*gradX_sq)/denom; } c = 1.0f/(1.0f + denom/sigma); c_sq = c*c; - + W_Lapl[index] = c_sq*V_norm + c*V_orth; } } @@ -140,7 +164,7 @@ float Diffusion_update_step2D(float *Output, float *Input, float *W_Lapl, float j1 = j+1; if (j1 == dimY) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; index = j*dimX+i; - + gradXXc = W_Lapl[j*dimX+i2] + W_Lapl[j*dimX+i1] - 2*W_Lapl[index]; gradYYc = W_Lapl[j2*dimX+i] + W_Lapl[j1*dimX+i] - 2*W_Lapl[index]; @@ -153,10 +177,10 @@ float Diffusion_update_step2D(float *Output, float *Input, float *W_Lapl, float /***************************3D Functions*****************************/ /********************************************************************/ float Weighted_Laplc3D(float *W_Lapl, float *U0, float sigma, long dimX, long dimY, long dimZ) -{ +{ long i,j,k,i1,i2,j1,j2,k1,k2,index; float gradX, gradX_sq, gradY, gradY_sq, gradXX, gradYY, gradXY, xy_2, denom, V_norm, V_orth, c, c_sq, gradZ, gradZ_sq, gradZZ, gradXZ, gradYZ, xyz_1, xyz_2; - + #pragma omp parallel for shared(W_Lapl) private(i,j,k,i1,i2,j1,j2,k1,k2,index,gradX, gradX_sq, gradY, gradY_sq, gradXX, gradYY, gradXY, xy_2, denom, V_norm, V_orth, c, c_sq, gradZ, gradZ_sq, gradZZ, gradXZ, gradYZ, xyz_1, xyz_2) for(i=0; i<dimX; i++) { /* symmetric boundary conditions */ @@ -166,37 +190,37 @@ float Weighted_Laplc3D(float *W_Lapl, float *U0, float sigma, long dimX, long di /* symmetric boundary conditions */ j1 = j+1; if (j1 == dimY) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; - + for(k=0; k<dimZ; k++) { /* symmetric boundary conditions */ k1 = k+1; if (k1 == dimZ) k1 = k-1; k2 = k-1; if (k2 < 0) k2 = k+1; - + index = (dimX*dimY)*k + j*dimX+i; - + gradX = 0.5f*(U0[(dimX*dimY)*k + j*dimX+i2] - U0[(dimX*dimY)*k + j*dimX+i1]); gradX_sq = pow(gradX,2); - + gradY = 0.5f*(U0[(dimX*dimY)*k + j2*dimX+i] - U0[(dimX*dimY)*k + j1*dimX+i]); gradY_sq = pow(gradY,2); - + gradZ = 0.5f*(U0[(dimX*dimY)*k2 + j*dimX+i] - U0[(dimX*dimY)*k1 + j*dimX+i]); gradZ_sq = pow(gradZ,2); - + gradXX = U0[(dimX*dimY)*k + j*dimX+i2] + U0[(dimX*dimY)*k + j*dimX+i1] - 2*U0[index]; gradYY = U0[(dimX*dimY)*k + j2*dimX+i] + U0[(dimX*dimY)*k + j1*dimX+i] - 2*U0[index]; gradZZ = U0[(dimX*dimY)*k2 + j*dimX+i] + U0[(dimX*dimY)*k1 + j*dimX+i] - 2*U0[index]; - + gradXY = 0.25f*(U0[(dimX*dimY)*k + j2*dimX+i2] + U0[(dimX*dimY)*k + j1*dimX+i1] - U0[(dimX*dimY)*k + j1*dimX+i2] - U0[(dimX*dimY)*k + j2*dimX+i1]); gradXZ = 0.25f*(U0[(dimX*dimY)*k2 + j*dimX+i2] - U0[(dimX*dimY)*k2+j*dimX+i1] - U0[(dimX*dimY)*k1+j*dimX+i2] + U0[(dimX*dimY)*k1+j*dimX+i1]); gradYZ = 0.25f*(U0[(dimX*dimY)*k2 +j2*dimX+i] - U0[(dimX*dimY)*k2+j1*dimX+i] - U0[(dimX*dimY)*k1+j2*dimX+i] + U0[(dimX*dimY)*k1+j1*dimX+i]); - + xy_2 = 2.0f*gradX*gradY*gradXY; xyz_1 = 2.0f*gradX*gradZ*gradXZ; xyz_2 = 2.0f*gradY*gradZ*gradYZ; - + denom = gradX_sq + gradY_sq + gradZ_sq; - + if (denom <= EPS) { V_norm = (gradXX*gradX_sq + gradYY*gradY_sq + gradZZ*gradZ_sq + xy_2 + xyz_1 + xyz_2)/EPS; V_orth = ((gradY_sq + gradZ_sq)*gradXX + (gradX_sq + gradZ_sq)*gradYY + (gradX_sq + gradY_sq)*gradZZ - xy_2 - xyz_1 - xyz_2)/EPS; @@ -208,7 +232,7 @@ float Weighted_Laplc3D(float *W_Lapl, float *U0, float sigma, long dimX, long di c = 1.0f/(1.0f + denom/sigma); c_sq = c*c; - + W_Lapl[index] = c_sq*V_norm + c*V_orth; } } @@ -230,18 +254,18 @@ float Diffusion_update_step3D(float *Output, float *Input, float *W_Lapl, float /* symmetric boundary conditions */ j1 = j+1; if (j1 == dimY) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; - + for(k=0; k<dimZ; k++) { /* symmetric boundary conditions */ k1 = k+1; if (k1 == dimZ) k1 = k-1; k2 = k-1; if (k2 < 0) k2 = k+1; - + index = (dimX*dimY)*k + j*dimX+i; - + gradXXc = W_Lapl[(dimX*dimY)*k + j*dimX+i2] + W_Lapl[(dimX*dimY)*k + j*dimX+i1] - 2*W_Lapl[index]; gradYYc = W_Lapl[(dimX*dimY)*k + j2*dimX+i] + W_Lapl[(dimX*dimY)*k + j1*dimX+i] - 2*W_Lapl[index]; gradZZc = W_Lapl[(dimX*dimY)*k2 + j*dimX+i] + W_Lapl[(dimX*dimY)*k1 + j*dimX+i] - 2*W_Lapl[index]; - + Output[index] += tau*(-lambdaPar*(gradXXc + gradYYc + gradZZc) - (Output[index] - Input[index])); } } diff --git a/src/Core/regularisers_CPU/Diffus4th_order_core.h b/src/Core/regularisers_CPU/Diffus4th_order_core.h index d81afcb..e4a8b3e 100644 --- a/src/Core/regularisers_CPU/Diffus4th_order_core.h +++ b/src/Core/regularisers_CPU/Diffus4th_order_core.h @@ -26,26 +26,28 @@ limitations under the License. #include "CCPiDefines.h" /* C-OMP implementation of fourth-order diffusion scheme [1] for piecewise-smooth recovery (2D/3D case) - * The minimisation is performed using explicit scheme. + * The minimisation is performed using explicit scheme. * * Input Parameters: - * 1. Noisy image/volume + * 1. Noisy image/volume * 2. lambda - regularization parameter * 3. Edge-preserving parameter (sigma) - * 4. Number of iterations, for explicit scheme >= 150 is recommended + * 4. Number of iterations, for explicit scheme >= 150 is recommended * 5. tau - time-marching step for explicit scheme + * 6. eplsilon: tolerance constant * * Output: - * [1] Regularized image/volume + * [1] Regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Hajiaboli, M.R., 2011. An anisotropic fourth-order diffusion filter for image noise removal. International Journal of Computer Vision, 92(2), pp.177-191. */ - + #ifdef __cplusplus extern "C" { #endif -CCPI_EXPORT float Diffus4th_CPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ); +CCPI_EXPORT float Diffus4th_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ); CCPI_EXPORT float Weighted_Laplc2D(float *W_Lapl, float *U0, float sigma, long dimX, long dimY); CCPI_EXPORT float Diffusion_update_step2D(float *Output, float *Input, float *W_Lapl, float lambdaPar, float sigmaPar2, float tau, long dimX, long dimY); CCPI_EXPORT float Weighted_Laplc3D(float *W_Lapl, float *U0, float sigma, long dimX, long dimY, long dimZ); diff --git a/src/Core/regularisers_CPU/Diffusion_core.c b/src/Core/regularisers_CPU/Diffusion_core.c index b765796..7f06dd8 100644 --- a/src/Core/regularisers_CPU/Diffusion_core.c +++ b/src/Core/regularisers_CPU/Diffusion_core.c @@ -30,48 +30,75 @@ int signNDFc(float x) { } /* C-OMP implementation of linear and nonlinear diffusion with the regularisation model [1,2] (2D/3D case) - * The minimisation is performed using explicit scheme. + * The minimisation is performed using explicit scheme. * * Input Parameters: - * 1. Noisy image/volume + * 1. Noisy image/volume * 2. lambda - regularization parameter * 3. Edge-preserving parameter (sigma), when sigma equals to zero nonlinear diffusion -> linear diffusion - * 4. Number of iterations, for explicit scheme >= 150 is recommended + * 4. Number of iterations, for explicit scheme >= 150 is recommended * 5. tau - time-marching step for explicit scheme * 6. Penalty type: 1 - Huber, 2 - Perona-Malik, 3 - Tukey Biweight - * + * 7. eplsilon - tolerance constant + * Output: - * [1] Regularized image/volume + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Perona, P. and Malik, J., 1990. Scale-space and edge detection using anisotropic diffusion. IEEE Transactions on pattern analysis and machine intelligence, 12(7), pp.629-639. * [2] Black, M.J., Sapiro, G., Marimont, D.H. and Heeger, D., 1998. Robust anisotropic diffusion. IEEE Transactions on image processing, 7(3), pp.421-432. */ -float Diffusion_CPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int dimX, int dimY, int dimZ) +float Diffusion_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, float epsil, int dimX, int dimY, int dimZ) { int i; - float sigmaPar2; + float sigmaPar2, *Output_prev=NULL; sigmaPar2 = sigmaPar/sqrt(2.0f); - + long j, DimTotal; + float re, re1; + re = 0.0f; re1 = 0.0f; + int count = 0; + DimTotal = (long)(dimX*dimY*dimZ); + + if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float)); + /* copy into output */ copyIm(Input, Output, (long)(dimX), (long)(dimY), (long)(dimZ)); - - if (dimZ == 1) { - /* running 2D diffusion iterations */ + for(i=0; i < iterationsNumb; i++) { + + if ((epsil != 0.0f) && (i % 5 == 0)) copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); + if (dimZ == 1) { + /* running 2D diffusion iterations */ if (sigmaPar == 0.0f) LinearDiff2D(Input, Output, lambdaPar, tau, (long)(dimX), (long)(dimY)); /* linear diffusion (heat equation) */ else NonLinearDiff2D(Input, Output, lambdaPar, sigmaPar2, tau, penaltytype, (long)(dimX), (long)(dimY)); /* nonlinear diffusion */ + } + else { + /* running 3D diffusion iterations */ + if (sigmaPar == 0.0f) LinearDiff3D(Input, Output, lambdaPar, tau, (long)(dimX), (long)(dimY), (long)(dimZ)); + else NonLinearDiff3D(Input, Output, lambdaPar, sigmaPar2, tau, penaltytype, (long)(dimX), (long)(dimY), (long)(dimZ)); + } + /* check early stopping criteria if epsilon not equal zero */ + if ((epsil != 0.0f) && (i % 5 == 0)) { + re = 0.0f; re1 = 0.0f; + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); + /* stop if the norm residual is less than the tolerance EPS */ + if (re < epsil) count++; + if (count > 3) break; + } } - } - else { - /* running 3D diffusion iterations */ - for(i=0; i < iterationsNumb; i++) { - if (sigmaPar == 0.0f) LinearDiff3D(Input, Output, lambdaPar, tau, (long)(dimX), (long)(dimY), (long)(dimZ)); - else NonLinearDiff3D(Input, Output, lambdaPar, sigmaPar2, tau, penaltytype, (long)(dimX), (long)(dimY), (long)(dimZ)); - } - } - return *Output; + + free(Output_prev); + /*adding info into info_vector */ + infovector[0] = (float)(i); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; } @@ -83,7 +110,7 @@ float LinearDiff2D(float *Input, float *Output, float lambdaPar, float tau, long { long i,j,i1,i2,j1,j2,index; float e,w,n,s,e1,w1,n1,s1; - + #pragma omp parallel for shared(Input) private(index,i,j,i1,i2,j1,j2,e,w,n,s,e1,w1,n1,s1) for(i=0; i<dimX; i++) { /* symmetric boundary conditions (Neuman) */ @@ -94,18 +121,18 @@ float LinearDiff2D(float *Input, float *Output, float lambdaPar, float tau, long j1 = j+1; if (j1 == dimY) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; index = j*dimX+i; - + e = Output[j*dimX+i1]; w = Output[j*dimX+i2]; n = Output[j1*dimX+i]; s = Output[j2*dimX+i]; - + e1 = e - Output[index]; w1 = w - Output[index]; n1 = n - Output[index]; s1 = s - Output[index]; - - Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1) - (Output[index] - Input[index])); + + Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1) - (Output[index] - Input[index])); }} return *Output; } @@ -115,7 +142,7 @@ float NonLinearDiff2D(float *Input, float *Output, float lambdaPar, float sigmaP { long i,j,i1,i2,j1,j2,index; float e,w,n,s,e1,w1,n1,s1; - + #pragma omp parallel for shared(Input) private(index,i,j,i1,i2,j1,j2,e,w,n,s,e1,w1,n1,s1) for(i=0; i<dimX; i++) { /* symmetric boundary conditions (Neuman) */ @@ -126,28 +153,28 @@ float NonLinearDiff2D(float *Input, float *Output, float lambdaPar, float sigmaP j1 = j+1; if (j1 == dimY) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; index = j*dimX+i; - + e = Output[j*dimX+i1]; w = Output[j*dimX+i2]; n = Output[j1*dimX+i]; s = Output[j2*dimX+i]; - + e1 = e - Output[index]; w1 = w - Output[index]; n1 = n - Output[index]; s1 = s - Output[index]; - + if (penaltytype == 1){ /* Huber penalty */ if (fabs(e1) > sigmaPar) e1 = signNDFc(e1); else e1 = e1/sigmaPar; - + if (fabs(w1) > sigmaPar) w1 = signNDFc(w1); else w1 = w1/sigmaPar; - + if (fabs(n1) > sigmaPar) n1 = signNDFc(n1); else n1 = n1/sigmaPar; - + if (fabs(s1) > sigmaPar) s1 = signNDFc(s1); else s1 = s1/sigmaPar; } @@ -173,7 +200,7 @@ float NonLinearDiff2D(float *Input, float *Output, float lambdaPar, float sigmaP printf("%s \n", "No penalty function selected! Use 1,2 or 3."); break; } - Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1) - (Output[index] - Input[index])); + Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1) - (Output[index] - Input[index])); }} return *Output; } @@ -185,7 +212,7 @@ float LinearDiff3D(float *Input, float *Output, float lambdaPar, float tau, long { long i,j,k,i1,i2,j1,j2,k1,k2,index; float e,w,n,s,u,d,e1,w1,n1,s1,u1,d1; - + #pragma omp parallel for shared(Input) private(index,i,j,i1,i2,j1,j2,e,w,n,s,e1,w1,n1,s1,k,k1,k2,u1,d1,u,d) for(k=0; k<dimZ; k++) { k1 = k+1; if (k1 == dimZ) k1 = k-1; @@ -199,22 +226,22 @@ for(k=0; k<dimZ; k++) { j1 = j+1; if (j1 == dimY) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; index = (dimX*dimY)*k + j*dimX+i; - + e = Output[(dimX*dimY)*k + j*dimX+i1]; w = Output[(dimX*dimY)*k + j*dimX+i2]; n = Output[(dimX*dimY)*k + j1*dimX+i]; s = Output[(dimX*dimY)*k + j2*dimX+i]; u = Output[(dimX*dimY)*k1 + j*dimX+i]; d = Output[(dimX*dimY)*k2 + j*dimX+i]; - + e1 = e - Output[index]; w1 = w - Output[index]; n1 = n - Output[index]; s1 = s - Output[index]; u1 = u - Output[index]; d1 = d - Output[index]; - - Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1 + u1 + d1) - (Output[index] - Input[index])); + + Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1 + u1 + d1) - (Output[index] - Input[index])); }}} return *Output; } @@ -223,7 +250,7 @@ float NonLinearDiff3D(float *Input, float *Output, float lambdaPar, float sigmaP { long i,j,k,i1,i2,j1,j2,k1,k2,index; float e,w,n,s,u,d,e1,w1,n1,s1,u1,d1; - + #pragma omp parallel for shared(Input) private(index,i,j,i1,i2,j1,j2,e,w,n,s,e1,w1,n1,s1,k,k1,k2,u1,d1,u,d) for(k=0; k<dimZ; k++) { k1 = k+1; if (k1 == dimZ) k1 = k-1; @@ -237,40 +264,40 @@ for(k=0; k<dimZ; k++) { j1 = j+1; if (j1 == dimY) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; index = (dimX*dimY)*k + j*dimX+i; - + e = Output[(dimX*dimY)*k + j*dimX+i1]; w = Output[(dimX*dimY)*k + j*dimX+i2]; n = Output[(dimX*dimY)*k + j1*dimX+i]; s = Output[(dimX*dimY)*k + j2*dimX+i]; u = Output[(dimX*dimY)*k1 + j*dimX+i]; d = Output[(dimX*dimY)*k2 + j*dimX+i]; - + e1 = e - Output[index]; w1 = w - Output[index]; n1 = n - Output[index]; s1 = s - Output[index]; u1 = u - Output[index]; d1 = d - Output[index]; - + if (penaltytype == 1){ /* Huber penalty */ if (fabs(e1) > sigmaPar) e1 = signNDFc(e1); else e1 = e1/sigmaPar; - + if (fabs(w1) > sigmaPar) w1 = signNDFc(w1); else w1 = w1/sigmaPar; - + if (fabs(n1) > sigmaPar) n1 = signNDFc(n1); else n1 = n1/sigmaPar; - + if (fabs(s1) > sigmaPar) s1 = signNDFc(s1); else s1 = s1/sigmaPar; - + if (fabs(u1) > sigmaPar) u1 = signNDFc(u1); else u1 = u1/sigmaPar; - + if (fabs(d1) > sigmaPar) d1 = signNDFc(d1); - else d1 = d1/sigmaPar; + else d1 = d1/sigmaPar; } else if (penaltytype == 2) { /* Perona-Malik */ @@ -301,7 +328,7 @@ for(k=0; k<dimZ; k++) { break; } - Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1 + u1 + d1) - (Output[index] - Input[index])); + Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1 + u1 + d1) - (Output[index] - Input[index])); }}} return *Output; } diff --git a/src/Core/regularisers_CPU/Diffusion_core.h b/src/Core/regularisers_CPU/Diffusion_core.h index cc36dad..e394a01 100644 --- a/src/Core/regularisers_CPU/Diffusion_core.h +++ b/src/Core/regularisers_CPU/Diffusion_core.h @@ -27,29 +27,31 @@ limitations under the License. /* C-OMP implementation of linear and nonlinear diffusion with the regularisation model [1,2] (2D/3D case) - * The minimisation is performed using explicit scheme. + * The minimisation is performed using explicit scheme. * * Input Parameters: - * 1. Noisy image/volume + * 1. Noisy image/volume * 2. lambda - regularization parameter * 3. Edge-preserving parameter (sigma), when sigma equals to zero nonlinear diffusion -> linear diffusion - * 4. Number of iterations, for explicit scheme >= 150 is recommended + * 4. Number of iterations, for explicit scheme >= 150 is recommended * 5. tau - time-marching step for explicit scheme * 6. Penalty type: 1 - Huber, 2 - Perona-Malik, 3 - Tukey Biweight - * + * 7. eplsilon - tolerance constant + * Output: - * [1] Regularized image/volume + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Perona, P. and Malik, J., 1990. Scale-space and edge detection using anisotropic diffusion. IEEE Transactions on pattern analysis and machine intelligence, 12(7), pp.629-639. * [2] Black, M.J., Sapiro, G., Marimont, D.H. and Heeger, D., 1998. Robust anisotropic diffusion. IEEE Transactions on image processing, 7(3), pp.421-432. */ - + #ifdef __cplusplus extern "C" { #endif -CCPI_EXPORT float Diffusion_CPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int dimX, int dimY, int dimZ); +CCPI_EXPORT float Diffusion_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, float epsil, int dimX, int dimY, int dimZ); CCPI_EXPORT float LinearDiff2D(float *Input, float *Output, float lambdaPar, float tau, long dimX, long dimY); CCPI_EXPORT float NonLinearDiff2D(float *Input, float *Output, float lambdaPar, float sigmaPar, float tau, int penaltytype, long dimX, long dimY); CCPI_EXPORT float LinearDiff3D(float *Input, float *Output, float lambdaPar, float tau, long dimX, long dimY, long dimZ); diff --git a/src/Core/regularisers_CPU/FGP_TV_core.c b/src/Core/regularisers_CPU/FGP_TV_core.c index 68d58b7..a17604e 100644 --- a/src/Core/regularisers_CPU/FGP_TV_core.c +++ b/src/Core/regularisers_CPU/FGP_TV_core.c @@ -3,8 +3,8 @@ This work is part of the Core Imaging Library developed by Visual Analytics and Imaging System Group of the Science Technology Facilities Council, STFC -Copyright 2017 Daniil Kazantsev -Copyright 2017 Srikanth Nagella, Edoardo Pasca +Copyright 2019 Daniil Kazantsev +Copyright 2019 Srikanth Nagella, Edoardo Pasca Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,140 +22,152 @@ limitations under the License. /* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case) * * Input Parameters: - * 1. Noisy image/volume - * 2. lambdaPar - regularization parameter + * 1. Noisy image/volume + * 2. lambdaPar - regularization parameter * 3. Number of iterations - * 4. eplsilon: tolerance constant + * 4. eplsilon: tolerance constant * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) - * 6. nonneg: 'nonnegativity (0 is OFF by default) - * 7. print information: 0 (off) or 1 (on) + * 6. nonneg: 'nonnegativity (0 is OFF by default) * * Output: - * [1] Filtered/regularized image + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the Matlab's code and paper by * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" */ - -float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ) + +float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ) { - int ll; + int ll; long j, DimTotal; - float re, re1; - float tk = 1.0f; - float tkp1=1.0f; + float re, re1; + re = 0.0f; re1 = 0.0f; + float tk = 1.0f; + float tkp1 =1.0f; int count = 0; - + if (dimZ <= 1) { - /*2D case */ - float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL; - DimTotal = (long)(dimX*dimY); - - Output_prev = calloc(DimTotal, sizeof(float)); + /*2D case */ + float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL; + DimTotal = (long)(dimX*dimY); + + if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float)); P1 = calloc(DimTotal, sizeof(float)); P2 = calloc(DimTotal, sizeof(float)); P1_prev = calloc(DimTotal, sizeof(float)); P2_prev = calloc(DimTotal, sizeof(float)); R1 = calloc(DimTotal, sizeof(float)); - R2 = calloc(DimTotal, sizeof(float)); - - /* begin iterations */ + R2 = calloc(DimTotal, sizeof(float)); + + /* begin iterations */ for(ll=0; ll<iterationsNumb; ll++) { - + + if ((epsil != 0.0f) && (ll % 5 == 0)) copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l); /* computing the gradient of the objective function */ Obj_func2D(Input, Output, R1, R2, lambdaPar, (long)(dimX), (long)(dimY)); - + /* apply nonnegativity */ if (nonneg == 1) for(j=0; j<DimTotal; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;} - + /*Taking a step towards minus of the gradient*/ Grad_func2D(P1, P2, Output, R1, R2, lambdaPar, (long)(dimX), (long)(dimY)); - + /* projection step */ Proj_func2D(P1, P2, methodTV, DimTotal); - + /*updating R and t*/ - tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; + tkp1 = (1.0f + sqrtf(1.0f + 4.0f*tk*tk))*0.5f; Rupd_func2D(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, DimTotal); - - /* check early stopping criteria */ - re = 0.0f; re1 = 0.0f; - for(j=0; j<DimTotal; j++) - { - re += pow(Output[j] - Output_prev[j],2); - re1 += pow(Output[j],2); - } - re = sqrt(re)/sqrt(re1); - if (re < epsil) count++; - if (count > 4) break; - + /*storing old values*/ - copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l); copyIm(P1, P1_prev, (long)(dimX), (long)(dimY), 1l); copyIm(P2, P2_prev, (long)(dimX), (long)(dimY), 1l); tk = tkp1; - } - if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", ll); - free(Output_prev); free(P1); free(P2); free(P1_prev); free(P2_prev); free(R1); free(R2); + + /* check early stopping criteria */ + if ((epsil != 0.0f) && (ll % 5 == 0)) { + re = 0.0f; re1 = 0.0f; + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); + if (re < epsil) count++; + if (count > 3) break; + } + } + if (epsil != 0.0f) free(Output_prev); + free(P1); free(P2); free(P1_prev); free(P2_prev); free(R1); free(R2); } else { /*3D case*/ - float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL; - DimTotal = (long)(dimX*dimY*dimZ); - - Output_prev = calloc(DimTotal, sizeof(float)); + float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL; + DimTotal = (long)(dimX*dimY*dimZ); + + if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float)); P1 = calloc(DimTotal, sizeof(float)); P2 = calloc(DimTotal, sizeof(float)); P3 = calloc(DimTotal, sizeof(float)); P1_prev = calloc(DimTotal, sizeof(float)); - P2_prev = calloc(DimTotal, sizeof(float)); - P3_prev = calloc(DimTotal, sizeof(float)); + P2_prev = calloc(DimTotal, sizeof(float)); + P3_prev = calloc(DimTotal, sizeof(float)); R1 = calloc(DimTotal, sizeof(float)); - R2 = calloc(DimTotal, sizeof(float)); - R3 = calloc(DimTotal, sizeof(float)); - + R2 = calloc(DimTotal, sizeof(float)); + R3 = calloc(DimTotal, sizeof(float)); + /* begin iterations */ for(ll=0; ll<iterationsNumb; ll++) { - + + if ((epsil != 0.0f) && (ll % 5 == 0)) copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); + /* computing the gradient of the objective function */ Obj_func3D(Input, Output, R1, R2, R3, lambdaPar, (long)(dimX), (long)(dimY), (long)(dimZ)); - + /* apply nonnegativity */ - if (nonneg == 1) for(j=0; j<DimTotal; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;} - + if (nonneg == 1) for(j=0; j<DimTotal; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;} + /*Taking a step towards minus of the gradient*/ Grad_func3D(P1, P2, P3, Output, R1, R2, R3, lambdaPar, (long)(dimX), (long)(dimY), (long)(dimZ)); - + /* projection step */ Proj_func3D(P1, P2, P3, methodTV, DimTotal); - + /*updating R and t*/ - tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; + tkp1 = (1.0f + sqrtf(1.0f + 4.0f*tk*tk))*0.5f; Rupd_func3D(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, DimTotal); - + /* calculate norm - stopping rules*/ + if ((epsil != 0.0f) && (ll % 5 == 0)) { re = 0.0f; re1 = 0.0f; - for(j=0; j<DimTotal; j++) - { - re += pow(Output[j] - Output_prev[j],2); - re1 += pow(Output[j],2); - } - re = sqrt(re)/sqrt(re1); + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); /* stop if the norm residual is less than the tolerance EPS */ if (re < epsil) count++; - if (count > 4) break; - + if (count > 3) break; + } + /*storing old values*/ - copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); copyIm(P1, P1_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); copyIm(P2, P2_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); copyIm(P3, P3_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); - tk = tkp1; - } - if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", ll); - free(Output_prev); free(P1); free(P2); free(P3); free(P1_prev); free(P2_prev); free(P3_prev); free(R1); free(R2); free(R3); + tk = tkp1; + } + + if (epsil != 0.0f) free(Output_prev); + free(P1); free(P2); free(P3); free(P1_prev); free(P2_prev); free(P3_prev); free(R1); free(R2); free(R3); } - return *Output; + + /*adding info into info_vector */ + infovector[0] = (float)(ll); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + + return 0; } float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, long dimX, long dimY) @@ -226,7 +238,7 @@ float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float multip; multip = ((tk-1.0f)/tkp1); #pragma omp parallel for shared(P1,P2,P1_old,P2_old,R1,R2,multip) private(i) - for(i=0; i<DimTotal; i++) { + for(i=0; i<DimTotal; i++) { R1[i] = P1[i] + multip*(P1[i] - P1_old[i]); R2[i] = P2[i] + multip*(P2[i] - P2_old[i]); } @@ -261,7 +273,7 @@ float Grad_func3D(float *P1, float *P2, float *P3, float *D, float *R1, float *R for(i=0; i<dimX; i++) { for(j=0; j<dimY; j++) { for(k=0; k<dimZ; k++) { - index = (dimX*dimY)*k + j*dimX+i; + index = (dimX*dimY)*k + j*dimX+i; /* boundary conditions */ if (i == dimX-1) val1 = 0.0f; else val1 = D[index] - D[(dimX*dimY)*k + j*dimX + (i+1)]; if (j == dimY-1) val2 = 0.0f; else val2 = D[index] - D[(dimX*dimY)*k + (j+1)*dimX + i]; @@ -273,13 +285,13 @@ float Grad_func3D(float *P1, float *P2, float *P3, float *D, float *R1, float *R return 1; } float Proj_func3D(float *P1, float *P2, float *P3, int methTV, long DimTotal) -{ +{ float val1, val2, val3, denom, sq_denom; long i; if (methTV == 0) { /* isotropic TV*/ #pragma omp parallel for shared(P1,P2,P3) private(i,val1,val2,val3,sq_denom) - for(i=0; i<DimTotal; i++) { + for(i=0; i<DimTotal; i++) { denom = powf(P1[i],2) + powf(P2[i],2) + powf(P3[i],2); if (denom > 1.0f) { sq_denom = 1.0f/sqrtf(denom); @@ -288,7 +300,7 @@ float Proj_func3D(float *P1, float *P2, float *P3, int methTV, long DimTotal) P3[i] = P3[i]*sq_denom; } } - } + } else { /* anisotropic TV*/ #pragma omp parallel for shared(P1,P2,P3) private(i,val1,val2,val3) @@ -298,7 +310,7 @@ float Proj_func3D(float *P1, float *P2, float *P3, int methTV, long DimTotal) val3 = fabs(P3[i]); if (val1 < 1.0f) {val1 = 1.0f;} if (val2 < 1.0f) {val2 = 1.0f;} - if (val3 < 1.0f) {val3 = 1.0f;} + if (val3 < 1.0f) {val3 = 1.0f;} P1[i] = P1[i]/val1; P2[i] = P2[i]/val2; P3[i] = P3[i]/val3; diff --git a/src/Core/regularisers_CPU/FGP_TV_core.h b/src/Core/regularisers_CPU/FGP_TV_core.h index 3418604..04e6e80 100644 --- a/src/Core/regularisers_CPU/FGP_TV_core.h +++ b/src/Core/regularisers_CPU/FGP_TV_core.h @@ -35,10 +35,11 @@ limitations under the License. * 4. eplsilon: tolerance constant * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) * 6. nonneg: 'nonnegativity (0 is OFF by default) - * 7. print information: 0 (off) or 1 (on) * * Output: - * [1] Filtered/regularized image + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + * * This function is based on the Matlab's code and paper by * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" @@ -47,7 +48,7 @@ limitations under the License. #ifdef __cplusplus extern "C" { #endif -CCPI_EXPORT float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); +CCPI_EXPORT float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ); CCPI_EXPORT float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, long dimX, long dimY); CCPI_EXPORT float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, long dimX, long dimY); diff --git a/src/Core/regularisers_CPU/FGP_dTV_core.c b/src/Core/regularisers_CPU/FGP_dTV_core.c index 17b75ff..4e1e38c 100644 --- a/src/Core/regularisers_CPU/FGP_dTV_core.c +++ b/src/Core/regularisers_CPU/FGP_dTV_core.c @@ -3,8 +3,8 @@ This work is part of the Core Imaging Library developed by Visual Analytics and Imaging System Group of the Science Technology Facilities Council, STFC -Copyright 2017 Daniil Kazantsev -Copyright 2017 Srikanth Nagella, Edoardo Pasca +Copyright 2019 Daniil Kazantsev +Copyright 2019 Srikanth Nagella, Edoardo Pasca Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -29,155 +29,156 @@ limitations under the License. * 3. lambdaPar - regularization parameter [REQUIRED] * 4. Number of iterations [OPTIONAL] * 5. eplsilon: tolerance constant [OPTIONAL] - * 6. eta: smoothing constant to calculate gradient of the reference [OPTIONAL] * + * 6. eta: smoothing constant to calculate gradient of the reference [OPTIONAL] * * 7. TV-type: methodTV - 'iso' (0) or 'l1' (1) [OPTIONAL] * 8. nonneg: 'nonnegativity (0 is OFF by default) [OPTIONAL] * 9. print information: 0 (off) or 1 (on) [OPTIONAL] * * Output: * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the Matlab's codes and papers by * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" * [2] M. J. Ehrhardt and M. M. Betcke, Multi-Contrast MRI Reconstruction with Structure-Guided Total Variation, SIAM Journal on Imaging Sciences 9(3), pp. 1084–1106 */ - -float dTV_FGP_CPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ) + +float dTV_FGP_CPU_main(float *Input, float *InputRef, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int dimX, int dimY, int dimZ) { - int ll; + int ll; long j, DimTotal; - float re, re1; - float tk = 1.0f; + float re, re1; + re = 0.0f; re1 = 0.0f; + float tk = 1.0f; float tkp1=1.0f; int count = 0; - + + + float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL, *InputRef_x=NULL, *InputRef_y=NULL; + DimTotal = (long)(dimX*dimY*dimZ); + + if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float)); + P1 = calloc(DimTotal, sizeof(float)); + P2 = calloc(DimTotal, sizeof(float)); + P1_prev = calloc(DimTotal, sizeof(float)); + P2_prev = calloc(DimTotal, sizeof(float)); + R1 = calloc(DimTotal, sizeof(float)); + R2 = calloc(DimTotal, sizeof(float)); + InputRef_x = calloc(DimTotal, sizeof(float)); + InputRef_y = calloc(DimTotal, sizeof(float)); + if (dimZ <= 1) { - /*2D case */ - float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL, *InputRef_x=NULL, *InputRef_y=NULL; - DimTotal = (long)(dimX*dimY); - - Output_prev = calloc(DimTotal, sizeof(float)); - P1 = calloc(DimTotal, sizeof(float)); - P2 = calloc(DimTotal, sizeof(float)); - P1_prev = calloc(DimTotal, sizeof(float)); - P2_prev = calloc(DimTotal, sizeof(float)); - R1 = calloc(DimTotal, sizeof(float)); - R2 = calloc(DimTotal, sizeof(float)); - InputRef_x = calloc(DimTotal, sizeof(float)); - InputRef_y = calloc(DimTotal, sizeof(float)); - - /* calculate gradient field (smoothed) for the reference image */ + /*2D case */ + /* calculate gradient field (smoothed) for the reference image */ GradNorm_func2D(InputRef, InputRef_x, InputRef_y, eta, (long)(dimX), (long)(dimY)); - + /* begin iterations */ for(ll=0; ll<iterationsNumb; ll++) { - - /*projects a 2D vector field R-1,2 onto the orthogonal complement of another 2D vector field InputRef_xy*/ + + if ((epsil != 0.0f) && (ll % 5 == 0)) copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l); + /*projects a 2D vector field R-1,2 onto the orthogonal complement of another 2D vector field InputRef_xy*/ ProjectVect_func2D(R1, R2, InputRef_x, InputRef_y, (long)(dimX), (long)(dimY)); - + /* computing the gradient of the objective function */ Obj_dfunc2D(Input, Output, R1, R2, lambdaPar, (long)(dimX), (long)(dimY)); - + /* apply nonnegativity */ if (nonneg == 1) for(j=0; j<DimTotal; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;} - + /*Taking a step towards minus of the gradient*/ Grad_dfunc2D(P1, P2, Output, R1, R2, InputRef_x, InputRef_y, lambdaPar, (long)(dimX), (long)(dimY)); - + /* projection step */ Proj_dfunc2D(P1, P2, methodTV, DimTotal); - + /*updating R and t*/ tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; Rupd_dfunc2D(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, DimTotal); - - /* check early stopping criteria */ - re = 0.0f; re1 = 0.0f; - for(j=0; j<DimTotal; j++) - { - re += pow(Output[j] - Output_prev[j],2); - re1 += pow(Output[j],2); - } - re = sqrt(re)/sqrt(re1); - if (re < epsil) count++; - if (count > 4) break; - - /*storing old values*/ - copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l); + copyIm(P1, P1_prev, (long)(dimX), (long)(dimY), 1l); copyIm(P2, P2_prev, (long)(dimX), (long)(dimY), 1l); tk = tkp1; + + /* check early stopping criteria */ + if ((epsil != 0.0f) && (ll % 5 == 0)) { + re = 0.0f; re1 = 0.0f; + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); + if (re < epsil) count++; + if (count > 3) break; + } } - if (printM == 1) printf("FGP-dTV iterations stopped at iteration %i \n", ll); - free(Output_prev); free(P1); free(P2); free(P1_prev); free(P2_prev); free(R1); free(R2); free(InputRef_x); free(InputRef_y); } else { /*3D case*/ - float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL, *InputRef_x=NULL, *InputRef_y=NULL, *InputRef_z=NULL; - DimTotal = (long)(dimX*dimY*dimZ); - - Output_prev = calloc(DimTotal, sizeof(float)); - P1 = calloc(DimTotal, sizeof(float)); - P2 = calloc(DimTotal, sizeof(float)); + float *P3=NULL, *P3_prev=NULL, *R3=NULL, *InputRef_z=NULL; + P3 = calloc(DimTotal, sizeof(float)); - P1_prev = calloc(DimTotal, sizeof(float)); - P2_prev = calloc(DimTotal, sizeof(float)); P3_prev = calloc(DimTotal, sizeof(float)); - R1 = calloc(DimTotal, sizeof(float)); - R2 = calloc(DimTotal, sizeof(float)); - R3 = calloc(DimTotal, sizeof(float)); - InputRef_x = calloc(DimTotal, sizeof(float)); - InputRef_y = calloc(DimTotal, sizeof(float)); - InputRef_z = calloc(DimTotal, sizeof(float)); + R3 = calloc(DimTotal, sizeof(float)); + InputRef_z = calloc(DimTotal, sizeof(float)); /* calculate gradient field (smoothed) for the reference volume */ GradNorm_func3D(InputRef, InputRef_x, InputRef_y, InputRef_z, eta, (long)(dimX), (long)(dimY), (long)(dimZ)); - + /* begin iterations */ for(ll=0; ll<iterationsNumb; ll++) { - /*projects a 3D vector field R-1,2,3 onto the orthogonal complement of another 3D vector field InputRef_xyz*/ + if ((epsil != 0.0f) && (ll % 5 == 0)) copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); + + /*projects a 3D vector field R-1,2,3 onto the orthogonal complement of another 3D vector field InputRef_xyz*/ ProjectVect_func3D(R1, R2, R3, InputRef_x, InputRef_y, InputRef_z, (long)(dimX), (long)(dimY), (long)(dimZ)); - + /* computing the gradient of the objective function */ Obj_dfunc3D(Input, Output, R1, R2, R3, lambdaPar, (long)(dimX), (long)(dimY), (long)(dimZ)); - + /* apply nonnegativity */ - if (nonneg == 1) for(j=0; j<DimTotal; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;} - + if (nonneg == 1) for(j=0; j<DimTotal; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;} + /*Taking a step towards minus of the gradient*/ Grad_dfunc3D(P1, P2, P3, Output, R1, R2, R3, InputRef_x, InputRef_y, InputRef_z, lambdaPar, (long)(dimX), (long)(dimY), (long)(dimZ)); - + /* projection step */ Proj_dfunc3D(P1, P2, P3, methodTV, DimTotal); - + /*updating R and t*/ tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; Rupd_dfunc3D(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, DimTotal); - - /* calculate norm - stopping rules*/ - re = 0.0f; re1 = 0.0f; - for(j=0; j<DimTotal; j++) - { - re += pow(Output[j] - Output_prev[j],2); - re1 += pow(Output[j],2); - } - re = sqrt(re)/sqrt(re1); - /* stop if the norm residual is less than the tolerance EPS */ - if (re < epsil) count++; - if (count > 4) break; - - /*storing old values*/ - copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); + + /*storing old values*/ copyIm(P1, P1_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); copyIm(P2, P2_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); copyIm(P3, P3_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); - tk = tkp1; - } - if (printM == 1) printf("FGP-dTV iterations stopped at iteration %i \n", ll); - free(Output_prev); free(P1); free(P2); free(P3); free(P1_prev); free(P2_prev); free(P3_prev); free(R1); free(R2); free(R3); free(InputRef_x); free(InputRef_y); free(InputRef_z); + tk = tkp1; + + /* check early stopping criteria */ + if ((epsil != 0.0f) && (ll % 5 == 0)) { + re = 0.0f; re1 = 0.0f; + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); + if (re < epsil) count++; + if (count > 3) break; + } + } + + free(P3); free(P3_prev); free(R3); free(InputRef_z); } - return *Output; + if (epsil != 0.0f) free(Output_prev); + free(P1); free(P2); free(P1_prev); free(P2_prev); free(R1); free(R2); free(InputRef_x); free(InputRef_y); + + /*adding info into info_vector */ + infovector[0] = (float)(ll); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + + return 0; } @@ -248,11 +249,11 @@ float Grad_dfunc2D(float *P1, float *P2, float *D, float *R1, float *R2, float * /* boundary conditions */ if (i == dimX-1) val1 = 0.0f; else val1 = D[index] - D[j*dimX + (i+1)]; if (j == dimY-1) val2 = 0.0f; else val2 = D[index] - D[(j+1)*dimX + i]; - + in_prod = val1*B_x[index] + val2*B_y[index]; /* calculate inner product */ val1 = val1 - in_prod*B_x[index]; val2 = val2 - in_prod*B_y[index]; - + P1[index] = R1[index] + multip*val1; P2[index] = R2[index] + multip*val2; @@ -295,7 +296,7 @@ float Rupd_dfunc2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1 float multip; multip = ((tk-1.0f)/tkp1); #pragma omp parallel for shared(P1,P2,P1_old,P2_old,R1,R2,multip) private(i) - for(i=0; i<DimTotal; i++) { + for(i=0; i<DimTotal; i++) { R1[i] = P1[i] + multip*(P1[i] - P1_old[i]); R2[i] = P2[i] + multip*(P2[i] - P2_old[i]); } @@ -314,12 +315,12 @@ float GradNorm_func3D(float *B, float *B_x, float *B_y, float *B_z, float eta, l for(j=0; j<dimY; j++) { for(k=0; k<dimZ; k++) { index = (dimX*dimY)*k + j*dimX+i; - + /* zero boundary conditions */ if (i == dimX-1) {val1 = 0.0f;} else {val1 = B[(dimX*dimY)*k + j*dimX+(i+1)];} if (j == dimY-1) {val2 = 0.0f;} else {val2 = B[(dimX*dimY)*k + (j+1)*dimX+i];} if (k == dimZ-1) {val3 = 0.0f;} else {val3 = B[(dimX*dimY)*(k+1) + (j)*dimX+i];} - + gradX = val1 - B[index]; gradY = val2 - B[index]; gradZ = val3 - B[index]; @@ -375,17 +376,17 @@ float Grad_dfunc3D(float *P1, float *P2, float *P3, float *D, float *R1, float * for(i=0; i<dimX; i++) { for(j=0; j<dimY; j++) { for(k=0; k<dimZ; k++) { - index = (dimX*dimY)*k + j*dimX+i; + index = (dimX*dimY)*k + j*dimX+i; /* boundary conditions */ if (i == dimX-1) val1 = 0.0f; else val1 = D[index] - D[(dimX*dimY)*k + j*dimX + (i+1)]; if (j == dimY-1) val2 = 0.0f; else val2 = D[index] - D[(dimX*dimY)*k + (j+1)*dimX + i]; if (k == dimZ-1) val3 = 0.0f; else val3 = D[index] - D[(dimX*dimY)*(k+1) + j*dimX + i]; - + in_prod = val1*B_x[index] + val2*B_y[index] + val3*B_z[index]; /* calculate inner product */ val1 = val1 - in_prod*B_x[index]; val2 = val2 - in_prod*B_y[index]; val3 = val3 - in_prod*B_z[index]; - + P1[index] = R1[index] + multip*val1; P2[index] = R2[index] + multip*val2; P3[index] = R3[index] + multip*val3; @@ -393,13 +394,13 @@ float Grad_dfunc3D(float *P1, float *P2, float *P3, float *D, float *R1, float * return 1; } float Proj_dfunc3D(float *P1, float *P2, float *P3, int methTV, long DimTotal) -{ +{ float val1, val2, val3, denom, sq_denom; long i; if (methTV == 0) { /* isotropic TV*/ #pragma omp parallel for shared(P1,P2,P3) private(i,val1,val2,val3,sq_denom) - for(i=0; i<DimTotal; i++) { + for(i=0; i<DimTotal; i++) { denom = powf(P1[i],2) + powf(P2[i],2) + powf(P3[i],2); if (denom > 1.0f) { sq_denom = 1.0f/sqrtf(denom); @@ -408,7 +409,7 @@ float Proj_dfunc3D(float *P1, float *P2, float *P3, int methTV, long DimTotal) P3[i] = P3[i]*sq_denom; } } - } + } else { /* anisotropic TV*/ #pragma omp parallel for shared(P1,P2,P3) private(i,val1,val2,val3) @@ -418,7 +419,7 @@ float Proj_dfunc3D(float *P1, float *P2, float *P3, int methTV, long DimTotal) val3 = fabs(P3[i]); if (val1 < 1.0f) {val1 = 1.0f;} if (val2 < 1.0f) {val2 = 1.0f;} - if (val3 < 1.0f) {val3 = 1.0f;} + if (val3 < 1.0f) {val3 = 1.0f;} P1[i] = P1[i]/val1; P2[i] = P2[i]/val2; P3[i] = P3[i]/val3; diff --git a/src/Core/regularisers_CPU/FGP_dTV_core.h b/src/Core/regularisers_CPU/FGP_dTV_core.h index 442dd30..9ace06d 100644 --- a/src/Core/regularisers_CPU/FGP_dTV_core.h +++ b/src/Core/regularisers_CPU/FGP_dTV_core.h @@ -36,23 +36,24 @@ limitations under the License. * 3. lambdaPar - regularization parameter [REQUIRED] * 4. Number of iterations [OPTIONAL] * 5. eplsilon: tolerance constant [OPTIONAL] - * 6. eta: smoothing constant to calculate gradient of the reference [OPTIONAL] * + * 6. eta: smoothing constant to calculate gradient of the reference [OPTIONAL] * * 7. TV-type: methodTV - 'iso' (0) or 'l1' (1) [OPTIONAL] * 8. nonneg: 'nonnegativity (0 is OFF by default) [OPTIONAL] * 9. print information: 0 (off) or 1 (on) [OPTIONAL] * * Output: * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the Matlab's codes and papers by * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" * [2] M. J. Ehrhardt and M. M. Betcke, Multi-Contrast MRI Reconstruction with Structure-Guided Total Variation, SIAM Journal on Imaging Sciences 9(3), pp. 1084–1106 */ - + #ifdef __cplusplus extern "C" { #endif -CCPI_EXPORT float dTV_FGP_CPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); +CCPI_EXPORT float dTV_FGP_CPU_main(float *Input, float *InputRef, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int dimX, int dimY, int dimZ); CCPI_EXPORT float GradNorm_func2D(float *B, float *B_x, float *B_y, float eta, long dimX, long dimY); CCPI_EXPORT float ProjectVect_func2D(float *R1, float *R2, float *B_x, float *B_y, long dimX, long dimY); diff --git a/src/Core/regularisers_CPU/LLT_ROF_core.c b/src/Core/regularisers_CPU/LLT_ROF_core.c index 8416a14..1064340 100644 --- a/src/Core/regularisers_CPU/LLT_ROF_core.c +++ b/src/Core/regularisers_CPU/LLT_ROF_core.c @@ -18,7 +18,7 @@ limitations under the License. */ #include "LLT_ROF_core.h" -#define EPS_LLT 0.01 +#define EPS_LLT 1.0e-12 #define EPS_ROF 1.0e-12 #define MAX(x, y) (((x) > (y)) ? (x) : (y)) #define MIN(x, y) (((x) < (y)) ? (x) : (y)) @@ -29,49 +29,57 @@ int signLLT(float x) { } /* C-OMP implementation of Lysaker, Lundervold and Tai (LLT) model [1] combined with Rudin-Osher-Fatemi [2] TV regularisation penalty. - * -* This penalty can deliver visually pleasant piecewise-smooth recovery if regularisation parameters are selected well. -* The rule of thumb for selection is to start with lambdaLLT = 0 (just the ROF-TV model) and then proceed to increase -* lambdaLLT starting with smaller values. + * +* This penalty can deliver visually pleasant piecewise-smooth recovery if regularisation parameters are selected well. +* The rule of thumb for selection is to start with lambdaLLT = 0 (just the ROF-TV model) and then proceed to increase +* lambdaLLT starting with smaller values. * * Input Parameters: * 1. U0 - original noise image/volume * 2. lambdaROF - ROF-related regularisation parameter * 3. lambdaLLT - LLT-related regularisation parameter -* 4. tau - time-marching step +* 4. tau - time-marching step * 5. iter - iterations number (for both models) +* 6. eplsilon: tolerance constant * * Output: -* Filtered/regularised image +* [1] Filtered/regularized image/volume +* [2] Information vector which contains [iteration no., reached tolerance] * -* References: +* References: * [1] Lysaker, M., Lundervold, A. and Tai, X.C., 2003. Noise removal using fourth-order partial differential equation with applications to medical magnetic resonance images in space and time. IEEE Transactions on image processing, 12(12), pp.1579-1590. * [2] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" */ -float LLT_ROF_CPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int dimX, int dimY, int dimZ) +float LLT_ROF_CPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ) { long DimTotal; - int ll; - float *D1_LLT=NULL, *D2_LLT=NULL, *D3_LLT=NULL, *D1_ROF=NULL, *D2_ROF=NULL, *D3_ROF=NULL; - - DimTotal = (long)(dimX*dimY*dimZ); - + int ll, j; + float re, re1; + re = 0.0f; re1 = 0.0f; + int count = 0; + + float *D1_LLT=NULL, *D2_LLT=NULL, *D3_LLT=NULL, *D1_ROF=NULL, *D2_ROF=NULL, *D3_ROF=NULL, *Output_prev=NULL; + DimTotal = (long)(dimX*dimY*dimZ); + D1_ROF = calloc(DimTotal, sizeof(float)); D2_ROF = calloc(DimTotal, sizeof(float)); D3_ROF = calloc(DimTotal, sizeof(float)); - + D1_LLT = calloc(DimTotal, sizeof(float)); D2_LLT = calloc(DimTotal, sizeof(float)); D3_LLT = calloc(DimTotal, sizeof(float)); - + copyIm(Input, Output, (long)(dimX), (long)(dimY), (long)(dimZ)); /* initialize */ - - for(ll = 0; ll < iterationsNumb; ll++) { + if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float)); + + for(ll = 0; ll < iterationsNumb; ll++) { + if ((epsil != 0.0f) && (ll % 5 == 0)) copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); + if (dimZ == 1) { - /* 2D case */ - /****************ROF******************/ - /* calculate first-order differences */ + /* 2D case */ + /****************ROF******************/ + /* calculate first-order differences */ D1_func_ROF(Output, D1_ROF, (long)(dimX), (long)(dimY), 1l); D2_func_ROF(Output, D2_ROF, (long)(dimX), (long)(dimY), 1l); /****************LLT******************/ @@ -81,21 +89,40 @@ float LLT_ROF_CPU_main(float *Input, float *Output, float lambdaROF, float lambd Update2D_LLT_ROF(Input, Output, D1_LLT, D2_LLT, D1_ROF, D2_ROF, lambdaROF, lambdaLLT, tau, (long)(dimX), (long)(dimY), 1l); } else { - /* 3D case */ - /* calculate first-order differences */ + /* 3D case */ + /* calculate first-order differences */ D1_func_ROF(Output, D1_ROF, (long)(dimX), (long)(dimY), (long)(dimZ)); D2_func_ROF(Output, D2_ROF, (long)(dimX), (long)(dimY), (long)(dimZ)); - D3_func_ROF(Output, D3_ROF, (long)(dimX), (long)(dimY), (long)(dimZ)); + D3_func_ROF(Output, D3_ROF, (long)(dimX), (long)(dimY), (long)(dimZ)); /****************LLT******************/ /* estimate second-order derrivatives */ der3D_LLT(Output, D1_LLT, D2_LLT, D3_LLT,(long)(dimX), (long)(dimY), (long)(dimZ)); /* Joint update for ROF and LLT models */ Update3D_LLT_ROF(Input, Output, D1_LLT, D2_LLT, D3_LLT, D1_ROF, D2_ROF, D3_ROF, lambdaROF, lambdaLLT, tau, (long)(dimX), (long)(dimY), (long)(dimZ)); - } - } /*end of iterations*/ + } + + /* check early stopping criteria */ + if ((epsil != 0.0f) && (ll % 5 == 0)) { + re = 0.0f; re1 = 0.0f; + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); + if (re < epsil) count++; + if (count > 3) break; + } + + } /*end of iterations*/ free(D1_LLT);free(D2_LLT);free(D3_LLT); free(D1_ROF);free(D2_ROF);free(D3_ROF); - return *Output; + if (epsil != 0.0f) free(Output_prev); + + /*adding info into info_vector */ + infovector[0] = (float)(ll); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; } /*************************************************************************/ @@ -143,17 +170,17 @@ float der3D_LLT(float *U, float *D1, float *D2, float *D3, long dimX, long dimY, j_m = j - 1; if (j_m < 0) j_m = j + 1; k_p = k + 1; if (k_p == dimZ) k_p = k - 1; k_m = k - 1; if (k_m < 0) k_m = k + 1; - + index = (dimX*dimY)*k + j*dimX+i; - + dxx = U[(dimX*dimY)*k + j*dimX+i_p] - 2.0f*U[index] + U[(dimX*dimY)*k + j*dimX+i_m]; dyy = U[(dimX*dimY)*k + j_p*dimX+i] - 2.0f*U[index] + U[(dimX*dimY)*k + j_m*dimX+i]; dzz = U[(dimX*dimY)*k_p + j*dimX+i] - 2.0f*U[index] + U[(dimX*dimY)*k_m + j*dimX+i]; - + denom_xx = fabs(dxx) + EPS_LLT; denom_yy = fabs(dyy) + EPS_LLT; denom_zz = fabs(dzz) + EPS_LLT; - + D1[index] = dxx / denom_xx; D2[index] = dyy / denom_yy; D3[index] = dzz / denom_zz; @@ -172,7 +199,7 @@ float D1_func_ROF(float *A, float *D1, long dimX, long dimY, long dimZ) { float NOMx_1, NOMy_1, NOMy_0, NOMz_1, NOMz_0, denom1, denom2,denom3, T1; long i,j,k,i1,i2,k1,j1,j2,k2,index; - + if (dimZ > 1) { #pragma omp parallel for shared (A, D1, dimX, dimY, dimZ) private(index, i, j, k, i1, j1, k1, i2, j2, k2, NOMx_1,NOMy_1,NOMy_0,NOMz_1,NOMz_0,denom1,denom2,denom3,T1) for(j=0; j<dimY; j++) { @@ -186,17 +213,17 @@ float D1_func_ROF(float *A, float *D1, long dimX, long dimY, long dimZ) j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /* Forward-backward differences */ NOMx_1 = A[(dimX*dimY)*k + j1*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[(dimX*dimY)*k + j*dimX + i1] - A[index]; /* y+ */ /*NOMx_0 = (A[(i)*dimY + j] - A[(i2)*dimY + j]); */ /* x- */ NOMy_0 = A[index] - A[(dimX*dimY)*k + j*dimX + i2]; /* y- */ - + NOMz_1 = A[(dimX*dimY)*k1 + j*dimX + i] - A[index]; /* z+ */ NOMz_0 = A[index] - A[(dimX*dimY)*k2 + j*dimX + i]; /* z- */ - - + + denom1 = NOMx_1*NOMx_1; denom2 = 0.5f*(signLLT(NOMy_1) + signLLT(NOMy_0))*(MIN(fabs(NOMy_1),fabs(NOMy_0))); denom2 = denom2*denom2; @@ -216,13 +243,13 @@ float D1_func_ROF(float *A, float *D1, long dimX, long dimY, long dimZ) i2 = i - 1; if (i2 < 0) i2 = i+1; j1 = j + 1; if (j1 >= dimY) j1 = j-1; j2 = j - 1; if (j2 < 0) j2 = j+1; - + /* Forward-backward differences */ NOMx_1 = A[j1*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[j*dimX + i1] - A[index]; /* y+ */ /*NOMx_0 = (A[(i)*dimY + j] - A[(i2)*dimY + j]); */ /* x- */ NOMy_0 = A[index] - A[(j)*dimX + i2]; /* y- */ - + denom1 = NOMx_1*NOMx_1; denom2 = 0.5f*(signLLT(NOMy_1) + signLLT(NOMy_0))*(MIN(fabs(NOMy_1),fabs(NOMy_0))); denom2 = denom2*denom2; @@ -237,7 +264,7 @@ float D2_func_ROF(float *A, float *D2, long dimX, long dimY, long dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2; long i,j,k,i1,i2,k1,j1,j2,k2,index; - + if (dimZ > 1) { #pragma omp parallel for shared (A, D2, dimX, dimY, dimZ) private(index, i, j, k, i1, j1, k1, i2, j2, k2, NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2) for(j=0; j<dimY; j++) { @@ -251,16 +278,16 @@ float D2_func_ROF(float *A, float *D2, long dimX, long dimY, long dimZ) j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - - + + /* Forward-backward differences */ NOMx_1 = A[(dimX*dimY)*k + (j1)*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[(dimX*dimY)*k + (j)*dimX + i1] - A[index]; /* y+ */ NOMx_0 = A[index] - A[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = A[(dimX*dimY)*k1 + j*dimX + i] - A[index]; /* z+ */ NOMz_0 = A[index] - A[(dimX*dimY)*k2 + (j)*dimX + i]; /* z- */ - - + + denom1 = NOMy_1*NOMy_1; denom2 = 0.5f*(signLLT(NOMx_1) + signLLT(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0))); denom2 = denom2*denom2; @@ -280,13 +307,13 @@ float D2_func_ROF(float *A, float *D2, long dimX, long dimY, long dimZ) i2 = i - 1; if (i2 < 0) i2 = i+1; j1 = j + 1; if (j1 >= dimY) j1 = j-1; j2 = j - 1; if (j2 < 0) j2 = j+1; - + /* Forward-backward differences */ NOMx_1 = A[j1*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[j*dimX + i1] - A[index]; /* y+ */ NOMx_0 = A[index] - A[j2*dimX + i]; /* x- */ /*NOMy_0 = A[(i)*dimY + j] - A[(i)*dimY + j2]; */ /* y- */ - + denom1 = NOMy_1*NOMy_1; denom2 = 0.5f*(signLLT(NOMx_1) + signLLT(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0))); denom2 = denom2*denom2; @@ -302,7 +329,7 @@ float D3_func_ROF(float *A, float *D3, long dimX, long dimY, long dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMy_0, NOMz_1, denom1, denom2, denom3, T3; long index,i,j,k,i1,i2,k1,j1,j2,k2; - + #pragma omp parallel for shared (A, D3, dimX, dimY, dimZ) private(index, i, j, k, i1, j1, k1, i2, j2, k2, NOMx_1, NOMy_1, NOMy_0, NOMx_0, NOMz_1, denom1, denom2, denom3, T3) for(j=0; j<dimY; j++) { for(i=0; i<dimX; i++) { @@ -315,7 +342,7 @@ float D3_func_ROF(float *A, float *D3, long dimX, long dimY, long dimZ) j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /* Forward-backward differences */ NOMx_1 = A[(dimX*dimY)*k + (j1)*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[(dimX*dimY)*k + (j)*dimX + i1] - A[index]; /* y+ */ @@ -323,7 +350,7 @@ float D3_func_ROF(float *A, float *D3, long dimX, long dimY, long dimZ) NOMx_0 = A[index] - A[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = A[(dimX*dimY)*k1 + j*dimX + i] - A[index]; /* z+ */ /*NOMz_0 = A[(dimX*dimY)*k + (i)*dimY + j] - A[(dimX*dimY)*k2 + (i)*dimY + j]; */ /* z- */ - + denom1 = NOMz_1*NOMz_1; denom2 = 0.5f*(signLLT(NOMx_1) + signLLT(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0))); denom2 = denom2*denom2; @@ -352,19 +379,19 @@ float Update2D_LLT_ROF(float *U0, float *U, float *D1_LLT, float *D2_LLT, float i_m = i - 1; if (i_m < 0) i_m = i + 1; j_p = j + 1; if (j_p == dimY) j_p = j - 1; j_m = j - 1; if (j_m < 0) j_m = j + 1; - + /*LLT-related part*/ dxx = D1_LLT[j*dimX+i_p] - 2.0f*D1_LLT[index] + D1_LLT[j*dimX+i_m]; dyy = D2_LLT[j_p*dimX+i] - 2.0f*D2_LLT[index] + D2_LLT[j_m*dimX+i]; laplc = dxx + dyy; /*build Laplacian*/ - + /*ROF-related part*/ dv1 = D1_ROF[index] - D1_ROF[j_m*dimX + i]; dv2 = D2_ROF[index] - D2_ROF[j*dimX + i_m]; div = dv1 + dv2; /*build Divirgent*/ - + /*combine all into one cost function to minimise */ - U[index] += tau*(2.0f*lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); + U[index] += tau*(lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); } } return *U; @@ -385,26 +412,25 @@ float Update3D_LLT_ROF(float *U0, float *U, float *D1_LLT, float *D2_LLT, float j_m = j - 1; if (j_m < 0) j_m = j + 1; k_p = k + 1; if (k_p == dimZ) k_p = k - 1; k_m = k - 1; if (k_m < 0) k_m = k + 1; - + index = (dimX*dimY)*k + j*dimX+i; - + /*LLT-related part*/ dxx = D1_LLT[(dimX*dimY)*k + j*dimX+i_p] - 2.0f*D1_LLT[index] + D1_LLT[(dimX*dimY)*k + j*dimX+i_m]; dyy = D2_LLT[(dimX*dimY)*k + j_p*dimX+i] - 2.0f*D2_LLT[index] + D2_LLT[(dimX*dimY)*k + j_m*dimX+i]; dzz = D3_LLT[(dimX*dimY)*k_p + j*dimX+i] - 2.0f*D3_LLT[index] + D3_LLT[(dimX*dimY)*k_m + j*dimX+i]; laplc = dxx + dyy + dzz; /*build Laplacian*/ - + /*ROF-related part*/ dv1 = D1_ROF[index] - D1_ROF[(dimX*dimY)*k + j_m*dimX+i]; dv2 = D2_ROF[index] - D2_ROF[(dimX*dimY)*k + j*dimX+i_m]; dv3 = D3_ROF[index] - D3_ROF[(dimX*dimY)*k_m + j*dimX+i]; div = dv1 + dv2 + dv3; /*build Divirgent*/ - + /*combine all into one cost function to minimise */ - U[index] += tau*(2.0f*lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); + U[index] += tau*(lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); } } } return *U; } - diff --git a/src/Core/regularisers_CPU/LLT_ROF_core.h b/src/Core/regularisers_CPU/LLT_ROF_core.h index 8e6591e..abf0d60 100644 --- a/src/Core/regularisers_CPU/LLT_ROF_core.h +++ b/src/Core/regularisers_CPU/LLT_ROF_core.h @@ -26,22 +26,22 @@ limitations under the License. #include "CCPiDefines.h" /* C-OMP implementation of Lysaker, Lundervold and Tai (LLT) model [1] combined with Rudin-Osher-Fatemi [2] TV regularisation penalty. - * -* This penalty can deliver visually pleasant piecewise-smooth recovery if regularisation parameters are selected well. -* The rule of thumb for selection is to start with lambdaLLT = 0 (just the ROF-TV model) and then proceed to increase -* lambdaLLT starting with smaller values. + * +* This penalty can deliver visually pleasant piecewise-smooth recovery if regularisation parameters are selected well. +* The rule of thumb for selection is to start with lambdaLLT = 0 (just the ROF-TV model) and then proceed to increase +* lambdaLLT starting with smaller values. * * Input Parameters: * 1. U0 - original noise image/volume * 2. lambdaROF - ROF-related regularisation parameter * 3. lambdaLLT - LLT-related regularisation parameter -* 4. tau - time-marching step +* 4. tau - time-marching step * 5. iter - iterations number (for both models) * * Output: * Filtered/regularised image * -* References: +* References: * [1] Lysaker, M., Lundervold, A. and Tai, X.C., 2003. Noise removal using fourth-order partial differential equation with applications to medical magnetic resonance images in space and time. IEEE Transactions on image processing, 12(12), pp.1579-1590. * [2] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" */ @@ -49,7 +49,7 @@ limitations under the License. #ifdef __cplusplus extern "C" { #endif -CCPI_EXPORT float LLT_ROF_CPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int dimX, int dimY, int dimZ); +CCPI_EXPORT float LLT_ROF_CPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ); CCPI_EXPORT float der2D_LLT(float *U, float *D1, float *D2, long dimX, long dimY, long dimZ); CCPI_EXPORT float der3D_LLT(float *U, float *D1, float *D2, float *D3, long dimX, long dimY, long dimZ); diff --git a/src/Core/regularisers_CPU/ROF_TV_core.c b/src/Core/regularisers_CPU/ROF_TV_core.c index 1858442..6d23eef 100644 --- a/src/Core/regularisers_CPU/ROF_TV_core.c +++ b/src/Core/regularisers_CPU/ROF_TV_core.c @@ -19,7 +19,7 @@ #include "ROF_TV_core.h" -#define EPS 1.0e-12 +#define EPS 1.0e-8 #define MAX(x, y) (((x) > (y)) ? (x) : (y)) #define MIN(x, y) (((x) < (y)) ? (x) : (y)) @@ -31,45 +31,72 @@ int sign(float x) { /* C-OMP implementation of ROF-TV denoising/regularization model [1] (2D/3D case) * - * + * * Input Parameters: * 1. Noisy image/volume [REQUIRED] * 2. lambda - regularization parameter [REQUIRED] * 3. tau - marching step for explicit scheme, ~1 is recommended [REQUIRED] * 4. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED] + * 5. eplsilon: tolerance constant * * Output: - * [1] Regularized image/volume + * [1] Regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" */ /* Running iterations of TV-ROF function */ -float TV_ROF_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ) +float TV_ROF_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ) { - float *D1, *D2, *D3; - int i; - long DimTotal; - DimTotal = (long)(dimX*dimY*dimZ); - + float *D1=NULL, *D2=NULL, *D3=NULL, *Output_prev=NULL; + float re, re1; + re = 0.0f; re1 = 0.0f; + int count = 0; + int i; + long DimTotal,j; + DimTotal = (long)(dimX*dimY*dimZ); + D1 = calloc(DimTotal, sizeof(float)); D2 = calloc(DimTotal, sizeof(float)); D3 = calloc(DimTotal, sizeof(float)); - + /* copy into output */ copyIm(Input, Output, (long)(dimX), (long)(dimY), (long)(dimZ)); - + if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float)); + /* start TV iterations */ - for(i=0; i < iterationsNumb; i++) { + for(i=0; i < iterationsNumb; i++) { + if ((epsil != 0.0f) && (i % 5 == 0)) copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); + /* calculate differences */ D1_func(Output, D1, (long)(dimX), (long)(dimY), (long)(dimZ)); D2_func(Output, D2, (long)(dimX), (long)(dimY), (long)(dimZ)); - if (dimZ > 1) D3_func(Output, D3, (long)(dimX), (long)(dimY), (long)(dimZ)); + if (dimZ > 1) D3_func(Output, D3, (long)(dimX), (long)(dimY), (long)(dimZ)); TV_kernel(D1, D2, D3, Output, Input, lambdaPar, tau, (long)(dimX), (long)(dimY), (long)(dimZ)); - } + + /* check early stopping criteria */ + if ((epsil != 0.0f) && (i % 5 == 0)) { + re = 0.0f; re1 = 0.0f; + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); + if (re < epsil) count++; + if (count > 3) break; + } + } free(D1);free(D2); free(D3); - return *Output; + if (epsil != 0.0f) free(Output_prev); + + /*adding info into info_vector */ + infovector[0] = (float)(i); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + + return 0; } /* calculate differences 1 */ @@ -77,7 +104,7 @@ float D1_func(float *A, float *D1, long dimX, long dimY, long dimZ) { float NOMx_1, NOMy_1, NOMy_0, NOMz_1, NOMz_0, denom1, denom2,denom3, T1; long i,j,k,i1,i2,k1,j1,j2,k2,index; - + if (dimZ > 1) { #pragma omp parallel for shared (A, D1, dimX, dimY, dimZ) private(index, i, j, k, i1, j1, k1, i2, j2, k2, NOMx_1,NOMy_1,NOMy_0,NOMz_1,NOMz_0,denom1,denom2,denom3,T1) for(j=0; j<dimY; j++) { @@ -90,18 +117,18 @@ float D1_func(float *A, float *D1, long dimX, long dimY, long dimZ) j1 = j + 1; if (j1 >= dimY) j1 = j-1; j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; - k2 = k - 1; if (k2 < 0) k2 = k+1; - + k2 = k - 1; if (k2 < 0) k2 = k+1; + /* Forward-backward differences */ NOMx_1 = A[(dimX*dimY)*k + j1*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[(dimX*dimY)*k + j*dimX + i1] - A[index]; /* y+ */ /*NOMx_0 = (A[(i)*dimY + j] - A[(i2)*dimY + j]); */ /* x- */ NOMy_0 = A[index] - A[(dimX*dimY)*k + j*dimX + i2]; /* y- */ - + NOMz_1 = A[(dimX*dimY)*k1 + j*dimX + i] - A[index]; /* z+ */ NOMz_0 = A[index] - A[(dimX*dimY)*k2 + j*dimX + i]; /* z- */ - - + + denom1 = NOMx_1*NOMx_1; denom2 = 0.5f*(sign(NOMy_1) + sign(NOMy_0))*(MIN(fabs(NOMy_1),fabs(NOMy_0))); denom2 = denom2*denom2; @@ -121,13 +148,13 @@ float D1_func(float *A, float *D1, long dimX, long dimY, long dimZ) i2 = i - 1; if (i2 < 0) i2 = i+1; j1 = j + 1; if (j1 >= dimY) j1 = j-1; j2 = j - 1; if (j2 < 0) j2 = j+1; - + /* Forward-backward differences */ NOMx_1 = A[j1*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[j*dimX + i1] - A[index]; /* y+ */ /*NOMx_0 = (A[(i)*dimY + j] - A[(i2)*dimY + j]); */ /* x- */ NOMy_0 = A[index] - A[(j)*dimX + i2]; /* y- */ - + denom1 = NOMx_1*NOMx_1; denom2 = 0.5f*(sign(NOMy_1) + sign(NOMy_0))*(MIN(fabs(NOMy_1),fabs(NOMy_0))); denom2 = denom2*denom2; @@ -142,7 +169,7 @@ float D2_func(float *A, float *D2, long dimX, long dimY, long dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2; long i,j,k,i1,i2,k1,j1,j2,k2,index; - + if (dimZ > 1) { #pragma omp parallel for shared (A, D2, dimX, dimY, dimZ) private(index, i, j, k, i1, j1, k1, i2, j2, k2, NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2) for(j=0; j<dimY; j++) { @@ -155,16 +182,16 @@ float D2_func(float *A, float *D2, long dimX, long dimY, long dimZ) j1 = j + 1; if (j1 >= dimY) j1 = j-1; j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; - k2 = k - 1; if (k2 < 0) k2 = k+1; - + k2 = k - 1; if (k2 < 0) k2 = k+1; + /* Forward-backward differences */ NOMx_1 = A[(dimX*dimY)*k + (j1)*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[(dimX*dimY)*k + (j)*dimX + i1] - A[index]; /* y+ */ NOMx_0 = A[index] - A[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = A[(dimX*dimY)*k1 + j*dimX + i] - A[index]; /* z+ */ NOMz_0 = A[index] - A[(dimX*dimY)*k2 + (j)*dimX + i]; /* z- */ - - + + denom1 = NOMy_1*NOMy_1; denom2 = 0.5f*(sign(NOMx_1) + sign(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0))); denom2 = denom2*denom2; @@ -184,13 +211,13 @@ float D2_func(float *A, float *D2, long dimX, long dimY, long dimZ) i2 = i - 1; if (i2 < 0) i2 = i+1; j1 = j + 1; if (j1 >= dimY) j1 = j-1; j2 = j - 1; if (j2 < 0) j2 = j+1; - + /* Forward-backward differences */ NOMx_1 = A[j1*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[j*dimX + i1] - A[index]; /* y+ */ NOMx_0 = A[index] - A[j2*dimX + i]; /* x- */ /*NOMy_0 = A[(i)*dimY + j] - A[(i)*dimY + j2]; */ /* y- */ - + denom1 = NOMy_1*NOMy_1; denom2 = 0.5f*(sign(NOMx_1) + sign(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0))); denom2 = denom2*denom2; @@ -206,7 +233,7 @@ float D3_func(float *A, float *D3, long dimX, long dimY, long dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMy_0, NOMz_1, denom1, denom2, denom3, T3; long index,i,j,k,i1,i2,k1,j1,j2,k2; - + #pragma omp parallel for shared (A, D3, dimX, dimY, dimZ) private(index, i, j, k, i1, j1, k1, i2, j2, k2, NOMx_1, NOMy_1, NOMy_0, NOMx_0, NOMz_1, denom1, denom2, denom3, T3) for(j=0; j<dimY; j++) { for(i=0; i<dimX; i++) { @@ -219,7 +246,7 @@ float D3_func(float *A, float *D3, long dimX, long dimY, long dimZ) j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /* Forward-backward differences */ NOMx_1 = A[(dimX*dimY)*k + (j1)*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[(dimX*dimY)*k + (j)*dimX + i1] - A[index]; /* y+ */ @@ -227,7 +254,7 @@ float D3_func(float *A, float *D3, long dimX, long dimY, long dimZ) NOMx_0 = A[index] - A[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = A[(dimX*dimY)*k1 + j*dimX + i] - A[index]; /* z+ */ /*NOMz_0 = A[(dimX*dimY)*k + (i)*dimY + j] - A[(dimX*dimY)*k2 + (i)*dimY + j]; */ /* z- */ - + denom1 = NOMz_1*NOMz_1; denom2 = 0.5f*(sign(NOMx_1) + sign(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0))); denom2 = denom2*denom2; @@ -244,7 +271,7 @@ float TV_kernel(float *D1, float *D2, float *D3, float *B, float *A, float lambd { float dv1, dv2, dv3; long index,i,j,k,i1,i2,k1,j1,j2,k2; - + if (dimZ > 1) { #pragma omp parallel for shared (D1, D2, D3, B, dimX, dimY, dimZ) private(index, i, j, k, i1, j1, k1, i2, j2, k2, dv1,dv2,dv3) for(j=0; j<dimY; j++) { @@ -258,13 +285,13 @@ float TV_kernel(float *D1, float *D2, float *D3, float *B, float *A, float lambd j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /*divergence components */ dv1 = D1[index] - D1[(dimX*dimY)*k + j2*dimX+i]; dv2 = D2[index] - D2[(dimX*dimY)*k + j*dimX+i2]; dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i]; - - B[index] += tau*(2.0f*lambda*(dv1 + dv2 + dv3) - (B[index] - A[index])); + + B[index] += tau*(lambda*(dv1 + dv2 + dv3) - (B[index] - A[index])); }}} } else { @@ -277,12 +304,12 @@ float TV_kernel(float *D1, float *D2, float *D3, float *B, float *A, float lambd i2 = i - 1; if (i2 < 0) i2 = i+1; j1 = j + 1; if (j1 >= dimY) j1 = j-1; j2 = j - 1; if (j2 < 0) j2 = j+1; - + /* divergence components */ dv1 = D1[index] - D1[j2*dimX + i]; dv2 = D2[index] - D2[j*dimX + i2]; - B[index] += tau*(2.0f*lambda*(dv1 + dv2) - (B[index] - A[index])); + B[index] += tau*(lambda*(dv1 + dv2) - (B[index] - A[index])); }} } return *B; diff --git a/src/Core/regularisers_CPU/ROF_TV_core.h b/src/Core/regularisers_CPU/ROF_TV_core.h index 4e320e9..d6949fa 100644 --- a/src/Core/regularisers_CPU/ROF_TV_core.h +++ b/src/Core/regularisers_CPU/ROF_TV_core.h @@ -31,11 +31,13 @@ limitations under the License. * Input Parameters: * 1. Noisy image/volume [REQUIRED] * 2. lambda - regularization parameter [REQUIRED] - * 3. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED] - * 4. tau - marching step for explicit scheme, ~1 is recommended [REQUIRED] + * 3. tau - marching step for explicit scheme, ~1 is recommended [REQUIRED] + * 4. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED] + * 5. eplsilon: tolerance constant * * Output: * [1] Regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" @@ -46,7 +48,7 @@ limitations under the License. #ifdef __cplusplus extern "C" { #endif -CCPI_EXPORT float TV_ROF_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ); +CCPI_EXPORT float TV_ROF_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ); CCPI_EXPORT float TV_kernel(float *D1, float *D2, float *D3, float *B, float *A, float lambda, float tau, long dimX, long dimY, long dimZ); CCPI_EXPORT float D1_func(float *A, float *D1, long dimX, long dimY, long dimZ); @@ -54,4 +56,4 @@ CCPI_EXPORT float D2_func(float *A, float *D2, long dimX, long dimY, long dimZ); CCPI_EXPORT float D3_func(float *A, float *D3, long dimX, long dimY, long dimZ); #ifdef __cplusplus } -#endif
\ No newline at end of file +#endif diff --git a/src/Core/regularisers_CPU/SB_TV_core.c b/src/Core/regularisers_CPU/SB_TV_core.c index 769ea67..8d80787 100755 --- a/src/Core/regularisers_CPU/SB_TV_core.c +++ b/src/Core/regularisers_CPU/SB_TV_core.c @@ -27,122 +27,120 @@ limitations under the License. * 3. Number of iterations [OPTIONAL parameter] * 4. eplsilon - tolerance constant [OPTIONAL parameter] * 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter] -* 6. print information: 0 (off) or 1 (on) [OPTIONAL parameter] * * Output: -* 1. Filtered/regularized image +* [1] Filtered/regularized image/volume +* [2] Information vector which contains [iteration no., reached tolerance] * * [1]. Goldstein, T. and Osher, S., 2009. The split Bregman method for L1-regularized problems. SIAM journal on imaging sciences, 2(2), pp.323-343. */ - -float SB_TV_CPU_main(float *Input, float *Output, float mu, int iter, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ) + +float SB_TV_CPU_main(float *Input, float *Output, float *infovector, float mu, int iter, float epsil, int methodTV, int dimX, int dimY, int dimZ) { - int ll; - long j, DimTotal; - float re, re1, lambda; + int ll; + long j, DimTotal; + float re, re1, lambda; + re = 0.0f; re1 = 0.0f; int count = 0; mu = 1.0f/mu; lambda = 2.0f*mu; - if (dimZ <= 1) { - /* 2D case */ - float *Output_prev=NULL, *Dx=NULL, *Dy=NULL, *Bx=NULL, *By=NULL; - DimTotal = (long)(dimX*dimY); - - Output_prev = calloc(DimTotal, sizeof(float)); + float *Output_prev=NULL, *Dx=NULL, *Dy=NULL, *Bx=NULL, *By=NULL; + DimTotal = (long)(dimX*dimY*dimZ); + Output_prev = calloc(DimTotal, sizeof(float)); Dx = calloc(DimTotal, sizeof(float)); Dy = calloc(DimTotal, sizeof(float)); Bx = calloc(DimTotal, sizeof(float)); By = calloc(DimTotal, sizeof(float)); - + + if (dimZ == 1) { + /* 2D case */ copyIm(Input, Output, (long)(dimX), (long)(dimY), 1l); /*initialize */ - + /* begin outer SB iterations */ for(ll=0; ll<iter; ll++) { - + /* storing old estimate */ copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l); - + /* perform two GS iterations (normally 2 is enough for the convergence) */ gauss_seidel2D(Output, Input, Output_prev, Dx, Dy, Bx, By, (long)(dimX), (long)(dimY), lambda, mu); copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l); /*GS iteration */ gauss_seidel2D(Output, Input, Output_prev, Dx, Dy, Bx, By, (long)(dimX), (long)(dimY), lambda, mu); - + /* TV-related step */ if (methodTV == 1) updDxDy_shrinkAniso2D(Output, Dx, Dy, Bx, By, (long)(dimX), (long)(dimY), lambda); else updDxDy_shrinkIso2D(Output, Dx, Dy, Bx, By, (long)(dimX), (long)(dimY), lambda); - + /* update for Bregman variables */ updBxBy2D(Output, Dx, Dy, Bx, By, (long)(dimX), (long)(dimY)); - + /* check early stopping criteria if epsilon not equal zero */ - if (epsil != 0) { + if ((epsil != 0.0f) && (ll % 5 == 0)) { re = 0.0f; re1 = 0.0f; - for(j=0; j<DimTotal; j++) { - re += pow(Output[j] - Output_prev[j],2); - re1 += pow(Output[j],2); - } - re = sqrt(re)/sqrt(re1); + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); + /* stop if the norm residual is less than the tolerance EPS */ if (re < epsil) count++; - if (count > 4) break; - } - /*printf("%f %i %i \n", re, ll, count); */ + if (count > 3) break; + } } - if (printM == 1) printf("SB-TV iterations stopped at iteration %i \n", ll); - free(Output_prev); free(Dx); free(Dy); free(Bx); free(By); } else { /* 3D case */ - float *Output_prev=NULL, *Dx=NULL, *Dy=NULL, *Dz=NULL, *Bx=NULL, *By=NULL, *Bz=NULL; - DimTotal = (long)(dimX*dimY*dimZ); - - Output_prev = calloc(DimTotal, sizeof(float)); - Dx = calloc(DimTotal, sizeof(float)); - Dy = calloc(DimTotal, sizeof(float)); + float *Dz=NULL, *Bz=NULL; + Dz = calloc(DimTotal, sizeof(float)); - Bx = calloc(DimTotal, sizeof(float)); - By = calloc(DimTotal, sizeof(float)); Bz = calloc(DimTotal, sizeof(float)); - + copyIm(Input, Output, (long)(dimX), (long)(dimY), (long)(dimZ)); /*initialize */ - + /* begin outer SB iterations */ for(ll=0; ll<iter; ll++) { - + /* storing old estimate */ copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); - + /* perform two GS iterations (normally 2 is enough for the convergence) */ gauss_seidel3D(Output, Input, Output_prev, Dx, Dy, Dz, Bx, By, Bz, (long)(dimX), (long)(dimY), (long)(dimZ), lambda, mu); copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); /*GS iteration */ gauss_seidel3D(Output, Input, Output_prev, Dx, Dy, Dz, Bx, By, Bz, (long)(dimX), (long)(dimY), (long)(dimZ), lambda, mu); - + /* TV-related step */ if (methodTV == 1) updDxDyDz_shrinkAniso3D(Output, Dx, Dy, Dz, Bx, By, Bz, (long)(dimX), (long)(dimY), (long)(dimZ), lambda); else updDxDyDz_shrinkIso3D(Output, Dx, Dy, Dz, Bx, By, Bz, (long)(dimX), (long)(dimY), (long)(dimZ), lambda); - + /* update for Bregman variables */ updBxByBz3D(Output, Dx, Dy, Dz, Bx, By, Bz, (long)(dimX), (long)(dimY), (long)(dimZ)); - + /* check early stopping criteria if epsilon not equal zero */ - if (epsil != 0) { + if ((epsil != 0.0f) && (ll % 5 == 0)) { re = 0.0f; re1 = 0.0f; - for(j=0; j<DimTotal; j++) { - re += pow(Output[j] - Output_prev[j],2); - re1 += pow(Output[j],2); - } - re = sqrt(re)/sqrt(re1); + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); + /* stop if the norm residual is less than the tolerance EPS */ if (re < epsil) count++; - if (count > 4) break; - } - /*printf("%f %i %i \n", re, ll, count); */ + if (count > 3) break; + } } - if (printM == 1) printf("SB-TV iterations stopped at iteration %i \n", ll); - free(Output_prev); free(Dx); free(Dy); free(Dz); free(Bx); free(By); free(Bz); + free(Dz); free(Bz); } - return *Output; + + free(Output_prev); free(Dx); free(Dy); free(Bx); free(By); + /*adding info into info_vector */ + infovector[0] = (float)(ll); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; } /********************************************************************/ @@ -153,7 +151,7 @@ float gauss_seidel2D(float *U, float *A, float *U_prev, float *Dx, float *Dy, fl float sum, normConst; long i,j,i1,i2,j1,j2,index; normConst = 1.0f/(mu + 4.0f*lambda); - + #pragma omp parallel for shared(U) private(index,i,j,i1,i2,j1,j2,sum) for(i=0; i<dimX; i++) { /* symmetric boundary conditions (Neuman) */ @@ -164,7 +162,7 @@ float gauss_seidel2D(float *U, float *A, float *U_prev, float *Dx, float *Dy, fl j1 = j+1; if (j1 == dimY) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; index = j*dimX+i; - + sum = Dx[j*dimX+i2] - Dx[index] + Dy[j2*dimX+i] - Dy[index] - Bx[j*dimX+i2] + Bx[index] - By[j2*dimX+i] + By[index]; sum += U_prev[j*dimX+i1] + U_prev[j*dimX+i2] + U_prev[j1*dimX+i] + U_prev[j2*dimX+i]; sum *= lambda; @@ -186,16 +184,16 @@ float updDxDy_shrinkAniso2D(float *U, float *Dx, float *Dy, float *Bx, float *By i1 = i+1; if (i1 == dimX) i1 = i-1; j1 = j+1; if (j1 == dimY) j1 = j-1; index = j*dimX+i; - + val1 = (U[j*dimX+i1] - U[index]) + Bx[index]; val2 = (U[j1*dimX+i] - U[index]) + By[index]; - + val11 = fabs(val1) - denom_lam; if (val11 < 0) val11 = 0; val22 = fabs(val2) - denom_lam; if (val22 < 0) val22 = 0; - + if (val1 !=0) Dx[index] = (val1/fabs(val1))*val11; else Dx[index] = 0; if (val2 !=0) Dy[index] = (val2/fabs(val2))*val22; else Dy[index] = 0; - + }} return 1; } @@ -204,7 +202,7 @@ float updDxDy_shrinkIso2D(float *U, float *Dx, float *Dy, float *Bx, float *By, long i,j,i1,j1,index; float val1, val11, val2, denom, denom_lam; denom_lam = 1.0f/lambda; - + #pragma omp parallel for shared(U,denom_lam) private(index,i,j,i1,j1,val1,val11,val2,denom) for(i=0; i<dimX; i++) { for(j=0; j<dimY; j++) { @@ -212,14 +210,14 @@ float updDxDy_shrinkIso2D(float *U, float *Dx, float *Dy, float *Bx, float *By, i1 = i+1; if (i1 == dimX) i1 = i-1; j1 = j+1; if (j1 == dimY) j1 = j-1; index = j*dimX+i; - + val1 = (U[j*dimX+i1] - U[index]) + Bx[index]; val2 = (U[j1*dimX+i] - U[index]) + By[index]; - + denom = sqrt(val1*val1 + val2*val2); - + val11 = (denom - denom_lam); if (val11 < 0) val11 = 0.0f; - + if (denom != 0.0f) { Dx[index] = val11*(val1/denom); Dy[index] = val11*(val2/denom); @@ -241,7 +239,7 @@ float updBxBy2D(float *U, float *Dx, float *Dy, float *Bx, float *By, long dimX, i1 = i+1; if (i1 == dimX) i1 = i-1; j1 = j+1; if (j1 == dimY) j1 = j-1; index = j*dimX+i; - + Bx[index] += (U[j*dimX+i1] - U[index]) - Dx[index]; By[index] += (U[j1*dimX+i] - U[index]) - Dy[index]; }} @@ -269,7 +267,7 @@ float gauss_seidel3D(float *U, float *A, float *U_prev, float *Dx, float *Dy, fl k1 = k+1; if (k1 == dimZ) k1 = k-1; k2 = k-1; if (k2 < 0) k2 = k+1; index = (dimX*dimY)*k + j*dimX+i; - + d_val = Dx[(dimX*dimY)*k + j*dimX+i2] - Dx[index] + Dy[(dimX*dimY)*k + j2*dimX+i] - Dy[index] + Dz[(dimX*dimY)*k2 + j*dimX+i] - Dz[index]; b_val = -Bx[(dimX*dimY)*k + j*dimX+i2] + Bx[index] - By[(dimX*dimY)*k + j2*dimX+i] + By[index] - Bz[(dimX*dimY)*k2 + j*dimX+i] + Bz[index]; sum = d_val + b_val; @@ -295,19 +293,19 @@ float updDxDyDz_shrinkAniso3D(float *U, float *Dx, float *Dy, float *Dz, float * i1 = i+1; if (i1 == dimX) i1 = i-1; j1 = j+1; if (j1 == dimY) j1 = j-1; k1 = k+1; if (k1 == dimZ) k1 = k-1; - + val1 = (U[(dimX*dimY)*k + j*dimX+i1] - U[index]) + Bx[index]; val2 = (U[(dimX*dimY)*k + j1*dimX+i] - U[index]) + By[index]; val3 = (U[(dimX*dimY)*k1 + j*dimX+i] - U[index]) + Bz[index]; - + val11 = fabs(val1) - denom_lam; if (val11 < 0.0f) val11 = 0.0f; val22 = fabs(val2) - denom_lam; if (val22 < 0.0f) val22 = 0.0f; val33 = fabs(val3) - denom_lam; if (val33 < 0.0f) val33 = 0.0f; - + if (val1 !=0.0f) Dx[index] = (val1/fabs(val1))*val11; else Dx[index] = 0.0f; if (val2 !=0.0f) Dy[index] = (val2/fabs(val2))*val22; else Dy[index] = 0.0f; if (val3 !=0.0f) Dz[index] = (val3/fabs(val3))*val33; else Dz[index] = 0.0f; - + }}} return 1; } @@ -325,15 +323,15 @@ float updDxDyDz_shrinkIso3D(float *U, float *Dx, float *Dy, float *Dz, float *Bx i1 = i+1; if (i1 == dimX) i1 = i-1; j1 = j+1; if (j1 == dimY) j1 = j-1; k1 = k+1; if (k1 == dimZ) k1 = k-1; - + val1 = (U[(dimX*dimY)*k + j*dimX+i1] - U[index]) + Bx[index]; val2 = (U[(dimX*dimY)*k + j1*dimX+i] - U[index]) + By[index]; val3 = (U[(dimX*dimY)*k1 + j*dimX+i] - U[index]) + Bz[index]; - + denom = sqrt(val1*val1 + val2*val2 + val3*val3); - + val11 = (denom - denom_lam); if (val11 < 0) val11 = 0.0f; - + if (denom != 0.0f) { Dx[index] = val11*(val1/denom); Dy[index] = val11*(val2/denom); @@ -359,7 +357,7 @@ float updBxByBz3D(float *U, float *Dx, float *Dy, float *Dz, float *Bx, float *B i1 = i+1; if (i1 == dimX) i1 = i-1; j1 = j+1; if (j1 == dimY) j1 = j-1; k1 = k+1; if (k1 == dimZ) k1 = k-1; - + Bx[index] += (U[(dimX*dimY)*k + j*dimX+i1] - U[index]) - Dx[index]; By[index] += (U[(dimX*dimY)*k + j1*dimX+i] - U[index]) - Dy[index]; Bz[index] += (U[(dimX*dimY)*k1 + j*dimX+i] - U[index]) - Dz[index]; diff --git a/src/Core/regularisers_CPU/SB_TV_core.h b/src/Core/regularisers_CPU/SB_TV_core.h index 7485e3b..b94da6a 100644 --- a/src/Core/regularisers_CPU/SB_TV_core.h +++ b/src/Core/regularisers_CPU/SB_TV_core.h @@ -34,10 +34,10 @@ limitations under the License. * 3. Number of iterations [OPTIONAL parameter] * 4. eplsilon - tolerance constant [OPTIONAL parameter] * 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter] -* 6. print information: 0 (off) or 1 (on) [OPTIONAL parameter] -* + * Output: -* 1. Filtered/regularized image +* [1] Filtered/regularized image/volume +* [2] Information vector which contains [iteration no., reached tolerance] * * [1]. Goldstein, T. and Osher, S., 2009. The split Bregman method for L1-regularized problems. SIAM journal on imaging sciences, 2(2), pp.323-343. */ @@ -45,7 +45,7 @@ limitations under the License. #ifdef __cplusplus extern "C" { #endif -CCPI_EXPORT float SB_TV_CPU_main(float *Input, float *Output, float mu, int iter, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ); +CCPI_EXPORT float SB_TV_CPU_main(float *Input, float *Output, float *infovector, float mu, int iter, float epsil, int methodTV, int dimX, int dimY, int dimZ); CCPI_EXPORT float gauss_seidel2D(float *U, float *A, float *U_prev, float *Dx, float *Dy, float *Bx, float *By, long dimX, long dimY, float lambda, float mu); CCPI_EXPORT float updDxDy_shrinkAniso2D(float *U, float *Dx, float *Dy, float *Bx, float *By, long dimX, long dimY, float lambda); diff --git a/src/Core/regularisers_CPU/TGV_core.c b/src/Core/regularisers_CPU/TGV_core.c index 136e0bd..f43b56a 100644 --- a/src/Core/regularisers_CPU/TGV_core.c +++ b/src/Core/regularisers_CPU/TGV_core.c @@ -29,131 +29,168 @@ * 4. parameter to control the second-order term (alpha0) * 5. Number of Chambolle-Pock (Primal-Dual) iterations * 6. Lipshitz constant (default is 12) + * 7. eplsilon: tolerance constant * * Output: - * Filtered/regularised image/volume + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * References: * [1] K. Bredies "Total Generalized Variation" * */ -float TGV_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iter, float L2, int dimX, int dimY, int dimZ) +float TGV_main(float *U0, float *U, float *infovector, float lambda, float alpha1, float alpha0, int iter, float L2, float epsil, int dimX, int dimY, int dimZ) { long DimTotal; - int ll; + int ll, j; + float re, re1; + re = 0.0f; re1 = 0.0f; + int count = 0; float *U_old, *P1, *P2, *Q1, *Q2, *Q3, *V1, *V1_old, *V2, *V2_old, tau, sigma; - + DimTotal = (long)(dimX*dimY*dimZ); copyIm(U0, U, (long)(dimX), (long)(dimY), (long)(dimZ)); /* initialize */ tau = pow(L2,-0.5); sigma = pow(L2,-0.5); - + /* dual variables */ P1 = calloc(DimTotal, sizeof(float)); P2 = calloc(DimTotal, sizeof(float)); - + Q1 = calloc(DimTotal, sizeof(float)); Q2 = calloc(DimTotal, sizeof(float)); Q3 = calloc(DimTotal, sizeof(float)); - + U_old = calloc(DimTotal, sizeof(float)); - + V1 = calloc(DimTotal, sizeof(float)); V1_old = calloc(DimTotal, sizeof(float)); V2 = calloc(DimTotal, sizeof(float)); V2_old = calloc(DimTotal, sizeof(float)); - + if (dimZ == 1) { /*2D case*/ - + /* Primal-dual iterations begin here */ for(ll = 0; ll < iter; ll++) { - + /* Calculate Dual Variable P */ DualP_2D(U, V1, V2, P1, P2, (long)(dimX), (long)(dimY), sigma); - + /*Projection onto convex set for P*/ ProjP_2D(P1, P2, (long)(dimX), (long)(dimY), alpha1); - + /* Calculate Dual Variable Q */ DualQ_2D(V1, V2, Q1, Q2, Q3, (long)(dimX), (long)(dimY), sigma); - + /*Projection onto convex set for Q*/ ProjQ_2D(Q1, Q2, Q3, (long)(dimX), (long)(dimY), alpha0); - + /*saving U into U_old*/ copyIm(U, U_old, (long)(dimX), (long)(dimY), 1l); - + /*adjoint operation -> divergence and projection of P*/ DivProjP_2D(U, U0, P1, P2, (long)(dimX), (long)(dimY), lambda, tau); - + /*get updated solution U*/ newU(U, U_old, (long)(dimX), (long)(dimY)); - + /*saving V into V_old*/ copyIm(V1, V1_old, (long)(dimX), (long)(dimY), 1l); copyIm(V2, V2_old, (long)(dimX), (long)(dimY), 1l); - + /* upd V*/ UpdV_2D(V1, V2, P1, P2, Q1, Q2, Q3, (long)(dimX), (long)(dimY), tau); - + /*get new V*/ newU(V1, V1_old, (long)(dimX), (long)(dimY)); newU(V2, V2_old, (long)(dimX), (long)(dimY)); + + /* check early stopping criteria */ + if ((epsil != 0.0f) && (ll % 5 == 0)) { + re = 0.0f; re1 = 0.0f; + for(j=0; j<DimTotal; j++) + { + re += powf(U[j] - U_old[j],2); + re1 += powf(U[j],2); + } + re = sqrtf(re)/sqrtf(re1); + if (re < epsil) count++; + if (count > 3) break; + } } /*end of iterations*/ } else { /*3D case*/ float *P3, *Q4, *Q5, *Q6, *V3, *V3_old; - + P3 = calloc(DimTotal, sizeof(float)); Q4 = calloc(DimTotal, sizeof(float)); Q5 = calloc(DimTotal, sizeof(float)); Q6 = calloc(DimTotal, sizeof(float)); V3 = calloc(DimTotal, sizeof(float)); V3_old = calloc(DimTotal, sizeof(float)); - + /* Primal-dual iterations begin here */ for(ll = 0; ll < iter; ll++) { - + /* Calculate Dual Variable P */ DualP_3D(U, V1, V2, V3, P1, P2, P3, (long)(dimX), (long)(dimY), (long)(dimZ), sigma); - + /*Projection onto convex set for P*/ ProjP_3D(P1, P2, P3, (long)(dimX), (long)(dimY), (long)(dimZ), alpha1); - + /* Calculate Dual Variable Q */ DualQ_3D(V1, V2, V3, Q1, Q2, Q3, Q4, Q5, Q6, (long)(dimX), (long)(dimY), (long)(dimZ), sigma); - + /*Projection onto convex set for Q*/ ProjQ_3D(Q1, Q2, Q3, Q4, Q5, Q6, (long)(dimX), (long)(dimY), (long)(dimZ), alpha0); - + /*saving U into U_old*/ copyIm(U, U_old, (long)(dimX), (long)(dimY), (long)(dimZ)); - + /*adjoint operation -> divergence and projection of P*/ DivProjP_3D(U, U0, P1, P2, P3, (long)(dimX), (long)(dimY), (long)(dimZ), lambda, tau); - + /*get updated solution U*/ newU3D(U, U_old, (long)(dimX), (long)(dimY), (long)(dimZ)); - + /*saving V into V_old*/ copyIm_3Ar(V1, V2, V3, V1_old, V2_old, V3_old, (long)(dimX), (long)(dimY), (long)(dimZ)); - + /* upd V*/ UpdV_3D(V1, V2, V3, P1, P2, P3, Q1, Q2, Q3, Q4, Q5, Q6, (long)(dimX), (long)(dimY), (long)(dimZ), tau); - + /*get new V*/ newU3D_3Ar(V1, V2, V3, V1_old, V2_old, V3_old, (long)(dimX), (long)(dimY), (long)(dimZ)); + + /* check early stopping criteria */ + if ((epsil != 0.0f) && (ll % 5 == 0)) { + re = 0.0f; re1 = 0.0f; + for(j=0; j<DimTotal; j++) + { + re += powf(U[j] - U_old[j],2); + re1 += powf(U[j],2); + } + re = sqrtf(re)/sqrtf(re1); + if (re < epsil) count++; + if (count > 3) break; + } + } /*end of iterations*/ free(P3);free(Q4);free(Q5);free(Q6);free(V3);free(V3_old); } - + /*freeing*/ free(P1);free(P2);free(Q1);free(Q2);free(Q3);free(U_old); free(V1);free(V2);free(V1_old);free(V2_old); - return *U; + + /*adding info into info_vector */ + infovector[0] = (float)(ll); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + + return 0; } /********************************************************************/ @@ -172,7 +209,7 @@ float DualP_2D(float *U, float *V1, float *V2, float *P1, float *P2, long dimX, else P1[index] += sigma*((U[j*dimX+(i+1)] - U[index]) - V1[index]); if (j == dimY-1) P2[index] += sigma*(-V2[index]); else P2[index] += sigma*((U[(j+1)*dimX+i] - U[index]) - V2[index]); - + }} return 1; } @@ -245,15 +282,15 @@ float DivProjP_2D(float *U, float *U0, float *P1, float *P2, long dimX, long dim for(i=0; i<dimX; i++) { for(j=0; j<dimY; j++) { index = j*dimX+i; - + if (i == 0) P_v1 = P1[index]; else if (i == dimX-1) P_v1 = -P1[j*dimX+(i-1)]; else P_v1 = P1[index] - P1[j*dimX+(i-1)]; - + if (j == 0) P_v2 = P2[index]; else if (j == dimY-1) P_v2 = -P2[(j-1)*dimX+i]; else P_v2 = P2[index] - P2[(j-1)*dimX+i]; - + div = P_v1 + P_v2; U[index] = (lambda*(U[index] + tau*div) + tau*U0[index])/(lambda + tau); }} @@ -276,7 +313,7 @@ float UpdV_2D(float *V1, float *V2, float *P1, float *P2, float *Q1, float *Q2, for(i=0; i<dimX; i++) { for(j=0; j<dimY; j++) { index = j*dimX+i; - + /* boundary conditions (Neuman) */ if (i == 0) { q1 = Q1[index]; @@ -287,7 +324,7 @@ float UpdV_2D(float *V1, float *V2, float *P1, float *P2, float *Q1, float *Q2, else { q1 = Q1[index] - Q1[j*dimX+(i-1)]; q3_x = Q3[index] - Q3[j*dimX+(i-1)]; } - + if (j == 0) { q2 = Q2[index]; q3_y = Q3[index]; } @@ -297,8 +334,8 @@ float UpdV_2D(float *V1, float *V2, float *P1, float *P2, float *Q1, float *Q2, else { q2 = Q2[index] - Q2[(j-1)*dimX+i]; q3_y = Q3[index] - Q3[(j-1)*dimX+i]; } - - + + div1 = q1 + q3_y; div2 = q3_x + q2; V1[index] += tau*(P1[index] + div1); @@ -375,7 +412,7 @@ float DualQ_3D(float *V1, float *V2, float *V3, float *Q1, float *Q2, float *Q3, q44 = V1[(dimX*dimY)*(k+1) + j*dimX+i] - V1[index]; q66 = V2[(dimX*dimY)*(k+1) + j*dimX+i] - V2[index]; } - + Q1[index] += sigma*(q1); /*Q11*/ Q2[index] += sigma*(q2); /*Q22*/ Q3[index] += sigma*(q3); /*Q33*/ @@ -417,7 +454,7 @@ float DivProjP_3D(float *U, float *U0, float *P1, float *P2, float *P3, long dim for(j=0; j<dimY; j++) { for(k=0; k<dimZ; k++) { index = (dimX*dimY)*k + j*dimX+i; - + if (i == 0) P_v1 = P1[index]; else if (i == dimX-1) P_v1 = -P1[(dimX*dimY)*k + j*dimX+(i-1)]; else P_v1 = P1[index] - P1[(dimX*dimY)*k + j*dimX+(i-1)]; @@ -427,7 +464,7 @@ float DivProjP_3D(float *U, float *U0, float *P1, float *P2, float *P3, long dim if (k == 0) P_v3 = P3[index]; else if (k == dimZ-1) P_v3 = -P3[(dimX*dimY)*(k-1) + (j)*dimX+i]; else P_v3 = P3[index] - P3[(dimX*dimY)*(k-1) + (j)*dimX+i]; - + div = P_v1 + P_v2 + P_v3; U[index] = (lambda*(U[index] + tau*div) + tau*U0[index])/(lambda + tau); }}} @@ -446,7 +483,7 @@ float UpdV_3D(float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, q1 = 0.0f; q4x= 0.0f; q5x= 0.0f; q2= 0.0f; q4y= 0.0f; q6y= 0.0f; q6z= 0.0f; q5z= 0.0f; q3= 0.0f; /* Q1 - Q11, Q2 - Q22, Q3 - Q33, Q4 - Q21/Q12, Q5 - Q31/Q13, Q6 - Q32/Q23*/ /* symmetric boundary conditions (Neuman) */ - + if (i == 0) { q1 = Q1[index]; q4x = Q4[index]; @@ -483,11 +520,11 @@ float UpdV_3D(float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, q6z = Q6[index] - Q6[(dimX*dimY)*(k-1) + (j)*dimX+i]; q5z = Q5[index] - Q5[(dimX*dimY)*(k-1) + (j)*dimX+i]; q3 = Q3[index] - Q3[(dimX*dimY)*(k-1) + (j)*dimX+i]; } - + div1 = q1 + q4y + q5z; div2 = q4x + q2 + q6z; div3 = q5x + q6y + q3; - + V1[index] += tau*(P1[index] + div1); V2[index] += tau*(P2[index] + div2); V3[index] += tau*(P3[index] + div3); @@ -529,4 +566,3 @@ float newU3D_3Ar(float *V1, float *V2, float *V3, float *V1_old, float *V2_old, } return 1; } - diff --git a/src/Core/regularisers_CPU/TGV_core.h b/src/Core/regularisers_CPU/TGV_core.h index 11b12c1..652d59f 100644 --- a/src/Core/regularisers_CPU/TGV_core.h +++ b/src/Core/regularisers_CPU/TGV_core.h @@ -25,7 +25,7 @@ limitations under the License. #include "utils.h" #include "CCPiDefines.h" -/* C-OMP implementation of Primal-Dual denoising method for +/* C-OMP implementation of Primal-Dual denoising method for * Total Generilized Variation (TGV)-L2 model [1] (2D/3D) * * Input Parameters: @@ -35,20 +35,22 @@ limitations under the License. * 4. parameter to control the second-order term (alpha0) * 5. Number of Chambolle-Pock (Primal-Dual) iterations * 6. Lipshitz constant (default is 12) - * + * 7. eplsilon: tolerance constant + * * Output: - * Filtered/regularised image/volume + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * References: * [1] K. Bredies "Total Generalized Variation" */ - - + + #ifdef __cplusplus extern "C" { #endif -CCPI_EXPORT float TGV_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iter, float L2, int dimX, int dimY, int dimZ); +CCPI_EXPORT float TGV_main(float *U0, float *U, float *infovector, float lambda, float alpha1, float alpha0, int iter, float L2, float epsil, int dimX, int dimY, int dimZ); /* 2D functions */ CCPI_EXPORT float DualP_2D(float *U, float *V1, float *V2, float *P1, float *P2, long dimX, long dimY, float sigma); diff --git a/src/Core/regularisers_GPU/Diffus_4thO_GPU_core.cu b/src/Core/regularisers_GPU/Diffus_4thO_GPU_core.cu index a4dbe70..afd2026 100644 --- a/src/Core/regularisers_GPU/Diffus_4thO_GPU_core.cu +++ b/src/Core/regularisers_GPU/Diffus_4thO_GPU_core.cu @@ -15,23 +15,28 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ +*/ #include "Diffus_4thO_GPU_core.h" #include "shared.h" +#include <thrust/functional.h> +#include <thrust/device_vector.h> +#include <thrust/transform_reduce.h> /* CUDA implementation of fourth-order diffusion scheme [1] for piecewise-smooth recovery (2D/3D case) - * The minimisation is performed using explicit scheme. + * The minimisation is performed using explicit scheme. * * Input Parameters: - * 1. Noisy image/volume + * 1. Noisy image/volume * 2. lambda - regularization parameter * 3. Edge-preserving parameter (sigma) - * 4. Number of iterations, for explicit scheme >= 150 is recommended + * 4. Number of iterations, for explicit scheme >= 150 is recommended * 5. tau - time-marching step for explicit scheme + * 6. eplsilon: tolerance constant * * Output: - * [1] Regularized image/volume + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Hajiaboli, M.R., 2011. An anisotropic fourth-order diffusion filter for image noise removal. International Journal of Computer Vision, 92(2), pp.177-191. @@ -40,7 +45,7 @@ limitations under the License. #define BLKXSIZE 8 #define BLKYSIZE 8 #define BLKZSIZE 8 - + #define BLKXSIZE2D 16 #define BLKYSIZE2D 16 #define EPS 1.0e-7 @@ -52,14 +57,14 @@ __global__ void Weighted_Laplc2D_kernel(float *W_Lapl, float *U0, float sigma, i { int i1,i2,j1,j2; float gradX, gradX_sq, gradY, gradY_sq, gradXX, gradYY, gradXY, xy_2, denom, V_norm, V_orth, c, c_sq; - - int i = blockDim.x * blockIdx.x + threadIdx.x; + + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - + int index = i + dimX*j; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY)) { - + /* boundary conditions (Neumann reflections) */ i1 = i+1; if (i1 == dimX) i1 = i-1; i2 = i-1; if (i2 < 0) i2 = i+1; @@ -68,34 +73,34 @@ __global__ void Weighted_Laplc2D_kernel(float *W_Lapl, float *U0, float sigma, i gradX = 0.5f*(U0[j*dimX+i2] - U0[j*dimX+i1]); gradX_sq = powf(gradX,2); - + gradY = 0.5f*(U0[j2*dimX+i] - U0[j1*dimX+i]); gradY_sq = powf(gradY,2); - + gradXX = U0[j*dimX+i2] + U0[j*dimX+i1] - 2*U0[index]; gradYY = U0[j2*dimX+i] + U0[j1*dimX+i] - 2*U0[index]; - + gradXY = 0.25f*(U0[j2*dimX+i2] + U0[j1*dimX+i1] - U0[j1*dimX+i2] - U0[j2*dimX+i1]); xy_2 = 2.0f*gradX*gradY*gradXY; - + denom = gradX_sq + gradY_sq; - + if (denom <= EPS) { V_norm = (gradXX*gradX_sq + xy_2 + gradYY*gradY_sq)/EPS; - V_orth = (gradXX*gradY_sq - xy_2 + gradYY*gradX_sq)/EPS; + V_orth = (gradXX*gradY_sq - xy_2 + gradYY*gradX_sq)/EPS; } else { V_norm = (gradXX*gradX_sq + xy_2 + gradYY*gradY_sq)/denom; - V_orth = (gradXX*gradY_sq - xy_2 + gradYY*gradX_sq)/denom; + V_orth = (gradXX*gradY_sq - xy_2 + gradYY*gradX_sq)/denom; } c = 1.0f/(1.0f + denom/sigma); c_sq = c*c; - + W_Lapl[index] = c_sq*V_norm + c*V_orth; } return; -} +} __global__ void Diffusion_update_step2D_kernel(float *Output, float *Input, float *W_Lapl, float lambdaPar, float sigmaPar2, float tau, int dimX, int dimY) { @@ -104,24 +109,24 @@ __global__ void Diffusion_update_step2D_kernel(float *Output, float *Input, floa int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - + int index = i + dimX*j; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY)) { - + /* boundary conditions (Neumann reflections) */ i1 = i+1; if (i1 == dimX) i1 = i-1; i2 = i-1; if (i2 < 0) i2 = i+1; j1 = j+1; if (j1 == dimY) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; - + gradXXc = W_Lapl[j*dimX+i2] + W_Lapl[j*dimX+i1] - 2*W_Lapl[index]; gradYYc = W_Lapl[j2*dimX+i] + W_Lapl[j1*dimX+i] - 2*W_Lapl[index]; Output[index] += tau*(-lambdaPar*(gradXXc + gradYYc) - (Output[index] - Input[index])); } return; -} +} /********************************************************************/ /***************************3D Functions*****************************/ /********************************************************************/ @@ -129,13 +134,13 @@ __global__ void Weighted_Laplc3D_kernel(float *W_Lapl, float *U0, float sigma, i { int i1,i2,j1,j2,k1,k2; float gradX, gradX_sq, gradY, gradY_sq, gradXX, gradYY, gradXY, xy_2, denom, V_norm, V_orth, c, c_sq, gradZ, gradZ_sq, gradZZ, gradXZ, gradYZ, xyz_1, xyz_2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - + /* boundary conditions (Neumann reflections) */ i1 = i+1; if (i1 == dimX) i1 = i-1; i2 = i-1; if (i2 < 0) i2 = i+1; @@ -143,32 +148,32 @@ __global__ void Weighted_Laplc3D_kernel(float *W_Lapl, float *U0, float sigma, i j2 = j-1; if (j2 < 0) j2 = j+1; k1 = k+1; if (k1 == dimZ) k1 = k-1; k2 = k-1; if (k2 < 0) k2 = k+1; - + int index = (dimX*dimY)*k + j*dimX+i; - + gradX = 0.5f*(U0[(dimX*dimY)*k + j*dimX+i2] - U0[(dimX*dimY)*k + j*dimX+i1]); gradX_sq = pow(gradX,2); - + gradY = 0.5f*(U0[(dimX*dimY)*k + j2*dimX+i] - U0[(dimX*dimY)*k + j1*dimX+i]); gradY_sq = pow(gradY,2); - + gradZ = 0.5f*(U0[(dimX*dimY)*k2 + j*dimX+i] - U0[(dimX*dimY)*k1 + j*dimX+i]); gradZ_sq = pow(gradZ,2); - + gradXX = U0[(dimX*dimY)*k + j*dimX+i2] + U0[(dimX*dimY)*k + j*dimX+i1] - 2*U0[index]; gradYY = U0[(dimX*dimY)*k + j2*dimX+i] + U0[(dimX*dimY)*k + j1*dimX+i] - 2*U0[index]; gradZZ = U0[(dimX*dimY)*k2 + j*dimX+i] + U0[(dimX*dimY)*k1 + j*dimX+i] - 2*U0[index]; - + gradXY = 0.25f*(U0[(dimX*dimY)*k + j2*dimX+i2] + U0[(dimX*dimY)*k + j1*dimX+i1] - U0[(dimX*dimY)*k + j1*dimX+i2] - U0[(dimX*dimY)*k + j2*dimX+i1]); gradXZ = 0.25f*(U0[(dimX*dimY)*k2 + j*dimX+i2] - U0[(dimX*dimY)*k2+j*dimX+i1] - U0[(dimX*dimY)*k1+j*dimX+i2] + U0[(dimX*dimY)*k1+j*dimX+i1]); gradYZ = 0.25f*(U0[(dimX*dimY)*k2 +j2*dimX+i] - U0[(dimX*dimY)*k2+j1*dimX+i] - U0[(dimX*dimY)*k1+j2*dimX+i] + U0[(dimX*dimY)*k1+j1*dimX+i]); - + xy_2 = 2.0f*gradX*gradY*gradXY; xyz_1 = 2.0f*gradX*gradZ*gradXZ; xyz_2 = 2.0f*gradY*gradZ*gradYZ; - + denom = gradX_sq + gradY_sq + gradZ_sq; - + if (denom <= EPS) { V_norm = (gradXX*gradX_sq + gradYY*gradY_sq + gradZZ*gradZ_sq + xy_2 + xyz_1 + xyz_2)/EPS; V_orth = ((gradY_sq + gradZ_sq)*gradXX + (gradX_sq + gradZ_sq)*gradYY + (gradX_sq + gradY_sq)*gradZZ - xy_2 - xyz_1 - xyz_2)/EPS; @@ -180,7 +185,7 @@ __global__ void Weighted_Laplc3D_kernel(float *W_Lapl, float *U0, float sigma, i c = 1.0f/(1.0f + denom/sigma); c_sq = c*c; - + W_Lapl[index] = c_sq*V_norm + c*V_orth; } return; @@ -193,9 +198,9 @@ __global__ void Diffusion_update_step3D_kernel(float *Output, float *Input, floa int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - + /* boundary conditions (Neumann reflections) */ i1 = i+1; if (i1 == dimX) i1 = i-1; i2 = i-1; if (i2 < 0) i2 = i+1; @@ -203,66 +208,160 @@ __global__ void Diffusion_update_step3D_kernel(float *Output, float *Input, floa j2 = j-1; if (j2 < 0) j2 = j+1; k1 = k+1; if (k1 == dimZ) k1 = k-1; k2 = k-1; if (k2 < 0) k2 = k+1; - + int index = (dimX*dimY)*k + j*dimX+i; - + gradXXc = W_Lapl[(dimX*dimY)*k + j*dimX+i2] + W_Lapl[(dimX*dimY)*k + j*dimX+i1] - 2*W_Lapl[index]; gradYYc = W_Lapl[(dimX*dimY)*k + j2*dimX+i] + W_Lapl[(dimX*dimY)*k + j1*dimX+i] - 2*W_Lapl[index]; gradZZc = W_Lapl[(dimX*dimY)*k2 + j*dimX+i] + W_Lapl[(dimX*dimY)*k1 + j*dimX+i] - 2*W_Lapl[index]; - + Output[index] += tau*(-lambdaPar*(gradXXc + gradYYc + gradZZc) - (Output[index] - Input[index])); } return; } + +__global__ void Diff4thcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + + +__global__ void Diff4thResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + +__global__ void Diff4thcopy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + +__global__ void Diff4thResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ /********************* MAIN HOST FUNCTION ******************/ /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ -extern "C" int Diffus4th_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int N, int M, int Z) +extern "C" int Diffus4th_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, float epsil, int N, int M, int Z) { - int dimTotal, dev = 0; - CHECK(cudaSetDevice(dev)); - float *d_input, *d_output, *d_W_Lapl; + + int deviceCount = -1; // number of devices + cudaGetDeviceCount(&deviceCount); + if (deviceCount == 0) { + fprintf(stderr, "No CUDA devices found\n"); + return -1; + } + + int dimTotal, n, count = 0; + float *d_input, *d_output, *d_W_Lapl, *d_update_prev=NULL, re; + re = 0.0f; float sigmaPar2; sigmaPar2 = sigmaPar*sigmaPar; dimTotal = N*M*Z; - + CHECK(cudaMalloc((void**)&d_input,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&d_output,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&d_W_Lapl,dimTotal*sizeof(float))); - + if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,dimTotal*sizeof(float)) ); + CHECK(cudaMemcpy(d_input,Input,dimTotal*sizeof(float),cudaMemcpyHostToDevice)); CHECK(cudaMemcpy(d_output,Input,dimTotal*sizeof(float),cudaMemcpyHostToDevice)); - - if (Z == 1) { - /*2D case */ - dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); - dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D)); - - for(int n=0; n < iterationsNumb; n++) { - /* Calculating weighted Laplacian */ - Weighted_Laplc2D_kernel<<<dimGrid,dimBlock>>>(d_W_Lapl, d_output, sigmaPar2, N, M); - CHECK(cudaDeviceSynchronize()); - /* Perform iteration step */ - Diffusion_update_step2D_kernel<<<dimGrid,dimBlock>>>(d_output, d_input, d_W_Lapl, lambdaPar, sigmaPar2, tau, N, M); - CHECK(cudaDeviceSynchronize()); - } - } - else { - /*3D case*/ - dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); - dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKZSIZE)); - for(int n=0; n < iterationsNumb; n++) { - /* Calculating weighted Laplacian */ - Weighted_Laplc3D_kernel<<<dimGrid,dimBlock>>>(d_W_Lapl, d_output, sigmaPar2, N, M, Z); - CHECK(cudaDeviceSynchronize()); - /* Perform iteration step */ - Diffusion_update_step3D_kernel<<<dimGrid,dimBlock>>>(d_output, d_input, d_W_Lapl, lambdaPar, sigmaPar2, tau, N, M, Z); - CHECK(cudaDeviceSynchronize()); - } - } + + /*2D case */ + dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); + dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D)); + dim3 dimBlock3(BLKXSIZE,BLKYSIZE,BLKZSIZE); + dim3 dimGrid3(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKZSIZE)); + + + for(n=0; n < iterationsNumb; n++) { + + if ((epsil != 0.0f) && (n % 5 == 0)) { + if (Z == 1) Diff4thcopy_kernel2D<<<dimGrid,dimBlock>>>(d_output, d_update_prev, N, M, dimTotal); + else Diff4thcopy_kernel3D<<<dimGrid3,dimBlock3>>>(d_output, d_update_prev, N, M, Z, dimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + + if (Z == 1) { + /*2D case */ + /* Calculating weighted Laplacian */ + Weighted_Laplc2D_kernel<<<dimGrid,dimBlock>>>(d_W_Lapl, d_output, sigmaPar2, N, M); + CHECK(cudaDeviceSynchronize()); + /* Perform iteration step */ + Diffusion_update_step2D_kernel<<<dimGrid,dimBlock>>>(d_output, d_input, d_W_Lapl, lambdaPar, sigmaPar2, tau, N, M); + CHECK(cudaDeviceSynchronize()); + } + else { + /* Calculating weighted Laplacian */ + Weighted_Laplc3D_kernel<<<dimGrid3,dimBlock3>>>(d_W_Lapl, d_output, sigmaPar2, N, M, Z); + CHECK(cudaDeviceSynchronize()); + /* Perform iteration step */ + Diffusion_update_step3D_kernel<<<dimGrid3,dimBlock3>>>(d_output, d_input, d_W_Lapl, lambdaPar, sigmaPar2, tau, N, M, Z); + CHECK(cudaDeviceSynchronize()); + } + if ((epsil != 0.0f) && (n % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + if (Z == 1) Diff4thResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_output, d_update_prev, d_W_Lapl, N, M, dimTotal); + else Diff4thResidCalc3D_kernel<<<dimGrid3,dimBlock3>>>(d_output, d_update_prev, d_W_Lapl, N, M, Z, dimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors( cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(d_W_Lapl, d_W_Lapl + dimTotal); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_output, d_output + dimTotal); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } + } + CHECK(cudaMemcpy(Output,d_output,dimTotal*sizeof(float),cudaMemcpyDeviceToHost)); CHECK(cudaFree(d_input)); CHECK(cudaFree(d_output)); CHECK(cudaFree(d_W_Lapl)); + if (epsil != 0.0f) cudaFree(d_update_prev); + + /*adding info into info_vector */ + infovector[0] = (float)(n); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ return 0; } diff --git a/src/Core/regularisers_GPU/Diffus_4thO_GPU_core.h b/src/Core/regularisers_GPU/Diffus_4thO_GPU_core.h index 77d5d79..709bb20 100644 --- a/src/Core/regularisers_GPU/Diffus_4thO_GPU_core.h +++ b/src/Core/regularisers_GPU/Diffus_4thO_GPU_core.h @@ -3,6 +3,6 @@ #include "CCPiDefines.h" #include <stdio.h> -extern "C" CCPI_EXPORT int Diffus4th_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int N, int M, int Z); +extern "C" CCPI_EXPORT int Diffus4th_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, float epsil, int N, int M, int Z); -#endif +#endif diff --git a/src/Core/regularisers_GPU/LLT_ROF_GPU_core.cu b/src/Core/regularisers_GPU/LLT_ROF_GPU_core.cu index 87871be..3c578f3 100644 --- a/src/Core/regularisers_GPU/LLT_ROF_GPU_core.cu +++ b/src/Core/regularisers_GPU/LLT_ROF_GPU_core.cu @@ -15,28 +15,33 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ +*/ #include "LLT_ROF_GPU_core.h" #include "shared.h" +#include <thrust/functional.h> +#include <thrust/device_vector.h> +#include <thrust/transform_reduce.h> /* CUDA implementation of Lysaker, Lundervold and Tai (LLT) model [1] combined with Rudin-Osher-Fatemi [2] TV regularisation penalty. - * -* This penalty can deliver visually pleasant piecewise-smooth recovery if regularisation parameters are selected well. -* The rule of thumb for selection is to start with lambdaLLT = 0 (just the ROF-TV model) and then proceed to increase -* lambdaLLT starting with smaller values. + * +* This penalty can deliver visually pleasant piecewise-smooth recovery if regularisation parameters are selected well. +* The rule of thumb for selection is to start with lambdaLLT = 0 (just the ROF-TV model) and then proceed to increase +* lambdaLLT starting with smaller values. * * Input Parameters: * 1. U0 - original noise image/volume * 2. lambdaROF - ROF-related regularisation parameter * 3. lambdaLLT - LLT-related regularisation parameter -* 4. tau - time-marching step -* 5. iter - iterations number (for both models) -* -* Output: -* Filtered/regularised image +* 4. iter - iterations number (for both models) +* 5. tau - time-marching step +* 6. eplsilon: tolerance constant + + * Output: + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * -* References: +* References: * [1] Lysaker, M., Lundervold, A. and Tai, X.C., 2003. Noise removal using fourth-order partial differential equation with applications to medical magnetic resonance images in space and time. IEEE Transactions on image processing, 12(12), pp.1579-1590. * [2] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" */ @@ -44,12 +49,12 @@ limitations under the License. #define BLKXSIZE 8 #define BLKYSIZE 8 #define BLKZSIZE 8 - + #define BLKXSIZE2D 16 #define BLKYSIZE2D 16 -#define EPS_LLT 0.01 +#define EPS_LLT 1.0e-12 #define EPS_ROF 1.0e-12 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) @@ -60,8 +65,8 @@ limitations under the License. __host__ __device__ int signLLT (float x) { return (x > 0) - (x < 0); -} - +} + /*************************************************************************/ /**********************LLT-related functions *****************************/ /*************************************************************************/ @@ -71,11 +76,11 @@ __global__ void der2D_LLT_kernel(float *U, float *D1, float *D2, int dimX, int d float dxx, dyy, denom_xx, denom_yy; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - + int index = i + dimX*j; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY)) { - + /* symmetric boundary conditions (Neuman) */ i_p = i + 1; if (i_p == dimX) i_p = i - 1; i_m = i - 1; if (i_m < 0) i_m = i + 1; @@ -92,18 +97,18 @@ __global__ void der2D_LLT_kernel(float *U, float *D1, float *D2, int dimX, int d D2[index] = dyy / denom_yy; } } - + __global__ void der3D_LLT_kernel(float* U, float *D1, float *D2, float *D3, int dimX, int dimY, int dimZ) { int i_p, i_m, j_m, j_p, k_p, k_m; float dxx, dyy, dzz, denom_xx, denom_yy, denom_zz; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - + /* symmetric boundary conditions (Neuman) */ i_p = i + 1; if (i_p == dimX) i_p = i - 1; i_m = i - 1; if (i_m < 0) i_m = i + 1; @@ -111,17 +116,17 @@ __global__ void der3D_LLT_kernel(float* U, float *D1, float *D2, float *D3, int j_m = j - 1; if (j_m < 0) j_m = j + 1; k_p = k + 1; if (k_p == dimZ) k_p = k - 1; k_m = k - 1; if (k_m < 0) k_m = k + 1; - + int index = (dimX*dimY)*k + j*dimX+i; - + dxx = U[(dimX*dimY)*k + j*dimX+i_p] - 2.0f*U[index] + U[(dimX*dimY)*k + j*dimX+i_m]; dyy = U[(dimX*dimY)*k + j_p*dimX+i] - 2.0f*U[index] + U[(dimX*dimY)*k + j_m*dimX+i]; dzz = U[(dimX*dimY)*k_p + j*dimX+i] - 2.0f*U[index] + U[(dimX*dimY)*k_m + j*dimX+i]; - + denom_xx = abs(dxx) + EPS_LLT; denom_yy = abs(dyy) + EPS_LLT; denom_zz = abs(dzz) + EPS_LLT; - + D1[index] = dxx / denom_xx; D2[index] = dyy / denom_yy; D3[index] = dzz / denom_zz; @@ -139,74 +144,74 @@ __global__ void D1_func2D_ROF_kernel(float* Input, float* D1, int N, int M) float NOMx_1,NOMy_1,NOMy_0,denom1,denom2,T1; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - - int index = i + N*j; - + + int index = i + N*j; + if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) { - + /* boundary conditions (Neumann reflections) */ i1 = i + 1; if (i1 >= N) i1 = i-1; i2 = i - 1; if (i2 < 0) i2 = i+1; j1 = j + 1; if (j1 >= M) j1 = j-1; - + /* Forward-backward differences */ NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */ - NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ + NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ NOMy_0 = Input[index] - Input[j*N + i2]; /* y- */ - + denom1 = NOMx_1*NOMx_1; denom2 = 0.5f*(signLLT((float)NOMy_1) + signLLT((float)NOMy_0))*(MIN(abs((float)NOMy_1),abs((float)NOMy_0))); denom2 = denom2*denom2; T1 = sqrt(denom1 + denom2 + EPS_ROF); D1[index] = NOMx_1/T1; - } + } } - + /* differences 2 */ -__global__ void D2_func2D_ROF_kernel(float* Input, float* D2, int N, int M) +__global__ void D2_func2D_ROF_kernel(float* Input, float* D2, int N, int M) { int i1, j1, j2; float NOMx_1,NOMy_1,NOMx_0,denom1,denom2,T2; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - - int index = i + N*j; - + + int index = i + N*j; + if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) { - + /* boundary conditions (Neumann reflections) */ i1 = i + 1; if (i1 >= N) i1 = i-1; j1 = j + 1; if (j1 >= M) j1 = j-1; - j2 = j - 1; if (j2 < 0) j2 = j+1; - + j2 = j - 1; if (j2 < 0) j2 = j+1; + /* Forward-backward differences */ NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */ NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ NOMx_0 = Input[index] - Input[j2*N + i]; /* x- */ - + denom1 = NOMy_1*NOMy_1; denom2 = 0.5f*(signLLT((float)NOMx_1) + signLLT((float)NOMx_0))*(MIN(abs((float)NOMx_1),abs((float)NOMx_0))); denom2 = denom2*denom2; T2 = sqrt(denom1 + denom2 + EPS_ROF); - D2[index] = NOMy_1/T2; - } + D2[index] = NOMy_1/T2; + } } - + /* differences 1 */ -__global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY, int dimZ) +__global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY, int dimZ) { float NOMx_1, NOMy_1, NOMy_0, NOMz_1, NOMz_0, denom1, denom2,denom3, T1; int i1,i2,k1,j1,j2,k2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (dimX*dimY)*k + j*dimX+i; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - + /* symmetric boundary conditions (Neuman) */ i1 = i + 1; if (i1 >= dimX) i1 = i-1; i2 = i - 1; if (i2 < 0) i2 = i+1; @@ -214,38 +219,38 @@ __global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /* Forward-backward differences */ NOMx_1 = Input[(dimX*dimY)*k + j1*dimX + i] - Input[index]; /* x+ */ - NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */ + NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */ NOMy_0 = Input[index] - Input[(dimX*dimY)*k + j*dimX + i2]; /* y- */ - + NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + j*dimX + i]; /* z- */ - - + + denom1 = NOMx_1*NOMx_1; denom2 = 0.5*(signLLT(NOMy_1) + signLLT(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0))); denom2 = denom2*denom2; denom3 = 0.5*(signLLT(NOMz_1) + signLLT(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0))); denom3 = denom3*denom3; T1 = sqrt(denom1 + denom2 + denom3 + EPS_ROF); - D1[index] = NOMx_1/T1; - } - } + D1[index] = NOMx_1/T1; + } + } /* differences 2 */ - __global__ void D2_func3D_ROF_kernel(float* Input, float* D2, int dimX, int dimY, int dimZ) + __global__ void D2_func3D_ROF_kernel(float* Input, float* D2, int dimX, int dimY, int dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2; int i1,i2,k1,j1,j2,k2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - + + int index = (dimX*dimY)*k + j*dimX+i; + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { /* symmetric boundary conditions (Neuman) */ i1 = i + 1; if (i1 >= dimX) i1 = i-1; @@ -254,16 +259,16 @@ __global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - - + + /* Forward-backward differences */ NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */ NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */ NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + (j)*dimX + i]; /* z- */ - - + + denom1 = NOMy_1*NOMy_1; denom2 = 0.5*(signLLT(NOMx_1) + signLLT(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0))); denom2 = denom2*denom2; @@ -273,19 +278,19 @@ __global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY D2[index] = NOMy_1/T2; } } - + /* differences 3 */ - __global__ void D3_func3D_ROF_kernel(float* Input, float* D3, int dimX, int dimY, int dimZ) + __global__ void D3_func3D_ROF_kernel(float* Input, float* D3, int dimX, int dimY, int dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMy_0, NOMz_1, denom1, denom2, denom3, T3; int i1,i2,k1,j1,j2,k2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - + + int index = (dimX*dimY)*k + j*dimX+i; + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { i1 = i + 1; if (i1 >= dimX) i1 = i-1; @@ -294,14 +299,14 @@ __global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /* Forward-backward differences */ NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */ NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */ NOMy_0 = Input[index] - Input[(dimX*dimY)*k + (j)*dimX + i2]; /* y- */ NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ - + denom1 = NOMz_1*NOMz_1; denom2 = 0.5*(signLLT(NOMx_1) + signLLT(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0))); denom2 = denom2*denom2; @@ -317,17 +322,17 @@ __global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY __global__ void Update2D_LLT_ROF_kernel(float *U0, float *U, float *D1_LLT, float *D2_LLT, float *D1_ROF, float *D2_ROF, float lambdaROF, float lambdaLLT, float tau, int dimX, int dimY) { - + int i_p, i_m, j_m, j_p; float div, laplc, dxx, dyy, dv1, dv2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - + int index = i + dimX*j; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY)) { - + /* symmetric boundary conditions (Neuman) */ i_p = i + 1; if (i_p == dimX) i_p = i - 1; i_m = i - 1; if (i_m < 0) i_m = i + 1; @@ -335,7 +340,7 @@ __global__ void Update2D_LLT_ROF_kernel(float *U0, float *U, float *D1_LLT, floa j_m = j - 1; if (j_m < 0) j_m = j + 1; index = j*dimX+i; - + /*LLT-related part*/ dxx = D1_LLT[j*dimX+i_p] - 2.0f*D1_LLT[index] + D1_LLT[j*dimX+i_m]; dyy = D2_LLT[j_p*dimX+i] - 2.0f*D2_LLT[index] + D2_LLT[j_m*dimX+i]; @@ -344,9 +349,9 @@ __global__ void Update2D_LLT_ROF_kernel(float *U0, float *U, float *D1_LLT, floa dv1 = D1_ROF[index] - D1_ROF[j_m*dimX + i]; dv2 = D2_ROF[index] - D2_ROF[j*dimX + i_m]; div = dv1 + dv2; /*build Divirgent*/ - + /*combine all into one cost function to minimise */ - U[index] += tau*(2.0f*lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); + U[index] += tau*(lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); } } @@ -354,13 +359,13 @@ __global__ void Update3D_LLT_ROF_kernel(float *U0, float *U, float *D1_LLT, floa { int i_p, i_m, j_m, j_p, k_p, k_m; float div, laplc, dxx, dyy, dzz, dv1, dv2, dv3; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - + /* symmetric boundary conditions (Neuman) */ i_p = i + 1; if (i_p == dimX) i_p = i - 1; i_m = i - 1; if (i_m < 0) i_m = i + 1; @@ -368,106 +373,232 @@ __global__ void Update3D_LLT_ROF_kernel(float *U0, float *U, float *D1_LLT, floa j_m = j - 1; if (j_m < 0) j_m = j + 1; k_p = k + 1; if (k_p == dimZ) k_p = k - 1; k_m = k - 1; if (k_m < 0) k_m = k + 1; - + int index = (dimX*dimY)*k + j*dimX+i; - + /*LLT-related part*/ dxx = D1_LLT[(dimX*dimY)*k + j*dimX+i_p] - 2.0f*D1_LLT[index] + D1_LLT[(dimX*dimY)*k + j*dimX+i_m]; dyy = D2_LLT[(dimX*dimY)*k + j_p*dimX+i] - 2.0f*D2_LLT[index] + D2_LLT[(dimX*dimY)*k + j_m*dimX+i]; dzz = D3_LLT[(dimX*dimY)*k_p + j*dimX+i] - 2.0f*D3_LLT[index] + D3_LLT[(dimX*dimY)*k_m + j*dimX+i]; laplc = dxx + dyy + dzz; /*build Laplacian*/ - + /*ROF-related part*/ dv1 = D1_ROF[index] - D1_ROF[(dimX*dimY)*k + j_m*dimX+i]; dv2 = D2_ROF[index] - D2_ROF[(dimX*dimY)*k + j*dimX+i_m]; dv3 = D3_ROF[index] - D3_ROF[(dimX*dimY)*k_m + j*dimX+i]; div = dv1 + dv2 + dv3; /*build Divirgent*/ - + /*combine all into one cost function to minimise */ - U[index] += tau*(2.0f*lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); + U[index] += tau*(lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); } } +__global__ void ROFLLTcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + + +__global__ void ROFLLTResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + +__global__ void ROFLLTcopy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + +__global__ void ROFLLTResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + /*******************************************************************/ /************************ HOST FUNCTION ****************************/ /*******************************************************************/ -extern "C" int LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z) +extern "C" int LLT_ROF_GPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int N, int M, int Z) { - // set up device - int dev = 0; - int DimTotal; - DimTotal = N*M*Z; - CHECK(cudaSetDevice(dev)); + int deviceCount = -1; // number of devices + cudaGetDeviceCount(&deviceCount); + if (deviceCount == 0) { + fprintf(stderr, "No CUDA devices found\n"); + return -1; + } + float re; + re = 0.0f; + int DimTotal,count,n; + count = 0; n = 0; float *d_input, *d_update; - float *D1_LLT=NULL, *D2_LLT=NULL, *D3_LLT=NULL, *D1_ROF=NULL, *D2_ROF=NULL, *D3_ROF=NULL; - + float *D1_LLT=NULL, *D2_LLT=NULL, *D1_ROF=NULL, *D2_ROF=NULL, *d_update_prev=NULL; + if (Z == 0) {Z = 1;} - + DimTotal = N*M*Z; + CHECK(cudaMalloc((void**)&d_input,DimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&d_update,DimTotal*sizeof(float))); - + CHECK(cudaMalloc((void**)&D1_LLT,DimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&D2_LLT,DimTotal*sizeof(float))); - CHECK(cudaMalloc((void**)&D3_LLT,DimTotal*sizeof(float))); - + CHECK(cudaMalloc((void**)&D1_ROF,DimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&D2_ROF,DimTotal*sizeof(float))); - CHECK(cudaMalloc((void**)&D3_ROF,DimTotal*sizeof(float))); - + + CHECK(cudaMemcpy(d_input,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); - CHECK(cudaMemcpy(d_update,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); - + CHECK(cudaMemcpy(d_update,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)) + if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,DimTotal*sizeof(float)) );; + if (Z == 1) { // TV - 2D case dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D)); - - for(int n=0; n < iterationsNumb; n++) { + + for(n=0; n < iterationsNumb; n++) { + + + if ((epsil != 0.0f) && (n % 5 == 0)) { + ROFLLTcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, N, M, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + /****************ROF******************/ - /* calculate first-order differences */ + /* calculate first-order differences */ D1_func2D_ROF_kernel<<<dimGrid,dimBlock>>>(d_update, D1_ROF, N, M); CHECK(cudaDeviceSynchronize()); - D2_func2D_ROF_kernel<<<dimGrid,dimBlock>>>(d_update, D2_ROF, N, M); - CHECK(cudaDeviceSynchronize()); + D2_func2D_ROF_kernel<<<dimGrid,dimBlock>>>(d_update, D2_ROF, N, M); + CHECK(cudaDeviceSynchronize()); /****************LLT******************/ - /* estimate second-order derrivatives */ - der2D_LLT_kernel<<<dimGrid,dimBlock>>>(d_update, D1_LLT, D2_LLT, N, M); - /* Joint update for ROF and LLT models */ - Update2D_LLT_ROF_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, D1_LLT, D2_LLT, D1_ROF, D2_ROF, lambdaROF, lambdaLLT, tau, N, M); + /* estimate second-order derrivatives */ + der2D_LLT_kernel<<<dimGrid,dimBlock>>>(d_update, D1_LLT, D2_LLT, N, M); + /* Joint update for ROF and LLT models */ + Update2D_LLT_ROF_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, D1_LLT, D2_LLT, D1_ROF, D2_ROF, lambdaROF, lambdaLLT, tau, N, M); CHECK(cudaDeviceSynchronize()); + + if ((epsil != 0.0f) && (n % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + ROFLLTResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, D1_ROF, N, M, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors( cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(D1_ROF, D1_ROF + DimTotal); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } } } else { // 3D case dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE)); - - for(int n=0; n < iterationsNumb; n++) { + + float *D3_LLT=NULL, *D3_ROF=NULL; + + CHECK(cudaMalloc((void**)&D3_LLT,DimTotal*sizeof(float))); + CHECK(cudaMalloc((void**)&D3_ROF,DimTotal*sizeof(float))); + + for(n=0; n < iterationsNumb; n++) { + + if ((epsil != 0.0f) && (n % 5 == 0)) { + ROFLLTcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, N, M, Z, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + /****************ROF******************/ - /* calculate first-order differences */ + /* calculate first-order differences */ D1_func3D_ROF_kernel<<<dimGrid,dimBlock>>>(d_update, D1_ROF, N, M, Z); CHECK(cudaDeviceSynchronize()); - D2_func3D_ROF_kernel<<<dimGrid,dimBlock>>>(d_update, D2_ROF, N, M, Z); - CHECK(cudaDeviceSynchronize()); + D2_func3D_ROF_kernel<<<dimGrid,dimBlock>>>(d_update, D2_ROF, N, M, Z); + CHECK(cudaDeviceSynchronize()); D3_func3D_ROF_kernel<<<dimGrid,dimBlock>>>(d_update, D3_ROF, N, M, Z); - CHECK(cudaDeviceSynchronize()); + CHECK(cudaDeviceSynchronize()); /****************LLT******************/ /* estimate second-order derrivatives */ - der3D_LLT_kernel<<<dimGrid,dimBlock>>>(d_update, D1_LLT, D2_LLT, D3_LLT, N, M, Z); - /* Joint update for ROF and LLT models */ - Update3D_LLT_ROF_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, D1_LLT, D2_LLT, D3_LLT, D1_ROF, D2_ROF, D3_ROF, lambdaROF, lambdaLLT, tau, N, M, Z); + der3D_LLT_kernel<<<dimGrid,dimBlock>>>(d_update, D1_LLT, D2_LLT, D3_LLT, N, M, Z); + /* Joint update for ROF and LLT models */ + Update3D_LLT_ROF_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, D1_LLT, D2_LLT, D3_LLT, D1_ROF, D2_ROF, D3_ROF, lambdaROF, lambdaLLT, tau, N, M, Z); CHECK(cudaDeviceSynchronize()); + + + if ((epsil != 0.0f) && (n % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + ROFLLTResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, D1_ROF, N, M, Z, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors( cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(D1_ROF, D1_ROF + DimTotal); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } + } - } + CHECK(cudaFree(D3_LLT)); + CHECK(cudaFree(D3_ROF)); + } /*end of else */ + CHECK(cudaMemcpy(Output,d_update,DimTotal*sizeof(float),cudaMemcpyDeviceToHost)); + CHECK(cudaFree(d_input)); CHECK(cudaFree(d_update)); + if (epsil != 0.0f) cudaFree(d_update_prev); CHECK(cudaFree(D1_LLT)); CHECK(cudaFree(D2_LLT)); - CHECK(cudaFree(D3_LLT)); CHECK(cudaFree(D1_ROF)); CHECK(cudaFree(D2_ROF)); - CHECK(cudaFree(D3_ROF)); + + infovector[0] = (float)(n); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; } diff --git a/src/Core/regularisers_GPU/LLT_ROF_GPU_core.h b/src/Core/regularisers_GPU/LLT_ROF_GPU_core.h index a6bfcc7..2d566d2 100644 --- a/src/Core/regularisers_GPU/LLT_ROF_GPU_core.h +++ b/src/Core/regularisers_GPU/LLT_ROF_GPU_core.h @@ -3,6 +3,6 @@ #include "CCPiDefines.h" #include <stdio.h> -extern "C" CCPI_EXPORT int LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z); +extern "C" CCPI_EXPORT int LLT_ROF_GPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int N, int M, int Z); -#endif +#endif diff --git a/src/Core/regularisers_GPU/NonlDiff_GPU_core.cu b/src/Core/regularisers_GPU/NonlDiff_GPU_core.cu index ff7ce4d..de9abd4 100644 --- a/src/Core/regularisers_GPU/NonlDiff_GPU_core.cu +++ b/src/Core/regularisers_GPU/NonlDiff_GPU_core.cu @@ -15,39 +15,43 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ +*/ #include "NonlDiff_GPU_core.h" #include "shared.h" +#include <thrust/functional.h> +#include <thrust/device_vector.h> +#include <thrust/transform_reduce.h> /* CUDA implementation of linear and nonlinear diffusion with the regularisation model [1,2] (2D/3D case) - * The minimisation is performed using explicit scheme. + * The minimisation is performed using explicit scheme. * * Input Parameters: - * 1. Noisy image/volume + * 1. Noisy image/volume * 2. lambda - regularization parameter * 3. Edge-preserving parameter (sigma), when sigma equals to zero nonlinear diffusion -> linear diffusion - * 4. Number of iterations, for explicit scheme >= 150 is recommended + * 4. Number of iterations, for explicit scheme >= 150 is recommended * 5. tau - time-marching step for explicit scheme * 6. Penalty type: 1 - Huber, 2 - Perona-Malik, 3 - Tukey Biweight - * - * Output: - * [1] Regularized image/volume + * 7. eplsilon: tolerance constant + + * Output: + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Perona, P. and Malik, J., 1990. Scale-space and edge detection using anisotropic diffusion. IEEE Transactions on pattern analysis and machine intelligence, 12(7), pp.629-639. * [2] Black, M.J., Sapiro, G., Marimont, D.H. and Heeger, D., 1998. Robust anisotropic diffusion. IEEE Transactions on image processing, 7(3), pp.421-432. */ - #define BLKXSIZE 8 #define BLKYSIZE 8 #define BLKZSIZE 8 - + #define BLKXSIZE2D 16 #define BLKYSIZE2D 16 #define EPS 1.0e-5 - + #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) @@ -56,8 +60,8 @@ limitations under the License. __host__ __device__ int signNDF (float x) { return (x > 0) - (x < 0); -} - +} + /********************************************************************/ /***************************2D Functions*****************************/ /********************************************************************/ @@ -67,69 +71,69 @@ __global__ void LinearDiff2D_kernel(float *Input, float *Output, float lambdaPar float e,w,n,s,e1,w1,n1,s1; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - + int index = i + N*j; - + if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) { - + /* boundary conditions (Neumann reflections) */ i1 = i+1; if (i1 == N) i1 = i-1; i2 = i-1; if (i2 < 0) i2 = i+1; j1 = j+1; if (j1 == M) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; - + e = Output[j*N+i1]; w = Output[j*N+i2]; n = Output[j1*N+i]; s = Output[j2*N+i]; - + e1 = e - Output[index]; w1 = w - Output[index]; n1 = n - Output[index]; s1 = s - Output[index]; - - Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1) - (Output[index] - Input[index])); + + Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1) - (Output[index] - Input[index])); } - } - + } + __global__ void NonLinearDiff2D_kernel(float *Input, float *Output, float lambdaPar, float sigmaPar, float tau, int penaltytype, int N, int M) { int i1,i2,j1,j2; float e,w,n,s,e1,w1,n1,s1; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - + int index = i + N*j; - + if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) { - + /* boundary conditions (Neumann reflections) */ i1 = i+1; if (i1 == N) i1 = i-1; i2 = i-1; if (i2 < 0) i2 = i+1; j1 = j+1; if (j1 == M) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; - + e = Output[j*N+i1]; w = Output[j*N+i2]; n = Output[j1*N+i]; s = Output[j2*N+i]; - + e1 = e - Output[index]; w1 = w - Output[index]; n1 = n - Output[index]; s1 = s - Output[index]; - + if (penaltytype == 1){ /* Huber penalty */ if (abs(e1) > sigmaPar) e1 = signNDF(e1); else e1 = e1/sigmaPar; - + if (abs(w1) > sigmaPar) w1 = signNDF(w1); else w1 = w1/sigmaPar; - + if (abs(n1) > sigmaPar) n1 = signNDF(n1); else n1 = n1/sigmaPar; - + if (abs(s1) > sigmaPar) s1 = signNDF(s1); else s1 = s1/sigmaPar; } @@ -152,10 +156,10 @@ __global__ void LinearDiff2D_kernel(float *Input, float *Output, float lambdaPar else s1 = 0.0f; } else printf("%s \n", "No penalty function selected! Use 1,2 or 3."); - - Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1) - (Output[index] - Input[index])); + + Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1) - (Output[index] - Input[index])); } - } + } /********************************************************************/ /***************************3D Functions*****************************/ /********************************************************************/ @@ -167,11 +171,11 @@ __global__ void LinearDiff3D_kernel(float *Input, float *Output, float lambdaPar int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (N*M)*k + i + N*j; - + + int index = (N*M)*k + i + N*j; + if ((i >= 0) && (i < N) && (j >= 0) && (j < M) && (k >= 0) && (k < Z)) { - + /* boundary conditions (Neumann reflections) */ i1 = i+1; if (i1 == N) i1 = i-1; i2 = i-1; if (i2 < 0) i2 = i+1; @@ -179,24 +183,24 @@ __global__ void LinearDiff3D_kernel(float *Input, float *Output, float lambdaPar j2 = j-1; if (j2 < 0) j2 = j+1; k1 = k+1; if (k1 == Z) k1 = k-1; k2 = k-1; if (k2 < 0) k2 = k+1; - + e = Output[(N*M)*k + i1 + N*j]; w = Output[(N*M)*k + i2 + N*j]; n = Output[(N*M)*k + i + N*j1]; s = Output[(N*M)*k + i + N*j2]; u = Output[(N*M)*k1 + i + N*j]; d = Output[(N*M)*k2 + i + N*j]; - + e1 = e - Output[index]; w1 = w - Output[index]; n1 = n - Output[index]; s1 = s - Output[index]; u1 = u - Output[index]; d1 = d - Output[index]; - - Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1 + u1 + d1) - (Output[index] - Input[index])); + + Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1 + u1 + d1) - (Output[index] - Input[index])); } - } + } __global__ void NonLinearDiff3D_kernel(float *Input, float *Output, float lambdaPar, float sigmaPar, float tau, int penaltytype, int N, int M, int Z) { @@ -205,11 +209,11 @@ __global__ void NonLinearDiff3D_kernel(float *Input, float *Output, float lambda int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (N*M)*k + i + N*j; - + + int index = (N*M)*k + i + N*j; + if ((i >= 0) && (i < N) && (j >= 0) && (j < M) && (k >= 0) && (k < Z)) { - + /* boundary conditions (Neumann reflections) */ i1 = i+1; if (i1 == N) i1 = i-1; i2 = i-1; if (i2 < 0) i2 = i+1; @@ -217,41 +221,41 @@ __global__ void NonLinearDiff3D_kernel(float *Input, float *Output, float lambda j2 = j-1; if (j2 < 0) j2 = j+1; k1 = k+1; if (k1 == Z) k1 = k-1; k2 = k-1; if (k2 < 0) k2 = k+1; - + e = Output[(N*M)*k + i1 + N*j]; w = Output[(N*M)*k + i2 + N*j]; n = Output[(N*M)*k + i + N*j1]; s = Output[(N*M)*k + i + N*j2]; u = Output[(N*M)*k1 + i + N*j]; d = Output[(N*M)*k2 + i + N*j]; - + e1 = e - Output[index]; w1 = w - Output[index]; n1 = n - Output[index]; s1 = s - Output[index]; u1 = u - Output[index]; d1 = d - Output[index]; - - + + if (penaltytype == 1){ /* Huber penalty */ if (abs(e1) > sigmaPar) e1 = signNDF(e1); else e1 = e1/sigmaPar; - + if (abs(w1) > sigmaPar) w1 = signNDF(w1); else w1 = w1/sigmaPar; - + if (abs(n1) > sigmaPar) n1 = signNDF(n1); else n1 = n1/sigmaPar; - + if (abs(s1) > sigmaPar) s1 = signNDF(s1); else s1 = s1/sigmaPar; - + if (abs(u1) > sigmaPar) u1 = signNDF(u1); else u1 = u1/sigmaPar; - + if (abs(d1) > sigmaPar) d1 = signNDF(d1); - else d1 = d1/sigmaPar; + else d1 = d1/sigmaPar; } else if (penaltytype == 2) { /* Perona-Malik */ @@ -279,34 +283,100 @@ __global__ void NonLinearDiff3D_kernel(float *Input, float *Output, float lambda } else printf("%s \n", "No penalty function selected! Use 1,2 or 3."); - Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1 + u1 + d1) - (Output[index] - Input[index])); + Output[index] += tau*(lambdaPar*(e1 + w1 + n1 + s1 + u1 + d1) - (Output[index] - Input[index])); } - } + } + + __global__ void NDFcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total) + { + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input[index]; + } + } + __global__ void NDFResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total) + { + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } + } + __global__ void NDFcopy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total) + { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input[index]; + } + } + __global__ void NDFResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total) + { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } + } + ///////////////////////////////////////////////// // HOST FUNCTION -extern "C" int NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z) +extern "C" int NonlDiff_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, float epsil, int N, int M, int Z) { - // set up device - int dev = 0; - CHECK(cudaSetDevice(dev)); - float *d_input, *d_output; - float sigmaPar2; + int deviceCount = -1; // number of devices + cudaGetDeviceCount(&deviceCount); + if (deviceCount == 0) { + fprintf(stderr, "No CUDA devices found\n"); + return -1; + } + int n, count, ImSize; + count = 0; + float *d_input, *d_output, *d_update_prev, *d_res; + float sigmaPar2, re = 0.0f; sigmaPar2 = sigmaPar/sqrt(2.0f); - - CHECK(cudaMalloc((void**)&d_input,N*M*Z*sizeof(float))); - CHECK(cudaMalloc((void**)&d_output,N*M*Z*sizeof(float))); - - CHECK(cudaMemcpy(d_input,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice)); - CHECK(cudaMemcpy(d_output,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice)); - + ImSize = N*M*Z; + + + CHECK(cudaMalloc((void**)&d_input,ImSize*sizeof(float))); + CHECK(cudaMalloc((void**)&d_output,ImSize*sizeof(float))); + if (epsil != 0.0f) { + checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_res,ImSize*sizeof(float)) ); + } + + CHECK(cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + CHECK(cudaMemcpy(d_output,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + if (Z == 1) { - /*2D case */ - + /*2D case */ + dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D)); - - for(int n=0; n < iterationsNumb; n++) { + + for(n=0; n < iterationsNumb; n++) { + + if ((epsil != 0.0f) && (n % 5 == 0)) { + NDFcopy_kernel2D<<<dimGrid,dimBlock>>>(d_output, d_update_prev, N, M, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + if (sigmaPar == 0.0f) { /* linear diffusion (heat equation) */ LinearDiff2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_output, lambdaPar, tau, N, M); @@ -317,13 +387,40 @@ extern "C" int NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, f NonLinearDiff2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_output, lambdaPar, sigmaPar2, tau, penaltytype, N, M); CHECK(cudaDeviceSynchronize()); } + + if ((epsil != 0.0f) && (n % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + NDFResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_output, d_update_prev, d_res, N, M, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors( cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(d_res, d_res + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_output, d_output + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } } } else { /*3D case*/ dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKZSIZE)); - for(int n=0; n < iterationsNumb; n++) { + for(n=0; n < iterationsNumb; n++) { + + if ((epsil != 0.0f) && (n % 5 == 0)) { + NDFcopy_kernel3D<<<dimGrid,dimBlock>>>(d_output, d_update_prev, N, M, Z, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + if (sigmaPar == 0.0f) { /* linear diffusion (heat equation) */ LinearDiff3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_output, lambdaPar, tau, N, M, Z); @@ -334,12 +431,38 @@ extern "C" int NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, f NonLinearDiff3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_output, lambdaPar, sigmaPar2, tau, penaltytype, N, M, Z); CHECK(cudaDeviceSynchronize()); } + + if ((epsil != 0.0f) && (n % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + NDFResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_output, d_update_prev, d_res, N, M, Z, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors( cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(d_res, d_res + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_output, d_output + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } } - - } - CHECK(cudaMemcpy(Output,d_output,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost)); + + } + CHECK(cudaMemcpy(Output,d_output,ImSize*sizeof(float),cudaMemcpyDeviceToHost)); CHECK(cudaFree(d_input)); CHECK(cudaFree(d_output)); - //cudaDeviceReset(); + if (epsil != 0.0f) { + CHECK(cudaFree(d_update_prev)); + CHECK(cudaFree(d_res)); + } + + infovector[0] = (float)(n); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ return 0; } diff --git a/src/Core/regularisers_GPU/NonlDiff_GPU_core.h b/src/Core/regularisers_GPU/NonlDiff_GPU_core.h index 5fe457e..48852f8 100644 --- a/src/Core/regularisers_GPU/NonlDiff_GPU_core.h +++ b/src/Core/regularisers_GPU/NonlDiff_GPU_core.h @@ -3,6 +3,6 @@ #include "CCPiDefines.h" #include <stdio.h> -extern "C" CCPI_EXPORT int NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z); +extern "C" CCPI_EXPORT int NonlDiff_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, float epsil, int N, int M, int Z); -#endif +#endif diff --git a/src/Core/regularisers_GPU/TGV_GPU_core.cu b/src/Core/regularisers_GPU/TGV_GPU_core.cu index 849219b..fc462fe 100644 --- a/src/Core/regularisers_GPU/TGV_GPU_core.cu +++ b/src/Core/regularisers_GPU/TGV_GPU_core.cu @@ -15,12 +15,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ +*/ #include "TGV_GPU_core.h" #include "shared.h" +#include <thrust/functional.h> +#include <thrust/device_vector.h> +#include <thrust/transform_reduce.h> -/* CUDA implementation of Primal-Dual denoising method for + +/* CUDA implementation of Primal-Dual denoising method for * Total Generilized Variation (TGV)-L2 model [1] (2D/3D case) * * Input Parameters: @@ -30,15 +34,17 @@ limitations under the License. * 4. parameter to control the second-order term (alpha0) * 5. Number of Chambolle-Pock (Primal-Dual) iterations * 6. Lipshitz constant (default is 12) + * 7. eplsilon: tolerance constant * * Output: - * Filtered/regularised image - * + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + * References: * [1] K. Bredies "Total Generalized Variation" */ - - + + #define BLKXSIZE2D 16 #define BLKYSIZE2D 16 @@ -52,34 +58,34 @@ limitations under the License. /***************************2D Functions*****************************/ /********************************************************************/ __global__ void DualP_2D_kernel(float *U, float *V1, float *V2, float *P1, float *P2, long dimX, long dimY, float sigma) -{ +{ const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - - long index = i + (dimX)*j; - + + long index = i + (dimX)*j; + if ((i < dimX) && (j < dimY)) { - /* symmetric boundary conditions (Neuman) */ - if ((i >= 0) && (i < dimX-1)) P1[index] += sigma*((U[(i+1) + dimX*j] - U[index]) - V1[index]); - else if (i == dimX-1) P1[index] -= sigma*(V1[index]); - else P1[index] = 0.0f; + /* symmetric boundary conditions (Neuman) */ + if ((i >= 0) && (i < dimX-1)) P1[index] += sigma*((U[(i+1) + dimX*j] - U[index]) - V1[index]); + else if (i == dimX-1) P1[index] -= sigma*(V1[index]); + else P1[index] = 0.0f; if ((j >= 0) && (j < dimY-1)) P2[index] += sigma*((U[i + dimX*(j+1)] - U[index]) - V2[index]); - else if (j == dimY-1) P2[index] -= sigma*(V2[index]); - else P2[index] = 0.0f; + else if (j == dimY-1) P2[index] -= sigma*(V2[index]); + else P2[index] = 0.0f; } return; -} +} __global__ void ProjP_2D_kernel(float *P1, float *P2, long dimX, long dimY, float alpha1) { float grad_magn; - + const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - - long index = i + (dimX)*j; - - if ((i < dimX) && (j < dimY)) { + + long index = i + (dimX)*j; + + if ((i < dimX) && (j < dimY)) { grad_magn = sqrtf(powf(P1[index],2) + powf(P2[index],2)); grad_magn = grad_magn/alpha1; if (grad_magn > 1.0f) { @@ -88,20 +94,20 @@ __global__ void ProjP_2D_kernel(float *P1, float *P2, long dimX, long dimY, floa } } return; -} +} __global__ void DualQ_2D_kernel(float *V1, float *V2, float *Q1, float *Q2, float *Q3, long dimX, long dimY, float sigma) { float q1, q2, q11, q22; const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - - long index = i + (dimX)*j; - - if ((i < dimX) && (j < dimY)) { + + long index = i + (dimX)*j; + + if ((i < dimX) && (j < dimY)) { q1 = 0.0f; q2 = 0.0f; q11 = 0.0f; q22 = 0.0f; - - if ((i >= 0) && (i < dimX-1)) { + + if ((i >= 0) && (i < dimX-1)) { /* boundary conditions (Neuman) */ q1 = V1[(i+1) + dimX*j] - V1[index]; q11 = V2[(i+1) + dimX*j] - V2[index]; @@ -110,23 +116,23 @@ __global__ void DualQ_2D_kernel(float *V1, float *V2, float *Q1, float *Q2, floa q2 = V2[i + dimX*(j+1)] - V2[index]; q22 = V1[i + dimX*(j+1)] - V1[index]; } - + Q1[index] += sigma*(q1); Q2[index] += sigma*(q2); Q3[index] += sigma*(0.5f*(q11 + q22)); - } + } return; -} +} __global__ void ProjQ_2D_kernel(float *Q1, float *Q2, float *Q3, long dimX, long dimY, float alpha0) { float grad_magn; const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - - long index = i + (dimX)*j; - - if ((i < dimX) && (j < dimY)) { + + long index = i + (dimX)*j; + + if ((i < dimX) && (j < dimY)) { grad_magn = sqrt(powf(Q1[index],2) + powf(Q2[index],2) + 2*powf(Q3[index],2)); grad_magn = grad_magn/alpha0; if (grad_magn > 1.0f) { @@ -136,18 +142,18 @@ __global__ void ProjQ_2D_kernel(float *Q1, float *Q2, float *Q3, long dimX, long } } return; -} +} __global__ void DivProjP_2D_kernel(float *U, float *U0, float *P1, float *P2, long dimX, long dimY, float lambda, float tau) { float P_v1, P_v2, div; const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - - long index = i + (dimX)*j; + + long index = i + (dimX)*j; if ((i < dimX) && (j < dimY)) { - + if ((i > 0) && (i < dimX-1)) P_v1 = P1[index] - P1[(i-1) + dimX*j]; else if (i == dimX-1) P_v1 = -P1[(i-1) + dimX*j]; else if (i == 0) P_v1 = P1[index]; @@ -158,48 +164,48 @@ __global__ void DivProjP_2D_kernel(float *U, float *U0, float *P1, float *P2, lo else if (j == 0) P_v2 = P2[index]; else P_v2 = 0.0f; - + div = P_v1 + P_v2; U[index] = (lambda*(U[index] + tau*div) + tau*U0[index])/(lambda + tau); } return; -} +} __global__ void UpdV_2D_kernel(float *V1, float *V2, float *P1, float *P2, float *Q1, float *Q2, float *Q3, long dimX, long dimY, float tau) { float q1, q3_x, q2, q3_y, div1, div2; long i1, j1; - + const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - - long index = i + (dimX)*j; - - if ((i < dimX) && (j < dimY)) { + + long index = i + (dimX)*j; + + if ((i < dimX) && (j < dimY)) { q1 = 0.0f; q3_x = 0.0f; q2 = 0.0f; q3_y = 0.0f; div1 = 0.0f; div2= 0.0f; - + i1 = (i-1) + dimX*j; j1 = (i) + dimX*(j-1); - /* boundary conditions (Neuman) */ + /* boundary conditions (Neuman) */ if ((i > 0) && (i < dimX-1)) { q1 = Q1[index] - Q1[i1]; - q3_x = Q3[index] - Q3[i1]; } + q3_x = Q3[index] - Q3[i1]; } else if (i == 0) { q1 = Q1[index]; - q3_x = Q3[index]; } + q3_x = Q3[index]; } else if (i == dimX-1) { q1 = -Q1[i1]; q3_x = -Q3[i1]; } else { q1 = 0.0f; q3_x = 0.0f; - } - + } + if ((j > 0) && (j < dimY-1)) { q2 = Q2[index] - Q2[j1]; - q3_y = Q3[index] - Q3[j1]; } + q3_y = Q3[index] - Q3[j1]; } else if (j == dimY-1) { q2 = -Q2[j1]; q3_y = -Q3[j1]; } @@ -209,23 +215,23 @@ __global__ void UpdV_2D_kernel(float *V1, float *V2, float *P1, float *P2, float else { q2 = 0.0f; q3_y = 0.0f; - } - + } + div1 = q1 + q3_y; div2 = q3_x + q2; V1[index] += tau*(P1[index] + div1); V2[index] += tau*(P2[index] + div2); } return; -} +} __global__ void copyIm_TGV_kernel(float *U, float *U_old, long dimX, long dimY) { const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - - long index = i + (dimX)*j; - + + long index = i + (dimX)*j; + if ((i < dimX) && (j < dimY)) { U_old[index] = U[index]; } @@ -235,9 +241,9 @@ __global__ void copyIm_TGV_kernel_ar2(float *V1, float *V2, float *V1_old, float { const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - - long index = i + (dimX)*j; - + + long index = i + (dimX)*j; + if ((i < dimX) && (j < dimY)) { V1_old[index] = V1[index]; V2_old[index] = V2[index]; @@ -248,9 +254,9 @@ __global__ void newU_kernel(float *U, float *U_old, long dimX, long dimY) { const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - - long index = i + (dimX)*j; - + + long index = i + (dimX)*j; + if ((i < dimX) && (j < dimY)) { U[index] = 2.0f*U[index] - U_old[index]; } @@ -261,12 +267,12 @@ __global__ void newU_kernel_ar2(float *V1, float *V2, float *V1_old, float *V2_o { const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - - long index = i + (dimX)*j; - + + long index = i + (dimX)*j; + if ((i < dimX) && (j < dimY)) { V1[index] = 2.0f*V1[index] - V1_old[index]; - V2[index] = 2.0f*V2[index] - V2_old[index]; + V2[index] = 2.0f*V2[index] - V2_old[index]; } } @@ -274,26 +280,26 @@ __global__ void newU_kernel_ar2(float *V1, float *V2, float *V1_old, float *V2_o /***************************3D Functions*****************************/ /********************************************************************/ __global__ void DualP_3D_kernel(float *U, float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ, float sigma) -{ +{ long index; const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; const long k = blockDim.z * blockIdx.z + threadIdx.z; - + index = (dimX*dimY)*k + i*dimX+j; - - if ((i < dimX) && (j < dimY) && (k < dimZ)) { - /* symmetric boundary conditions (Neuman) */ - if ((i >= 0) && (i < dimX-1)) P1[index] += sigma*((U[(dimX*dimY)*k + (i+1)*dimX+j] - U[index]) - V1[index]); - else if (i == dimX-1) P1[index] -= sigma*(V1[index]); + + if ((i < dimX) && (j < dimY) && (k < dimZ)) { + /* symmetric boundary conditions (Neuman) */ + if ((i >= 0) && (i < dimX-1)) P1[index] += sigma*((U[(dimX*dimY)*k + (i+1)*dimX+j] - U[index]) - V1[index]); + else if (i == dimX-1) P1[index] -= sigma*(V1[index]); else P1[index] = 0.0f; - if ((j >= 0) && (j < dimY-1)) P2[index] += sigma*((U[(dimX*dimY)*k + i*dimX+(j+1)] - U[index]) - V2[index]); - else if (j == dimY-1) P2[index] -= sigma*(V2[index]); - else P2[index] = 0.0f; - if ((k >= 0) && (k < dimZ-1)) P3[index] += sigma*((U[(dimX*dimY)*(k+1) + i*dimX+(j)] - U[index]) - V3[index]); - else if (k == dimZ-1) P3[index] -= sigma*(V3[index]); + if ((j >= 0) && (j < dimY-1)) P2[index] += sigma*((U[(dimX*dimY)*k + i*dimX+(j+1)] - U[index]) - V2[index]); + else if (j == dimY-1) P2[index] -= sigma*(V2[index]); + else P2[index] = 0.0f; + if ((k >= 0) && (k < dimZ-1)) P3[index] += sigma*((U[(dimX*dimY)*(k+1) + i*dimX+(j)] - U[index]) - V3[index]); + else if (k == dimZ-1) P3[index] -= sigma*(V3[index]); else P3[index] = 0.0f; - } + } return; } @@ -304,9 +310,9 @@ __global__ void ProjP_3D_kernel(float *P1, float *P2, float *P3, long dimX, long const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; const long k = blockDim.z * blockIdx.z + threadIdx.z; - - index = (dimX*dimY)*k + i*dimX+j; - if ((i < dimX) && (j < dimY) && (k < dimZ)) { + + index = (dimX*dimY)*k + i*dimX+j; + if ((i < dimX) && (j < dimY) && (k < dimZ)) { grad_magn = (sqrtf(powf(P1[index],2) + powf(P2[index],2) + powf(P3[index],2)))/alpha1; if (grad_magn > 1.0f) { P1[index] /= grad_magn; @@ -322,35 +328,35 @@ __global__ void DualQ_3D_kernel(float *V1, float *V2, float *V3, float *Q1, floa float q1, q2, q3, q11, q22, q33, q44, q55, q66; long index; - + const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; const long k = blockDim.z * blockIdx.z + threadIdx.z; - - index = (dimX*dimY)*k + i*dimX+j; + + index = (dimX*dimY)*k + i*dimX+j; long i1 = (dimX*dimY)*k + (i+1)*dimX+j; long j1 = (dimX*dimY)*k + (i)*dimX+(j+1); long k1 = (dimX*dimY)*(k+1) + (i)*dimX+(j); - - if ((i < dimX) && (j < dimY) && (k < dimZ)) { + + if ((i < dimX) && (j < dimY) && (k < dimZ)) { q1 = 0.0f; q11 = 0.0f; q33 = 0.0f; q2 = 0.0f; q22 = 0.0f; q55 = 0.0f; q3 = 0.0f; q44 = 0.0f; q66 = 0.0f; - + /* boundary conditions (Neuman) */ - if ((i >= 0) && (i < dimX-1)) { - q1 = V1[i1] - V1[index]; + if ((i >= 0) && (i < dimX-1)) { + q1 = V1[i1] - V1[index]; q11 = V2[i1] - V2[index]; q33 = V3[i1] - V3[index]; } if ((j >= 0) && (j < dimY-1)) { - q2 = V2[j1] - V2[index]; + q2 = V2[j1] - V2[index]; q22 = V1[j1] - V1[index]; q55 = V3[j1] - V3[index]; } if ((k >= 0) && (k < dimZ-1)) { q3 = V3[k1] - V3[index]; q44 = V1[k1] - V1[index]; q66 = V2[k1] - V2[index]; } - + Q1[index] += sigma*(q1); /*Q11*/ - Q2[index] += sigma*(q2); /*Q22*/ + Q2[index] += sigma*(q2); /*Q22*/ Q3[index] += sigma*(q3); /*Q33*/ Q4[index] += sigma*(0.5f*(q11 + q22)); /* Q21 / Q12 */ Q5[index] += sigma*(0.5f*(q33 + q44)); /* Q31 / Q13 */ @@ -365,11 +371,11 @@ __global__ void ProjQ_3D_kernel(float *Q1, float *Q2, float *Q3, float *Q4, floa long index; const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - const long k = blockDim.z * blockIdx.z + threadIdx.z; + const long k = blockDim.z * blockIdx.z + threadIdx.z; - index = (dimX*dimY)*k + i*dimX+j; - - if ((i < dimX) && (j < dimY) && (k < dimZ)) { + index = (dimX*dimY)*k + i*dimX+j; + + if ((i < dimX) && (j < dimY) && (k < dimZ)) { grad_magn = sqrtf(powf(Q1[index],2) + powf(Q2[index],2) + powf(Q3[index],2) + 2.0f*powf(Q4[index],2) + 2.0f*powf(Q5[index],2) + 2.0f*powf(Q6[index],2)); grad_magn = grad_magn/alpha0; if (grad_magn > 1.0f) { @@ -382,21 +388,21 @@ __global__ void ProjQ_3D_kernel(float *Q1, float *Q2, float *Q3, float *Q4, floa } } return; -} +} __global__ void DivProjP_3D_kernel(float *U, float *U0, float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ, float lambda, float tau) { float P_v1, P_v2, P_v3, div; long index; const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - const long k = blockDim.z * blockIdx.z + threadIdx.z; - - index = (dimX*dimY)*k + i*dimX+j; + const long k = blockDim.z * blockIdx.z + threadIdx.z; + + index = (dimX*dimY)*k + i*dimX+j; long i1 = (dimX*dimY)*k + (i-1)*dimX+j; long j1 = (dimX*dimY)*k + (i)*dimX+(j-1); - long k1 = (dimX*dimY)*(k-1) + (i)*dimX+(j); - - if ((i < dimX) && (j < dimY) && (k < dimZ)) { + long k1 = (dimX*dimY)*(k-1) + (i)*dimX+(j); + + if ((i < dimX) && (j < dimY) && (k < dimZ)) { if ((i > 0) && (i < dimX-1)) P_v1 = P1[index] - P1[i1]; else if (i == dimX-1) P_v1 = -P1[i1]; @@ -408,13 +414,13 @@ __global__ void DivProjP_3D_kernel(float *U, float *U0, float *P1, float *P2, fl else if (j == 0) P_v2 = P2[index]; else P_v2 = 0.0f; - if ((k > 0) && (k < dimZ-1)) P_v3 = P3[index] - P3[k1]; + if ((k > 0) && (k < dimZ-1)) P_v3 = P3[index] - P3[k1]; else if (k == dimZ-1) P_v3 = -P3[k1]; else if (k == 0) P_v3 = P3[index]; - else P_v3 = 0.0f; - + else P_v3 = 0.0f; + div = P_v1 + P_v2 + P_v3; - U[index] = (lambda*(U[index] + tau*div) + tau*U0[index])/(lambda + tau); + U[index] = (lambda*(U[index] + tau*div) + tau*U0[index])/(lambda + tau); } return; } @@ -425,37 +431,37 @@ __global__ void UpdV_3D_kernel(float *V1, float *V2, float *V3, float *P1, float const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; const long k = blockDim.z * blockIdx.z + threadIdx.z; - - index = (dimX*dimY)*k + i*dimX+j; + + index = (dimX*dimY)*k + i*dimX+j; long i1 = (dimX*dimY)*k + (i-1)*dimX+j; long j1 = (dimX*dimY)*k + (i)*dimX+(j-1); - long k1 = (dimX*dimY)*(k-1) + (i)*dimX+(j); - - /* Q1 - Q11, Q2 - Q22, Q3 - Q33, Q4 - Q21/Q12, Q5 - Q31/Q13, Q6 - Q32/Q23*/ - if ((i < dimX) && (j < dimY) && (k < dimZ)) { + long k1 = (dimX*dimY)*(k-1) + (i)*dimX+(j); - /* boundary conditions (Neuman) */ + /* Q1 - Q11, Q2 - Q22, Q3 - Q33, Q4 - Q21/Q12, Q5 - Q31/Q13, Q6 - Q32/Q23*/ + if ((i < dimX) && (j < dimY) && (k < dimZ)) { + + /* boundary conditions (Neuman) */ if ((i > 0) && (i < dimX-1)) { q1 = Q1[index] - Q1[i1]; - q4x = Q4[index] - Q4[i1]; - q5x = Q5[index] - Q5[i1]; } + q4x = Q4[index] - Q4[i1]; + q5x = Q5[index] - Q5[i1]; } else if (i == 0) { q1 = Q1[index]; - q4x = Q4[index]; - q5x = Q5[index]; } + q4x = Q4[index]; + q5x = Q5[index]; } else if (i == dimX-1) { q1 = -Q1[i1]; - q4x = -Q4[i1]; + q4x = -Q4[i1]; q5x = -Q5[i1]; } else { q1 = 0.0f; q4x = 0.0f; - q5x = 0.0f; } - + q5x = 0.0f; } + if ((j > 0) && (j < dimY-1)) { q2 = Q2[index] - Q2[j1]; q4y = Q4[index] - Q4[j1]; - q6y = Q6[index] - Q6[j1]; } + q6y = Q6[index] - Q6[j1]; } else if (j == dimY-1) { q2 = -Q2[j1]; q4y = -Q4[j1]; @@ -468,12 +474,12 @@ __global__ void UpdV_3D_kernel(float *V1, float *V2, float *V3, float *P1, float q2 = 0.0f; q4y = 0.0f; q6y = 0.0f; - } + } if ((k > 0) && (k < dimZ-1)) { q6z = Q6[index] - Q6[k1]; q5z = Q5[index] - Q5[k1]; - q3 = Q3[index] - Q3[k1]; } + q3 = Q3[index] - Q3[k1]; } else if (k == dimZ-1) { q6z = -Q6[k1]; q5z = -Q5[k1]; @@ -488,27 +494,27 @@ __global__ void UpdV_3D_kernel(float *V1, float *V2, float *V3, float *P1, float q3 = 0.0f; } div1 = q1 + q4y + q5z; - div2 = q4x + q2 + q6z; + div2 = q4x + q2 + q6z; div3 = q5x + q6y + q3; - + V1[index] += tau*(P1[index] + div1); V2[index] += tau*(P2[index] + div2); V3[index] += tau*(P3[index] + div3); } return; -} +} __global__ void copyIm_TGV_kernel3D(float *U, float *U_old, long dimX, long dimY, long dimZ) { long index; const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - const long k = blockDim.z * blockIdx.z + threadIdx.z; - + const long k = blockDim.z * blockIdx.z + threadIdx.z; + index = (dimX*dimY)*k + j*dimX+i; - - if ((i < dimX) && (j < dimY) && (k < dimZ)) { - U_old[index] = U[index]; + + if ((i < dimX) && (j < dimY) && (k < dimZ)) { + U_old[index] = U[index]; } } @@ -517,51 +523,79 @@ __global__ void copyIm_TGV_kernel3D_ar3(float *V1, float *V2, float *V3, float * long index; const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; - const long k = blockDim.z * blockIdx.z + threadIdx.z; - + const long k = blockDim.z * blockIdx.z + threadIdx.z; + index = (dimX*dimY)*k + j*dimX+i; - - if ((i < dimX) && (j < dimY) && (k < dimZ)) { + + if ((i < dimX) && (j < dimY) && (k < dimZ)) { V1_old[index] = V1[index]; V2_old[index] = V2[index]; - V3_old[index] = V3[index]; + V3_old[index] = V3[index]; } } -__global__ void newU_kernel3D(float *U, float *U_old, int dimX, int dimY, int dimZ) +__global__ void newU_kernel3D(float *U, float *U_old, long dimX, long dimY, long dimZ) { long index; const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; const long k = blockDim.z * blockIdx.z + threadIdx.z; - + index = (dimX*dimY)*k + j*dimX+i; - - if ((i < dimX) && (j < dimY) && (k < dimZ)) { + + if ((i < dimX) && (j < dimY) && (k < dimZ)) { U[index] = 2.0f*U[index] - U_old[index]; } -} +} __global__ void newU_kernel3D_ar3(float *V1, float *V2, float *V3, float *V1_old, float *V2_old, float *V3_old, long dimX, long dimY, long dimZ) { - long index; - const long i = blockDim.x * blockIdx.x + threadIdx.x; - const long j = blockDim.y * blockIdx.y + threadIdx.y; - const long k = blockDim.z * blockIdx.z + threadIdx.z; - + long index; + const long i = blockDim.x * blockIdx.x + threadIdx.x; + const long j = blockDim.y * blockIdx.y + threadIdx.y; + const long k = blockDim.z * blockIdx.z + threadIdx.z; + index = (dimX*dimY)*k + j*dimX+i; - - if ((i < dimX) && (j < dimY) && (k < dimZ)) { + + if ((i < dimX) && (j < dimY) && (k < dimZ)) { V1[index] = 2.0f*V1[index] - V1_old[index]; V2[index] = 2.0f*V2[index] - V2_old[index]; V3[index] = 2.0f*V3[index] - V3_old[index]; } -} +} + +__global__ void TGVResidCalc2D_kernel(float *Input1, float *Input2, float* Output, long dimX, long dimY, long num_total) +{ + const long i = blockDim.x * blockIdx.x + threadIdx.x; + const long j = blockDim.y * blockIdx.y + threadIdx.y; + + long index = i + (dimX)*j; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + +__global__ void TGVResidCalc3D_kernel(float *Input1, float *Input2, float* Output, long dimX, long dimY, long dimZ, long num_total) +{ + long index; + const long i = blockDim.x * blockIdx.x + threadIdx.x; + const long j = blockDim.y * blockIdx.y + threadIdx.y; + const long k = blockDim.z * blockIdx.z + threadIdx.z; + + index = (dimX*dimY)*k + j*dimX+i; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + + /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ /************************ MAIN HOST FUNCTION ***********************/ /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ -extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ) +extern "C" int TGV_GPU_main(float *U0, float *U, float *infovector, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, float epsil, int dimX, int dimY, int dimZ) { int deviceCount = -1; // number of devices @@ -569,21 +603,21 @@ extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, flo if (deviceCount == 0) { fprintf(stderr, "No CUDA devices found\n"); return -1; - } - - long dimTotal = (long)(dimX*dimY*dimZ); + } - - float *U_old, *d_U0, *d_U, *P1, *P2, *Q1, *Q2, *Q3, *V1, *V1_old, *V2, *V2_old, tau, sigma; + long dimTotal = (long)(dimX*dimY*dimZ); + float *U_old, *d_U0, *d_U, *P1, *P2, *Q1, *Q2, *Q3, *V1, *V1_old, *V2, *V2_old, tau, sigma, re; + int n, count; + count = 0; re = 0.0f; tau = powf(L2,-0.5f); sigma = tau; - + CHECK(cudaMalloc((void**)&d_U0,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&d_U,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&U_old,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&P1,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&P2,dimTotal*sizeof(float))); - + CHECK(cudaMalloc((void**)&Q1,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&Q2,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&Q3,dimTotal*sizeof(float))); @@ -591,24 +625,24 @@ extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, flo CHECK(cudaMalloc((void**)&V2,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&V1_old,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&V2_old,dimTotal*sizeof(float))); - + CHECK(cudaMemcpy(d_U0,U0,dimTotal*sizeof(float),cudaMemcpyHostToDevice)); - CHECK(cudaMemcpy(d_U,U0,dimTotal*sizeof(float),cudaMemcpyHostToDevice)); + CHECK(cudaMemcpy(d_U,U0,dimTotal*sizeof(float),cudaMemcpyHostToDevice)); cudaMemset(P1, 0, dimTotal*sizeof(float)); cudaMemset(P2, 0, dimTotal*sizeof(float)); cudaMemset(Q1, 0, dimTotal*sizeof(float)); cudaMemset(Q2, 0, dimTotal*sizeof(float)); cudaMemset(Q3, 0, dimTotal*sizeof(float)); cudaMemset(V1, 0, dimTotal*sizeof(float)); - cudaMemset(V2, 0, dimTotal*sizeof(float)); - + cudaMemset(V2, 0, dimTotal*sizeof(float)); + if (dimZ == 1) { /*2D case */ dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D)); - - for(int n=0; n < iterationsNumb; n++) { - + + for(n=0; n < iterationsNumb; n++) { + /* Calculate Dual Variable P */ DualP_2D_kernel<<<dimGrid,dimBlock>>>(d_U, V1, V2, P1, P2, (long)(dimX), (long)(dimY), sigma); checkCudaErrors( cudaDeviceSynchronize() ); @@ -616,7 +650,7 @@ extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, flo /*Projection onto convex set for P*/ ProjP_2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, (long)(dimX), (long)(dimY), alpha1); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); + checkCudaErrors(cudaPeekAtLastError() ); /* Calculate Dual Variable Q */ DualQ_2D_kernel<<<dimGrid,dimBlock>>>(V1, V2, Q1, Q2, Q3, (long)(dimX), (long)(dimY), sigma); checkCudaErrors( cudaDeviceSynchronize() ); @@ -649,30 +683,50 @@ extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, flo newU_kernel_ar2<<<dimGrid,dimBlock>>>(V1, V2, V1_old, V2_old, (long)(dimX), (long)(dimY)); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); + + if ((epsil != 0.0f) && (n % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + TGVResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_U, U_old, V1_old, (long)(dimX), (long)(dimY), dimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(V1_old, V1_old + dimTotal); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_U, d_U + dimTotal); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } } } else { /*3D case */ dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKXSIZE)); - + float *P3, *Q4, *Q5, *Q6, *V3, *V3_old; - - CHECK(cudaMalloc((void**)&P3,dimTotal*sizeof(float))); + + CHECK(cudaMalloc((void**)&P3,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&Q4,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&Q5,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&Q6,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&V3,dimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&V3_old,dimTotal*sizeof(float))); - + cudaMemset(Q4, 0.0f, dimTotal*sizeof(float)); cudaMemset(Q5, 0.0f, dimTotal*sizeof(float)); cudaMemset(Q6, 0.0f, dimTotal*sizeof(float)); cudaMemset(P3, 0.0f, dimTotal*sizeof(float)); - cudaMemset(V3, 0.0f, dimTotal*sizeof(float)); - - for(int n=0; n < iterationsNumb; n++) { - + cudaMemset(V3, 0.0f, dimTotal*sizeof(float)); + + for(n=0; n < iterationsNumb; n++) { + /* Calculate Dual Variable P */ DualP_3D_kernel<<<dimGrid,dimBlock>>>(d_U, V1, V2, V3, P1, P2, P3, (long)(dimX), (long)(dimY), (long)(dimZ), sigma); checkCudaErrors( cudaDeviceSynchronize() ); @@ -702,7 +756,7 @@ extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, flo checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); /*saving V into V_old*/ - copyIm_TGV_kernel3D_ar3<<<dimGrid,dimBlock>>>(V1, V2, V3, V1_old, V2_old, V3_old, (long)(dimX), (long)(dimY), (long)(dimZ)); + copyIm_TGV_kernel3D_ar3<<<dimGrid,dimBlock>>>(V1, V2, V3, V1_old, V2_old, V3_old, (long)(dimX), (long)(dimY), (long)(dimZ)); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); /* upd V*/ @@ -713,23 +767,43 @@ extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, flo newU_kernel3D_ar3<<<dimGrid,dimBlock>>>(V1, V2, V3, V1_old, V2_old, V3_old, (long)(dimX), (long)(dimY), (long)(dimZ)); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - } - + + if ((epsil != 0.0f) && (n % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + TGVResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_U, U_old, V1_old, (long)(dimX), (long)(dimY), (long)(dimZ), dimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(V1_old, V1_old + dimTotal); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_U, d_U + dimTotal); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } + } + CHECK(cudaFree(Q4)); CHECK(cudaFree(Q5)); CHECK(cudaFree(Q6)); CHECK(cudaFree(P3)); CHECK(cudaFree(V3)); - CHECK(cudaFree(V3_old)); + CHECK(cudaFree(V3_old)); } - + CHECK(cudaMemcpy(U,d_U,dimTotal*sizeof(float),cudaMemcpyDeviceToHost)); CHECK(cudaFree(d_U0)); CHECK(cudaFree(d_U)); CHECK(cudaFree(U_old)); CHECK(cudaFree(P1)); CHECK(cudaFree(P2)); - + CHECK(cudaFree(Q1)); CHECK(cudaFree(Q2)); CHECK(cudaFree(Q3)); @@ -738,6 +812,10 @@ extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, flo CHECK(cudaFree(V1_old)); CHECK(cudaFree(V2_old)); - cudaDeviceReset(); + //cudaDeviceReset(); + /*adding info into info_vector */ + infovector[0] = (float)(n); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; } diff --git a/src/Core/regularisers_GPU/TGV_GPU_core.h b/src/Core/regularisers_GPU/TGV_GPU_core.h index e8f9c6e..3f820dd 100644 --- a/src/Core/regularisers_GPU/TGV_GPU_core.h +++ b/src/Core/regularisers_GPU/TGV_GPU_core.h @@ -5,6 +5,6 @@ #include <memory.h> #include <stdio.h> -extern "C" CCPI_EXPORT int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ); +extern "C" CCPI_EXPORT int TGV_GPU_main(float *U0, float *U, float *infovector, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, float epsil, int dimX, int dimY, int dimZ); -#endif +#endif diff --git a/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu b/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu index b371c5d..ce2548f 100755 --- a/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu +++ b/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu @@ -15,26 +15,27 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ +*/ #include "TV_FGP_GPU_core.h" #include "shared.h" +#include <thrust/functional.h> #include <thrust/device_vector.h> #include <thrust/transform_reduce.h> /* CUDA implementation of FGP-TV [1] denoising/regularization model (2D/3D case) * * Input Parameters: - * 1. Noisy image/volume - * 2. lambdaPar - regularization parameter + * 1. Noisy image/volume + * 2. lambdaPar - regularization parameter * 3. Number of iterations - * 4. eplsilon: tolerance constant + * 4. eplsilon: tolerance constant * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) - * 6. nonneg: 'nonnegativity (0 is OFF by default) - * 7. print information: 0 (off) or 1 (on) + * 6. nonneg: 'nonnegativity (0 is OFF by default) * * Output: - * [1] Filtered/regularized image + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the Matlab's code and paper by * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" @@ -49,23 +50,23 @@ limitations under the License. #define BLKZSIZE 8 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) -struct square { __host__ __device__ float operator()(float x) { return x * x; } }; +// struct square { __host__ __device__ float operator()(float x) { return x * x; } }; /************************************************/ /*****************2D modules*********************/ /************************************************/ __global__ void Obj_func2D_kernel(float *Ad, float *D, float *R1, float *R2, int N, int M, int ImSize, float lambda) { - + float val1,val2; - + //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - - int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { if (xIndex <= 0) {val1 = 0.0f;} else {val1 = R1[(xIndex-1) + N*yIndex];} if (yIndex <= 0) {val2 = 0.0f;} else {val2 = R2[xIndex + N*(yIndex-1)];} //Write final result to global memory @@ -76,21 +77,21 @@ __global__ void Obj_func2D_kernel(float *Ad, float *D, float *R1, float *R2, int __global__ void Grad_func2D_kernel(float *P1, float *P2, float *D, float *R1, float *R2, int N, int M, int ImSize, float multip) { - + float val1,val2; - + //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - + int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { - + + if ((xIndex < N) && (yIndex < M)) { + /* boundary conditions */ if (xIndex >= N-1) val1 = 0.0f; else val1 = D[index] - D[(xIndex+1) + N*yIndex]; if (yIndex >= M-1) val2 = 0.0f; else val2 = D[index] - D[(xIndex) + N*(yIndex + 1)]; - + //Write final result to global memory P1[index] = R1[index] + multip*val1; P2[index] = R2[index] + multip*val2; @@ -100,16 +101,16 @@ __global__ void Grad_func2D_kernel(float *P1, float *P2, float *D, float *R1, fl __global__ void Proj_func2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize) { - - float denom; + + float denom; //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - + int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { - denom = pow(P1[index],2) + pow(P2[index],2); + + if ((xIndex < N) && (yIndex < M)) { + denom = pow(P1[index],2) + pow(P2[index],2); if (denom > 1.0f) { P1[index] = P1[index]/sqrt(denom); P2[index] = P2[index]/sqrt(denom); @@ -119,15 +120,15 @@ __global__ void Proj_func2D_iso_kernel(float *P1, float *P2, int N, int M, int I } __global__ void Proj_func2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize) { - - float val1, val2; + + float val1, val2; //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - + int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { + + if ((xIndex < N) && (yIndex < M)) { val1 = abs(P1[index]); val2 = abs(P2[index]); if (val1 < 1.0f) {val1 = 1.0f;} @@ -142,10 +143,10 @@ __global__ void Rupd_func2D_kernel(float *P1, float *P1_old, float *P2, float *P //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - + int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { + + if ((xIndex < N) && (yIndex < M)) { R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]); R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]); } @@ -155,9 +156,9 @@ __global__ void nonneg2D_kernel(float* Output, int N, int M, int num_total) { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { if (Output[index] < 0.0f) Output[index] = 0.0f; } @@ -167,17 +168,17 @@ __global__ void nonneg2D_kernel(float* Output, int N, int M, int num_total) /************************************************/ __global__ void Obj_func3D_kernel(float *Ad, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float lambda) { - + float val1,val2,val3; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - - if ((i < N) && (j < M) && (k < Z)) { + + if ((i < N) && (j < M) && (k < Z)) { if (i <= 0) {val1 = 0.0f;} else {val1 = R1[(N*M)*(k) + (i-1) + N*j];} if (j <= 0) {val2 = 0.0f;} else {val2 = R2[(N*M)*(k) + i + N*(j-1)];} if (k <= 0) {val3 = 0.0f;} else {val3 = R3[(N*M)*(k-1) + i + N*j];} @@ -189,22 +190,22 @@ __global__ void Obj_func3D_kernel(float *Ad, float *D, float *R1, float *R2, flo __global__ void Grad_func3D_kernel(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float multip) { - + float val1,val2,val3; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - - if ((i < N) && (j < M) && (k < Z)) { + + if ((i < N) && (j < M) && (k < Z)) { /* boundary conditions */ if (i >= N-1) val1 = 0.0f; else val1 = D[index] - D[(N*M)*(k) + (i+1) + N*j]; if (j >= M-1) val2 = 0.0f; else val2 = D[index] - D[(N*M)*(k) + i + N*(j+1)]; if (k >= Z-1) val3 = 0.0f; else val3 = D[index] - D[(N*M)*(k+1) + i + N*j]; - + //Write final result to global memory P1[index] = R1[index] + multip*val1; P2[index] = R2[index] + multip*val2; @@ -215,18 +216,18 @@ __global__ void Grad_func3D_kernel(float *P1, float *P2, float *P3, float *D, fl __global__ void Proj_func3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) { - - float denom,sq_denom; + + float denom,sq_denom; //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2); - + if (denom > 1.0f) { sq_denom = 1.0f/sqrt(denom); P1[index] = P1[index]*sq_denom; @@ -239,15 +240,15 @@ __global__ void Proj_func3D_iso_kernel(float *P1, float *P2, float *P3, int N, i __global__ void Proj_func3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) { - - float val1, val2, val3; + + float val1, val2, val3; //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { val1 = abs(P1[index]); val2 = abs(P2[index]); @@ -267,10 +268,10 @@ __global__ void Rupd_func3D_kernel(float *P1, float *P1_old, float *P2, float *P int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - - if ((i < N) && (j < M) && (k < Z)) { + + if ((i < N) && (j < M) && (k < Z)) { R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]); R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]); R3[index] = P3[index] + multip2*(P3[index] - P3_old[index]); @@ -283,9 +284,9 @@ __global__ void nonneg3D_kernel(float* Output, int N, int M, int Z, int num_tota int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if (index < num_total) { if (Output[index] < 0.0f) Output[index] = 0.0f; } @@ -294,9 +295,9 @@ __global__ void FGPcopy_kernel2D(float *Input, float* Output, int N, int M, int { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { Output[index] = Input[index]; } @@ -307,9 +308,9 @@ __global__ void FGPcopy_kernel3D(float *Input, float* Output, int N, int M, int int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if (index < num_total) { Output[index] = Input[index]; } @@ -319,9 +320,9 @@ __global__ void FGPResidCalc2D_kernel(float *Input1, float *Input2, float* Outpu { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { Output[index] = Input1[index] - Input2[index]; } @@ -332,9 +333,9 @@ __global__ void FGPResidCalc3D_kernel(float *Input1, float *Input2, float* Outpu int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if (index < num_total) { Output[index] = Input1[index] - Input2[index]; } @@ -343,7 +344,7 @@ __global__ void FGPResidCalc3D_kernel(float *Input1, float *Input2, float* Outpu /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ ////////////MAIN HOST FUNCTION /////////////// -extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ) +extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ) { int deviceCount = -1; // number of devices cudaGetDeviceCount(&deviceCount); @@ -351,31 +352,32 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int fprintf(stderr, "No CUDA devices found\n"); return -1; } - + int count = 0, i; - float re, multip,multip2; - float tk = 1.0f; + float re, multip,multip2; + re = 0.0f; + float tk = 1.0f; float tkp1=1.0f; - + if (dimZ <= 1) { /*2D verson*/ - int ImSize = dimX*dimY; - float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL; - - dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); - dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D)); - + int ImSize = dimX*dimY; + float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL; + + dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); + dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D)); + /*allocate space for images on device*/ - checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); - if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); + if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) ); - + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); cudaMemset(P1, 0, ImSize*sizeof(float)); cudaMemset(P2, 0, ImSize*sizeof(float)); @@ -384,75 +386,80 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int cudaMemset(R1, 0, ImSize*sizeof(float)); cudaMemset(R2, 0, ImSize*sizeof(float)); - /********************** Run CUDA 2D kernel here ********************/ + /********************** Run CUDA 2D kernel here ********************/ multip = (1.0f/(8.0f*lambdaPar)); - + /* The main kernel */ for (i = 0; i < iter; i++) { - + + if ((epsil != 0.0f) && (i % 5 == 0)) { + FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + /* computing the gradient of the objective function */ Obj_func2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambdaPar); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + if (nonneg != 0) { nonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); } - + /*Taking a step towards minus of the gradient*/ Grad_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, d_update, R1, R2, dimX, dimY, ImSize, multip); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + /* projection step */ if (methodTV == 0) Proj_func2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/ - else Proj_func2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/ + else Proj_func2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/ checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; multip2 = ((tk-1.0f)/tkp1); - + Rupd_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - - if (epsil != 0.0f) { - /* calculate norm - stopping rules using the Thrust library */ - FGPResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize); - float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>())); - thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); - float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>())); - - re = (reduction/reduction2); - if (re < epsil) count++; - if (count > 4) break; - - FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - } - + FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + tk = tkp1; + + if ((epsil != 0.0f) && (i % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + FGPResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(P1, P1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } + } - if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i); - /***************************************************************/ //copy result matrix from device to host memory cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); - + cudaFree(d_input); cudaFree(d_update); if (epsil != 0.0f) cudaFree(d_update_prev); @@ -465,15 +472,16 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int } else { /*3D verson*/ - int ImSize = dimX*dimY*dimZ; - float *d_input, *d_update=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL; - + int ImSize = dimX*dimY*dimZ; + float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL; + dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE)); - + + if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) ); /*allocate space for images on device*/ checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) ); @@ -483,7 +491,7 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&R3,ImSize*sizeof(float)) ); - + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); cudaMemset(P1, 0, ImSize*sizeof(float)); cudaMemset(P2, 0, ImSize*sizeof(float)); @@ -494,61 +502,87 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int cudaMemset(R1, 0, ImSize*sizeof(float)); cudaMemset(R2, 0, ImSize*sizeof(float)); cudaMemset(R3, 0, ImSize*sizeof(float)); - /********************** Run CUDA 3D kernel here ********************/ + /********************** Run CUDA 3D kernel here ********************/ multip = (1.0f/(26.0f*lambdaPar)); - + /* The main kernel */ for (i = 0; i < iter; i++) { - + + if ((epsil != 0.0f) && (i % 5 == 0)) { + FGPcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + /* computing the gradient of the objective function */ Obj_func3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambdaPar); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + if (nonneg != 0) { nonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); } - + /*Taking a step towards minus of the gradient*/ Grad_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, multip); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + /* projection step */ if (methodTV == 0) Proj_func3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* isotropic kernel */ else Proj_func3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* anisotropic kernel */ checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; multip2 = ((tk-1.0f)/tkp1); - + Rupd_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, multip2, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + FGPcopy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + FGPcopy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + FGPcopy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + tk = tkp1; + + if ((epsil != 0.0f) && (i % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + FGPResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(P1, P1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } } - if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i); - /***************************************************************/ + /***************************************************************/ //copy result matrix from device to host memory cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); - + if (epsil != 0.0f) cudaFree(d_update_prev); + cudaFree(d_input); - cudaFree(d_update); + cudaFree(d_update); cudaFree(P1); cudaFree(P2); cudaFree(P3); @@ -556,9 +590,12 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int cudaFree(P2_prev); cudaFree(P3_prev); cudaFree(R1); - cudaFree(R2); - cudaFree(R3); - } + cudaFree(R2); + cudaFree(R3); + } //cudaDeviceReset(); + /*adding info into info_vector */ + infovector[0] = (float)(i); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ return 0; } diff --git a/src/Core/regularisers_GPU/TV_FGP_GPU_core.h b/src/Core/regularisers_GPU/TV_FGP_GPU_core.h index bf13508..597b6c6 100755 --- a/src/Core/regularisers_GPU/TV_FGP_GPU_core.h +++ b/src/Core/regularisers_GPU/TV_FGP_GPU_core.h @@ -4,6 +4,6 @@ #include "CCPiDefines.h" #include <memory.h> -extern "C" CCPI_EXPORT int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); +extern "C" CCPI_EXPORT int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ); #endif diff --git a/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu b/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu index 76f5be9..193cf53 100755 --- a/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu +++ b/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu @@ -3,8 +3,8 @@ This work is part of the Core Imaging Library developed by Visual Analytics and Imaging System Group of the Science Technology Facilities Council, STFC -Copyright 2017 Daniil Kazantsev -Copyright 2017 Srikanth Nagella, Edoardo Pasca +Copyright 2019 Daniil Kazantsev +Copyright 2019 Srikanth Nagella, Edoardo Pasca Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,9 +15,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ +*/ #include "TV_ROF_GPU_core.h" +#include "shared.h" +#include <thrust/functional.h> +#include <thrust/device_vector.h> +#include <thrust/transform_reduce.h> /* C-OMP implementation of ROF-TV denoising/regularization model [1] (2D/3D case) * @@ -26,25 +30,25 @@ limitations under the License. * 2. lambda - regularization parameter [REQUIRED] * 3. tau - marching step for explicit scheme, ~0.1 is recommended [REQUIRED] * 4. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED] -* -* Output: -* [1] Regularized image/volume +* 5. eplsilon: tolerance constant + + * Output: + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + * This function is based on the paper by * [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" -* -* D. Kazantsev, 2016-18 */ -#include "shared.h" - + #define BLKXSIZE 8 #define BLKYSIZE 8 #define BLKZSIZE 8 - + #define BLKXSIZE2D 16 #define BLKYSIZE2D 16 -#define EPS 1.0e-12 - +#define EPS 1.0e-8 + #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) @@ -53,148 +57,148 @@ limitations under the License. __host__ __device__ int sign (float x) { return (x > 0) - (x < 0); -} - -/*********************2D case****************************/ - +} + +/*********************2D case****************************/ + /* differences 1 */ - __global__ void D1_func2D(float* Input, float* D1, int N, int M) + __global__ void D1_func2D(float* Input, float* D1, int N, int M) { int i1, j1, i2; float NOMx_1,NOMy_1,NOMy_0,denom1,denom2,T1; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - - int index = i + N*j; - + + int index = i + N*j; + if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) { - + /* boundary conditions (Neumann reflections) */ i1 = i + 1; if (i1 >= N) i1 = i-1; i2 = i - 1; if (i2 < 0) i2 = i+1; j1 = j + 1; if (j1 >= M) j1 = j-1; - + /* Forward-backward differences */ NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */ - NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ + NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ NOMy_0 = Input[index] - Input[j*N + i2]; /* y- */ - + denom1 = NOMx_1*NOMx_1; denom2 = 0.5f*(sign((float)NOMy_1) + sign((float)NOMy_0))*(MIN(abs((float)NOMy_1), abs((float)NOMy_0))); denom2 = denom2*denom2; T1 = sqrt(denom1 + denom2 + EPS); D1[index] = NOMx_1/T1; - } - } - + } + } + /* differences 2 */ - __global__ void D2_func2D(float* Input, float* D2, int N, int M) + __global__ void D2_func2D(float* Input, float* D2, int N, int M) { int i1, j1, j2; float NOMx_1,NOMy_1,NOMx_0,denom1,denom2,T2; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - + int index = i + N*j; - + if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) { - + /* boundary conditions (Neumann reflections) */ i1 = i + 1; if (i1 >= N) i1 = i-1; j1 = j + 1; if (j1 >= M) j1 = j-1; - j2 = j - 1; if (j2 < 0) j2 = j+1; - + j2 = j - 1; if (j2 < 0) j2 = j+1; + /* Forward-backward differences */ NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */ NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ NOMx_0 = Input[index] - Input[j2*N + i]; /* x- */ - + denom1 = NOMy_1*NOMy_1; denom2 = 0.5f*(sign((float)NOMx_1) + sign((float)NOMx_0))*(MIN(abs((float)NOMx_1), abs((float)NOMx_0))); denom2 = denom2*denom2; T2 = sqrt(denom1 + denom2 + EPS); D2[index] = NOMy_1/T2; - } + } } - - __global__ void TV_kernel2D(float *D1, float *D2, float *Update, float *Input, float lambda, float tau, int N, int M) + + __global__ void TV_kernel2D(float *D1, float *D2, float *Update, float *Input, float lambda, float tau, int N, int M) { int i2, j2; float dv1,dv2; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - - int index = i + N*j; - + + int index = i + N*j; + if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) { - + /* boundary conditions (Neumann reflections) */ i2 = i - 1; if (i2 < 0) i2 = i+1; - j2 = j - 1; if (j2 < 0) j2 = j+1; - + j2 = j - 1; if (j2 < 0) j2 = j+1; + /* divergence components */ dv1 = D1[index] - D1[j2*N + i]; dv2 = D2[index] - D2[j*N + i2]; - - Update[index] += tau*(2.0f*lambda*(dv1 + dv2) - (Update[index] - Input[index])); - - } - } -/*********************3D case****************************/ - + + Update[index] += tau*(lambda*(dv1 + dv2) - (Update[index] - Input[index])); + + } + } +/*********************3D case****************************/ + /* differences 1 */ - __global__ void D1_func3D(float* Input, float* D1, int dimX, int dimY, int dimZ) + __global__ void D1_func3D(float* Input, float* D1, int dimX, int dimY, int dimZ) { float NOMx_1, NOMy_1, NOMy_0, NOMz_1, NOMz_0, denom1, denom2,denom3, T1; int i1,i2,k1,j1,j2,k2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - + + int index = (dimX*dimY)*k + j*dimX+i; + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - + /* symmetric boundary conditions (Neuman) */ i1 = i + 1; if (i1 >= dimX) i1 = i-1; i2 = i - 1; if (i2 < 0) i2 = i+1; j1 = j + 1; if (j1 >= dimY) j1 = j-1; j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; - k2 = k - 1; if (k2 < 0) k2 = k+1; - + k2 = k - 1; if (k2 < 0) k2 = k+1; + /* Forward-backward differences */ NOMx_1 = Input[(dimX*dimY)*k + j1*dimX + i] - Input[index]; /* x+ */ - NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */ + NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */ NOMy_0 = Input[index] - Input[(dimX*dimY)*k + j*dimX + i2]; /* y- */ - + NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + j*dimX + i]; /* z- */ - - + + denom1 = NOMx_1*NOMx_1; denom2 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0))); denom2 = denom2*denom2; denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0))); denom3 = denom3*denom3; T1 = sqrt(denom1 + denom2 + denom3 + EPS); - D1[index] = NOMx_1/T1; - } - } + D1[index] = NOMx_1/T1; + } + } /* differences 2 */ - __global__ void D2_func3D(float* Input, float* D2, int dimX, int dimY, int dimZ) + __global__ void D2_func3D(float* Input, float* D2, int dimX, int dimY, int dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2; int i1,i2,k1,j1,j2,k2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - + + int index = (dimX*dimY)*k + j*dimX+i; + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { /* symmetric boundary conditions (Neuman) */ i1 = i + 1; if (i1 >= dimX) i1 = i-1; @@ -203,16 +207,16 @@ __host__ __device__ int sign (float x) j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - - + + /* Forward-backward differences */ NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */ NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */ NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + (j)*dimX + i]; /* z- */ - - + + denom1 = NOMy_1*NOMy_1; denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0))); denom2 = denom2*denom2; @@ -222,19 +226,19 @@ __host__ __device__ int sign (float x) D2[index] = NOMy_1/T2; } } - + /* differences 3 */ - __global__ void D3_func3D(float* Input, float* D3, int dimX, int dimY, int dimZ) + __global__ void D3_func3D(float* Input, float* D3, int dimX, int dimY, int dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMy_0, NOMz_1, denom1, denom2, denom3, T3; int i1,i2,k1,j1,j2,k2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - + + int index = (dimX*dimY)*k + j*dimX+i; + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { i1 = i + 1; if (i1 >= dimX) i1 = i-1; @@ -243,14 +247,14 @@ __host__ __device__ int sign (float x) j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /* Forward-backward differences */ NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */ NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */ NOMy_0 = Input[index] - Input[(dimX*dimY)*k + (j)*dimX + i2]; /* y- */ NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ - + denom1 = NOMz_1*NOMz_1; denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0))); denom2 = denom2*denom2; @@ -261,18 +265,18 @@ __host__ __device__ int sign (float x) } } - __global__ void TV_kernel3D(float *D1, float *D2, float *D3, float *Update, float *Input, float lambda, float tau, int dimX, int dimY, int dimZ) + __global__ void TV_kernel3D(float *D1, float *D2, float *D3, float *Update, float *Input, float lambda, float tau, int dimX, int dimY, int dimZ) { float dv1, dv2, dv3; int i1,i2,k1,j1,j2,k2; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - + + int index = (dimX*dimY)*k + j*dimX+i; + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - + /* symmetric boundary conditions (Neuman) */ i1 = i + 1; if (i1 >= dimX) i1 = i-1; i2 = i - 1; if (i2 < 0) i2 = i+1; @@ -280,79 +284,200 @@ __host__ __device__ int sign (float x) j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /*divergence components */ dv1 = D1[index] - D1[(dimX*dimY)*k + j2*dimX+i]; dv2 = D2[index] - D2[(dimX*dimY)*k + j*dimX+i2]; dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i]; - - Update[index] += tau*(2.0f*lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index])); - - } + + Update[index] += tau*(lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index])); + + } } +__global__ void ROFcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + + +__global__ void ROFResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + +__global__ void ROFcopy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + +__global__ void ROFResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + ///////////////////////////////////////////////// -// HOST FUNCTION -extern "C" int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z) +///////////////// HOST FUNCTION ///////////////// +extern "C" int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, float lambdaPar, int iter, float tau, float epsil, int N, int M, int Z) { - // set up device - int dev = 0; - CHECK(cudaSetDevice(dev)); - float *d_input, *d_update, *d_D1, *d_D2; - + int deviceCount = -1; // number of devices + cudaGetDeviceCount(&deviceCount); + if (deviceCount == 0) { + fprintf(stderr, "No CUDA devices found\n"); + return -1; + } + float re; + re = 0.0f; + int ImSize, count, n; + count = 0; n = 0; + float *d_input, *d_update, *d_D1, *d_D2, *d_update_prev=NULL; + if (Z == 0) Z = 1; - CHECK(cudaMalloc((void**)&d_input,N*M*Z*sizeof(float))); - CHECK(cudaMalloc((void**)&d_update,N*M*Z*sizeof(float))); - CHECK(cudaMalloc((void**)&d_D1,N*M*Z*sizeof(float))); - CHECK(cudaMalloc((void**)&d_D2,N*M*Z*sizeof(float))); - - CHECK(cudaMemcpy(d_input,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice)); - CHECK(cudaMemcpy(d_update,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice)); - - if (Z > 1) { - // TV - 3D case + ImSize = N*M*Z; + CHECK(cudaMalloc((void**)&d_input,ImSize*sizeof(float))); + CHECK(cudaMalloc((void**)&d_update,ImSize*sizeof(float))); + CHECK(cudaMalloc((void**)&d_D1,ImSize*sizeof(float))); + CHECK(cudaMalloc((void**)&d_D2,ImSize*sizeof(float))); + if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) ); + + CHECK(cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + CHECK(cudaMemcpy(d_update,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + + if (Z == 1) { + // TV - 2D case + dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); + dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D)); + + for(n=0; n < iter; n++) { + + if ((epsil != 0.0f) && (n % 5 == 0)) { + ROFcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, N, M, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + /* calculate differences */ + D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M); + CHECK(cudaDeviceSynchronize()); + D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M); + CHECK(cudaDeviceSynchronize()); + /*running main kernel*/ + TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambdaPar, tau, N, M); + CHECK(cudaDeviceSynchronize()); + + if ((epsil != 0.0f) && (n % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + ROFResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_D1, N, M, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors( cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(d_D1, d_D1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } + + } + } + else { + // TV - 3D case dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); - dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE)); - + dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE)); + float *d_D3; CHECK(cudaMalloc((void**)&d_D3,N*M*Z*sizeof(float))); - - for(int n=0; n < iter; n++) { + + for(n=0; n < iter; n++) { + + if ((epsil != 0.0f) && (n % 5 == 0)) { + ROFcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, N, M, Z, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + /* calculate differences */ D1_func3D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M, Z); CHECK(cudaDeviceSynchronize()); - D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z); - CHECK(cudaDeviceSynchronize()); + D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z); + CHECK(cudaDeviceSynchronize()); D3_func3D<<<dimGrid,dimBlock>>>(d_update, d_D3, N, M, Z); - CHECK(cudaDeviceSynchronize()); + CHECK(cudaDeviceSynchronize()); /*running main kernel*/ TV_kernel3D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_D3, d_update, d_input, lambdaPar, tau, N, M, Z); CHECK(cudaDeviceSynchronize()); + + if ((epsil != 0.0f) && (n % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + ROFResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_D1, N, M, Z, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors( cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(d_D1, d_D1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + + } + + } - + CHECK(cudaFree(d_D3)); } - else { - // TV - 2D case - dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); - dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D)); - - for(int n=0; n < iter; n++) { - /* calculate differences */ - D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M); - CHECK(cudaDeviceSynchronize()); - D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M); - CHECK(cudaDeviceSynchronize()); - /*running main kernel*/ - TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambdaPar, tau, N, M); - CHECK(cudaDeviceSynchronize()); - } - } CHECK(cudaMemcpy(Output,d_update,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost)); + if (epsil != 0.0f) cudaFree(d_update_prev); CHECK(cudaFree(d_input)); CHECK(cudaFree(d_update)); CHECK(cudaFree(d_D1)); - CHECK(cudaFree(d_D2)); - //cudaDeviceReset(); + CHECK(cudaFree(d_D2)); + + infovector[0] = (float)(n); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; } diff --git a/src/Core/regularisers_GPU/TV_ROF_GPU_core.h b/src/Core/regularisers_GPU/TV_ROF_GPU_core.h index 3a09296..0a75124 100755 --- a/src/Core/regularisers_GPU/TV_ROF_GPU_core.h +++ b/src/Core/regularisers_GPU/TV_ROF_GPU_core.h @@ -3,6 +3,6 @@ #include "CCPiDefines.h" #include <stdio.h> -extern "C" CCPI_EXPORT int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z); +extern "C" CCPI_EXPORT int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, float lambdaPar, int iter, float tau, float epsil, int N, int M, int Z); #endif diff --git a/src/Core/regularisers_GPU/TV_SB_GPU_core.cu b/src/Core/regularisers_GPU/TV_SB_GPU_core.cu index 1f494ee..0353868 100755 --- a/src/Core/regularisers_GPU/TV_SB_GPU_core.cu +++ b/src/Core/regularisers_GPU/TV_SB_GPU_core.cu @@ -15,10 +15,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ +*/ #include "TV_SB_GPU_core.h" #include "shared.h" +#include <thrust/functional.h> #include <thrust/device_vector.h> #include <thrust/transform_reduce.h> @@ -31,15 +32,14 @@ limitations under the License. * 4. eplsilon - tolerance constant [OPTIONAL parameter] * 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter] * 6. nonneg: 'nonnegativity (0 is OFF by default) [OPTIONAL parameter] -* 7. print information: 0 (off) or 1 (on) [OPTIONAL parameter] * * Output: -* 1. Filtered/regularized image -* +* [1] Filtered/regularized image/volume +* [2] Information vector which contains [iteration no., reached tolerance] + * [1]. Goldstein, T. and Osher, S., 2009. The split Bregman method for L1-regularized problems. SIAM journal on imaging sciences, 2(2), pp.323-343. */ -// This will output the proper CUDA error strings in the event that a CUDA host call returns an error #define BLKXSIZE2D 16 #define BLKYSIZE2D 16 @@ -49,29 +49,29 @@ limitations under the License. #define BLKZSIZE 8 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) -struct square { __host__ __device__ float operator()(float x) { return x * x; } }; +// struct square { __host__ __device__ float operator()(float x) { return x * x; } }; /************************************************/ /*****************2D modules*********************/ /************************************************/ __global__ void gauss_seidel2D_kernel(float *U, float *A, float *U_prev, float *Dx, float *Dy, float *Bx, float *By, float lambda, float mu, float normConst, int N, int M, int ImSize) { - + float sum; int i1,i2,j1,j2; - + //calculate each thread global index const int i=blockIdx.x*blockDim.x+threadIdx.x; const int j=blockIdx.y*blockDim.y+threadIdx.y; - + int index = j*N+i; - + if ((i < N) && (j < M)) { i1 = i+1; if (i1 == N) i1 = i-1; i2 = i-1; if (i2 < 0) i2 = i+1; j1 = j+1; if (j1 == M) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; - + sum = Dx[j*N+i2] - Dx[index] + Dy[j2*N+i] - Dy[index] - Bx[j*N+i2] + Bx[index] - By[j2*N+i] + By[index]; sum += U_prev[j*N+i1] + U_prev[j*N+i2] + U_prev[j1*N+i] + U_prev[j2*N+i]; sum *= lambda; @@ -82,27 +82,27 @@ __global__ void gauss_seidel2D_kernel(float *U, float *A, float *U_prev, float * } __global__ void updDxDy_shrinkAniso2D_kernel(float *U, float *Dx, float *Dy, float *Bx, float *By, float lambda, int N, int M, int ImSize) { - + int i1,j1; float val1, val11, val2, val22, denom_lam; denom_lam = 1.0f/lambda; - + //calculate each thread global index const int i=blockIdx.x*blockDim.x+threadIdx.x; const int j=blockIdx.y*blockDim.y+threadIdx.y; - + int index = j*N+i; - + if ((i < N) && (j < M)) { i1 = i+1; if (i1 == N) i1 = i-1; j1 = j+1; if (j1 == M) j1 = j-1; - + val1 = (U[j*N+i1] - U[index]) + Bx[index]; val2 = (U[j1*N+i] - U[index]) + By[index]; - + val11 = abs(val1) - denom_lam; if (val11 < 0) val11 = 0; val22 = abs(val2) - denom_lam; if (val22 < 0) val22 = 0; - + if (val1 !=0) Dx[index] = (val1/abs(val1))*val11; else Dx[index] = 0; if (val2 !=0) Dy[index] = (val2/abs(val2))*val22; else Dy[index] = 0; } @@ -111,28 +111,28 @@ __global__ void updDxDy_shrinkAniso2D_kernel(float *U, float *Dx, float *Dy, flo __global__ void updDxDy_shrinkIso2D_kernel(float *U, float *Dx, float *Dy, float *Bx, float *By, float lambda, int N, int M, int ImSize) { - + int i1,j1; float val1, val11, val2, denom_lam, denom; denom_lam = 1.0f/lambda; - + //calculate each thread global index const int i=blockIdx.x*blockDim.x+threadIdx.x; const int j=blockIdx.y*blockDim.y+threadIdx.y; - + int index = j*N+i; - + if ((i < N) && (j < M)) { i1 = i+1; if (i1 == N) i1 = i-1; j1 = j+1; if (j1 == M) j1 = j-1; - + val1 = (U[j*N+i1] - U[index]) + Bx[index]; val2 = (U[j1*N+i] - U[index]) + By[index]; - + denom = sqrt(val1*val1 + val2*val2); - + val11 = (denom - denom_lam); if (val11 < 0) val11 = 0.0f; - + if (denom != 0.0f) { Dx[index] = val11*(val1/denom); Dy[index] = val11*(val2/denom); @@ -146,20 +146,20 @@ __global__ void updDxDy_shrinkIso2D_kernel(float *U, float *Dx, float *Dy, float } __global__ void updBxBy2D_kernel(float *U, float *Dx, float *Dy, float *Bx, float *By, int N, int M, int ImSize) -{ +{ int i1,j1; - + //calculate each thread global index const int i=blockIdx.x*blockDim.x+threadIdx.x; const int j=blockIdx.y*blockDim.y+threadIdx.y; - + int index = j*N+i; - + if ((i < N) && (j < M)) { /* symmetric boundary conditions (Neuman) */ i1 = i+1; if (i1 == N) i1 = i-1; j1 = j+1; if (j1 == M) j1 = j-1; - + Bx[index] += (U[j*N+i1] - U[index]) - Dx[index]; By[index] += (U[j1*N+i] - U[index]) - Dy[index]; } @@ -172,17 +172,17 @@ __global__ void updBxBy2D_kernel(float *U, float *Dx, float *Dy, float *Bx, floa /************************************************/ __global__ void gauss_seidel3D_kernel(float *U, float *A, float *U_prev, float *Dx, float *Dy, float *Dz, float *Bx, float *By, float *Bz, float lambda, float mu, float normConst, int N, int M, int Z, int ImSize) { - + float sum,d_val,b_val; int i1,i2,j1,j2,k1,k2; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { i1 = i+1; if (i1 == N) i1 = i-1; i2 = i-1; if (i2 < 0) i2 = i+1; @@ -190,7 +190,7 @@ __global__ void gauss_seidel3D_kernel(float *U, float *A, float *U_prev, float * j2 = j-1; if (j2 < 0) j2 = j+1; k1 = k+1; if (k1 == Z) k1 = k-1; k2 = k-1; if (k2 < 0) k2 = k+1; - + d_val = Dx[(N*M)*k + j*N+i2] - Dx[index] + Dy[(N*M)*k + j2*N+i] - Dy[index] + Dz[(N*M)*k2 + j*N+i] - Dz[index]; b_val = -Bx[(N*M)*k + j*N+i2] + Bx[index] - By[(N*M)*k + j2*N+i] + By[index] - Bz[(N*M)*k2 + j*N+i] + Bz[index]; sum = d_val + b_val; @@ -203,31 +203,31 @@ __global__ void gauss_seidel3D_kernel(float *U, float *A, float *U_prev, float * } __global__ void updDxDy_shrinkAniso3D_kernel(float *U, float *Dx, float *Dy, float *Dz, float *Bx, float *By, float *Bz, float lambda, int N, int M, int Z, int ImSize) { - + int i1,j1,k1; float val1, val11, val2, val3, val22, val33, denom_lam; denom_lam = 1.0f/lambda; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { i1 = i+1; if (i1 == N) i1 = i-1; j1 = j+1; if (j1 == M) j1 = j-1; k1 = k+1; if (k1 == Z) k1 = k-1; - + val1 = (U[(N*M)*k + i1 + N*j] - U[index]) + Bx[index]; val2 = (U[(N*M)*k + i + N*j1] - U[index]) + By[index]; val3 = (U[(N*M)*k1 + i + N*j] - U[index]) + Bz[index]; - + val11 = abs(val1) - denom_lam; if (val11 < 0.0f) val11 = 0.0f; val22 = abs(val2) - denom_lam; if (val22 < 0.0f) val22 = 0.0f; val33 = abs(val3) - denom_lam; if (val33 < 0.0f) val33 = 0.0f; - + if (val1 !=0.0f) Dx[index] = (val1/abs(val1))*val11; else Dx[index] = 0.0f; if (val2 !=0.0f) Dy[index] = (val2/abs(val2))*val22; else Dy[index] = 0.0f; if (val3 !=0.0f) Dz[index] = (val3/abs(val3))*val33; else Dz[index] = 0.0f; @@ -237,31 +237,31 @@ __global__ void updDxDy_shrinkAniso3D_kernel(float *U, float *Dx, float *Dy, flo __global__ void updDxDy_shrinkIso3D_kernel(float *U, float *Dx, float *Dy, float *Dz, float *Bx, float *By, float *Bz, float lambda, int N, int M, int Z, int ImSize) { - + int i1,j1,k1; float val1, val11, val2, val3, denom_lam, denom; denom_lam = 1.0f/lambda; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { i1 = i+1; if (i1 == N) i1 = i-1; j1 = j+1; if (j1 == M) j1 = j-1; k1 = k+1; if (k1 == Z) k1 = k-1; - + val1 = (U[(N*M)*k + i1 + N*j] - U[index]) + Bx[index]; val2 = (U[(N*M)*k + i + N*j1] - U[index]) + By[index]; val3 = (U[(N*M)*k1 + i + N*j] - U[index]) + Bz[index]; - + denom = sqrt(val1*val1 + val2*val2 + val3*val3); - + val11 = (denom - denom_lam); if (val11 < 0.0f) val11 = 0.0f; - + if (denom != 0.0f) { Dx[index] = val11*(val1/denom); Dy[index] = val11*(val2/denom); @@ -277,22 +277,22 @@ __global__ void updDxDy_shrinkIso3D_kernel(float *U, float *Dx, float *Dy, float } __global__ void updBxBy3D_kernel(float *U, float *Dx, float *Dy, float *Dz, float *Bx, float *By, float *Bz, int N, int M, int Z, int ImSize) -{ +{ int i1,j1,k1; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { /* symmetric boundary conditions (Neuman) */ i1 = i+1; if (i1 == N) i1 = i-1; j1 = j+1; if (j1 == M) j1 = j-1; k1 = k+1; if (k1 == Z) k1 = k-1; - + Bx[index] += (U[(N*M)*k + i1 + N*j] - U[index]) - Dx[index]; By[index] += (U[(N*M)*k + i + N*j1] - U[index]) - Dy[index]; Bz[index] += (U[(N*M)*k1 + i + N*j] - U[index]) - Dz[index]; @@ -304,9 +304,9 @@ __global__ void SBcopy_kernel2D(float *Input, float* Output, int N, int M, int n { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { Output[index] = Input[index]; } @@ -317,9 +317,9 @@ __global__ void SBcopy_kernel3D(float *Input, float* Output, int N, int M, int Z int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if (index < num_total) { Output[index] = Input[index]; } @@ -329,9 +329,9 @@ __global__ void SBResidCalc2D_kernel(float *Input1, float *Input2, float* Output { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { Output[index] = Input1[index] - Input2[index]; } @@ -342,9 +342,9 @@ __global__ void SBResidCalc3D_kernel(float *Input1, float *Input2, float* Output int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if (index < num_total) { Output[index] = Input1[index] - Input2[index]; } @@ -353,7 +353,7 @@ __global__ void SBResidCalc3D_kernel(float *Input1, float *Input2, float* Output /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ /********************* MAIN HOST FUNCTION ******************/ /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ -extern "C" int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ) +extern "C" int TV_SB_GPU_main(float *Input, float *Output, float *infovector, float mu, int iter, float epsil, int methodTV, int dimX, int dimY, int dimZ) { int deviceCount = -1; // number of devices cudaGetDeviceCount(&deviceCount); @@ -361,134 +361,107 @@ extern "C" int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, f fprintf(stderr, "No CUDA devices found\n"); return -1; } - + int ll, DimTotal; float re, lambda, normConst; - int count = 0; - mu = 1.0f/mu; + re = 0.0f; + int count = 0; + mu = 1.0f/mu; lambda = 2.0f*mu; + DimTotal = dimX*dimY*dimZ; + float *d_input, *d_update, *d_res, *d_update_prev=NULL, *Dx=NULL, *Dy=NULL, *Bx=NULL, *By=NULL; + + /*allocate space for images on device*/ + checkCudaErrors( cudaMalloc((void**)&d_input,DimTotal*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,DimTotal*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update_prev,DimTotal*sizeof(float)) ); + if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_res,DimTotal*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&Dx,DimTotal*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&Dy,DimTotal*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&Bx,DimTotal*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&By,DimTotal*sizeof(float)) ); + + checkCudaErrors( cudaMemcpy(d_input,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); + checkCudaErrors( cudaMemcpy(d_update,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); + cudaMemset(Dx, 0, DimTotal*sizeof(float)); + cudaMemset(Dy, 0, DimTotal*sizeof(float)); + cudaMemset(Bx, 0, DimTotal*sizeof(float)); + cudaMemset(By, 0, DimTotal*sizeof(float)); if (dimZ <= 1) { /*2D verson*/ - DimTotal = dimX*dimY; normConst = 1.0f/(mu + 4.0f*lambda); - float *d_input, *d_update, *d_res, *d_update_prev=NULL, *Dx=NULL, *Dy=NULL, *Bx=NULL, *By=NULL; - dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D)); - - /*allocate space for images on device*/ - checkCudaErrors( cudaMalloc((void**)&d_input,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&d_update,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&d_update_prev,DimTotal*sizeof(float)) ); - if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_res,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&Dx,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&Dy,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&Bx,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&By,DimTotal*sizeof(float)) ); - - checkCudaErrors( cudaMemcpy(d_input,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); - checkCudaErrors( cudaMemcpy(d_update,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); - cudaMemset(Dx, 0, DimTotal*sizeof(float)); - cudaMemset(Dy, 0, DimTotal*sizeof(float)); - cudaMemset(Bx, 0, DimTotal*sizeof(float)); - cudaMemset(By, 0, DimTotal*sizeof(float)); - - /********************** Run CUDA 2D kernels here ********************/ - /* The main kernel */ + /********************** Run CUDA 2D kernels here ********************/ for (ll = 0; ll < iter; ll++) { - + /* storing old value */ SBcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); + checkCudaErrors(cudaPeekAtLastError() ); /* perform two GS iterations (normally 2 is enough for the convergence) */ gauss_seidel2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, d_update_prev, Dx, Dy, Bx, By, lambda, mu, normConst, dimX, dimY, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); + checkCudaErrors(cudaPeekAtLastError() ); SBcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); + checkCudaErrors(cudaPeekAtLastError() ); /* 2nd GS iteration */ gauss_seidel2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, d_update_prev, Dx, Dy, Bx, By, lambda, mu, normConst, dimX, dimY, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + /* TV-related step */ - if (methodTV == 1) updDxDy_shrinkAniso2D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Bx, By, lambda, dimX, dimY, DimTotal); - else updDxDy_shrinkIso2D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Bx, By, lambda, dimX, dimY, DimTotal); - + if (methodTV == 1) updDxDy_shrinkAniso2D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Bx, By, lambda, dimX, dimY, DimTotal); + else updDxDy_shrinkIso2D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Bx, By, lambda, dimX, dimY, DimTotal); + /* update for Bregman variables */ updBxBy2D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Bx, By, dimX, dimY, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - if (epsil != 0.0f) { - /* calculate norm - stopping rules using the Thrust library */ - SBResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_res, dimX, dimY, DimTotal); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - thrust::device_vector<float> d_vec(d_res, d_res + DimTotal); - float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>())); - thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal); - float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>())); - - re = (reduction/reduction2); - if (re < epsil) count++; - if (count > 4) break; - } - - } - if (printM == 1) printf("SB-TV iterations stopped at iteration %i \n", ll); - /***************************************************************/ - //copy result matrix from device to host memory - cudaMemcpy(Output,d_update,DimTotal*sizeof(float),cudaMemcpyDeviceToHost); - - cudaFree(d_input); - cudaFree(d_update); - cudaFree(d_update_prev); - if (epsil != 0.0f) cudaFree(d_res); - cudaFree(Dx); - cudaFree(Dy); - cudaFree(Bx); - cudaFree(By); - } + checkCudaErrors(cudaPeekAtLastError() ); + + if ((epsil != 0.0f) && (ll % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + SBResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_res, dimX, dimY, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(d_res, d_res + DimTotal); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } + } + /***************************************************************/ + } else { /*3D verson*/ - DimTotal = dimX*dimY*dimZ; normConst = 1.0f/(mu + 6.0f*lambda); - float *d_input, *d_update, *d_res, *d_update_prev=NULL, *Dx=NULL, *Dy=NULL, *Dz=NULL, *Bx=NULL, *By=NULL, *Bz=NULL; - - dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); - dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE)); - + float *Dz=NULL, *Bz=NULL; + + dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); + dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE)); + /*allocate space for images on device*/ - checkCudaErrors( cudaMalloc((void**)&d_input,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&d_update,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&d_update_prev,DimTotal*sizeof(float)) ); - if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_res,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&Dx,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&Dy,DimTotal*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&Dz,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&Bx,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&By,DimTotal*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&Bz,DimTotal*sizeof(float)) ); - - checkCudaErrors( cudaMemcpy(d_input,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); - checkCudaErrors( cudaMemcpy(d_update,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); - cudaMemset(Dx, 0, DimTotal*sizeof(float)); - cudaMemset(Dy, 0, DimTotal*sizeof(float)); - cudaMemset(Dz, 0, DimTotal*sizeof(float)); - cudaMemset(Bx, 0, DimTotal*sizeof(float)); - cudaMemset(By, 0, DimTotal*sizeof(float)); - cudaMemset(Bz, 0, DimTotal*sizeof(float)); - - /********************** Run CUDA 3D kernels here ********************/ + + cudaMemset(Dz, 0, DimTotal*sizeof(float)); + cudaMemset(Bz, 0, DimTotal*sizeof(float)); + /********************** Run CUDA 3D kernels here ********************/ /* The main kernel */ for (ll = 0; ll < iter; ll++) { - + /* storing old value */ SBcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); @@ -497,56 +470,64 @@ extern "C" int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, f /* perform two GS iterations (normally 2 is enough for the convergence) */ gauss_seidel3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, d_update_prev, Dx, Dy, Dz, Bx, By, Bz, lambda, mu, normConst, dimX, dimY, dimZ, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); + checkCudaErrors(cudaPeekAtLastError() ); SBcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); + checkCudaErrors(cudaPeekAtLastError() ); /* 2nd GS iteration */ gauss_seidel3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, d_update_prev, Dx, Dy, Dz, Bx, By, Bz, lambda, mu, normConst, dimX, dimY, dimZ, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + /* TV-related step */ if (methodTV == 1) updDxDy_shrinkAniso3D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Dz, Bx, By, Bz, lambda, dimX, dimY, dimZ, DimTotal); else updDxDy_shrinkIso3D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Dz, Bx, By, Bz, lambda, dimX, dimY, dimZ, DimTotal); - + /* update for Bregman variables */ updBxBy3D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Dz, Bx, By, Bz, dimX, dimY, dimZ, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - if (epsil != 0.0f) { - /* calculate norm - stopping rules using the Thrust library */ - SBResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_res, dimX, dimY, dimZ, DimTotal); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - thrust::device_vector<float> d_vec(d_res, d_res + DimTotal); - float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>())); - thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal); - float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>())); - - re = (reduction/reduction2); - if (re < epsil) count++; - if (count > 4) break; - } + checkCudaErrors(cudaPeekAtLastError() ); + + if ((epsil != 0.0f) && (ll % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + SBResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_res, dimX, dimY, dimZ, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(d_res, d_res + DimTotal); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } } - if (printM == 1) printf("SB-TV iterations stopped at iteration %i \n", ll); - /***************************************************************/ - //copy result matrix from device to host memory - cudaMemcpy(Output,d_update,DimTotal*sizeof(float),cudaMemcpyDeviceToHost); - - cudaFree(d_input); - cudaFree(d_update); - cudaFree(d_update_prev); - if (epsil != 0.0f) cudaFree(d_res); - cudaFree(Dx); - cudaFree(Dy); - cudaFree(Dz); - cudaFree(Bx); - cudaFree(By); - cudaFree(Bz); - } - //cudaDeviceReset(); + cudaFree(Dz); + cudaFree(Bz); + + } /**************************end else (3D)********************************/ + + //copy result matrix from device to host memory + cudaMemcpy(Output,d_update,DimTotal*sizeof(float),cudaMemcpyDeviceToHost); + + cudaFree(d_input); + cudaFree(d_update); + cudaFree(d_update_prev); + if (epsil != 0.0f) cudaFree(d_res); + cudaFree(Dx); + cudaFree(Dy); + cudaFree(Bx); + cudaFree(By); + + /*adding info into info_vector */ + infovector[0] = (float)(ll); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; } diff --git a/src/Core/regularisers_GPU/TV_SB_GPU_core.h b/src/Core/regularisers_GPU/TV_SB_GPU_core.h index 901b90f..1eaa6af 100755 --- a/src/Core/regularisers_GPU/TV_SB_GPU_core.h +++ b/src/Core/regularisers_GPU/TV_SB_GPU_core.h @@ -5,6 +5,6 @@ #include <memory.h> -extern "C" CCPI_EXPORT int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ); +extern "C" CCPI_EXPORT int TV_SB_GPU_main(float *Input, float *Output, float *infovector, float mu, int iter, float epsil, int methodTV, int dimX, int dimY, int dimZ); -#endif +#endif diff --git a/src/Core/regularisers_GPU/dTV_FGP_GPU_core.cu b/src/Core/regularisers_GPU/dTV_FGP_GPU_core.cu index 7503ec7..89fca06 100644 --- a/src/Core/regularisers_GPU/dTV_FGP_GPU_core.cu +++ b/src/Core/regularisers_GPU/dTV_FGP_GPU_core.cu @@ -15,9 +15,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ +*/ #include "shared.h" #include "dTV_FGP_GPU_core.h" +#include <thrust/functional.h> #include <thrust/device_vector.h> #include <thrust/transform_reduce.h> @@ -31,19 +32,19 @@ limitations under the License. * 3. lambdaPar - regularization parameter [REQUIRED] * 4. Number of iterations [OPTIONAL] * 5. eplsilon: tolerance constant [OPTIONAL] - * 6. eta: smoothing constant to calculate gradient of the reference [OPTIONAL] * + * 6. eta: smoothing constant to calculate gradient of the reference [OPTIONAL] * * 7. TV-type: methodTV - 'iso' (0) or 'l1' (1) [OPTIONAL] * 8. nonneg: 'nonnegativity (0 is OFF by default) [OPTIONAL] - * 9. print information: 0 (off) or 1 (on) [OPTIONAL] - * + * Output: * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the Matlab's codes and papers by * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" * [2] M. J. Ehrhardt and M. M. Betcke, Multi-Contrast MRI Reconstruction with Structure-Guided Total Variation, SIAM Journal on Imaging Sciences 9(3), pp. 1084–1106 */ - + #define BLKXSIZE2D 16 #define BLKYSIZE2D 16 @@ -53,7 +54,7 @@ limitations under the License. #define BLKZSIZE 8 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) -struct square { __host__ __device__ float operator()(float x) { return x * x; } }; +//struct square { __host__ __device__ float operator()(float x) { return x * x; } }; /************************************************/ /*****************2D modules*********************/ @@ -61,43 +62,43 @@ struct square { __host__ __device__ float operator()(float x) { return x * x; } __global__ void GradNorm_func2D_kernel(float *Refd, float *Refd_x, float *Refd_y, float eta, int N, int M, int ImSize) { - + float val1, val2, gradX, gradY, magn; //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - - int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { /* boundary conditions */ if (xIndex >= N-1) val1 = 0.0f; else val1 = Refd[(xIndex+1) + N*yIndex]; - if (yIndex >= M-1) val2 = 0.0f; else val2 = Refd[(xIndex) + N*(yIndex + 1)]; - + if (yIndex >= M-1) val2 = 0.0f; else val2 = Refd[(xIndex) + N*(yIndex + 1)]; + gradX = val1 - Refd[index]; gradY = val2 - Refd[index]; magn = pow(gradX,2) + pow(gradY,2); magn = sqrt(magn + pow(eta,2)); Refd_x[index] = gradX/magn; - Refd_y[index] = gradY/magn; + Refd_y[index] = gradY/magn; } return; } __global__ void ProjectVect_func2D_kernel(float *R1, float *R2, float *Refd_x, float *Refd_y, int N, int M, int ImSize) { - + float in_prod; //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - - int index = xIndex + N*yIndex; - + + int index = xIndex + N*yIndex; + if ((xIndex < N) && (yIndex < M)) { in_prod = R1[index]*Refd_x[index] + R2[index]*Refd_y[index]; /* calculate inner product */ R1[index] = R1[index] - in_prod*Refd_x[index]; - R2[index] = R2[index] - in_prod*Refd_y[index]; + R2[index] = R2[index] - in_prod*Refd_y[index]; } return; } @@ -105,19 +106,19 @@ __global__ void ProjectVect_func2D_kernel(float *R1, float *R2, float *Refd_x, f __global__ void Obj_dfunc2D_kernel(float *Ad, float *D, float *R1, float *R2, int N, int M, int ImSize, float lambda) { - + float val1,val2; - + //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - - int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { if (xIndex <= 0) {val1 = 0.0f;} else {val1 = R1[(xIndex-1) + N*yIndex];} if (yIndex <= 0) {val2 = 0.0f;} else {val2 = R2[xIndex + N*(yIndex-1)];} - + //Write final result to global memory D[index] = Ad[index] - lambda*(R1[index] + R2[index] - val1 - val2); } @@ -126,25 +127,25 @@ __global__ void Obj_dfunc2D_kernel(float *Ad, float *D, float *R1, float *R2, in __global__ void Grad_dfunc2D_kernel(float *P1, float *P2, float *D, float *R1, float *R2, float *Refd_x, float *Refd_y, int N, int M, int ImSize, float multip) { - + float val1,val2,in_prod; - + //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - + int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { - + + if ((xIndex < N) && (yIndex < M)) { + /* boundary conditions */ if (xIndex >= N-1) val1 = 0.0f; else val1 = D[index] - D[(xIndex+1) + N*yIndex]; if (yIndex >= M-1) val2 = 0.0f; else val2 = D[index] - D[(xIndex) + N*(yIndex + 1)]; - + in_prod = val1*Refd_x[index] + val2*Refd_y[index]; /* calculate inner product */ val1 = val1 - in_prod*Refd_x[index]; - val2 = val2 - in_prod*Refd_y[index]; - + val2 = val2 - in_prod*Refd_y[index]; + //Write final result to global memory P1[index] = R1[index] + multip*val1; P2[index] = R2[index] + multip*val2; @@ -154,16 +155,16 @@ __global__ void Grad_dfunc2D_kernel(float *P1, float *P2, float *D, float *R1, f __global__ void Proj_dfunc2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize) { - - float denom; + + float denom; //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - + int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { - denom = pow(P1[index],2) + pow(P2[index],2); + + if ((xIndex < N) && (yIndex < M)) { + denom = pow(P1[index],2) + pow(P2[index],2); if (denom > 1.0f) { P1[index] = P1[index]/sqrt(denom); P2[index] = P2[index]/sqrt(denom); @@ -173,15 +174,15 @@ __global__ void Proj_dfunc2D_iso_kernel(float *P1, float *P2, int N, int M, int } __global__ void Proj_dfunc2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize) { - - float val1, val2; + + float val1, val2; //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - + int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { + + if ((xIndex < N) && (yIndex < M)) { val1 = abs(P1[index]); val2 = abs(P2[index]); if (val1 < 1.0f) {val1 = 1.0f;} @@ -196,10 +197,10 @@ __global__ void Rupd_dfunc2D_kernel(float *P1, float *P1_old, float *P2, float * //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - + int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { + + if ((xIndex < N) && (yIndex < M)) { R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]); R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]); } @@ -209,9 +210,9 @@ __global__ void dTVnonneg2D_kernel(float* Output, int N, int M, int num_total) { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { if (Output[index] < 0.0f) Output[index] = 0.0f; } @@ -220,9 +221,9 @@ __global__ void dTVcopy_kernel2D(float *Input, float* Output, int N, int M, int { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { Output[index] = Input[index]; } @@ -233,9 +234,9 @@ __global__ void dTVcopy_kernel3D(float *Input, float* Output, int N, int M, int int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if (index < num_total) { Output[index] = Input[index]; } @@ -245,9 +246,9 @@ __global__ void dTVResidCalc2D_kernel(float *Input1, float *Input2, float* Outpu { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { Output[index] = Input1[index] - Input2[index]; } @@ -258,9 +259,9 @@ __global__ void dTVResidCalc3D_kernel(float *Input1, float *Input2, float* Outpu int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if (index < num_total) { Output[index] = Input1[index] - Input2[index]; } @@ -271,21 +272,21 @@ __global__ void dTVResidCalc3D_kernel(float *Input1, float *Input2, float* Outpu /************************************************/ __global__ void GradNorm_func3D_kernel(float *Refd, float *Refd_x, float *Refd_y, float *Refd_z, float eta, int N, int M, int Z, int ImSize) { - + float val1, val2, val3, gradX, gradY, gradZ, magn; //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - - if ((i < N) && (j < M) && (k < Z)) { + + if ((i < N) && (j < M) && (k < Z)) { /* boundary conditions */ if (i >= N-1) val1 = 0.0f; else val1 = Refd[(N*M)*k + (i+1) + N*j]; if (j >= M-1) val2 = 0.0f; else val2 = Refd[(N*M)*k + i + N*(j+1)]; if (k >= Z-1) val3 = 0.0f; else val3 = Refd[(N*M)*(k+1) + i + N*j]; - + gradX = val1 - Refd[index]; gradY = val2 - Refd[index]; gradZ = val3 - Refd[index]; @@ -300,18 +301,18 @@ __global__ void GradNorm_func3D_kernel(float *Refd, float *Refd_x, float *Refd_y __global__ void ProjectVect_func3D_kernel(float *R1, float *R2, float *R3, float *Refd_x, float *Refd_y, float *Refd_z, int N, int M, int Z, int ImSize) { - + float in_prod; //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { in_prod = R1[index]*Refd_x[index] + R2[index]*Refd_y[index] + R3[index]*Refd_z[index]; /* calculate inner product */ - + R1[index] = R1[index] - in_prod*Refd_x[index]; R2[index] = R2[index] - in_prod*Refd_y[index]; R3[index] = R3[index] - in_prod*Refd_z[index]; @@ -322,16 +323,16 @@ __global__ void ProjectVect_func3D_kernel(float *R1, float *R2, float *R3, float __global__ void Obj_dfunc3D_kernel(float *Ad, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float lambda) { - + float val1,val2,val3; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { if (i <= 0) {val1 = 0.0f;} else {val1 = R1[(N*M)*(k) + (i-1) + N*j];} if (j <= 0) {val2 = 0.0f;} else {val2 = R2[(N*M)*(k) + i + N*(j-1)];} @@ -344,27 +345,27 @@ __global__ void Obj_dfunc3D_kernel(float *Ad, float *D, float *R1, float *R2, fl __global__ void Grad_dfunc3D_kernel(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, float *Refd_x, float *Refd_y, float *Refd_z, int N, int M, int Z, int ImSize, float multip) { - + float val1,val2,val3,in_prod; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { /* boundary conditions */ if (i >= N-1) val1 = 0.0f; else val1 = D[index] - D[(N*M)*(k) + (i+1) + N*j]; if (j >= M-1) val2 = 0.0f; else val2 = D[index] - D[(N*M)*(k) + i + N*(j+1)]; - if (k >= Z-1) val3 = 0.0f; else val3 = D[index] - D[(N*M)*(k+1) + i + N*j]; - + if (k >= Z-1) val3 = 0.0f; else val3 = D[index] - D[(N*M)*(k+1) + i + N*j]; + in_prod = val1*Refd_x[index] + val2*Refd_y[index] + val3*Refd_z[index]; /* calculate inner product */ val1 = val1 - in_prod*Refd_x[index]; val2 = val2 - in_prod*Refd_y[index]; val3 = val3 - in_prod*Refd_z[index]; - + //Write final result to global memory P1[index] = R1[index] + multip*val1; P2[index] = R2[index] + multip*val2; @@ -375,18 +376,18 @@ __global__ void Grad_dfunc3D_kernel(float *P1, float *P2, float *P3, float *D, f __global__ void Proj_dfunc3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) { - - float denom,sq_denom; + + float denom,sq_denom; //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2); - + if (denom > 1.0f) { sq_denom = 1.0f/sqrt(denom); P1[index] = P1[index]*sq_denom; @@ -399,15 +400,15 @@ __global__ void Proj_dfunc3D_iso_kernel(float *P1, float *P2, float *P3, int N, __global__ void Proj_dfunc3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) { - - float val1, val2, val3; + + float val1, val2, val3; //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { val1 = abs(P1[index]); val2 = abs(P2[index]); @@ -429,10 +430,10 @@ __global__ void Rupd_dfunc3D_kernel(float *P1, float *P1_old, float *P2, float * int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - - if ((i < N) && (j < M) && (k < Z)) { + + if ((i < N) && (j < M) && (k < Z)) { R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]); R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]); R3[index] = P3[index] + multip2*(P3[index] - P3_old[index]); @@ -445,9 +446,9 @@ __global__ void dTVnonneg3D_kernel(float* Output, int N, int M, int Z, int num_t int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if (index < num_total) { if (Output[index] < 0.0f) Output[index] = 0.0f; } @@ -455,7 +456,7 @@ __global__ void dTVnonneg3D_kernel(float* Output, int N, int M, int Z, int num_t /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ ////////////MAIN HOST FUNCTION /////////////// -extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iter, float epsil, float eta, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ) +extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float *infovector, float lambdaPar, int iter, float epsil, float eta, int methodTV, int nonneg, int dimX, int dimY, int dimZ) { int deviceCount = -1; // number of devices cudaGetDeviceCount(&deviceCount); @@ -463,20 +464,21 @@ extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, fl fprintf(stderr, "No CUDA devices found\n"); return -1; } - + int count = 0, i; - float re, multip,multip2; - float tk = 1.0f; + float re, multip,multip2; + re = 0.0f; + float tk = 1.0f; float tkp1=1.0f; - + if (dimZ <= 1) { /*2D verson*/ - int ImSize = dimX*dimY; + int ImSize = dimX*dimY; float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL, *InputRef_x=NULL, *InputRef_y=NULL, *d_InputRef=NULL; - + dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D)); - + /*allocate space for images on device*/ checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); @@ -490,10 +492,10 @@ extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, fl checkCudaErrors( cudaMalloc((void**)&d_InputRef,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&InputRef_x,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&InputRef_y,ImSize*sizeof(float)) ); - + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); checkCudaErrors( cudaMemcpy(d_InputRef,InputRef,ImSize*sizeof(float),cudaMemcpyHostToDevice)); - + cudaMemset(P1, 0, ImSize*sizeof(float)); cudaMemset(P2, 0, ImSize*sizeof(float)); cudaMemset(P1_prev, 0, ImSize*sizeof(float)); @@ -502,85 +504,91 @@ extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, fl cudaMemset(R2, 0, ImSize*sizeof(float)); cudaMemset(InputRef_x, 0, ImSize*sizeof(float)); cudaMemset(InputRef_y, 0, ImSize*sizeof(float)); - + /******************** Run CUDA 2D kernel here ********************/ multip = (1.0f/(8.0f*lambdaPar)); /* calculate gradient vectors for the reference */ GradNorm_func2D_kernel<<<dimGrid,dimBlock>>>(d_InputRef, InputRef_x, InputRef_y, eta, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + /* The main kernel */ for (i = 0; i < iter; i++) { - - /*projects a 2D vector field R-1,2 onto the orthogonal complement of another 2D vector field InputRef_xy*/ + + if ((epsil != 0.0f) && (i % 5 == 0)) { + dTVcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + + /*projects a 2D vector field R-1,2 onto the orthogonal complement of another 2D vector field InputRef_xy*/ ProjectVect_func2D_kernel<<<dimGrid,dimBlock>>>(R1, R2, InputRef_x, InputRef_y, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + /* computing the gradient of the objective function */ Obj_dfunc2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambdaPar); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + if (nonneg != 0) { dTVnonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); } - + /*Taking a step towards minus of the gradient*/ Grad_dfunc2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, d_update, R1, R2, InputRef_x, InputRef_y, dimX, dimY, ImSize, multip); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + /* projection step */ if (methodTV == 0) Proj_dfunc2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/ - else Proj_dfunc2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/ + else Proj_dfunc2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/ checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; multip2 = ((tk-1.0f)/tkp1); - + Rupd_dfunc2D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - - if (epsil != 0.0f) { - /* calculate norm - stopping rules using the Thrust library */ - dTVResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize); - float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>())); - thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); - float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>())); - - re = (reduction/reduction2); - if (re < epsil) count++; - if (count > 4) break; - - dTVcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - } - + dTVcopy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + dTVcopy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + tk = tkp1; + + if ((epsil != 0.0f) && (i % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + dTVResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(P1, P1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } + } - if (printM == 1) printf("FGP-dTV iterations stopped at iteration %i \n", i); - /***************************************************************/ + /***************************************************************/ //copy result matrix from device to host memory cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); - + cudaFree(d_input); cudaFree(d_update); if (epsil != 0.0f) cudaFree(d_update_prev); @@ -590,19 +598,19 @@ extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, fl cudaFree(P2_prev); cudaFree(R1); cudaFree(R2); - + cudaFree(d_InputRef); cudaFree(InputRef_x); cudaFree(InputRef_y); } else { /*3D verson*/ - int ImSize = dimX*dimY*dimZ; + int ImSize = dimX*dimY*dimZ; float *d_input, *d_update=NULL, *d_update_prev, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL, *InputRef_x=NULL, *InputRef_y=NULL, *InputRef_z=NULL, *d_InputRef=NULL; - + dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE)); - + /*allocate space for images on device*/ checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); @@ -619,11 +627,11 @@ extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, fl checkCudaErrors( cudaMalloc((void**)&d_InputRef,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&InputRef_x,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&InputRef_y,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&InputRef_z,ImSize*sizeof(float)) ); - + checkCudaErrors( cudaMalloc((void**)&InputRef_z,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); checkCudaErrors( cudaMemcpy(d_InputRef,InputRef,ImSize*sizeof(float),cudaMemcpyHostToDevice)); - + cudaMemset(P1, 0, ImSize*sizeof(float)); cudaMemset(P2, 0, ImSize*sizeof(float)); cudaMemset(P3, 0, ImSize*sizeof(float)); @@ -636,89 +644,93 @@ extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, fl cudaMemset(InputRef_x, 0, ImSize*sizeof(float)); cudaMemset(InputRef_y, 0, ImSize*sizeof(float)); cudaMemset(InputRef_z, 0, ImSize*sizeof(float)); - - /********************** Run CUDA 3D kernel here ********************/ + + /********************** Run CUDA 3D kernel here ********************/ multip = (1.0f/(26.0f*lambdaPar)); /* calculate gradient vectors for the reference */ GradNorm_func3D_kernel<<<dimGrid,dimBlock>>>(d_InputRef, InputRef_x, InputRef_y, InputRef_z, eta, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + /* The main kernel */ for (i = 0; i < iter; i++) { - /*projects a 3D vector field R-1,2,3 onto the orthogonal complement of another 3D vector field InputRef_xyz*/ + if ((epsil != 0.0f) && (i % 5 == 0)) { + dTVcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + + /*projects a 3D vector field R-1,2,3 onto the orthogonal complement of another 3D vector field InputRef_xyz*/ ProjectVect_func3D_kernel<<<dimGrid,dimBlock>>>(R1, R2, R3, InputRef_x, InputRef_y, InputRef_z, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + /* computing the gradient of the objective function */ Obj_dfunc3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambdaPar); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + if (nonneg != 0) { dTVnonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); } - + /*Taking a step towards minus of the gradient*/ Grad_dfunc3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, d_update, R1, R2, R3, InputRef_x, InputRef_y, InputRef_z, dimX, dimY, dimZ, ImSize, multip); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + /* projection step */ if (methodTV == 0) Proj_dfunc3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* isotropic kernel */ else Proj_dfunc3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* anisotropic kernel */ checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; multip2 = ((tk-1.0f)/tkp1); - + Rupd_dfunc3D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, multip2, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - - if (epsil != 0.0f) { - /* calculate norm - stopping rules using the Thrust library */ - dTVResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, dimZ, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize); - float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>())); - thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); - float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>())); - - re = (reduction/reduction2); - if (re < epsil) count++; - if (count > 4) break; - - dTVcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - } - + dTVcopy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + dTVcopy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + dTVcopy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + tk = tkp1; + if ((epsil != 0.0f) && (i % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + dTVResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(P1, P1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } } - if (printM == 1) printf("FGP-dTV iterations stopped at iteration %i \n", i); - /***************************************************************/ + /***************************************************************/ //copy result matrix from device to host memory cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); - + cudaFree(d_input); cudaFree(d_update); if (epsil != 0.0f) cudaFree(d_update_prev); @@ -736,6 +748,10 @@ extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, fl cudaFree(InputRef_z); cudaFree(d_InputRef); } - //cudaDeviceReset(); + + + /*adding info into info_vector */ + infovector[0] = (float)(i); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ return 0; } diff --git a/src/Core/regularisers_GPU/dTV_FGP_GPU_core.h b/src/Core/regularisers_GPU/dTV_FGP_GPU_core.h index f9281e8..4a1b16b 100644 --- a/src/Core/regularisers_GPU/dTV_FGP_GPU_core.h +++ b/src/Core/regularisers_GPU/dTV_FGP_GPU_core.h @@ -4,6 +4,6 @@ #include "CCPiDefines.h" #include <memory.h> -extern "C" CCPI_EXPORT int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iter, float epsil, float eta, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); +extern "C" CCPI_EXPORT int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float *infovector, float lambdaPar, int iter, float epsil, float eta, int methodTV, int nonneg, int dimX, int dimY, int dimZ); -#endif +#endif diff --git a/src/Core/regularisers_GPU/shared.h b/src/Core/regularisers_GPU/shared.h index fe98cd6..698dddd 100644 --- a/src/Core/regularisers_GPU/shared.h +++ b/src/Core/regularisers_GPU/shared.h @@ -1,4 +1,13 @@ /*shared macros*/ +template <typename T> +struct square +{ + __host__ __device__ + T operator()(const T& x) const { + return (float)(x*x); + } +}; + /*checks CUDA call, should be used in functions returning <int> value diff --git a/src/Matlab/CMakeLists.txt b/src/Matlab/CMakeLists.txt index b97f845..0897d7a 100755 --- a/src/Matlab/CMakeLists.txt +++ b/src/Matlab/CMakeLists.txt @@ -38,9 +38,9 @@ find_package(Matlab REQUIRED COMPONENTS MAIN_PROGRAM MX_LIBRARY ENG_LIBRARY ) #set (MEX_TARGETS "CPU_TNV;CPU_ROF")
#list(APPEND MEX_TARGETS "CPU_TNV")
#list(APPEND MEX_TARGETS "CPU_ROF")
-
+ file(GLOB CPU_MEX_FILES
- "${CMAKE_SOURCE_DIR}/Matlab/mex_compile/regularisers_CPU/*.c"
+ "${CMAKE_SOURCE_DIR}/src/Matlab/mex_compile/regularisers_CPU/*.c"
#"${CMAKE_SOURCE_DIR}/Matlab/mex_compile/regularisers_GPU/*.c"
)
@@ -85,10 +85,10 @@ foreach(tgt RANGE 0 ${num}) )
target_include_directories(${current_target}
- PUBLIC ${CMAKE_SOURCE_DIR}/Core/regularisers_CPU
- ${CMAKE_SOURCE_DIR}/Core/regularisers_GPU
- ${CMAKE_SOURCE_DIR}/Core/inpainters_CPU
- ${CMAKE_SOURCE_DIR}/Core/
+ PUBLIC ${CMAKE_SOURCE_DIR}/src/Core/regularisers_CPU
+ ${CMAKE_SOURCE_DIR}/src/Core/regularisers_GPU
+ ${CMAKE_SOURCE_DIR}/src/Core/inpainters_CPU
+ ${CMAKE_SOURCE_DIR}/src/Core/
${MATLAB_INCLUDE_DIR})
set_property(TARGET ${current_target} PROPERTY C_STANDARD 99)
list(APPEND CPU_MEX_TARGETS ${current_target})
@@ -101,7 +101,7 @@ if (BUILD_CUDA) find_package(CUDA)
if (CUDA_FOUND)
file(GLOB GPU_MEX_FILES
- "${CMAKE_SOURCE_DIR}/Matlab/mex_compile/regularisers_GPU/*.cpp"
+ "${CMAKE_SOURCE_DIR}/src/Matlab/mex_compile/regularisers_GPU/*.cpp"
)
list(LENGTH GPU_MEX_FILES num)
@@ -131,14 +131,16 @@ message("number of GPU files " ${num}) )
target_include_directories(${current_target}
- PUBLIC ${CMAKE_SOURCE_DIR}/Core/regularisers_CPU
- ${CMAKE_SOURCE_DIR}/Core/regularisers_GPU
- ${CMAKE_SOURCE_DIR}/Core/inpainters_CPU
- ${CMAKE_SOURCE_DIR}/Core/
+ PUBLIC ${CMAKE_SOURCE_DIR}/src/Core/regularisers_CPU
+ ${CMAKE_SOURCE_DIR}/src/Core/regularisers_GPU
+ ${CMAKE_SOURCE_DIR}/src/Core/inpainters_CPU
+ ${CMAKE_SOURCE_DIR}/src/Core/
${MATLAB_INCLUDE_DIR})
list(APPEND GPU_MEX_TARGETS ${current_target})
- INSTALL(TARGETS ${current_target} DESTINATION "${MATLAB_DEST}")
+ INSTALL(TARGETS ${current_target} DESTINATION "${MATLAB_DEST}") + +
endforeach()
add_custom_target(MatlabWrapperGPU DEPENDS ${GPU_MEX_TARGETS})
diff --git a/src/Matlab/mex_compile/compileCPU_mex_Linux.m b/src/Matlab/mex_compile/compileCPU_mex_Linux.m index 72a828e..19fb1a5 100644 --- a/src/Matlab/mex_compile/compileCPU_mex_Linux.m +++ b/src/Matlab/mex_compile/compileCPU_mex_Linux.m @@ -1,17 +1,18 @@ -% execute this mex file on Linux in Matlab once +% execute this mex file on Linux in Matlab once. See also Cmake driven +% build if this fails fsep = '/'; -pathcopyFrom = sprintf(['..' fsep '..' fsep '..' fsep 'Core' fsep 'regularisers_CPU'], 1i); -pathcopyFrom1 = sprintf(['..' fsep '..' fsep '..' fsep 'Core' fsep 'CCPiDefines.h'], 1i); -pathcopyFrom2 = sprintf(['..' fsep '..' fsep '..' fsep 'Core' fsep 'inpainters_CPU'], 1i); +pathcopyFrom = sprintf(['..' fsep '..' fsep 'Core' fsep 'regularisers_CPU'], 1i); +pathcopyFrom1 = sprintf(['..' fsep '..' fsep 'Core' fsep 'CCPiDefines.h'], 1i); +pathcopyFrom2 = sprintf(['..' fsep '..' fsep 'Core' fsep 'inpainters_CPU'], 1i); copyfile(pathcopyFrom, 'regularisers_CPU'); copyfile(pathcopyFrom1, 'regularisers_CPU'); copyfile(pathcopyFrom2, 'regularisers_CPU'); cd regularisers_CPU - +%% Pathmove = sprintf(['..' fsep 'installed' fsep], 1i); fprintf('%s \n', '<<<<<<<<<<<Compiling CPU regularisers>>>>>>>>>>>>>'); @@ -27,15 +28,15 @@ movefile('FGP_TV.mex*',Pathmove); fprintf('%s \n', 'Compiling SB-TV...'); mex SB_TV.c SB_TV_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp" movefile('SB_TV.mex*',Pathmove); - + fprintf('%s \n', 'Compiling dFGP-TV...'); mex FGP_dTV.c FGP_dTV_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp" movefile('FGP_dTV.mex*',Pathmove); - + fprintf('%s \n', 'Compiling TNV...'); mex TNV.c TNV_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp" movefile('TNV.mex*',Pathmove); - + fprintf('%s \n', 'Compiling NonLinear Diffusion...'); mex NonlDiff.c Diffusion_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp" movefile('NonlDiff.mex*',Pathmove); @@ -47,11 +48,11 @@ movefile('Diffusion_4thO.mex*',Pathmove); fprintf('%s \n', 'Compiling TGV...'); mex TGV.c TGV_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp" movefile('TGV.mex*',Pathmove); - + fprintf('%s \n', 'Compiling ROF-LLT...'); mex LLT_ROF.c LLT_ROF_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp" movefile('LLT_ROF.mex*',Pathmove); - + fprintf('%s \n', 'Compiling NonLocal-TV...'); mex PatchSelect.c PatchSelect_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp" mex Nonlocal_TV.c Nonlocal_TV_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp" @@ -66,16 +67,15 @@ movefile('TV_energy.mex*',Pathmove); fprintf('%s \n', 'Compiling Nonlinear/Linear diffusion inpainting...'); mex NonlDiff_Inp.c Diffusion_Inpaint_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp" movefile('NonlDiff_Inp.mex*',Pathmove); - + fprintf('%s \n', 'Compiling Nonlocal marching method for inpainting...'); mex NonlocalMarching_Inpaint.c NonlocalMarching_Inpaint_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp" movefile('NonlocalMarching_Inpaint.mex*',Pathmove); - + delete SB_TV_core* ROF_TV_core* FGP_TV_core* FGP_dTV_core* TNV_core* utils* Diffusion_core* Diffus4th_order_core* TGV_core* LLT_ROF_core* CCPiDefines.h delete PatchSelect_core* Nonlocal_TV_core* delete Diffusion_Inpaint_core* NonlocalMarching_Inpaint_core* fprintf('%s \n', '<<<<<<< Regularisers successfully compiled! >>>>>>>'); -pathA2 = sprintf(['..' fsep '..' fsep], 1i); -cd(pathA2); -cd demos +pathA2 = sprintf(['..' fsep '..' fsep '..' fsep '..' fsep 'demos'], 1i); +cd(pathA2);
\ No newline at end of file diff --git a/src/Matlab/mex_compile/compileCPU_mex_WINDOWS.m b/src/Matlab/mex_compile/compileCPU_mex_WINDOWS.m index 6f7541c..3a9e2af 100644 --- a/src/Matlab/mex_compile/compileCPU_mex_WINDOWS.m +++ b/src/Matlab/mex_compile/compileCPU_mex_WINDOWS.m @@ -10,9 +10,9 @@ fsep = '/'; -pathcopyFrom = sprintf(['..' fsep '..' fsep '..' fsep 'Core' fsep 'regularisers_CPU'], 1i); -pathcopyFrom1 = sprintf(['..' fsep '..' fsep '..' fsep 'Core' fsep 'CCPiDefines.h'], 1i); -pathcopyFrom2 = sprintf(['..' fsep '..' fsep '..' fsep 'Core' fsep 'inpainters_CPU'], 1i); +pathcopyFrom = sprintf(['..' fsep '..' fsep 'Core' fsep 'regularisers_CPU'], 1i); +pathcopyFrom1 = sprintf(['..' fsep '..' fsep 'Core' fsep 'CCPiDefines.h'], 1i); +pathcopyFrom2 = sprintf(['..' fsep '..' fsep 'Core' fsep 'inpainters_CPU'], 1i); copyfile(pathcopyFrom, 'regularisers_CPU'); copyfile(pathcopyFrom1, 'regularisers_CPU'); @@ -79,7 +79,6 @@ fprintf('%s \n', 'Compiling Nonlocal marching method for inpaiting...'); mex NonlocalMarching_Inpaint.c NonlocalMarching_Inpaint_core.c utils.c COMPFLAGS="\$COMPFLAGS -fopenmp -Wall -std=c99" movefile('NonlocalMarching_Inpaint.mex*',Pathmove); - %% %%% The second approach to compile using TDM-GCC which follows this %%% discussion: @@ -129,7 +128,5 @@ fprintf('%s \n', 'Regularisers successfully compiled!'); %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -%pathA2 = sprintf(['..' fsep '..' fsep], 1i); -%cd(pathA2); -%cd demos +pathA2 = sprintf(['..' fsep '..' fsep '..' fsep '..' fsep 'demos'], 1i); +cd(pathA2); diff --git a/src/Matlab/mex_compile/compileGPU_mex.m b/src/Matlab/mex_compile/compileGPU_mex.m index dd1475c..7e15233 100644 --- a/src/Matlab/mex_compile/compileGPU_mex.m +++ b/src/Matlab/mex_compile/compileGPU_mex.m @@ -4,17 +4,18 @@ % In order to compile CUDA modules one needs to have nvcc-compiler % installed (see CUDA SDK), check it under MATLAB with !nvcc --version -% In the code bellow we provide a full explicit path to nvcc compiler +% In the code bellow we provide a full explicit path to nvcc compiler % ! paths to matlab and CUDA sdk can be different, modify accordingly ! % Tested on Ubuntu 18.04/MATLAB 2016b/cuda10.0/gcc7.3 - +%>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<< % Installation HAS NOT been tested on Windows, please you Cmake build or % modify the code bellow accordingly fsep = '/'; -pathcopyFrom = sprintf(['..' fsep '..' fsep '..' fsep 'Core' fsep 'regularisers_GPU'], 1i); -pathcopyFrom1 = sprintf(['..' fsep '..' fsep '..' fsep 'Core' fsep 'CCPiDefines.h'], 1i); +pathcopyFrom = sprintf(['..' fsep '..' fsep 'Core' fsep 'regularisers_GPU'], 1i); +pathcopyFrom1 = sprintf(['..' fsep '..' fsep 'Core' fsep 'CCPiDefines.h'], 1i); + copyfile(pathcopyFrom, 'regularisers_GPU'); copyfile(pathcopyFrom1, 'regularisers_GPU'); @@ -67,8 +68,8 @@ movefile('LLT_ROF_GPU.mex*',Pathmove); delete TV_ROF_GPU_core* TV_FGP_GPU_core* TV_SB_GPU_core* dTV_FGP_GPU_core* NonlDiff_GPU_core* Diffus_4thO_GPU_core* TGV_GPU_core* LLT_ROF_GPU_core* CCPiDefines.h +delete PatchSelect_core* Nonlocal_TV_core* shared.h fprintf('%s \n', 'All successfully compiled!'); -pathA2 = sprintf(['..' fsep '..' fsep], 1i); +pathA2 = sprintf(['..' fsep '..' fsep '..' fsep '..' fsep 'demos'], 1i); cd(pathA2); -cd demos
\ No newline at end of file diff --git a/src/Matlab/mex_compile/installed/MEXed_files_location.txt b/src/Matlab/mex_compile/installed/MEXed_files_location.txt deleted file mode 100644 index e69de29..0000000 --- a/src/Matlab/mex_compile/installed/MEXed_files_location.txt +++ /dev/null diff --git a/src/Matlab/mex_compile/regularisers_CPU/Diffusion_4thO.c b/src/Matlab/mex_compile/regularisers_CPU/Diffusion_4thO.c index 66ea9be..887a76d 100644 --- a/src/Matlab/mex_compile/regularisers_CPU/Diffusion_4thO.c +++ b/src/Matlab/mex_compile/regularisers_CPU/Diffusion_4thO.c @@ -29,9 +29,11 @@ * 3. Edge-preserving parameter (sigma) [REQUIRED] * 4. Number of iterations, for explicit scheme >= 150 is recommended [OPTIONAL, default 300] * 5. tau - time-marching step for the explicit scheme [OPTIONAL, default 0.015] + * 6. eplsilon - tolerance constant [OPTIONAL parameter] * * Output: * [1] Regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Hajiaboli, M.R., 2011. An anisotropic fourth-order diffusion filter for image noise removal. International Journal of Computer Vision, 92(2), pp.177-191. @@ -45,7 +47,8 @@ void mexFunction( int number_of_dims, iter_numb; mwSize dimX, dimY, dimZ; const mwSize *dim_array; - float *Input, *Output=NULL, lambda, tau, sigma; + float *Input, *Output=NULL, lambda, tau, sigma, epsil; + float *infovec=NULL; dim_array = mxGetDimensions(prhs[0]); number_of_dims = mxGetNumberOfDimensions(prhs[0]); @@ -54,13 +57,15 @@ void mexFunction( Input = (float *) mxGetData(prhs[0]); lambda = (float) mxGetScalar(prhs[1]); /* regularization parameter */ sigma = (float) mxGetScalar(prhs[2]); /* Edge-preserving parameter */ - iter_numb = 300; /* iterations number */ - tau = 0.01; /* marching step parameter */ + iter_numb = 500; /* iterations number */ + tau = 0.0025; /* marching step parameter */ + epsil = 1.0e-06; /*tolerance parameter*/ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs < 3) || (nrhs > 5)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter, Edge-preserving parameter, iterations number, time-marching constant"); - if ((nrhs == 4) || (nrhs == 5)) iter_numb = (int) mxGetScalar(prhs[3]); /* iterations number */ - if (nrhs == 5) tau = (float) mxGetScalar(prhs[4]); /* marching step parameter */ + if ((nrhs < 3) || (nrhs > 6)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter, Edge-preserving parameter, iterations number, time-marching constant, tolerance"); + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6)) iter_numb = (int) mxGetScalar(prhs[3]); /* iterations number */ + if ((nrhs == 5) || (nrhs == 6)) tau = (float) mxGetScalar(prhs[4]); /* marching step parameter */ + if (nrhs == 6) epsil = (float) mxGetScalar(prhs[5]); /* tolerance parameter */ /*Handling Matlab output data*/ dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; @@ -73,5 +78,9 @@ void mexFunction( } if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - Diffus4th_CPU_main(Input, Output, lambda, sigma, iter_numb, tau, dimX, dimY, dimZ); -}
\ No newline at end of file + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); + + Diffus4th_CPU_main(Input, Output, infovec, lambda, sigma, iter_numb, tau, epsil, dimX, dimY, dimZ); +} diff --git a/src/Matlab/mex_compile/regularisers_CPU/FGP_TV.c b/src/Matlab/mex_compile/regularisers_CPU/FGP_TV.c index 642362f..251ac52 100644 --- a/src/Matlab/mex_compile/regularisers_CPU/FGP_TV.c +++ b/src/Matlab/mex_compile/regularisers_CPU/FGP_TV.c @@ -29,10 +29,10 @@ * 4. eplsilon: tolerance constant * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) * 6. nonneg: 'nonnegativity (0 is OFF by default) - * 7. print information: 0 (off) or 1 (on) * * Output: * [1] Filtered/regularized image + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the Matlab's code and paper by * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" @@ -44,54 +44,55 @@ void mexFunction( int nrhs, const mxArray *prhs[]) { - int number_of_dims, iter, methTV, printswitch, nonneg; + int number_of_dims, iter, methTV, nonneg; mwSize dimX, dimY, dimZ; const mwSize *dim_array; - float *Input, *Output=NULL, lambda, epsil; + float *Input, *infovec=NULL, *Output=NULL, lambda, epsil; number_of_dims = mxGetNumberOfDimensions(prhs[0]); dim_array = mxGetDimensions(prhs[0]); /*Handling Matlab input data*/ - if ((nrhs < 2) || (nrhs > 7)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D/3D), Regularization parameter, Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1'), nonnegativity switch, print switch"); + if ((nrhs < 2) || (nrhs > 6)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D/3D), Regularization parameter, Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1'), nonnegativity switch"); Input = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */ lambda = (float) mxGetScalar(prhs[1]); /* regularization parameter */ - iter = 300; /* default iterations number */ - epsil = 0.0001; /* default tolerance constant */ + iter = 400; /* default iterations number */ + epsil = 1.0e-06; /* default tolerance constant */ methTV = 0; /* default isotropic TV penalty */ nonneg = 0; /* default nonnegativity switch, off - 0 */ - printswitch = 0; /*default print is switched, off - 0 */ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ - if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ - if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7)) { + if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ + if ((nrhs == 5) || (nrhs == 6)) { char *penalty_type; penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */ if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',"); if (strcmp(penalty_type, "l1") == 0) methTV = 1; /* enable 'l1' penalty */ mxFree(penalty_type); } - if ((nrhs == 6) || (nrhs == 7)) { + if (nrhs == 6) { nonneg = (int) mxGetScalar(prhs[5]); if ((nonneg != 0) && (nonneg != 1)) mexErrMsgTxt("Nonnegativity constraint can be enabled by choosing 1 or off - 0"); } - if (nrhs == 7) { - printswitch = (int) mxGetScalar(prhs[6]); - if ((printswitch != 0) && (printswitch != 1)) mexErrMsgTxt("Print can be enabled by choosing 1 or off - 0"); - } - /*Handling Matlab output data*/ - dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; + /*Handling Matlab output data*/ + dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; + if (number_of_dims == 2) { dimZ = 1; /*2D case*/ - Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); + Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); } - if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + if (number_of_dims == 3) { + Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + } + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); /* running the function */ - TV_FGP_CPU_main(Input, Output, lambda, iter, epsil, methTV, nonneg, printswitch, dimX, dimY, dimZ); -}
\ No newline at end of file + TV_FGP_CPU_main(Input, Output, infovec, lambda, iter, epsil, methTV, nonneg, dimX, dimY, dimZ); +} diff --git a/src/Matlab/mex_compile/regularisers_CPU/FGP_dTV.c b/src/Matlab/mex_compile/regularisers_CPU/FGP_dTV.c index 1a0c070..f1b70a8 100644 --- a/src/Matlab/mex_compile/regularisers_CPU/FGP_dTV.c +++ b/src/Matlab/mex_compile/regularisers_CPU/FGP_dTV.c @@ -33,10 +33,10 @@ * 6. eta: smoothing constant to calculate gradient of the reference [OPTIONAL] * * 7. TV-type: methodTV - 'iso' (0) or 'l1' (1) [OPTIONAL] * 8. nonneg: 'nonnegativity (0 is OFF by default) [OPTIONAL] - * 9. print information: 0 (off) or 1 (on) [OPTIONAL] * * Output: - * [1] Filtered/regularized image/volume + * [1] Filtered/regularized image + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the Matlab's codes and papers by * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" @@ -49,28 +49,27 @@ void mexFunction( int nrhs, const mxArray *prhs[]) { - int number_of_dims, iter, methTV, printswitch, nonneg; + int number_of_dims, iter, methTV, nonneg; mwSize dimX, dimY, dimZ; const mwSize *dim_array; const mwSize *dim_array2; - float *Input, *InputRef, *Output=NULL, lambda, epsil, eta; - + float *Input, *InputRef, *Output=NULL, *infovec=NULL, lambda, epsil, eta; + number_of_dims = mxGetNumberOfDimensions(prhs[0]); dim_array = mxGetDimensions(prhs[0]); dim_array2 = mxGetDimensions(prhs[1]); /*Handling Matlab input data*/ - if ((nrhs < 3) || (nrhs > 9)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Reference(2D/3D), Regularization parameter, iterations number, tolerance, smoothing constant, penalty type ('iso' or 'l1'), nonnegativity switch, print switch"); + if ((nrhs < 3) || (nrhs > 8)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Reference(2D/3D), Regularization parameter, iterations number, tolerance, smoothing constant, penalty type ('iso' or 'l1'), nonnegativity switch"); Input = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */ InputRef = (float *) mxGetData(prhs[1]); /* reference image (2D/3D) */ lambda = (float) mxGetScalar(prhs[2]); /* regularization parameter */ iter = 300; /* default iterations number */ - epsil = 0.0001; /* default tolerance constant */ + epsil = 1.0e-06; /* default tolerance constant */ eta = 0.01; /* default smoothing constant */ methTV = 0; /* default isotropic TV penalty */ nonneg = 0; /* default nonnegativity switch, off - 0 */ - printswitch = 0; /*default print is switched, off - 0 */ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } @@ -80,35 +79,32 @@ void mexFunction( dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; if (number_of_dims == 2) { if ((dimX != dim_array2[0]) || (dimY != dim_array2[1])) mexErrMsgTxt("The input images have different dimensionalities");} if (number_of_dims == 3) { if ((dimX != dim_array2[0]) || (dimY != dim_array2[1]) || (dimZ != dim_array2[2])) mexErrMsgTxt("The input volumes have different dimensionalities");} - - - if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8) || (nrhs == 9)) iter = (int) mxGetScalar(prhs[3]); /* iterations number */ - if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8) || (nrhs == 9)) epsil = (float) mxGetScalar(prhs[4]); /* tolerance constant */ - if ((nrhs == 6) || (nrhs == 7) || (nrhs == 8) || (nrhs == 9)) { - eta = (float) mxGetScalar(prhs[5]); /* smoothing constant for the gradient of InputRef */ - } - if ((nrhs == 7) || (nrhs == 8) || (nrhs == 9)) { + + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) iter = (int) mxGetScalar(prhs[3]); /* iterations number */ + if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) epsil = (float) mxGetScalar(prhs[4]); /* tolerance constant */ + if ((nrhs == 6) || (nrhs == 7) || (nrhs == 8)) eta = (float) mxGetScalar(prhs[5]); /* smoothing constant for the gradient of InputRef */ + if ((nrhs == 7) || (nrhs == 8)) { char *penalty_type; penalty_type = mxArrayToString(prhs[6]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */ if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',"); if (strcmp(penalty_type, "l1") == 0) methTV = 1; /* enable 'l1' penalty */ mxFree(penalty_type); } - if ((nrhs == 8) || (nrhs == 9)) { + if ((nrhs == 8)) { nonneg = (int) mxGetScalar(prhs[7]); if ((nonneg != 0) && (nonneg != 1)) mexErrMsgTxt("Nonnegativity constraint can be enabled by choosing 1 or off - 0"); } - if (nrhs == 9) { - printswitch = (int) mxGetScalar(prhs[8]); - if ((printswitch != 0) && (printswitch != 1)) mexErrMsgTxt("Print can be enabled by choosing 1 or off - 0"); - } - + if (number_of_dims == 2) { dimZ = 1; /*2D case*/ Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); } if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); + /* running the function */ - dTV_FGP_CPU_main(Input, InputRef, Output, lambda, iter, epsil, eta, methTV, nonneg, printswitch, dimX, dimY, dimZ); -}
\ No newline at end of file + dTV_FGP_CPU_main(Input, InputRef, Output, infovec, lambda, iter, epsil, eta, methTV, nonneg, dimX, dimY, dimZ); +} diff --git a/src/Matlab/mex_compile/regularisers_CPU/LLT_ROF.c b/src/Matlab/mex_compile/regularisers_CPU/LLT_ROF.c index ab45446..5c6de9d 100644 --- a/src/Matlab/mex_compile/regularisers_CPU/LLT_ROF.c +++ b/src/Matlab/mex_compile/regularisers_CPU/LLT_ROF.c @@ -32,9 +32,11 @@ * 3. lambdaLLT - LLT-related regularisation parameter * 4. tau - time-marching step * 5. iter - iterations number (for both models) +* 6. eplsilon - tolerance constant [OPTIONAL parameter] * * Output: -* Filtered/regularised image +* [1] Regularized image/volume +* [2] Information vector which contains [iteration no., reached tolerance] * * References: * [1] Lysaker, M., Lundervold, A. and Tai, X.C., 2003. Noise removal using fourth-order partial differential equation with applications to medical magnetic resonance images in space and time. IEEE Transactions on image processing, 12(12), pp.1579-1590. @@ -49,12 +51,13 @@ void mexFunction( int number_of_dims, iterationsNumb; mwSize dimX, dimY, dimZ; const mwSize *dim_array; - float *Input, *Output=NULL, lambdaROF, lambdaLLT, tau; + float *Input, *Output=NULL, lambdaROF, lambdaLLT, tau, epsil; + float *infovec=NULL; dim_array = mxGetDimensions(prhs[0]); number_of_dims = mxGetNumberOfDimensions(prhs[0]); - if ((nrhs < 3) || (nrhs > 5)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter (ROF), Regularisation parameter (LTT), iterations number, time-marching parameter"); + if ((nrhs < 3) || (nrhs > 6)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter (ROF), Regularisation parameter (LTT), iterations number, time-marching parameter, tolerance"); /*Handling Matlab input data*/ Input = (float *) mxGetData(prhs[0]); @@ -62,10 +65,12 @@ void mexFunction( lambdaLLT = (float) mxGetScalar(prhs[2]); /* ROF regularization parameter */ iterationsNumb = 250; tau = 0.0025; + epsil = 1.0e-06; /*tolerance parameter*/ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs == 4) || (nrhs == 5)) iterationsNumb = (int) mxGetScalar(prhs[3]); /* iterations number */ - if (nrhs == 5) tau = (float) mxGetScalar(prhs[4]); /* marching step parameter */ + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6)) iterationsNumb = (int) mxGetScalar(prhs[3]); /* iterations number */ + if ((nrhs == 5) || (nrhs == 6)) tau = (float) mxGetScalar(prhs[4]); /* marching step parameter */ + if (nrhs == 6) epsil = (float) mxGetScalar(prhs[5]); /* epsilon */ /*Handling Matlab output data*/ dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; @@ -77,6 +82,10 @@ void mexFunction( Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); } if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); - LLT_ROF_CPU_main(Input, Output, lambdaROF, lambdaLLT, iterationsNumb, tau, dimX, dimY, dimZ); -}
\ No newline at end of file + LLT_ROF_CPU_main(Input, Output, infovec, lambdaROF, lambdaLLT, iterationsNumb, tau, epsil, dimX, dimY, dimZ); +} diff --git a/src/Matlab/mex_compile/regularisers_CPU/NonlDiff.c b/src/Matlab/mex_compile/regularisers_CPU/NonlDiff.c index ec35b8b..2ca17d2 100644 --- a/src/Matlab/mex_compile/regularisers_CPU/NonlDiff.c +++ b/src/Matlab/mex_compile/regularisers_CPU/NonlDiff.c @@ -30,9 +30,11 @@ * 4. Number of iterations, for explicit scheme >= 150 is recommended [OPTIONAL parameter] * 5. tau - time-marching step for explicit scheme [OPTIONAL parameter] * 6. Penalty type: 1 - Huber, 2 - Perona-Malik, 3 - Tukey Biweight [OPTIONAL parameter] + * 7. eplsilon - tolerance constant [OPTIONAL parameter] * * Output: - * [1] Regularized image/volume + * [1] Regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Perona, P. and Malik, J., 1990. Scale-space and edge detection using anisotropic diffusion. IEEE Transactions on pattern analysis and machine intelligence, 12(7), pp.629-639. @@ -47,7 +49,8 @@ void mexFunction( mwSize dimX, dimY, dimZ; const mwSize *dim_array; - float *Input, *Output=NULL, lambda, tau, sigma; + float *Input, *Output=NULL, lambda, tau, sigma, epsil; + float *infovec=NULL; dim_array = mxGetDimensions(prhs[0]); number_of_dims = mxGetNumberOfDimensions(prhs[0]); @@ -59,12 +62,13 @@ void mexFunction( iter_numb = 300; /* iterations number */ tau = 0.025; /* marching step parameter */ penaltytype = 1; /* Huber penalty by default */ + epsil = 1.0e-06; /*tolerance parameter*/ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs < 3) || (nrhs > 6)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter, Edge-preserving parameter, iterations number, time-marching constant, penalty type - Huber, PM or Tukey"); - if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6)) iter_numb = (int) mxGetScalar(prhs[3]); /* iterations number */ - if ((nrhs == 5) || (nrhs == 6)) tau = (float) mxGetScalar(prhs[4]); /* marching step parameter */ - if (nrhs == 6) { + if ((nrhs < 3) || (nrhs > 7)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter, Edge-preserving parameter, iterations number, time-marching constant, penalty type - Huber, PM or Tukey, tolerance"); + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) iter_numb = (int) mxGetScalar(prhs[3]); /* iterations number */ + if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7)) tau = (float) mxGetScalar(prhs[4]); /* marching step parameter */ + if ((nrhs == 6) || (nrhs == 7)) { char *penalty_type; penalty_type = mxArrayToString(prhs[5]); /* Huber, PM or Tukey 'Huber' is the default */ if ((strcmp(penalty_type, "Huber") != 0) && (strcmp(penalty_type, "PM") != 0) && (strcmp(penalty_type, "Tukey") != 0)) mexErrMsgTxt("Choose penalty: 'Huber', 'PM' or 'Tukey',"); @@ -73,6 +77,7 @@ void mexFunction( if (strcmp(penalty_type, "Tukey") == 0) penaltytype = 3; /* enable Tikey Biweight penalty */ mxFree(penalty_type); } + if ((nrhs == 7)) epsil = (float) mxGetScalar(prhs[6]); /* epsilon */ /*Handling Matlab output data*/ dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; @@ -85,5 +90,9 @@ void mexFunction( } if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - Diffusion_CPU_main(Input, Output, lambda, sigma, iter_numb, tau, penaltytype, dimX, dimY, dimZ); -}
\ No newline at end of file + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); + + Diffusion_CPU_main(Input, Output, infovec, lambda, sigma, iter_numb, tau, penaltytype, epsil, dimX, dimY, dimZ); +} diff --git a/src/Matlab/mex_compile/regularisers_CPU/Nonlocal_TV.c b/src/Matlab/mex_compile/regularisers_CPU/Nonlocal_TV.c index 014c0a0..34b9915 100644 --- a/src/Matlab/mex_compile/regularisers_CPU/Nonlocal_TV.c +++ b/src/Matlab/mex_compile/regularisers_CPU/Nonlocal_TV.c @@ -51,8 +51,8 @@ void mexFunction( long number_of_dims, dimX, dimY, dimZ; int IterNumb, NumNeighb = 0; unsigned short *H_i, *H_j, *H_k; - const int *dim_array; - const int *dim_array2; + const mwSize *dim_array; + const mwSize *dim_array2; float *A_orig, *Output=NULL, *Weights, lambda; dim_array = mxGetDimensions(prhs[0]); diff --git a/src/Matlab/mex_compile/regularisers_CPU/PatchSelect.c b/src/Matlab/mex_compile/regularisers_CPU/PatchSelect.c index f942539..1acab29 100644 --- a/src/Matlab/mex_compile/regularisers_CPU/PatchSelect.c +++ b/src/Matlab/mex_compile/regularisers_CPU/PatchSelect.c @@ -52,11 +52,11 @@ void mexFunction( { int number_of_dims, SearchWindow, SimilarWin, NumNeighb; mwSize dimX, dimY, dimZ; - unsigned short *H_i=NULL, *H_j=NULL, *H_k=NULL; - const int *dim_array; + const mwSize *dim_array; + unsigned short *H_i=NULL, *H_j=NULL, *H_k=NULL; float *A, *Weights = NULL, h; - int dim_array2[3]; /* for 2D data */ - int dim_array3[4]; /* for 3D data */ + mwSize dim_array2[3]; /* for 2D data */ + mwSize dim_array3[4]; /* for 3D data */ dim_array = mxGetDimensions(prhs[0]); number_of_dims = mxGetNumberOfDimensions(prhs[0]); diff --git a/src/Matlab/mex_compile/regularisers_CPU/ROF_TV.c b/src/Matlab/mex_compile/regularisers_CPU/ROF_TV.c index 55ef2b1..ffe7b91 100644 --- a/src/Matlab/mex_compile/regularisers_CPU/ROF_TV.c +++ b/src/Matlab/mex_compile/regularisers_CPU/ROF_TV.c @@ -29,9 +29,11 @@ * 2. lambda - regularization parameter [REQUIRED] * 3. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED] * 4. tau - marching step for explicit scheme, ~1 is recommended [REQUIRED] + * 5. eplsilon: tolerance constant [REQUIRED] * * Output: * [1] Regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" @@ -47,7 +49,8 @@ void mexFunction( int number_of_dims, iter_numb; mwSize dimX, dimY, dimZ; const mwSize *dim_array_i; - float *Input, *Output=NULL, lambda, tau; + float *Input, *Output=NULL, lambda, tau, epsil; + float *infovec=NULL; dim_array_i = mxGetDimensions(prhs[0]); number_of_dims = mxGetNumberOfDimensions(prhs[0]); @@ -57,9 +60,10 @@ void mexFunction( lambda = (float) mxGetScalar(prhs[1]); /* regularization parameter */ iter_numb = (int) mxGetScalar(prhs[2]); /* iterations number */ tau = (float) mxGetScalar(prhs[3]); /* marching step parameter */ + epsil = (float) mxGetScalar(prhs[4]); /* tolerance */ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if(nrhs != 4) mexErrMsgTxt("Four inputs reqired: Image(2D,3D), regularization parameter, iterations number, marching step constant"); + if(nrhs != 5) mexErrMsgTxt("Four inputs reqired: Image(2D,3D), regularization parameter, iterations number, marching step constant, tolerance"); /*Handling Matlab output data*/ dimX = dim_array_i[0]; dimY = dim_array_i[1]; dimZ = dim_array_i[2]; @@ -72,6 +76,10 @@ void mexFunction( if (number_of_dims == 3) { Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array_i, mxSINGLE_CLASS, mxREAL)); } + + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); - TV_ROF_CPU_main(Input, Output, lambda, iter_numb, tau, dimX, dimY, dimZ); -}
\ No newline at end of file + TV_ROF_CPU_main(Input, Output, infovec, lambda, iter_numb, tau, epsil, dimX, dimY, dimZ); +} diff --git a/src/Matlab/mex_compile/regularisers_CPU/SB_TV.c b/src/Matlab/mex_compile/regularisers_CPU/SB_TV.c index 8636322..d1bdb3a 100644 --- a/src/Matlab/mex_compile/regularisers_CPU/SB_TV.c +++ b/src/Matlab/mex_compile/regularisers_CPU/SB_TV.c @@ -28,10 +28,10 @@ * 3. Number of iterations [OPTIONAL parameter] * 4. eplsilon - tolerance constant [OPTIONAL parameter] * 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter] -* 6. print information: 0 (off) or 1 (on) [OPTIONAL parameter] * -* Output: -* 1. Filtered/regularized image + * Output: + * [1] Regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the Matlab's code and paper by * [1]. Goldstein, T. and Osher, S., 2009. The split Bregman method for L1-regularized problems. SIAM journal on imaging sciences, 2(2), pp.323-343. @@ -42,40 +42,36 @@ void mexFunction( int nrhs, const mxArray *prhs[]) { - int number_of_dims, iter, methTV, printswitch; + int number_of_dims, iter, methTV; mwSize dimX, dimY, dimZ; const mwSize *dim_array; float *Input, *Output=NULL, lambda, epsil; + float *infovec=NULL; number_of_dims = mxGetNumberOfDimensions(prhs[0]); dim_array = mxGetDimensions(prhs[0]); /*Handling Matlab input data*/ - if ((nrhs < 2) || (nrhs > 6)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D/3D), Regularization parameter, Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1'), print switch"); + if ((nrhs < 2) || (nrhs > 5)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter,iterations number, tolerance, penalty type ('iso' or 'l1')"); Input = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */ lambda = (float) mxGetScalar(prhs[1]); /* regularization parameter */ - iter = 100; /* default iterations number */ - epsil = 0.0001; /* default tolerance constant */ + iter = 200; /* default iterations number */ + epsil = 1.0e-06; /* default tolerance constant */ methTV = 0; /* default isotropic TV penalty */ - printswitch = 0; /*default print is switched, off - 0 */ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ - if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ - if ((nrhs == 5) || (nrhs == 6)) { + if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ + if ((nrhs == 4) || (nrhs == 5)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ + if ((nrhs == 5)) { char *penalty_type; penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */ if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',"); if (strcmp(penalty_type, "l1") == 0) methTV = 1; /* enable 'l1' penalty */ mxFree(penalty_type); } - if (nrhs == 6) { - printswitch = (int) mxGetScalar(prhs[5]); - if ((printswitch != 0) && (printswitch != 1)) mexErrMsgTxt("Print can be enabled by choosing 1 or off - 0"); - } /*Handling Matlab output data*/ dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; @@ -86,6 +82,10 @@ void mexFunction( } if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); + /* running the function */ - SB_TV_CPU_main(Input, Output, lambda, iter, epsil, methTV, printswitch, dimX, dimY, dimZ); + SB_TV_CPU_main(Input, Output, infovec, lambda, iter, epsil, methTV, dimX, dimY, dimZ); } diff --git a/src/Matlab/mex_compile/regularisers_CPU/TGV.c b/src/Matlab/mex_compile/regularisers_CPU/TGV.c index aa4eed4..9e32ae4 100644 --- a/src/Matlab/mex_compile/regularisers_CPU/TGV.c +++ b/src/Matlab/mex_compile/regularisers_CPU/TGV.c @@ -20,7 +20,7 @@ limitations under the License. #include "mex.h" #include "TGV_core.h" -/* C-OMP implementation of Primal-Dual denoising method for +/* C-OMP implementation of Primal-Dual denoising method for * Total Generilized Variation (TGV)-L2 model [1] (2D/3D) * * Input Parameters: @@ -30,9 +30,12 @@ limitations under the License. * 4. parameter to control the second-order term (alpha0) * 5. Number of Chambolle-Pock (Primal-Dual) iterations * 6. Lipshitz constant (default is 12) + * 7. eplsilon - tolerance constant [OPTIONAL parameter] * * Output: - * Filtered/regulariaed image + * [1] Regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + * * * References: * [1] K. Bredies "Total Generalized Variation" @@ -41,43 +44,51 @@ limitations under the License. void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) - + { int number_of_dims, iter; mwSize dimX, dimY, dimZ; const mwSize *dim_array; - - float *Input, *Output=NULL, lambda, alpha0, alpha1, L2; - + + float *Input, *Output=NULL, lambda, alpha0, alpha1, L2, epsil; + float *infovec=NULL; + number_of_dims = mxGetNumberOfDimensions(prhs[0]); dim_array = mxGetDimensions(prhs[0]); - + /*Handling Matlab input data*/ - if ((nrhs < 2) || (nrhs > 6)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D), Regularisation parameter, alpha0, alpha1, iterations number, Lipshitz Constant"); - + if ((nrhs < 2) || (nrhs > 7)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D), Regularisation parameter, alpha0, alpha1, iterations number, Lipshitz Constant, tolerance"); + Input = (float *) mxGetData(prhs[0]); /*noisy image/volume */ lambda = (float) mxGetScalar(prhs[1]); /* regularisation parameter */ - alpha1 = 1.0f; /* parameter to control the first-order term */ - alpha0 = 0.5f; /* parameter to control the second-order term */ - iter = 300; /* Iterations number */ + alpha1 = 1.0f; /* parameter to control the first-order term */ + alpha0 = 2.0f; /* parameter to control the second-order term */ + iter = 500; /* Iterations number */ L2 = 12.0f; /* Lipshitz constant */ - - if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6)) alpha1 = (float) mxGetScalar(prhs[2]); /* parameter to control the first-order term */ - if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6)) alpha0 = (float) mxGetScalar(prhs[3]); /* parameter to control the second-order term */ - if ((nrhs == 5) || (nrhs == 6)) iter = (int) mxGetScalar(prhs[4]); /* Iterations number */ - if (nrhs == 6) L2 = (float) mxGetScalar(prhs[5]); /* Lipshitz constant */ - + epsil = 1.0e-06; /*tolerance parameter*/ + + if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } + if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) alpha1 = (float) mxGetScalar(prhs[2]); /* parameter to control the first-order term */ + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) alpha0 = (float) mxGetScalar(prhs[3]); /* parameter to control the second-order term */ + if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7)) iter = (int) mxGetScalar(prhs[4]); /* Iterations number */ + if ((nrhs == 6) || (nrhs == 7)) L2 = (float) mxGetScalar(prhs[5]); /* Lipshitz constant */ + if (nrhs == 7) epsil = (float) mxGetScalar(prhs[6]); /* epsilon */ + /*Handling Matlab output data*/ dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; - + if (number_of_dims == 2) { dimZ = 1; /*2D case*/ - Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); + Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); } if (number_of_dims == 3) { Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - } + } + + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); + /* running the function */ - TGV_main(Input, Output, lambda, alpha1, alpha0, iter, L2, dimX, dimY, dimZ); + TGV_main(Input, Output, infovec, lambda, alpha1, alpha0, iter, L2, epsil, dimX, dimY, dimZ); } diff --git a/src/Matlab/mex_compile/regularisers_GPU/Diffusion_4thO_GPU.cpp b/src/Matlab/mex_compile/regularisers_GPU/Diffusion_4thO_GPU.cpp index 0cc042b..42874ef 100644 --- a/src/Matlab/mex_compile/regularisers_GPU/Diffusion_4thO_GPU.cpp +++ b/src/Matlab/mex_compile/regularisers_GPU/Diffusion_4thO_GPU.cpp @@ -23,15 +23,17 @@ /* CUDA implementation of fourth-order diffusion scheme [1] for piecewise-smooth recovery (2D/3D case) * The minimisation is performed using explicit scheme. * - * Input Parameters: + * Input Parameters: * 1. Noisy image/volume [REQUIRED] * 2. lambda - regularization parameter [REQUIRED] * 3. Edge-preserving parameter (sigma) [REQUIRED] * 4. Number of iterations, for explicit scheme >= 150 is recommended [OPTIONAL, default 300] * 5. tau - time-marching step for the explicit scheme [OPTIONAL, default 0.015] + * 6. eplsilon - tolerance constant [OPTIONAL parameter] * * Output: * [1] Regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Hajiaboli, M.R., 2011. An anisotropic fourth-order diffusion filter for image noise removal. International Journal of Computer Vision, 92(2), pp.177-191. @@ -45,7 +47,8 @@ void mexFunction( int number_of_dims, iter_numb; mwSize dimX, dimY, dimZ; const mwSize *dim_array; - float *Input, *Output=NULL, lambda, tau, sigma; + float *Input, *Output=NULL, lambda, tau, sigma, epsil; + float *infovec=NULL; dim_array = mxGetDimensions(prhs[0]); number_of_dims = mxGetNumberOfDimensions(prhs[0]); @@ -54,13 +57,15 @@ void mexFunction( Input = (float *) mxGetData(prhs[0]); lambda = (float) mxGetScalar(prhs[1]); /* regularization parameter */ sigma = (float) mxGetScalar(prhs[2]); /* Edge-preserving parameter */ - iter_numb = 300; /* iterations number */ - tau = 0.01; /* marching step parameter */ + iter_numb = 500; /* iterations number */ + tau = 0.0025; /* marching step parameter */ + epsil = 1.0e-06; /*tolerance parameter*/ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs < 3) || (nrhs > 5)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter, Edge-preserving parameter, iterations number, time-marching constant"); - if ((nrhs == 4) || (nrhs == 5)) iter_numb = (int) mxGetScalar(prhs[3]); /* iterations number */ - if (nrhs == 5) tau = (float) mxGetScalar(prhs[4]); /* marching step parameter */ + if ((nrhs < 3) || (nrhs > 6)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter, Edge-preserving parameter, iterations number, time-marching constant, tolerance"); + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6)) iter_numb = (int) mxGetScalar(prhs[3]); /* iterations number */ + if ((nrhs == 5) || (nrhs == 6)) tau = (float) mxGetScalar(prhs[4]); /* marching step parameter */ + if (nrhs == 6) epsil = (float) mxGetScalar(prhs[5]); /* tolerance parameter */ /*Handling Matlab output data*/ dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; @@ -73,5 +78,9 @@ void mexFunction( } if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - Diffus4th_GPU_main(Input, Output, lambda, sigma, iter_numb, tau, dimX, dimY, dimZ); -}
\ No newline at end of file + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); + + Diffus4th_GPU_main(Input, Output, infovec, lambda, sigma, iter_numb, tau, epsil, dimX, dimY, dimZ); +} diff --git a/src/Matlab/mex_compile/regularisers_GPU/FGP_TV_GPU.cpp b/src/Matlab/mex_compile/regularisers_GPU/FGP_TV_GPU.cpp index c174e75..d08e50d 100644 --- a/src/Matlab/mex_compile/regularisers_GPU/FGP_TV_GPU.cpp +++ b/src/Matlab/mex_compile/regularisers_GPU/FGP_TV_GPU.cpp @@ -22,76 +22,77 @@ /* GPU (CUDA) implementation of FGP-TV [1] denoising/regularization model (2D/3D case) * - * Input Parameters: + * Input Parameters: * 1. Noisy image/volume * 2. lambdaPar - regularization parameter * 3. Number of iterations * 4. eplsilon: tolerance constant * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) * 6. nonneg: 'nonnegativity (0 is OFF by default) - * 7. print information: 0 (off) or 1 (on) * * Output: * [1] Filtered/regularized image + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the Matlab's code and paper by * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" */ + void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { - int number_of_dims, iter, methTV, printswitch, nonneg; + int number_of_dims, iter, methTV, nonneg; mwSize dimX, dimY, dimZ; const mwSize *dim_array; - - float *Input, *Output=NULL, lambda, epsil; + float *Input, *infovec=NULL, *Output=NULL, lambda, epsil; number_of_dims = mxGetNumberOfDimensions(prhs[0]); dim_array = mxGetDimensions(prhs[0]); /*Handling Matlab input data*/ - if ((nrhs < 2) || (nrhs > 7)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D/3D), Regularization parameter. The full list of parameters: Image(2D/3D), Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1'), nonnegativity switch, print switch"); + if ((nrhs < 2) || (nrhs > 6)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D/3D), Regularization parameter, Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1'), nonnegativity switch"); Input = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */ lambda = (float) mxGetScalar(prhs[1]); /* regularization parameter */ - iter = 300; /* default iterations number */ - epsil = 0.0001; /* default tolerance constant */ + iter = 400; /* default iterations number */ + epsil = 1.0e-06; /* default tolerance constant */ methTV = 0; /* default isotropic TV penalty */ nonneg = 0; /* default nonnegativity switch, off - 0 */ - printswitch = 0; /*default print is switched, off - 0 */ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ - if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ - if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7)) { + if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ + if ((nrhs == 5) || (nrhs == 6)) { char *penalty_type; penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */ if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',"); if (strcmp(penalty_type, "l1") == 0) methTV = 1; /* enable 'l1' penalty */ mxFree(penalty_type); } - if ((nrhs == 6) || (nrhs == 7)) { + if (nrhs == 6) { nonneg = (int) mxGetScalar(prhs[5]); if ((nonneg != 0) && (nonneg != 1)) mexErrMsgTxt("Nonnegativity constraint can be enabled by choosing 1 or off - 0"); } - if (nrhs == 7) { - printswitch = (int) mxGetScalar(prhs[6]); - if ((printswitch != 0) && (printswitch != 1)) mexErrMsgTxt("Print can be enabled by choosing 1 or off - 0"); - } - /*Handling Matlab output data*/ - dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; + /*Handling Matlab output data*/ + dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; + if (number_of_dims == 2) { dimZ = 1; /*2D case*/ - Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); + Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); } - if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + if (number_of_dims == 3) { + Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + } + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); /* running the function */ - TV_FGP_GPU_main(Input, Output, lambda, iter, epsil, methTV, nonneg, printswitch, dimX, dimY, dimZ); -}
\ No newline at end of file + TV_FGP_GPU_main(Input, Output, infovec, lambda, iter, epsil, methTV, nonneg, dimX, dimY, dimZ); +} diff --git a/src/Matlab/mex_compile/regularisers_GPU/FGP_dTV_GPU.cpp b/src/Matlab/mex_compile/regularisers_GPU/FGP_dTV_GPU.cpp index 3f5a4b3..2db4556 100644 --- a/src/Matlab/mex_compile/regularisers_GPU/FGP_dTV_GPU.cpp +++ b/src/Matlab/mex_compile/regularisers_GPU/FGP_dTV_GPU.cpp @@ -33,43 +33,43 @@ * 6. eta: smoothing constant to calculate gradient of the reference [OPTIONAL] * * 7. TV-type: methodTV - 'iso' (0) or 'l1' (1) [OPTIONAL] * 8. nonneg: 'nonnegativity (0 is OFF by default) [OPTIONAL] - * 9. print information: 0 (off) or 1 (on) [OPTIONAL] * * Output: - * [1] Filtered/regularized image/volume + * [1] Filtered/regularized image + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the Matlab's codes and papers by * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" * [2] M. J. Ehrhardt and M. M. Betcke, Multi-Contrast MRI Reconstruction with Structure-Guided Total Variation, SIAM Journal on Imaging Sciences 9(3), pp. 1084–1106 */ + + void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { - int number_of_dims, iter, methTV, printswitch, nonneg; + int number_of_dims, iter, methTV, nonneg; mwSize dimX, dimY, dimZ; const mwSize *dim_array; - const mwSize *dim_array2; - - float *Input, *InputRef, *Output=NULL, lambda, epsil, eta; - + const mwSize *dim_array2; + float *Input, *InputRef, *Output=NULL, *infovec=NULL, lambda, epsil, eta; + number_of_dims = mxGetNumberOfDimensions(prhs[0]); dim_array = mxGetDimensions(prhs[0]); dim_array2 = mxGetDimensions(prhs[1]); /*Handling Matlab input data*/ - if ((nrhs < 3) || (nrhs > 9)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Reference(2D/3D), Regularization parameter, iterations number, tolerance, smoothing constant, penalty type ('iso' or 'l1'), nonnegativity switch, print switch"); + if ((nrhs < 3) || (nrhs > 8)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Reference(2D/3D), Regularization parameter, iterations number, tolerance, smoothing constant, penalty type ('iso' or 'l1'), nonnegativity switch"); Input = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */ InputRef = (float *) mxGetData(prhs[1]); /* reference image (2D/3D) */ lambda = (float) mxGetScalar(prhs[2]); /* regularization parameter */ iter = 300; /* default iterations number */ - epsil = 0.0001; /* default tolerance constant */ + epsil = 1.0e-06; /* default tolerance constant */ eta = 0.01; /* default smoothing constant */ methTV = 0; /* default isotropic TV penalty */ nonneg = 0; /* default nonnegativity switch, off - 0 */ - printswitch = 0; /*default print is switched, off - 0 */ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } @@ -79,35 +79,32 @@ void mexFunction( dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; if (number_of_dims == 2) { if ((dimX != dim_array2[0]) || (dimY != dim_array2[1])) mexErrMsgTxt("The input images have different dimensionalities");} if (number_of_dims == 3) { if ((dimX != dim_array2[0]) || (dimY != dim_array2[1]) || (dimZ != dim_array2[2])) mexErrMsgTxt("The input volumes have different dimensionalities");} - - - if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8) || (nrhs == 9)) iter = (int) mxGetScalar(prhs[3]); /* iterations number */ - if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8) || (nrhs == 9)) epsil = (float) mxGetScalar(prhs[4]); /* tolerance constant */ - if ((nrhs == 6) || (nrhs == 7) || (nrhs == 8) || (nrhs == 9)) { - eta = (float) mxGetScalar(prhs[5]); /* smoothing constant for the gradient of InputRef */ - } - if ((nrhs == 7) || (nrhs == 8) || (nrhs == 9)) { + + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) iter = (int) mxGetScalar(prhs[3]); /* iterations number */ + if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) epsil = (float) mxGetScalar(prhs[4]); /* tolerance constant */ + if ((nrhs == 6) || (nrhs == 7) || (nrhs == 8)) eta = (float) mxGetScalar(prhs[5]); /* smoothing constant for the gradient of InputRef */ + if ((nrhs == 7) || (nrhs == 8)) { char *penalty_type; penalty_type = mxArrayToString(prhs[6]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */ if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',"); if (strcmp(penalty_type, "l1") == 0) methTV = 1; /* enable 'l1' penalty */ mxFree(penalty_type); } - if ((nrhs == 8) || (nrhs == 9)) { + if ((nrhs == 8)) { nonneg = (int) mxGetScalar(prhs[7]); if ((nonneg != 0) && (nonneg != 1)) mexErrMsgTxt("Nonnegativity constraint can be enabled by choosing 1 or off - 0"); } - if (nrhs == 9) { - printswitch = (int) mxGetScalar(prhs[8]); - if ((printswitch != 0) && (printswitch != 1)) mexErrMsgTxt("Print can be enabled by choosing 1 or off - 0"); - } - + if (number_of_dims == 2) { dimZ = 1; /*2D case*/ Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); } if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); + /* running the function */ - dTV_FGP_GPU_main(Input, InputRef, Output, lambda, iter, epsil, eta, methTV, nonneg, printswitch, dimX, dimY, dimZ); -}
\ No newline at end of file + dTV_FGP_GPU_main(Input, InputRef, Output, infovec, lambda, iter, epsil, eta, methTV, nonneg, dimX, dimY, dimZ); +} diff --git a/src/Matlab/mex_compile/regularisers_GPU/LLT_ROF_GPU.cpp b/src/Matlab/mex_compile/regularisers_GPU/LLT_ROF_GPU.cpp index e8da4ce..ff5d577 100644 --- a/src/Matlab/mex_compile/regularisers_GPU/LLT_ROF_GPU.cpp +++ b/src/Matlab/mex_compile/regularisers_GPU/LLT_ROF_GPU.cpp @@ -32,9 +32,11 @@ * 3. lambdaLLT - LLT-related regularisation parameter * 4. tau - time-marching step * 5. iter - iterations number (for both models) +* 6. eplsilon - tolerance constant [OPTIONAL parameter] * * Output: -* Filtered/regularised image +* [1] Regularized image/volume +* [2] Information vector which contains [iteration no., reached tolerance] * * References: * [1] Lysaker, M., Lundervold, A. and Tai, X.C., 2003. Noise removal using fourth-order partial differential equation with applications to medical magnetic resonance images in space and time. IEEE Transactions on image processing, 12(12), pp.1579-1590. @@ -48,14 +50,14 @@ void mexFunction( { int number_of_dims, iterationsNumb; mwSize dimX, dimY, dimZ; - const mwSize *dim_array; - - float *Input, *Output=NULL, lambdaROF, lambdaLLT, tau; + const mwSize *dim_array; + float *Input, *Output=NULL, lambdaROF, lambdaLLT, tau, epsil; + float *infovec=NULL; dim_array = mxGetDimensions(prhs[0]); number_of_dims = mxGetNumberOfDimensions(prhs[0]); - if ((nrhs < 3) || (nrhs > 5)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter (ROF), Regularisation parameter (LTT), iterations number, time-marching parameter"); + if ((nrhs < 3) || (nrhs > 6)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter (ROF), Regularisation parameter (LTT), iterations number, time-marching parameter, tolerance"); /*Handling Matlab input data*/ Input = (float *) mxGetData(prhs[0]); @@ -63,10 +65,12 @@ void mexFunction( lambdaLLT = (float) mxGetScalar(prhs[2]); /* ROF regularization parameter */ iterationsNumb = 250; tau = 0.0025; + epsil = 1.0e-06; /*tolerance parameter*/ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs == 4) || (nrhs == 5)) iterationsNumb = (int) mxGetScalar(prhs[3]); /* iterations number */ - if (nrhs == 5) tau = (float) mxGetScalar(prhs[4]); /* marching step parameter */ + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6)) iterationsNumb = (int) mxGetScalar(prhs[3]); /* iterations number */ + if ((nrhs == 5) || (nrhs == 6)) tau = (float) mxGetScalar(prhs[4]); /* marching step parameter */ + if (nrhs == 6) epsil = (float) mxGetScalar(prhs[5]); /* epsilon */ /*Handling Matlab output data*/ dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; @@ -78,6 +82,10 @@ void mexFunction( Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); } if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); - LLT_ROF_GPU_main(Input, Output, lambdaROF, lambdaLLT, iterationsNumb, tau, dimX, dimY, dimZ); -}
\ No newline at end of file + LLT_ROF_GPU_main(Input, Output, infovec, lambdaROF, lambdaLLT, iterationsNumb, tau, epsil, dimX, dimY, dimZ); +} diff --git a/src/Matlab/mex_compile/regularisers_GPU/NonlDiff_GPU.cpp b/src/Matlab/mex_compile/regularisers_GPU/NonlDiff_GPU.cpp index 1cd0cdc..43627c8 100644 --- a/src/Matlab/mex_compile/regularisers_GPU/NonlDiff_GPU.cpp +++ b/src/Matlab/mex_compile/regularisers_GPU/NonlDiff_GPU.cpp @@ -26,19 +26,20 @@ * The minimisation is performed using explicit scheme. * * Input Parameters: - * 1. Noisy image/volume + * 1. Noisy image/volume * 2. lambda - regularization parameter * 3. Edge-preserving parameter (sigma), when sigma equals to zero nonlinear diffusion -> linear diffusion - * 4. Number of iterations, for explicit scheme >= 150 is recommended - * 5. tau - time-marching step for explicit scheme - * 6. Penalty type: 1 - Huber, 2 - Perona-Malik, 3 - Tukey Biweight + * 4. Number of iterations, for explicit scheme >= 150 is recommended [OPTIONAL parameter] + * 5. tau - time-marching step for explicit scheme [OPTIONAL parameter] + * 6. Penalty type: 1 - Huber, 2 - Perona-Malik, 3 - Tukey Biweight [OPTIONAL parameter] + * 7. eplsilon - tolerance constant [OPTIONAL parameter] * * Output: * [1] Regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Perona, P. and Malik, J., 1990. Scale-space and edge detection using anisotropic diffusion. IEEE Transactions on pattern analysis and machine intelligence, 12(7), pp.629-639. - * [2] Black, M.J., Sapiro, G., Marimont, D.H. and Heeger, D., 1998. Robust anisotropic diffusion. IEEE Transactions on image processing, 7(3), pp.421-432. */ void mexFunction( @@ -48,9 +49,10 @@ void mexFunction( { int number_of_dims, iter_numb, penaltytype; mwSize dimX, dimY, dimZ; - const mwSize *dim_array; + const mwSize *dim_array; - float *Input, *Output=NULL, lambda, tau, sigma; + float *Input, *Output=NULL, lambda, tau, sigma, epsil; + float *infovec=NULL; dim_array = mxGetDimensions(prhs[0]); number_of_dims = mxGetNumberOfDimensions(prhs[0]); @@ -62,12 +64,13 @@ void mexFunction( iter_numb = 300; /* iterations number */ tau = 0.025; /* marching step parameter */ penaltytype = 1; /* Huber penalty by default */ + epsil = 1.0e-06; /*tolerance parameter*/ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs < 3) || (nrhs > 6)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter, Edge-preserving parameter, iterations number, time-marching constant, penalty type - Huber, PM or Tukey"); - if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6)) iter_numb = (int) mxGetScalar(prhs[3]); /* iterations number */ - if ((nrhs == 5) || (nrhs == 6)) tau = (float) mxGetScalar(prhs[4]); /* marching step parameter */ - if (nrhs == 6) { + if ((nrhs < 3) || (nrhs > 7)) mexErrMsgTxt("At least 3 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter, Edge-preserving parameter, iterations number, time-marching constant, penalty type - Huber, PM or Tukey, tolerance"); + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) iter_numb = (int) mxGetScalar(prhs[3]); /* iterations number */ + if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7)) tau = (float) mxGetScalar(prhs[4]); /* marching step parameter */ + if ((nrhs == 6) || (nrhs == 7)) { char *penalty_type; penalty_type = mxArrayToString(prhs[5]); /* Huber, PM or Tukey 'Huber' is the default */ if ((strcmp(penalty_type, "Huber") != 0) && (strcmp(penalty_type, "PM") != 0) && (strcmp(penalty_type, "Tukey") != 0)) mexErrMsgTxt("Choose penalty: 'Huber', 'PM' or 'Tukey',"); @@ -76,6 +79,7 @@ void mexFunction( if (strcmp(penalty_type, "Tukey") == 0) penaltytype = 3; /* enable Tikey Biweight penalty */ mxFree(penalty_type); } + if ((nrhs == 7)) epsil = (float) mxGetScalar(prhs[6]); /* epsilon */ /*Handling Matlab output data*/ dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; @@ -88,5 +92,9 @@ void mexFunction( } if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - NonlDiff_GPU_main(Input, Output, lambda, sigma, iter_numb, tau, penaltytype, dimX, dimY, dimZ); -}
\ No newline at end of file + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); + + NonlDiff_GPU_main(Input, Output, infovec, lambda, sigma, iter_numb, tau, penaltytype, epsil, dimX, dimY, dimZ); +} diff --git a/src/Matlab/mex_compile/regularisers_GPU/ROF_TV_GPU.cpp b/src/Matlab/mex_compile/regularisers_GPU/ROF_TV_GPU.cpp index bd01d55..d9b7e83 100644 --- a/src/Matlab/mex_compile/regularisers_GPU/ROF_TV_GPU.cpp +++ b/src/Matlab/mex_compile/regularisers_GPU/ROF_TV_GPU.cpp @@ -28,9 +28,11 @@ * 2. lambda - regularization parameter [REQUIRED] * 3. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED] * 4. tau - marching step for explicit scheme, ~1 is recommended [REQUIRED] + * 5. eplsilon: tolerance constant [REQUIRED] * * Output: * [1] Regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * * This function is based on the paper by * [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" @@ -42,13 +44,13 @@ void mexFunction( int nrhs, const mxArray *prhs[]) { - int number_of_dims, iter_numb; + int number_of_dims, iter_numb; mwSize dimX, dimY, dimZ; - const mwSize *dim_array; + const mwSize *dim_array_i; + float *Input, *Output=NULL, lambda, tau, epsil; + float *infovec=NULL; - float *Input, *Output=NULL, lambda, tau; - - dim_array = mxGetDimensions(prhs[0]); + dim_array_i = mxGetDimensions(prhs[0]); number_of_dims = mxGetNumberOfDimensions(prhs[0]); /*Handling Matlab input data*/ @@ -56,19 +58,26 @@ void mexFunction( lambda = (float) mxGetScalar(prhs[1]); /* regularization parameter */ iter_numb = (int) mxGetScalar(prhs[2]); /* iterations number */ tau = (float) mxGetScalar(prhs[3]); /* marching step parameter */ + epsil = (float) mxGetScalar(prhs[4]); /* tolerance */ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if(nrhs != 4) mexErrMsgTxt("Four inputs reqired: Image(2D,3D), regularization parameter, iterations number, marching step constant"); + if(nrhs != 5) mexErrMsgTxt("Four inputs reqired: Image(2D,3D), regularization parameter, iterations number, marching step constant, tolerance"); /*Handling Matlab output data*/ - dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; + dimX = dim_array_i[0]; dimY = dim_array_i[1]; dimZ = dim_array_i[2]; /* output arrays*/ if (number_of_dims == 2) { dimZ = 1; /*2D case*/ /* output image/volume */ - Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); + Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array_i, mxSINGLE_CLASS, mxREAL)); } - if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + if (number_of_dims == 3) { + Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array_i, mxSINGLE_CLASS, mxREAL)); + } + + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); - TV_ROF_GPU_main(Input, Output, lambda, iter_numb, tau, dimX, dimY, dimZ); -}
\ No newline at end of file + TV_ROF_GPU_main(Input, Output, infovec, lambda, iter_numb, tau, epsil, dimX, dimY, dimZ); +} diff --git a/src/Matlab/mex_compile/regularisers_GPU/SB_TV_GPU.cpp b/src/Matlab/mex_compile/regularisers_GPU/SB_TV_GPU.cpp index 9d1328f..562dc65 100644 --- a/src/Matlab/mex_compile/regularisers_GPU/SB_TV_GPU.cpp +++ b/src/Matlab/mex_compile/regularisers_GPU/SB_TV_GPU.cpp @@ -22,17 +22,7 @@ /* CUDA mex-file for implementation of Split Bregman - TV denoising-regularisation model (2D/3D) [1] * -* Input Parameters: -* 1. Noisy image/volume -* 2. lambda - regularisation parameter -* 3. Number of iterations [OPTIONAL parameter] -* 4. eplsilon - tolerance constant [OPTIONAL parameter] -* 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter] -* 6. print information: 0 (off) or 1 (on) [OPTIONAL parameter] -* -* Output: -* 1. Filtered/regularized image -* + * This function is based on the Matlab's code and paper by * [1]. Goldstein, T. and Osher, S., 2009. The split Bregman method for L1-regularized problems. SIAM journal on imaging sciences, 2(2), pp.323-343. */ @@ -42,40 +32,36 @@ void mexFunction( int nrhs, const mxArray *prhs[]) { - int number_of_dims, iter, methTV, printswitch; + int number_of_dims, iter, methTV; mwSize dimX, dimY, dimZ; const mwSize *dim_array; float *Input, *Output=NULL, lambda, epsil; + float *infovec=NULL; number_of_dims = mxGetNumberOfDimensions(prhs[0]); dim_array = mxGetDimensions(prhs[0]); /*Handling Matlab input data*/ - if ((nrhs < 2) || (nrhs > 6)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D/3D), Regularization parameter, Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1'), print switch"); + if ((nrhs < 2) || (nrhs > 5)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D/3D), Regularisation parameter,iterations number, tolerance, penalty type ('iso' or 'l1')"); Input = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */ lambda = (float) mxGetScalar(prhs[1]); /* regularization parameter */ - iter = 100; /* default iterations number */ - epsil = 0.0001; /* default tolerance constant */ + iter = 200; /* default iterations number */ + epsil = 1.0e-06; /* default tolerance constant */ methTV = 0; /* default isotropic TV penalty */ - printswitch = 0; /*default print is switched, off - 0 */ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ - if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ - if ((nrhs == 5) || (nrhs == 6)) { + if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ + if ((nrhs == 4) || (nrhs == 5)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ + if ((nrhs == 5)) { char *penalty_type; penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */ if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',"); if (strcmp(penalty_type, "l1") == 0) methTV = 1; /* enable 'l1' penalty */ mxFree(penalty_type); } - if (nrhs == 6) { - printswitch = (int) mxGetScalar(prhs[5]); - if ((printswitch != 0) && (printswitch != 1)) mexErrMsgTxt("Print can be enabled by choosing 1 or off - 0"); - } /*Handling Matlab output data*/ dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; @@ -86,6 +72,10 @@ void mexFunction( } if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); + /* running the function */ - TV_SB_GPU_main(Input, Output, lambda, iter, epsil, methTV, printswitch, dimX, dimY, dimZ); + TV_SB_GPU_main(Input, Output, infovec, lambda, iter, epsil, methTV, dimX, dimY, dimZ); } diff --git a/src/Matlab/mex_compile/regularisers_GPU/TGV_GPU.cpp b/src/Matlab/mex_compile/regularisers_GPU/TGV_GPU.cpp index 1173282..eb1f043 100644 --- a/src/Matlab/mex_compile/regularisers_GPU/TGV_GPU.cpp +++ b/src/Matlab/mex_compile/regularisers_GPU/TGV_GPU.cpp @@ -23,6 +23,9 @@ limitations under the License. /* CUDA implementation of Primal-Dual denoising method for * Total Generilized Variation (TGV)-L2 model [1] (2D/3D) * + /* C-OMP implementation of Primal-Dual denoising method for + * Total Generilized Variation (TGV)-L2 model [1] (2D/3D) + * * Input Parameters: * 1. Noisy image/volume (2D/3D) * 2. lambda - regularisation parameter @@ -30,9 +33,12 @@ limitations under the License. * 4. parameter to control the second-order term (alpha0) * 5. Number of Chambolle-Pock (Primal-Dual) iterations * 6. Lipshitz constant (default is 12) + * 7. eplsilon - tolerance constant [OPTIONAL parameter] * * Output: - * Filtered/regularised image + * [1] Regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + * * * References: * [1] K. Bredies "Total Generalized Variation" @@ -46,26 +52,30 @@ void mexFunction( int number_of_dims, iter; mwSize dimX, dimY, dimZ; const mwSize *dim_array; - float *Input, *Output=NULL, lambda, alpha0, alpha1, L2; + + float *Input, *Output=NULL, lambda, alpha0, alpha1, L2, epsil; + float *infovec=NULL; number_of_dims = mxGetNumberOfDimensions(prhs[0]); dim_array = mxGetDimensions(prhs[0]); /*Handling Matlab input data*/ - if ((nrhs < 2) || (nrhs > 6)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D), Regularisation parameter, alpha0, alpha1, iterations number, Lipshitz Constant"); + if ((nrhs < 2) || (nrhs > 7)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D), Regularisation parameter, alpha0, alpha1, iterations number, Lipshitz Constant"); - Input = (float *) mxGetData(prhs[0]); /*noisy image (2D) */ + Input = (float *) mxGetData(prhs[0]); /*noisy image/volume */ lambda = (float) mxGetScalar(prhs[1]); /* regularisation parameter */ alpha1 = 1.0f; /* parameter to control the first-order term */ alpha0 = 2.0f; /* parameter to control the second-order term */ iter = 500; /* Iterations number */ L2 = 12.0f; /* Lipshitz constant */ + epsil = 1.0e-06; /*tolerance parameter*/ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6)) alpha1 = (float) mxGetScalar(prhs[2]); /* parameter to control the first-order term */ - if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6)) alpha0 = (float) mxGetScalar(prhs[3]); /* parameter to control the second-order term */ - if ((nrhs == 5) || (nrhs == 6)) iter = (int) mxGetScalar(prhs[4]); /* Iterations number */ - if (nrhs == 6) L2 = (float) mxGetScalar(prhs[5]); /* Lipshitz constant */ + if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) alpha1 = (float) mxGetScalar(prhs[2]); /* parameter to control the first-order term */ + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) alpha0 = (float) mxGetScalar(prhs[3]); /* parameter to control the second-order term */ + if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7)) iter = (int) mxGetScalar(prhs[4]); /* Iterations number */ + if ((nrhs == 6) || (nrhs == 7)) L2 = (float) mxGetScalar(prhs[5]); /* Lipshitz constant */ + if (nrhs == 7) epsil = (float) mxGetScalar(prhs[6]); /* epsilon */ /*Handling Matlab output data*/ dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; @@ -74,8 +84,14 @@ void mexFunction( dimZ = 1; /*2D case*/ Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); } - if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + if (number_of_dims == 3) { + Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + } + + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); /* running the function */ - TGV_GPU_main(Input, Output, lambda, alpha1, alpha0, iter, L2, dimX, dimY, dimZ); + TGV_GPU_main(Input, Output, infovec, lambda, alpha1, alpha0, iter, L2, epsil, dimX, dimY, dimZ); } diff --git a/src/Python/ccpi/filters/regularisers.py b/src/Python/ccpi/filters/regularisers.py index 588ea32..398e11c 100644 --- a/src/Python/ccpi/filters/regularisers.py +++ b/src/Python/ccpi/filters/regularisers.py @@ -7,21 +7,23 @@ try: from ccpi.filters.gpu_regularisers import TV_ROF_GPU, TV_FGP_GPU, TV_SB_GPU, dTV_FGP_GPU, NDF_GPU, Diff4th_GPU, TGV_GPU, LLT_ROF_GPU, PATCHSEL_GPU gpu_enabled = True except ImportError: - gpu_enabled = False + gpu_enabled = False from ccpi.filters.cpu_regularisers import NDF_INPAINT_CPU, NVM_INPAINT_CPU def ROF_TV(inputData, regularisation_parameter, iterations, - time_marching_parameter,device='cpu'): + time_marching_parameter,tolerance_param,device='cpu'): if device == 'cpu': return TV_ROF_CPU(inputData, regularisation_parameter, - iterations, - time_marching_parameter) + iterations, + time_marching_parameter, + tolerance_param) elif device == 'gpu' and gpu_enabled: return TV_ROF_GPU(inputData, regularisation_parameter, - iterations, - time_marching_parameter) + iterations, + time_marching_parameter, + tolerance_param) else: if not gpu_enabled and device == 'gpu': raise ValueError ('GPU is not available') @@ -29,134 +31,165 @@ def ROF_TV(inputData, regularisation_parameter, iterations, .format(device)) def FGP_TV(inputData, regularisation_parameter,iterations, - tolerance_param, methodTV, nonneg, printM, device='cpu'): + tolerance_param, methodTV, nonneg, device='cpu'): if device == 'cpu': return TV_FGP_CPU(inputData, regularisation_parameter, - iterations, + iterations, tolerance_param, methodTV, - nonneg, - printM) + nonneg) elif device == 'gpu' and gpu_enabled: return TV_FGP_GPU(inputData, regularisation_parameter, - iterations, + iterations, tolerance_param, methodTV, - nonneg, - printM) + nonneg) else: if not gpu_enabled and device == 'gpu': raise ValueError ('GPU is not available') raise ValueError('Unknown device {0}. Expecting gpu or cpu'\ .format(device)) def SB_TV(inputData, regularisation_parameter, iterations, - tolerance_param, methodTV, printM, device='cpu'): + tolerance_param, methodTV, device='cpu'): if device == 'cpu': return TV_SB_CPU(inputData, regularisation_parameter, - iterations, + iterations, tolerance_param, - methodTV, - printM) + methodTV) elif device == 'gpu' and gpu_enabled: return TV_SB_GPU(inputData, regularisation_parameter, - iterations, + iterations, tolerance_param, - methodTV, - printM) + methodTV) else: if not gpu_enabled and device == 'gpu': raise ValueError ('GPU is not available') raise ValueError('Unknown device {0}. Expecting gpu or cpu'\ .format(device)) -def FGP_dTV(inputData, refdata, regularisation_parameter, iterations, - tolerance_param, eta_const, methodTV, nonneg, printM, device='cpu'): +def LLT_ROF(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, + time_marching_parameter, tolerance_param, device='cpu'): if device == 'cpu': - return dTV_FGP_CPU(inputData, - refdata, - regularisation_parameter, - iterations, - tolerance_param, - eta_const, - methodTV, - nonneg, - printM) + return LLT_ROF_CPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param) elif device == 'gpu' and gpu_enabled: - return dTV_FGP_GPU(inputData, - refdata, - regularisation_parameter, - iterations, - tolerance_param, - eta_const, - methodTV, - nonneg, - printM) + return LLT_ROF_GPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param) + else: + if not gpu_enabled and device == 'gpu': + raise ValueError ('GPU is not available') + raise ValueError('Unknown device {0}. Expecting gpu or cpu'\ + .format(device)) +def TGV(inputData, regularisation_parameter, alpha1, alpha0, iterations, + LipshitzConst, tolerance_param, device='cpu'): + if device == 'cpu': + return TGV_CPU(inputData, + regularisation_parameter, + alpha1, + alpha0, + iterations, + LipshitzConst, + tolerance_param) + elif device == 'gpu' and gpu_enabled: + return TGV_GPU(inputData, + regularisation_parameter, + alpha1, + alpha0, + iterations, + LipshitzConst, + tolerance_param) else: if not gpu_enabled and device == 'gpu': raise ValueError ('GPU is not available') raise ValueError('Unknown device {0}. Expecting gpu or cpu'\ .format(device)) -def TNV(inputData, regularisation_parameter, iterations, tolerance_param): - return TNV_CPU(inputData, - regularisation_parameter, - iterations, - tolerance_param) def NDF(inputData, regularisation_parameter, edge_parameter, iterations, - time_marching_parameter, penalty_type, device='cpu'): + time_marching_parameter, penalty_type, tolerance_param, device='cpu'): if device == 'cpu': return NDF_CPU(inputData, regularisation_parameter, edge_parameter, - iterations, + iterations, time_marching_parameter, - penalty_type) + penalty_type, + tolerance_param) elif device == 'gpu' and gpu_enabled: return NDF_GPU(inputData, regularisation_parameter, edge_parameter, - iterations, + iterations, time_marching_parameter, - penalty_type) + penalty_type, + tolerance_param) else: if not gpu_enabled and device == 'gpu': raise ValueError ('GPU is not available') raise ValueError('Unknown device {0}. Expecting gpu or cpu'\ .format(device)) def Diff4th(inputData, regularisation_parameter, edge_parameter, iterations, - time_marching_parameter, device='cpu'): + time_marching_parameter, tolerance_param, device='cpu'): if device == 'cpu': return Diff4th_CPU(inputData, regularisation_parameter, edge_parameter, - iterations, - time_marching_parameter) + iterations, + time_marching_parameter, + tolerance_param) elif device == 'gpu' and gpu_enabled: return Diff4th_GPU(inputData, regularisation_parameter, edge_parameter, - iterations, - time_marching_parameter) + iterations, + time_marching_parameter, + tolerance_param) + else: + if not gpu_enabled and device == 'gpu': + raise ValueError ('GPU is not available') + raise ValueError('Unknown device {0}. Expecting gpu or cpu'\ + .format(device)) +def FGP_dTV(inputData, refdata, regularisation_parameter, iterations, + tolerance_param, eta_const, methodTV, nonneg, device='cpu'): + if device == 'cpu': + return dTV_FGP_CPU(inputData, + refdata, + regularisation_parameter, + iterations, + tolerance_param, + eta_const, + methodTV, + nonneg) + elif device == 'gpu' and gpu_enabled: + return dTV_FGP_GPU(inputData, + refdata, + regularisation_parameter, + iterations, + tolerance_param, + eta_const, + methodTV, + nonneg) else: if not gpu_enabled and device == 'gpu': raise ValueError ('GPU is not available') raise ValueError('Unknown device {0}. Expecting gpu or cpu'\ .format(device)) - +def TNV(inputData, regularisation_parameter, iterations, tolerance_param): + return TNV_CPU(inputData, + regularisation_parameter, + iterations, + tolerance_param) def PatchSelect(inputData, searchwindow, patchwindow, neighbours, edge_parameter, device='cpu'): if device == 'cpu': return PATCHSEL_CPU(inputData, searchwindow, patchwindow, - neighbours, + neighbours, edge_parameter) elif device == 'gpu' and gpu_enabled: return PATCHSEL_GPU(inputData, searchwindow, patchwindow, - neighbours, + neighbours, edge_parameter) else: if not gpu_enabled and device == 'gpu': @@ -168,47 +201,14 @@ def NLTV(inputData, H_i, H_j, H_k, Weights, regularisation_parameter, iterations return NLTV_CPU(inputData, H_i, H_j, - H_k, + H_k, Weights, regularisation_parameter, iterations) - -def TGV(inputData, regularisation_parameter, alpha1, alpha0, iterations, - LipshitzConst, device='cpu'): - if device == 'cpu': - return TGV_CPU(inputData, - regularisation_parameter, - alpha1, - alpha0, - iterations, - LipshitzConst) - elif device == 'gpu' and gpu_enabled: - return TGV_GPU(inputData, - regularisation_parameter, - alpha1, - alpha0, - iterations, - LipshitzConst) - else: - if not gpu_enabled and device == 'gpu': - raise ValueError ('GPU is not available') - raise ValueError('Unknown device {0}. Expecting gpu or cpu'\ - .format(device)) -def LLT_ROF(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, - time_marching_parameter, device='cpu'): - if device == 'cpu': - return LLT_ROF_CPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter) - elif device == 'gpu' and gpu_enabled: - return LLT_ROF_GPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter) - else: - if not gpu_enabled and device == 'gpu': - raise ValueError ('GPU is not available') - raise ValueError('Unknown device {0}. Expecting gpu or cpu'\ - .format(device)) def NDF_INP(inputData, maskData, regularisation_parameter, edge_parameter, iterations, time_marching_parameter, penalty_type): - return NDF_INPAINT_CPU(inputData, maskData, regularisation_parameter, + return NDF_INPAINT_CPU(inputData, maskData, regularisation_parameter, edge_parameter, iterations, time_marching_parameter, penalty_type) - + def NVM_INP(inputData, maskData, SW_increment, iterations): return NVM_INPAINT_CPU(inputData, maskData, SW_increment, iterations) diff --git a/src/Python/setup-regularisers.py.in b/src/Python/setup-regularisers.py.in index 82d9f9f..4c578e3 100644 --- a/src/Python/setup-regularisers.py.in +++ b/src/Python/setup-regularisers.py.in @@ -44,7 +44,7 @@ extra_include_dirs += [os.path.join(".." , "Core"), os.path.join(".." , "Core", "regularisers_GPU" , "LLTROF" ) , os.path.join(".." , "Core", "regularisers_GPU" , "NDF" ) , os.path.join(".." , "Core", "regularisers_GPU" , "dTV_FGP" ) , - os.path.join(".." , "Core", "regularisers_GPU" , "DIFF4th" ) , + os.path.join(".." , "Core", "regularisers_GPU" , "Diff4th" ) , os.path.join(".." , "Core", "regularisers_GPU" , "PatchSelect" ) , "."] @@ -68,7 +68,7 @@ setup( ], zip_safe = False, - packages = {'ccpi','ccpi.filters', 'ccpi.supp'}, + packages = {'ccpi', 'ccpi.filters', 'ccpi.supp'}, ) diff --git a/src/Python/src/cpu_regularisers.pyx b/src/Python/src/cpu_regularisers.pyx index 11a0617..add641b 100644 --- a/src/Python/src/cpu_regularisers.pyx +++ b/src/Python/src/cpu_regularisers.pyx @@ -18,15 +18,15 @@ import cython import numpy as np cimport numpy as np -cdef extern float TV_ROF_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ); -cdef extern float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); -cdef extern float SB_TV_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ); -cdef extern float LLT_ROF_CPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int dimX, int dimY, int dimZ); -cdef extern float TGV_main(float *Input, float *Output, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ); -cdef extern float Diffusion_CPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int dimX, int dimY, int dimZ); -cdef extern float Diffus4th_CPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ); +cdef extern float TV_ROF_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ); +cdef extern float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ); +cdef extern float SB_TV_CPU_main(float *Input, float *Output, float *infovector, float mu, int iter, float epsil, int methodTV, int dimX, int dimY, int dimZ); +cdef extern float LLT_ROF_CPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ); +cdef extern float TGV_main(float *Input, float *Output, float *infovector, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, float epsil, int dimX, int dimY, int dimZ); +cdef extern float Diffusion_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, float epsil, int dimX, int dimY, int dimZ); +cdef extern float Diffus4th_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ); +cdef extern float dTV_FGP_CPU_main(float *Input, float *InputRef, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int dimX, int dimY, int dimZ); cdef extern float TNV_CPU_main(float *Input, float *u, float lambdaPar, int maxIter, float tol, int dimX, int dimY, int dimZ); -cdef extern float dTV_FGP_CPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); cdef extern float PatchSelect_CPU_main(float *Input, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int SearchWindow, int SimilarWin, int NumNeighb, float h, int switchM); cdef extern float Nonlocal_TV_CPU_main(float *A_orig, float *Output, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int NumNeighb, float lambdaReg, int IterNumb); @@ -37,341 +37,482 @@ cdef extern float TV_energy3D(float *U, float *U0, float *E_val, float lambdaPar #****************************************************************# #********************** Total-variation ROF *********************# #****************************************************************# -def TV_ROF_CPU(inputData, regularisation_parameter, iterationsNumb, marching_step_parameter): +def TV_ROF_CPU(inputData, regularisation_parameter, iterationsNumb, marching_step_parameter,tolerance_param): if inputData.ndim == 2: - return TV_ROF_2D(inputData, regularisation_parameter, iterationsNumb, marching_step_parameter) + return TV_ROF_2D(inputData, regularisation_parameter, iterationsNumb, marching_step_parameter,tolerance_param) elif inputData.ndim == 3: - return TV_ROF_3D(inputData, regularisation_parameter, iterationsNumb, marching_step_parameter) + return TV_ROF_3D(inputData, regularisation_parameter, iterationsNumb, marching_step_parameter,tolerance_param) -def TV_ROF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +def TV_ROF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameter, - int iterationsNumb, - float marching_step_parameter): + int iterationsNumb, + float marching_step_parameter, + float tolerance_param): cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] - + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - - # Run ROF iterations for 2D data - TV_ROF_CPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, iterationsNumb, marching_step_parameter, dims[1], dims[0], 1) - - return outputData - -def TV_ROF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Run ROF iterations for 2D data + TV_ROF_CPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], regularisation_parameter, iterationsNumb, marching_step_parameter, tolerance_param, dims[1], dims[0], 1) + + return (outputData,infovec) + +def TV_ROF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameter, int iterationsNumb, - float marching_step_parameter): + float marching_step_parameter, + float tolerance_param): cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] - + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - - # Run ROF iterations for 3D data - TV_ROF_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, iterationsNumb, marching_step_parameter, dims[2], dims[1], dims[0]) + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') - return outputData + # Run ROF iterations for 3D data + TV_ROF_CPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], regularisation_parameter, iterationsNumb, marching_step_parameter, tolerance_param, dims[2], dims[1], dims[0]) + + return (outputData,infovec) #****************************************************************# #********************** Total-variation FGP *********************# #****************************************************************# #******** Total-variation Fast-Gradient-Projection (FGP)*********# -def TV_FGP_CPU(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, nonneg, printM): +def TV_FGP_CPU(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, nonneg): if inputData.ndim == 2: - return TV_FGP_2D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, nonneg, printM) + return TV_FGP_2D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, nonneg) elif inputData.ndim == 3: - return TV_FGP_3D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, nonneg, printM) + return TV_FGP_3D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, nonneg) -def TV_FGP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +def TV_FGP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameter, - int iterationsNumb, + int iterationsNumb, float tolerance_param, int methodTV, - int nonneg, - int printM): - + int nonneg): + cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] - + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - + + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + #/* Run FGP-TV iterations for 2D data */ - TV_FGP_CPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, - iterationsNumb, + TV_FGP_CPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], regularisation_parameter, + iterationsNumb, tolerance_param, methodTV, nonneg, - printM, dims[1],dims[0],1) - - return outputData - -def TV_FGP_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + + return (outputData,infovec) + +def TV_FGP_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameter, - int iterationsNumb, + int iterationsNumb, float tolerance_param, int methodTV, - int nonneg, - int printM): + int nonneg): + cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] - + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0], dims[1], dims[2]], dtype='float32') - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + #/* Run FGP-TV iterations for 3D data */ - TV_FGP_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, - iterationsNumb, + TV_FGP_CPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], regularisation_parameter, + iterationsNumb, tolerance_param, methodTV, nonneg, - printM, dims[2], dims[1], dims[0]) - return outputData + return (outputData,infovec) #***************************************************************# #********************** Total-variation SB *********************# #***************************************************************# #*************** Total-variation Split Bregman (SB)*************# -def TV_SB_CPU(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, printM): +def TV_SB_CPU(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV): if inputData.ndim == 2: - return TV_SB_2D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, printM) + return TV_SB_2D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV) elif inputData.ndim == 3: - return TV_SB_3D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, printM) + return TV_SB_3D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV) -def TV_SB_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +def TV_SB_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameter, - int iterationsNumb, + int iterationsNumb, float tolerance_param, - int methodTV, - int printM): - + int methodTV): + cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] - + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + #/* Run SB-TV iterations for 2D data */ - SB_TV_CPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, - iterationsNumb, + SB_TV_CPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], + regularisation_parameter, + iterationsNumb, tolerance_param, methodTV, - printM, - dims[1],dims[0],1) - - return outputData - -def TV_SB_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + dims[1],dims[0], 1) + + return (outputData,infovec) + +def TV_SB_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameter, - int iterationsNumb, + int iterationsNumb, float tolerance_param, - int methodTV, - int printM): + int methodTV): cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] - + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0], dims[1], dims[2]], dtype='float32') - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + #/* Run SB-TV iterations for 3D data */ - SB_TV_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, - iterationsNumb, + SB_TV_CPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], + regularisation_parameter, + iterationsNumb, tolerance_param, methodTV, - printM, dims[2], dims[1], dims[0]) - return outputData + return (outputData,infovec) +#***************************************************************# +#******************* ROF - LLT regularisation ******************# +#***************************************************************# +def LLT_ROF_CPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param): + if inputData.ndim == 2: + return LLT_ROF_2D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param) + elif inputData.ndim == 3: + return LLT_ROF_3D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param) + +def LLT_ROF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, + float regularisation_parameterROF, + float regularisation_parameterLLT, + int iterations, + float time_marching_parameter, + float tolerance_param): + + cdef long dims[2] + dims[0] = inputData.shape[0] + dims[1] = inputData.shape[1] + + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ + np.zeros([dims[0],dims[1]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + + #/* Run ROF-LLT iterations for 2D data */ + LLT_ROF_CPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, + tolerance_param, + dims[1],dims[0],1) + return (outputData,infovec) + +def LLT_ROF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + float regularisation_parameterROF, + float regularisation_parameterLLT, + int iterations, + float time_marching_parameter, + float tolerance_param): + + cdef long dims[3] + dims[0] = inputData.shape[0] + dims[1] = inputData.shape[1] + dims[2] = inputData.shape[2] + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ + np.zeros([dims[0], dims[1], dims[2]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + + #/* Run ROF-LLT iterations for 3D data */ + LLT_ROF_CPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], regularisation_parameterROF, regularisation_parameterLLT, iterations, + time_marching_parameter, + tolerance_param, + dims[2], dims[1], dims[0]) + return (outputData,infovec) #***************************************************************# #***************** Total Generalised Variation *****************# #***************************************************************# -def TGV_CPU(inputData, regularisation_parameter, alpha1, alpha0, iterations, LipshitzConst): +def TGV_CPU(inputData, regularisation_parameter, alpha1, alpha0, iterations, LipshitzConst, tolerance_param): if inputData.ndim == 2: - return TGV_2D(inputData, regularisation_parameter, alpha1, alpha0, - iterations, LipshitzConst) + return TGV_2D(inputData, regularisation_parameter, alpha1, alpha0, + iterations, LipshitzConst, tolerance_param) elif inputData.ndim == 3: - return TGV_3D(inputData, regularisation_parameter, alpha1, alpha0, - iterations, LipshitzConst) + return TGV_3D(inputData, regularisation_parameter, alpha1, alpha0, + iterations, LipshitzConst, tolerance_param) -def TGV_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +def TGV_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameter, float alpha1, float alpha0, - int iterationsNumb, - float LipshitzConst): - + int iterationsNumb, + float LipshitzConst, + float tolerance_param): + cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] - + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + #/* Run TGV iterations for 2D data */ - TGV_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, + TGV_main(&inputData[0,0], &outputData[0,0], &infovec[0], regularisation_parameter, alpha1, alpha0, - iterationsNumb, + iterationsNumb, LipshitzConst, + tolerance_param, dims[1],dims[0],1) - return outputData -def TGV_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + return (outputData,infovec) +def TGV_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameter, float alpha1, float alpha0, - int iterationsNumb, - float LipshitzConst): - + int iterationsNumb, + float LipshitzConst, + float tolerance_param): + cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] - + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0], dims[1], dims[2]], dtype='float32') - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + #/* Run TGV iterations for 3D data */ - TGV_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, + TGV_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], regularisation_parameter, alpha1, alpha0, - iterationsNumb, + iterationsNumb, LipshitzConst, + tolerance_param, dims[2], dims[1], dims[0]) - return outputData + return (outputData,infovec) -#***************************************************************# -#******************* ROF - LLT regularisation ******************# -#***************************************************************# -def LLT_ROF_CPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter): +#****************************************************************# +#***************Nonlinear (Isotropic) Diffusion******************# +#****************************************************************# +def NDF_CPU(inputData, regularisation_parameter, edge_parameter, iterationsNumb,time_marching_parameter, penalty_type,tolerance_param): if inputData.ndim == 2: - return LLT_ROF_2D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter) + return NDF_2D(inputData, regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, tolerance_param) elif inputData.ndim == 3: - return LLT_ROF_3D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter) + return NDF_3D(inputData, regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, tolerance_param) -def LLT_ROF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, - float regularisation_parameterROF, - float regularisation_parameterLLT, - int iterations, - float time_marching_parameter): - +def NDF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, + float regularisation_parameter, + float edge_parameter, + int iterationsNumb, + float time_marching_parameter, + int penalty_type, + float tolerance_param): cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] - + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - - #/* Run ROF-LLT iterations for 2D data */ - LLT_ROF_CPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, dims[1],dims[0],1) - return outputData - -def LLT_ROF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, - float regularisation_parameterROF, - float regularisation_parameterLLT, - int iterations, - float time_marching_parameter): - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + + # Run Nonlinear Diffusion iterations for 2D data + Diffusion_CPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], + regularisation_parameter, edge_parameter, iterationsNumb, + time_marching_parameter, penalty_type, + tolerance_param, + dims[1], dims[0], 1) + return (outputData,infovec) + +def NDF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + float regularisation_parameter, + float edge_parameter, + int iterationsNumb, + float time_marching_parameter, + int penalty_type, + float tolerance_param): cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] - + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ - np.zeros([dims[0], dims[1], dims[2]], dtype='float32') - - #/* Run ROF-LLT iterations for 3D data */ - LLT_ROF_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, dims[2], dims[1], dims[0]) - return outputData + np.zeros([dims[0],dims[1],dims[2]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + + # Run Nonlinear Diffusion iterations for 3D data + Diffusion_CPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], + regularisation_parameter, edge_parameter, iterationsNumb, + time_marching_parameter, penalty_type, + tolerance_param, + dims[2], dims[1], dims[0]) + return (outputData,infovec) +#****************************************************************# +#*************Anisotropic Fourth-Order diffusion*****************# +#****************************************************************# +def Diff4th_CPU(inputData, regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter,tolerance_param): + if inputData.ndim == 2: + return Diff4th_2D(inputData, regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter,tolerance_param) + elif inputData.ndim == 3: + return Diff4th_3D(inputData, regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter,tolerance_param) +def Diff4th_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, + float regularisation_parameter, + float edge_parameter, + int iterationsNumb, + float time_marching_parameter, + float tolerance_param): + cdef long dims[2] + dims[0] = inputData.shape[0] + dims[1] = inputData.shape[1] + + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ + np.zeros([dims[0],dims[1]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + + # Run Anisotropic Fourth-Order diffusion for 2D data + Diffus4th_CPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], + regularisation_parameter, + edge_parameter, iterationsNumb, + time_marching_parameter, + tolerance_param, + dims[1], dims[0], 1) + return (outputData,infovec) + +def Diff4th_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + float regularisation_parameter, + float edge_parameter, + int iterationsNumb, + float time_marching_parameter, + float tolerance_param): + cdef long dims[3] + dims[0] = inputData.shape[0] + dims[1] = inputData.shape[1] + dims[2] = inputData.shape[2] + + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ + np.zeros([dims[0],dims[1],dims[2]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + + # Run Anisotropic Fourth-Order diffusion for 3D data + Diffus4th_CPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], + regularisation_parameter, edge_parameter, + iterationsNumb, time_marching_parameter, + tolerance_param, + dims[2], dims[1], dims[0]) + return (outputData,infovec) #****************************************************************# #**************Directional Total-variation FGP ******************# #****************************************************************# #******** Directional TV Fast-Gradient-Projection (FGP)*********# -def dTV_FGP_CPU(inputData, refdata, regularisation_parameter, iterationsNumb, tolerance_param, eta_const, methodTV, nonneg, printM): +def dTV_FGP_CPU(inputData, refdata, regularisation_parameter, iterationsNumb, tolerance_param, eta_const, methodTV, nonneg): if inputData.ndim == 2: - return dTV_FGP_2D(inputData, refdata, regularisation_parameter, iterationsNumb, tolerance_param, eta_const, methodTV, nonneg, printM) + return dTV_FGP_2D(inputData, refdata, regularisation_parameter, iterationsNumb, tolerance_param, eta_const, methodTV, nonneg) elif inputData.ndim == 3: - return dTV_FGP_3D(inputData, refdata, regularisation_parameter, iterationsNumb, tolerance_param, eta_const, methodTV, nonneg, printM) + return dTV_FGP_3D(inputData, refdata, regularisation_parameter, iterationsNumb, tolerance_param, eta_const, methodTV, nonneg) -def dTV_FGP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +def dTV_FGP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, np.ndarray[np.float32_t, ndim=2, mode="c"] refdata, float regularisation_parameter, - int iterationsNumb, + int iterationsNumb, float tolerance_param, float eta_const, int methodTV, - int nonneg, - int printM): - + int nonneg): + cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] - + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + #/* Run FGP-dTV iterations for 2D data */ - dTV_FGP_CPU_main(&inputData[0,0], &refdata[0,0], &outputData[0,0], regularisation_parameter, - iterationsNumb, + dTV_FGP_CPU_main(&inputData[0,0], &refdata[0,0], &outputData[0,0], &infovec[0], + regularisation_parameter, + iterationsNumb, tolerance_param, eta_const, - methodTV, + methodTV, nonneg, - printM, dims[1], dims[0], 1) - - return outputData - + return (outputData,infovec) + def dTV_FGP_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, np.ndarray[np.float32_t, ndim=3, mode="c"] refdata, float regularisation_parameter, - int iterationsNumb, + int iterationsNumb, float tolerance_param, float eta_const, int methodTV, - int nonneg, - int printM): + int nonneg): cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] - + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0], dims[1], dims[2]], dtype='float32') - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + #/* Run FGP-dTV iterations for 3D data */ - dTV_FGP_CPU_main(&inputData[0,0,0], &refdata[0,0,0], &outputData[0,0,0], regularisation_parameter, - iterationsNumb, + dTV_FGP_CPU_main(&inputData[0,0,0], &refdata[0,0,0], &outputData[0,0,0], &infovec[0], + regularisation_parameter, + iterationsNumb, tolerance_param, eta_const, methodTV, nonneg, - printM, dims[2], dims[1], dims[0]) - return outputData - + return (outputData,infovec) + #****************************************************************# #*********************Total Nuclear Variation********************# #****************************************************************# def TNV_CPU(inputData, regularisation_parameter, iterationsNumb, tolerance_param): if inputData.ndim == 2: - return + return elif inputData.ndim == 3: return TNV_3D(inputData, regularisation_parameter, iterationsNumb, tolerance_param) -def TNV_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, +def TNV_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameter, int iterationsNumb, float tolerance_param): @@ -379,101 +520,13 @@ def TNV_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] - - cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ - np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - - # Run TNV iterations for 3D (X,Y,Channels) data - TNV_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, iterationsNumb, tolerance_param, dims[2], dims[1], dims[0]) - return outputData -#****************************************************************# -#***************Nonlinear (Isotropic) Diffusion******************# -#****************************************************************# -def NDF_CPU(inputData, regularisation_parameter, edge_parameter, iterationsNumb,time_marching_parameter, penalty_type): - if inputData.ndim == 2: - return NDF_2D(inputData, regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type) - elif inputData.ndim == 3: - return NDF_3D(inputData, regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type) - -def NDF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, - float regularisation_parameter, - float edge_parameter, - int iterationsNumb, - float time_marching_parameter, - int penalty_type): - cdef long dims[2] - dims[0] = inputData.shape[0] - dims[1] = inputData.shape[1] - - cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ - np.zeros([dims[0],dims[1]], dtype='float32') - - # Run Nonlinear Diffusion iterations for 2D data - Diffusion_CPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[1], dims[0], 1) - return outputData - -def NDF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, - float regularisation_parameter, - float edge_parameter, - int iterationsNumb, - float time_marching_parameter, - int penalty_type): - cdef long dims[3] - dims[0] = inputData.shape[0] - dims[1] = inputData.shape[1] - dims[2] = inputData.shape[2] - - cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ - np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - - # Run Nonlinear Diffusion iterations for 3D data - Diffusion_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[2], dims[1], dims[0]) - - return outputData -#****************************************************************# -#*************Anisotropic Fourth-Order diffusion*****************# -#****************************************************************# -def Diff4th_CPU(inputData, regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter): - if inputData.ndim == 2: - return Diff4th_2D(inputData, regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter) - elif inputData.ndim == 3: - return Diff4th_3D(inputData, regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter) - -def Diff4th_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, - float regularisation_parameter, - float edge_parameter, - int iterationsNumb, - float time_marching_parameter): - cdef long dims[2] - dims[0] = inputData.shape[0] - dims[1] = inputData.shape[1] - - cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ - np.zeros([dims[0],dims[1]], dtype='float32') - - # Run Anisotropic Fourth-Order diffusion for 2D data - Diffus4th_CPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, dims[1], dims[0], 1) - return outputData - -def Diff4th_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, - float regularisation_parameter, - float edge_parameter, - int iterationsNumb, - float time_marching_parameter): - cdef long dims[3] - dims[0] = inputData.shape[0] - dims[1] = inputData.shape[1] - dims[2] = inputData.shape[2] - cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - - # Run Anisotropic Fourth-Order diffusion for 3D data - Diffus4th_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, dims[2], dims[1], dims[0]) + # Run TNV iterations for 3D (X,Y,Channels) data + TNV_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, iterationsNumb, tolerance_param, dims[2], dims[1], dims[0]) return outputData - #****************************************************************# #***************Patch-based weights calculation******************# #****************************************************************# @@ -491,14 +544,14 @@ def PatchSel_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, dims[0] = neighbours dims[1] = inputData.shape[0] dims[2] = inputData.shape[1] - - + + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] Weights = \ np.zeros([dims[0], dims[1],dims[2]], dtype='float32') - + cdef np.ndarray[np.uint16_t, ndim=3, mode="c"] H_i = \ np.zeros([dims[0], dims[1],dims[2]], dtype='uint16') - + cdef np.ndarray[np.uint16_t, ndim=3, mode="c"] H_j = \ np.zeros([dims[0], dims[1],dims[2]], dtype='uint16') @@ -516,16 +569,16 @@ def PatchSel_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] dims[3] = neighbours - + cdef np.ndarray[np.float32_t, ndim=4, mode="c"] Weights = \ np.zeros([dims[3],dims[0],dims[1],dims[2]], dtype='float32') - + cdef np.ndarray[np.uint16_t, ndim=4, mode="c"] H_i = \ np.zeros([dims[3],dims[0],dims[1],dims[2]], dtype='uint16') - + cdef np.ndarray[np.uint16_t, ndim=4, mode="c"] H_j = \ np.zeros([dims[3],dims[0],dims[1],dims[2]], dtype='uint16') - + cdef np.ndarray[np.uint16_t, ndim=4, mode="c"] H_k = \ np.zeros([dims[3],dims[0],dims[1],dims[2]], dtype='uint16') @@ -553,10 +606,10 @@ def NLTV_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] neighbours = H_i.shape[0] - + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - + # Run nonlocal TV regularisation Nonlocal_TV_CPU_main(&inputData[0,0], &outputData[0,0], &H_i[0,0,0], &H_j[0,0,0], &H_i[0,0,0], &Weights[0,0,0], dims[1], dims[0], 0, neighbours, regularisation_parameter, iterations) return outputData @@ -570,7 +623,7 @@ def NDF_INPAINT_CPU(inputData, maskData, regularisation_parameter, edge_paramete elif inputData.ndim == 3: return NDF_INP_3D(inputData, maskData, regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type) -def NDF_INP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +def NDF_INP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, np.ndarray[np.uint8_t, ndim=2, mode="c"] maskData, float regularisation_parameter, float edge_parameter, @@ -585,12 +638,12 @@ def NDF_INP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - - # Run Inpaiting by Diffusion iterations for 2D data + + # Run Inpaiting by Diffusion iterations for 2D data Diffusion_Inpaint_CPU_main(&inputData[0,0], &maskData[0,0], &outputData[0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[1], dims[0], 1) return outputData - -def NDF_INP_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + +def NDF_INP_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, np.ndarray[np.uint8_t, ndim=3, mode="c"] maskData, float regularisation_parameter, float edge_parameter, @@ -601,11 +654,11 @@ def NDF_INP_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] - + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - - # Run Inpaiting by Diffusion iterations for 3D data + + # Run Inpaiting by Diffusion iterations for 3D data Diffusion_Inpaint_CPU_main(&inputData[0,0,0], &maskData[0,0,0], &outputData[0,0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[2], dims[1], dims[0]) return outputData @@ -616,27 +669,27 @@ def NVM_INPAINT_CPU(inputData, maskData, SW_increment, iterationsNumb): if inputData.ndim == 2: return NVM_INP_2D(inputData, maskData, SW_increment, iterationsNumb) elif inputData.ndim == 3: - return + return -def NVM_INP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +def NVM_INP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, np.ndarray[np.uint8_t, ndim=2, mode="c"] maskData, int SW_increment, int iterationsNumb): cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] - + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ - np.zeros([dims[0],dims[1]], dtype='float32') - + np.zeros([dims[0],dims[1]], dtype='float32') + cdef np.ndarray[np.uint8_t, ndim=2, mode="c"] maskData_upd = \ np.zeros([dims[0],dims[1]], dtype='uint8') - - # Run Inpaiting by Nonlocal vertical marching method for 2D data - NonlocalMarching_Inpaint_main(&inputData[0,0], &maskData[0,0], &outputData[0,0], + + # Run Inpaiting by Nonlocal vertical marching method for 2D data + NonlocalMarching_Inpaint_main(&inputData[0,0], &maskData[0,0], &outputData[0,0], &maskData_upd[0,0], SW_increment, iterationsNumb, 1, dims[1], dims[0], 1) - + return (outputData, maskData_upd) @@ -649,36 +702,36 @@ def TV_ENERGY(inputData, inputData0, regularisation_parameter, typeFunctional): elif inputData.ndim == 3: return TV_ENERGY_3D(inputData, inputData0, regularisation_parameter, typeFunctional) -def TV_ENERGY_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, - np.ndarray[np.float32_t, ndim=2, mode="c"] inputData0, +def TV_ENERGY_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, + np.ndarray[np.float32_t, ndim=2, mode="c"] inputData0, float regularisation_parameter, int typeFunctional): - + cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] outputData = \ np.zeros([1], dtype='float32') - - # run function + + # run function TV_energy2D(&inputData[0,0], &inputData0[0,0], &outputData[0], regularisation_parameter, typeFunctional, dims[1], dims[0]) - + return outputData - + def TV_ENERGY_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, - np.ndarray[np.float32_t, ndim=3, mode="c"] inputData0, + np.ndarray[np.float32_t, ndim=3, mode="c"] inputData0, float regularisation_parameter, int typeFunctional): - + cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] outputData = \ np.zeros([1], dtype='float32') - + # Run function TV_energy3D(&inputData[0,0,0], &inputData0[0,0,0], &outputData[0], regularisation_parameter, typeFunctional, dims[2], dims[1], dims[0]) diff --git a/src/Python/src/gpu_regularisers.pyx b/src/Python/src/gpu_regularisers.pyx index b52f669..84ee981 100644 --- a/src/Python/src/gpu_regularisers.pyx +++ b/src/Python/src/gpu_regularisers.pyx @@ -20,190 +20,195 @@ cimport numpy as np CUDAErrorMessage = 'CUDA error' -cdef extern int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z); -cdef extern int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z); -cdef extern int TV_SB_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int printM, int N, int M, int Z); -cdef extern int TGV_GPU_main(float *Input, float *Output, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ); -cdef extern int LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z); -cdef extern int NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z); -cdef extern int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int printM, int N, int M, int Z); -cdef extern int Diffus4th_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int N, int M, int Z); +cdef extern int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, float lambdaPar, int iter, float tau, float epsil, int N, int M, int Z); +cdef extern int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int N, int M, int Z); +cdef extern int TV_SB_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int N, int M, int Z); +cdef extern int LLT_ROF_GPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int N, int M, int Z); +cdef extern int TGV_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, float epsil, int dimX, int dimY, int dimZ); +cdef extern int NonlDiff_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, float epsil, int N, int M, int Z); +cdef extern int Diffus4th_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, float epsil, int N, int M, int Z); +cdef extern int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int N, int M, int Z); cdef extern int PatchSelect_GPU_main(float *Input, unsigned short *H_i, unsigned short *H_j, float *Weights, int N, int M, int SearchWindow, int SimilarWin, int NumNeighb, float h); # Total-variation Rudin-Osher-Fatemi (ROF) def TV_ROF_GPU(inputData, regularisation_parameter, - iterations, - time_marching_parameter): + iterations, + time_marching_parameter, + tolerance_param): if inputData.ndim == 2: - return ROFTV2D(inputData, + return ROFTV2D(inputData, regularisation_parameter, iterations, - time_marching_parameter) + time_marching_parameter, + tolerance_param) elif inputData.ndim == 3: - return ROFTV3D(inputData, + return ROFTV3D(inputData, regularisation_parameter, - iterations, - time_marching_parameter) - + iterations, + time_marching_parameter, + tolerance_param) + # Total-variation Fast-Gradient-Projection (FGP) def TV_FGP_GPU(inputData, regularisation_parameter, - iterations, + iterations, tolerance_param, methodTV, - nonneg, - printM): + nonneg): if inputData.ndim == 2: return FGPTV2D(inputData, regularisation_parameter, - iterations, + iterations, tolerance_param, methodTV, - nonneg, - printM) + nonneg) elif inputData.ndim == 3: return FGPTV3D(inputData, regularisation_parameter, - iterations, + iterations, tolerance_param, methodTV, - nonneg, - printM) + nonneg) # Total-variation Split Bregman (SB) def TV_SB_GPU(inputData, regularisation_parameter, - iterations, + iterations, tolerance_param, - methodTV, - printM): + methodTV): if inputData.ndim == 2: return SBTV2D(inputData, regularisation_parameter, - iterations, + iterations, tolerance_param, - methodTV, - printM) + methodTV) elif inputData.ndim == 3: return SBTV3D(inputData, regularisation_parameter, - iterations, + iterations, tolerance_param, - methodTV, - printM) + methodTV) # LLT-ROF model -def LLT_ROF_GPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter): +def LLT_ROF_GPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param): if inputData.ndim == 2: - return LLT_ROF_GPU2D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter) + return LLT_ROF_GPU2D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param) elif inputData.ndim == 3: - return LLT_ROF_GPU3D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter) + return LLT_ROF_GPU3D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param) # Total Generilised Variation (TGV) -def TGV_GPU(inputData, regularisation_parameter, alpha1, alpha0, iterations, LipshitzConst): +def TGV_GPU(inputData, regularisation_parameter, alpha1, alpha0, iterations, LipshitzConst, tolerance_param): if inputData.ndim == 2: - return TGV2D(inputData, regularisation_parameter, alpha1, alpha0, iterations, LipshitzConst) + return TGV2D(inputData, regularisation_parameter, alpha1, alpha0, iterations, LipshitzConst, tolerance_param) elif inputData.ndim == 3: - return TGV3D(inputData, regularisation_parameter, alpha1, alpha0, iterations, LipshitzConst) + return TGV3D(inputData, regularisation_parameter, alpha1, alpha0, iterations, LipshitzConst, tolerance_param) # Directional Total-variation Fast-Gradient-Projection (FGP) def dTV_FGP_GPU(inputData, refdata, regularisation_parameter, - iterations, + iterations, tolerance_param, eta_const, methodTV, - nonneg, - printM): + nonneg): if inputData.ndim == 2: return FGPdTV2D(inputData, refdata, regularisation_parameter, - iterations, + iterations, tolerance_param, eta_const, methodTV, - nonneg, - printM) + nonneg) elif inputData.ndim == 3: return FGPdTV3D(inputData, refdata, regularisation_parameter, - iterations, + iterations, tolerance_param, eta_const, methodTV, - nonneg, - printM) + nonneg) # Nonlocal Isotropic Diffusion (NDF) def NDF_GPU(inputData, regularisation_parameter, edge_parameter, - iterations, + iterations, time_marching_parameter, - penalty_type): + penalty_type, + tolerance_param): if inputData.ndim == 2: return NDF_GPU_2D(inputData, regularisation_parameter, edge_parameter, - iterations, + iterations, time_marching_parameter, - penalty_type) + penalty_type, + tolerance_param) elif inputData.ndim == 3: return NDF_GPU_3D(inputData, regularisation_parameter, edge_parameter, - iterations, + iterations, time_marching_parameter, - penalty_type) + penalty_type, + tolerance_param) # Anisotropic Fourth-Order diffusion def Diff4th_GPU(inputData, regularisation_parameter, edge_parameter, - iterations, - time_marching_parameter): + iterations, + time_marching_parameter, + tolerance_param): if inputData.ndim == 2: return Diff4th_2D(inputData, regularisation_parameter, edge_parameter, - iterations, - time_marching_parameter) + iterations, + time_marching_parameter, + tolerance_param) elif inputData.ndim == 3: return Diff4th_3D(inputData, regularisation_parameter, edge_parameter, - iterations, - time_marching_parameter) - + iterations, + time_marching_parameter, + tolerance_param) + #****************************************************************# #********************** Total-variation ROF *********************# #****************************************************************# -def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameter, - int iterations, - float time_marching_parameter): - + int iterations, + float time_marching_parameter, + float tolerance_param): + cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + # Running CUDA code here if (TV_ROF_GPU_main( - &inputData[0,0], &outputData[0,0], + &inputData[0,0], &outputData[0,0], &infovec[0], regularisation_parameter, - iterations , - time_marching_parameter, + iterations, + time_marching_parameter, + tolerance_param, dims[1], dims[0], 1)==0): - return outputData; + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); - -def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + +def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameter, - int iterations, - float time_marching_parameter): - + int iterations, + float time_marching_parameter, + float tolerance_param): + cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] @@ -211,76 +216,79 @@ def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - - # Running CUDA code here + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Running CUDA code here if (TV_ROF_GPU_main( - &inputData[0,0,0], &outputData[0,0,0], + &inputData[0,0,0], &outputData[0,0,0], &infovec[0], regularisation_parameter, - iterations , - time_marching_parameter, + iterations, + time_marching_parameter, + tolerance_param, dims[2], dims[1], dims[0])==0): - return outputData; + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); #****************************************************************# #********************** Total-variation FGP *********************# #****************************************************************# #******** Total-variation Fast-Gradient-Projection (FGP)*********# -def FGPTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +def FGPTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameter, - int iterations, + int iterations, float tolerance_param, int methodTV, - int nonneg, - int printM): - + int nonneg): + cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - - # Running CUDA code here - if (TV_FGP_GPU_main(&inputData[0,0], &outputData[0,0], - regularisation_parameter, - iterations, + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Running CUDA code here + if (TV_FGP_GPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], + regularisation_parameter, + iterations, tolerance_param, methodTV, nonneg, - printM, dims[1], dims[0], 1)==0): - return outputData; + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); - -def FGPTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, +def FGPTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameter, - int iterations, + int iterations, float tolerance_param, int methodTV, - int nonneg, - int printM): - + int nonneg): + cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - - # Running CUDA code here - if (TV_FGP_GPU_main(&inputData[0,0,0], &outputData[0,0,0], - regularisation_parameter , - iterations, + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Running CUDA code here + if (TV_FGP_GPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], + regularisation_parameter, + iterations, tolerance_param, methodTV, nonneg, - printM, dims[2], dims[1], dims[0])==0): - return outputData; + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); @@ -288,40 +296,39 @@ def FGPTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, #********************** Total-variation SB *********************# #***************************************************************# #*************** Total-variation Split Bregman (SB)*************# -def SBTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +def SBTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameter, - int iterations, + int iterations, float tolerance_param, - int methodTV, - int printM): - + int methodTV): + cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - - # Running CUDA code here - if (TV_SB_GPU_main(&inputData[0,0], &outputData[0,0], - regularisation_parameter, - iterations, + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Running CUDA code here + if (TV_SB_GPU_main(&inputData[0,0], &outputData[0,0],&infovec[0], + regularisation_parameter, + iterations, tolerance_param, methodTV, - printM, dims[1], dims[0], 1)==0): - return outputData; + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); - -def SBTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + +def SBTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameter, - int iterations, + int iterations, float tolerance_param, - int methodTV, - int printM): - + int methodTV): + cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] @@ -329,16 +336,17 @@ def SBTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - - # Running CUDA code here - if (TV_SB_GPU_main(&inputData[0,0,0], &outputData[0,0,0], - regularisation_parameter , - iterations, + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Running CUDA code here + if (TV_SB_GPU_main(&inputData[0,0,0], &outputData[0,0,0],&infovec[0], + regularisation_parameter , + iterations, tolerance_param, methodTV, - printM, dims[2], dims[1], dims[0])==0): - return outputData; + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); @@ -347,32 +355,39 @@ def SBTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, #************************ LLT-ROF model ************************# #***************************************************************# #************Joint LLT-ROF model for higher order **************# -def LLT_ROF_GPU2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +def LLT_ROF_GPU2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameterROF, float regularisation_parameterLLT, - int iterations, - float time_marching_parameter): - + int iterations, + float time_marching_parameter, + float tolerance_param): + cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - - # Running CUDA code here - if (LLT_ROF_GPU_main(&inputData[0,0], &outputData[0,0],regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, dims[1],dims[0],1)==0): - return outputData; + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Running CUDA code here + if (LLT_ROF_GPU_main(&inputData[0,0], &outputData[0,0],&infovec[0],regularisation_parameterROF, regularisation_parameterLLT, iterations, + time_marching_parameter, + tolerance_param, + dims[1],dims[0],1)==0): + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); - -def LLT_ROF_GPU3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + +def LLT_ROF_GPU3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameterROF, float regularisation_parameterLLT, - int iterations, - float time_marching_parameter): - + int iterations, + float time_marching_parameter, + float tolerance_param): + cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] @@ -380,10 +395,16 @@ def LLT_ROF_GPU3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - - # Running CUDA code here - if (LLT_ROF_GPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, dims[2], dims[1], dims[0])==0): - return outputData; + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Running CUDA code here + if (LLT_ROF_GPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], regularisation_parameterROF, regularisation_parameterLLT, + iterations, + time_marching_parameter, + tolerance_param, + dims[2], dims[1], dims[0])==0): + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); @@ -391,38 +412,43 @@ def LLT_ROF_GPU3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, #***************************************************************# #***************** Total Generalised Variation *****************# #***************************************************************# -def TGV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +def TGV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameter, float alpha1, float alpha0, - int iterationsNumb, - float LipshitzConst): - + int iterationsNumb, + float LipshitzConst, + float tolerance_param): + cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] - + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + #/* Run TGV iterations for 2D data */ - if (TGV_GPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, + if (TGV_GPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], regularisation_parameter, alpha1, alpha0, - iterationsNumb, + iterationsNumb, LipshitzConst, + tolerance_param, dims[1],dims[0], 1)==0): - return outputData + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); -def TGV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, +def TGV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameter, float alpha1, float alpha0, - int iterationsNumb, - float LipshitzConst): - + int iterationsNumb, + float LipshitzConst, + float tolerance_param): + cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] @@ -430,178 +456,205 @@ def TGV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - - # Running CUDA code here + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Running CUDA code here if (TGV_GPU_main( - &inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, + &inputData[0,0,0], &outputData[0,0,0], &infovec[0], regularisation_parameter, alpha1, alpha0, - iterationsNumb, + iterationsNumb, LipshitzConst, + tolerance_param, dims[2], dims[1], dims[0])==0): - return outputData; + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); - #****************************************************************# -#**************Directional Total-variation FGP ******************# +#***************Nonlinear (Isotropic) Diffusion******************# #****************************************************************# -#******** Directional TV Fast-Gradient-Projection (FGP)*********# -def FGPdTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, - np.ndarray[np.float32_t, ndim=2, mode="c"] refdata, +def NDF_GPU_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameter, - int iterations, - float tolerance_param, - float eta_const, - int methodTV, - int nonneg, - int printM): - + float edge_parameter, + int iterationsNumb, + float time_marching_parameter, + int penalty_type, + float tolerance_param): cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ - np.zeros([dims[0],dims[1]], dtype='float32') - - # Running CUDA code here - if (dTV_FGP_GPU_main(&inputData[0,0], &refdata[0,0], &outputData[0,0], - regularisation_parameter, - iterations, - tolerance_param, - eta_const, - methodTV, - nonneg, - printM, - dims[1], dims[0], 1)==0): - return outputData + np.zeros([dims[0],dims[1]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + #rangecheck = penalty_type < 1 and penalty_type > 3 + #if not rangecheck: +# raise ValueError('Choose penalty type as 1 for Huber, 2 - Perona-Malik, 3 - Tukey Biweight') + + # Run Nonlinear Diffusion iterations for 2D data + # Running CUDA code here + if (NonlDiff_GPU_main(&inputData[0,0], &outputData[0,0],&infovec[0], + regularisation_parameter, + edge_parameter, iterationsNumb, + time_marching_parameter, penalty_type, + tolerance_param, + dims[1], dims[0], 1)==0): + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); - - -def FGPdTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, - np.ndarray[np.float32_t, ndim=3, mode="c"] refdata, +def NDF_GPU_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameter, - int iterations, - float tolerance_param, - float eta_const, - int methodTV, - int nonneg, - int printM): - + float edge_parameter, + int iterationsNumb, + float time_marching_parameter, + int penalty_type, + float tolerance_param): cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ - np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - - # Running CUDA code here - if (dTV_FGP_GPU_main(&inputData[0,0,0], &refdata[0,0,0], &outputData[0,0,0], - regularisation_parameter , - iterations, - tolerance_param, - eta_const, - methodTV, - nonneg, - printM, - dims[2], dims[1], dims[0])==0): - return outputData; + np.zeros([dims[0],dims[1],dims[2]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Run Nonlinear Diffusion iterations for 3D data + # Running CUDA code here + if (NonlDiff_GPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], + regularisation_parameter, edge_parameter, + iterationsNumb, time_marching_parameter, + penalty_type, + tolerance_param, + dims[2], dims[1], dims[0])==0): + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); - #****************************************************************# -#***************Nonlinear (Isotropic) Diffusion******************# +#************Anisotropic Fourth-Order diffusion******************# #****************************************************************# -def NDF_GPU_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +def Diff4th_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameter, float edge_parameter, - int iterationsNumb, + int iterationsNumb, float time_marching_parameter, - int penalty_type): + float tolerance_param): cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] - + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - - #rangecheck = penalty_type < 1 and penalty_type > 3 - #if not rangecheck: -# raise ValueError('Choose penalty type as 1 for Huber, 2 - Perona-Malik, 3 - Tukey Biweight') - - # Run Nonlinear Diffusion iterations for 2D data - # Running CUDA code here - if (NonlDiff_GPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[1], dims[0], 1)==0): - return outputData; + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Run Anisotropic Fourth-Order diffusion for 2D data + # Running CUDA code here + if (Diffus4th_GPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], + regularisation_parameter, edge_parameter, iterationsNumb, + time_marching_parameter, + tolerance_param, + dims[1], dims[0], 1)==0): + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); - -def NDF_GPU_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, +def Diff4th_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameter, float edge_parameter, - int iterationsNumb, + int iterationsNumb, float time_marching_parameter, - int penalty_type): + float tolerance_param): cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] - + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ - np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - - # Run Nonlinear Diffusion iterations for 3D data - # Running CUDA code here - if (NonlDiff_GPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, penalty_type, dims[2], dims[1], dims[0])==0): - return outputData; + np.zeros([dims[0],dims[1],dims[2]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Run Anisotropic Fourth-Order diffusion for 3D data + # Running CUDA code here + if (Diffus4th_GPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], + regularisation_parameter, edge_parameter, + iterationsNumb, time_marching_parameter, + tolerance_param, + dims[2], dims[1], dims[0])==0): + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); - #****************************************************************# -#************Anisotropic Fourth-Order diffusion******************# +#**************Directional Total-variation FGP ******************# #****************************************************************# -def Diff4th_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, +#******** Directional TV Fast-Gradient-Projection (FGP)*********# +def FGPdTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, + np.ndarray[np.float32_t, ndim=2, mode="c"] refdata, float regularisation_parameter, - float edge_parameter, - int iterationsNumb, - float time_marching_parameter): + int iterations, + float tolerance_param, + float eta_const, + int methodTV, + int nonneg): + cdef long dims[2] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] - + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ - np.zeros([dims[0],dims[1]], dtype='float32') - - # Run Anisotropic Fourth-Order diffusion for 2D data - # Running CUDA code here - if (Diffus4th_GPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, dims[1], dims[0], 1)==0): - return outputData + np.zeros([dims[0],dims[1]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Running CUDA code here + if (dTV_FGP_GPU_main(&inputData[0,0], &refdata[0,0], &outputData[0,0], &infovec[0], + regularisation_parameter, + iterations, + tolerance_param, + eta_const, + methodTV, + nonneg, + dims[1], dims[0], 1)==0): + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); - -def Diff4th_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + +def FGPdTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + np.ndarray[np.float32_t, ndim=3, mode="c"] refdata, float regularisation_parameter, - float edge_parameter, - int iterationsNumb, - float time_marching_parameter): + int iterations, + float tolerance_param, + float eta_const, + int methodTV, + int nonneg): + cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] dims[2] = inputData.shape[2] - + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ - np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - - # Run Anisotropic Fourth-Order diffusion for 3D data - # Running CUDA code here - if (Diffus4th_GPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, dims[2], dims[1], dims[0])==0): - return outputData; + np.zeros([dims[0],dims[1],dims[2]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + # Running CUDA code here + if (dTV_FGP_GPU_main(&inputData[0,0,0], &refdata[0,0,0], &outputData[0,0,0], &infovec[0], + regularisation_parameter , + iterations, + tolerance_param, + eta_const, + methodTV, + nonneg, + dims[2], dims[1], dims[0])==0): + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); @@ -621,14 +674,14 @@ def PatchSel_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, cdef long dims[3] dims[0] = neighbours dims[1] = inputData.shape[0] - dims[2] = inputData.shape[1] - + dims[2] = inputData.shape[1] + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] Weights = \ np.zeros([dims[0], dims[1],dims[2]], dtype='float32') - + cdef np.ndarray[np.uint16_t, ndim=3, mode="c"] H_i = \ np.zeros([dims[0], dims[1],dims[2]], dtype='uint16') - + cdef np.ndarray[np.uint16_t, ndim=3, mode="c"] H_j = \ np.zeros([dims[0], dims[1],dims[2]], dtype='uint16') @@ -637,4 +690,3 @@ def PatchSel_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, return H_i, H_j, Weights; else: raise ValueError(CUDAErrorMessage); - |