mex code did not give improvements as compraed to matlab code

Question

Abeera Tariq 2015 年 6 月 16 日

0
リンク

この質問への直接リンク

https://jp.mathworks.com/matlabcentral/answers/224024-mex-code-did-not-give-improvements-as-compraed-to-matlab-code

編集済み: James Tursa 2015 年 6 月 17 日

I converted a piece of Matlab code into mex. but I am not getting the time efficiency and it is far slower than matlab code. I don't know that what is the issue with code.. It's logic is working perfectly fine with correct answer. May be I need to destroy the arrays or I need to free the memory.. Can someone help me in the code?

    #include "mex.h"
    #include <conio.h>
    #include <math.h>
    #include<stdlib.h>
    void loopy( float ImgWeight[256][256],float ImgTemp[256][256],float PatchSetT[62001][64],float Row[125], float Col[125],float Ni,float I[249][249], float IWaveletMatrix[8][8], float WaveletMatrix[8][8], float DctMatrix[10][10] )
    {
        float TranArray[10][64];
        float PatchArray[10][8][8];
        float PatchArra[10][8][8];
        float Z[10][64];
        float PatchSet[64][62001];
        float  CurPatchIndx[10];
        int N=249;
        int M=249;
        int i,j,k,l,f,g,h,q,r,x,a,b,kTmp,rmax,rmin,cmax,cmin,y,zz;
        float CurRow,CurCol,Off;
        int temp=0;
        int f2=64;
        float sum22;
        float Threshold=33.6f;
        float ArrayNo=10;
        float SearchWin=20;
        float CurArray[64][10];
        float C1[64][64];
        float xi[64][10];
        float sum12 = 0;
        float multiply1[64][10];
        float tran[10][64];
        int looplengthN,looplengthM;
        float z;
        int ColIndx,RowIndx;
        float C[64][64];
        float multiply[64][10];
        float tY[64][10];
        float Y[10][64];
        float tDctMatrix[10][10];
    //     
    //     looplengthN = sizeof Row/ sizeof (Row[0]);
    // printf("%d\t",looplengthN);
    // looplengthM = sizeof Col/ sizeof (Col[0]);
    //  printf("%d\t",looplengthM);
        for (y = 0; y <125 ; y++){ 
            for( zz = 0 ; zz < 125 ; zz++ ){
                float **sum;
                float *sum0;
                float **sum1;
                float *sum10;
                float **B;
                float *B0;
                float **idx1;
                float *idx10;
                float *sum2;
                float *v;
                int *indexes;
                float *idxl;
                float *dis;
                CurRow = Row[y];
                CurCol = Col[zz];
                Off= (CurCol-1)*Ni+CurRow;
                rmin=((CurRow-20)<1)?1:(CurRow-20);
                rmax=(N<(CurRow+20))?N:(CurRow+20);
                cmin=((CurCol-20)<1)?1:(CurCol-20);
                cmax=(M<(CurCol+20))?M:(CurCol+20);
                q=rmax*cmax;
                sum0= (float *)mxMalloc(q*f2*sizeof(*sum0));
                sum= (float **)mxMalloc(q*sizeof(*sum));
                for (i = 0; i < q; i++){
                    sum[i]=sum0+i*f2;
                }
                sum10= (float *)mxMalloc(q*f2*sizeof(*sum10));
                sum1= (float **)mxMalloc(q*sizeof(*sum1));
                for (i = 0; i < q; i++){
                    sum1[i]=sum10+i*f2;
                }
                B0= (float *)mxMalloc(q*f2*sizeof(*B0));
                B= (float **)mxMalloc(q*sizeof(*B));
                for (i = 0; i < q; i++){
                    B[i]=B0+i*f2;
                }
                idx10= (float *)mxMalloc(rmax*cmax*sizeof(*idx10));
                idx1= (float **)mxMalloc(rmax*sizeof(*idx1));
                for (i = 0; i < rmax; i++){
                    idx1[i]=idx10+i*cmax;
                }
                sum2= (float *)mxMalloc(q*sizeof(float));
                v=(float *)mxMalloc(f2*sizeof(float));
                idxl= (float *)mxMalloc(q*sizeof(float));
                dis= (float *)mxMalloc(q*sizeof(float));
                indexes= (int *)mxMalloc(q*sizeof(int));
                for (i = 0; i < rmax; i++){
                    for( j = 0 ; j <cmax ; j++ ){
                        idx1[i][j]= I[i][j];
                    }}
                for (i = 0; i < rmax; i++)
                { for (j = 0; j < cmax; j++)
                  {idxl[i + j*rmax] = idx1[i][j];}}
                r=0;
                for(l=0; l<(rmax*cmax); l++)
                {r=idxl[l]-1;
                 for(k=0; k<f2; k++)
                 {B[l][k]=PatchSetT[r][k];
                 }}
                for (i = 0,j=(Off-1); i <f2; i++)
                {v[i]  = PatchSetT[j][i]; }
                for (i = 0; i<f2 ; i++) {
                    for (j = 0 ; j < (rmax*cmax); j++) {
                        sum[j][i] = B[j][i] - v[i];
                        sum1[j][i]= powf(sum[j][i],2); }}
                for (i = 0; i <(rmax*cmax) ; i++) {
                    sum22=0;
                    for (j = 0 ; j < f2; j++) {
                        sum22+=sum1[i][j];}
                    sum2[i]=sum22;}
                for(i = 0; i <(rmax*cmax) ; i++)
                {   dis[i]= sum2[i]/f2;}
                for( f = 0; f < rmax*cmax; f++)
                    indexes[f] = f;
                for(g = 0; g < rmax*cmax; g++)
                {for(h = 0; h < rmax*cmax; h++)
                 {if(dis[indexes[g]] < dis[indexes[h]])
                  {kTmp = indexes[g];
                   indexes[g] = indexes[h];
                   indexes[h] = kTmp;}}}
                for (j = 0; j <10; j++)
                { temp=indexes[j];
                  CurPatchIndx[j]=idxl[temp];
                }
                for (i = 0; i < 62001; i++){
                    for( j = 0 ; j < 64 ; j++ ){
                        PatchSet[j][i] = PatchSetT[i][j];}}
                x=0;
                for(l=0; l<64; l++)
                {
                    for(k=0; k<10; k++)
                    { x=CurPatchIndx[k]-1;
                      CurArray[l][k]=PatchSet[l][x];
                    }
    //printf("\n");
                }
                for ( i = 0; i < 10; i++){
                    for ( j = 0; j < 8; j++){
                        for ( k = 0; k < 8; k++)
                        {PatchArray[i][j][k] = CurArray[k * 8 + j][i];
                        }}}
                for ( i = 0; i < 10; i++){
                    for ( j = 0; j < 8; j++){
                        for ( k = 0; k < 8; k++)
                        {
                            xi[k * 8 + j][i]=PatchArray[i][j][k] ;}}}
                a,b=0;
                for ( i = 0; i < 8; i++){
                    for ( j = 0; j < 8; j++){
                        for ( k = 0; k < 8; k++){
                            for ( l = 0; l < 8; l++){
                                a = ((i == 0) ? 0 : (i * 8));
                                b = ((j == 0) ? 0 : (j * 8));
                                C[k + a][l + b] = WaveletMatrix[i][j] * WaveletMatrix[k][l]; }}}}
                for (i = 0; i < 64; i++) {
                    for (j = 0; j < 10; j++) {
                        for (k = 0; k < 64; k++) {
                            sum12 = sum12+ C[i][k]*xi[k][j];}
                        multiply[i][j] = sum12;
                        sum12 = 0;}}
                for (i = 0; i < 64; i++){
                    for( j = 0 ; j < 10 ; j++ ){
                        tran[j][i] = multiply[i][j];}}
                sum12=0;
                for (i = 0; i < 10; i++) {
                    for (j = 0; j < 64; j++) {
                        for (k = 0; k < 10; k++) {
                            sum12 = sum12 + DctMatrix[i][k]*tran[k][j];}
                        TranArray[i][j] = sum12;
                        sum12 = 0;}}
                for(i=0; i<10; i++){
                    for(j=0; j<64; j++){
                        {if (TranArray[i][j]<0)
                         { Z[i][j]=TranArray[i][j]*(-1);}
                         else
                         {Z[i][j]=TranArray[i][j];}
                        }
                        if (Z[i][j]>Threshold)
                        {Z[i][j]=1;
                        }
                        else
                        { Z[i][j]=0; }
                        Z[i][j]=TranArray[i][j]*Z[i][j];
                    }}
                for (i = 0; i < 10; i++){
                    for( j = 0 ; j < 10 ; j++ ){
                        tDctMatrix[j][i] = DctMatrix[i][j];}}
                sum12=0;
                for (i = 0; i < 10; i++) {
                    for (j = 0; j < 64; j++) {
                        for (k = 0; k < 10; k++) {
                            sum12 = sum12 + tDctMatrix[i][k]*Z[k][j]; }
                        Y[i][j] = sum12;
                        sum12 = 0;}}
                for( i = 0 ; i < 10 ; i++ ){
                    for (j = 0; j < 64; j++){
                        tY[j][i]=Y[i][j];
                    }}
                a,b=0;
                for ( i = 0; i < 8; i++)
                {for ( j = 0; j < 8; j++)
                 {for ( k = 0; k < 8; k++)
                      for ( l = 0; l < 8; l++)
                      {  a = ((i == 0) ? 0 : (i * 8));
                        b = ((j == 0) ? 0 : (j * 8));
                        C1[k + a][l + b] = IWaveletMatrix[i][j] * IWaveletMatrix[k][l];
                      }}}
                sum12=0;
                for (i = 0; i < 64; i++) {
                    for (j = 0; j < 10; j++) {
                        for (k = 0; k < 64; k++) {
                            sum12 = sum12 + C1[i][k]*tY[k][j];}
                        multiply1[i][j] = sum12;
                        sum12 = 0;}}
                for ( i = 0; i < 10; i++){
                    for ( j = 0; j < 8; j++){
                        for ( k = 0; k < 8; k++)
                        {PatchArray[i][j][k] = multiply1[k * 8 + j][i];
                        }}}
                for ( i =0; i <10; i++){
                    z=fmodf(CurPatchIndx[i],Ni);
                    if(z==0)
                    {RowIndx=(int)Ni;
                    }
                    else{RowIndx=(int)z;}
                             RowIndx  =RowIndx-1; 
                    ColIndx=(int)ceilf(CurPatchIndx[i]/Ni);
                             ColIndx= ColIndx-1;
                      for (g=0,j= RowIndx; g<8, j <= RowIndx+7;g++, j++) {
                    for ( h=0,k = ColIndx; h<8, k <= ColIndx+7;h++, k++) {
                            ImgTemp[k][j] = ImgTemp[k][j] + PatchArray[i][h][g];
                            ImgWeight[k][j]=ImgWeight[k][j]+1;
                        }}}
                mxFree(sum2);
                mxFree(v);
                mxFree(indexes);
                mxFree(idxl);
                mxFree(dis);
                mxFree(sum);
                mxFree(sum1);
                mxFree(B);
                mxFree(idx1);
                mxFree(sum0);
                mxFree(sum10);
                mxFree(B0);
                mxFree(idx10);
            }
        }
    }
    void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
    {
    //DECLARING ALL THE ARGUMENTS
        float (*Patch)[64];  float *row;  float *col;  float Ni; float (*I)[249]; float (*IWM)[8]; float (*WM)[8]; float (*DM)[10];
    //float PRECISION CORRESPONDANCE OF THE OUTPUT
        float (*outMatrix)[256];
        float (*outMatrix1)[256];
        int dims[2] = {256,256};
        Patch = (float (*)[64])mxGetData(prhs[0]);
        row = (float *)mxGetData(prhs[1]);
        col = (float *)mxGetData(prhs[2]);
        I = (float (*)[249])mxGetData(prhs[3]);
        IWM = (float (*)[8])mxGetData(prhs[4]);
        WM = (float (*)[8])mxGetData(prhs[5]);
        DM = (float (*)[10])mxGetData(prhs[6]);
        Ni = (float)mxGetScalar(prhs[7]);
    // SPECIAL CASE FOR ARRAYS
        plhs[0] = mxCreateNumericArray(2,dims,mxSINGLE_CLASS,mxREAL);
        plhs[1] = mxCreateNumericArray(2,dims,mxSINGLE_CLASS,mxREAL);
        outMatrix = (float (*)[256])mxGetData(plhs[0]);
        outMatrix1 = (float (*)[256])mxGetData(plhs[1]);
        loopy(outMatrix,outMatrix1,Patch,row,col,Ni,I,IWM,WM,DM);
    }

3 件のコメント
1 件の古いコメントを表示1 件の古いコメントを非表示

James Tursa 2015 年 6 月 16 日

What is the MATLAB code that you converted? Can you post it?

Abeera Tariq 2015 年 6 月 17 日

MATLAB Online で開く

Have a look at the code.

for  i  =  1:NN
    for  j  =1:MM
          CurRow      =   Row(i);
          CurCol      =   Col(j);
          Off      =   (CurCol-1)*N + CurRow;
           % PatchSearch
                  [S, M]   =   size(I);
          f2      =   size(PatchSetT,2);
          rmin    =   max( CurRow-SearchWin, 1 );
          rmax    =   min( CurRow+SearchWin, S );
          cmin    =   max( CurCol-SearchWin, 1 );
          cmax    =   min( CurCol+SearchWin, M );
          idx     =   I(rmin:rmax, cmin:cmax);
          idx     =   idx(:);
          B       =   PatchSetT(idx, :);        
          v       =   PatchSetT(Off, :);
          dis     =   (B(:,1) - v(1)).^2;
  for k = 2:f2
      dis   =  dis + (B(:,k) - v(k)).^2;
  end
          dis   =  dis./f2;
          [~,ind]   =  sort(dis); 
          CurPatchIndx        =  idx( ind(1:ArrayNo) );
          CurArray = PatchSet(:, CurPatchIndx);
          for k = 1:ArrayNo
              PatchArray(:,:,k) = reshape(CurArray(:,k),PatchSize,PatchSize);
          end
          [~,~,K]=size(PatchArray);
y = kron(WaveletMatrix,WaveletMatrix) * reshape(PatchArray,[],K);
 TranArray = DctMatrix*y';                                                             %DWT2DCT
        Z = TranArray.*(abs(TranArray)>Threshold);
        NonZero = length(find(Z>0));
                  block_size = 8;                                                        %IDWT2DCT
          y = (DctMatrix'*Z); 
          X = kron(IWaveletMatrix,IWaveletMatrix) * y';
          X = reshape(X,block_size,block_size,[]);
          PatchArray = real(X);
          for k = 1:length(CurPatchIndx)                 % Compute Row Number
              if mod((CurPatchIndx(k)),N) == 0
          RowIndx = N;
              else 
          RowIndx = mod((CurPatchIndx(k)),N);
              end
              ColIndx  = ceil(double((CurPatchIndx(k)))/N);     %Compute Col No
              ImgTemp(RowIndx:RowIndx+7, ColIndx:ColIndx+7)    =   ImgTemp(RowIndx:RowIndx+7, ColIndx:ColIndx+7) + PatchArray(:,:,k)';
              ImgWeight(RowIndx:RowIndx+7, ColIndx:ColIndx+7)  =   ImgWeight(RowIndx:RowIndx+7, ColIndx:ColIndx+7) + 1;
          end
      end
  end

サインインしてコメントする。

サインインしてこの質問に回答する。

Answer 1

Philip Borghesani 2015 年 6 月 16 日

0
リンク

この回答への直接リンク

https://jp.mathworks.com/matlabcentral/answers/224024-mex-code-did-not-give-improvements-as-compraed-to-matlab-code#answer_182894

I will hazard a guess that your performance problem is the allocation of data inside your for loops. Allocating memory is slow in matlab and c. You should be able to allocate buffers large enough for the operation up frount and reuse those buffers each loop.

I strongly suggest rewriting this code without multidimensional non contiguous arrays for data storage. This method of storing data is almost never used in production code because it leads to bugs and is inefficient.

I like the ideas espoused in this posting: Multidimentional arrays are evil

10 件のコメント
8 件の古いコメントを表示8 件の古いコメントを非表示

James Tursa 2015 年 6 月 17 日

編集済み: James Tursa 2015 年 6 月 17 日

I was perhaps being a bit too loose with my terminology as it relates to your specific code. You have 3D arrays in your code. E.g., PatchArray[10][8][8] is one of them. If you wanted to use dynamic sizing for pointers into this type of array and still use the multi-level [ ][ ][ ] indexing syntax, one would have to use multiple levels of malloc'ed memory, which I casually termed "arrays of pointers to arrays" since it looked like that was where your code was headed. So I was using the term in a more general sense, which you are free to ignore.

But the bottom line point is still the same. Going to a lot of effort to preserve the multi-level [ ][ ][ ] syntax for dynamic sizes is simply not worth it in my opinion (and others as well). You will be much better served in the long run if you use a simple pointer (i.e., regard the memory block as a 1D array) and then do the indexing arithmetic manually. This route also makes it easier to do the mapping for column based matrices/arrays which is the native MATLAB storage convention.

Philip Borghesani 2015 年 6 月 17 日

MATLAB Online で開く

You are building unneeded multi-level arrays with this sort of code:

 sum0= (float *)mxMalloc(q*f2*sizeof(*sum0));
 sum= (float **)mxMalloc(q*sizeof(*sum));
 for (i = 0; i < q; i++){
    sum[i]=sum0+i*f2;
 }

There is no need to build or allocate the sum array of pointers.

A later loop using sum can be rewritten as:

for (j = 0 ; j < (rmax*cmax); j++) {
   float *sum=sum0+j*f2;
   for (i = 0; i<f2 ; i++) {                   
       sum[j] = B[j][i] - v[i];   // or use *sum=...  and on the next line 
                      // powf(*sum++,2); if you understand what it is doing
   ...

sum1 and B can be replaced the same way and the code should run much faster.

James Tursa 2015 年 6 月 17 日

編集済み: James Tursa 2015 年 6 月 17 日

"There is no need to build or allocate the sum array of pointers."

I'm afraid Abeera was just following a generic example I wrote in an earlier post of how one could use the [ ][ ] syntax for dynamic sized arrays. So don't blame him for that one.

As you point out, there are ways to avoid some of the malloc'ed arrays by moving towards a linear indexing model. E.g., you have traded his left-hand-side double index sum[j][i] for a single index sum[j]. And again I agree with this philosophy, that moving completely away from the multi-level [ ][ ] indexing syntax and instead embracing single level [ ] linear indexing models is preferable in the long run.

サインインしてコメントする。

mex code did not give improvements as compraed to matlab code

3 件のコメント
1 件の古いコメントを表示1 件の古いコメントを非表示

回答 (1 件)

10 件のコメント
8 件の古いコメントを表示8 件の古いコメントを非表示

参考

カテゴリ

タグ

Community Treasure Hunt

mex code did not give improvements as compraed to matlab code

3 件のコメント 1 件の古いコメントを表示1 件の古いコメントを非表示

回答 (1 件)

10 件のコメント 8 件の古いコメントを表示8 件の古いコメントを非表示

参考

カテゴリ

タグ

Community Treasure Hunt

3 件のコメント
1 件の古いコメントを表示1 件の古いコメントを非表示

10 件のコメント
8 件の古いコメントを表示8 件の古いコメントを非表示