フィルターのクリア

Scaling of vectorized code which is limited by memory access

8 ビュー (過去 30 日間)
ConvexHull
ConvexHull 2019 年 8 月 9 日
コメント済み: ConvexHull 2019 年 8 月 12 日
Dear all,
what possibilities are there to make a vectorized code faster, where the speed is limited by memory access?
I have attached two runs on two different machines with the same vectorized MATLAB code. Here var_z_analyse is a huge array of fixed size, which will be accessed/querried by a long list of indices which may vary in size and content for each call. May parfor be a solution in this context?
For example the first line does not benefit from 4 to 18 cores, whereas the last two lines scale very well.
  • first machine has 4 cores on one node
4.png
  • seconde machine has 18 cores on one node
18.png
Regards
  11 件のコメント
ConvexHull
ConvexHull 2019 年 8 月 11 日
Thank you Bruno,
i already have optimized the fortran code for single performance. There's nothing more to get.
Regards
ConvexHull
ConvexHull 2019 年 8 月 11 日
編集済み: ConvexHull 2019 年 8 月 11 日
Back to the roots. The question is really simple.
We have a fixed list of data B and a list of ids C which may vary in size and content for each call.
A=B(:,:,C(:));
How to handle such stuff in Matlab/C/Fortran with or without openmp in context of speed/scaling for a medium size problem?

サインインしてコメントする。

採用された回答

Bruno Luong
Bruno Luong 2019 年 8 月 11 日
編集済み: Bruno Luong 2019 年 8 月 11 日
Here is my C-mex implementation, on my laptop it accelerates by 20 fold from your original MATLAB code (compiled with MSVS 2017, 1 thread)
N=19012;
M=130000;
u_xi = rand(1,M);
u_eta = rand(1,M);
a_I = ceil(rand(1,M)*N);
var_z_analyse = rand(3,3,N);
tic
for i=1:1000
V_x = zeros(3,1,M);
V_y = zeros(1,3,M);
A = var_z_analyse(:,:,a_I);
V_x(2,1,:) = u_xi(:);
V_x(1,1,:) = 1;
V_x(3,1,:) = V_x(2,1,:).*V_x(2,1,:);
V_y(1,2,:) = u_eta(:);
V_y(1,1,:) = 1;
V_y(1,3,:) = V_y(1,2,:).*V_y(1,2,:);
outer_product = bsxfun(@times,V_x,V_y);
u_z = sum(sum(outer_product.*A,1),2);
end
toc % 21.871430 seconds.
tic
for i=1:1000
u_z_mex = hash(u_xi,u_eta,var_z_analyse,a_I);
end
toc % 1.057965 seconds.
norm(u_z_mex(:) - u_z(:)) / norm(u_z(:)) % 7.5515e-17
The hash.c C-mex code (quick and dirty without checking correctness of the input)
/*************************************************
* CALLING:
* >> u_z_mex = hash(u_xi,u_eta,var_z_analyse,a_I);
* COMPILING:
* >> mex -O hash.c
************************************************/
#include "mex.h"
#include "matrix.h"
#define XI prhs[0]
#define ETA prhs[1]
#define VAR_Z_ANALYSE prhs[2]
#define A_I prhs[3]
#define UZ plhs[0]
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
int i,j,k,M;
double xi_k_pow[3], eta_k, xi_k, etap, s, ss;
double *Ak, *u_eta, *u_xi, *a_I, *uz, *var_z_analyse;
var_z_analyse = (double*)mxGetData(VAR_Z_ANALYSE);
u_eta = (double*)mxGetData(ETA);
u_xi = (double*)mxGetData(XI);
a_I = (double*)mxGetData(A_I);
M = (int)mxGetNumberOfElements(A_I);
UZ = mxCreateDoubleMatrix(1,M,mxREAL);
uz = (double*)mxGetData(UZ);
xi_k_pow[0] = 1;
for (k=M; k--;)
{
Ak = var_z_analyse + 9*((int)(*a_I++)-1);
eta_k = *u_eta++;
xi_k = *u_xi++;
xi_k_pow[1] = xi_k;
xi_k_pow[2] = xi_k*xi_k;
etap = 1.0;
s = 0.0;
for (i=3; i--;)
{
ss = 0.0;
for (j=0; j<3; j++) ss += xi_k_pow[j] * *(Ak++);
s += etap * ss;
etap *= eta_k;
}
*uz++ = s;
}
}
  6 件のコメント
Bruno Luong
Bruno Luong 2019 年 8 月 11 日
Folding for-loop variant code, 10% faster still
/*************************************************
* CALLING:
* >> u_z_mex = hash(u_xi,u_eta,var_z_analyse,a_I);
* COMPILING:
* >> mex -O hash.c
************************************************/
#include "mex.h"
#include "matrix.h"
#define XI prhs[0]
#define ETA prhs[1]
#define VAR_Z_ANALYSE prhs[2]
#define A_I prhs[3]
#define UZ plhs[0]
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
int k, M;
double eta_k, xi_k, s, ss;
double *Ak, *u_eta, *u_xi, *a_I, *uz, *var_z_analyse;
var_z_analyse = (double*)mxGetData(VAR_Z_ANALYSE);
u_eta = (double*)mxGetData(ETA);
u_xi = (double*)mxGetData(XI);
a_I = (double*)mxGetData(A_I);
M = (int)mxGetNumberOfElements(A_I);
UZ = mxCreateDoubleMatrix(1,M,mxREAL);
uz = (double*)mxGetData(UZ);
for (k=M; k--;)
{
Ak = var_z_analyse + 9*(int)(*a_I++);
eta_k = *u_eta++;
xi_k = *u_xi++;
s = *(--Ak);
s = s*xi_k + *(--Ak);
s = s*xi_k + *(--Ak);
ss = *(--Ak);
ss = ss*xi_k + *(--Ak);
ss = ss*xi_k + *(--Ak);
s = s*eta_k + ss;
ss = *(--Ak);
ss = ss*xi_k + *(--Ak);
ss = ss*xi_k + *(--Ak);
*uz++ = s*eta_k + ss;
}
}
ConvexHull
ConvexHull 2019 年 8 月 12 日
Well done!

サインインしてコメントする。

その他の回答 (0 件)

カテゴリ

Help Center および File ExchangeMATLAB Compiler についてさらに検索

タグ

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!

Translated by