GPU speed up, matrix multiplication
22 ビュー (過去 30 日間)
古いコメントを表示
I use a GPU (Tesla K80) to speed up matrix multiplication in MATLAB R2016a with CUDA 7.5. At first the procedure runs fast, about 0.0001 s per loop; after a certain number of iterations it runs slowly, about 0.04 s per loop.
############################ main.m
clear;
A = 100 * 100000;
C = 100 * 100000;
for i = 1:10000
    tic; B = MatrixMul(A, C); toc;
end
############## MatrixMul.cu

if true
#include <climits>

#include "mex.h"
#include "gpu/mxGPUArray.h"
/**
 * Device kernel: each thread computes one element of B, where
 * B(x, y) = sum over j of A(x, j) * C(y, j), i.e. B = A * C^T with all three
 * matrices addressed in column-major order (element (r, c) at [rows * c + r]).
 *
 * @param A      rowsA x colsA input matrix (device pointer).
 * @param C      rowsC x colsA input matrix (device pointer); the kernel reads
 *               colsA columns from C, so callers must ensure colsC == colsA.
 * @param B      rowsA x rowsC output matrix (device pointer); every element
 *               with linear index below rowsA * rowsC is overwritten.
 * @param N      Number of output elements; kept for interface compatibility
 *               (the bounds check uses rowsA * rowsC).
 * @param rowsA  Number of rows of A (and of B).
 * @param rowsC  Number of rows of C (and number of columns of B).
 * @param colsA  Number of columns of A; length of each dot product.
 * @param colsC  Number of columns of C; unused, kept for interface compatibility.
 */
void __global__ TimesTwo(double const * const A,
                         double const * const C,
                         double * const B,
                         int const N,
                         int const rowsA,
                         int const rowsC,
                         int const colsA,
                         int const colsC)
{
    /* Linear index of the output element handled by this thread. */
    int const i = blockDim.x * blockIdx.x + threadIdx.x;

    /* Threads past the end of B (last, partially filled block) do nothing. */
    if (i < rowsA * rowsC) {
        int const co_x = i % rowsA;  /* row of B / row of A */
        int const co_y = i / rowsA;  /* column of B / row of C */

        /* Accumulate locally and store once, instead of a read-modify-write
           on B[i] in every iteration. */
        double sum = 0.0;
        for (int j = 0; j < colsA; j++) {
            /* Widen before multiplying so rows * j cannot overflow int. */
            size_t const offA = static_cast<size_t>(rowsA) * j + co_x;
            size_t const offC = static_cast<size_t>(rowsC) * j + co_y;
            sum += A[offA] * C[offC];
        }
        B[i] = sum;
    }
}
/**
 * MEX gateway for B = MatrixMul(A, C).
 *
 * Takes two real double matrices with the same number of columns, launches
 * the TimesTwo kernel with one thread per output element, and returns the
 * size(A,1)-by-size(C,1) result as a gpuArray in plhs[0].
 *
 * @param nlhs  Number of requested outputs (at most 1).
 * @param plhs  Output array; plhs[0] receives the result.
 * @param nrhs  Number of inputs (must be 2).
 * @param prhs  Input arrays: prhs[0] = A, prhs[1] = C.
 *
 * Raises the MATLAB error "parallel:gpu:mexGPUExample:InvalidInput" when the
 * argument count is wrong, an input is not a real double 2-D array, the
 * column counts differ, or the sizes do not fit the kernel's int parameters.
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
{
    char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
    char const * const errMsg = "Invalid input to MEX file.";
    int const threadsPerBlock = 256;

    /* Initialize the MathWorks GPU API. */
    mxInitGPU();

    /* Reject a wrong argument count before touching prhs[]. */
    if (nrhs != 2 || nlhs > 1) {
        mexErrMsgIdAndTxt(errId, errMsg);
    }

    mxGPUArray const *A = mxGPUCreateFromMxArray(prhs[0]);
    mxGPUArray const *C = mxGPUCreateFromMxArray(prhs[1]);

    /* The kernel reinterprets the data as real doubles and indexes it as 2-D. */
    bool valid = mxGPUGetClassID(A) == mxDOUBLE_CLASS
              && mxGPUGetClassID(C) == mxDOUBLE_CLASS
              && mxGPUGetComplexity(A) == mxREAL
              && mxGPUGetComplexity(C) == mxREAL
              && mxGPUGetNumberOfDimensions(A) == 2
              && mxGPUGetNumberOfDimensions(C) == 2;

    mwSize nrowsA = 0;
    mwSize ncolsA = 0;
    mwSize nrowsC = 0;
    mwSize ncolsC = 0;
    if (valid) {
        /* mxGPUGetDimensions returns a copy that the caller must release. */
        mwSize const *dimsA = mxGPUGetDimensions(A);
        mwSize const *dimsC = mxGPUGetDimensions(C);
        nrowsA = dimsA[0];
        ncolsA = dimsA[1];
        nrowsC = dimsC[0];
        ncolsC = dimsC[1];
        mxFree(const_cast<mwSize *>(dimsA));
        mxFree(const_cast<mwSize *>(dimsC));

        mwSize const maxInt = static_cast<mwSize>(INT_MAX);
        /* The kernel reads ncolsA columns from both inputs, so they must match. */
        valid = (ncolsA == ncolsC)
             /* The kernel takes its sizes as int: every one must fit. */
             && nrowsA <= maxInt && nrowsC <= maxInt && ncolsA <= maxInt
             && (nrowsA == 0 || nrowsC <= maxInt / nrowsA);
    }

    if (!valid) {
        /* Release the GPU handles first: mexErrMsgIdAndTxt does not return. */
        mxGPUDestroyGPUArray(A);
        mxGPUDestroyGPUArray(C);
        mexErrMsgIdAndTxt(errId, errMsg);
    }

    double const *d_A = static_cast<double const *>(mxGPUGetDataReadOnly(A));
    double const *d_C = static_cast<double const *>(mxGPUGetDataReadOnly(C));

    /* The output is left uninitialized: the kernel writes every element. */
    mwSize dims[2] = {nrowsA, nrowsC};
    mxGPUArray *B = mxGPUCreateGPUArray(2, dims, mxGPUGetClassID(A),
                                        mxGPUGetComplexity(A),
                                        MX_GPU_DO_NOT_INITIALIZE);
    double *d_B = static_cast<double *>(mxGPUGetData(B));

    int const N = static_cast<int>(nrowsA * nrowsC);
    /* A zero-block grid is an invalid launch configuration; skip empty outputs. */
    if (N > 0) {
        /* Round up so a partially filled last block covers the tail. */
        int const blocksPerGrid = static_cast<int>(
            (static_cast<size_t>(N) + threadsPerBlock - 1) / threadsPerBlock);
        TimesTwo<<<blocksPerGrid, threadsPerBlock>>>(
            d_A, d_C, d_B, N,
            static_cast<int>(nrowsA), static_cast<int>(nrowsC),
            static_cast<int>(ncolsA), static_cast<int>(ncolsC));
    }

    /* Wrap the result for return to MATLAB as a gpuArray. */
    plhs[0] = mxGPUCreateMxArrayOnGPU(B);

    /* Destroy the mxGPUArray handles created in this function. */
    mxGPUDestroyGPUArray(A);
    mxGPUDestroyGPUArray(B);
    mxGPUDestroyGPUArray(C);
}

<<

>>
end
0 件のコメント
回答 (1 件)
Joss Knight
2018 年 4 月 28 日
tic and toc are not giving the correct timings for your first set of iterations, because your kernels are launching asynchronously. You need to use gputimeit or add a call to wait(gpuDevice).
Also, your kernel is not efficient; you should be using cuBLAS to perform matrix multiplication.
0 件のコメント
参考
カテゴリ
Help Center および File Exchange で GPU CUDA and MEX Programming についてさらに検索
Community Treasure Hunt
Find the treasures in MATLAB Central and discover how the community can help you!
Start Hunting!