classdef ExternalLib_API < coder.ExternalDependency
    %#codegen
    %EXTERNALLIB_API Code-generation interface to a cuBLAS matrix multiply.
    %   Registers the include path and link objects needed to call the
    %   CUDA runtime and cuBLAS from generated code, and exposes
    %   GPU_MatrixMultiply, which runs A*B in MATLAB and calls cublasSgemm
    %   in generated code.

    methods (Static)
        function bName = getDescriptiveName(~)
            %GETDESCRIPTIVENAME Return the name used to identify this dependency.
            bName = 'ExternalLib_API';
        end

        function tf = isSupportedContext(ctx)
            %ISSUPPORTEDCONTEXT Accept only builds that target the MATLAB host.
            %   Returns true for a MATLAB host target and raises an error
            %   for every other target.
            if ctx.isMatlabHostTarget()
                tf = true;
            else
                error('CUBLAS library not available for this target');
            end
        end

        function updateBuildInfo(buildInfo, ctx)
            %UPDATEBUILDINFO Add the include path and link libraries to the build.
            %   buildInfo - build information object updated in place
            %   ctx       - build context, queried for the library extension
            [~, linkLibExt, ~, ~] = ctx.getStdLibInfo();

            % Header search path; the headers themselves are pulled in
            % with coder.cinclude inside GPU_MatrixMultiply.
            hdrFilePath = 'C:\My_Includes';
            buildInfo.addIncludePaths(hdrFilePath);

            % Settings shared by both link objects.
            linkPath = 'C:\My_Libs';
            linkPriority = '';
            linkPrecompiled = true;
            linkLinkOnly = true;
            group = '';

            % cuBLAS library
            linkFiles = strcat('libcublas', linkLibExt);
            buildInfo.addLinkObjects(linkFiles, linkPath, ...
                linkPriority, linkPrecompiled, linkLinkOnly, group);

            % CUDA runtime library
            linkFiles = strcat('libcudart', linkLibExt);
            buildInfo.addLinkObjects(linkFiles, linkPath, ...
                linkPriority, linkPrecompiled, linkLinkOnly, group);
        end

        % API for library function 'cuda_MatrixMultiply'
        function C = GPU_MatrixMultiply(A, B)
            %GPU_MATRIXMULTIPLY Single-precision matrix product C = A*B.
            %   A, B - single-precision matrices; in generated code the
            %          number of columns of A must equal the number of
            %          rows of B
            %   C    - single-precision size(A,1)-by-size(B,2) result
            %
            %   In MATLAB the product is evaluated directly. In generated
            %   code the operands are copied to the GPU, multiplied with
            %   cublasSgemm, and the result is copied back. Every CUDA and
            %   cuBLAS status code is checked with assert.
            assert(isa(A,'single'), 'A must be single.');
            assert(isa(B,'single'), 'B must be single.');

            if coder.target('MATLAB')
                C = A*B;
            else
                % Headers for the external functions and typedefs; the
                % include path is registered in updateBuildInfo.
                coder.cinclude('"cuda_runtime.h"');
                coder.cinclude('"cublas_v2.h"');

                % The copy of B below transfers k*n elements, so the inner
                % dimensions must agree or the copy would read past B.
                assert(size(A, 2) == size(B, 1), ...
                    'Inner matrix dimensions of A and B must agree.');

                % Matrix dimensions as C ints for the cuBLAS call
                m = int32(size(A, 1));
                k = int32(size(A, 2));
                n = int32(size(B, 2));

                % Define type and size for result
                C = zeros(m, n, 'single');

                % An empty operand yields an all-zero (or empty) product,
                % matching A*B. Skip the GPU: cuBLAS rejects a leading
                % dimension of zero.
                if m == 0 || n == 0 || k == 0
                    return;
                end

                % Buffer sizes in bytes (single = 4 bytes). Computed in
                % uint64 so large matrices do not saturate int32.
                bytesPerElement = uint64(4);
                size_A = uint64(m) * uint64(k) * bytesPerElement;
                size_B = uint64(k) * uint64(n) * bytesPerElement;
                size_C = uint64(m) * uint64(n) * bytesPerElement;

                % Pointers to the matrices on the GPU
                d_A = coder.opaque('float*');
                d_B = coder.opaque('float*');
                d_C = coder.opaque('float*');

                % CUDA status variable and the success value to compare to
                cudaStatus = coder.opaque('cudaError_t');
                cudaSuccessV = coder.opaque('cudaError_t', ...
                    'cudaSuccess');

                % Allocate memory on the GPU
                cudaStatus = coder.ceval('cudaMalloc', ...
                    coder.wref(d_A), size_A);
                assert(cudaStatus == cudaSuccessV, ...
                    'cudaMalloc(A) failed');
                cudaStatus = coder.ceval('cudaMalloc', ...
                    coder.wref(d_B), size_B);
                assert(cudaStatus == cudaSuccessV, ...
                    'cudaMalloc(B) failed');
                cudaStatus = coder.ceval('cudaMalloc', ...
                    coder.wref(d_C), size_C);
                assert(cudaStatus == cudaSuccessV, ...
                    'cudaMalloc(C) failed');

                % Copy the operands to the GPU. C is not uploaded: with
                % beta = 0 cublasSgemm only writes C.
                hostToDevice = coder.opaque('cudaMemcpyKind', ...
                    'cudaMemcpyHostToDevice');
                cudaStatus = coder.ceval('cudaMemcpy', ...
                    d_A, coder.rref(A), size_A, hostToDevice);
                assert(cudaStatus == cudaSuccessV, 'cudaMemcpy(A) failed');
                cudaStatus = coder.ceval('cudaMemcpy', ...
                    d_B, coder.rref(B), size_B, hostToDevice);
                assert(cudaStatus == cudaSuccessV, 'cudaMemcpy(B) failed');

                % cuBLAS handle and the success value to compare to
                handle = coder.opaque('cublasHandle_t');
                blasSuccess = coder.opaque('cublasStatus_t', ...
                    'CUBLAS_STATUS_SUCCESS');

                % Initialize external library
                ret = coder.opaque('cublasStatus_t');
                ret = coder.ceval('cublasCreate', coder.wref(handle));
                assert(ret == blasSuccess, 'cublasCreate failed');

                noTranspose = coder.opaque('cublasOperation_t', ...
                    'CUBLAS_OP_N');
                alpha = single(1);
                beta = single(0);

                % C = alpha*A*B + beta*C on the GPU. All matrices are
                % column-major, so the leading dimensions are the row
                % counts: lda = m, ldb = k, ldc = m.
                ret = coder.ceval('cublasSgemm', handle, ...
                    noTranspose, noTranspose, m, n, k, ...
                    coder.rref(alpha), d_A, m, ...
                    d_B, k, ...
                    coder.rref(beta), d_C, m);
                assert(ret == blasSuccess, 'cublasSgemm failed');

                % Copy result back to local host
                deviceToHost = coder.opaque('cudaMemcpyKind', ...
                    'cudaMemcpyDeviceToHost');
                cudaStatus = coder.ceval('cudaMemcpy', coder.wref(C), ...
                    d_C, size_C, deviceToHost);
                assert(cudaStatus == cudaSuccessV, 'cudaMemcpy(C) failed');

                % Release the cuBLAS handle and the GPU buffers so that
                % repeated calls do not leak.
                ret = coder.ceval('cublasDestroy', handle);
                assert(ret == blasSuccess, 'cublasDestroy failed');
                cudaStatus = coder.ceval('cudaFree', d_A);
                assert(cudaStatus == cudaSuccessV, 'cudaFree(A) failed');
                cudaStatus = coder.ceval('cudaFree', d_B);
                assert(cudaStatus == cudaSuccessV, 'cudaFree(B) failed');
                cudaStatus = coder.ceval('cudaFree', d_C);
                assert(cudaStatus == cudaSuccessV, 'cudaFree(C) failed');
            end
        end
    end
end