24 #ifndef SpatialOps_CudaMatVecUtility_h 25 #define SpatialOps_CudaMatVecUtility_h 27 #include <cuda_runtime.h> 28 #include <cublas_v2.h> 32 #define SPATIALOPS_GPU_FLT_RADIX FLT_RADIX 36 namespace CudaMatVecUtility
// Status codes reported per-matrix by the batched eigenvalue kernels below.
// NOTE(review): only part of the enum is visible in this view; SUCCESS and
// MAX_ITERATION are returned by hqr_helper later in this file — confirm the
// full member list against the complete header.
38 namespace MatVecStatus
43 IMAGINARY_NUMBER_FOUND,
// Alias so callers can write MatVecStatus_e without the namespace qualifier.
46 typedef MatVecStatus::MatVecStatus_e MatVecStatus_e;
// Options controlling how complex eigenvalues are treated.
// NOTE(review): the enum body is not visible here; DISREGARD_IMAGINARY and
// PROHIBIT_IMAGINARY are referenced by hqr/hqr_helper below — confirm against
// the complete header.
48 namespace MatVecOptions
// Alias so callers can write MatVecOptions_e without the namespace qualifier.
56 typedef MatVecOptions::MatVecOptions_e MatVecOptions_e;
/**
 * Validate a cuBLAS status code and throw on failure.
 *
 * \param err  status returned by a cuBLAS call
 * \param file source file of the call site (typically __FILE__)
 * \param line source line of the call site (typically __LINE__)
 *
 * \throws std::runtime_error when err != CUBLAS_STATUS_SUCCESS, with a message
 *         containing the file, line, and the numeric cuBLAS status code.
 */
inline void _cublas_err_chk( cublasStatus_t err,
                             const char* file,
                             const int line )
{
  if( err != CUBLAS_STATUS_SUCCESS ){
    std::ostringstream msg;
    msg << "Nebo error in CUBLAS execution.\n"
        << "CUBLAS error in " << file << ":" << line << ".\n"
        << "\tError: " << err << ".\n";
    throw std::runtime_error( msg.str() );
  }
}
70 extern "C" void cublas_err_chk(cublasStatus_t err) { _cublas_err_chk(err, __FILE__, __LINE__); }
// Kernel: grid-stride loop over points in the x dimension; for each point x,
// gather that point's matrix from the pointer-per-element layout `source`
// into the contiguous buffer destination[x] in column-major order (the
// transposition happens in mat_vec_copy_inner_column_major).
// NOTE(review): the final parameter (named `xyz` judging by the loop bound —
// presumably the number of spatial points) and the closing braces fall on
// lines not visible in this view; confirm against the full header.
72 template<
typename ValT>
73 __global__
void mat_vec_copy_column_major( ValT *
const *
const destination,
74 ValT
const *
const *
const source,
75 int const matrixWidth,
76 int const elementsTotal,
79 int x = blockIdx.x * blockDim.x + threadIdx.x;
81 for ( ; x < xyz ; x += blockDim.x * gridDim.x ){
82 mat_vec_copy_inner_column_major( destination[x], source, matrixWidth, elementsTotal, x);
// Device helper: grid-stride loop over the y dimension copies the
// elementsTotal matrix entries belonging to spatial point `xyz`.
// The index expression (y/matrixWidth) + (y%matrixWidth)*matrixWidth maps a
// row-major linear index y = row*matrixWidth + col to the column-major offset
// col*matrixWidth + row, i.e. it transposes while copying.
// NOTE(review): the trailing parameter (`xyz`, used as the inner subscript)
// is declared on a line not visible in this view — confirm.
85 template<
typename ValT>
86 __device__ __inline__
void mat_vec_copy_inner_column_major( ValT *
const destination,
87 ValT
const *
const *
const source,
88 int const matrixWidth,
89 int const elementsTotal,
92 int y = blockIdx.y * blockDim.y + threadIdx.y;
93 for ( ; y < elementsTotal; y += blockDim.y * gridDim.y ){
94 destination[y] = source[(y/matrixWidth)+((y%matrixWidth)*matrixWidth)][xyz];
// Kernel: grid-stride loop over points in the x dimension; for each point x,
// gather that point's matrix from the pointer-per-element layout `source`
// into the contiguous buffer destination[x] in row-major order (see
// mat_vec_copy_inner_row_major).
// NOTE(review): the final parameter (named `xyz` judging by the loop bound)
// and the closing braces fall on lines not visible in this view — confirm
// against the full header.
98 template<
typename ValT>
99 __global__
void mat_vec_copy_row_major( ValT *
const *
const destination,
100 ValT
const *
const *
const source,
101 int const matrixWidth,
102 int const elementsTotal,
105 int x = blockIdx.x * blockDim.x + threadIdx.x;
107 for ( ; x < xyz ; x += blockDim.x * gridDim.x ){
108 mat_vec_copy_inner_row_major( destination[x], source, matrixWidth, elementsTotal, x);
// Device helper: grid-stride loop over the y dimension copies the
// elementsTotal matrix entries belonging to spatial point `xyz`, preserving
// row-major order.
// NOTE(review): (y%matrixWidth) + (y/matrixWidth)*matrixWidth algebraically
// equals y — presumably written this way for symmetry with the column-major
// variant above.
// NOTE(review): the trailing parameter (`xyz`, used as the inner subscript)
// is declared on a line not visible in this view — confirm.
111 template<
typename ValT>
112 __device__ __inline__
void mat_vec_copy_inner_row_major( ValT *
const destination,
113 ValT
const *
const *
const source,
114 int const matrixWidth,
115 int const elementsTotal,
118 int y = blockIdx.y * blockDim.y + threadIdx.y;
119 for ( ; y < elementsTotal; y += blockDim.y * gridDim.y ){
120 destination[y] = source[(y%matrixWidth)+((y/matrixWidth)*matrixWidth)][xyz];
/**
 * Record an event on \c field's stream and make the streams of the entries
 * of \c other wait on it.
 *
 * \param field          object exposing get_last_event()/get_stream()
 * \param other          container whose at(i) entries expose get_stream()
 * \param otherLength    number of entries in \c other
 * \param usingFirstField when true, entry 0 of \c other is skipped
 *
 * NOTE(review): return codes of cudaEventRecord/cudaStreamWaitEvent are not
 * checked, matching the original implementation.
 */
template<typename FieldT, typename MatT>
void sync_field_streams( FieldT field, MatT other,
                         size_t otherLength,
                         const bool usingFirstField )
{
  cudaEventRecord( field.get_last_event(), field.get_stream() );
  // Skip index 0 when the first field is the one being synchronized.
  const size_t stop = usingFirstField ? 1 : 0;
  for( size_t idx = otherLength; idx-- > stop; ){
    cudaStreamWaitEvent( other.at(idx).get_stream(), field.get_last_event(), 0 );
  }
}
/**
 * For each entry of \c field, record an event on that entry's stream and make
 * \c other's stream wait on it (the mirror image of the overload above).
 *
 * \param field          container whose at(i) entries expose
 *                       get_last_event()/get_stream()
 * \param otherLength    number of entries in \c field to synchronize
 * \param other          object exposing get_stream()
 * \param usingFirstField when true, entry 0 of \c field is skipped
 *
 * NOTE(review): return codes of cudaEventRecord/cudaStreamWaitEvent are not
 * checked, matching the original implementation.
 */
template<typename FieldT, typename MatT>
void sync_field_streams( MatT field,
                         size_t otherLength, FieldT other,
                         const bool usingFirstField )
{
  // Skip index 0 when the first field is the one being synchronized.
  const size_t stop = usingFirstField ? 1 : 0;
  for( size_t idx = otherLength; idx-- > stop; ){
    cudaEventRecord( field.at(idx).get_last_event(), field.at(idx).get_stream() );
    cudaStreamWaitEvent( other.get_stream(), field.at(idx).get_last_event(), 0 );
  }
}
// Debug helper: copy `size` elements from device pointer d_values (resident
// on device `deviceIndex`) to a host buffer and print them to std::cout,
// preceded by the label string s.
// NOTE(review): the declaration/allocation of the host buffer h_local falls
// on a line not visible in this view — confirm it is sized >= size elements.
146 template<
typename ValT>
147 void print_device_values(ValT* d_values,
size_t const size,
short int deviceIndex, std::string s)
150 ema::cuda::CUDADeviceInterface& CDI = ema::cuda::CUDADeviceInterface::self();
151 CDI.memcpy_from(h_local, d_values, size*
sizeof(ValT), deviceIndex);
152 std::cout << s << std::endl;
153 for(
size_t i = 0; i < size; i++)
155 std::cout << h_local[i] << std::endl;
157 std::cout << std::endl;
/**
 * Map a one-based (i, j) matrix subscript to a zero-based flat offset in a
 * dim x dim matrix stored row-by-row: offset = (i-1)*dim + (j-1).
 *
 * Used throughout the Numerical-Recipes-style routines below, which index
 * matrices from 1.
 *
 * \param i   one-based row index
 * \param j   one-based column index
 * \param dim matrix dimension (row length)
 * \return    zero-based flat offset
 *
 * Changes from the original: the template header (elided in the reviewed
 * listing) is restored, and the meaningless top-level const on the by-value
 * return type is dropped — callers are unaffected.
 */
template<typename T>
__device__ __inline__
T flat_index_one_based( T const i, T const j, T const dim )
{
  return (i-1)*dim + (j-1);
}
// Device-side three-argument swap used by elmhes below.
// NOTE(review): the body is not visible in this view; judging by call sites
// (swap(a, b, y) with a scratch variable y), it presumably exchanges g and h
// using y as the temporary — confirm against the full header.
166 template<
typename ValT>
167 __device__ __inline__
void swap(ValT& g, ValT& h, ValT& y)
// Batched matrix balancing kernel, Numerical-Recipes-style `balanc`: rescales
// rows/columns of each n x n matrix by powers of FLT_RADIX (exact in floating
// point) until row and column norms are comparable, which improves the
// accuracy of the eigenvalue iteration that follows.
// One matrix per thread, grid-stride over the batch. Matrices are stored
// element-major: mats[flatElementIndex][matrixIndex], with one-based (i,j)
// subscripts via flat_index_one_based.
// NOTE(review): the loop variables i/j, the outer convergence loop, and
// several statements fall on lines not visible in this view — confirm against
// the full header.
// NOTE(review): locals are declared `float` even though the element type is
// ValT; for ValT = double this loses precision in the norm accumulation —
// verify intent.
172 template<
typename ValT>
173 __global__
void balanc(ValT *
const *
const mats,
int const n,
size_t const numberofMatrices)
175 int matNumber = blockIdx.x * blockDim.x + threadIdx.x;
176 for ( ; matNumber < numberofMatrices ; matNumber += blockDim.x * gridDim.x ){
178 float s,r,g,f,c,sqrdx;
180 sqrdx=SPATIALOPS_GPU_FLT_RADIX*SPATIALOPS_GPU_FLT_RADIX;
// Accumulate column (c) and row (r) absolute sums for row/column i.
188 c += ::fabs(mats[flat_index_one_based(j,i,n)][matNumber]);
189 r += ::fabs(mats[flat_index_one_based(i,j,n)][matNumber]);
// Find the power of the radix closest to balancing c against r.
192 g=r/SPATIALOPS_GPU_FLT_RADIX;
196 f *= SPATIALOPS_GPU_FLT_RADIX;
199 g=r*SPATIALOPS_GPU_FLT_RADIX;
201 f /= SPATIALOPS_GPU_FLT_RADIX;
// Apply the similarity transform only if it meaningfully reduces the norm.
204 if ((c+r)/f < 0.95*s) {
207 for (j=1;j<=n;j++) mats[flat_index_one_based(i,j,n)][matNumber] *= g;
208 for (j=1;j<=n;j++) mats[flat_index_one_based(j,i,n)][matNumber] *= f;
// Batched reduction to upper Hessenberg form, Numerical-Recipes-style
// `elmhes`: Gaussian elimination with pivoting applied as similarity
// transforms, eliminating entries below the first subdiagonal column by
// column. One matrix per thread, grid-stride over the batch; element-major
// storage mats[flatElementIndex][matrixIndex], one-based (i,j) subscripts.
// NOTE(review): declarations of i/j/m/x/y, the pivot bookkeeping, and the
// closing braces fall on lines not visible in this view — confirm against
// the full header.
216 template<
typename ValT>
217 __global__
void elmhes(ValT *
const *
const mats,
int const n,
size_t const numberofMatrices)
220 int matNumber = blockIdx.x * blockDim.x + threadIdx.x;
221 for ( ; matNumber < numberofMatrices ; matNumber += blockDim.x * gridDim.x ){
// Eliminate column m-1 below the subdiagonal, for m = 2..n-1.
225 for(m = 2; m < n; m++) {
// Partial pivoting: find the largest entry in column m-1 at or below row m.
228 for(j = m; j <= n; j++) {
229 if(::fabs(mats[flat_index_one_based(j, m-1, n)][matNumber]) > ::fabs(x)) {
230 x = mats[flat_index_one_based(j, m-1, n)][matNumber];
// Swap rows i<->m and columns i<->m (a similarity transform).
235 for(j = m-1; j<= n; j++) swap(mats[flat_index_one_based(i,j,n)][matNumber], mats[flat_index_one_based(m,j,n)][matNumber],y);
236 for(j = 1; j<= n; j++) swap(mats[flat_index_one_based(j,i,n)][matNumber], mats[flat_index_one_based(j,m,n)][matNumber],y);
// Eliminate below the subdiagonal: subtract y times row m, add y times
// column i, storing the multiplier y in the eliminated position.
239 for(i = m+1; i <= n; i++) {
240 if((y=mats[flat_index_one_based(i, m-1, n)][matNumber]) != 0.0) {
242 mats[flat_index_one_based(i, m-1, n)][matNumber] = y;
243 for(j = m; j <= n; j++)
244 mats[flat_index_one_based(i, j, n)][matNumber] -= y*mats[flat_index_one_based(m, j, n)][matNumber];
245 for(j = 1; j <= n; j++)
246 mats[flat_index_one_based(j, m, n)][matNumber] += y*mats[flat_index_one_based(j, i, n)][matNumber];
// Numerical-Recipes-style `hqr`: Francis double-shift QR iteration on an
// upper Hessenberg matrix, computing all eigenvalues of matrix `matNumber`
// in the batch. Real parts go to wr[k][matNumber]; imaginary parts go to
// wi[k][matNumber] unless `option` disables them (DISREGARD_IMAGINARY skips
// the writes; PROHIBIT_IMAGINARY additionally makes a complex pair an error).
// wr/wi are expected pre-shifted for one-based indexing (the hqr kernel below
// passes wr-1 / wi-1).
// Returns MatVecStatus::SUCCESS, MatVecStatus::MAX_ITERATION if a block fails
// to converge in 30 iterations, or MatVecStatus::IMAGINARY_NUMBER_FOUND when
// a complex pair is met under PROHIBIT_IMAGINARY.
// NOTE(review): many lines of the original (initializations of nn/t/its, the
// deflation bookkeeping, and most brace/else lines) are not visible in this
// view; the comments below annotate only the visible statements — confirm
// against the full header.
254 template<
typename ValT>
255 __device__ __inline__ MatVecStatus_e hqr_helper(ValT *
const *
const mats,
int const matNumber,
int const n, ValT*
const *
const wr, ValT*
const *
const wi, MatVecOptions_e
const option)
257 int nn, m, l, k, j, its, i, mmin;
258 ValT z, y, x, w, v, u, t, s, r, q, p, anorm, maxTmp;
// Compute anorm, the norm of the Hessenberg part of the matrix, used in the
// small-subdiagonal tests below.
261 for(i=1; i<=n; i++) {
263 for(j=(maxTmp<1?1:maxTmp); j<=n; j++) {
264 anorm += ::fabs(mats[flat_index_one_based(i, j, n)][matNumber]);
// Look for a negligible subdiagonal element to split off a trailing block.
273 for (l=nn;l>=2;l--) {
274 s=::fabs(mats[flat_index_one_based(l-1, l-1, n)][matNumber])+::fabs(mats[flat_index_one_based(l,l,n)][matNumber]);
276 if((ValT)(::fabs(mats[flat_index_one_based(l,l-1,n)][matNumber]) + s) == s)
break;
278 x=mats[flat_index_one_based(nn,nn,n)][matNumber];
// 1x1 block: one real eigenvalue (t accumulates exceptional shifts).
280 wr[nn][matNumber]=x+t;
281 if(!(option == MatVecOptions::DISREGARD_IMAGINARY || option == MatVecOptions::PROHIBIT_IMAGINARY))
282 wi[nn][matNumber]=0.0;
// 2x2 block: solve its characteristic equation directly.
285 y=mats[flat_index_one_based(nn-1,nn-1,n)][matNumber];
286 w=mats[flat_index_one_based(nn, nn-1,n)][matNumber]*mats[flat_index_one_based(nn-1,nn,n)][matNumber];
293 z=p+(p>=0.0?::fabs(z):-::fabs(z));
// Real pair of eigenvalues.
294 wr[nn-1][matNumber]=wr[nn][matNumber]=x+z;
295 if (z) wr[nn][matNumber]=x-w/z;
296 if(!(option == MatVecOptions::DISREGARD_IMAGINARY || option == MatVecOptions::PROHIBIT_IMAGINARY))
297 wi[nn-1][matNumber]=wi[nn][matNumber]=0.0;
// Complex conjugate pair.
299 if(option != MatVecOptions::PROHIBIT_IMAGINARY)
301 wr[nn-1][matNumber]=wr[nn][matNumber]=x+p;
302 if(option != MatVecOptions::DISREGARD_IMAGINARY)
303 wi[nn-1][matNumber]= -(wi[nn][matNumber]=z);
307 return MatVecStatus::IMAGINARY_NUMBER_FOUND;
// Give up on this block after 30 QR iterations.
312 if (its == 30)
return MatVecStatus::MAX_ITERATION;
// Exceptional shift every 10 iterations to break convergence stalls.
313 if (its == 10 || its == 20) {
315 for (i=1; i<=nn;i++) mats[flat_index_one_based(i,i,n)][matNumber] -= x;
316 s=::fabs(mats[flat_index_one_based(nn,nn-1,n)][matNumber])+::fabs(mats[flat_index_one_based(nn-1,nn-2,n)][matNumber]);
// Find two consecutive small subdiagonal elements to start the double shift.
321 for (m=(nn-2);m>=l;m--) {
322 z=mats[flat_index_one_based(m,m,n)][matNumber];
325 p=(r*s-w)/mats[flat_index_one_based(m+1,m,n)][matNumber]+mats[flat_index_one_based(m,m+1,n)][matNumber];
326 q=mats[flat_index_one_based(m+1,m+1,n)][matNumber]-z-r-s;
327 r=mats[flat_index_one_based(m+2,m+1,n)][matNumber];
328 s=::fabs(p)+::fabs(q)+::fabs(r);
333 u=::fabs(mats[flat_index_one_based(m,m-1,n)][matNumber])*(::fabs(q)+::fabs(r));
334 v=::fabs(p)*(::fabs(mats[flat_index_one_based(m-1,m-1,n)][matNumber])+::fabs(z)+::fabs(mats[flat_index_one_based(m+1,m+1,n)][matNumber]));
335 if ((ValT)(u+v) == v)
break;
// Zero the entries that the bulge chase will not touch.
337 for (i=m+2;i<=nn;i++) {
338 mats[flat_index_one_based(i, i-2,n)][matNumber]=0.0;
339 if (i != (m+2)) mats[flat_index_one_based(i, i-3,n)][matNumber]=0.0;
// Double QR step (bulge chase) on rows/columns m..nn.
341 for (k=m;k<=nn-1;k++) {
343 p=mats[flat_index_one_based(k,k-1,n)][matNumber];
344 q=mats[flat_index_one_based(k+1,k-1,n)][matNumber];
346 if (k != (nn-1)) r=mats[flat_index_one_based(k+2,k-1,n)][matNumber];
347 if ((x=::fabs(p)+::fabs(q)+::fabs(r)) != 0.0) {
// Form the Householder reflector for (p, q, r).
353 maxTmp = ::sqrt(p*p+q*q+r*r);
354 if ((s=(p >= 0.0?::fabs(maxTmp):-::fabs(maxTmp))) != 0.0) {
357 mats[flat_index_one_based(k, k-1,n)][matNumber] = -mats[flat_index_one_based(k,k-1,n)][matNumber];
359 mats[flat_index_one_based(k,k-1,n)][matNumber] = -s*x;
// Row modification: apply the reflector from the left.
366 for (j=k;j<=nn;j++) {
367 p=mats[flat_index_one_based(k,j,n)][matNumber]+q*mats[flat_index_one_based(k+1,j,n)][matNumber];
369 p += r*mats[flat_index_one_based(k+2,j,n)][matNumber];
370 mats[flat_index_one_based(k+2,j,n)][matNumber] -= p*z;
372 mats[flat_index_one_based(k+1,j,n)][matNumber] -= p*y;
373 mats[flat_index_one_based(k,j,n)][matNumber] -= p*x;
// Column modification: apply the reflector from the right.
375 mmin = nn<k+3 ? nn : k+3;
376 for (i=l;i<=mmin;i++) {
377 p=x*mats[flat_index_one_based(i,k,n)][matNumber]+y*mats[flat_index_one_based(i,k+1,n)][matNumber];
379 p += z*mats[flat_index_one_based(i, k+2,n)][matNumber];
380 mats[flat_index_one_based(i,k+2,n)][matNumber] -= p*r;
382 mats[flat_index_one_based(i,k+1,n)][matNumber] -= p*q;
383 mats[flat_index_one_based(i,k,n)][matNumber] -= p;
392 return MatVecStatus::SUCCESS;
/**
 * Batched eigenvalue driver kernel: grid-stride over the batch, one matrix
 * per thread, delegating to hqr_helper and recording its per-matrix status.
 *
 * \param mats      element-major batch: mats[flatElementIndex][matrixIndex]
 * \param n         matrix dimension
 * \param batchSize number of matrices
 * \param wr        real parts of eigenvalues (written per matrix)
 * \param wi        imaginary parts; only used when `option` allows them
 * \param status    per-matrix MatVecStatus_e result, as int
 * \param option    complex-eigenvalue handling policy
 *
 * wr/wi are passed down shifted by -1 because hqr_helper indexes them with
 * one-based subscripts.
 */
template<typename ValT>
__global__
void hqr( ValT * const * const mats,
          int const n,
          int const batchSize, ValT** wr, ValT** wi,
          int status[], MatVecOptions_e const option )
{
  const bool keepImaginary = !( option == MatVecOptions::DISREGARD_IMAGINARY ||
                                option == MatVecOptions::PROHIBIT_IMAGINARY );
  for( int matIdx = blockIdx.x * blockDim.x + threadIdx.x;
       matIdx < batchSize;
       matIdx += blockDim.x * gridDim.x ){
    status[matIdx] = hqr_helper( mats, matIdx, n, wr-1, keepImaginary ? wi-1 : NULL, option );
  }
}
// Top-level batched eigen-decomposition: balance each matrix, reduce to
// Hessenberg form, then run QR iteration, writing real parts to `result`,
// imaginary parts to `resultComplex`, and per-matrix status to `status`.
// NOTE(review): balanc/elmhes/hqr are declared __global__ above, yet are
// invoked here without <<<...>>> launch configuration; as written this is not
// a valid kernel-from-kernel call — confirm whether dynamic-parallelism
// launch syntax was lost from this listing or device-callable overloads
// exist elsewhere.
405 template<
typename ValT>
406 __global__
void eigen_decomposition(ValT *
const *
const mArray,
int const n,
int const batchSize, ValT ** result, ValT ** resultComplex,
int* status, MatVecOptions_e
const option)
408 balanc(mArray, n, batchSize);
409 elmhes(mArray, n, batchSize);
410 hqr(mArray, n, batchSize, result, resultComplex, status, option);
// Type-dispatch wrappers that forward generic cublas*Batched names to the
// concrete cuBLAS batched calls: getrs (LU solve), getrf (LU factorization),
// getri (inversion from LU), and gemm (matrix multiply).
// NOTE(review): only the double-precision forwarding bodies are visible in
// this view; the enclosing struct/class declaration, most parameter lists,
// and any float (cublasS*) counterparts fall on lines not visible here —
// confirm against the full header. cuBLAS assumes column-major storage.
413 template<
typename ValT>
421 static cublasStatus_t cublasgetrsBatched( cublasHandle_t handle,
422 cublasOperation_t trans,
425 const double *Aarray[],
433 return cublasDgetrsBatched(handle,
445 static cublasStatus_t cublasgetrfBatched( cublasHandle_t handle,
453 return cublasDgetrfBatched(handle, n, Aarray, lda, PivotArray, infoArray, batchSize);
455 static cublasStatus_t cublasgetriBatched( cublasHandle_t handle,
457 const double *Aarray[],
465 return cublasDgetriBatched(handle,
476 static cublasStatus_t cublasgemmBatched( cublasHandle_t handle,
477 cublasOperation_t transa,
478 cublasOperation_t transb,
483 const double *Aarray[],
485 const double *Barray[],
492 return cublasDgemmBatched(handle,
509 #endif // SpatialOps_CudaMatVecUtility_h