// SpatialOps — CudaMatVecUtility.h
/*
 * Copyright (c) 2014-2017 The University of Utah
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 * ----------------------------------------------------------------------------
*/
23 
24 #ifndef SpatialOps_CudaMatVecUtility_h
25 #define SpatialOps_CudaMatVecUtility_h
#include <cfloat>
#include <iostream>
#include <math.h>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <cuda_runtime.h>
#include <cublas_v2.h>
31 
32 #define SPATIALOPS_GPU_FLT_RADIX FLT_RADIX
33 
34 namespace SpatialOps
35 {
36  namespace CudaMatVecUtility
37  {
 /**
  * \brief Status codes returned (per matrix) by the eigenvalue kernels.
  *
  * Wrapped in a nested namespace so the enumerators are scoped
  * (pre-C++11 idiom approximating an enum class).
  */
 namespace MatVecStatus
 {
  enum MatVecStatus_e{
   SUCCESS=0,                // computation completed normally
   MAX_ITERATION,            // QR iteration hit the 30-iteration cap without converging
   IMAGINARY_NUMBER_FOUND,   // complex eigenvalue encountered while PROHIBIT_IMAGINARY was set
  };
 }
 // Convenience alias so callers can write MatVecStatus_e directly.
 typedef MatVecStatus::MatVecStatus_e MatVecStatus_e;
47 
 /**
  * \brief Options controlling how complex eigenvalues are handled by hqr().
  */
 namespace MatVecOptions
 {
  enum MatVecOptions_e{
   NONE=0,                // write both real (wr) and imaginary (wi) parts
   DISREGARD_IMAGINARY,   // never write to the imaginary array wi (it may be NULL)
   PROHIBIT_IMAGINARY,    // abort with IMAGINARY_NUMBER_FOUND on any complex pair
  };
 }
 // Convenience alias so callers can write MatVecOptions_e directly.
 typedef MatVecOptions::MatVecOptions_e MatVecOptions_e;
57 
58  inline void _cublas_err_chk(cublasStatus_t err, const char* file, const int line)
59  {
60 #ifndef NDEBUG
61  if( err != CUBLAS_STATUS_SUCCESS ){
62  std::ostringstream msg;
63  msg << "Nebo error in CUBLAS execution.\n";
64  msg << "CUBLAS error in " << file << ":" << line << ".\n";
65  msg << "\tError: " << err << ".\n";
66  throw std::runtime_error(msg.str());
67  }
68 #endif /* NDEBUG */
69  }
70  extern "C" void cublas_err_chk(cublasStatus_t err) { _cublas_err_chk(err, __FILE__, __LINE__); }
71 
72  template<typename ValT>
73  __global__ void mat_vec_copy_column_major( ValT * const * const destination,
74  ValT const * const * const source,
75  int const matrixWidth,
76  int const elementsTotal,
77  int const xyz )
78  {
79  int x = blockIdx.x * blockDim.x + threadIdx.x;
80 
81  for ( ; x < xyz ; x += blockDim.x * gridDim.x ){
82  mat_vec_copy_inner_column_major( destination[x], source, matrixWidth, elementsTotal, x);
83  }
84  }
 /**
  * \brief Device helper: copies the entries of point `xyz`'s matrix into
  *        `destination`, one entry per y-thread (grid-stride over y).
  *
  * With r = y/matrixWidth and c = y%matrixWidth the source index is
  * r + c*matrixWidth, i.e. the (row,column) roles are swapped relative to a
  * flat read — destination receives the transpose of source's flat layout,
  * which is what converts row-major storage to the column-major order cuBLAS
  * expects.  NOTE(review): assumes elementsTotal == matrixWidth*matrixWidth —
  * confirm at call sites.
  */
 template<typename ValT>
 __device__ __inline__ void mat_vec_copy_inner_column_major( ValT * const destination,
                                                             ValT const * const * const source,
                                                             int const matrixWidth,
                                                             int const elementsTotal,
                                                             int const xyz )
 {
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  for ( ; y < elementsTotal; y += blockDim.y * gridDim.y ){
   destination[y] = source[(y/matrixWidth)+((y%matrixWidth)*matrixWidth)][xyz];
  }
 }
97 
98  template<typename ValT>
99  __global__ void mat_vec_copy_row_major( ValT * const * const destination,
100  ValT const * const * const source,
101  int const matrixWidth,
102  int const elementsTotal,
103  int const xyz )
104  {
105  int x = blockIdx.x * blockDim.x + threadIdx.x;
106 
107  for ( ; x < xyz ; x += blockDim.x * gridDim.x ){
108  mat_vec_copy_inner_row_major( destination[x], source, matrixWidth, elementsTotal, x);
109  }
110  }
111  template<typename ValT>
112  __device__ __inline__ void mat_vec_copy_inner_row_major( ValT * const destination,
113  ValT const * const * const source,
114  int const matrixWidth,
115  int const elementsTotal,
116  int const xyz )
117  {
118  int y = blockIdx.y * blockDim.y + threadIdx.y;
119  for ( ; y < elementsTotal; y += blockDim.y * gridDim.y ){
120  destination[y] = source[(y%matrixWidth)+((y/matrixWidth)*matrixWidth)][xyz];
121  }
122  }
123 
124  template<typename FieldT, typename MatT>
125  void sync_field_streams(FieldT field, MatT other, size_t otherLength, const bool usingFirstField)
126  {
127  cudaEventRecord(field.get_last_event(), field.get_stream());
128 
129  const size_t endValue = usingFirstField ? 1 : 0;
130  while( endValue < otherLength-- ){
131  cudaStreamWaitEvent(other.at(otherLength).get_stream(),field.get_last_event(), 0);
132  }
133  }
134 
135  template<typename FieldT, typename MatT>
136  void sync_field_streams(MatT field, size_t otherLength, FieldT other, const bool usingFirstField)
137  {
138  const size_t endValue = usingFirstField ? 1 : 0;
139  while( endValue < otherLength-- ){
140  cudaEventRecord(field.at(otherLength).get_last_event(), field.at(otherLength).get_stream());
141  cudaStreamWaitEvent(other.get_stream(),field.at(otherLength).get_last_event(), 0);
142  }
143 
144  }
145 
146  template<typename ValT>
147  void print_device_values(ValT* d_values, size_t const size, short int deviceIndex, std::string s)
148  {
149  ValT h_local[size];
150  ema::cuda::CUDADeviceInterface& CDI = ema::cuda::CUDADeviceInterface::self();
151  CDI.memcpy_from(h_local, d_values, size*sizeof(ValT), deviceIndex);
152  std::cout << s << std::endl;
153  for(size_t i = 0; i < size; i++)
154  {
155  std::cout << h_local[i] << std::endl;
156  }
157  std::cout << std::endl;
158  }
159 
160  template<typename T>
161  __device__ __inline__ const T flat_index_one_based(T const i, T const j, T const dim)
162  {
163  return (i-1)*dim + (j-1);
164  }
165 
166  template<typename ValT>
167  __device__ __inline__ void swap(ValT& g, ValT& h, ValT& y)
168  {
169  y = g; g = h; h = y;
170  }
171 
172  template<typename ValT>
173  __global__ void balanc(ValT * const * const mats, int const n, size_t const numberofMatrices)
174  {
175  int matNumber = blockIdx.x * blockDim.x + threadIdx.x;
176  for ( ; matNumber < numberofMatrices ; matNumber += blockDim.x * gridDim.x ){
177  int last,j,i;
178  float s,r,g,f,c,sqrdx;
179 
180  sqrdx=SPATIALOPS_GPU_FLT_RADIX*SPATIALOPS_GPU_FLT_RADIX;
181  last=0;
182  while (last == 0) {
183  last = 1;
184  for (i=1;i<=n;i++) {
185  r=c=0.0;
186  for (j=1;j<=n;j++)
187  if (j != i) {
188  c += ::fabs(mats[flat_index_one_based(j,i,n)][matNumber]);
189  r += ::fabs(mats[flat_index_one_based(i,j,n)][matNumber]);
190  }
191  if (c && r) {
192  g=r/SPATIALOPS_GPU_FLT_RADIX;
193  f=1.0;
194  s=c+r;
195  while (c<g) {
196  f *= SPATIALOPS_GPU_FLT_RADIX;
197  c *= sqrdx;
198  }
199  g=r*SPATIALOPS_GPU_FLT_RADIX;
200  while (c>g) {
201  f /= SPATIALOPS_GPU_FLT_RADIX;
202  c /= sqrdx;
203  }
204  if ((c+r)/f < 0.95*s) {
205  last=0;
206  g=1.0/f;
207  for (j=1;j<=n;j++) mats[flat_index_one_based(i,j,n)][matNumber] *= g;
208  for (j=1;j<=n;j++) mats[flat_index_one_based(j,i,n)][matNumber] *= f;
209  }
210  }
211  }
212  }
213  }
214  }
215 
 /**
  * \brief Kernel: reduces every matrix in the batch to upper Hessenberg form
  *        by Gaussian elimination with pivoting (port of Numerical Recipes'
  *        `elmhes`).  Run after balanc() and before hqr().
  *
  * One thread handles one whole matrix; a grid-stride loop covers the batch.
  * Element (i,j) (1-based) of matrix m lives at
  * mats[flat_index_one_based(i,j,n)][m].
  *
  * \param mats             n*n device pointers, each indexed by matrix number
  * \param n                matrix dimension
  * \param numberofMatrices batch size
  */
 template<typename ValT>
 __global__ void elmhes(ValT * const * const mats, int const n, size_t const numberofMatrices)
 {

  int matNumber = blockIdx.x * blockDim.x + threadIdx.x;
  for ( ; matNumber < numberofMatrices ; matNumber += blockDim.x * gridDim.x ){
   int m,j,i;
   ValT y,x;   // y doubles as the scratch slot for swap()

   for(m = 2; m < n; m++) {
    // Pivot search: largest |entry| in column m-1 at or below row m.
    x = 0.0;
    i=m;
    for(j = m; j <= n; j++) {
     if(::fabs(mats[flat_index_one_based(j, m-1, n)][matNumber]) > ::fabs(x)) {
      x = mats[flat_index_one_based(j, m-1, n)][matNumber];
      i = j;
     }
    }
    // Similarity interchange: swap rows i<->m and columns i<->m.
    if(i != m) {
     for(j = m-1; j<= n; j++) swap(mats[flat_index_one_based(i,j,n)][matNumber], mats[flat_index_one_based(m,j,n)][matNumber],y);
     for(j = 1; j<= n; j++) swap(mats[flat_index_one_based(j,i,n)][matNumber], mats[flat_index_one_based(j,m,n)][matNumber],y);
    }
    // Eliminate below the subdiagonal of column m-1; the column update
    // afterwards keeps the transform a similarity (eigenvalues unchanged).
    if(x) {
     for(i = m+1; i <= n; i++) {
      if((y=mats[flat_index_one_based(i, m-1, n)][matNumber]) != 0.0) {
       y /= x;
       mats[flat_index_one_based(i, m-1, n)][matNumber] = y;
       for(j = m; j <= n; j++)
        mats[flat_index_one_based(i, j, n)][matNumber] -= y*mats[flat_index_one_based(m, j, n)][matNumber];
       for(j = 1; j <= n; j++)
        mats[flat_index_one_based(j, m, n)][matNumber] += y*mats[flat_index_one_based(j, i, n)][matNumber];
      }
     }
    }
   }
  }
 }
253 
 /**
  * \brief Computes all eigenvalues of one upper Hessenberg matrix by the QR
  *        algorithm with implicit double shifts (port of Numerical Recipes'
  *        `hqr`), writing real parts into wr and, depending on `option`,
  *        imaginary parts into wi.
  *
  * \param mats      n*n device pointers, each indexed by matrix number;
  *                  the matrix contents are destroyed during the iteration
  * \param matNumber which matrix of the batch this call works on
  * \param n         matrix dimension
  * \param wr        eigenvalue real parts, indexed wr[1..n][matNumber]
  *                  (caller passes the array offset by -1 for 1-based access)
  * \param wi        eigenvalue imaginary parts, same indexing; may be NULL
  *                  when option disables imaginary output (never dereferenced
  *                  in that case — every wi write is guarded by `option`)
  * \param option    NONE / DISREGARD_IMAGINARY / PROHIBIT_IMAGINARY
  * \return SUCCESS, MAX_ITERATION (no convergence within 30 iterations for
  *         some eigenvalue), or IMAGINARY_NUMBER_FOUND (complex pair under
  *         PROHIBIT_IMAGINARY)
  */
 template<typename ValT>
 __device__ __inline__ MatVecStatus_e hqr_helper(ValT * const * const mats, int const matNumber, int const n, ValT* const * const wr, ValT* const * const wi, MatVecOptions_e const option)
 {
  int nn, m, l, k, j, its, i, mmin;
  ValT z, y, x, w, v, u, t, s, r, q, p, anorm, maxTmp;

  // Norm of the matrix (lower triangle excluded below the first
  // subdiagonal), used as the scale in the small-subdiagonal tests.
  anorm = 0.0;
  for(i=1; i<=n; i++) {
   maxTmp = i-1;
   for(j=(maxTmp<1?1:maxTmp); j<=n; j++) {
    anorm += ::fabs(mats[flat_index_one_based(i, j, n)][matNumber]);
   }
  }

  nn=n;      // active submatrix ends at row/column nn
  t=0.0;     // accumulated exceptional shifts
  while(nn >= 1) {
   its = 0;
   do {
    // Look for a single negligible subdiagonal element to split the matrix.
    for (l=nn;l>=2;l--) {
     s=::fabs(mats[flat_index_one_based(l-1, l-1, n)][matNumber])+::fabs(mats[flat_index_one_based(l,l,n)][matNumber]);
     if(s==0.0) s=anorm;
     // Deliberate floating-point trick: the subdiagonal is negligible when
     // adding it to s does not change s at working precision.
     if((ValT)(::fabs(mats[flat_index_one_based(l,l-1,n)][matNumber]) + s) == s) break;
    }
    x=mats[flat_index_one_based(nn,nn,n)][matNumber];
    if(l==nn) {
     // One real eigenvalue found.
     wr[nn][matNumber]=x+t;
     if(!(option == MatVecOptions::DISREGARD_IMAGINARY || option == MatVecOptions::PROHIBIT_IMAGINARY))
      wi[nn][matNumber]=0.0;
     nn--;
    } else {
     y=mats[flat_index_one_based(nn-1,nn-1,n)][matNumber];
     w=mats[flat_index_one_based(nn, nn-1,n)][matNumber]*mats[flat_index_one_based(nn-1,nn,n)][matNumber];
     if(l == (nn-1)) {
      // Trailing 2x2 block: solve its eigenvalues directly.
      p=0.5*(y-x);
      q=p*p+w;
      z=::sqrt(::fabs(q));
      x += t;
      if(q >= 0.0) {
       // Real pair.
       z=p+(p>=0.0?::fabs(z):-::fabs(z));
       wr[nn-1][matNumber]=wr[nn][matNumber]=x+z;
       if (z) wr[nn][matNumber]=x-w/z;
       if(!(option == MatVecOptions::DISREGARD_IMAGINARY || option == MatVecOptions::PROHIBIT_IMAGINARY))
        wi[nn-1][matNumber]=wi[nn][matNumber]=0.0;
      } else {
       // Complex-conjugate pair.
       if(option != MatVecOptions::PROHIBIT_IMAGINARY)
       {
        wr[nn-1][matNumber]=wr[nn][matNumber]=x+p;
        if(option != MatVecOptions::DISREGARD_IMAGINARY)
         wi[nn-1][matNumber]= -(wi[nn][matNumber]=z);
       }
       else
       {
        return MatVecStatus::IMAGINARY_NUMBER_FOUND;
       }
      }
      nn -= 2;
     } else {
      // No eigenvalue isolated yet: perform one implicit QR sweep.
      if (its == 30) return MatVecStatus::MAX_ITERATION;
      if (its == 10 || its == 20) {
       // Exceptional shift to break convergence stalls.
       t += x;
       for (i=1; i<=nn;i++) mats[flat_index_one_based(i,i,n)][matNumber] -= x;
       s=::fabs(mats[flat_index_one_based(nn,nn-1,n)][matNumber])+::fabs(mats[flat_index_one_based(nn-1,nn-2,n)][matNumber]);
       y=x=0.75*s;
       w = -0.4375*s*s;
      }
      ++its;
      // Find two consecutive small subdiagonals where the sweep can start.
      for (m=(nn-2);m>=l;m--) {
       z=mats[flat_index_one_based(m,m,n)][matNumber];
       r=x-z;
       s=y-z;
       p=(r*s-w)/mats[flat_index_one_based(m+1,m,n)][matNumber]+mats[flat_index_one_based(m,m+1,n)][matNumber];
       q=mats[flat_index_one_based(m+1,m+1,n)][matNumber]-z-r-s;
       r=mats[flat_index_one_based(m+2,m+1,n)][matNumber];
       s=::fabs(p)+::fabs(q)+::fabs(r);
       p /= s;
       q /= s;
       r /= s;
       if (m == l) break;
       u=::fabs(mats[flat_index_one_based(m,m-1,n)][matNumber])*(::fabs(q)+::fabs(r));
       v=::fabs(p)*(::fabs(mats[flat_index_one_based(m-1,m-1,n)][matNumber])+::fabs(z)+::fabs(mats[flat_index_one_based(m+1,m+1,n)][matNumber]));
       // Same precision-based negligibility test as above.
       if ((ValT)(u+v) == v) break;
      }
      // Zero the entries the sweep will ignore.
      for (i=m+2;i<=nn;i++) {
       mats[flat_index_one_based(i, i-2,n)][matNumber]=0.0;
       if (i != (m+2)) mats[flat_index_one_based(i, i-3,n)][matNumber]=0.0;
      }
      // Double-shift QR step on rows/columns l..nn via Householder-like
      // transforms of size 3 (2 at the bottom edge).
      for (k=m;k<=nn-1;k++) {
       if (k != m) {
        p=mats[flat_index_one_based(k,k-1,n)][matNumber];
        q=mats[flat_index_one_based(k+1,k-1,n)][matNumber];
        r=0.0;
        if (k != (nn-1)) r=mats[flat_index_one_based(k+2,k-1,n)][matNumber];
        if ((x=::fabs(p)+::fabs(q)+::fabs(r)) != 0.0) {
         p /= x;
         q /= x;
         r /= x;
        }
       }
       maxTmp = ::sqrt(p*p+q*q+r*r);
       if ((s=(p >= 0.0?::fabs(maxTmp):-::fabs(maxTmp))) != 0.0) {
        if (k == m) {
         if (l != m)
          mats[flat_index_one_based(k, k-1,n)][matNumber] = -mats[flat_index_one_based(k,k-1,n)][matNumber];
        } else
         mats[flat_index_one_based(k,k-1,n)][matNumber] = -s*x;
        p += s;
        x=p/s;
        y=q/s;
        z=r/s;
        q /= p;
        r /= p;
        // Row modification.
        for (j=k;j<=nn;j++) {
         p=mats[flat_index_one_based(k,j,n)][matNumber]+q*mats[flat_index_one_based(k+1,j,n)][matNumber];
         if (k != (nn-1)) {
          p += r*mats[flat_index_one_based(k+2,j,n)][matNumber];
          mats[flat_index_one_based(k+2,j,n)][matNumber] -= p*z;
         }
         mats[flat_index_one_based(k+1,j,n)][matNumber] -= p*y;
         mats[flat_index_one_based(k,j,n)][matNumber] -= p*x;
        }
        // Column modification.
        mmin = nn<k+3 ? nn : k+3;
        for (i=l;i<=mmin;i++) {
         p=x*mats[flat_index_one_based(i,k,n)][matNumber]+y*mats[flat_index_one_based(i,k+1,n)][matNumber];
         if (k != (nn-1)) {
          p += z*mats[flat_index_one_based(i, k+2,n)][matNumber];
          mats[flat_index_one_based(i,k+2,n)][matNumber] -= p*r;
         }
         mats[flat_index_one_based(i,k+1,n)][matNumber] -= p*q;
         mats[flat_index_one_based(i,k,n)][matNumber] -= p;
        }
       }
      }
     }
    }
   } while (l < nn-1);
  }

  return MatVecStatus::SUCCESS;
 }
394 
 /**
  * \brief Kernel: computes the eigenvalues of every (balanced, Hessenberg)
  *        matrix in the batch; one thread per matrix, grid-stride over the
  *        batch.  Per-matrix status codes land in status[].
  *
  * wr and wi are passed offset by -1 so hqr_helper can use 1-based indexing
  * (wr[1..n]); when imaginary output is disabled, wi is replaced by NULL —
  * hqr_helper guards every wi access behind the same option test, so the
  * NULL is never dereferenced.
  *
  * \param mats      n*n device pointers, each indexed by matrix number
  * \param n         matrix dimension
  * \param batchSize number of matrices
  * \param wr        eigenvalue real parts, wr[0..n-1][matNumber]
  * \param wi        eigenvalue imaginary parts (ignored unless option allows)
  * \param status    per-matrix MatVecStatus_e results
  * \param option    NONE / DISREGARD_IMAGINARY / PROHIBIT_IMAGINARY
  */
 template<typename ValT>
 __global__ void hqr(ValT * const * const mats, int const n, int const batchSize, ValT** wr, ValT** wi, int status[], MatVecOptions_e const option)
 {
  int matNumber = blockIdx.x * blockDim.x + threadIdx.x;
  const bool useImaginary = !(option == MatVecOptions::DISREGARD_IMAGINARY || option == MatVecOptions::PROHIBIT_IMAGINARY);
  for ( ; matNumber < batchSize ; matNumber += blockDim.x * gridDim.x ){
   status[matNumber] = hqr_helper(mats, matNumber, n, wr-1, useImaginary ? wi-1 : NULL, option); //Call with 1-based indexing
  }
 }
404 
 /**
  * \brief Runs the full batched eigenvalue pipeline:
  *        balance -> Hessenberg reduction -> QR eigenvalue iteration.
  *
  * \param mArray        n*n device pointers, each indexed by matrix number
  * \param n             matrix dimension
  * \param batchSize     number of matrices
  * \param result        eigenvalue real parts
  * \param resultComplex eigenvalue imaginary parts (per `option`)
  * \param status        per-matrix MatVecStatus_e results
  * \param option        NONE / DISREGARD_IMAGINARY / PROHIBIT_IMAGINARY
  *
  * NOTE(review): balanc, elmhes and hqr are declared __global__ above, yet
  * they are invoked here as plain function calls from device code.  CUDA
  * requires an execution configuration (<<<...>>>) to launch a __global__
  * function — as written this should not compile unless those functions were
  * intended to be __device__ (each already serializes one matrix per thread,
  * so a plain device call would be semantically sensible).  Confirm against
  * the build configuration.
  */
 template<typename ValT>
 __global__ void eigen_decomposition(ValT * const * const mArray, int const n, int const batchSize, ValT ** result, ValT ** resultComplex, int* status, MatVecOptions_e const option)
 {
  balanc(mArray, n, batchSize);
  elmhes(mArray, n, batchSize);
  hqr(mArray, n, batchSize, result, resultComplex, status, option);
 }
412 
413  template<typename ValT>
415 
416  template<>
417  class CublasTypeDispatcher<double>
418  {
419  public:
420 
421  static cublasStatus_t cublasgetrsBatched( cublasHandle_t handle,
422  cublasOperation_t trans,
423  int n,
424  int nrhs,
425  const double *Aarray[],
426  int lda,
427  const int *devIpiv,
428  double *Barray[],
429  int ldb,
430  int *info,
431  int batchSize )
432  {
433  return cublasDgetrsBatched(handle,
434  trans,
435  n,
436  nrhs,
437  Aarray,
438  lda,
439  devIpiv,
440  Barray,
441  ldb,
442  info,
443  batchSize);
444  }
445  static cublasStatus_t cublasgetrfBatched( cublasHandle_t handle,
446  int n,
447  double *Aarray[],
448  int lda,
449  int *PivotArray,
450  int *infoArray,
451  int batchSize )
452  {
453  return cublasDgetrfBatched(handle, n, Aarray, lda, PivotArray, infoArray, batchSize);
454  }
455  static cublasStatus_t cublasgetriBatched( cublasHandle_t handle,
456  int n,
457  const double *Aarray[],
458  int lda,
459  int *PivotArray,
460  double *Carray[],
461  int ldc,
462  int *infoArray,
463  int batchSize )
464  {
465  return cublasDgetriBatched(handle,
466  n,
467  Aarray,
468  lda,
469  PivotArray,
470  Carray,
471  ldc,
472  infoArray,
473  batchSize);
474  }
475 
476  static cublasStatus_t cublasgemmBatched( cublasHandle_t handle,
477  cublasOperation_t transa,
478  cublasOperation_t transb,
479  int m,
480  int n,
481  int k,
482  const double *alpha,
483  const double *Aarray[],
484  int lda,
485  const double *Barray[],
486  int ldb,
487  const double *beta,
488  double *Carray[],
489  int ldc,
490  int batchSize )
491  {
492  return cublasDgemmBatched(handle,
493  transa, transb,
494  m, n, k,
495  alpha,
496  Aarray, lda,
497  Barray, ldb,
498  beta,
499  Carray, ldc,
500  batchSize);
501  }
502 
503  private:
505  };
506  } // namespace CudaMatVecUtility
507 } // namespace SpatialOps
508 
509 #endif // SpatialOps_CudaMatVecUtility_h