MADNESS  0.10.1
mxm.h
Go to the documentation of this file.
1 /*
2  This file is part of MADNESS.
3 
4  Copyright (C) 2007,2010 Oak Ridge National Laboratory
5 
6  This program is free software; you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation; either version 2 of the License, or
9  (at your option) any later version.
10 
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with this program; if not, write to the Free Software
18  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 
20  For more information please contact:
21 
22  Robert J. Harrison
23  Oak Ridge National Laboratory
24  One Bethel Valley Road
25  P.O. Box 2008, MS-6367
26 
27  email: harrisonrj@ornl.gov
28  tel: 865-241-3937
29  fax: 865-572-0680
30 
31 
32  $Id$
33 */
34 
35 #ifndef MADNESS_TENSOR_MXM_H__INCLUDED
36 #define MADNESS_TENSOR_MXM_H__INCLUDED
37 
38 #include <madness/madness_config.h>
39 
40 // This just to check if config is actually working
41 //#ifndef HAVE_MTXMQ
42 //#error "MTXMQ missing"
43 //#endif
44 
45 #define HAVE_FAST_BLAS
46 #ifdef HAVE_FAST_BLAS
47 //#ifdef HAVE_INTEL_MKL
48 #include <madness/tensor/cblas.h>
49 #endif
50 
51 /// \file tensor/mxm.h
52 /// \brief Internal use only
53 
54 // This file is ONLY included into tensor.cc ... separated here just
55 // to shrink file size. Don't try to include anywhere else
56 
57 // Due to both flakey compilers and performance concerns,
58 // we use a simple reference implementation of the mxm
59 // routines for all except T=double.
60 
61 
62 namespace madness {
63 
64  // Start with reference implementations. Then provide optimized implementations, falling back to reference if not available on specific platforms
65 
66  /// Matrix \c += Matrix * matrix reference implementation (slow but correct)
67  template <typename T, typename Q, typename S>
68  static inline void mxm_reference(long dimi, long dimj, long dimk,
69  T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
70  const S* MADNESS_RESTRICT b) {
71  /*
72  c(i,j) = c(i,j) + sum(k) a(i,k)*b(k,j)
73 
74  where it is assumed that the last index in each array is has unit
75  stride and the dimensions are as provided.
76  */
77 
78  for (long i=0; i<dimi; ++i) {
79  for (long k=0; k<dimk; ++k) {
80  for (long j=0; j<dimj; ++j) {
81  c[i*dimj+j] += a[i*dimk+k]*b[k*dimj+j];
82  }
83  }
84  }
85  }
86 
87 
88  /// Matrix \c += Matrix transpose * matrix ... reference implementation (slow but correct)
89  template <typename T, typename Q, typename S>
90  static inline
91  void mTxm_reference(long dimi, long dimj, long dimk,
92  T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
93  const S* MADNESS_RESTRICT b) {
94  /*
95  c(i,j) = c(i,j) + sum(k) a(k,i)*b(k,j)
96 
97  where it is assumed that the last index in each array is has unit
98  stride and the dimensions are as provided.
99 
100  i loop might be long in anticipated application
101  */
102 
103  for (long k=0; k<dimk; ++k) {
104  for (long j=0; j<dimj; ++j) {
105  for (long i=0; i<dimi; ++i) {
106  c[i*dimj+j] += a[k*dimi+i]*b[k*dimj+j];
107  }
108  }
109  }
110  }
111 
112  /// Matrix \c += Matrix * matrix transpose ... reference implementation (slow but correct)
113  template <typename T, typename Q, typename S>
114  static inline void mxmT_reference (long dimi, long dimj, long dimk,
115  T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
116  const S* MADNESS_RESTRICT b) {
117  /*
118  c(i,j) = c(i,j) + sum(k) a(i,k)*b(j,k)
119 
120  where it is assumed that the last index in each array is has unit
121  stride and the dimensions are as provided.
122 
123  i loop might be long in anticipated application
124  */
125 
126  for (long i=0; i<dimi; ++i) {
127  for (long j=0; j<dimj; ++j) {
128  T sum = 0;
129  for (long k=0; k<dimk; ++k) {
130  sum += a[i*dimk+k]*b[j*dimk+k];
131  }
132  c[i*dimj+j] += sum;
133  }
134  }
135  }
136 
137  /// Matrix \c += Matrix transpose * matrix transpose reference implementation (slow but correct)
138  template <typename T, typename Q, typename S>
139  static inline void mTxmT_reference(long dimi, long dimj, long dimk,
140  T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
141  const S* MADNESS_RESTRICT b) {
142  /*
143  c(i,j) = c(i,j) + sum(k) a(k,i)*b(j,k)
144 
145  where it is assumed that the last index in each array is has unit
146  stride and the dimensions are as provided.
147  */
148 
149  for (long i=0; i<dimi; ++i) {
150  for (long j=0; j<dimj; ++j) {
151  for (long k=0; k<dimk; ++k) {
152  c[i*dimj+j] += a[k*dimi+i]*b[j*dimk+k];
153  }
154  }
155  }
156  }
157 
158  /// Matrix = Matrix transpose * matrix ... slow reference implementation
159 
160  /// This routine does \c C=AT*B whereas mTxm does C=C+AT*B.
161  /// \code
162  /// c(i,j) = sum(k) a(k,i)*b(k,j) <------ does not accumulate into C
163  /// \endcode
164  ///
165  /// \c ldb is the last dimension of b in C storage (the leading dimension
166  /// in fortran storage). It is here to accomodate multiplying by a matrix
167  /// stored with \c ldb>dimj which happens in madness when transforming with
168  /// low rank matrices. A matrix in dense storage has \c ldb=dimj which is
169  /// the default for backward compatibility.
170  template <typename aT, typename bT, typename cT>
171  void mTxmq_reference(long dimi, long dimj, long dimk,
172  cT* MADNESS_RESTRICT c, const aT* a, const bT* b, long ldb=-1) {
173  if (ldb == -1) ldb=dimj;
174  MADNESS_ASSERT(ldb>=dimj);
175  //std::cout << "IN GENERIC mTxmq " << tensor_type_names[TensorTypeData<aT>::id] << " " << tensor_type_names[TensorTypeData<bT>::id] << " " << tensor_type_names[TensorTypeData<cT>::id] << "\n";
176  for (long i=0; i<dimi; ++i,c+=dimj,++a) {
177  for (long j=0; j<dimj; ++j) c[j] = 0.0;
178  const aT *aik_ptr = a;
179  for (long k=0; k<dimk; ++k,aik_ptr+=dimi) {
180  aT aki = *aik_ptr;
181  for (long j=0; j<dimj; ++j) {
182  c[j] += aki*b[k*ldb+j];
183  }
184  }
185  }
186  }
187 
188 
189 #if defined(HAVE_FAST_BLAS) && !defined(HAVE_INTEL_MKL)
190  // MKL provides support for mixed real/complex operations but most other libraries do not
191 
192  /// Matrix += Matrix * matrix ... BLAS/MKL interface version
193 
194  /// Does \c C=C+A*B
195  /// \code
196  /// c(i,j) = c(i,j) + sum(k) a(i,k)*b(k,j)
197  /// \endcode
198  template <typename T>
199  void mxm(long dimi, long dimj, long dimk,
200  T* MADNESS_RESTRICT c, const T* a, const T* b) {
201  const T one = 1.0; // alpha in *gemm
202  cblas::gemm(cblas::NoTrans,cblas::NoTrans,dimj,dimi,dimk,one,b,dimj,a,dimk,one,c,dimj);
203  }
204 
205  /// Matrix += Matrix transpose * matrix ... MKL interface version
206 
207  /// Does \c C=C+AT*B
208  /// \code
209  /// c(i,j) = c(i,j) + sum(k) a(k,i)*b(k,j)
210  /// \endcode
211  template <typename T>
212  void mTxm(long dimi, long dimj, long dimk,
213  T* MADNESS_RESTRICT c, const T* a, const T* b) {
214  const T one = 1.0; // alpha in *gemm
215  cblas::gemm(cblas::NoTrans,cblas::Trans,dimj,dimi,dimk,one,b,dimj,a,dimi,one,c,dimj);
216  }
217 
218  /// Matrix += Matrix * matrix transpose ... MKL interface version
219 
220  /// Does \c C=C+A*BT
221  /// \code
222  /// c(i,j) = c(i,j) + sum(k) a(i,k)*b(j,k)
223  /// \endcode
224  template <typename T>
225  void mxmT(long dimi, long dimj, long dimk,
226  T* MADNESS_RESTRICT c, const T* a, const T* b) {
227  const T one = 1.0; // alpha in *gemm
228  cblas::gemm(cblas::Trans,cblas::NoTrans,dimj,dimi,dimk,one,b,dimk,a,dimk,one,c,dimj);
229  }
230 
231  /// Matrix += Matrix transpose * matrix transpose ... MKL interface version
232 
233  /// Does \c C=C+AT*BT
234  /// \code
235  /// c(i,j) = c(i,j) + sum(k) a(k,i)*b(j,k)
236  /// \endcode
237  template <typename T>
238  void mTxmT(long dimi, long dimj, long dimk,
239  T* MADNESS_RESTRICT c, const T* a, const T* b) {
240  const T one = 1.0; // alpha in *gemm
241  cblas::gemm(cblas::Trans,cblas::Trans,dimj,dimi,dimk,one,b,dimk,a,dimi,one,c,dimj);
242  }
243 
244  /// Matrix = Matrix transpose * matrix ... MKL interface version
245 
246  /// Does \c C=AT*B whereas mTxm does C=C+AT*B.
247  /// \code
248  /// c(i,j) = sum(k) a(k,i)*b(k,j) <------ does not accumulate into C
249  /// \endcode
250  ///
251  /// \c ldb is the last dimension of b in C storage (the leading dimension
252  /// in fortran storage). It is here to accomodate multiplying by a matrix
253  /// stored with \c ldb>dimj which happens in madness when transforming with
254  /// low rank matrices. A matrix in dense storage has \c ldb=dimj which is
255  /// the default for backward compatibility.
256  template <typename T>
257  void mTxmq(long dimi, long dimj, long dimk,
258  T* MADNESS_RESTRICT c, const T* a, const T* b, long ldb=-1) {
259  if (ldb == -1) ldb=dimj;
260  MADNESS_ASSERT(ldb>=dimj);
261 
262  if (dimi==0 || dimj==0) return; // nothing to do and *GEMM will complain
263  if (dimk==0) {
264  for (long i=0; i<dimi*dimj; i++) c[i] = 0.0;
265  }
266 
267  const T one = 1.0; // alpha in *gemm
268  const T zero = 0.0; // beta in *gemm
269  cblas::gemm(cblas::NoTrans,cblas::Trans,dimj,dimi,dimk,one,b,ldb,a,dimi,zero,c,dimj);
270  }
271 
272 #ifdef HAVE_MTXMQ
273  template <>
274  void mTxmq(long dimi, long dimj, long dimk, double* MADNESS_RESTRICT c, const double* a, const double* b, long ldb);
275 
276  // Bootstrap complex*real from real*real
277  template <typename T>
278  void mTxmq(long dimi, long dimj, long dimk, std::complex<T>* MADNESS_RESTRICT c, const std::complex<T>* a, const T* b, long ldb) {
279  T* Rc = new T[dimi*dimj];
280  T* Ic = new T[dimi*dimj];
281  T* Ra = new T[dimi*dimk];
282  T* Ia = new T[dimi*dimk];
283 
284  for (long i=0; i<dimi*dimk; i++) {
285  Ra[i] = a[i].real();
286  Ia[i] = a[i].imag();
287  }
288  mTxmq(dimi,dimj,dimk,Rc,Ra,b,ldb);
289  mTxmq(dimi,dimj,dimk,Ic,Ia,b,ldb);
290  for (long i=0; i<dimi*dimj; i++) c[i] = std::complex<T>(Rc[i],Ic[i]);
291 
292  delete[] Rc;
293  delete[] Ic;
294  delete[] Ra;
295  delete[] Ia;
296  }
297 
298 #endif
299 
300 #endif
301 
302 #ifdef HAVE_INTEL_MKL
303  /// Matrix += Matrix * matrix ... MKL interface version
304 
305  /// Does \c C=C+A*B
306  /// \code
307  /// c(i,j) = c(i,j) + sum(k) a(i,k)*b(k,j)
308  /// \endcode
309  template <typename aT, typename bT, typename cT>
310  void mxm(long dimi, long dimj, long dimk,
311  cT* MADNESS_RESTRICT c, const aT* a, const bT* b) {
312  const cT one = 1.0; // alpha in *gemm
313  cblas::gemm(cblas::NoTrans,cblas::NoTrans,dimj,dimi,dimk,one,b,dimj,a,dimk,one,c,dimj);
314  }
315 
316  /// Matrix += Matrix transpose * matrix ... MKL interface version
317 
318  /// Does \c C=C+AT*B
319  /// \code
320  /// c(i,j) = c(i,j) + sum(k) a(k,i)*b(k,j)
321  /// \endcode
322  template <typename aT, typename bT, typename cT>
323  void mTxm(long dimi, long dimj, long dimk,
324  cT* MADNESS_RESTRICT c, const aT* a, const bT* b) {
325  const cT one = 1.0; // alpha in *gemm
326  cblas::gemm(cblas::NoTrans,cblas::Trans,dimj,dimi,dimk,one,b,dimj,a,dimi,one,c,dimj);
327  }
328 
329  /// Matrix += Matrix * matrix transpose ... MKL interface version
330 
331  /// Does \c C=C+A*BT
332  /// \code
333  /// c(i,j) = c(i,j) + sum(k) a(i,k)*b(j,k)
334  /// \endcode
335  template <typename aT, typename bT, typename cT>
336  void mxmT(long dimi, long dimj, long dimk,
337  cT* MADNESS_RESTRICT c, const aT* a, const bT* b) {
338  const cT one = 1.0; // alpha in *gemm
339  cblas::gemm(cblas::Trans,cblas::NoTrans,dimj,dimi,dimk,one,b,dimk,a,dimk,one,c,dimj);
340  }
341 
342  /// Matrix += Matrix transpose * matrix transpose ... MKL interface version
343 
344  /// Does \c C=C+AT*BT
345  /// \code
346  /// c(i,j) = c(i,j) + sum(k) a(k,i)*b(j,k)
347  /// \endcode
348  template <typename aT, typename bT, typename cT>
349  void mTxmT(long dimi, long dimj, long dimk,
350  cT* MADNESS_RESTRICT c, const aT* a, const bT* b) {
351  const cT one = 1.0; // alpha in *gemm
352  cblas::gemm(cblas::Trans,cblas::Trans,dimj,dimi,dimk,one,b,dimk,a,dimi,one,c,dimj);
353  }
354 
355  /// Matrix = Matrix transpose * matrix ... MKL interface version
356 
357  /// Does \c C=AT*B whereas mTxm does C=C+AT*B.
358  /// \code
359  /// c(i,j) = sum(k) a(k,i)*b(k,j) <------ does not accumulate into C
360  /// \endcode
361  ///
362  /// \c ldb is the last dimension of b in C storage (the leading dimension
363  /// in fortran storage). It is here to accomodate multiplying by a matrix
364  /// stored with \c ldb>dimj which happens in madness when transforming with
365  /// low rank matrices. A matrix in dense storage has \c ldb=dimj which is
366  /// the default for backward compatibility.
367  template <typename aT, typename bT, typename cT>
368  void mTxmq(long dimi, long dimj, long dimk,
369  cT* MADNESS_RESTRICT c, const aT* a, const bT* b, long ldb=-1) {
370  if (ldb == -1) ldb=dimj;
371  MADNESS_ASSERT(ldb>=dimj);
372 
373  if (dimi==0 || dimj==0) return; // nothing to do and *GEMM will complain
374  if (dimk==0) {
375  for (long i=0; i<dimi*dimj; i++) c[i] = 0.0;
376  }
377 
378  const cT one = 1.0; // alpha in *gemm
379  const cT zero = 0.0; // beta in *gemm
380  cblas::gemm(cblas::NoTrans,cblas::Trans,dimj,dimi,dimk,one,b,ldb,a,dimi,zero,c,dimj);
381  }
382 
383 #ifdef HAVE_MTXMQ
384 template <>
385 void mTxmq(long dimi, long dimj, long dimk, double* MADNESS_RESTRICT c, const double* a, const double* b, long ldb);
386 #endif
387 
388 #else
389 
390  // Fall back to reference implementations
391 
392  template <typename T, typename Q, typename S>
393  static inline void mxm(long dimi, long dimj, long dimk,
394  T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
395  const S* MADNESS_RESTRICT b) {
396  mxm_reference(dimi, dimj, dimk, c, a, b);
397  }
398 
399  template <typename T, typename Q, typename S>
400  static inline
401  void mTxm(long dimi, long dimj, long dimk,
402  T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
403  const S* MADNESS_RESTRICT b) {
404  mTxm_reference(dimi, dimj, dimk, c, a, b);
405  }
406 
407  template <typename T, typename Q, typename S>
408  static inline void mxmT(long dimi, long dimj, long dimk,
409  T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
410  const S* MADNESS_RESTRICT b) {
411  mxmT_reference(dimi, dimj, dimk, c, a, b);
412  }
413 
414  template <typename T, typename Q, typename S>
415  static inline void mTxmT(long dimi, long dimj, long dimk,
416  T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
417  const S* MADNESS_RESTRICT b) {
418  mTxmT_reference(dimi, dimj, dimk, c, a, b);
419  }
420 
421  template <typename aT, typename bT, typename cT>
422  void mTxmq(long dimi, long dimj, long dimk,
423  cT* MADNESS_RESTRICT c, const aT* a, const bT* b, long ldb=-1) {
424  mTxmq_reference(dimi, dimj, dimk, c, a, b, ldb);
425  }
426 
427  // The following are restricted to double only
428 
429  /// Matrix transpose * matrix (hand unrolled version)
430 
431  template <>
432  inline void mTxm(long dimi, long dimj, long dimk,
433  double* MADNESS_RESTRICT c, const double* MADNESS_RESTRICT a,
434  const double* MADNESS_RESTRICT b) {
435  /*
436  c(i,j) = c(i,j) + sum(k) a(k,i)*b(k,j) <--- NOTE ACCUMULATION INTO C
437 
438  where it is assumed that the last index in each array is has unit
439  stride and the dimensions are as provided.
440 
441  i loop might be long in anticipated application
442 
443  4-way unrolled k loop ... empirically fastest on PIII
444  compared to 2/3 way unrolling (though not by much).
445  */
446 
447  long dimk4 = (dimk/4)*4;
448  for (long i=0; i<dimi; ++i,c+=dimj) {
449  const double* ai = a+i;
450  const double* p = b;
451  for (long k=0; k<dimk4; k+=4,ai+=4*dimi,p+=4*dimj) {
452  double ak0i = ai[0 ];
453  double ak1i = ai[dimi];
454  double ak2i = ai[dimi+dimi];
455  double ak3i = ai[dimi+dimi+dimi];
456  const double* bk0 = p;
457  const double* bk1 = p+dimj;
458  const double* bk2 = p+dimj+dimj;
459  const double* bk3 = p+dimj+dimj+dimj;
460  for (long j=0; j<dimj; ++j) {
461  c[j] += ak0i*bk0[j] + ak1i*bk1[j] + ak2i*bk2[j] + ak3i*bk3[j];
462  }
463  }
464  for (long k=dimk4; k<dimk; ++k) {
465  double aki = a[k*dimi+i];
466  const double* bk = b+k*dimj;
467  for (long j=0; j<dimj; ++j) {
468  c[j] += aki*bk[j];
469  }
470  }
471  }
472  }
473 
474 
475 
476  /// Matrix * matrix transpose (hand unrolled version)
477 
478  template <>
479  inline void mxmT(long dimi, long dimj, long dimk,
480  double* MADNESS_RESTRICT c,
481  const double* MADNESS_RESTRICT a, const double* MADNESS_RESTRICT b) {
482  /*
483  c(i,j) = c(i,j) + sum(k) a(i,k)*b(j,k)
484 
485  where it is assumed that the last index in each array is has unit
486  stride and the dimensions are as provided.
487 
488  j loop might be long in anticipated application
489 
490  Unrolled i loop. Empirically fastest on PIII compared
491  to unrolling j, or both i&j.
492  */
493 
494  long dimi2 = (dimi/2)*2;
495  for (long i=0; i<dimi2; i+=2) {
496  const double* ai0 = a+i*dimk;
497  const double* ai1 = a+i*dimk+dimk;
498  double* MADNESS_RESTRICT ci0 = c+i*dimj;
499  double* MADNESS_RESTRICT ci1 = c+i*dimj+dimj;
500  for (long j=0; j<dimj; ++j) {
501  double sum0 = 0;
502  double sum1 = 0;
503  const double* bj = b + j*dimk;
504  for (long k=0; k<dimk; ++k) {
505  sum0 += ai0[k]*bj[k];
506  sum1 += ai1[k]*bj[k];
507  }
508  ci0[j] += sum0;
509  ci1[j] += sum1;
510  }
511  }
512  for (long i=dimi2; i<dimi; ++i) {
513  const double* ai = a+i*dimk;
514  double* MADNESS_RESTRICT ci = c+i*dimj;
515  for (long j=0; j<dimj; ++j) {
516  double sum = 0;
517  const double* bj = b+j*dimk;
518  for (long k=0; k<dimk; ++k) {
519  sum += ai[k]*bj[k];
520  }
521  ci[j] += sum;
522  }
523  }
524  }
525 
526  /// Matrix * matrix (hand unrolled version)
527  template <>
528  inline void mxm(long dimi, long dimj, long dimk,
529  double* MADNESS_RESTRICT c, const double* MADNESS_RESTRICT a, const double* MADNESS_RESTRICT b) {
530  /*
531  c(i,j) = c(i,j) + sum(k) a(i,k)*b(k,j)
532 
533  where it is assumed that the last index in each array is has unit
534  stride and the dimensions are as provided.
535 
536  4-way unrolled k loop ... empirically fastest on PIII
537  compared to 2/3 way unrolling (though not by much).
538  */
539 
540  long dimk4 = (dimk/4)*4;
541  for (long i=0; i<dimi; ++i, c+=dimj,a+=dimk) {
542  const double* p = b;
543  for (long k=0; k<dimk4; k+=4,p+=4*dimj) {
544  double aik0 = a[k ];
545  double aik1 = a[k+1];
546  double aik2 = a[k+2];
547  double aik3 = a[k+3];
548  const double* bk0 = p;
549  const double* bk1 = bk0+dimj;
550  const double* bk2 = bk1+dimj;
551  const double* bk3 = bk2+dimj;
552  for (long j=0; j<dimj; ++j) {
553  c[j] += aik0*bk0[j] + aik1*bk1[j] + aik2*bk2[j] + aik3*bk3[j];
554  }
555  }
556  for (long k=dimk4; k<dimk; ++k) {
557  double aik = a[k];
558  for (long j=0; j<dimj; ++j) {
559  c[j] += aik*b[k*dimj+j];
560  }
561  }
562  }
563  }
564 
565  /// Matrix transpose * matrix transpose (hand tiled and unrolled)
566  template <>
567  inline void mTxmT(long dimi, long dimj, long dimk,
568  double* MADNESS_RESTRICT csave, const double* MADNESS_RESTRICT asave, const double* MADNESS_RESTRICT b) {
569  /*
570  c(i,j) = c(i,j) + sum(k) a(k,i)*b(j,k)
571 
572  where it is assumed that the last index in each array is has unit
573  stride and the dimensions are as provided.
574 
575  Tiled k, copy row of a into temporary, and unroll j once.
576  */
577 
578  const int ktile=32;
579  double ai[ktile];
580  long dimj2 = (dimj/2)*2;
581 
582  for (long klo=0; klo<dimk; klo+=ktile, asave+=ktile*dimi, b+=ktile) {
583  long khi = klo+ktile;
584  if (khi > dimk) khi = dimk;
585  long nk = khi-klo;
586 
587  const double * MADNESS_RESTRICT a = asave;
588  double * MADNESS_RESTRICT c = csave;
589  for (long i=0; i<dimi; ++i,c+=dimj,++a) {
590  const double* q = a;
591  for (long k=0; k<nk; ++k,q+=dimi) ai[k] = *q;
592 
593  const double* bj0 = b;
594  for (long j=0; j<dimj2; j+=2,bj0+=2*dimk) {
595  const double* bj1 = bj0+dimk;
596  double sum0 = 0;
597  double sum1 = 0;
598  for (long k=0; k<nk; ++k) {
599  sum0 += ai[k]*bj0[k];
600  sum1 += ai[k]*bj1[k];
601  }
602  c[j ] += sum0;
603  c[j+1] += sum1;
604  }
605 
606  for (long j=dimj2; j<dimj; ++j,bj0+=dimk) {
607  double sum = 0;
608  for (long k=0; k<nk; ++k) {
609  sum += ai[k]*bj0[k];
610  }
611  c[j] += sum;
612  }
613  }
614  }
615  }
616 
617  /*
618  * mtxm, but with padded buffers.
619  *
620  * ext_b is the extent of the b array, so shrink() isn't needed.
621  */
622  template <typename aT, typename bT, typename cT>
623  void mTxmq_padding(long dimi, long dimj, long dimk, long ext_b,
624  cT* c, const aT* a, const bT* b) {
625  const int alignment = 4;
626  bool free_b = false;
627  long effj = dimj;
628 
629  /* Setup a buffer for c if needed */
630  cT* c_buf = c;
631  if (dimj%alignment) {
632  effj = (dimj | 3) + 1;
633  c_buf = (cT*)malloc(sizeof(cT)*dimi*effj);
634  }
635 
636  /* Copy b into a buffer if needed */
637  if (ext_b%alignment) {
638  free_b = true;
639  bT* b_buf = (bT*)malloc(sizeof(bT)*dimk*effj);
640 
641  bT* bp = b_buf;
642  for (long k=0; k<dimk; k++, bp += effj, b += ext_b)
643  memcpy(bp, b, sizeof(bT)*dimj);
644 
645  b = b_buf;
646  ext_b = effj;
647  }
648 
649  cT* c_work = c_buf;
650  /* mTxm */
651  for (long i=0; i<dimi; ++i,c_work+=effj,++a) {
652  for (long j=0; j<dimj; ++j) c_work[j] = 0.0;
653  const aT *aik_ptr = a;
654  for (long k=0; k<dimk; ++k,aik_ptr+=dimi) {
655  aT aki = *aik_ptr;
656  for (long j=0; j<dimj; ++j) {
657  c_work[j] += aki*b[k*ext_b+j];
658  }
659  }
660  }
661 
662  /* Copy c out if needed */
663  if (dimj%alignment) {
664  cT* ct = c_buf;
665  for (long i=0; i<dimi; i++, ct += effj, c += dimj)
666  memcpy(c, ct, sizeof(cT)*dimj);
667 
668  free(c_buf);
669  }
670 
671  /* Free the buffer for b */
672  if (free_b) free((bT*)b);
673  }
674 
675 #ifdef HAVE_IBMBGQ
676  extern void bgq_mtxmq_padded(long ni, long nj, long nk, long ej,
677  double* c, const double* a, const double* b);
678  extern void bgq_mtxmq_padded(long ni, long nj, long nk, long ej,
679  __complex__ double* c, const __complex__ double* a, const __complex__ double* b);
680  extern void bgq_mtxmq_padded(long ni, long nj, long nk, long ej,
681  __complex__ double* c, const double* a, const __complex__ double* b);
682  extern void bgq_mtxmq_padded(long ni, long nj, long nk, long ej,
683  __complex__ double* c, const __complex__ double* a, const double* b);
684 
685  template <>
686  inline void mTxmq_padding(long ni, long nj, long nk, long ej,
687  double* c, const double* a, const double* b) {
688  bgq_mtxmq_padded(ni, nj, nk, ej, c, a, b);
689  }
690 
691  template <>
692  inline void mTxmq_padding(long ni, long nj, long nk, long ej,
693  __complex__ double* c, const __complex__ double* a, const __complex__ double* b) {
694  bgq_mtxmq_padded(ni, nj, nk, ej, c, a, b);
695  }
696 
697  template <>
698  inline void mTxmq_padding(long ni, long nj, long nk, long ej,
699  __complex__ double* c, const double* a, const __complex__ double* b) {
700  bgq_mtxmq_padded(ni, nj, nk, ej, c, a, b);
701  }
702 
703  template <>
704  inline void mTxmq_padding(long ni, long nj, long nk, long ej,
705  __complex__ double* c, const __complex__ double* a, const double* b) {
706  bgq_mtxmq_padded(ni, nj, nk, ej, c, a, b);
707  }
708 #elif defined(HAVE_IBMBGP)
709  extern void bgpmTxmq(long ni, long nj, long nk, double* MADNESS_RESTRICT c,
710  const double* a, const double* b);
711  extern void bgpmTxmq(long ni, long nj, long nk, double_complex* MADNESS_RESTRICT c,
712  const double_complex* a, const double_complex* b);
713 
714  template <>
715  inline void mTxmq(long ni, long nj, long nk, double* MADNESS_RESTRICT c, const double* a, const double* b) {
716  bgpmTxmq(ni, nj, nk, c, a, b);
717  }
718 
719  template <>
720  inline void mTxmq(long ni, long nj, long nk, double_complex* MADNESS_RESTRICT c, const double_complex* a, const double_complex* b) {
721  bgpmTxmq(ni, nj, nk, c, a, b);
722  }
723 
724 // #elif defined(X86_64) && !defined(DISABLE_SSE3)
725 // template <>
726 // void mTxmq(long dimi, long dimj, long dimk,
727 // double* MADNESS_RESTRICT c, const double* a, const double* b);
728 
729 // template <>
730 // void mTxmq(long dimi, long dimj, long dimk,
731 // double_complex* MADNESS_RESTRICT c, const double_complex* a, const double_complex* b);
732 
733 // #ifndef __INTEL_COMPILER
734 // template <>
735 // void mTxmq(long dimi, long dimj, long dimk,
736 // double_complex* MADNESS_RESTRICT c, const double_complex* a, const double* b);
737 // #endif
738 
739 // #elif defined(X86_32)
740 // template <>
741 // void mTxmq(long dimi, long dimj, long dimk,
742 // double* MADNESS_RESTRICT c, const double* a, const double* b);
743 #endif // HAVE_IBMBGQ
744 
745 #endif // HAVE_INTEL_MKL
746 
747 }
748 #endif // MADNESS_TENSOR_MXM_H__INCLUDED
double q(double t)
Definition: DKops.h:18
Define BLAS like functions.
std::complex< double > double_complex
Definition: cfft.h:14
char * malloc()
char * p(char *buf, const char *name, int k, int initial_level, double thresh, int order)
Definition: derivatives.cc:72
auto T(World &world, response_space &f) -> response_space
Definition: global_functions.cc:34
Macros and tools pertaining to the configuration of MADNESS.
#define MADNESS_ASSERT(condition)
Assert a condition that should be free of side-effects since in release builds this might be a no-op.
Definition: madness_exception.h:134
void gemm(const CBLAS_TRANSPOSE OpA, const CBLAS_TRANSPOSE OpB, const integer m, const integer n, const integer k, const float alpha, const float *a, const integer lda, const float *b, const integer ldb, const float beta, float *c, const integer ldc)
Multiplies a matrix by a vector.
Definition: cblas.h:352
@ NoTrans
Definition: cblas_types.h:78
@ Trans
Definition: cblas_types.h:79
File holds all helper structures necessary for the CC_Operator and CC2 class.
Definition: DFParameters.h:10
static void mTxm_reference(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const Q *MADNESS_RESTRICT a, const S *MADNESS_RESTRICT b)
Matrix += Matrix transpose * matrix ... reference implementation (slow but correct)
Definition: mxm.h:91
void mTxmT(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const T *a, const T *b)
Matrix += Matrix transpose * matrix transpose ... MKL interface version.
Definition: mxm.h:238
void mxm(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const T *a, const T *b)
Matrix += Matrix * matrix ... BLAS/MKL interface version.
Definition: mxm.h:199
static void mxm_reference(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const Q *MADNESS_RESTRICT a, const S *MADNESS_RESTRICT b)
Matrix += Matrix * matrix reference implementation (slow but correct)
Definition: mxm.h:68
void mTxm(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const T *a, const T *b)
Matrix += Matrix transpose * matrix ... MKL interface version.
Definition: mxm.h:212
static void mxmT_reference(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const Q *MADNESS_RESTRICT a, const S *MADNESS_RESTRICT b)
Matrix += Matrix * matrix transpose ... reference implementation (slow but correct)
Definition: mxm.h:114
void mTxmq_reference(long dimi, long dimj, long dimk, cT *MADNESS_RESTRICT c, const aT *a, const bT *b, long ldb=-1)
Matrix = Matrix transpose * matrix ... slow reference implementation.
Definition: mxm.h:171
void mTxmq_padding(long dimi, long dimj, long dimk, long ext_b, cT *c, const aT *a, const bT *b)
Definition: mtxmq.h:96
Function< T, NDIM > sum(World &world, const std::vector< Function< T, NDIM > > &f, bool fence=true)
Returns new function — q = sum_i f[i].
Definition: vmra.h:1421
static void mTxmT_reference(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const Q *MADNESS_RESTRICT a, const S *MADNESS_RESTRICT b)
Matrix += Matrix transpose * matrix transpose reference implementation (slow but correct)
Definition: mxm.h:139
void bgq_mtxmq_padded(long dimi, long dimj, long dimk, long extb, __complex__ double *c_x, const __complex__ double *a_x, const __complex__ double *b_x)
Definition: bgq_mtxm.cc:10
void mxmT(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const T *a, const T *b)
Matrix += Matrix * matrix transpose ... MKL interface version.
Definition: mxm.h:225
void mTxmq(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const T *a, const T *b, long ldb=-1)
Matrix = Matrix transpose * matrix ... MKL interface version.
Definition: mxm.h:257
static const double b
Definition: nonlinschro.cc:119
static const double a
Definition: nonlinschro.cc:118
double Q(double a)
Definition: relops.cc:20
static const double c
Definition: relops.cc:10
static const long k
Definition: rk.cc:44