MADNESS 0.10.1
mxm.h
Go to the documentation of this file.
1/*
2 This file is part of MADNESS.
3
4 Copyright (C) 2007,2010 Oak Ridge National Laboratory
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
20 For more information please contact:
21
22 Robert J. Harrison
23 Oak Ridge National Laboratory
24 One Bethel Valley Road
25 P.O. Box 2008, MS-6367
26
27 email: harrisonrj@ornl.gov
28 tel: 865-241-3937
29 fax: 865-572-0680
30
31
32 $Id$
33*/
34
35#ifndef MADNESS_TENSOR_MXM_H__INCLUDED
36#define MADNESS_TENSOR_MXM_H__INCLUDED
37
39
40// This just to check if config is actually working
41//#ifndef HAVE_MTXMQ
42//#error "MTXMQ missing"
43//#endif
44
45#define HAVE_FAST_BLAS
46#ifdef HAVE_FAST_BLAS
47//#ifdef HAVE_INTEL_MKL
49#endif
50
51/// \file tensor/mxm.h
52/// \brief Internal use only
53
54// This file is ONLY included into tensor.cc ... separated here just
55// to shrink file size. Don't try to include anywhere else
56
57// Due to both flakey compilers and performance concerns,
58// we use a simple reference implementation of the mxm
59// routines for all except T=double.
60
61
62namespace madness {
63
64 // Start with reference implementations. Then provide optimized implementations, falling back to reference if not available on specific platforms
65
66 /// Matrix \c += Matrix * matrix reference implementation (slow but correct)
67 template <typename T, typename Q, typename S>
68 static inline void mxm_reference(long dimi, long dimj, long dimk,
69 T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
70 const S* MADNESS_RESTRICT b) {
71 /*
72 c(i,j) = c(i,j) + sum(k) a(i,k)*b(k,j)
73
74 where it is assumed that the last index in each array is has unit
75 stride and the dimensions are as provided.
76 */
77
78 for (long i=0; i<dimi; ++i) {
79 for (long k=0; k<dimk; ++k) {
80 for (long j=0; j<dimj; ++j) {
81 c[i*dimj+j] += a[i*dimk+k]*b[k*dimj+j];
82 }
83 }
84 }
85 }
86
87
88 /// Matrix \c += Matrix transpose * matrix ... reference implementation (slow but correct)
89 template <typename T, typename Q, typename S>
90 static inline
91 void mTxm_reference(long dimi, long dimj, long dimk,
92 T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
93 const S* MADNESS_RESTRICT b) {
94 /*
95 c(i,j) = c(i,j) + sum(k) a(k,i)*b(k,j)
96
97 where it is assumed that the last index in each array is has unit
98 stride and the dimensions are as provided.
99
100 i loop might be long in anticipated application
101 */
102
103 for (long k=0; k<dimk; ++k) {
104 for (long j=0; j<dimj; ++j) {
105 for (long i=0; i<dimi; ++i) {
106 c[i*dimj+j] += a[k*dimi+i]*b[k*dimj+j];
107 }
108 }
109 }
110 }
111
112 /// Matrix \c += Matrix * matrix transpose ... reference implementation (slow but correct)
113 template <typename T, typename Q, typename S>
114 static inline void mxmT_reference (long dimi, long dimj, long dimk,
115 T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
116 const S* MADNESS_RESTRICT b) {
117 /*
118 c(i,j) = c(i,j) + sum(k) a(i,k)*b(j,k)
119
120 where it is assumed that the last index in each array is has unit
121 stride and the dimensions are as provided.
122
123 i loop might be long in anticipated application
124 */
125
126 for (long i=0; i<dimi; ++i) {
127 for (long j=0; j<dimj; ++j) {
128 T sum = 0;
129 for (long k=0; k<dimk; ++k) {
130 sum += a[i*dimk+k]*b[j*dimk+k];
131 }
132 c[i*dimj+j] += sum;
133 }
134 }
135 }
136
137 /// Matrix \c += Matrix transpose * matrix transpose reference implementation (slow but correct)
138 template <typename T, typename Q, typename S>
139 static inline void mTxmT_reference(long dimi, long dimj, long dimk,
140 T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
141 const S* MADNESS_RESTRICT b) {
142 /*
143 c(i,j) = c(i,j) + sum(k) a(k,i)*b(j,k)
144
145 where it is assumed that the last index in each array is has unit
146 stride and the dimensions are as provided.
147 */
148
149 for (long i=0; i<dimi; ++i) {
150 for (long j=0; j<dimj; ++j) {
151 for (long k=0; k<dimk; ++k) {
152 c[i*dimj+j] += a[k*dimi+i]*b[j*dimk+k];
153 }
154 }
155 }
156 }
157
158 /// Matrix = Matrix transpose * matrix ... slow reference implementation
159
160 /// This routine does \c C=AT*B whereas mTxm does C=C+AT*B.
161 /// \code
162 /// c(i,j) = sum(k) a(k,i)*b(k,j) <------ does not accumulate into C
163 /// \endcode
164 ///
165 /// \c ldb is the last dimension of b in C storage (the leading dimension
166 /// in fortran storage). It is here to accomodate multiplying by a matrix
167 /// stored with \c ldb>dimj which happens in madness when transforming with
168 /// low rank matrices. A matrix in dense storage has \c ldb=dimj which is
169 /// the default for backward compatibility.
170 template <typename aT, typename bT, typename cT>
171 void mTxmq_reference(long dimi, long dimj, long dimk,
172 cT* MADNESS_RESTRICT c, const aT* a, const bT* b, long ldb=-1) {
173 if (ldb == -1) ldb=dimj;
174 MADNESS_ASSERT(ldb>=dimj);
175 //std::cout << "IN GENERIC mTxmq " << tensor_type_names[TensorTypeData<aT>::id] << " " << tensor_type_names[TensorTypeData<bT>::id] << " " << tensor_type_names[TensorTypeData<cT>::id] << "\n";
176 for (long i=0; i<dimi; ++i,c+=dimj,++a) {
177 for (long j=0; j<dimj; ++j) c[j] = 0.0;
178 const aT *aik_ptr = a;
179 for (long k=0; k<dimk; ++k,aik_ptr+=dimi) {
180 aT aki = *aik_ptr;
181 for (long j=0; j<dimj; ++j) {
182 c[j] += aki*b[k*ldb+j];
183 }
184 }
185 }
186 }
187
188
189#if defined(HAVE_FAST_BLAS) && !defined(HAVE_INTEL_MKL)
190 // MKL provides support for mixed real/complex operations but most other libraries do not
191
    /// Matrix += Matrix * matrix ... BLAS/MKL interface version

    /// Does \c C=C+A*B
    /// \code
    ///    c(i,j) = c(i,j) + sum(k) a(i,k)*b(k,j)
    /// \endcode
    template <typename T>
    void mxm(long dimi, long dimj, long dimk,
             T* MADNESS_RESTRICT c, const T* a, const T* b) {
        // Row-major C+=A*B is computed as column-major C^T += B^T * A^T,
        // which is why b is passed as the first operand to gemm.
        const T one = 1.0; // alpha in *gemm
        cblas::gemm(cblas::NoTrans,cblas::NoTrans,dimj,dimi,dimk,one,b,dimj,a,dimk,one,c,dimj);
    }
204
    /// Matrix += Matrix transpose * matrix ... MKL interface version

    /// Does \c C=C+AT*B
    /// \code
    ///    c(i,j) = c(i,j) + sum(k) a(k,i)*b(k,j)
    /// \endcode
    template <typename T>
    void mTxm(long dimi, long dimj, long dimk,
              T* MADNESS_RESTRICT c, const T* a, const T* b) {
        // Row-major C+=A^T*B is computed as column-major C^T += B^T * A;
        // a is stored dimk x dimi, hence its leading dimension is dimi.
        const T one = 1.0; // alpha in *gemm
        cblas::gemm(cblas::NoTrans,cblas::Trans,dimj,dimi,dimk,one,b,dimj,a,dimi,one,c,dimj);
    }
217
    /// Matrix += Matrix * matrix transpose ... MKL interface version

    /// Does \c C=C+A*BT
    /// \code
    ///    c(i,j) = c(i,j) + sum(k) a(i,k)*b(j,k)
    /// \endcode
    template <typename T>
    void mxmT(long dimi, long dimj, long dimk,
              T* MADNESS_RESTRICT c, const T* a, const T* b) {
        // Row-major C+=A*B^T is computed as column-major C^T += B * A^T;
        // both a and b are stored with row length dimk.
        const T one = 1.0; // alpha in *gemm
        cblas::gemm(cblas::Trans,cblas::NoTrans,dimj,dimi,dimk,one,b,dimk,a,dimk,one,c,dimj);
    }
230
    /// Matrix += Matrix transpose * matrix transpose ... MKL interface version

    /// Does \c C=C+AT*BT
    /// \code
    ///    c(i,j) = c(i,j) + sum(k) a(k,i)*b(j,k)
    /// \endcode
    template <typename T>
    void mTxmT(long dimi, long dimj, long dimk,
               T* MADNESS_RESTRICT c, const T* a, const T* b) {
        // Row-major C+=A^T*B^T is computed as column-major C^T += B * A;
        // b rows have length dimk, a rows have length dimi.
        const T one = 1.0; // alpha in *gemm
        cblas::gemm(cblas::Trans,cblas::Trans,dimj,dimi,dimk,one,b,dimk,a,dimi,one,c,dimj);
    }
243
244 /// Matrix = Matrix transpose * matrix ... MKL interface version
245
246 /// Does \c C=AT*B whereas mTxm does C=C+AT*B.
247 /// \code
248 /// c(i,j) = sum(k) a(k,i)*b(k,j) <------ does not accumulate into C
249 /// \endcode
250 ///
251 /// \c ldb is the last dimension of b in C storage (the leading dimension
252 /// in fortran storage). It is here to accomodate multiplying by a matrix
253 /// stored with \c ldb>dimj which happens in madness when transforming with
254 /// low rank matrices. A matrix in dense storage has \c ldb=dimj which is
255 /// the default for backward compatibility.
256 template <typename T>
257 void mTxmq(long dimi, long dimj, long dimk,
258 T* MADNESS_RESTRICT c, const T* a, const T* b, long ldb=-1) {
259 if (ldb == -1) ldb=dimj;
260 MADNESS_ASSERT(ldb>=dimj);
261
262 if (dimi==0 || dimj==0) return; // nothing to do and *GEMM will complain
263 if (dimk==0) {
264 for (long i=0; i<dimi*dimj; i++) c[i] = 0.0;
265 }
266
267 const T one = 1.0; // alpha in *gemm
268 const T zero = 0.0; // beta in *gemm
269 cblas::gemm(cblas::NoTrans,cblas::Trans,dimj,dimi,dimk,one,b,ldb,a,dimi,zero,c,dimj);
270 }
271
272#ifdef HAVE_MTXMQ
273 template <>
274 void mTxmq(long dimi, long dimj, long dimk, double* MADNESS_RESTRICT c, const double* a, const double* b, long ldb);
275
    // Bootstrap complex*real from real*real
    //
    // Computes c = a^T * b with complex a,c and real b by splitting a into
    // real and imaginary planes and making two real mTxmq calls, which may
    // dispatch to the optimized real specialization.
    template <typename T>
    void mTxmq(long dimi, long dimj, long dimk, std::complex<T>* MADNESS_RESTRICT c, const std::complex<T>* a, const T* b, long ldb) {
        // Scratch: real/imag parts of c (dense dimi x dimj) and of a (dimi x dimk).
        // NOTE(review): raw new[] leaks if a later allocation throws part-way;
        // left as-is here to avoid any behavior change in this hot path.
        T* Rc = new T[dimi*dimj];
        T* Ic = new T[dimi*dimj];
        T* Ra = new T[dimi*dimk];
        T* Ia = new T[dimi*dimk];

        // Split a into its real and imaginary planes.
        for (long i=0; i<dimi*dimk; i++) {
            Ra[i] = a[i].real();
            Ia[i] = a[i].imag();
        }
        // Two real products: Re(c) = Re(a)^T b and Im(c) = Im(a)^T b.
        mTxmq(dimi,dimj,dimk,Rc,Ra,b,ldb);
        mTxmq(dimi,dimj,dimk,Ic,Ia,b,ldb);
        // Recombine into the dense complex output (row stride dimj).
        for (long i=0; i<dimi*dimj; i++) c[i] = std::complex<T>(Rc[i],Ic[i]);

        delete[] Rc;
        delete[] Ic;
        delete[] Ra;
        delete[] Ia;
    }
297
298#endif
299
300#endif
301
302#ifdef HAVE_INTEL_MKL
    /// Matrix += Matrix * matrix ... MKL interface version

    /// Does \c C=C+A*B
    /// \code
    ///    c(i,j) = c(i,j) + sum(k) a(i,k)*b(k,j)
    /// \endcode
    template <typename aT, typename bT, typename cT>
    void mxm(long dimi, long dimj, long dimk,
             cT* MADNESS_RESTRICT c, const aT* a, const bT* b) {
        // Row-major C+=A*B is computed as column-major C^T += B^T * A^T,
        // which is why b is passed as the first operand to gemm.
        // MKL supports the mixed real/complex operand types of this template.
        const cT one = 1.0; // alpha in *gemm
        cblas::gemm(cblas::NoTrans,cblas::NoTrans,dimj,dimi,dimk,one,b,dimj,a,dimk,one,c,dimj);
    }
315
    /// Matrix += Matrix transpose * matrix ... MKL interface version

    /// Does \c C=C+AT*B
    /// \code
    ///    c(i,j) = c(i,j) + sum(k) a(k,i)*b(k,j)
    /// \endcode
    template <typename aT, typename bT, typename cT>
    void mTxm(long dimi, long dimj, long dimk,
              cT* MADNESS_RESTRICT c, const aT* a, const bT* b) {
        // Row-major C+=A^T*B is computed as column-major C^T += B^T * A;
        // a is stored dimk x dimi, hence its leading dimension is dimi.
        const cT one = 1.0; // alpha in *gemm
        cblas::gemm(cblas::NoTrans,cblas::Trans,dimj,dimi,dimk,one,b,dimj,a,dimi,one,c,dimj);
    }
328
    /// Matrix += Matrix * matrix transpose ... MKL interface version

    /// Does \c C=C+A*BT
    /// \code
    ///    c(i,j) = c(i,j) + sum(k) a(i,k)*b(j,k)
    /// \endcode
    template <typename aT, typename bT, typename cT>
    void mxmT(long dimi, long dimj, long dimk,
              cT* MADNESS_RESTRICT c, const aT* a, const bT* b) {
        // Row-major C+=A*B^T is computed as column-major C^T += B * A^T;
        // both a and b are stored with row length dimk.
        const cT one = 1.0; // alpha in *gemm
        cblas::gemm(cblas::Trans,cblas::NoTrans,dimj,dimi,dimk,one,b,dimk,a,dimk,one,c,dimj);
    }
341
    /// Matrix += Matrix transpose * matrix transpose ... MKL interface version

    /// Does \c C=C+AT*BT
    /// \code
    ///    c(i,j) = c(i,j) + sum(k) a(k,i)*b(j,k)
    /// \endcode
    template <typename aT, typename bT, typename cT>
    void mTxmT(long dimi, long dimj, long dimk,
               cT* MADNESS_RESTRICT c, const aT* a, const bT* b) {
        // Row-major C+=A^T*B^T is computed as column-major C^T += B * A;
        // b rows have length dimk, a rows have length dimi.
        const cT one = 1.0; // alpha in *gemm
        cblas::gemm(cblas::Trans,cblas::Trans,dimj,dimi,dimk,one,b,dimk,a,dimi,one,c,dimj);
    }
354
355 /// Matrix = Matrix transpose * matrix ... MKL interface version
356
357 /// Does \c C=AT*B whereas mTxm does C=C+AT*B.
358 /// \code
359 /// c(i,j) = sum(k) a(k,i)*b(k,j) <------ does not accumulate into C
360 /// \endcode
361 ///
362 /// \c ldb is the last dimension of b in C storage (the leading dimension
363 /// in fortran storage). It is here to accomodate multiplying by a matrix
364 /// stored with \c ldb>dimj which happens in madness when transforming with
365 /// low rank matrices. A matrix in dense storage has \c ldb=dimj which is
366 /// the default for backward compatibility.
367 template <typename aT, typename bT, typename cT>
368 void mTxmq(long dimi, long dimj, long dimk,
369 cT* MADNESS_RESTRICT c, const aT* a, const bT* b, long ldb=-1) {
370 if (ldb == -1) ldb=dimj;
371 MADNESS_ASSERT(ldb>=dimj);
372
373 if (dimi==0 || dimj==0) return; // nothing to do and *GEMM will complain
374 if (dimk==0) {
375 for (long i=0; i<dimi*dimj; i++) c[i] = 0.0;
376 }
377
378 const cT one = 1.0; // alpha in *gemm
379 const cT zero = 0.0; // beta in *gemm
380 cblas::gemm(cblas::NoTrans,cblas::Trans,dimj,dimi,dimk,one,b,ldb,a,dimi,zero,c,dimj);
381 }
382
383#ifdef HAVE_MTXMQ
384template <>
385void mTxmq(long dimi, long dimj, long dimk, double* MADNESS_RESTRICT c, const double* a, const double* b, long ldb);
386#endif
387
388#else
389
390 // Fall back to reference implementations
391
392 template <typename T, typename Q, typename S>
393 static inline void mxm(long dimi, long dimj, long dimk,
394 T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
395 const S* MADNESS_RESTRICT b) {
396 mxm_reference(dimi, dimj, dimk, c, a, b);
397 }
398
399 template <typename T, typename Q, typename S>
400 static inline
401 void mTxm(long dimi, long dimj, long dimk,
402 T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
403 const S* MADNESS_RESTRICT b) {
404 mTxm_reference(dimi, dimj, dimk, c, a, b);
405 }
406
407 template <typename T, typename Q, typename S>
408 static inline void mxmT(long dimi, long dimj, long dimk,
409 T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
410 const S* MADNESS_RESTRICT b) {
411 mxmT_reference(dimi, dimj, dimk, c, a, b);
412 }
413
414 template <typename T, typename Q, typename S>
415 static inline void mTxmT(long dimi, long dimj, long dimk,
416 T* MADNESS_RESTRICT c, const Q* MADNESS_RESTRICT a,
417 const S* MADNESS_RESTRICT b) {
418 mTxmT_reference(dimi, dimj, dimk, c, a, b);
419 }
420
421 template <typename aT, typename bT, typename cT>
422 void mTxmq(long dimi, long dimj, long dimk,
423 cT* MADNESS_RESTRICT c, const aT* a, const bT* b, long ldb=-1) {
424 mTxmq_reference(dimi, dimj, dimk, c, a, b, ldb);
425 }
426
427 // The following are restricted to double only
428
    /// Matrix transpose * matrix (hand unrolled version)

    template <>
    inline void mTxm(long dimi, long dimj, long dimk,
                     double* MADNESS_RESTRICT c, const double* MADNESS_RESTRICT a,
                     const double* MADNESS_RESTRICT b) {
        /*
          c(i,j) = c(i,j) + sum(k) a(k,i)*b(k,j)  <--- NOTE ACCUMULATION INTO C

          where it is assumed that the last index in each array is has unit
          stride and the dimensions are as provided.

          i loop might be long in anticipated application

          4-way unrolled k loop ... empirically fastest on PIII
          compared to 2/3 way unrolling (though not by much).
        */

        long dimk4 = (dimk/4)*4;   // largest multiple of 4 <= dimk
        for (long i=0; i<dimi; ++i,c+=dimj) {   // c advances one row per i
            const double* ai = a+i;   // column i of a, elements stride dimi apart
            const double* p = b;
            // Main loop: four values of k per iteration.
            for (long k=0; k<dimk4; k+=4,ai+=4*dimi,p+=4*dimj) {
                double ak0i = ai[0 ];
                double ak1i = ai[dimi];
                double ak2i = ai[dimi+dimi];
                double ak3i = ai[dimi+dimi+dimi];
                const double* bk0 = p;
                const double* bk1 = p+dimj;
                const double* bk2 = p+dimj+dimj;
                const double* bk3 = p+dimj+dimj+dimj;
                for (long j=0; j<dimj; ++j) {
                    c[j] += ak0i*bk0[j] + ak1i*bk1[j] + ak2i*bk2[j] + ak3i*bk3[j];
                }
            }
            // Cleanup loop for the remaining 0-3 values of k.
            for (long k=dimk4; k<dimk; ++k) {
                double aki = a[k*dimi+i];
                const double* bk = b+k*dimj;
                for (long j=0; j<dimj; ++j) {
                    c[j] += aki*bk[j];
                }
            }
        }
    }
473
474
475
    /// Matrix * matrix transpose (hand unrolled version)

    template <>
    inline void mxmT(long dimi, long dimj, long dimk,
                     double* MADNESS_RESTRICT c,
                     const double* MADNESS_RESTRICT a, const double* MADNESS_RESTRICT b) {
        /*
          c(i,j) = c(i,j) + sum(k) a(i,k)*b(j,k)

          where it is assumed that the last index in each array is has unit
          stride and the dimensions are as provided.

          j loop might be long in anticipated application

          Unrolled i loop.  Empirically fastest on PIII compared
          to unrolling j, or both i&j.
        */

        long dimi2 = (dimi/2)*2;   // largest even number <= dimi
        // Main loop: two rows of a (and c) per iteration share each load of bj.
        for (long i=0; i<dimi2; i+=2) {
            const double* ai0 = a+i*dimk;        // row i of a
            const double* ai1 = a+i*dimk+dimk;   // row i+1 of a
            double* MADNESS_RESTRICT ci0 = c+i*dimj;       // row i of c
            double* MADNESS_RESTRICT ci1 = c+i*dimj+dimj;  // row i+1 of c
            for (long j=0; j<dimj; ++j) {
                double sum0 = 0;
                double sum1 = 0;
                const double* bj = b + j*dimk;   // row j of b (transposed access)
                for (long k=0; k<dimk; ++k) {
                    sum0 += ai0[k]*bj[k];
                    sum1 += ai1[k]*bj[k];
                }
                ci0[j] += sum0;
                ci1[j] += sum1;
            }
        }
        // Cleanup for an odd final row.
        for (long i=dimi2; i<dimi; ++i) {
            const double* ai = a+i*dimk;
            double* MADNESS_RESTRICT ci = c+i*dimj;
            for (long j=0; j<dimj; ++j) {
                double sum = 0;
                const double* bj = b+j*dimk;
                for (long k=0; k<dimk; ++k) {
                    sum += ai[k]*bj[k];
                }
                ci[j] += sum;
            }
        }
    }
525
    /// Matrix * matrix (hand unrolled version)
    template <>
    inline void mxm(long dimi, long dimj, long dimk,
                    double* MADNESS_RESTRICT c, const double* MADNESS_RESTRICT a, const double* MADNESS_RESTRICT b) {
        /*
          c(i,j) = c(i,j) + sum(k) a(i,k)*b(k,j)

          where it is assumed that the last index in each array is has unit
          stride and the dimensions are as provided.

          4-way unrolled k loop ... empirically fastest on PIII
          compared to 2/3 way unrolling (though not by much).
        */

        long dimk4 = (dimk/4)*4;   // largest multiple of 4 <= dimk
        for (long i=0; i<dimi; ++i, c+=dimj,a+=dimk) {  // c,a advance one row per i
            const double* p = b;
            // Main loop: four values of k per iteration.
            for (long k=0; k<dimk4; k+=4,p+=4*dimj) {
                double aik0 = a[k  ];
                double aik1 = a[k+1];
                double aik2 = a[k+2];
                double aik3 = a[k+3];
                const double* bk0 = p;
                const double* bk1 = bk0+dimj;
                const double* bk2 = bk1+dimj;
                const double* bk3 = bk2+dimj;
                for (long j=0; j<dimj; ++j) {
                    c[j] += aik0*bk0[j] + aik1*bk1[j] + aik2*bk2[j] + aik3*bk3[j];
                }
            }
            // Cleanup for the remaining 0-3 values of k.
            for (long k=dimk4; k<dimk; ++k) {
                double aik = a[k];
                for (long j=0; j<dimj; ++j) {
                    c[j] += aik*b[k*dimj+j];
                }
            }
        }
    }
564
    /// Matrix transpose * matrix transpose (hand tiled and unrolled)
    template <>
    inline void mTxmT(long dimi, long dimj, long dimk,
                      double* MADNESS_RESTRICT csave, const double* MADNESS_RESTRICT asave, const double* MADNESS_RESTRICT b) {
        /*
          c(i,j) = c(i,j) + sum(k) a(k,i)*b(j,k)

          where it is assumed that the last index in each array is has unit
          stride and the dimensions are as provided.

          Tiled k, copy row of a into temporary, and unroll j once.
        */

        const int ktile=32;        // tile size along k
        double ai[ktile];          // contiguous copy of column i of a for this tile
        long dimj2 = (dimj/2)*2;   // largest even number <= dimj

        // Tile over k; asave and b advance to the start of each tile.
        for (long klo=0; klo<dimk; klo+=ktile, asave+=ktile*dimi, b+=ktile) {
            long khi = klo+ktile;
            if (khi > dimk) khi = dimk;
            long nk = khi-klo;     // actual tile height (short at the end)

            const double * MADNESS_RESTRICT a = asave;
            double * MADNESS_RESTRICT c = csave;
            for (long i=0; i<dimi; ++i,c+=dimj,++a) {
                // Gather column i of a (stride dimi) into the contiguous buffer.
                const double* q = a;
                for (long k=0; k<nk; ++k,q+=dimi) ai[k] = *q;

                // Main loop: two columns of b (two output j) per iteration.
                const double* bj0 = b;
                for (long j=0; j<dimj2; j+=2,bj0+=2*dimk) {
                    const double* bj1 = bj0+dimk;
                    double sum0 = 0;
                    double sum1 = 0;
                    for (long k=0; k<nk; ++k) {
                        sum0 += ai[k]*bj0[k];
                        sum1 += ai[k]*bj1[k];
                    }
                    c[j  ] += sum0;
                    c[j+1] += sum1;
                }

                // Cleanup for an odd final j.
                for (long j=dimj2; j<dimj; ++j,bj0+=dimk) {
                    double sum = 0;
                    for (long k=0; k<nk; ++k) {
                        sum += ai[k]*bj0[k];
                    }
                    c[j] += sum;
                }
            }
        }
    }
616
617 /*
618 * mtxm, but with padded buffers.
619 *
620 * ext_b is the extent of the b array, so shrink() isn't needed.
621 */
622 template <typename aT, typename bT, typename cT>
623 void mTxmq_padding(long dimi, long dimj, long dimk, long ext_b,
624 cT* c, const aT* a, const bT* b) {
625 const int alignment = 4;
626 bool free_b = false;
627 long effj = dimj;
628
629 /* Setup a buffer for c if needed */
630 cT* c_buf = c;
631 if (dimj%alignment) {
632 effj = (dimj | 3) + 1;
633 c_buf = (cT*)malloc(sizeof(cT)*dimi*effj);
634 }
635
636 /* Copy b into a buffer if needed */
637 if (ext_b%alignment) {
638 free_b = true;
639 bT* b_buf = (bT*)malloc(sizeof(bT)*dimk*effj);
640
641 bT* bp = b_buf;
642 for (long k=0; k<dimk; k++, bp += effj, b += ext_b)
643 memcpy(bp, b, sizeof(bT)*dimj);
644
645 b = b_buf;
646 ext_b = effj;
647 }
648
649 cT* c_work = c_buf;
650 /* mTxm */
651 for (long i=0; i<dimi; ++i,c_work+=effj,++a) {
652 for (long j=0; j<dimj; ++j) c_work[j] = 0.0;
653 const aT *aik_ptr = a;
654 for (long k=0; k<dimk; ++k,aik_ptr+=dimi) {
655 aT aki = *aik_ptr;
656 for (long j=0; j<dimj; ++j) {
657 c_work[j] += aki*b[k*ext_b+j];
658 }
659 }
660 }
661
662 /* Copy c out if needed */
663 if (dimj%alignment) {
664 cT* ct = c_buf;
665 for (long i=0; i<dimi; i++, ct += effj, c += dimj)
666 memcpy(c, ct, sizeof(cT)*dimj);
667
668 free(c_buf);
669 }
670
671 /* Free the buffer for b */
672 if (free_b) free((bT*)b);
673 }
674
#ifdef HAVE_IBMBGQ
    // Blue Gene/Q assembly kernels (see bgq_mtxm.cc); same contract as
    // mTxmq_padding for each real/complex operand combination.
    extern void bgq_mtxmq_padded(long ni, long nj, long nk, long ej,
                                 double* c, const double* a, const double* b);
    extern void bgq_mtxmq_padded(long ni, long nj, long nk, long ej,
                                 __complex__ double* c, const __complex__ double* a, const __complex__ double* b);
    extern void bgq_mtxmq_padded(long ni, long nj, long nk, long ej,
                                 __complex__ double* c, const double* a, const __complex__ double* b);
    extern void bgq_mtxmq_padded(long ni, long nj, long nk, long ej,
                                 __complex__ double* c, const __complex__ double* a, const double* b);

    // Route the four operand combinations of mTxmq_padding to the BG/Q kernels.
    template <>
    inline void mTxmq_padding(long ni, long nj, long nk, long ej,
                              double* c, const double* a, const double* b) {
        bgq_mtxmq_padded(ni, nj, nk, ej, c, a, b);
    }

    template <>
    inline void mTxmq_padding(long ni, long nj, long nk, long ej,
                              __complex__ double* c, const __complex__ double* a, const __complex__ double* b) {
        bgq_mtxmq_padded(ni, nj, nk, ej, c, a, b);
    }

    template <>
    inline void mTxmq_padding(long ni, long nj, long nk, long ej,
                              __complex__ double* c, const double* a, const __complex__ double* b) {
        bgq_mtxmq_padded(ni, nj, nk, ej, c, a, b);
    }

    template <>
    inline void mTxmq_padding(long ni, long nj, long nk, long ej,
                              __complex__ double* c, const __complex__ double* a, const double* b) {
        bgq_mtxmq_padded(ni, nj, nk, ej, c, a, b);
    }
#elif defined(HAVE_IBMBGP)
    // Blue Gene/P kernels; these specialize mTxmq (not the padded variant).
    extern void bgpmTxmq(long ni, long nj, long nk, double* MADNESS_RESTRICT c,
                         const double* a, const double* b);
    extern void bgpmTxmq(long ni, long nj, long nk, double_complex* MADNESS_RESTRICT c,
                         const double_complex* a, const double_complex* b);

    template <>
    inline void mTxmq(long ni, long nj, long nk, double* MADNESS_RESTRICT c, const double* a, const double* b) {
        bgpmTxmq(ni, nj, nk, c, a, b);
    }

    template <>
    inline void mTxmq(long ni, long nj, long nk, double_complex* MADNESS_RESTRICT c, const double_complex* a, const double_complex* b) {
        bgpmTxmq(ni, nj, nk, c, a, b);
    }

//     NOTE(review): dormant SSE3/x86 specializations retained below for reference.
// #elif defined(X86_64) && !defined(DISABLE_SSE3)
//     template <>
//     void mTxmq(long dimi, long dimj, long dimk,
//                double* MADNESS_RESTRICT c, const double* a, const double* b);

//     template <>
//     void mTxmq(long dimi, long dimj, long dimk,
//                double_complex* MADNESS_RESTRICT c, const double_complex* a, const double_complex* b);

// #ifndef __INTEL_COMPILER
//     template <>
//     void mTxmq(long dimi, long dimj, long dimk,
//                double_complex* MADNESS_RESTRICT c, const double_complex* a, const double* b);
// #endif

// #elif defined(X86_32)
//     template <>
//     void mTxmq(long dimi, long dimj, long dimk,
//                double* MADNESS_RESTRICT c, const double* a, const double* b);
#endif // HAVE_IBMBGQ
744
745#endif // HAVE_INTEL_MKL
746
747}
748#endif // MADNESS_TENSOR_MXM_H__INCLUDED
double q(double t)
Definition DKops.h:18
Define BLAS like functions.
std::complex< double > double_complex
Definition cfft.h:14
char * malloc()
char * p(char *buf, const char *name, int k, int initial_level, double thresh, int order)
Definition derivatives.cc:72
auto T(World &world, response_space &f) -> response_space
Definition global_functions.cc:34
Macros and tools pertaining to the configuration of MADNESS.
#define MADNESS_ASSERT(condition)
Assert a condition that should be free of side-effects since in release builds this might be a no-op.
Definition madness_exception.h:134
void gemm(const CBLAS_TRANSPOSE OpA, const CBLAS_TRANSPOSE OpB, const integer m, const integer n, const integer k, const float alpha, const float *a, const integer lda, const float *b, const integer ldb, const float beta, float *c, const integer ldc)
Multiplies a matrix by a vector.
Definition cblas.h:352
@ NoTrans
Definition cblas_types.h:78
@ Trans
Definition cblas_types.h:79
Namespace for all elements and tools of MADNESS.
Definition DFParameters.h:10
static void mTxm_reference(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const Q *MADNESS_RESTRICT a, const S *MADNESS_RESTRICT b)
Matrix += Matrix transpose * matrix ... reference implementation (slow but correct)
Definition mxm.h:91
void mTxmT(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const T *a, const T *b)
Matrix += Matrix transpose * matrix transpose ... MKL interface version.
Definition mxm.h:238
void mxm(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const T *a, const T *b)
Matrix += Matrix * matrix ... BLAS/MKL interface version.
Definition mxm.h:199
static void mxm_reference(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const Q *MADNESS_RESTRICT a, const S *MADNESS_RESTRICT b)
Matrix += Matrix * matrix reference implementation (slow but correct)
Definition mxm.h:68
void mTxm(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const T *a, const T *b)
Matrix += Matrix transpose * matrix ... MKL interface version.
Definition mxm.h:212
static void mxmT_reference(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const Q *MADNESS_RESTRICT a, const S *MADNESS_RESTRICT b)
Matrix += Matrix * matrix transpose ... reference implementation (slow but correct)
Definition mxm.h:114
void mTxmq_reference(long dimi, long dimj, long dimk, cT *MADNESS_RESTRICT c, const aT *a, const bT *b, long ldb=-1)
Matrix = Matrix transpose * matrix ... slow reference implementation.
Definition mxm.h:171
void mTxmq_padding(long dimi, long dimj, long dimk, long ext_b, cT *c, const aT *a, const bT *b)
Definition mtxmq.h:96
static void mTxmT_reference(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const Q *MADNESS_RESTRICT a, const S *MADNESS_RESTRICT b)
Matrix += Matrix transpose * matrix transpose reference implementation (slow but correct)
Definition mxm.h:139
void bgq_mtxmq_padded(long dimi, long dimj, long dimk, long extb, __complex__ double *c_x, const __complex__ double *a_x, const __complex__ double *b_x)
Definition bgq_mtxm.cc:10
void mxmT(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const T *a, const T *b)
Matrix += Matrix * matrix transpose ... MKL interface version.
Definition mxm.h:225
void mTxmq(long dimi, long dimj, long dimk, T *MADNESS_RESTRICT c, const T *a, const T *b, long ldb=-1)
Matrix = Matrix transpose * matrix ... MKL interface version.
Definition mxm.h:257
static const double b
Definition nonlinschro.cc:119
static const double a
Definition nonlinschro.cc:118
double Q(double a)
Definition relops.cc:20
static const double c
Definition relops.cc:10
static const long k
Definition rk.cc:44
AtomicInt sum
Definition test_atomicint.cc:46