MADNESS  0.10.1
aligned.h
Go to the documentation of this file.
1 /*
2  This file is part of MADNESS.
3 
4  Copyright (C) 2007,2010 Oak Ridge National Laboratory
5 
6  This program is free software; you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation; either version 2 of the License, or
9  (at your option) any later version.
10 
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with this program; if not, write to the Free Software
18  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 
20  For more information please contact:
21 
22  Robert J. Harrison
23  Oak Ridge National Laboratory
24  One Bethel Valley Road
25  P.O. Box 2008, MS-6367
26 
27  email: harrisonrj@ornl.gov
28  tel: 865-241-3937
29  fax: 865-572-0680
30 
31  $Id$
32 */
33 #ifndef MADNESS_TENSOR_ALIGNED_H__INCLUDED
34 #define MADNESS_TENSOR_ALIGNED_H__INCLUDED
35 
36 /*!
37  \file tensor/aligned.h
38  \brief Provides routines for internal use optimized for aligned data
39 
40  This stuff used to be implemented in assembly but it is too much
41  effort keeping that working especially for multiple compilers.
42 */
43 
44 #include <madness/madness_config.h>
45 #include <madness/tensor/tensor.h>
46 #include <madness/tensor/cblas.h>
47 #include <cstring>
48 #include <climits>
49 
50 namespace madness {
51 
52  template <typename T>
53  static
54  inline
55  void aligned_zero(long n, T* a) {
56 #ifdef HAVE_MEMSET
57  // A hand coded SSE2 loop is faster only for data in the L1 cache
58  std::memset((void *) a, 0, n*sizeof(T));
59 #else
60  long n4 = (n>>2)<<2;
61  long rem = n-n4;
62  for (long i=0; i<n4; i+=4,a+=4) {
63  a[0] = 0;
64  a[1] = 0;
65  a[2] = 0;
66  a[3] = 0;
67  }
68  for (long i=0; i<rem; ++i) *a++ = 0;
69 #endif
70  }
71 
72  template <typename T, typename Q>
73  static
74  inline
75  void aligned_axpy(long n, T* MADNESS_RESTRICT a, const T* MADNESS_RESTRICT b, Q s) {
76  long n4 = (n>>2)<<2;
77  long rem = n-n4;
78  for (long i=0; i<n4; i+=4,a+=4,b+=4) {
79  a[0] += s*b[0];
80  a[1] += s*b[1];
81  a[2] += s*b[2];
82  a[3] += s*b[3];
83  }
84  for (long i=0; i<rem; ++i) *a++ += s * *b++;
85  }
86 
87  /* Jeff: In the following three template specializations, a long is implicitly
88  * cast into the MADNESS integer type, which defaults to int64_t but can
89  * be int32_t, in which case, there could be an overflow for n>INT_MAX.
90  *
91  * I am choosing to ignore this issue for now. I know all of the workarounds
92  * but it seems unlikely that they will be necessary because 2^31 is a big number. */
93  template <>
94  //static
95  inline
96  void aligned_axpy(long n, double * MADNESS_RESTRICT a, const double * MADNESS_RESTRICT b, double s) {
97  madness::cblas::axpy((integer)n, s, (double*)b, 1, (double*)a, 1);
98  }
99 
100  /* Jeff: I have no idea if casting double_complex to complex_real8 is valid... */
101 
102  template <>
103  //static
104  inline
105  void aligned_axpy(long n, double_complex * MADNESS_RESTRICT a, const double_complex * MADNESS_RESTRICT b, double_complex s) {
107  }
108 
109  template <>
110  //static
111  inline
112  void aligned_axpy(long n, double_complex * MADNESS_RESTRICT a, const double_complex * MADNESS_RESTRICT b, double s) {
113  complex_real8 cs(s,0.0); // turn real into complex
115  }
116 
117  template <typename T, typename Q>
118  static
119  inline
120  void aligned_add(long n, T* MADNESS_RESTRICT a, const Q* MADNESS_RESTRICT b) {
121  long n4 = (n>>2)<<2;
122  long rem = n-n4;
123  for (long i=0; i<n4; i+=4,a+=4,b+=4) {
124  a[0] += b[0];
125  a[1] += b[1];
126  a[2] += b[2];
127  a[3] += b[3];
128  }
129  for (long i=0; i<rem; ++i) *a++ += *b++;
130  }
131 
132  template <typename T, typename Q>
133  static
134  inline
135  void aligned_sub(long n, T* MADNESS_RESTRICT a, const Q* MADNESS_RESTRICT b) {
136  long n4 = (n>>2)<<2;
137  long rem = n-n4;
138  for (long i=0; i<n4; i+=4,a+=4,b+=4) {
139  a[0] -= b[0];
140  a[1] -= b[1];
141  a[2] -= b[2];
142  a[3] -= b[3];
143  }
144  for (long i=0; i<rem; ++i) *a++ -= *b++;
145  }
146 }
147 
148 #endif // MADNESS_TENSOR_ALIGNED_H__INCLUDED
Define BLAS like functions.
std::complex< double > double_complex
Definition: cfft.h:14
int integer
Definition: crayio.c:25
std::complex< double > complex_real8
Fortran double complex.
Definition: fortran_ctypes.h:83
auto T(World &world, response_space &f) -> response_space
Definition: global_functions.cc:34
Macros and tools pertaining to the configuration of MADNESS.
void axpy(const integer n, const float alpha, float *x, const integer incx, float *y, const integer incy)
Scale and add a vector to another.
Definition: cblas.h:687
File holds all helper structures necessary for the CC_Operator and CC2 class.
Definition: DFParameters.h:10
void aligned_add(long n, double *MADNESS_RESTRICT a, const double *MADNESS_RESTRICT b)
static void aligned_zero(long n, T *a)
Definition: aligned.h:55
void aligned_sub(long n, double *MADNESS_RESTRICT a, const double *MADNESS_RESTRICT b)
static void aligned_axpy(long n, T *MADNESS_RESTRICT a, const T *MADNESS_RESTRICT b, Q s)
Definition: aligned.h:75
static const double b
Definition: nonlinschro.cc:119
static const double a
Definition: nonlinschro.cc:118
double Q(double a)
Definition: relops.cc:20
Defines and implements most of Tensor.