MADNESS 0.10.1
aligned.h
Go to the documentation of this file.
1/*
2 This file is part of MADNESS.
3
4 Copyright (C) 2007,2010 Oak Ridge National Laboratory
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
20 For more information please contact:
21
22 Robert J. Harrison
23 Oak Ridge National Laboratory
24 One Bethel Valley Road
25 P.O. Box 2008, MS-6367
26
27 email: harrisonrj@ornl.gov
28 tel: 865-241-3937
29 fax: 865-572-0680
30
31 $Id$
32*/
33#ifndef MADNESS_TENSOR_ALIGNED_H__INCLUDED
34#define MADNESS_TENSOR_ALIGNED_H__INCLUDED
35
36/*!
37 \file tensor/aligned.h
38 \brief Provides routines for internal use optimized for aligned data
39
40 This stuff used to be implemented in assembly but it is too much
41 effort keeping that working especially for multiple compilers.
42*/
43
47#include <cstring>
48#include <climits>
49
50namespace madness {
51
52 template <typename T>
53 static
54 inline
55 void aligned_zero(long n, T* a) {
56#ifdef HAVE_MEMSET
57 // A hand coded SSE2 loop is faster only for data in the L1 cache
58 std::memset((void *) a, 0, n*sizeof(T));
59#else
60 long n4 = (n>>2)<<2;
61 long rem = n-n4;
62 for (long i=0; i<n4; i+=4,a+=4) {
63 a[0] = 0;
64 a[1] = 0;
65 a[2] = 0;
66 a[3] = 0;
67 }
68 for (long i=0; i<rem; ++i) *a++ = 0;
69#endif
70 }
71
72 template <typename T, typename Q>
73 static
74 inline
75 void aligned_axpy(long n, T* MADNESS_RESTRICT a, const T* MADNESS_RESTRICT b, Q s) {
76 long n4 = (n>>2)<<2;
77 long rem = n-n4;
78 for (long i=0; i<n4; i+=4,a+=4,b+=4) {
79 a[0] += s*b[0];
80 a[1] += s*b[1];
81 a[2] += s*b[2];
82 a[3] += s*b[3];
83 }
84 for (long i=0; i<rem; ++i) *a++ += s * *b++;
85 }
86
87 /* Jeff: In the following three template specializations, a long is implicitly
88 * cast into the MADNESS integer type, which defaults to int64_t but can
89 * be int32_t, in which case, there could be an overflow for n>INT_MAX.
90 *
91 * I am choosing to ignore this issue for now. I know all of the workarounds
92 * but it seems unlikely that they will be necessary because 2^31 is a big number. */
93 template <>
94 //static
95 inline
96 void aligned_axpy(long n, double * MADNESS_RESTRICT a, const double * MADNESS_RESTRICT b, double s) {
97 madness::cblas::axpy((integer)n, s, (double*)b, 1, (double*)a, 1);
98 }
99
100 /* Jeff: I have no idea if casting double_complex to complex_real8 is valid... */
101
102 template <>
103 //static
104 inline
105 void aligned_axpy(long n, double_complex * MADNESS_RESTRICT a, const double_complex * MADNESS_RESTRICT b, double_complex s) {
107 }
108
109 template <>
110 //static
111 inline
112 void aligned_axpy(long n, double_complex * MADNESS_RESTRICT a, const double_complex * MADNESS_RESTRICT b, double s) {
113 complex_real8 cs(s,0.0); // turn real into complex
115 }
116
117 template <typename T, typename Q>
118 static
119 inline
120 void aligned_add(long n, T* MADNESS_RESTRICT a, const Q* MADNESS_RESTRICT b) {
121 long n4 = (n>>2)<<2;
122 long rem = n-n4;
123 for (long i=0; i<n4; i+=4,a+=4,b+=4) {
124 a[0] += b[0];
125 a[1] += b[1];
126 a[2] += b[2];
127 a[3] += b[3];
128 }
129 for (long i=0; i<rem; ++i) *a++ += *b++;
130 }
131
132 template <typename T, typename Q>
133 static
134 inline
135 void aligned_sub(long n, T* MADNESS_RESTRICT a, const Q* MADNESS_RESTRICT b) {
136 long n4 = (n>>2)<<2;
137 long rem = n-n4;
138 for (long i=0; i<n4; i+=4,a+=4,b+=4) {
139 a[0] -= b[0];
140 a[1] -= b[1];
141 a[2] -= b[2];
142 a[3] -= b[3];
143 }
144 for (long i=0; i<rem; ++i) *a++ -= *b++;
145 }
146}
147
148#endif // MADNESS_TENSOR_ALIGNED_H__INCLUDED
Define BLAS like functions.
std::complex< double > double_complex
Definition cfft.h:14
int integer
Definition crayio.c:25
std::complex< double > complex_real8
Fortran double complex.
Definition fortran_ctypes.h:83
auto T(World &world, response_space &f) -> response_space
Definition global_functions.cc:34
Macros and tools pertaining to the configuration of MADNESS.
void axpy(const integer n, const float alpha, float *x, const integer incx, float *y, const integer incy)
Scale and add a vector to another.
Definition cblas.h:687
Namespace for all elements and tools of MADNESS.
Definition DFParameters.h:10
void aligned_add(long n, double *MADNESS_RESTRICT a, const double *MADNESS_RESTRICT b)
static void aligned_zero(long n, T *a)
Definition aligned.h:55
void aligned_sub(long n, double *MADNESS_RESTRICT a, const double *MADNESS_RESTRICT b)
static void aligned_axpy(long n, T *MADNESS_RESTRICT a, const T *MADNESS_RESTRICT b, Q s)
Definition aligned.h:75
static const double b
Definition nonlinschro.cc:119
static const double a
Definition nonlinschro.cc:118
double Q(double a)
Definition relops.cc:20
Defines and implements most of Tensor.