madness/api-doc/aligned_8h_source.html

/*

  This file is part of MADNESS.


  Copyright (C) 2007,2010 Oak Ridge National Laboratory


  This program is free software; you can redistribute it and/or modify

  it under the terms of the GNU General Public License as published by

  the Free Software Foundation; either version 2 of the License, or

  (at your option) any later version.


  This program is distributed in the hope that it will be useful,

  but WITHOUT ANY WARRANTY; without even the implied warranty of

  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

  GNU General Public License for more details.


  You should have received a copy of the GNU General Public License

  along with this program; if not, write to the Free Software

  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA


  For more information please contact:


  Robert J. Harrison

  Oak Ridge National Laboratory

  One Bethel Valley Road

  P.O. Box 2008, MS-6367


  email: harrisonrj@ornl.gov

  tel:   865-241-3937

  fax:   865-572-0680


  $Id$

*/

#ifndef MADNESS_TENSOR_ALIGNED_H__INCLUDED

#define MADNESS_TENSOR_ALIGNED_H__INCLUDED


/*!

  \file tensor/aligned.h

  \brief Provides routines for internal use optimized for aligned data


  This stuff used to be implemented in assembly but it is too much

  effort keeping that working especially for multiple compilers.

*/


#include <madness/madness_config.h>

#include <madness/tensor/tensor.h>

#include <madness/tensor/cblas.h>

#include <cstring>

#include <climits>


namespace madness {


    /// Sets the first \c n elements of \c a to zero.
    ///
    /// \tparam T Element type. The fallback loop assigns the literal 0 to each
    ///           element; the \c HAVE_MEMSET path clears the raw bytes instead.
    /// \param[in]  n Number of elements to clear. A count <= 0 is a no-op.
    /// \param[out] a Pointer to the first element to clear.
    template <typename T>
    static
    inline
    void aligned_zero(long n, T* a) {
        // Reject non-positive counts up front.  Without this, a negative n
        // becomes an enormous size_t in the memset path, and in the loop path
        // the remainder arithmetic yields 1-3 stray writes.
        if (n <= 0) return;
#ifdef HAVE_MEMSET
        // A hand coded SSE2 loop is faster only for data in the L1 cache
        std::memset(a, 0, static_cast<std::size_t>(n)*sizeof(T));
#else
        const long n4 = n & ~3L;   // largest multiple of 4 that is <= n
        const long rem = n - n4;   // 0..3 trailing elements
        // Main loop, manually unrolled four elements at a time.
        for (long i=0; i<n4; i+=4,a+=4) {
            a[0] = 0;
            a[1] = 0;
            a[2] = 0;
            a[3] = 0;
        }
        // Tail: whatever did not fit into a group of four.
        for (long i=0; i<rem; ++i) *a++ = 0;
#endif
    }


    /// Generic scaled accumulate: a[i] += s*b[i] for i in [0, n).
    ///
    /// \tparam T Element type of both arrays.
    /// \tparam Q Type of the scale factor.
    /// \param[in]     n Number of elements to process. A count <= 0 is a no-op.
    /// \param[in,out] a Destination array, updated in place.
    /// \param[in]     b Source array; declared non-aliasing with \c a via MADNESS_RESTRICT.
    /// \param[in]     s Scale factor applied to each element of \c b.
    template <typename T, typename Q>
    static
    inline
    void aligned_axpy(long n, T* MADNESS_RESTRICT a, const T* MADNESS_RESTRICT b, Q s) {
        // A negative n would otherwise produce a remainder of 1-3 and update
        // elements the caller never asked for.
        if (n <= 0) return;
        const long n4 = n & ~3L;   // largest multiple of 4 that is <= n
        const long rem = n - n4;   // 0..3 trailing elements
        // Main loop, manually unrolled four elements at a time.
        for (long i=0; i<n4; i+=4,a+=4,b+=4) {
            a[0] += s*b[0];
            a[1] += s*b[1];
            a[2] += s*b[2];
            a[3] += s*b[3];
        }
        // Tail: whatever did not fit into a group of four.
        for (long i=0; i<rem; ++i) *a++ += s * *b++;
    }


    /* Jeff: In the following three template specializations, the long element
     *       count is cast to the MADNESS integer type, which defaults to int64_t
     *       but can be int32_t, in which case the count is truncated for n>INT_MAX.
     *
     *       I am choosing to ignore this issue for now. I know all of the workarounds
     *       but it seems unlikely that they will be necessary because 2^31 is a big number. */

    /// Specialization of aligned_axpy for double arrays with a double scale:
    /// hands the whole operation to madness::cblas::axpy with unit strides.
    ///
    /// \param[in]     n Number of elements; narrowed to the BLAS \c integer type.
    /// \param[in,out] a Destination array (the BLAS "y" argument).
    /// \param[in]     b Source array (the BLAS "x" argument).
    /// \param[in]     s Scale factor.
    template <>
    //static
    inline
    void aligned_axpy(long n, double * MADNESS_RESTRICT a, const double * MADNESS_RESTRICT b, double s) {
        const integer count = static_cast<integer>(n);
        // The call is made with a non-const source pointer, so const is dropped here.
        double* const x = const_cast<double*>(b);
        madness::cblas::axpy(count, s, x, 1, a, 1);
    }


    /* Jeff: I have no idea if casting double_complex to complex_real8 is valid... */


    /// Specialization of aligned_axpy for double_complex arrays with a
    /// double_complex scale: hands the operation to madness::cblas::axpy with
    /// unit strides, viewing the data as complex_real8.
    ///
    /// \param[in]     n Number of elements; narrowed to the BLAS \c integer type.
    /// \param[in,out] a Destination array (the BLAS "y" argument).
    /// \param[in]     b Source array (the BLAS "x" argument).
    /// \param[in]     s Complex scale factor.
    template <>
    //static
    inline
    void aligned_axpy(long n, double_complex * MADNESS_RESTRICT a, const double_complex * MADNESS_RESTRICT b, double_complex s) {
        const integer count = static_cast<integer>(n);
        const complex_real8 scale = static_cast<complex_real8>(s);
        // The call is made with a non-const source pointer, so const is dropped here.
        complex_real8* const x = reinterpret_cast<complex_real8*>(const_cast<double_complex*>(b));
        complex_real8* const y = reinterpret_cast<complex_real8*>(a);
        madness::cblas::axpy(count, scale, x, 1, y, 1);
    }


    /// Specialization of aligned_axpy for double_complex arrays with a real
    /// (double) scale: the scale is promoted to a complex value with zero
    /// imaginary part and the operation is handed to madness::cblas::axpy with
    /// unit strides.
    ///
    /// \param[in]     n Number of elements; narrowed to the BLAS \c integer type.
    /// \param[in,out] a Destination array (the BLAS "y" argument).
    /// \param[in]     b Source array (the BLAS "x" argument).
    /// \param[in]     s Real scale factor.
    template <>
    //static
    inline
    void aligned_axpy(long n, double_complex * MADNESS_RESTRICT a, const double_complex * MADNESS_RESTRICT b, double s) {
        const integer count = static_cast<integer>(n);
        complex_real8 scale(s,0.0); // turn real into complex
        // The call is made with a non-const source pointer, so const is dropped here.
        complex_real8* const x = reinterpret_cast<complex_real8*>(const_cast<double_complex*>(b));
        complex_real8* const y = reinterpret_cast<complex_real8*>(a);
        madness::cblas::axpy(count, scale, x, 1, y, 1);
    }


    /// Element-wise in-place addition: a[i] += b[i] for i in [0, n).
    ///
    /// \tparam T Element type of the destination array.
    /// \tparam Q Element type of the source array.
    /// \param[in]     n Number of elements to process. A count <= 0 is a no-op.
    /// \param[in,out] a Destination array, updated in place.
    /// \param[in]     b Source array; declared non-aliasing with \c a via MADNESS_RESTRICT.
    template <typename T, typename Q>
    static
    inline
    void aligned_add(long n, T* MADNESS_RESTRICT a, const Q* MADNESS_RESTRICT b) {
        // A negative n would otherwise produce a remainder of 1-3 and update
        // elements the caller never asked for.
        if (n <= 0) return;
        const long n4 = n & ~3L;   // largest multiple of 4 that is <= n
        const long rem = n - n4;   // 0..3 trailing elements
        // Main loop, manually unrolled four elements at a time.
        for (long i=0; i<n4; i+=4,a+=4,b+=4) {
            a[0] += b[0];
            a[1] += b[1];
            a[2] += b[2];
            a[3] += b[3];
        }
        // Tail: whatever did not fit into a group of four.
        for (long i=0; i<rem; ++i) *a++ += *b++;
    }


    /// Element-wise in-place subtraction: a[i] -= b[i] for i in [0, n).
    ///
    /// \tparam T Element type of the destination array.
    /// \tparam Q Element type of the source array.
    /// \param[in]     n Number of elements to process. A count <= 0 is a no-op.
    /// \param[in,out] a Destination array, updated in place.
    /// \param[in]     b Source array; declared non-aliasing with \c a via MADNESS_RESTRICT.
    template <typename T, typename Q>
    static
    inline
    void aligned_sub(long n, T* MADNESS_RESTRICT a, const Q* MADNESS_RESTRICT b) {
        // A negative n would otherwise produce a remainder of 1-3 and update
        // elements the caller never asked for.
        if (n <= 0) return;
        const long n4 = n & ~3L;   // largest multiple of 4 that is <= n
        const long rem = n - n4;   // 0..3 trailing elements
        // Main loop, manually unrolled four elements at a time.
        for (long i=0; i<n4; i+=4,a+=4,b+=4) {
            a[0] -= b[0];
            a[1] -= b[1];
            a[2] -= b[2];
            a[3] -= b[3];
        }
        // Tail: whatever did not fit into a group of four.
        for (long i=0; i<rem; ++i) *a++ -= *b++;
    }


}


#endif // MADNESS_TENSOR_ALIGNED_H__INCLUDED

cblas.h
Define BLAS like functions.

double_complex
std::complex< double > double_complex
Definition cfft.h:14

integer
int integer
Definition crayio.c:25

complex_real8
std::complex< double > complex_real8
Fortran double complex.
Definition fortran_ctypes.h:83

T
auto T(World &world, response_space &f) -> response_space
Definition global_functions.cc:28

madness_config.h
Macros and tools pertaining to the configuration of MADNESS.

madness::cblas::axpy
void axpy(const integer n, const float alpha, float *x, const integer incx, float *y, const integer incy)
Scale and add a vector to another.
Definition cblas.h:687

madness
Namespace for all elements and tools of MADNESS.
Definition DFParameters.h:10

madness::aligned_add
void aligned_add(long n, double *MADNESS_RESTRICT a, const double *MADNESS_RESTRICT b)

madness::aligned_zero
static void aligned_zero(long n, T *a)
Definition aligned.h:55

madness::aligned_sub
void aligned_sub(long n, double *MADNESS_RESTRICT a, const double *MADNESS_RESTRICT b)

madness::aligned_axpy
static void aligned_axpy(long n, T *MADNESS_RESTRICT a, const T *MADNESS_RESTRICT b, Q s)
Definition aligned.h:75

b
static const double b
Definition nonlinschro.cc:119

a
static const double a
Definition nonlinschro.cc:118

Q
double Q(double a)
Definition relops.cc:20

tensor.h
Defines and implements most of Tensor.