1 #ifndef MADNESS_SYSTOLIC_H
2 #define MADNESS_SYSTOLIC_H
41 # include <tbb/parallel_for.h>
58 std::vector<int64_t>
map;
64 tbb::parallel_for(0, nthread, [=](
const int id) {
73 const int pairlo =
rank*
A.coltile()/2;
75 for (
int loop=0; loop<(neven-1); ++loop) {
78 tbb::parallel_for(0, nthread,
79 [
this,neven,pairlo,nthread,loop](
const int id) {
80 for (
int pair=
id; pair<
nlocal; pair+=nthread) {
82 int rp = neven/2-1-(pair+pairlo);
83 int iii = (rp+loop)%(neven-1);
84 int jjj = (2*neven-2-rp+loop)%(neven-1);
85 if (rp == 0) jjj = neven-1;
102 tbb::parallel_for(0, nthread, [=](
const int id) {
116 A.local_colrange(ilo, ihi);
120 int pairlo =
rank*
A.coltile()/2;
122 int threadid = env.
id();
125 for (
int loop=0; loop<(neven-1); ++loop) {
128 for (
int pair=env.
id(); pair<
nlocal; pair+=nthread) {
130 int rp = neven/2-1-(pair+pairlo);
131 int iii = (rp+loop)%(neven-1);
132 int jjj = (2*neven-2-rp+loop)%(neven-1);
133 if (rp == 0) jjj = neven-1;
144 if (threadid == 0)
cycle();
169 for (int64_t i=0; i<
nlocal; ++i) {
260 for (int64_t i=0; i<
nlocal-1; ++i) {
271 else if (
rank == 0) {
282 std::vector<T> buf(
rowdim);
289 std::vector<T> buf1(
rowdim);
290 std::vector<T> buf2(
rowdim);
297 std::memcpy(ilast, &buf2[0],
rowdim*
sizeof(
T));
298 std::memcpy(jfirst, &buf1[0],
rowdim*
sizeof(
T));
308 virtual void get_id(std::pair<void*,unsigned short>&
id)
const {
321 ,
nproc(
A.process_coldim()*
A.process_rowdim())
324 ,
nlocal((
A.local_coldim()+1)/2)
340 for (int64_t i=0; i<
nlocal; ++i) {
354 A.get_colrange(
p,
lo, hi);
355 int p_nlocal = (hi -
lo + 2)/2;
357 for (
int i=0; i<p_nlocal; ++i) {
360 map[ii+i+neven] =
lo+i+p_nlocal;
365 std::reverse(
map.begin(),
map.begin()+neven);
378 virtual void kernel(
int i,
int j,
T* rowi,
T* rowj) = 0;
403 #ifdef HAVE_INTEL_TBB
408 const int nthread = env.
nthread();
412 done = tbb::parallel_reduce(tbb::blocked_range<int>(0,nthread),
true,
413 [=] (
const tbb::blocked_range<int>& range,
bool init) ->
bool {
414 for(
int id = range.begin();
id < range.end(); ++
id)
419 [] (
const bool l,
const bool r) {
return l && r; });
456 return A.get_world();
This header should include pretty much everything needed for the parallel runtime.
Definition: test_ar.cc:118
Definition: safempi.h:289
const long * dims() const
Returns the array of tensor dimensions.
Definition: basetensor.h:153
long size() const
Returns the number of elements in the tensor.
Definition: basetensor.h:138
Manages data associated with a row/column/block distributed array.
Definition: distributed_matrix.h:388
static std::enable_if< detail::function_traits< fnT >::value||detail::memfunc_traits< fnT >::value >::type make_id(std::pair< void *, unsigned short > &id, fnT fn)
Definition: thread.h:916
void set_nthread(int nthread)
Call this to reset the number of threads before the task is submitted.
Definition: thread.h:1078
Base class for parallel algorithms that employ a systolic loop to generate all row pairs in parallel.
Definition: systolic.h:48
const int64_t nproc
No. of processes with rows of the matrix (not size of world)
Definition: systolic.h:51
const int64_t rowdim
A(coldim,rowdim)
Definition: systolic.h:53
const int tag
MPI tag to be used for messages.
Definition: systolic.h:56
const int64_t nlocal
No. of local pairs.
Definition: systolic.h:54
void solve_sequential()
Invoked by the user to run the algorithm with one thread mostly for debugging.
Definition: systolic.h:443
DistributedMatrix< T > & A
Definition: systolic.h:50
virtual void kernel(int i, int j, T *rowi, T *rowj)=0
Threadsafe routine to apply the operation to rows i and j of the matrix.
virtual void start_iteration_hook(const TaskThreadEnv &env)
Invoked by all threads at the start of each iteration.
Definition: systolic.h:392
void iteration(const int nthread)
Definition: systolic.h:61
int64_t get_rowdim() const
Returns length of row.
Definition: systolic.h:448
std::vector< T * > iptr
Definition: systolic.h:57
const int64_t coldim
A(coldim,rowdim)
Definition: systolic.h:52
virtual ~SystolicMatrixAlgorithm()
Definition: systolic.h:370
virtual bool converged(const TaskThreadEnv &env) const =0
Invoked simultaneously by all threads after each sweep to test for convergence.
virtual void get_id(std::pair< void *, unsigned short > &id) const
Get the task id.
Definition: systolic.h:308
const ProcessID rank
Rank of current process.
Definition: systolic.h:55
void run(World &world, const TaskThreadEnv &env)
Invoked by the task queue to run the algorithm with multiple threads.
Definition: systolic.h:407
ProcessID get_rank() const
Returns rank of this process in the world.
Definition: systolic.h:460
void cycle()
Cycles data around the loop ... only one thread should invoke this.
Definition: systolic.h:183
SystolicMatrixAlgorithm(DistributedMatrix< T > &A, int tag, int nthread=ThreadPool::size()+1)
A must be a column distributed matrix with an even column tile >= 2.
Definition: systolic.h:319
std::vector< T * > jptr
Indirection for implementing cyclic buffer !! SHOULD BE VOLATILE ?????
Definition: systolic.h:57
virtual void end_iteration_hook(const TaskThreadEnv &env)
Invoked by all threads at the end of each iteration before convergence test.
Definition: systolic.h:400
void unshuffle()
Call this after iterating to restore correct order of rows in original matrix.
Definition: systolic.h:164
std::vector< int64_t > map
Used to keep track of actual row indices.
Definition: systolic.h:58
int64_t get_coldim() const
Returns length of column.
Definition: systolic.h:452
World & get_world() const
Returns a reference to the world.
Definition: systolic.h:455
All world tasks must be derived from this public interface.
Definition: taskfn.h:69
volatile World * world
Definition: taskfn.h:72
Used to pass information about the thread environment to a user's task.
Definition: thread.h:466
int id() const
Get the ID of this thread.
Definition: thread.h:506
bool barrier() const
Definition: thread.h:514
int nthread() const
Get the number of threads collaborating on this task.
Definition: thread.h:499
A tensor is a multidimension array.
Definition: tensor.h:317
T * ptr()
Returns a pointer to the internal data.
Definition: tensor.h:1824
static std::size_t size()
Returns the number of threads in the pool.
Definition: thread.h:1413
SafeMPI::Request Irecv(T *buf, int count, int source, int tag=SafeMPI::DEFAULT_SEND_RECV_TAG) const
Async receive data of up to count elements from process source.
Definition: worldmpi.h:321
void Send(const T *buf, long lenbuf, int dest, int tag=SafeMPI::DEFAULT_SEND_RECV_TAG) const
Send array of lenbuf elements to process dest.
Definition: worldmpi.h:347
void Recv(T *buf, long lenbuf, int src, int tag) const
Receive data of up to lenbuf elements from process src.
Definition: worldmpi.h:374
A parallel world class.
Definition: world.h:132
static void await(SafeMPI::Request &request, bool dowork=true)
Wait for a MPI request to complete.
Definition: world.h:516
WorldMpiInterface & mpi
MPI interface.
Definition: world.h:202
char * p(char *buf, const char *name, int k, int initial_level, double thresh, int order)
Definition: derivatives.cc:72
static double lo
Definition: dirac-hatom.cc:23
auto T(World &world, response_space &f) -> response_space
Definition: global_functions.cc:34
#define MADNESS_ASSERT(condition)
Assert a condition that should be free of side-effects since in release builds this might be a no-op.
Definition: madness_exception.h:134
File holds all helper structures necessary for the CC_Operator and CC2 class.
Definition: DFParameters.h:10
Defines and implements most of Tensor.
int ProcessID
Used to clearly identify process number/rank.
Definition: worldtypes.h:43