MADNESS 0.10.1
worldgop.h
Go to the documentation of this file.
1/*
2 This file is part of MADNESS.
3
4 Copyright (C) 2007,2010 Oak Ridge National Laboratory
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
20 For more information please contact:
21
22 Robert J. Harrison
23 Oak Ridge National Laboratory
24 One Bethel Valley Road
25 P.O. Box 2008, MS-6367
26
27 email: harrisonrj@ornl.gov
28 tel: 865-241-3937
29 fax: 865-572-0680
30 */
31
32#ifndef MADNESS_WORLD_WORLDGOP_H__INCLUDED
33#define MADNESS_WORLD_WORLDGOP_H__INCLUDED
34
35/// \file worldgop.h
36/// \brief Implements global operations
37
38/// If you can recall the Intel hypercubes, their comm lib used GOP as
39/// the abbreviation.
40
41#include <functional>
42#include <type_traits>
45#include <madness/world/world.h>
48#include <madness/world/group.h>
50#include <madness/world/units.h>
51
52namespace madness {
53
54 // Forward declarations
55 class World;
56 class WorldAmInterface;
57 class WorldTaskQueue;
58 namespace detail {
59
60 class DeferredCleanup;
61
62 } // namespace detail
63
/// Binary functor returning the sum of its arguments.

/// \tparam T operand and result type; must support `operator+`
template <typename T>
struct WorldSumOp {
    /// \return `a + b`
    inline T operator()(const T& a, const T& b) const {
        return a + b;
    }
};
70
/// Binary functor returning the product of its arguments.

/// \tparam T operand and result type; must support `operator*`
template <typename T>
struct WorldMultOp {
    /// \return `a * b`
    inline T operator()(const T& a, const T& b) const {
        return a * b;
    }
};
77
/// Binary functor returning the larger of its arguments.

/// \tparam T operand and result type; must support `operator>`
template <typename T>
struct WorldMaxOp {
    /// \return `a` if `a > b`, otherwise `b`
    inline T operator()(const T& a, const T& b) const {
        return a > b ? a : b;
    }
};
84
/// Binary functor returning the larger of the absolute values of its arguments.

/// \tparam T operand and result type; `abs(T)` must be callable and its
/// result convertible to \c T
template <typename T>
struct WorldAbsMaxOp {
    /// \return `abs(a)` if `abs(a) > abs(b)`, otherwise `abs(b)`
    inline T operator()(const T& a, const T& b) const {
        using std::abs;  // std::abs as fallback; user overloads still found via ADL
        return abs(a) > abs(b) ? abs(a) : abs(b);
    }
};
92
/// Binary functor returning the smaller of its arguments.

/// \tparam T operand and result type; must support `operator<`
template <typename T>
struct WorldMinOp {
    /// \return `a` if `a < b`, otherwise `b`
    inline T operator()(const T& a, const T& b) const {
        return a < b ? a : b;
    }
};
99
/// Binary functor returning the smaller of the absolute values of its arguments.

/// \tparam T operand and result type; `abs(T)` must be callable and its
/// result convertible to \c T
template <typename T>
struct WorldAbsMinOp {
    /// \return `abs(a)` if `abs(a) < abs(b)`, otherwise `abs(b)`
    inline T operator()(const T& a, const T& b) const {
        using std::abs;  // std::abs as fallback; user overloads still found via ADL
        return abs(a) < abs(b) ? abs(a) : abs(b);
    }
};
107
/// Binary functor returning the bitwise AND of its arguments.

/// \tparam T operand and result type; must support `operator&`
template <typename T>
struct WorldBitAndOp {
    /// \return `a & b`
    inline T operator()(const T& a, const T& b) const {
        return a & b;
    }
};
114
/// Binary functor returning the bitwise OR of its arguments.

/// \tparam T operand and result type; must support `operator|`
template <typename T>
struct WorldBitOrOp {
    /// \return `a | b`
    inline T operator()(const T& a, const T& b) const {
        return a | b;
    }
};
121
/// Binary functor returning the bitwise XOR of its arguments.

/// \tparam T operand and result type; must support `operator^`
template <typename T>
struct WorldBitXorOp {
    /// \return `a ^ b`
    inline T operator()(const T& a, const T& b) const {
        return a ^ b;
    }
};
128
/// Binary functor returning the logical AND of its arguments.

/// \tparam T operand and result type; must be usable with `&&` and
/// constructible from the resulting \c bool
template <typename T>
struct WorldLogicAndOp {
    /// \return `a && b`, converted to \c T
    inline T operator()(const T& a, const T& b) const {
        return a && b;
    }
};
135
/// Binary functor returning the logical OR of its arguments.

/// \tparam T operand and result type; must be usable with `||` and
/// constructible from the resulting \c bool
template <typename T>
struct WorldLogicOrOp {
    /// \return `a || b`, converted to \c T
    inline T operator()(const T& a, const T& b) const {
        return a || b;
    }
};
142
143
144 /// Provides collectives that interoperate with the AM and task interfaces
145
146 /// If native AM interoperates with MPI we probably should map these to MPI.
148 private:
149 World& world_; ///< World object that this is a part of
150 std::shared_ptr<detail::DeferredCleanup> deferred_; ///< Deferred cleanup object.
151 bool debug_; ///< Debug mode
152 bool forbid_fence_=false; ///< forbid calling fence() in case of several active worlds
153 int max_reducebcast_msg_size_ = std::numeric_limits<int>::max(); ///< maximum size of messages (in bytes) sent by reduce and broadcast
154 bool in_do_cleanup_ = false; ///< set while this gop's deferred_->do_cleanup() is running inside fence_impl, so that destructors invoked during cleanup can skip cross-rank handshakes safely (see is_in_do_cleanup)
155
157
158 // Message tags
159 struct PointToPointTag { };
160 struct LazySyncTag { };
162 struct BcastTag { };
163 struct GroupBcastTag { };
164 struct ReduceTag { };
165 struct GroupReduceTag { };
166 struct AllReduceTag { };
168
169
170 /// Delayed send callback object
171
172 /// This callback object is used to send local data to a remote process
173 /// once it has been set.
174 /// \tparam keyT The data key
175 /// \tparam valueT The type of data to be sent
176 template <typename keyT, typename valueT>
// NOTE(review): listing line 177 (the class-head of DelayedSend, including
// any base class that declares the virtual notify()) is missing from this
// extract -- restore it from the upstream header.
178 private:
179 World& world_; ///< The communication world
180 const ProcessID dest_; ///< The destination process id
181 const keyT key_; ///< The distributed id associated with \c value_
182 Future<valueT> value_; ///< The data to be sent
183
184 // Not allowed
// NOTE(review): listing lines 185-186 are missing here; presumably the
// disabled copy operations that "Not allowed" refers to -- confirm.
187
188 public:
189
190 /// Constructor
/// \param world The communication world
/// \param dest The destination process id
/// \param key The distributed id associated with \c value
/// \param value The future whose value is to be sent
191 DelayedSend(World& world, const ProcessID dest,
192 const keyT& key, const Future<valueT>& value) :
193 world_(world), dest_(dest), key_(key), value_(value)
194 { }
195
196 virtual ~DelayedSend() { }
197
198 /// Notify this object that the future has been set.
199
200 /// This will set the value of the future on the remote node and delete
201 /// this callback object.
/// \warning The object destroys itself with `delete this`, so it must have
/// been allocated with \c new and must not be used after notify() returns.
202 virtual void notify() {
// NOTE(review): listing lines 203-204 (the statements that actually send
// the value) are missing from this extract.
205 delete this;
206 }
207 }; // class DelayedSend
208
209 /// Receive data from remote node
210
211 /// \tparam valueT The data type stored in cache
212 /// \param key The distributed ID
213 /// \return A future to the data
214 template <typename valueT, typename keyT>
215 static Future<valueT> recv_internal(const keyT& key) {
216 return detail::DistCache<keyT>::template get_cache_value<valueT>(key);
217 }
218
219 /// Send \c value to \c dest
220
221 /// Send non-future data to \c dest.
222 /// \tparam keyT The key type
223 /// \tparam valueT The value type
224 /// \param dest The node where the data will be sent
225 /// \param key The key that is associated with the data
226 /// \param value The data to be sent to \c dest
227 template <typename keyT, typename valueT>
228 typename std::enable_if<!is_future<valueT>::value >::type
229 send_internal(const ProcessID dest, const keyT& key, const valueT& value) const {
230 typedef detail::DistCache<keyT> dist_cache;
231
232 if(world_.rank() == dest) {
233 // When dest is this process, skip the task and set the future immediately.
234 dist_cache::set_cache_value(key, value);
235 } else {
236 // Spawn a remote task to set the value
237 world_.taskq.add(dest, dist_cache::template set_cache_value<valueT>, key,
238 value, TaskAttributes::hipri());
239 }
240 }
241
242 /// Send \c value to \c dest
243
244 /// Send data that is stored in a future to \c dest. The data in
245 /// \c value is only sent to the remote process once it has been set.
246 /// \tparam keyT The key type
247 /// \tparam valueT The value type
248 /// \param dest The node where the data will be sent
249 /// \param key The key that is associated with the data
250 /// \param value The data to be sent to \c dest
251 template <typename keyT, typename valueT>
252 void send_internal(ProcessID dest, const keyT& key, const Future<valueT>& value) const {
253 typedef detail::DistCache<keyT> dist_cache;
254
255 if(world_.rank() == dest) {
256 dist_cache::set_cache_value(key, value);
257 } else {
258 // The destination is not this node, so send it to the destination.
259 if(value.probe()) {
260 // Spawn a remote task to set the value
261 world_.taskq.add(dest, dist_cache::template set_cache_value<valueT>, key,
262 value.get(), TaskAttributes::hipri());
263 } else {
264 // The future is not ready, so create a callback object that will
265 // send value to the destination node when it is ready.
266 DelayedSend<keyT, valueT>* delayed_send_callback =
267 new DelayedSend<keyT, valueT>(world_, dest, key, value);
268 const_cast<Future<valueT>&>(value).register_callback(delayed_send_callback);
269
270 }
271 }
272 }
273
274 /// Lazy sync parent task
275
276 /// Send signal to the parent process in the binary tree for a lazy sync
277 /// operation.
278 /// \tparam keyT The key type
279 /// \param parent The parent process of this process in the binary tree
280 /// \param key The lazy sync key
281 template <typename keyT>
282 void lazy_sync_parent(const ProcessID parent, const keyT& key,
283 const ProcessID, const ProcessID) const
284 {
285 send_internal(parent, key, key.proc());
286 }
287
288 /// Lazy sync parent task
289
290 /// Send signal to the child processes in the binary tree for a lazy
291 /// sync operation. After the signal has been sent to the children, the
292 /// sync operation, \c op, will be run.
293 /// \tparam keyT The key type
294 /// \tparam opT The sync operation type
295 /// \param child0 The first child process of this process in the binary tree
296 /// \param child1 The second child process of this process in the binary tree
297 /// \param key The key associated with the sync operation
298 /// \param op The sync operation that will be run
299 template <typename keyT, typename opT>
300 void lazy_sync_children(const ProcessID child0, const ProcessID child1,
301 const keyT& key, opT& op, const ProcessID) const
302 {
303 // Signal children to execute the operation.
304 if(child0 != -1)
305 send_internal(child0, key, 1);
306 if(child1 != -1)
307 send_internal(child1, key, 1);
308
309 // Execute the operation on this process.
310 op();
311 }
312
313 /// Start a distributed lazy sync operation
314
/// Collects a signal future for the parent and for each child, schedules
/// lazy_sync_children() on those signals, and signals the parent (directly
/// if both child signals are already set, otherwise via a
/// lazy_sync_parent() task).
/// \tparam tagT The tag type combined with \c keyT into the internal key
/// \tparam keyT The key type
/// \tparam opT The sync operation type
/// \param parent The parent process in the binary tree, or -1 if none
/// \param child0 The first child process in the binary tree, or -1 if none
/// \param child1 The second child process in the binary tree, or -1 if none
315 /// \param key The sync key
316 /// \param op The sync operation to be executed on this process
317 template <typename tagT, typename keyT, typename opT>
318 void lazy_sync_internal(const ProcessID parent, const ProcessID child0,
319 const ProcessID child1, const keyT& key, const opT& op) const {
320 typedef ProcessKey<keyT, tagT> key_type;
321
322 // Get signals from parent and children.
323 madness::Future<ProcessID> child0_signal = (child0 != -1 ?
324 recv_internal<ProcessID>(key_type(key, child0)) :
// NOTE(review): listing line 325 (the alternative used when child0 == -1)
// is missing from this extract.
326 madness::Future<ProcessID> child1_signal = (child1 != -1 ?
327 recv_internal<ProcessID>(key_type(key, child1)) :
// NOTE(review): listing line 328 (the alternative used when child1 == -1)
// is missing from this extract.
329 madness::Future<ProcessID> parent_signal = (parent != -1 ?
330 recv_internal<ProcessID>(key_type(key, parent)) :
// NOTE(review): listing line 331 (the alternative used when parent == -1)
// is missing from this extract.
332
333 // Construct the task that notifies children to run the operation
334 key_type my_key(key, world_.rank());
335 auto lazy_sync_children_fn = & WorldGopInterface::template lazy_sync_children<key_type, opT>;
336 world_.taskq.add(*this, lazy_sync_children_fn,
337 child0_signal, child1_signal, my_key, op, parent_signal,
// NOTE(review): listing line 338 (the final argument and the end of this
// call) is missing from this extract.
339
340 // Send signal to parent
341 if(parent != -1) {
342 if(child0_signal.probe() && child1_signal.probe())
343 send_internal(parent, my_key, world_.rank());
344 else {
345 auto lazy_sync_parent_fn = & WorldGopInterface::template lazy_sync_parent<key_type>;
346 world_.taskq.add(*this, lazy_sync_parent_fn,
347 parent, my_key, child0_signal, child1_signal,
// NOTE(review): listing line 348 (the final argument and the end of this
// call) is missing from this extract.
349 }
350 }
351 }
352
353
354 template <typename keyT, typename valueT, typename taskfnT>
355 static void bcast_handler(const AmArg& arg) {
356 // Deserialize message arguments
357 taskfnT taskfn;
358 keyT key;
359 valueT value;
360 ProcessID root;
361
362 arg & taskfn & key & value & root;
363
364 // Add task to queue
365 arg.get_world()->taskq.add(arg.get_world()->gop, taskfn, key,
366 value, root, TaskAttributes::hipri());
367 }
368
369 template <typename keyT, typename valueT, typename taskfnT>
370 static void group_bcast_handler(const AmArg& arg) {
371 // Deserialize message arguments
372 taskfnT taskfn;
373 keyT key;
374 valueT value;
375 ProcessID group_root;
376 DistributedID group_key;
377
378 arg & taskfn & key & value & group_root & group_key;
379
380 // Get the local group
381 const Future<Group> group = Group::get_group(group_key);
382
383 // Add task to queue
384 arg.get_world()->taskq.add(arg.get_world()->gop, taskfn, key, value,
385 group_root, group, TaskAttributes::hipri());
386 }
387
388
389 /// Broadcast task
390
391 /// This task will set the local cache with the broadcast data and send
392 /// it to child processes in the binary tree.
/// \tparam keyT The key type
/// \tparam valueT The value type being broadcast
/// \param key The key associated with this broadcast
/// \param value The data being broadcast
/// \param root The process that owns the data to be broadcast
393 template <typename keyT, typename valueT>
394 void bcast_task(const keyT& key, const valueT& value, const ProcessID root) const {
395 typedef void (WorldGopInterface::*taskfnT)(const keyT&, const valueT&,
396 const ProcessID) const;
397
398 // Compute binary tree data
399 ProcessID parent = -1, child0 = -1, child1 = -1;
400 world_.mpi.binary_tree_info(root, parent, child0, child1);
401
402 // Set the local data, except on the root process
403 if(parent != -1)
// NOTE(review): listing line 404 (the statement controlled by the `if`
// above) is missing from this extract.
405
406 if(child0 != -1) { // Check that this process has children in the binary tree
407
408 // Get handler function and arguments
409 void (*handler)(const AmArg&) =
410 & WorldGopInterface::template bcast_handler<keyT, valueT, taskfnT>;
411 AmArg* const args0 = new_am_arg(
412 & WorldGopInterface::template bcast_task<keyT, valueT>,
413 key, value, root);
414
415 // Send active message to children
// args0 is copied for child1 before it is handed to send() for child0.
416 if(child1 != -1) {
417 AmArg* const args1 = copy_am_arg(*args0);
418 world_.am.send(child1, handler, args1, RMI::ATTR_UNORDERED);
419 }
420 world_.am.send(child0, handler, args0, RMI::ATTR_UNORDERED);
421 }
422 }
423
/// Group broadcast task

/// Forwards the broadcast data to this process's children in the binary
/// tree built by \c group for \c group_root.
/// \tparam keyT The key type
/// \tparam valueT The value type being broadcast
/// \param key The key associated with this broadcast
/// \param value The data being broadcast
/// \param group_root The process in \c group that owns the data
/// \param group The process group in which the data is broadcast
424 template <typename keyT, typename valueT>
425 void group_bcast_task(const keyT& key, const valueT& value,
426 const ProcessID group_root, const Group& group) const
427 {
428 typedef void (WorldGopInterface::*taskfnT)(const keyT&, const valueT&,
429 const ProcessID, const Group&) const;
430
431 // Compute the binary tree data for this group
432 ProcessID parent = -1, child0 = -1, child1 = -1;
433 group.make_tree(group_root, parent, child0, child1);
434
435 // Set the local data
436 if(parent != -1) {
// NOTE(review): listing line 437 (presumably the statement that stores the
// value locally) is missing from this extract.
438 group.remote_update();
439 }
440
441 if(child0 != -1) { // Check that this process has children in the binary tree
442
443 // Get handler function and arguments
444 void (*handler)(const AmArg&) =
445 & WorldGopInterface::template group_bcast_handler<keyT, valueT, taskfnT>;
446 AmArg* const args0 = new_am_arg(
447 & WorldGopInterface::template group_bcast_task<keyT, valueT>,
448 key, value, group_root, group.id());
449
450 // Send active message to children
// args0 is copied for child1 before it is handed to send() for child0.
451 if(child1 != -1) {
452 AmArg* const args1 = copy_am_arg(*args0);
453 world_.am.send(child1, handler, args1, RMI::ATTR_UNORDERED);
454 }
455 world_.am.send(child0, handler, args0, RMI::ATTR_UNORDERED);
456 }
457 }
458
459 /// Broadcast
460
461 /// Broadcast data from the \c root process to all processes in \c world.
462 /// The input/output data is held by \c value.
463 /// \tparam tagT The tag type that is attached to \c keyT
464 /// \tparam keyT The base key type
465 /// \tparam valueT The value type that will be broadcast
466 /// \param[in] key The key associated with this broadcast
467 /// \param[in,out] value On the \c root process, this is used as the input
468 /// data that will be broadcast to all other processes in the group.
469 /// On other processes it is used as the output to the broadcast
470 /// \param root The process that owns the data to be broadcast
471 /// \throw madness::Exception When \c value has been set, except on the
472 /// \c root process.
/// \note Nothing is sent or received when the world has a single process.
473 template <typename tagT, typename keyT, typename valueT>
474 void bcast_internal(const keyT& key, Future<valueT>& value, const ProcessID root) const {
475 MADNESS_ASSERT((root >= 0) && (root < world_.size()));
476 MADNESS_ASSERT((world_.rank() == root) || (! value.probe()));
477
478 // Add operation tag to key
479 typedef TaggedKey<keyT, tagT> key_type;
480 const key_type tagged_key(key);
481
482 if(world_.size() > 1) { // Do nothing for the trivial case
483 if(world_.rank() == root) {
484 // This process owns the data to be broadcast.
485
486 // Spawn remote tasks that will set the local cache for this
487 // broadcast on other nodes.
488 if(value.probe())
489 // The value is ready so send it now
490 bcast_task(tagged_key, value.get(), root);
491 else {
492 // The value is not ready so spawn a task to send the
493 // data when it is ready.
494 auto bcast_task_fn = & WorldGopInterface::template bcast_task<key_type, valueT>;
495 world_.taskq.add(*this, bcast_task_fn,
496 tagged_key, value, root, TaskAttributes::hipri());
497 }
498 } else {
499 MADNESS_ASSERT(! value.probe());
500
501 // Get the broadcast value from local cache
// NOTE(review): listing line 502 (the statement that retrieves the value)
// is missing from this extract.
503 }
504 }
505 }
506
507 /// Group broadcast
508
509 /// Broadcast data from the \c group_root process to all processes in
510 /// \c group. The input/output data is held by \c value.
511 /// \tparam tagT The tag type that is attached to \c keyT
512 /// \tparam keyT The base key type
513 /// \tparam valueT The value type that will be broadcast
514 /// \param[in] key The key associated with this broadcast
515 /// \param[in,out] value On the \c group_root process, this is used as the
516 /// input data that will be broadcast to all other processes in the group.
517 /// On other processes it is used as the output to the broadcast
518 /// \param group_root The process in \c group that owns the data to be
519 /// broadcast
520 /// \param group The process group where value will be broadcast
521 /// \throw madness::Exception When \c value has been set, except on the
522 /// \c group_root process.
523 template <typename tagT, typename keyT, typename valueT>
524 void bcast_internal(const keyT& key, Future<valueT>& value,
525 const ProcessID group_root, const Group& group) const
526 {
527 // Construct the internal broadcast key
528 typedef TaggedKey<keyT, tagT> key_type;
529 const key_type tagged_key(key);
530
531 if(group.rank() == group_root) {
532 // This process owns the data to be broadcast.
533 if(value.probe())
// The value is ready, so forward it immediately.
534 group_bcast_task(tagged_key, value.get(), group_root, group);
535 else {
// The value is not ready, so schedule a task that forwards it later.
// NOTE(review): this call passes `this`, whereas the world overload of
// bcast_internal passes `*this` to taskq.add -- confirm this is intended.
536 auto group_bcast_task_fn = & WorldGopInterface::template group_bcast_task<key_type, valueT>;
537 world_.taskq.add(this, group_bcast_task_fn,
538 tagged_key, value, group_root, group,
// NOTE(review): listing line 539 (the final argument and the end of this
// call) is missing from this extract.
540 }
541 } else {
542 MADNESS_ASSERT(! value.probe());
543
544 // This is not the root process, so retrieve the broadcast data
// NOTE(review): listing line 545 (the statement that retrieves the value)
// is missing from this extract.
546
547 // Increment local use counter for group
548 group.local_update();
549 }
550 }
551
552 template <typename valueT, typename opT>
553 static typename detail::result_of<opT>::type
554 reduce_task(const valueT& value, const opT& op) {
555 typename detail::result_of<opT>::type result = op();
556 op(result, value);
557 return result;
558 }
559
560 template <typename opT>
561 static typename detail::result_of<opT>::type
562 reduce_result_task(const std::vector<Future<typename detail::result_of<opT>::type> >& results,
563 const opT& op)
564 {
565 MADNESS_ASSERT(results.size() != 0ul);
566 Future<typename detail::result_of<opT>::type> result = results.front();
567 for(std::size_t i = 1ul; i < results.size(); ++i)
568 op(result.get(), results[i].get());
569 return result.get();
570 }
571
572 /// Distributed reduce
573
574 /// \tparam tagT The tag type to be added to the key type
575 /// \tparam keyT The key type
576 /// \tparam valueT The data type to be reduced
577 /// \tparam opT The reduction operation type
/// \param parent The parent process in the binary tree, or -1 if none
/// \param child0 The first child process in the binary tree, or -1 if none
/// \param child1 The second child process in the binary tree, or -1 if none
578 /// \param key The key associated with this reduction
579 /// \param value The local value to be reduced
580 /// \param op The reduction operation to be applied to local and remote data
581 /// \param root The process that will receive the result of the reduction
582 /// \return A future to the reduce value on the root process, otherwise an
583 /// uninitialized future that may be ignored.
584 template <typename tagT, typename keyT, typename valueT, typename opT>
// NOTE(review): listing line 585 (the return type of this function) is
// missing from this extract.
586 reduce_internal(const ProcessID parent, const ProcessID child0,
587 const ProcessID child1, const ProcessID root, const keyT& key,
588 const valueT& value, const opT& op)
589 {
590 // Create tagged key
591 typedef ProcessKey<keyT, tagT> key_type;
592 typedef typename detail::result_of<opT>::type result_type;
593 typedef typename remove_future<valueT>::type value_type;
594 std::vector<Future<result_type> > results;
// At most three partial results: the local one plus one per child.
595 results.reserve(3);
596
597 // Add local data to vector of values to reduce
598 results.push_back(world_.taskq.add(WorldGopInterface::template reduce_task<value_type, opT>,
599 value, op, TaskAttributes::hipri()));
600
601 // Reduce child data
602 if(child0 != -1)
603 results.push_back(recv_internal<result_type>(key_type(key, child0)));
604 if(child1 != -1)
605 results.push_back(recv_internal<result_type>(key_type(key, child1)));
606
607 // Submit the local reduction task
608 Future<result_type> local_result =
609 world_.taskq.add(WorldGopInterface::template reduce_result_task<opT>,
610 results, op, TaskAttributes::hipri());
611
612 // Send reduced value to parent or, if this is the root process, set the
613 // result future.
614 if(parent == -1)
615 return local_result;
616 else
617 send_internal(parent, key_type(key, world_.rank()), local_result);
618
// NOTE(review): listing line 619 (the return statement for the non-root
// path) is missing from this extract.
620 }
621
622 /// Implementation of fence
623
624 /// \param[in] epilogue the action to execute (by the calling thread) immediately after the fence
625 /// \param[in] pause_during_epilogue whether to suspend work while executing epilogue
626 /// \param[in] debug set to true to print progress statistics using madness::print(); the default is false.
627 /// \warning currently only \c pause_during_epilogue=false is supported
628 void fence_impl(std::function<void()> epilogue = []{},
629 bool pause_during_epilogue = false,
630 bool debug = false);
631
633 int result = std::numeric_limits<int>::max();
634 const auto* initial_max_reducebcast_msg_size_cstr = std::getenv("MAD_MAX_REDUCEBCAST_MSG_SIZE");
635 if (initial_max_reducebcast_msg_size_cstr) {
636 auto result_u64 = cstr_to_memory_size(initial_max_reducebcast_msg_size_cstr);
637 const auto do_print = SafeMPI::COMM_WORLD.Get_rank() == 0 && !madness::quiet();
638 if (result_u64>std::numeric_limits<int>::max()) {
639 if (do_print)
640 std::cout
641 << "!!MADNESS WARNING: Invalid value for environment variable MAD_MAX_REDUCEBCAST_MSG_SIZE.\n"
642 << "!!MADNESS WARNING: MAD_MAX_REDUCEBCAST_MSG_SIZE = "
643 << result_u64 << "\n";
644 result = std::numeric_limits<int>::max();
645 }
646 result = static_cast<int>(result_u64);
647 if(do_print) {
648 std::cout
649 << "MADNESS max msg size for GOP reduce/broadcast set to "
650 << result << " bytes.\n";
651 }
652 }
653 return result;
654 }
655
656 public:
657
658 // In the World constructor can ONLY rely on MPI and MPI being initialized
662
// NOTE(review): the line that opens this body (listing line 663, presumably
// the destructor head) is missing from this extract.
// Calls destroy(true) on the deferred-cleanup object and then runs its
// cleanup.
664 deferred_->destroy(true);
665 deferred_->do_cleanup();
666 }
667
668
669 /// Set debug flag to new value and return old value
670 bool set_debug(bool value) {
671 bool status = debug_;
672 debug_ = value;
673 return status;
674 }
675
676 /// Set forbid_fence flag to new value and return old value
677 bool set_forbid_fence(bool value) {
678 bool status = forbid_fence_;
679 forbid_fence_ = value;
680 return status;
681 }
682
683 /// Set the maximum size of messages (in bytes) sent by reduce and broadcast
684
685 /// \param sz the maximum size of messages (in bytes) sent by reduce and broadcast
686 /// \return the previous maximum size of messages (in bytes) sent by reduce and broadcast
687 /// \pre `sz>0`
// NOTE(review): listing line 688 (the function signature) is missing from
// this extract.
689 MADNESS_ASSERT(sz>0);
// After the swap, sz holds the previous maximum.
690 std::swap(max_reducebcast_msg_size_,sz);
// NOTE(review): listing line 691 (presumably the return of the previous
// value, now held in sz) is missing from this extract.
692 }
693
694
695 /// Returns the maximum size of messages (in bytes) sent by reduce and broadcast
696
697 /// \return the maximum size of messages (in bytes) sent by reduce and broadcast
701
702 /// Reports whether this gop is currently inside its deferred-cleanup
703 /// phase (invoked from fence_impl after the global-termination loop).
704
705 /// \return true iff `deferred_->do_cleanup()` is being run from within
706 /// this gop's `fence_impl`.
707 /// \note Intended for destructors of objects stored in the deferred
708 /// list to specialize their teardown when they can rely on the fence
709 /// having already established global quiescence -- in particular, to
710 /// skip cross-rank handshakes (e.g. \c lazy_sync) that would otherwise
711 /// schedule \c lazy_sync_children tasks the fence cannot drain.
712 /// Symmetric, collective use of the deferred list is assumed: every
713 /// rank must add the same set of objects to its deferred list, so
714 /// every rank performs the matching cleanup in lockstep.
715 bool is_in_do_cleanup() const {
716 return in_do_cleanup_;
717 }
718
719 /// Synchronizes all processes in communicator ... does NOT fence pending AM or tasks
720 void barrier() {
721 long i = world_.rank();
722 sum(i);
723 if (i != world_.size()*(world_.size()-1)/2) error("bad value after sum in barrier");
724 }
725
726
727 /// Synchronizes all processes in communicator AND globally ensures no pending AM or tasks
728
729 /// \internal Runs Dijkstra-like termination algorithm on binary tree by
730 /// locally ensuring ntask=0 and all am sent and processed,
731 /// and then participating in a global sum of nsent and nrecv.
732 /// Then globally checks that nsent=nrecv and that both are
733 /// constant over two traversals. We are then sure
734 /// that all tasks and AM are processed and there are no AM in
735 /// flight.
736 /// \param[in] debug set to true to print progress statistics using madness::print(); the default is false.
737 void fence(bool debug = false);
738
739 /// Executes an action on single (this) thread after ensuring all other work is done
740
741 /// \param[in] action the action to execute (by the calling thread)
742 void serial_invoke(std::function<void()> action);
743
744 /// Broadcasts bytes from process root while still processing AM & tasks
745
746 /// Optimizations can be added for long messages
747 void broadcast(void* buf, size_t nbyte, ProcessID root, bool dowork = true, Tag bcast_tag = -1);
748
749
750 /// Broadcasts typed contiguous data from process root while still processing AM & tasks
751
752 /// Optimizations can be added for long messages
753 template <typename T, typename = std::enable_if_t<madness::is_trivially_copyable_v<T>>>
754 inline void broadcast(T* buf, size_t nelem, ProcessID root) {
755 broadcast((void *) buf, nelem*sizeof(T), root);
756 }
757
758 /// Broadcast of a scalar from node 0 to all other nodes
759 template <typename T, typename = std::enable_if_t<madness::is_trivially_copyable_v<T>>>
760 void broadcast(T& t) {
761 broadcast(&t, 1, 0);
762 }
763
764 /// Broadcast of a scalar from node root to all other nodes
765 template <typename T, typename = std::enable_if_t<madness::is_trivially_copyable_v<T>>>
766 void broadcast(T& t, ProcessID root) {
767 broadcast(&t, 1, root);
768 }
769
770 /// Broadcast a serializable object
771 template <typename objT,
772 typename = std::void_t<decltype(std::declval<archive::BufferInputArchive&>()&std::declval<objT&>())>,
773 typename = std::void_t<decltype(std::declval<archive::BufferOutputArchive&>()&std::declval<const objT&>())>>
774 void broadcast_serializable(objT& obj, ProcessID root) {
775 MADNESS_ASSERT(root < world_.size());
776 if (world_.size() == 1) return;
777
778 size_t BUFLEN;
779 if (world_.rank() == root) {
781 count & obj;
782 BUFLEN = count.size();
783 }
784 broadcast(BUFLEN, root);
785
786 unsigned char* buf = new unsigned char[BUFLEN];
787 if (world_.rank() == root) {
788 archive::BufferOutputArchive ar(buf,BUFLEN);
789 ar & obj;
790 }
791 broadcast(buf, BUFLEN, root);
792 if (world_.rank() != root) {
793 archive::BufferInputArchive ar(buf,BUFLEN);
794 ar & obj;
795 }
796 delete [] buf;
797 }
798
799 /// Inplace global reduction (like MPI all_reduce) while still processing AM & tasks
800
801 /// Optimizations can be added for long messages and to reduce the memory footprint
802 template <typename T, class opT>
803 void reduce(T* buf, std::size_t nelem, opT op) {
804 static_assert(madness::is_trivially_copyable_v<T>, "T must be trivially copyable");
805
806 ProcessID parent, child0, child1;
807 world_.mpi.binary_tree_info(0, parent, child0, child1);
808 const std::size_t nelem_per_maxmsg =
809 max_reducebcast_msg_size() / sizeof(T);
810
811 const auto buf_size = ((sizeof(T) * std::min(nelem_per_maxmsg, nelem) +
812 std::alignment_of_v<T> - 1) /
813 std::alignment_of_v<T>) * std::alignment_of_v<T>;
814 struct free_dtor {
815 void operator()(T *ptr) {
816 if (ptr != nullptr)
817 std::free(ptr);
818 };
819 };
820 using sptr_t = std::unique_ptr<T[], free_dtor>;
821
822 auto aligned_buf_alloc = [&]() -> T* {
823 // posix_memalign requires alignment to be an integer multiple of sizeof(void*)!! so ensure that
824 const std::size_t alignment =
825 ((std::alignment_of_v<T> + sizeof(void *) - 1) /
826 sizeof(void *)) *
827 sizeof(void *);
828#ifdef HAVE_POSIX_MEMALIGN
829 void *ptr;
830 if (posix_memalign(&ptr, alignment, buf_size) != 0) {
831 throw std::bad_alloc();
832 }
833 return static_cast<T *>(ptr);
834#else
835 return static_cast<T *>(std::aligned_alloc(alignment, buf_size));
836#endif
837 };
838
839 sptr_t buf0;
840 if (child0 != -1)
841 buf0 = sptr_t(aligned_buf_alloc(),
842 free_dtor{});
843 sptr_t buf1(nullptr);
844 if (child1 != -1)
845 buf1 = sptr_t(aligned_buf_alloc(),
846 free_dtor{});
847
848 auto reduce_impl = [&,this](T* buf, size_t nelem) {
849 MADNESS_ASSERT(nelem <= nelem_per_maxmsg);
850 SafeMPI::Request req0, req1;
851 Tag gsum_tag = world_.mpi.unique_tag();
852
853 if (child0 != -1)
854 req0 = world_.mpi.Irecv(buf0.get(), nelem * sizeof(T), MPI_BYTE,
855 child0, gsum_tag);
856 if (child1 != -1)
857 req1 = world_.mpi.Irecv(buf1.get(), nelem * sizeof(T), MPI_BYTE,
858 child1, gsum_tag);
859
860 if (child0 != -1) {
861 World::await(req0);
862 for (long i = 0; i < (long)nelem; ++i)
863 buf[i] = op(buf[i], buf0[i]);
864 }
865 if (child1 != -1) {
866 World::await(req1);
867 for (long i = 0; i < (long)nelem; ++i)
868 buf[i] = op(buf[i], buf1[i]);
869 }
870
871 if (parent != -1) {
872 req0 = world_.mpi.Isend(buf, nelem * sizeof(T), MPI_BYTE, parent,
873 gsum_tag);
874 World::await(req0);
875 }
876
877 broadcast(buf, nelem, 0);
878 };
879
880 while (nelem) {
881 const int n = std::min(nelem_per_maxmsg, nelem);
882 reduce_impl(buf, n);
883 nelem -= n;
884 buf += n;
885 }
886 }
887
888 /// Inplace global sum while still processing AM & tasks
889 template <typename T>
890 inline void sum(T* buf, size_t nelem) {
891 reduce< T, WorldSumOp<T> >(buf, nelem, WorldSumOp<T>());
892 }
893
894 /// Inplace global min while still processing AM & tasks
895 template <typename T>
896 inline void min(T* buf, size_t nelem) {
897 reduce< T, WorldMinOp<T> >(buf, nelem, WorldMinOp<T>());
898 }
899
900 /// Inplace global max while still processing AM & tasks
901 template <typename T>
902 inline void max(T* buf, size_t nelem) {
903 reduce< T, WorldMaxOp<T> >(buf, nelem, WorldMaxOp<T>());
904 }
905
906 /// Inplace global absmin while still processing AM & tasks
907 template <typename T>
908 inline void absmin(T* buf, size_t nelem) {
909 reduce< T, WorldAbsMinOp<T> >(buf, nelem, WorldAbsMinOp<T>());
910 }
911
912 /// Inplace global absmax while still processing AM & tasks
913 template <typename T>
914 inline void absmax(T* buf, size_t nelem) {
915 reduce< T, WorldAbsMaxOp<T> >(buf, nelem, WorldAbsMaxOp<T>());
916 }
917
918 /// Inplace global product while still processing AM & tasks
919 template <typename T>
920 inline void product(T* buf, size_t nelem) {
921 reduce< T, WorldMultOp<T> >(buf, nelem, WorldMultOp<T>());
922 }
923
924 template <typename T>
925 inline void bit_and(T* buf, size_t nelem) {
926 reduce< T, WorldBitAndOp<T> >(buf, nelem, WorldBitAndOp<T>());
927 }
928
929 template <typename T>
930 inline void bit_or(T* buf, size_t nelem) {
931 reduce< T, WorldBitOrOp<T> >(buf, nelem, WorldBitOrOp<T>());
932 }
933
934 template <typename T>
935 inline void bit_xor(T* buf, size_t nelem) {
936 reduce< T, WorldBitXorOp<T> >(buf, nelem, WorldBitXorOp<T>());
937 }
938
939 template <typename T>
940 inline void logic_and(T* buf, size_t nelem) {
941 reduce< T, WorldLogicAndOp<T> >(buf, nelem, WorldLogicAndOp<T>());
942 }
943
944 template <typename T>
945 inline void logic_or(T* buf, size_t nelem) {
946 reduce< T, WorldLogicOrOp<T> >(buf, nelem, WorldLogicOrOp<T>());
947 }
948
949 /// Global sum of a scalar while still processing AM & tasks
950 template <typename T>
951 void sum(T& a) {
952 sum(&a, 1);
953 }
954
/// Global max of a single scalar while still processing AM & tasks

/// Treats \c a as a one-element buffer and forwards to the array overload.
/// \param[in,out] a the local value; passed by address to the array overload
template <typename T>
void max(T& a) {
    T* const single = &a;
    max(single, 1);
}
960
/// Global min of a single scalar while still processing AM & tasks

/// Treats \c a as a one-element buffer and forwards to the array overload.
/// \param[in,out] a the local value; passed by address to the array overload
template <typename T>
void min(T& a) {
    T* const single = &a;
    min(single, 1);
}
966
967 /// Concatenate an STL vector of serializable stuff onto node 0
968
969 /// \param[in] v input vector
970 /// \param[in] bufsz the max number of bytes in the result; must be less than std::numeric_limits<int>::max()
971 /// \return on rank 0 returns the concatenated vector, elsewhere returns an empty vector
972 template <typename T>
973 std::vector<T> concat0(const std::vector<T>& v, size_t bufsz=1024*1024) {
974 MADNESS_ASSERT(bufsz <= std::numeric_limits<int>::max());
975 // bufsz must be multiple of alignment!!! so ensure that
976 bufsz = ((bufsz + sizeof(void*) - 1) / sizeof(void*)) * sizeof(void*);
977
978 ProcessID parent, child0, child1;
979 world_.mpi.binary_tree_info(0, parent, child0, child1);
980 int child0_nbatch = 0, child1_nbatch = 0;
981
982 struct free_dtor {
983 void operator()(std::byte *ptr) {
984 if (ptr != nullptr)
985 std::free(ptr);
986 };
987 };
988 using sptr_t = std::unique_ptr<std::byte[], free_dtor>;
989
990 auto aligned_buf_alloc = [&]() -> std::byte* {
991#ifdef HAVE_POSIX_MEMALIGN
992 void *ptr;
993 if (posix_memalign(&ptr, sizeof(void *), bufsz) != 0) {
994 throw std::bad_alloc();
995 }
996 return static_cast<std::byte *>(ptr);
997#else
998 return static_cast<std::byte *>(
999 std::aligned_alloc(sizeof(void *), bufsz));
1000#endif
1001 };
1002
1003 auto buf0 = sptr_t(aligned_buf_alloc(),
1004 free_dtor{});
1005 auto buf1 = sptr_t(aligned_buf_alloc(),
1006 free_dtor{});
1007
1008 // transfer data in chunks at most this large
1009 const int batch_size = static_cast<int>(
1010 std::min(static_cast<size_t>(max_reducebcast_msg_size()), bufsz));
1011
1012 // precompute max # of tags any node ... will need, and allocate them on every node to avoid tag counter divergence
1013 const int max_nbatch = bufsz / batch_size;
1014 // one tag is reserved for sending the number of messages to expect and the size of the last message
1015 const int max_ntags = max_nbatch + 1;
1017 std::vector<Tag> tags; // stores tags used to send each batch
1018 tags.reserve(max_nbatch);
1019 for(int t=0; t<max_ntags; ++t) tags.push_back(world_.mpi.unique_tag());
1020
1021 if (child0 != -1 || child1 != -1) {
1022 // receive # of batches
1023
1024 auto receive_nbatch = [&,this]() {
1025 if (child0 != -1) {
1026 world_.mpi.Recv(&child0_nbatch, 1, MPI_INT, child0,
1027 tags[0]);
1028 }
1029 if (child1 != -1) {
1030 world_.mpi.Recv(&child1_nbatch, 1, MPI_INT, child1,
1031 tags[0]);
1032 }
1033 };
1034
1035 receive_nbatch();
1036
1037 // receive data in batches
1038
1039 auto receive_batch = [&,this](const int batch, const size_t buf_offset) {
1040 SafeMPI::Request req0, req1;
1041 if (child0 != -1 && batch < child0_nbatch) {
1042 int msg_size = batch_size;
1043 // if last batch, receive # of bytes to expect
1044 if (batch + 1 == child0_nbatch) {
1045 auto req = world_.mpi.Irecv(
1046 &msg_size, 1, MPI_INT, child0, tags[0]);
1047 World::await(req);
1048 }
1049
1050 req0 = world_.mpi.Irecv(buf0.get() + buf_offset,
1051 msg_size, MPI_BYTE, child0,
1052 tags[batch + 1]);
1053 }
1054 if (child1 != -1 && batch < child1_nbatch) {
1055 int msg_size = batch_size;
1056 // if last batch, receive # of bytes to expect
1057 if (batch + 1 == child1_nbatch) {
1058 auto req = world_.mpi.Irecv(
1059 &msg_size, 1, MPI_INT, child1, tags[0]);
1060 World::await(req);
1061 }
1062 req1 = world_.mpi.Irecv(buf1.get() + buf_offset,
1063 msg_size, MPI_BYTE, child1,
1064 tags[batch + 1]);
1065 }
1066
1067 if (child0 != -1 && batch < child0_nbatch) {
1068 World::await(req0);
1069 }
1070 if (child1 != -1 && batch < child1_nbatch) {
1071 World::await(req1);
1072 }
1073 };
1074
1075 size_t buf_offset = 0;
1076 int batch = 0;
1077 while (buf_offset < bufsz) {
1078 receive_batch(batch, buf_offset);
1079 buf_offset += batch_size;
1080 buf_offset = std::min(buf_offset, bufsz);
1081 ++batch;
1082 }
1083 }
1084
1085
1086 std::vector<T> left, right;
1087 if (child0 != -1) {
1088 archive::BufferInputArchive ar(buf0.get(), bufsz);
1089 ar & left;
1090 }
1091 if (child1 != -1) {
1092 archive::BufferInputArchive ar(buf1.get(), bufsz);
1093 ar & right;
1094 for (unsigned int i = 0; i < right.size(); ++i)
1095 left.push_back(right[i]);
1096 }
1097 for (unsigned int i=0; i<v.size(); ++i) left.push_back(v[i]);
1098
1099 // send data in batches
1100 if (parent != -1) {
1101 archive::BufferOutputArchive ar(buf0.get(), bufsz);
1102 ar & left;
1103 const auto total_nbytes_to_send = ar.size();
1104
1105 // send nbatches to expect
1106 const int nbatch = (total_nbytes_to_send + batch_size - 1) / batch_size;
1107 world_.mpi.Send(&nbatch, 1, MPI_INT, parent,
1108 tags[0]);
1109
1110 size_t buf_offset = 0;
1111 int batch = 0;
1112 while (buf_offset < bufsz) {
1113
1114 // send data in batches
1115 auto send_batch = [&,this](const int batch, const size_t buf_offset) {
1116 const int nbytes_to_send = static_cast<int>(
1117 std::min(static_cast<size_t>(batch_size),
1118 total_nbytes_to_send - buf_offset));
1119 // if last batch, send # of bytes to expect
1120 if (batch + 1 == nbatch) {
1121 auto req = world_.mpi.Isend(
1122 &nbytes_to_send, 1, MPI_INT, parent, tags[0]);
1123 World::await(req);
1124 }
1125 auto req0 =
1126 world_.mpi.Isend(buf0.get() + buf_offset, nbytes_to_send,
1127 MPI_BYTE, parent, tags[batch + 1]);
1128 World::await(req0);
1129 };
1130
1131 send_batch(batch, buf_offset);
1132 buf_offset += batch_size;
1133 buf_offset = std::min(buf_offset, bufsz);
1134 ++batch;
1135 }
1136 }
1137
1138 if (parent == -1) return left;
1139 else return std::vector<T>();
1140 }
1141
1142 /// Receive data from \c source
1143
1144 /// \tparam valueT The data type stored in cache
1145 /// \tparam keyT The key type
1146 /// \param source The process that is sending the data to this process
1147 /// \param key The key associated with the received data
1148 /// \return A future that will be set with the received data
1149 /// \note It is the user's responsibility to ensure that \c key does not
1150 /// conflict with other calls to \c recv. Keys may be reused after the
1151 /// associated operation has finished.
1152 template <typename valueT, typename keyT>
1153 static Future<valueT> recv(const ProcessID source, const keyT& key) {
1154 return recv_internal<valueT>(ProcessKey<keyT, PointToPointTag>(key, source));
1155 }
1156
1157 /// Send value to \c dest
1158
1159 /// \tparam keyT The key type
1160 /// \tparam valueT The value type (this may be a \c Future type)
1161 /// \param dest The process where the data will be sent
1162 /// \param key The key that is associated with the data
1163 /// \param value The data to be sent to \c dest
1164 /// \note It is the user's responsibility to ensure that \c key does not
1165 /// conflict with other calls to \c send. Keys may be reused after the
1166 /// associated operation has finished.
1167 template <typename keyT, typename valueT>
1168 void send(const ProcessID dest, const keyT& key, const valueT& value) const {
1170 }
1171
1172 /// Lazy sync
1173
1174 /// Lazy sync functions are asynchronous barriers with a nullary functor
1175 /// that is called after all processes have called it with the same
1176 /// key. You can think of lazy_sync as an asynchronous barrier. The
1177 /// lazy_sync functor must have the following signature:
1178 /// \code
1179 /// class SyncFunc {
1180 /// public:
1181 /// // typedefs
1182 /// typedef void result_type;
1183 ///
1184 /// // Constructors
1185 /// SyncFunc(const SyncFunc&);
1186 ///
1187 /// // The function that performs the sync operation
1188 /// void operator()();
1189 ///
1190 /// }; // class SyncFunc
1191 /// \endcode
1192 /// \tparam keyT The key type
1193 /// \tparam opT The operation type
1194 /// \param key The sync key
1195 /// \param op The sync operation to be executed on this process
1196 /// \note It is the user's responsibility to ensure that \c key does not
1197 /// conflict with other calls to \c lazy_sync. Keys may be reused after
1198 /// the associated operation has finished.
1199 template <typename keyT, typename opT>
1200 void lazy_sync(const keyT& key, const opT& op) const {
1201 if(world_.size() > 1) { // Do nothing for the trivial case
1202 // Get the binary tree data
1203 Hash<keyT> hasher;
1204 const ProcessID root = hasher(key) % world_.size();
1205 ProcessID parent = -1, child0 = -1, child1 = -1;
1206 world_.mpi.binary_tree_info(root, parent, child0, child1);
1207
1208 lazy_sync_internal<LazySyncTag>(parent, child0, child1, key, op);
1209 } else {
1210 auto lazy_sync_children_fn = & WorldGopInterface::template lazy_sync_children<keyT, opT>;
1211 // There is only one process, so run the sync operation now.
1212 world_.taskq.add(*this, lazy_sync_children_fn,
1213 -1, -1, key, op, -1, TaskAttributes::hipri());
1214 }
1215 }
1216
1217
1218 /// Group lazy sync
1219
1220 /// Lazy sync functions are asynchronous barriers with a nullary functor
1221 /// that is called after all processes in the group have called it with
1222 /// the same key. You can think of lazy_sync as an asynchronous barrier.
1223 /// The \c op functor must have the following signature:
1224 /// \code
1225 /// class SyncFunc {
1226 /// public:
1227 /// // typedefs
1228 /// typedef void result_type;
1229 ///
1230 /// // Constructors
1231 /// SyncFunc(const SyncFunc&);
1232 ///
1233 /// // The function that performs the sync operation
1234 /// void operator()();
1235 ///
1236 /// }; // class SyncFunc
1237 /// \endcode
1238 /// \tparam keyT The key type
1239 /// \tparam opT The operation type
1240 /// \param key The sync key
1241 /// \param op The sync operation to be executed on this process
1242 /// \note It is the user's responsibility to ensure that \c key does not
1243 /// conflict with other calls to \c lazy_sync. Keys may be reused after
1244 /// the associated operation has finished.
1245 template <typename keyT, typename opT>
1246 void lazy_sync(const keyT& key, const opT& op, const Group& group) const {
1247 MADNESS_ASSERT(! group.empty());
1248 MADNESS_ASSERT(group.get_world().id() == world_.id());
1249
1250 if(group.size() > 1) { // Do nothing for the trivial case
1251 // Get the binary tree data
1252 Hash<keyT> hasher;
1253 const ProcessID group_root = hasher(key) % group.size();
1254 ProcessID parent = -1, child0 = -1, child1 = -1;
1255 group.make_tree(group_root, parent, child0, child1);
1256
1257 lazy_sync_internal<GroupLazySyncTag>(parent, child0, child1, key, op);
1258 } else {
1259 auto lazy_sync_children_fn = & WorldGopInterface::template lazy_sync_children<keyT, opT>;
1260 world_.taskq.add(*this, lazy_sync_children_fn,
1261 -1, -1, key, op, -1, TaskAttributes::hipri());
1262 }
1263 }
1264
1265 /// Broadcast
1266
1267 /// Broadcast data from the \c root process to all processes. The input/
1268 /// output data is held by \c value.
1269 /// \param[in] key The key associated with this broadcast
1270 /// \param[in,out] value On the \c root process, this is used as the input
1271 /// data that will be broadcast to all other processes. On other
1272 /// processes it is used as the output to the broadcast.
1273 /// \param root The process that owns the data to be broadcast
1274 /// \throw madness::Exception When \c root is less than 0 or greater
1275 /// than or equal to the world size.
1276 /// \throw madness::Exception When \c value has been set, except on the
1277 /// \c root process.
1278 /// \note It is the user's responsibility to ensure that \c key does not
1279 /// conflict with other calls to \c bcast. Keys may be reused after
1280 /// the associated operation has finished.
1281 template <typename keyT, typename valueT>
1282 void bcast(const keyT& key, Future<valueT>& value, const ProcessID root) const {
1283 MADNESS_ASSERT((root >= 0) && (root < world_.size()));
1284 MADNESS_ASSERT((world_.rank() == root) || (! value.probe()));
1285
1286 if(world_.size() > 1) // Do nothing for the trivial case
1287 bcast_internal<BcastTag>(key, value, root);
1288 }
1289
1290 /// Group broadcast
1291
1292 /// Broadcast data from the \c group_root process to all processes in
1293 /// \c group. The input/output data is held by \c value.
1294 /// \param[in] key The key associated with this broadcast
1295 /// \param[in,out] value On the \c group_root process, this is used as the
1296 /// input data that will be broadcast to all other processes in the group.
1297 /// On other processes it is used as the output to the broadcast
1298 /// \param group_root The process in \c group that owns the data to be
1299 /// broadcast
1300 /// \param group The process group where value will be broadcast
1301 /// \throw madness::Exception When \c group is empty
1302 /// \throw madness::Exception When \c group is not registered
1303 /// \throw madness::Exception When the world id of \c group is not
1304 /// equal to that of the world used to construct this object
1305 /// \throw madness::Exception When this process is not in the group
1306 /// \throw madness::Exception When \c group_root is less than 0 or
1307 /// greater than or equal to \c group size
1308 /// \throw madness::Exception When \c data has been set except on the
1309 /// \c root process
1310 /// \note It is the user's responsibility to ensure that \c key does not
1311 /// conflict with other calls to \c bcast. Keys may be reused after
1312 /// the associated operation has finished.
1313 template <typename keyT, typename valueT>
1314 void bcast(const keyT& key, Future<valueT>& value,
1315 const ProcessID group_root, const Group& group) const
1316 {
1317 MADNESS_ASSERT(! group.empty());
1318 MADNESS_ASSERT(group.get_world().id() == world_.id());
1319 MADNESS_ASSERT((group_root >= 0) && (group_root < group.size()));
1320 MADNESS_ASSERT((group.rank() == group_root) || (! value.probe()));
1321
1322 if(group.size() > 1) // Do nothing for the trivial case
1323 bcast_internal<GroupBcastTag>(key, value, group_root, group);
1324 }
1325
1326 /// Distributed reduce
1327
1328 /// The reduce functor must have the following signature:
1329 /// \code
1330 /// class ReduceFunc {
1331 /// public:
1332 /// // Typedefs
1333 /// typedef ... result_type;
1334 /// typedef ... argument_type;
1335 ///
1336 /// // Constructors
1337 /// ReduceFunc(const ReduceFunc&);
1338 ///
1339 /// // Initialization operation, which returns a default result object
1340 /// result_type operator()() const;
1341 ///
1342 /// // Reduce two result objects
1343 /// void operator()(result_type&, const result_type&) const;
1344 ///
1345 /// // Reduce a result object and an argument object
1346 /// void operator()(result_type&, const argument_type&) const;
1347 /// }; // class ReduceFunc
1348 /// \endcode
1349 /// \tparam keyT The key type
1350 /// \tparam valueT The data type to be reduced
1351 /// \tparam opT The reduction operation type
1352 /// \param key The key associated with this reduction
1353 /// \param value The local value to be reduced
1354 /// \param op The reduction operation to be applied to local and remote data
1355 /// \param root The process that will receive the result of the reduction
1356 /// \return A future to the reduce value on the root process, otherwise an
1357 /// uninitialized future that may be ignored.
1358 /// \note It is the user's responsibility to ensure that \c key does not
1359 /// conflict with other calls to \c reduce. Keys may be reused after
1360 /// the associated operation has finished.
1361 template <typename keyT, typename valueT, typename opT>
1363 reduce(const keyT& key, const valueT& value, const opT& op, const ProcessID root) {
1364 MADNESS_ASSERT((root >= 0) && (root < world_.size()));
1365
1366 // Get the binary tree data
1367 ProcessID parent = -1, child0 = -1, child1 = -1;
1368 world_.mpi.binary_tree_info(root, parent, child0, child1);
1369
1370 return reduce_internal<ReduceTag>(parent, child0, child1, root, key,
1371 value, op);
1372 }
1373
1374 /// Distributed group reduce
1375
1376 /// The reduce functor must have the following signature:
1377 /// \code
1378 /// class ReduceFunc {
1379 /// public:
1380 /// // Typedefs
1381 /// typedef ... result_type;
1382 /// typedef ... argument_type;
1383 ///
1384 /// // Constructors
1385 /// ReduceFunc(const ReduceFunc&);
1386 ///
1387 /// // Initialization operation, which returns a default result object
1388 /// result_type operator()() const;
1389 ///
1390 /// // Reduce two result objects
1391 /// void operator()(result_type&, const result_type&) const;
1392 ///
1393 /// // Reduce a result object and an argument object
1394 /// void operator()(result_type&, const argument_type&) const;
1395 /// }; // class ReduceFunc
1396 /// \endcode
1397 /// \tparam keyT The key type
1398 /// \tparam valueT The data type to be reduced
1399 /// \tparam opT The reduction operation type
1400 /// \param key The key associated with this reduction
1401 /// \param value The local value to be reduced
1402 /// \param op The reduction operation to be applied to local and remote data
1403 /// \param group_root The group process that will receive the result of the reduction
1404 /// \param group The group that will preform the reduction
1405 /// \return A future to the reduce value on the root process, otherwise an
1406 /// uninitialized future that may be ignored.
1407 /// \throw madness::Exception When \c group is empty
1408 /// \throw madness::Exception When \c group is not registered
1409 /// \throw madness::Exception When the world id of \c group is not
1410 /// equal to that of the world used to construct this object
1411 /// \throw madness::Exception When this process is not in the group
1412 /// \throw madness::Exception When \c group_root is less than zero or
1413 /// greater than or equal to \c group size.
1414 /// \note It is the user's responsibility to ensure that \c key does not
1415 /// conflict with other calls to \c reduce. Keys may be reused after
1416 /// the associated operation has finished.
1417 template <typename keyT, typename valueT, typename opT>
1419 reduce(const keyT& key, const valueT& value, const opT& op,
1420 const ProcessID group_root, const Group& group)
1421 {
1422 MADNESS_ASSERT(! group.empty());
1423 MADNESS_ASSERT(group.get_world().id() == world_.id());
1424 MADNESS_ASSERT((group_root >= 0) && (group_root < group.size()));
1425
1426 // Get the binary tree data
1427 ProcessID parent = -1, child0 = -1, child1 = -1;
1428 group.make_tree(group_root, parent, child0, child1);
1429
1430 return reduce_internal<ReduceTag>(parent, child0, child1, group_root,
1431 key, value, op);
1432 }
1433
1434 /// Distributed all reduce
1435
1436 /// The reduce functor must have the following signature:
1437 /// \code
1438 /// class ReduceFunc {
1439 /// public:
1440 /// // Typedefs
1441 /// typedef ... result_type;
1442 /// typedef ... argument_type;
1443 ///
1444 /// // Constructors
1445 /// ReduceFunc(const ReduceFunc&);
1446 ///
1447 /// // Initialization operation, which returns a default result object
1448 /// result_type operator()() const;
1449 ///
1450 /// // Reduce two result objects
1451 /// void operator()(result_type&, const result_type&) const;
1452 ///
1453 /// // Reduce a result object and an argument object
1454 /// void operator()(result_type&, const argument_type&) const;
1455 /// }; // class ReduceFunc
1456 /// \endcode
1457 /// \tparam keyT The key type
1458 /// \tparam valueT The data type to be reduced
1459 /// \tparam opT The reduction operation type
1460 /// \param key The key associated with this reduction
1461 /// \param value The local value to be reduced
1462 /// \param op The reduction operation to be applied to local and remote data
1463 /// \return A future to the reduce value on the root process, otherwise an
1464 /// uninitialized future that may be ignored.
1465 /// \note It is the user's responsibility to ensure that \c key does not
1466 /// conflict with other calls to \c all_reduce. Keys may be reused after
1467 /// the associated operation has finished.
1468 template <typename keyT, typename valueT, typename opT>
1470 all_reduce(const keyT& key, const valueT& value, const opT& op) {
1471 // Compute the parent and child processes of this process in a binary tree.
1472 Hash<keyT> hasher;
1473 const ProcessID root = hasher(key) % world_.size();
1474 ProcessID parent = -1, child0 = -1, child1 = -1;
1475 world_.mpi.binary_tree_info(root, parent, child0, child1);
1476
1477 // Reduce the data
1479 reduce_internal<AllReduceTag>(parent, child0, child1, root,
1480 key, value, op);
1481
1482 if(world_.rank() != root)
1484
1485 // Broadcast the result of the reduction to all processes
1486 bcast_internal<AllReduceTag>(key, reduce_result, root);
1487
1488 return reduce_result;
1489 }
1490
1491 /// Distributed, group all reduce
1492
1493 /// The reduce functor must have the following signature:
1494 /// \code
1495 /// class ReduceFunc {
1496 /// public:
1497 /// // Typedefs
1498 /// typedef ... result_type;
1499 /// typedef ... argument_type;
1500 ///
1501 /// // Constructors
1502 /// ReduceFunc(const ReduceFunc&);
1503 ///
1504 /// // Initialization operation, which returns a default result object
1505 /// result_type operator()() const;
1506 ///
1507 /// // Reduce two result objects
1508 /// void operator()(result_type&, const result_type&) const;
1509 ///
1510 /// // Reduce a result object and an argument object
1511 /// void operator()(result_type&, const argument_type&) const;
1512 /// }; // class ReduceFunc
1513 /// \endcode
1514 /// \tparam keyT The key type
1515 /// \tparam valueT The data type to be reduced
1516 /// \tparam opT The reduction operation type
1517 /// \param key The key associated with this reduction
1518 /// \param value The local value to be reduced
1519 /// \param op The reduction operation to be applied to local and remote data
1520 /// \param group The group that will preform the reduction
1521 /// \return A future to the reduce value on the root process, otherwise an
1522 /// uninitialized future that may be ignored
1523 /// \throw madness::Exception When \c group is empty
1524 /// \throw madness::Exception When \c group is not registered
1525 /// \throw madness::Exception When the world id of \c group is not
1526 /// equal to that of the world used to construct this object
1527 /// \throw madness::Exception When this process is not in the group
1528 /// \note It is the user's responsibility to ensure that \c key does not
1529 /// conflict with other calls to \c reduce. Keys may be reused after
1530 /// the associated operation has finished.
1531 template <typename keyT, typename valueT, typename opT>
1533 all_reduce(const keyT& key, const valueT& value, const opT& op, const Group& group) {
1534 MADNESS_ASSERT(! group.empty());
1535 MADNESS_ASSERT(group.get_world().id() == world_.id());
1536
1537 // Compute the parent and child processes of this process in a binary tree.
1538 Hash<keyT> hasher;
1539 const ProcessID group_root = hasher(key) % group.size();
1540 ProcessID parent = -1, child0 = -1, child1 = -1;
1541 group.make_tree(group_root, parent, child0, child1);
1542
1543 // Reduce the data
1545 reduce_internal<GroupAllReduceTag>(parent, child0, child1,
1546 group_root, key, value, op);
1547
1548
1549 if(group.rank() != group_root)
1551
1552 // Broadcast the result of the reduction to all processes in the group
1553 bcast_internal<GroupAllReduceTag>(key, reduce_result, 0, group);
1554
1555 return reduce_result;
1556 }
1557 }; // class WorldGopInterface
1558
1559} // namespace madness
1560
1561#endif // MADNESS_WORLD_WORLDGOP_H__INCLUDED
Implements an archive wrapping a memory buffer.
void binary_tree_info(int root, int &parent, int &child0, int &child1)
Construct info about a binary tree with given root.
Definition safempi.cc:39
int Get_rank() const
Definition safempi.h:721
static int unique_tag_period()
Definition safempi.h:843
int unique_tag()
Returns a unique tag for temporary use (1023<tag<4095)
Definition safempi.h:837
Definition safempi.h:296
World active message that extends an RMI message.
Definition worldam.h:80
The class used for callbacks (e.g., dependency tracking).
Definition dependency_interface.h:61
A future is a possibly yet unevaluated value.
Definition future.h:370
T & get(bool dowork=true) &
Gets the value, waiting if necessary.
Definition future.h:571
static const Future< T > default_initializer()
See "Gotchas" on Futures about why this exists and how to use it.
Definition future.h:459
bool probe() const
Check whether this future has been assigned.
Definition future.h:628
A collection of processes.
Definition group.h:50
void remote_update() const
Update remote usage count.
Definition group.h:383
void local_update() const
Update local usage count.
Definition group.h:369
ProcessID size() const
Group size accessor.
Definition group.h:429
const DistributedID & id() const
Group id accessor.
Definition group.h:396
bool empty() const
Query empty group.
Definition group.h:391
ProcessID rank() const
Group rank accessor.
Definition group.h:412
static madness::Future< Group > get_group(const DistributedID &did)
Get group from the registry.
Definition group.cc:90
void make_tree(const ProcessID group_root, ProcessID &parent, ProcessID &child1, ProcessID &child2) const
Compute the binary tree parents and children.
Definition group.h:449
World & get_world() const
Parent world accessor.
Definition group.h:404
Key object that includes the process information.
Definition distributed_id.h:80
static const attrT ATTR_UNORDERED
Definition worldrmi.h:180
Key object that uses a tag to differentiate keys.
Definition distributed_id.h:177
static TaskAttributes hipri()
Definition thread.h:456
void send(ProcessID dest, am_handlerT op, const AmArg *arg, const int attr=RMI::ATTR_ORDERED)
Sends a managed non-blocking active message.
Definition worldam.h:278
Delayed send callback object.
Definition worldgop.h:177
DelayedSend(World &world, const ProcessID dest, const keyT &key, const Future< valueT > &value)
Constructor.
Definition worldgop.h:191
virtual ~DelayedSend()
Definition worldgop.h:196
World & world_
The communication world.
Definition worldgop.h:179
const ProcessID dest_
The destination process id.
Definition worldgop.h:180
Future< valueT > value_
The data to be sent.
Definition worldgop.h:182
const keyT key_
The distributed id associated with value_.
Definition worldgop.h:181
DelayedSend< keyT, valueT > & operator=(const DelayedSend< keyT, valueT > &)
DelayedSend(const DelayedSend< keyT, valueT > &)
virtual void notify()
Notify this object that the future has been set.
Definition worldgop.h:202
Provides collectives that interoperate with the AM and task interfaces.
Definition worldgop.h:147
int max_reducebcast_msg_size() const
Returns the maximum size of messages (in bytes) sent by reduce and broadcast.
Definition worldgop.h:698
void lazy_sync(const keyT &key, const opT &op, const Group &group) const
Group lazy sync.
Definition worldgop.h:1246
void send_internal(ProcessID dest, const keyT &key, const Future< valueT > &value) const
Send value to dest.
Definition worldgop.h:252
void max(T *buf, size_t nelem)
Inplace global max while still processing AM & tasks.
Definition worldgop.h:902
static void bcast_handler(const AmArg &arg)
Definition worldgop.h:355
void lazy_sync(const keyT &key, const opT &op) const
Lazy sync.
Definition worldgop.h:1200
World & world_
World object that this is a part of.
Definition worldgop.h:149
int set_max_reducebcast_msg_size(int sz)
Set the maximum size of messages (in bytes) sent by reduce and broadcast.
Definition worldgop.h:688
std::shared_ptr< detail::DeferredCleanup > deferred_
Deferred cleanup object.
Definition worldgop.h:150
void reduce(T *buf, std::size_t nelem, opT op)
Inplace global reduction (like MPI all_reduce) while still processing AM & tasks.
Definition worldgop.h:803
void broadcast(T &t)
Broadcast of a scalar from node 0 to all other nodes.
Definition worldgop.h:760
~WorldGopInterface()
Definition worldgop.h:663
void broadcast_serializable(objT &obj, ProcessID root)
Broadcast a serializable object.
Definition worldgop.h:774
void lazy_sync_internal(const ProcessID parent, const ProcessID child0, const ProcessID child1, const keyT &key, const opT &op) const
Start a distributed lazy sync operation.
Definition worldgop.h:318
bool in_do_cleanup_
set while this gop's deferred_->do_cleanup() is running inside fence_impl, so that destructors invoke...
Definition worldgop.h:154
void sum(T &a)
Global sum of a scalar while still processing AM & tasks.
Definition worldgop.h:951
int max_reducebcast_msg_size_
maximum size of messages (in bytes) sent by reduce and broadcast
Definition worldgop.h:153
void fence(bool debug=false)
Synchronizes all processes in communicator AND globally ensures no pending AM or tasks.
Definition worldgop.cc:176
void broadcast(void *buf, size_t nbyte, ProcessID root, bool dowork=true, Tag bcast_tag=-1)
Broadcasts bytes from process root while still processing AM & tasks.
Definition worldgop.cc:188
void bit_and(T *buf, size_t nelem)
Definition worldgop.h:925
void bcast_internal(const keyT &key, Future< valueT > &value, const ProcessID root) const
Broadcast.
Definition worldgop.h:474
void lazy_sync_parent(const ProcessID parent, const keyT &key, const ProcessID, const ProcessID) const
Lazy sync parent task.
Definition worldgop.h:282
static Future< valueT > recv_internal(const keyT &key)
Receive data from remote node.
Definition worldgop.h:215
void absmin(T *buf, size_t nelem)
Inplace global absmin while still processing AM & tasks.
Definition worldgop.h:908
void bit_or(T *buf, size_t nelem)
Definition worldgop.h:930
std::enable_if<!is_future< valueT >::value >::type send_internal(const ProcessID dest, const keyT &key, const valueT &value) const
Send value to dest.
Definition worldgop.h:229
WorldGopInterface(World &world)
Definition worldgop.h:659
bool set_forbid_fence(bool value)
Set forbid_fence flag to new value and return old value.
Definition worldgop.h:677
void bcast(const keyT &key, Future< valueT > &value, const ProcessID group_root, const Group &group) const
Group broadcast.
Definition worldgop.h:1314
void bcast_task(const keyT &key, const valueT &value, const ProcessID root) const
Broadcast task.
Definition worldgop.h:394
void group_bcast_task(const keyT &key, const valueT &value, const ProcessID group_root, const Group &group) const
Definition worldgop.h:425
void send(const ProcessID dest, const keyT &key, const valueT &value) const
Send value to dest.
Definition worldgop.h:1168
void logic_or(T *buf, size_t nelem)
Definition worldgop.h:945
void bcast(const keyT &key, Future< valueT > &value, const ProcessID root) const
Broadcast.
Definition worldgop.h:1282
int initial_max_reducebcast_msg_size()
Definition worldgop.h:632
void serial_invoke(std::function< void()> action)
Executes an action on single (this) thread after ensuring all other work is done.
Definition worldgop.cc:180
static void group_bcast_handler(const AmArg &arg)
Definition worldgop.h:370
Future< typename detail::result_of< opT >::type > all_reduce(const keyT &key, const valueT &value, const opT &op)
Distributed all reduce.
Definition worldgop.h:1470
Future< typename detail::result_of< opT >::type > reduce_internal(const ProcessID parent, const ProcessID child0, const ProcessID child1, const ProcessID root, const keyT &key, const valueT &value, const opT &op)
Distributed reduce.
Definition worldgop.h:586
bool forbid_fence_
forbid calling fence() in case of several active worlds
Definition worldgop.h:152
static detail::result_of< opT >::type reduce_result_task(const std::vector< Future< typename detail::result_of< opT >::type > > &results, const opT &op)
Definition worldgop.h:562
void absmax(T *buf, size_t nelem)
Inplace global absmax while still processing AM & tasks.
Definition worldgop.h:914
void broadcast(T *buf, size_t nelem, ProcessID root)
Broadcasts typed contiguous data from process root while still processing AM & tasks.
Definition worldgop.h:754
void fence_impl(std::function< void()> epilogue=[]{}, bool pause_during_epilogue=false, bool debug=false)
Implementation of fence.
Definition worldgop.cc:50
static detail::result_of< opT >::type reduce_task(const valueT &value, const opT &op)
Definition worldgop.h:554
void product(T *buf, size_t nelem)
Inplace global product while still processing AM & tasks.
Definition worldgop.h:920
void min(T *buf, size_t nelem)
Inplace global min while still processing AM & tasks.
Definition worldgop.h:896
void logic_and(T *buf, size_t nelem)
Definition worldgop.h:940
bool is_in_do_cleanup() const
Definition worldgop.h:715
static Future< valueT > recv(const ProcessID source, const keyT &key)
Receive data from source.
Definition worldgop.h:1153
Future< typename detail::result_of< opT >::type > reduce(const keyT &key, const valueT &value, const opT &op, const ProcessID group_root, const Group &group)
Distributed group reduce.
Definition worldgop.h:1419
bool debug_
Debug mode.
Definition worldgop.h:151
void min(T &a)
Global min of a scalar while still processing AM & tasks.
Definition worldgop.h:963
void max(T &a)
Global max of a scalar while still processing AM & tasks.
Definition worldgop.h:957
std::vector< T > concat0(const std::vector< T > &v, size_t bufsz=1024 *1024)
Concatenate an STL vector of serializable stuff onto node 0.
Definition worldgop.h:973
Future< typename detail::result_of< opT >::type > reduce(const keyT &key, const valueT &value, const opT &op, const ProcessID root)
Distributed reduce.
Definition worldgop.h:1363
void barrier()
Synchronizes all processes in communicator ... does NOT fence pending AM or tasks.
Definition worldgop.h:720
Future< typename detail::result_of< opT >::type > all_reduce(const keyT &key, const valueT &value, const opT &op, const Group &group)
Distributed, group all reduce.
Definition worldgop.h:1533
void lazy_sync_children(const ProcessID child0, const ProcessID child1, const keyT &key, opT &op, const ProcessID) const
Lazy sync parent task.
Definition worldgop.h:300
void bcast_internal(const keyT &key, Future< valueT > &value, const ProcessID group_root, const Group &group) const
Group broadcast.
Definition worldgop.h:524
void sum(T *buf, size_t nelem)
Inplace global sum while still processing AM & tasks.
Definition worldgop.h:890
void bit_xor(T *buf, size_t nelem)
Definition worldgop.h:935
friend class detail::DeferredCleanup
Definition worldgop.h:156
bool set_debug(bool value)
Set debug flag to new value and return old value.
Definition worldgop.h:670
void broadcast(T &t, ProcessID root)
Broadcast of a scalar from node root to all other nodes.
Definition worldgop.h:766
std::enable_if<!std::is_pointer< T >::value, SafeMPI::Request >::type Isend(const T &datum, int dest, int tag=SafeMPI::DEFAULT_SEND_RECV_TAG) const
Isend one element.
Definition worldmpi.h:308
SafeMPI::Request Irecv(T *buf, int count, int source, int tag=SafeMPI::DEFAULT_SEND_RECV_TAG) const
Async receive data of up to count elements from process source.
Definition worldmpi.h:321
void Send(const T *buf, long lenbuf, int dest, int tag=SafeMPI::DEFAULT_SEND_RECV_TAG) const
Send array of lenbuf elements to process dest.
Definition worldmpi.h:347
void Recv(T *buf, long lenbuf, int src, int tag) const
Receive data of up to lenbuf elements from process src.
Definition worldmpi.h:374
void add(TaskInterface *t)
Add a new local task, taking ownership of the pointer.
Definition world_task_queue.h:466
A parallel world class.
Definition world.h:132
WorldTaskQueue & taskq
Task queue.
Definition world.h:206
ProcessID rank() const
Returns the process rank in this World (same as MPI_Comm_rank()).
Definition world.h:320
static void await(SafeMPI::Request &request, bool dowork=true)
Wait for a MPI request to complete.
Definition world.h:534
WorldMpiInterface & mpi
MPI interface.
Definition world.h:204
ProcessID size() const
Returns the number of processes in this World (same as MPI_Comm_size()).
Definition world.h:330
unsigned long id() const
Definition world.h:315
WorldGopInterface & gop
Global operations.
Definition world.h:207
WorldAmInterface & am
AM interface.
Definition world.h:205
Wraps an archive around a memory buffer for input.
Definition buffer_archive.h:134
Wraps an archive around a memory buffer for output.
Definition buffer_archive.h:59
std::size_t size() const
Return the amount of data stored (counted) in the buffer.
Definition buffer_archive.h:123
Deferred cleanup of shared_ptr's.
Definition deferred_cleanup.h:60
Distributed caching utility.
Definition dist_cache.h:54
static void get_cache_value(const keyT &key, madness::Future< valueT > &value)
Get the cache value associated with key.
Definition dist_cache.h:185
static void set_cache_value(const keyT &key, const valueT &value)
Set the cache value associated with key.
Definition dist_cache.h:146
static bool debug
Definition dirac-hatom.cc:16
auto T(World &world, response_space &f) -> response_space
Definition global_functions.cc:28
Tensor< typename Tensor< T >::scalar_type > arg(const Tensor< T > &t)
Return a new tensor holding the argument of each element of t (complex types only)
Definition tensor.h:2518
static const double v
Definition hatom_sf_dirac.cc:20
Tensor< double > op(const Tensor< double > &x)
Definition kain.cc:508
#define max(a, b)
Definition lda.h:51
#define MADNESS_ASSERT(condition)
Assert a condition that should be free of side-effects since in release builds this might be a no-op.
Definition madness_exception.h:134
Intracomm COMM_WORLD
Definition safempi.cc:67
Definition potentialmanager.cc:41
Namespace for all elements and tools of MADNESS.
Definition DFParameters.h:10
std::pair< uniqueidT, std::size_t > DistributedID
Distributed ID which is used to identify objects.
Definition distributed_id.h:48
double abs(double x)
Definition complexfun.h:48
AmArg * copy_am_arg(const AmArg &arg)
Definition worldam.h:170
AmArg * new_am_arg(const argT &... args)
Convenience template for serializing arguments into a new AmArg.
Definition worldam.h:194
bool quiet()
Check if the MADNESS runtime was initialized for quiet operation.
Definition world.cc:77
void error(const char *msg)
Definition world.cc:142
std::string type(const PairType &n)
Definition PNOParameters.h:18
std::uint64_t cstr_to_memory_size(const char *str)
Unit-aware conversion of a C string to a size_t.
Definition units.cc:14
static long abs(long a)
Definition tensor.h:218
static const double b
Definition nonlinschro.cc:119
static const double a
Definition nonlinschro.cc:118
int posix_memalign(void **memptr, std::size_t alignment, std::size_t size)
Definition posixmem.h:44
Definition test_dc.cc:47
Hash functor.
Definition worldhash.h:233
Definition worldgop.h:86
T operator()(const T &a, const T &b) const
Definition worldgop.h:87
Definition worldgop.h:101
T operator()(const T &a, const T &b) const
Definition worldgop.h:102
Definition worldgop.h:109
T operator()(const T &a, const T &b) const
Definition worldgop.h:110
Definition worldgop.h:116
T operator()(const T &a, const T &b) const
Definition worldgop.h:117
Definition worldgop.h:123
T operator()(const T &a, const T &b) const
Definition worldgop.h:124
Definition worldgop.h:162
Definition worldgop.h:164
Definition worldgop.h:130
T operator()(const T &a, const T &b) const
Definition worldgop.h:131
Definition worldgop.h:137
T operator()(const T &a, const T &b) const
Definition worldgop.h:138
Definition worldgop.h:79
T operator()(const T &a, const T &b) const
Definition worldgop.h:80
Definition worldgop.h:94
T operator()(const T &a, const T &b) const
Definition worldgop.h:95
Definition worldgop.h:72
T operator()(const T &a, const T &b) const
Definition worldgop.h:73
Definition worldgop.h:65
T operator()(const T &a, const T &b) const
Definition worldgop.h:66
fnT::result_type type
Definition function_traits.h:97
T type
Type with Future removed.
Definition type_traits.h:111
#define MPI_INT
Definition stubmpi.h:81
#define MPI_BYTE
Definition stubmpi.h:77
AtomicInt sum
Definition test_atomicint.cc:46
std::pair< int, double > valueT
Definition test_binsorter.cc:6
double source(const coordT &r)
Definition testperiodic.cc:48
const char * status[2]
Definition testperiodic.cc:43
Declares the World class for the parallel runtime environment.
Defines TaskInterface and implements WorldTaskQueue and associated stuff.
Defines types used by the parallel runtime.
int ProcessID
Used to clearly identify process number/rank.
Definition worldtypes.h:43
int Tag
Used to clearly identify message tag/type.
Definition worldtypes.h:44