1#ifndef SRC_APPS_CHEM_EXCHANGEOPERATOR_H_
2#define SRC_APPS_CHEM_EXCHANGEOPERATOR_H_
16template<
typename T, std::
size_t NDIM>
35 double t1 = double(mul1_timer) * 0.001;
36 double t2 = double(apply_timer) * 0.001;
37 double t3 = double(mul2_timer) * 0.001;
45 j[
"total"] = elapsed_time;
50 auto timings= gather_timings(world);
51 if (world.
rank() == 0) {
52 printf(
" cpu time spent in multiply1 %8.2fs\n", timings[
"multiply1"].
template get<double>());
53 printf(
" cpu time spent in apply %8.2fs\n", timings[
"apply"].
template get<double>());
54 printf(
" cpu time spent in multiply2 %8.2fs\n", timings[
"multiply2"].
template get<double>());
55 printf(
" total wall time %8.2fs\n", timings[
"total"].
template get<double>());
78 mo_bra =
copy(world, bra);
79 mo_ket =
copy(world, ket);
82 std::string
info()
const {
return "K";}
131 j[
"symmetric"] = symmetric_;
134 j[
"mul_tol"] = mul_tol;
135 j[
"printlevel"] = printlevel;
138 auto timings = gather_timings(world);
146 vecfuncT K_macrotask_efficient(
const vecfuncT& vket,
const double mul_tol = 0.0)
const;
149 vecfuncT K_macrotask_efficient_row(
const vecfuncT& vket,
const double mul_tol = 0.0)
const;
152 vecfuncT K_small_memory(
const vecfuncT& vket,
const double mul_tol = 0.0)
const;
155 vecfuncT K_large_memory(
const vecfuncT& vket,
const double mul_tol = 0.0)
const;
159 const vecfuncT& vket, std::shared_ptr<real_convolution_3d> poisson,
160 const bool symmetric,
const double mul_tol = 0.0);
163 inline bool printprogress()
const {
return (printlevel>=4) and (not (printdebug()));}
169 bool symmetric_ =
false;
182 double mul_tol = 1.e-7;
183 bool symmetric =
false;
195 bool symmetric =
false;
198 const std::string policy)
const override {
200 partitionT partition1 = do_1d_partition(vsize1, policy);
201 partitionT partition2 = do_1d_partition(vsize2, policy);
203 for (
auto i = partition1.begin(); i != partition1.end(); ++i) {
205 for (
auto j = i; j != partition1.end(); ++j) {
206 Batch batch(i->first.input[0], j->first.input[0], _);
207 double priority=compute_priority(batch);
208 result.push_back(std::make_pair(batch,priority));
211 for (
auto j = partition2.begin(); j != partition2.end(); ++j) {
212 Batch batch(i->first.input[0], j->first.input[0], _);
213 double priority=compute_priority(batch);
214 result.push_back(std::make_pair(batch,priority));
226 long nrow = batch.
input[0].size();
227 long ncol = batch.
input[1].size();
228 return double(nrow * ncol);
234 : nresult(nresult),
lo(
lo), mul_tol(mul_tol), symmetric(symmetric) {
240 typedef std::tuple<const std::vector<Function<T, NDIM>>&,
241 const std::vector<Function<T, NDIM>>&,
244 using resultT = std::vector<Function<T, NDIM>>;
249 std::size_t n = std::get<0>(argtuple).size();
250 resultT result = zero_functions_compressed<T, NDIM>(world, n);
254 std::vector<Function<T, NDIM>>
259 World& world = vf_batch.front().world();
260 resultT Kf = zero_functions_compressed<T, NDIM>(world, nresult);
262 bool diagonal_block = batch.input[0] == batch.input[1];
263 auto& bra_range = batch.input[1];
264 auto& vf_range = batch.input[0];
266 if (vf_range.is_full_size()) vf_range.end = vf_batch.size();
267 if (bra_range.is_full_size()) bra_range.end = bra_batch.size();
272 if (symmetric and diagonal_block) {
273 auto ket_batch = bra_range.copy_batch(vket);
274 vecfuncT resultcolumn = compute_diagonal_batch_in_symmetric_matrix(world, ket_batch, bra_batch,
277 for (
int i = vf_range.begin; i < vf_range.end; ++i){
278 Kf[i] += resultcolumn[i - vf_range.begin];}
280 }
else if (symmetric and not diagonal_block) {
281 auto[resultcolumn, resultrow]=compute_offdiagonal_batch_in_symmetric_matrix(world, vket, bra_batch,
284 for (
int i = bra_range.begin; i < bra_range.end; ++i){
285 Kf[i] += resultcolumn[i - bra_range.begin];}
286 for (
int i = vf_range.begin; i < vf_range.end; ++i){
287 Kf[i] += resultrow[i - vf_range.begin];}
289 auto ket_batch = bra_range.copy_batch(vket);
290 vecfuncT resultcolumn = compute_batch_in_asymmetric_matrix(world, ket_batch, bra_batch, vf_batch);
291 for (
int i = vf_range.begin; i < vf_range.end; ++i)
292 Kf[i] += resultcolumn[i - vf_range.begin];
309 double mul_tol = 0.0;
310 double symmetric =
true;
327 double mul_tol = 0.0;
328 double symmetric =
false;
341 std::pair<vecfuncT, vecfuncT> compute_offdiagonal_batch_in_symmetric_matrix(
World& subworld,
352 double mul_tol = 1.e-7;
353 bool symmetric =
false;
366 : nresult(nresult),
lo(
lo), mul_tol(mul_tol), algorithm_(algorithm) {
368 name=
"MacroTaskExchangeRow";
372 typedef std::tuple<const std::vector<Function<T, NDIM>>&,
373 const std::vector<Function<T, NDIM>>&,
376 using resultT = std::vector<Function<T, NDIM>>;
381 std::size_t n = std::get<0>(argtuple).size();
382 resultT result = zero_functions_compressed<T, NDIM>(world, n);
390 std::vector<Function<T, NDIM>>
394 std::vector<Function<T,NDIM>> result;
395 if (algorithm_==fetch_compute) {
396 result=row_fetch_compute(vket,mo_bra,mo_ket);
397 }
else if (algorithm_==multiworld_efficient_row) {
398 result=row(vket,mo_bra,mo_ket);
400 MADNESS_EXCEPTION(
"unknown algorithm in Exchange::MacroTaskExchangeRow::operator()",1);
405 std::vector<Function<T,NDIM>>
411 World& world = vket.front().world();
414 resultT Kf = zero_functions_compressed<T, NDIM>(world, 1);
415 vecfuncT psif = zero_functions_compressed<T,NDIM>(world, mo_bra.
size());
421 MADNESS_CHECK_THROW(vket.size()==1,
"out-of-bounds error in Exchange::MacroTaskExchangeRow::operator()");
422 size_t min_tile = 10;
423 size_t ntile = std::min(mo_bra.size(), min_tile);
425 for (
size_t ilo=0; ilo<mo_bra.size(); ilo+=ntile){
427 size_t iend = std::min(ilo+ntile,mo_bra.size());
428 vecfuncT tmp_mo_bra(mo_bra.begin()+ilo,mo_bra.begin()+iend);
429 auto tmp_psif =
mul_sparse(world, vket[i], tmp_mo_bra, mul_tol);
432 mul1_timer += long((cpu1 - cpu0) * 1000l);
435 tmp_psif =
apply(world, *poisson.get(), tmp_psif);
438 apply_timer += long((cpu1 - cpu0) * 1000l);
441 vecfuncT tmp_mo_ket(mo_ket.begin()+ilo,mo_ket.begin()+iend);
442 auto tmp_Kf =
dot(world, tmp_mo_ket, tmp_psif);
444 mul2_timer += long((cpu1 - cpu0) * 1000l);
453 std::vector<Function<T,NDIM>>
459 double total_execution_time=0.0;
460 double total_fetch_time=0.0;
461 double total_fetch_spawn_time=0.0;
463 resultT Kf = zero_functions_compressed<T, NDIM>(*subworld_ptr, 1);
469 std::shared_ptr<World> fetching_world(
new World(comm.
Clone()));
470 std::shared_ptr<World> executing_world(
new World(comm.
Clone()));
472 print(
"time to create two worlds:",cpu1-cpu0,
"seconds");
473 print(
"executing_world.id()",executing_world->id(),
"fetching_world.id()",fetching_world->id(),
"in MacroTaskExchangeRow");
484 MADNESS_CHECK_THROW(vket.size()==1,
"out-of-bounds error in Exchange::MacroTaskExchangeRow::operator()");
485 size_t min_tile = 10;
486 size_t ntile = std::min(mo_bra.size(), min_tile);
496 auto fetch_data = [&](
World& world,
const Tile& tile) {
498 "bra and ket size mismatch in Exchange::MacroTaskExchangeRow::execute()");
500 std::size_t sz=tile.iend-tile.ilo;
503 for (
size_t i=tile.ilo; i<tile.iend; ++i) {
504 auto f=
copy(world,mo_bra[i],
false);
505 subworld_bra[i-tile.ilo]=
f;
506 subworld_ket.push_back(
copy(world, mo_ket[i],
false));
508 return std::make_pair(subworld_bra,subworld_ket);
514 "bra and ket size mismatch in Exchange::MacroTaskExchangeRow::execute()");
516 auto world_id=world.
id();
517 auto phi_id=phi.world().id();
518 auto bra_id=mo_bra.front().world().id();
519 auto ket_id=mo_ket.front().world().id();
520 std::string msg=
"world mismatch in Exchange::MacroTaskExchangeRow::execute(): ";
521 msg+=
"world.id()="+std::to_string(world_id)+
", ";
522 msg+=
"phi.world().id()="+std::to_string(phi_id)+
", ";
523 msg+=
"bra.world().id()="+std::to_string(bra_id)+
", ";
524 msg+=
"ket.world().id()="+std::to_string(ket_id);
525 if (not (world_id==phi_id && world_id==bra_id && world_id==ket_id)) {
531 auto tmp_psif =
mul_sparse(world, phi, mo_bra, mul_tol);
534 mul1_timer += long((cpu1 - cpu0) * 1000l);
537 tmp_psif =
apply(world, *poisson.get(), tmp_psif);
540 apply_timer += long((cpu1 - cpu0) * 1000l);
543 auto tmp_Kf =
dot(world, mo_ket, tmp_psif);
545 mul2_timer += long((cpu1 - cpu0) * 1000l);
547 return tmp_Kf.truncate();
551 std::vector<Tile> tiles;
552 for (
size_t ilo=0; ilo<mo_bra.size(); ilo+=ntile) {
553 tiles.push_back(Tile{ilo,std::min(ilo+ntile,mo_bra.size())});
559 for (
size_t itile=0; itile<tiles.size(); ++itile) {
560 Tile& tile = tiles[itile];
564 print(
"fetching tile",tile.ilo,
"into world",executing_world->id());
565 std::tie(tmp_mo_bra1,tmp_mo_ket1)=fetch_data(*executing_world,tiles[itile]);
566 fetching_world->gop.set_forbid_fence(
false);
568 executing_world->gop.fence();
570 total_fetch_time += (t1 - t0);
571 total_fetch_spawn_time += (t2 - t0);
576 fetching_world->gop.set_forbid_fence(
true);
577 if (itile<tiles.size()-1) {
579 print(
"fetching tile",tiles[itile+1].ilo,
"into world",fetching_world->id(),
" at time ",
wall_time());
580 std::tie(tmp_mo_bra2,tmp_mo_ket2)=fetch_data(*fetching_world,tiles[itile+1]);
582 fetching_world->gop.set_forbid_fence(
false);
587 total_fetch_time += (t1 - t0);
588 total_fetch_spawn_time += (t2 - t0);
590 print(
"executing tile",tile.ilo,
"in world",executing_world->id());
592 Kf[0]+=execute(*executing_world,poisson1,phi1,tmp_mo_bra1,tmp_mo_ket1);
594 print(
"time to execute tile",tile.ilo,
"in world",executing_world->id(),dpu1-dpu0,
"seconds");
595 total_execution_time += dpu1-dpu0;
597 fetching_world->gop.fence();
600 std::swap(poisson1,poisson2);
601 std::swap(phi1,phi2);
602 std::swap(tmp_mo_bra2,tmp_mo_bra1);
603 std::swap(tmp_mo_ket2,tmp_mo_ket1);
604 std::swap(executing_world,fetching_world);
610 fetching_world->gop.fence();
611 executing_world->gop.fence();
613 print(
"overall time: ",cpu2-cpu0,
"seconds");
614 print(
"total execution time:",total_execution_time,
"seconds");
615 print(
"total fetch time:",total_fetch_time,
"seconds");
616 print(
"total fetch spawn time:",total_fetch_spawn_time,
"seconds");
Operators for the molecular HF and DFT code.
Wrapper around MPI_Comm. Has a shallow copy constructor; use Create(Get_group()) for deep copy.
Definition safempi.h:497
Intracomm Clone() const
Definition safempi.h:696
a batch consists of a 2D-input batch and a 1D-output batch: K-batch <- (I-batch, J-batch)
Definition macrotaskpartitioner.h:124
std::vector< Batch_1D > input
Definition macrotaskpartitioner.h:127
custom partitioning for the exchange operator in exchangeoperator.h
Definition exchangeoperator.h:357
MacroTaskPartitionerRow()
Definition exchangeoperator.h:359
Definition exchangeoperator.h:348
resultT allocator(World &world, const argtupleT &argtuple) const
Definition exchangeoperator.h:380
std::vector< Function< T, NDIM > > row_fetch_compute(const std::vector< Function< T, NDIM > > &vket, const std::vector< Function< T, NDIM > > &mo_bra, const std::vector< Function< T, NDIM > > &mo_ket)
Definition exchangeoperator.h:454
std::vector< Function< T, NDIM > > operator()(const std::vector< Function< T, NDIM > > &vket, const std::vector< Function< T, NDIM > > &mo_bra, const std::vector< Function< T, NDIM > > &mo_ket)
compute exchange row-wise for a fixed orbital phi_i of vket
Definition exchangeoperator.h:391
long nresult
Definition exchangeoperator.h:350
std::tuple< const std::vector< Function< T, NDIM > > &, const std::vector< Function< T, NDIM > > &, const std::vector< Function< T, NDIM > > & > argtupleT
Definition exchangeoperator.h:374
std::vector< Function< T, NDIM > > row(const std::vector< Function< T, NDIM > > &vket, const std::vector< Function< T, NDIM > > &mo_bra, const std::vector< Function< T, NDIM > > &mo_ket)
Definition exchangeoperator.h:406
std::vector< Function< T, NDIM > > resultT
Definition exchangeoperator.h:376
Algorithm algorithm_
Definition exchangeoperator.h:354
MacroTaskExchangeRow(const long nresult, const double lo, const double mul_tol, const Algorithm algorithm)
Definition exchangeoperator.h:365
custom partitioning for the exchange operator in exchangeoperator.h
Definition exchangeoperator.h:189
double compute_priority(const Batch &batch) const override
compute the priority of this task for non-dumb scheduling
Definition exchangeoperator.h:224
MacroTaskPartitionerExchange(const bool symmetric)
Definition exchangeoperator.h:191
partitionT do_partitioning(const std::size_t &vsize1, const std::size_t &vsize2, const std::string policy) const override
override this if you want your own partitioning
Definition exchangeoperator.h:197
Definition exchangeoperator.h:178
vecfuncT compute_diagonal_batch_in_symmetric_matrix(World &subworld, const vecfuncT &ket_batch, const vecfuncT &bra_batch, const vecfuncT &vf_batch) const
compute a batch of the exchange matrix, with identical ranges, exploiting the matrix symmetry
Definition exchangeoperator.h:304
MacroTaskExchangeSimple(const long nresult, const double lo, const double mul_tol, const bool symmetric)
Definition exchangeoperator.h:233
long nresult
Definition exchangeoperator.h:180
std::vector< Function< T, NDIM > > resultT
Definition exchangeoperator.h:244
vecfuncT compute_batch_in_asymmetric_matrix(World &subworld, const vecfuncT &ket_batch, const vecfuncT &bra_batch, const vecfuncT &vf_batch) const
compute a batch of the exchange matrix, with non-identical ranges
Definition exchangeoperator.h:323
std::vector< Function< T, NDIM > > operator()(const std::vector< Function< T, NDIM > > &vf_batch, const std::vector< Function< T, NDIM > > &bra_batch, const std::vector< Function< T, NDIM > > &vket)
Definition exchangeoperator.h:255
std::tuple< const std::vector< Function< T, NDIM > > &, const std::vector< Function< T, NDIM > > &, const std::vector< Function< T, NDIM > > & > argtupleT
Definition exchangeoperator.h:242
resultT allocator(World &world, const argtupleT &argtuple) const
Definition exchangeoperator.h:248
Definition exchangeoperator.h:17
static std::atomic< long > mul1_timer
timing
Definition exchangeoperator.h:23
Exchange< T, NDIM >::ExchangeAlgorithm Algorithm
Definition exchangeoperator.h:60
bool printtimings() const
Definition exchangeoperator.h:164
ExchangeImpl & symmetric(const bool flag)
Definition exchangeoperator.h:102
ExchangeImpl & set_printlevel(const long &level)
Definition exchangeoperator.h:117
nlohmann::json statistics
statistics of the Cloud (timings, memory) and of the parameters of this run
Definition exchangeoperator.h:176
static double elapsed_time
Definition exchangeoperator.h:24
void print_timer(World &world) const
Definition exchangeoperator.h:49
World & get_world() const
Definition exchangeoperator.h:124
nlohmann::json get_statistics() const
Definition exchangeoperator.h:126
ExchangeImpl & set_macro_task_info(const MacroTaskInfo &info)
Definition exchangeoperator.h:107
static void reset_timer()
Definition exchangeoperator.h:26
vecfuncT mo_bra
is the exchange matrix symmetric? K phi_i = \sum_k \phi_k \int \phi_k \phi_i
Definition exchangeoperator.h:170
World & world
Definition exchangeoperator.h:167
std::shared_ptr< MacroTaskQ > taskq
Definition exchangeoperator.h:168
Function< T, NDIM > functionT
Definition exchangeoperator.h:18
nlohmann::json gather_statistics() const
return some statistics about the current settings
Definition exchangeoperator.h:129
bool is_symmetric() const
Definition exchangeoperator.h:95
ExchangeImpl & set_taskq(std::shared_ptr< MacroTaskQ > taskq1)
Definition exchangeoperator.h:97
std::vector< functionT > vecfuncT
Definition exchangeoperator.h:19
ExchangeImpl & set_algorithm(const Algorithm &alg)
Definition exchangeoperator.h:112
bool printprogress() const
Definition exchangeoperator.h:163
static std::atomic< long > apply_timer
Definition exchangeoperator.h:21
ExchangeImpl(World &world, const double lo, const double thresh)
default ctor
Definition exchangeoperator.h:65
std::string info() const
Definition exchangeoperator.h:82
nlohmann::json gather_timings(World &world) const
Definition exchangeoperator.h:34
bool printtimings_detail() const
Definition exchangeoperator.h:165
bool printdebug() const
Definition exchangeoperator.h:162
std::shared_ptr< MacroTaskQ > get_taskq() const
Definition exchangeoperator.h:122
static auto set_poisson(World &world, const double lo, const double econv=FunctionDefaults< 3 >::get_thresh())
Definition exchangeoperator.h:84
static std::atomic< long > mul2_timer
Definition exchangeoperator.h:22
void set_bra_and_ket(const vecfuncT &bra, const vecfuncT &ket)
set the bra and ket orbital spaces, and the occupation
Definition exchangeoperator.h:77
Definition SCFOperators.h:105
MacroTaskInfo macro_task_info
Definition SCFOperators.h:165
static std::string to_string(const ExchangeAlgorithm alg)
Definition SCFOperators.h:144
ExchangeAlgorithm
Definition SCFOperators.h:117
@ multiworld_efficient_row
Definition SCFOperators.h:118
Function< T, NDIM > operator()(const Function< T, NDIM > &ket) const
Definition SCFOperators.h:195
std::vector< functionT > vecfuncT
Definition SCFOperators.h:111
std::string info() const
print some information about this operator
Definition SCFOperators.h:175
FunctionDefaults holds default parameters as static class members.
Definition funcdefaults.h:100
static const double & get_thresh()
Returns the default threshold.
Definition funcdefaults.h:177
A multiresolution adaptive numerical function.
Definition mra.h:139
Definition macrotaskq.h:1240
partition one (two) vectors into 1D (2D) batches.
Definition macrotaskpartitioner.h:182
std::list< std::pair< Batch, double > > partitionT
Definition macrotaskpartitioner.h:186
The Nemo class.
Definition nemo.h:326
nlohmann::json statistics
Definition SCFOperators.h:64
std::shared_ptr< MacroTaskQ > taskq
Definition SCFOperators.h:71
void sum(T *buf, size_t nelem)
Inplace global sum while still processing AM & tasks.
Definition worldgop.h:872
A parallel world class.
Definition world.h:132
ProcessID rank() const
Returns the process rank in this World (same as MPI_Comm_rank()).
Definition world.h:320
ProcessID size() const
Returns the number of processes in this World (same as MPI_Comm_size()).
Definition world.h:330
unsigned long id() const
Definition world.h:315
WorldGopInterface & gop
Global operations.
Definition world.h:207
Declares the Cloud class for storing data and transferring them between worlds.
double(* f)(const coord_3d &)
Definition derivatives.cc:54
static double lo
Definition dirac-hatom.cc:23
std::vector< Spinor > truncate(std::vector< Spinor > arg)
Definition dirac-hatom.cc:503
Fcwf apply(World &world, real_convolution_3d &op, const Fcwf &psi)
Definition fcwf.cc:281
Fcwf copy(Fcwf psi)
Definition fcwf.cc:338
auto T(World &world, response_space &f) -> response_space
Definition global_functions.cc:28
Declares the macrotaskq and MacroTaskBase classes.
General header file for using MADNESS.
#define MADNESS_CHECK(condition)
Check a condition — even in a release build the condition is always evaluated so it can have side effects.
Definition madness_exception.h:182
#define MADNESS_EXCEPTION(msg, value)
Macro for throwing a MADNESS exception.
Definition madness_exception.h:119
#define MADNESS_CHECK_THROW(condition, msg)
Check a condition — even in a release build the condition is always evaluated so it can have side effects.
Definition madness_exception.h:207
void print(const tensorT &t)
Definition mcpfit.cc:140
Namespace for all elements and tools of MADNESS.
Definition DFParameters.h:10
static SeparatedConvolution< double, 3 > * CoulombOperatorPtr(World &world, double lo, double eps, const array_of_bools< 3 > &lattice_sum=FunctionDefaults< 3 >::get_bc().is_periodic(), int k=FunctionDefaults< 3 >::get_k())
Factory function generating separated kernel for convolution with 1/r in 3D.
Definition operator.h:1818
Function< TENSOR_RESULT_TYPE(L, R), NDIM > mul_sparse(const Function< L, NDIM > &left, const Function< R, NDIM > &right, double tol, bool fence=true)
Sparse multiplication — left and right must be reconstructed and if tol!=0 have tree of norms already...
Definition mra.h:1836
Function< TENSOR_RESULT_TYPE(T, R), NDIM > dot(World &world, const std::vector< Function< T, NDIM > > &a, const std::vector< Function< R, NDIM > > &b, bool fence=true)
Multiplies and sums two vectors of functions r = \sum_i a[i] * b[i].
Definition vmra.h:1565
double wall_time()
Returns the wall time in seconds relative to an arbitrary origin.
Definition timers.cc:48
std::string name(const FuncType &type, const int ex=-1)
Definition ccpairfunction.h:28
Function< T, NDIM > copy(const Function< T, NDIM > &f, const std::shared_ptr< WorldDCPmapInterface< Key< NDIM > > > &pmap, bool fence=true)
Create a new copy of the function with different distribution and optional fence.
Definition mra.h:2096
static const double thresh
Definition rk.cc:45
Definition macrotaskq.h:280
static MacroTaskInfo preset(const std::string name)
Definition macrotaskq.h:313
nlohmann::json to_json() const
Definition macrotaskq.h:400
class to temporarily redirect output to cout
Definition print.h:277
double cpu_time()
Definition test_list.cc:43
constexpr std::size_t NDIM
Definition testgconv.cc:54