14 #ifdef DASH_ENABLE_IPM 21 using std::setprecision;
/**
 * Allocates \c size bytes whose start address is aligned to \c alignment,
 * using posix_memalign.
 *
 * \param size       Number of bytes to allocate.
 * \param alignment  Requested alignment in bytes. posix_memalign requires a
 *                   power of two that is a multiple of sizeof(void *).
 * \return  Pointer to the aligned buffer (to be released with free()), or
 *          nullptr if posix_memalign reported an error.
 */
static void *_aligned_malloc(
  size_t size,
  size_t alignment) {
  void * buffer = nullptr;
  // The output pointer is not guaranteed to be set when posix_memalign
  // fails, so the status code decides whether the buffer is usable.
  if (posix_memalign(&buffer, alignment, size) != 0) {
    return nullptr;
  }
  return buffer;
}
29 typedef double ElementType;
30 typedef int64_t index_t;
34 dash::CSRPattern<1, dash::ROW_MAJOR, index_t>
37 dash::util::TimeMeasure::Clock
40 #ifndef DASH__ALGORITHM__COPY__USE_WAIT 41 const std::string dash_async_copy_variant =
"flush";
43 const std::string dash_async_copy_variant =
"wait";
52 size_t num_iterations;
61 typedef enum local_copy_method_t {
70 double time_copy_min_us;
71 double time_copy_max_us;
72 double time_copy_med_us;
73 double time_copy_sdv_us;
82 index_t source_unit_id,
83 index_t target_unit_id,
86 local_copy_method l_copy_method = DASH_COPY);
88 void print_measurement_header();
89 void print_measurement_record(
90 const std::string & scenario,
91 const std::string & local_copy_method,
// Benchmark driver (only part of the function is visible here). The visible
// lines gather locality figures, print the benchmark configuration and then
// run four measurement series labelled "local"/"std::copy",
// "local"/"dash::copy", "socket.b"/"dash::copy" and "rmt.async"/"dash::copy",
// printing one measurement record per iteration.
108 int main(
int argc,
char** argv)
// When built with DASH_ENABLE_IPM, MPI_Pcontrol is called with "off" and
// "clear" before any measurement is taken.
111 #ifdef DASH_ENABLE_IPM 112 MPI_Pcontrol(0,
"off");
113 MPI_Pcontrol(0,
"clear");
123 auto ts_start = Timer::Now();
// Locality figures used below to derive target and init unit ids.
124 size_t num_numa_nodes = uloc.num_numa();
125 size_t num_local_cores = uloc.node_domain().num_cores();
// NOTE(review): divides by num_numa_nodes without a zero check -- confirm
// that uloc.num_numa() cannot return 0.
127 size_t numa_node_cores = num_local_cores / num_numa_nodes;
// NOTE(review): the factor 2 appears to assume two NUMA domains per socket
// -- confirm for the target systems.
129 size_t socket_cores = numa_node_cores * 2;
131 size_t num_nodes = dash::util::Locality::NumNodes();
134 bench_params.print_header();
135 bench_params.print_pinning();
// Working copies of the parsed runtime parameters.
138 size_t num_iterations = params.num_iterations;
139 size_t num_repeats = params.num_repeats;
140 size_t size_inc = params.size_min;
142 auto bench_cfg = bench_params.config();
144 print_params(bench_params, params);
146 print_measurement_header();
// Series "local" / "std::copy". In every series the repeat count starts at
// params.num_repeats, is divided by params.rep_base after each iteration and
// is raised to at least params.min_repeats inside the loop body.
158 num_repeats = params.num_repeats;
159 for (
size_t i = 0; i < num_iterations && num_repeats > 0;
160 ++i, num_repeats /= params.rep_base)
// Block size is size_inc scaled by params.size_base to the power of the
// iteration index; std::pow makes this a floating point value.
162 auto block_size = std::pow(params.size_base,i) * size_inc;
165 num_repeats = std::max<size_t>(num_repeats, params.min_repeats);
// The init unit lies num_local_cores unit ids after the target unit,
// wrapping around at the number of units.
169 u_init = (u_dst + num_local_cores) % dash::size();
170 ts_start = Timer::Now();
171 res = copy_block_to_local(
size, i, num_repeats, u_src, u_dst, u_init,
// Elapsed time scaled by 1e-6 (presumably microseconds to seconds -- confirm
// against the Timer unit).
173 time_s = Timer::ElapsedSince(ts_start) * 1.0e-06;
174 print_measurement_record(
"local",
"std::copy", bench_cfg,
175 u_src, u_dst, u_init,
size, num_repeats,
176 time_s, res, params);
// Series "local" / "dash::copy": same loop shape as the series above.
181 num_repeats = params.num_repeats;
182 for (
size_t i = 0; i < num_iterations && num_repeats > 0;
183 ++i, num_repeats /= params.rep_base)
185 auto block_size = std::pow(params.size_base,i) * size_inc;
188 num_repeats = std::max<size_t>(num_repeats, params.min_repeats);
192 u_init = (u_dst + num_local_cores) % dash::size();
193 ts_start = Timer::Now();
194 res = copy_block_to_local(
size, i, num_repeats, u_src, u_dst, u_init,
196 time_s = Timer::ElapsedSince(ts_start) * 1.0e-06;
197 print_measurement_record(
"local",
"dash::copy", bench_cfg,
198 u_src, u_dst, u_init,
size, num_repeats,
199 time_s, res, params);
// Series "socket.b" / "dash::copy": the target unit lies numa_node_cores
// unit ids after the source unit.
204 num_repeats = params.num_repeats;
205 for (
size_t i = 0; i < num_iterations && num_repeats > 0;
206 ++i, num_repeats /= params.rep_base)
208 auto block_size = std::pow(params.size_base,i) * size_inc;
211 num_repeats = std::max<size_t>(num_repeats, params.min_repeats);
214 u_dst = (u_src + numa_node_cores) % dash::size();
215 u_init = (u_dst + num_local_cores) % dash::size();
216 ts_start = Timer::Now();
217 res = copy_block_to_local(
size, i, num_repeats, u_src, u_dst, u_init,
219 time_s = Timer::ElapsedSince(ts_start) * 1.0e-06;
220 print_measurement_record(
"socket.b",
"dash::copy", bench_cfg,
221 u_src, u_dst, u_init,
size, num_repeats,
222 time_s, res, params);
// Series "rmt.async" / "dash::copy": the target unit lies num_local_cores
// unit ids after the source unit, the init unit numa_node_cores after it;
// the copy method passed is DASH_COPY_ASYNC.
227 num_repeats = params.num_repeats;
228 for (
size_t i = 0; i < num_iterations && num_repeats > 0;
229 ++i, num_repeats /= params.rep_base)
231 auto block_size = std::pow(params.size_base,i) * size_inc;
234 num_repeats = std::max<size_t>(num_repeats, params.min_repeats);
237 u_dst = (u_src + num_local_cores) % dash::size();
238 u_init = (u_src + numa_node_cores) % dash::size();
239 ts_start = Timer::Now();
240 res = copy_block_to_local(
size, i, num_repeats, u_src, u_dst, u_init,
241 params, DASH_COPY_ASYNC);
242 time_s = Timer::ElapsedSince(ts_start) * 1.0e-06;
243 print_measurement_record(
"rmt.async",
"dash::copy", bench_cfg,
244 u_src, u_dst, u_init,
size, num_repeats,
245 time_s, res, params);
250 cout <<
"Benchmark finished" << endl;
// Measures copying one block of a BLOCKED-distributed global array into a
// buffer allocated on the target unit (only part of the function is visible
// here). The visible lines repeat the copy num_repeats times, record the
// per-repetition copy time and fill a `measurement` with totals, min, max,
// median, standard deviation and throughput.
//
// source_unit_id  Used as the index of the block to copy; must also be the
//                 unit the pattern maps that block to (checked below).
// target_unit_id  Unit that allocates the local buffer and times the copy.
// init_unit_id    Unit that writes block values when params.flush_cache is
//                 set.
// l_copy_method   Selects the copy variant; the visible branches test
//                 STD_COPY, MEMCPY and DASH_COPY_ASYNC.
257 measurement copy_block_to_local(
261 index_t source_unit_id,
262 index_t target_unit_id,
263 index_t init_unit_id,
265 local_copy_method l_copy_method)
268 pattern_t pattern(
size, dash::BLOCKED);
271 result.time_init_s = 0;
272 result.time_copy_s = 0;
273 result.time_copy_min_us = 0;
274 result.time_copy_max_us = 0;
// The copied range is exactly the block whose index equals the source
// unit id.
282 index_t block_index = source_unit_id;
283 auto source_block = pattern.block(block_index);
284 size_t block_size = source_block.size();
285 size_t block_bytes = block_size *
sizeof(ElementType);
286 index_t copy_start_idx = source_block.offset(0);
287 index_t copy_end_idx = copy_start_idx + block_size;
288 auto block_unit_id = pattern.unit_at(copy_start_idx);
290 size_t cache_size = uloc.cache_line_size(2);
292 size_t align_size = 128;
// Rounds cache_size up to a multiple of align_size (the branch for an
// already aligned size is not visible here).
294 cache_size = (cache_size % align_size == 0)
296 : ((cache_size / align_size) + 1) * align_size;
311 DASH_LOG_DEBUG(
"copy_block_to_local()",
313 "block index:", block_index,
314 "block size:", block_size,
315 "copy index range:", copy_start_idx,
"-", copy_end_idx);
// Sanity check: the first element of the block must be owned by the source
// unit.
317 if (source_unit_id != block_unit_id) {
319 "copy_block_to_local: Invalid distribution of global array");
// Only the target unit gets a destination buffer; it stays nullptr on all
// other units.
// NOTE(review): the result of _aligned_malloc is not checked in the visible
// lines.
323 ElementType * local_array =
nullptr;
324 if (
myid == target_unit_id) {
325 local_array =
static_cast<ElementType *
>(
326 _aligned_malloc(block_bytes, align_size));
332 std::srand(time(NULL));
334 double total_copy_us = 0;
335 double total_init_us = 0;
// One entry per repetition; used for min/max/median/standard deviation.
336 std::vector<double> history_copy_us;
338 for (
size_t r = 0; r < num_repeats; ++r) {
340 Timer::timestamp_t ts_init_start = Timer::Now();
343 auto src_g_begin = global_array.
begin() + copy_start_idx;
345 auto src_g_end = global_array.
begin() + copy_end_idx;
347 ElementType * src_l_begin = src_g_begin.
local();
// NOTE(review): in the loop below `std::rand() / RAND_MAX` divides two ints,
// so the term is 0 unless rand() returns exactly RAND_MAX; a fractional
// random offset would need a floating point division.
351 for (
size_t l = 0; l < block_size; ++l) {
353 + (std::rand() / RAND_MAX);
// With params.flush_cache set, the init unit builds a temporary array of
// block values (the use of that array is only partly visible here).
357 if (params.flush_cache &&
myid == init_unit_id) {
360 ElementType * block_values =
new ElementType[block_size];
361 for (
size_t p = 0; p < block_size; ++p) {
362 block_values[p] = ((
myid+1) * 100000)
367 block_values + block_size,
370 delete[] block_values;
// With DASH_ENABLE_IPM, MPI_Pcontrol is given "on" before the timed copy and
// "off" after validation.
376 #ifdef DASH_ENABLE_IPM 377 MPI_Pcontrol(0,
"on");
380 if(
myid == target_unit_id) {
381 total_init_us += Timer::ElapsedSince(ts_init_start);
382 ElementType * copy_lend =
nullptr;
// Timed section: only the selected copy variant runs between the two
// timestamps.
384 auto ts_copy_start = Timer::Now();
385 if (l_copy_method == STD_COPY) {
387 src_l_begin + block_size,
389 }
else if (l_copy_method == MEMCPY) {
390 copy_lend = local_array + block_size;
391 std::memcpy(local_array, src_l_begin, block_size *
sizeof(ElementType));
392 }
else if (l_copy_method == DASH_COPY_ASYNC) {
401 auto copy_us = Timer::ElapsedSince(ts_copy_start);
402 total_copy_us += copy_us;
403 history_copy_us.push_back(copy_us);
// Validation: the copy must end exactly one block past the buffer start and
// every element must equal the corresponding global array element.
408 if (copy_lend != local_array + block_size) {
410 "copy_block_to_local: " <<
411 "Unexpected end of copy output range " <<
412 "expected: " << local_array + block_size <<
" " <<
413 "actual: " << copy_lend);
416 for (
size_t l = 0; l < block_size; ++l) {
417 ElementType expected = global_array[copy_start_idx + l];
418 ElementType actual = local_array[l];
419 if (actual != expected) {
421 "copy_block_to_local: Validation failed " <<
422 "for copied element at offset " << l <<
" " <<
423 "in repetition " << r <<
": " <<
424 "expected: " << expected <<
" " <<
425 "actual: " << actual);
431 #ifdef DASH_ENABLE_IPM 432 MPI_Pcontrol(0,
"off");
440 if (local_array !=
nullptr) {
// The target unit publishes its statistics through the time_* objects via
// set(); they are read back with get() further down.
// NOTE(review): history_copy_us is indexed and front()/back() are called
// without an emptiness check; it is empty when num_repeats is 0.
444 if(
myid == target_unit_id) {
445 time_copy_us.
set(total_copy_us);
446 time_init_us.
set(total_init_us);
// Median is the element at index size()/2 of the sorted history (the upper
// of the two middle values for an even count).
448 std::sort(history_copy_us.begin(), history_copy_us.end());
449 time_copy_med_us.
set(history_copy_us[history_copy_us.size() / 2]);
450 time_copy_sdv_us.
set(dash::math::sigma(history_copy_us.begin(),
451 history_copy_us.end()));
452 time_copy_min_us.
set(history_copy_us.front());
453 time_copy_max_us.
set(history_copy_us.back());
457 global_array.deallocate();
460 "copy_block_to_local",
461 "Waiting for completion of copy operation");
464 double mb_copied =
static_cast<double>(block_bytes * num_repeats)
// Totals are scaled by 1e-6 into the *_s fields; min/max/median/sdv are
// stored unscaled in the *_us fields.
467 result.time_init_s = time_init_us.
get() * 1.0e-6;
468 result.time_copy_s = time_copy_us.
get() * 1.0e-6;
469 result.time_copy_min_us = time_copy_min_us.
get();
470 result.time_copy_max_us = time_copy_max_us.
get();
471 result.time_copy_med_us = time_copy_med_us.
get();
472 result.time_copy_sdv_us = time_copy_sdv_us.
get();
// NOTE(review): yields inf or NaN when time_copy_s is 0.
473 result.mb_per_s = mb_copied / result.time_copy_s;
// Prints the comma-separated column titles for the measurement table. Each
// title is written with a fixed std::setw width; the visible columns run
// from "units" to "mb/s".
478 void print_measurement_header()
482 << std::setw(5) <<
"units" <<
"," 483 << std::setw(9) <<
"mpi.impl" <<
"," 484 << std::setw(10) <<
"scenario" <<
"," 485 << std::setw(12) <<
"copy.type" <<
"," 486 << std::setw(7) <<
"src.u" <<
"," 487 << std::setw(7) <<
"dest.u" <<
"," 488 << std::setw(7) <<
"init.u" <<
"," 489 << std::setw(8) <<
"repeats" <<
"," 490 << std::setw(9) <<
"block.n" <<
"," 491 << std::setw(9) <<
"block.kb" <<
"," 492 << std::setw(7) <<
"init.s" <<
"," 493 << std::setw(8) <<
"copy.s" <<
"," 494 << std::setw(12) <<
"copy.min.us" <<
"," 495 << std::setw(12) <<
"copy.med.us" <<
"," 496 << std::setw(12) <<
"copy.max.us" <<
"," 497 << std::setw(12) <<
"copy.sdv.us" <<
"," 498 << std::setw(7) <<
"mb/s" 504 void print_measurement_record(
505 const std::string & scenario,
506 const std::string & local_copy_method,
514 measurement measurement,
// Body of print_measurement_record (parameter list only partly visible):
// prints one comma-separated row for a single measurement.
518 std::string mpi_impl = dash__toxstr(DASH_MPI_IMPL_ID);
// Sizes in units of 1024 bytes; integer division truncates.
520 size_t g_size_kb = (
size *
sizeof(ElementType)) / 1024;
521 size_t block_kb = (block_n *
sizeof(ElementType)) / 1024;
// Local copies of the measurement fields, in output order.
522 double mbps = measurement.mb_per_s;
523 double init_s = measurement.time_init_s;
524 double copy_s = measurement.time_copy_s;
525 double copy_min_us = measurement.time_copy_min_us;
526 double copy_max_us = measurement.time_copy_max_us;
527 double copy_med_us = measurement.time_copy_med_us;
528 double copy_sdv_us = measurement.time_copy_sdv_us;
// Field widths and column order match the titles written by
// print_measurement_header(); floating point values use fixed notation with
// 2 decimals, except copy_s which uses 5.
531 << std::setw(9) << mpi_impl <<
"," 532 << std::setw(10) << scenario <<
"," 533 << std::setw(12) << local_copy_method <<
"," 534 << std::setw(7) << unit_src <<
"," 535 << std::setw(7) << unit_dest <<
"," 536 << std::setw(7) << unit_init <<
"," 537 << std::setw(8) << num_repeats <<
"," 538 << std::setw(9) << block_n <<
"," 539 << std::setw(9) << block_kb <<
"," 540 << std::fixed << setprecision(2) << setw(7) << init_s <<
"," 541 << std::fixed << setprecision(5) << setw(8) << copy_s <<
"," 542 << std::fixed << setprecision(2) << setw(12) << copy_min_us <<
"," 543 << std::fixed << setprecision(2) << setw(12) << copy_med_us <<
"," 544 << std::fixed << setprecision(2) << setw(12) << copy_max_us <<
"," 545 << std::fixed << setprecision(2) << setw(12) << copy_sdv_us <<
"," 546 << std::fixed << setprecision(2) << setw(7) << secs <<
"," 547 << std::fixed << setprecision(2) << setw(9) << mbps
// Command line handling (the enclosing function header is not visible here).
// Defaults are assigned first and then overridden by recognised flags.
555 params.size_base = 4;
556 params.num_iterations = 8;
// rep_base takes the default size_base set just above; a later size_base
// override does not change it.
557 params.rep_base = params.size_base;
// 0 marks "not given"; it is replaced by a derived value after the flag
// loop.
558 params.num_repeats = 0;
559 params.min_repeats = 1;
560 params.verify =
false;
561 params.local_only =
false;
562 params.flush_cache =
false;
563 params.size_min = 64;
// Arguments are visited with a stride of 2, treating argv[i] as the flag and
// argv[i+1] as its value.
// NOTE(review): argv[i+1] is read without checking i + 1 < argc, so a
// trailing value-taking flag hands argv[argc] (a null pointer) to atoi.
// NOTE(review): atoi reports no errors and yields 0 for non-numeric text; a
// rep_base of 0 would make `num_repeats /= params.rep_base` in main divide
// by zero.
565 for (
auto i = 1; i < argc; i += 2) {
566 std::string flag = argv[i];
568 params.size_base = atoi(argv[i+1]);
569 }
else if (flag ==
"-smin") {
570 params.size_min = atoi(argv[i+1]);
571 }
else if (flag ==
"-i") {
572 params.num_iterations = atoi(argv[i+1]);
573 }
else if (flag ==
"-rmax") {
574 params.num_repeats = atoi(argv[i+1]);
575 }
else if (flag ==
"-rmin") {
576 params.min_repeats = atoi(argv[i+1]);
577 }
else if (flag ==
"-rb") {
578 params.rep_base = atoi(argv[i+1]);
579 }
else if (flag ==
"-verify") {
// The three flags below are switches without a value.
// NOTE(review): with the stride of 2 the argument after a switch is skipped
// unless the index is corrected in lines not visible here -- confirm.
580 params.verify =
true;
582 }
else if (flag ==
"-lo") {
583 params.local_only =
true;
585 }
else if (flag ==
"-fcache") {
586 params.flush_cache =
true;
// No explicit repeat count given: use 8 * rep_base ^ num_iterations. The
// std::pow result is floating point and is converted on assignment.
590 if (params.num_repeats == 0) {
591 params.num_repeats = 8 * std::pow(params.rep_base, params.num_iterations);
// Prints the "Runtime arguments" section: one print_param call per command
// line flag, giving the flag, a description and the value held in params
// (the enclosing function header is not visible here).
604 bench_cfg.print_section_start(
"Runtime arguments");
605 bench_cfg.print_param(
"-smin",
"initial block size", params.size_min);
606 bench_cfg.print_param(
"-sb",
"block size base", params.size_base);
607 bench_cfg.print_param(
"-rmax",
"initial repeats", params.num_repeats);
// NOTE(review): the label "min, repeats" looks like a typo for
// "min. repeats" (compare "rep. base" below).
608 bench_cfg.print_param(
"-rmin",
"min, repeats", params.min_repeats);
609 bench_cfg.print_param(
"-rb",
"rep. base", params.rep_base);
610 bench_cfg.print_param(
"-i",
"iterations", params.num_iterations);
611 bench_cfg.print_param(
"-verify",
"verification", params.verify);
612 bench_cfg.print_param(
"-lo",
"local only", params.local_only);
613 bench_cfg.print_param(
"-fcache",
"no copying from cache", params.flush_cache);
614 bench_cfg.print_section_end();
global_unit_t myid()
Shortcut to query the global unit ID of the calling unit.
size_t size()
Return the number of units in the global team.
dash::Future< ValueType * > copy_async(InputIt in_first, InputIt in_last, OutputIt out_first)
Asynchronous variant of dash::copy.
int32_t dart_unit_t
Data type for storing a unit ID.
DASH_CONSTEXPR local_type local() const
Convert global iterator to native pointer.
void finalize()
Finalize the DASH library and the underlying runtime system.
reference get()
Get a reference on the shared value.
void set(const value_type &val)
Set the value of the shared element.
void sort(GlobRandomIt begin, GlobRandomIt end)
Sorts the elements in the range, defined by [begin, end) in ascending order.
OutputIt copy(InputIt in_first, InputIt in_last, OutputIt out_first)
Copies the elements in the range, defined by [in_first, in_last), to another range beginning at out_first.
Shared access to a value in global memory across a team.
void barrier()
A global barrier involving all units.
void init(int *argc, char ***argv)
Initialize the DASH library and the underlying runtime system.
iterator begin() noexcept
Global pointer to the beginning of the array.
bool allocate(size_type nelem, dash::DistributionSpec< 1 > distribution, dash::Team &team=dash::Team::All())
Delayed allocation of global memory using a one-dimensional distribution spec.
local_type local
Local proxy object, allows use in range-based for loops.
Wrapper of a single dart_unit_locality_t object.