1 | /* |
2 | Copyright (c) 2005-2019 Intel Corporation |
3 | |
4 | Licensed under the Apache License, Version 2.0 (the "License"); |
5 | you may not use this file except in compliance with the License. |
6 | You may obtain a copy of the License at |
7 | |
8 | http://www.apache.org/licenses/LICENSE-2.0 |
9 | |
10 | Unless required by applicable law or agreed to in writing, software |
11 | distributed under the License is distributed on an "AS IS" BASIS, |
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | See the License for the specific language governing permissions and |
14 | limitations under the License. |
15 | */ |
16 | |
17 | /* Common part for the partitioner whitebox tests */ |
18 | |
19 | #include <typeinfo> |
20 | |
21 | #include "tbb/tbb_thread.h" |
22 | #include "tbb/enumerable_thread_specific.h" |
23 | |
24 | #include "string.h" |
25 | #include "harness_assert.h" |
26 | #include "test_partitioner.h" |
#include <numeric>   // std::accumulate
#include <algorithm> // std::min, std::count
#include <vector>    // std::vector
28 | |
29 | #if TBB_USE_DEBUG |
// Reduce the number of simulated threads in debug builds to avoid test timeouts
31 | const size_t max_simulated_threads = 256; |
32 | #else |
33 | const size_t max_simulated_threads = 640; |
34 | #endif |
35 | |
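// Per-native-thread storage for the number of threads being simulated. Each whitebox
// test overwrites g_threadNums.local() with the concurrency level it wants the
// partitioner to see; the value below is only the default.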
36 | typedef tbb::enumerable_thread_specific<size_t> ThreadNumsType; |
37 | size_t g_threadNumInitialValue = 10; |
38 | ThreadNumsType g_threadNums(g_threadNumInitialValue); |
39 | |
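// Globals used by the simulation: whitebox_thread_index is the value returned by the
// faked tbb::this_task_arena::current_thread_index(), and reference_tree stores the
// splitting tree recorded by the first run of a test case for later comparison.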
40 | namespace whitebox_simulation { |
41 | size_t whitebox_thread_index = 0; |
42 | test_partitioner_utils::BinaryTree reference_tree; |
43 | } |
44 | |
45 | // simulate a subset of task.h |
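// fake_task provides just enough of the tbb::task interface for partitioner.h to
// compile and run outside of the real scheduler: affinity get/set, a parent pointer,
// and trivial implementations of the queries the partitioners consult
// (is_stolen_task, ref_count, is_cancelled).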
46 | namespace tbb { |
47 | namespace internal { |
48 | typedef unsigned short affinity_id; |
49 | } |
50 | class fake_task { |
51 | public: |
52 | typedef internal::affinity_id affinity_id; |
53 | void set_affinity(affinity_id a) { my_affinity = a; } |
54 | affinity_id affinity() const { return my_affinity; } |
55 | void set_parent(fake_task* p) { my_parent = p; } |
56 | fake_task *parent() const { return my_parent; } |
57 | bool is_stolen_task() const { return false; } |
58 | intptr_t ref_count() const { return 1; } |
59 | bool is_cancelled() const { return false; } |
    static void spawn(fake_task &) {} // required by legacy code paths in partitioner.h
61 | virtual fake_task* execute() = 0; // enables dynamic_cast |
62 | |
63 | fake_task() : my_parent(0), my_affinity(0) {} |
64 | virtual ~fake_task() {} |
65 | private: |
66 | fake_task *my_parent; |
67 | affinity_id my_affinity; |
68 | }; |
69 | namespace task_arena { |
    static const int not_initialized = -2; // should match the corresponding value in task_arena.h
71 | }//namespace task_arena |
72 | namespace this_task_arena { |
73 | inline int current_thread_index() { return (int)whitebox_simulation::whitebox_thread_index; } |
74 | } |
75 | }//namespace tbb |
76 | |
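// Header-substitution trick: pre-defining the include guards of task.h and task_arena.h
// prevents the real headers from being pulled in by partitioner.h, and the macros below
// redirect the symbols it uses (the task class, the initial divisor query, and
// affinity_partitioner_base_v3) to the local fakes defined in this file.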
77 | #define __TBB_task_H |
78 | #define __TBB_task_arena_H |
79 | #define get_initial_auto_partitioner_divisor my_get_initial_auto_partitioner_divisor |
80 | #define affinity_partitioner_base_v3 my_affinity_partitioner_base_v3 |
81 | #define task fake_task |
82 | #define __TBB_STATIC_THRESHOLD 0 |
83 | #include "tbb/partitioner.h" |
84 | #undef __TBB_STATIC_THRESHOLD |
85 | #undef task |
86 | #undef affinity_partitioner_base_v3 |
87 | #undef get_initial_auto_partitioner_divisor |
88 | |
89 | // replace library functions to simulate concurrency |
90 | namespace tbb { |
91 | namespace internal { |
92 | size_t my_get_initial_auto_partitioner_divisor() { |
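    // The real library derives the initial divisor from the number of workers in the
    // arena; here the simulated thread count from g_threadNums is used instead so tests
    // can vary the apparent concurrency.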
93 | const size_t X_FACTOR = 4; |
94 | return X_FACTOR * g_threadNums.local(); |
95 | } |
96 | |
97 | void* __TBB_EXPORTED_FUNC NFS_Allocate( size_t n_element, size_t element_size, void* hint ); |
98 | void __TBB_EXPORTED_FUNC NFS_Free( void* ); |
99 | |
100 | void my_affinity_partitioner_base_v3::resize( unsigned factor ) { |
101 | // Check factor to avoid asking for number of workers while there might be no arena. |
102 | size_t new_size = factor ? factor * g_threadNums.local() : 0; |
103 | if (new_size != my_size) { |
104 | if (my_array) { |
105 | NFS_Free(my_array); |
            // The following two assignments must be done here for the sake of exception safety.
107 | my_array = NULL; |
108 | my_size = 0; |
109 | } |
110 | if (new_size) { |
111 | my_array = static_cast<affinity_id*>(NFS_Allocate(new_size, sizeof(affinity_id), NULL )); |
112 | memset(my_array, 0, sizeof(affinity_id) * new_size); |
113 | my_size = new_size; |
114 | } |
115 | } |
116 | } |
117 | |
118 | } //namespace internal |
119 | // simulate a subset of parallel_for |
120 | namespace interface9 { |
121 | namespace internal { |
122 | |
// Implementation of the parallel_for algorithm that executes sequentially in the calling thread
124 | template<typename Range, typename Body, typename Partitioner> |
125 | class start_for : public fake_task { |
126 | Range my_range; |
127 | Body my_body; |
128 | typename Partitioner::task_partition_type my_partition; |
129 | size_t m_executedBegin, m_executedEnd; |
130 | bool m_firstTimeRun; |
131 | size_t m_joinedBegin, m_joinedEnd; |
132 | test_partitioner_utils::BinaryTree* m_tree; |
133 | public: |
134 | start_for( const Range& range, const Body& body, Partitioner& partitioner, |
135 | test_partitioner_utils::BinaryTree* tree ) : |
136 | my_range(range), my_body(body), my_partition(partitioner), |
137 | m_executedBegin(0), m_executedEnd(0), m_firstTimeRun(true), |
138 | m_joinedBegin(/* grows left */ range.end()), m_joinedEnd(range.end()), m_tree(tree) |
139 | { |
140 | if (m_tree) { |
141 | m_tree->push_node( test_partitioner_utils::make_node(my_range.begin(), my_range.end(), affinity()) ); |
142 | } |
143 | } |
144 | //! Splitting constructor used to generate children. |
145 | /** parent_ becomes left child. Newly constructed object is right child. */ |
146 | start_for( start_for& parent_, typename Partitioner::split_type& split_obj) : |
147 | my_range(parent_.my_range, split_obj), |
148 | my_body(parent_.my_body), |
149 | my_partition(parent_.my_partition, split_obj), |
150 | m_executedBegin(0), m_executedEnd(0), m_firstTimeRun(true), |
151 | m_joinedBegin(/* grows left */ my_range.end()), m_joinedEnd(my_range.end()), |
152 | m_tree(parent_.m_tree) |
153 | { |
154 | set_parent(parent_.parent()); |
155 | my_partition.set_affinity(*this); |
156 | |
157 | if (m_tree) { |
158 | // collecting splitting statistics |
159 | m_tree->push_node( test_partitioner_utils::make_node(my_range.begin(), |
160 | my_range.end(), |
161 | affinity()) ); |
162 | m_tree->push_node( test_partitioner_utils::make_node(parent_.my_range.begin(), |
163 | parent_.my_range.end(), |
164 | parent_.affinity()) ); |
165 | } |
166 | } |
167 | //! Construct right child from the given range as response to the demand. |
168 | /** parent_ remains left child. Newly constructed object is right child. */ |
169 | start_for( start_for& parent_, const Range& r, depth_t d ) : |
170 | my_range(r), |
171 | my_body(parent_.my_body), |
172 | my_partition(parent_.my_partition, tbb::split()), |
173 | m_executedBegin(0), m_executedEnd(0), m_firstTimeRun(true), |
174 | m_joinedBegin(/* grows left */ r.end()), m_joinedEnd(r.end()), |
175 | m_tree(parent_.m_tree) |
176 | { |
177 | set_parent(parent_.parent()); |
178 | my_partition.set_affinity(*this); |
179 | my_partition.align_depth( d ); |
180 | } |
181 | fake_task* execute() __TBB_override { |
182 | my_partition.check_being_stolen( *this ); |
183 | size_t origBegin = my_range.begin(); |
184 | size_t origEnd = my_range.end(); |
185 | |
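        // Let the partitioner drive this task over the range; it calls back into
        // run_body() and offer_work() below instead of spawning real tasks.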
186 | my_partition.execute(*this, my_range); |
187 | |
188 | ASSERT(m_executedEnd == m_joinedBegin, "Non-continuous execution" ); |
189 | m_executedEnd = m_joinedEnd; |
190 | |
191 | ASSERT(origBegin == m_executedBegin && origEnd == m_executedEnd, |
192 | "Not all iterations were processed" ); |
193 | return NULL; |
194 | } |
    //! Run the body for a range; serves as a callback for the partitioner
196 | void run_body( Range &r ) { |
197 | if( r.is_ensure_non_emptiness() ) |
198 | ASSERT( !r.empty(), "Empty ranges are not allowed" ); |
199 | my_body(r); |
200 | if (m_firstTimeRun) { |
201 | m_firstTimeRun = false; |
202 | m_executedBegin = m_executedEnd = r.begin(); |
203 | } |
204 | ASSERT(m_executedBegin <= r.begin() && m_executedEnd <= r.end(), |
205 | "Non-continuous execution" ); |
206 | m_executedEnd = r.end(); |
207 | } |
    //! Spawn the right task; serves as a callback for the partitioner
209 | void offer_work(typename Partitioner::split_type& split_obj) { |
210 | start_for sibling(*this, split_obj); |
211 | sibling.execute(); |
212 | join(sibling.m_executedBegin, sibling.m_executedEnd); |
213 | } |
    //! Spawn the right task for an explicitly given range; serves as a callback for the partitioner
215 | void offer_work(const Range& r, depth_t d = 0) { |
216 | start_for sibling(*this, r, d); |
217 | sibling.execute(); |
218 | join(sibling.m_executedBegin, sibling.m_executedEnd); |
219 | } |
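    //! Merge the interval executed by a right sibling into this task's record.
    /** The sibling must have executed exactly up to where the already joined interval
        begins, otherwise some iterations were skipped or executed twice. */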
220 | void join(size_t siblingExecutedBegin, size_t siblingExecutedEnd) { |
        ASSERT(siblingExecutedEnd == m_joinedBegin, "Sibling range does not adjoin the already joined interval");
222 | m_joinedBegin = siblingExecutedBegin; |
223 | } |
224 | }; |
225 | |
226 | } //namespace internal |
} //namespace interface9
228 | } //namespace tbb |
229 | |
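// Sequential stand-in for tbb::parallel_for: builds the root start_for task under a
// dummy flag_task parent and executes it in the calling thread, optionally recording
// the splitting tree.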
230 | namespace whitebox_simulation { |
231 | using namespace tbb::interface9::internal; |
232 | template<typename Range, typename Body, typename Partitioner> |
233 | void parallel_for( const Range& range, const Body& body, Partitioner& partitioner, |
234 | test_partitioner_utils::BinaryTree* tree = NULL) { |
235 | if (!range.empty()) { |
236 | flag_task parent; |
237 | start_for<Range, Body, Partitioner> start(range, body, partitioner, tree); |
238 | start.set_parent(&parent); |
239 | start.execute(); |
240 | } |
241 | } |
242 | |
243 | } //namespace whitebox_simulation |
244 | |
245 | template <typename Range, typename Body, typename Partitioner> |
246 | void test_case(Range& range, const Body& body, Partitioner& partitioner, |
247 | test_partitioner_utils::BinaryTree* tree = NULL) { |
248 | whitebox_simulation::parallel_for(range, body, partitioner, tree); |
249 | } |
250 | |
// Functions that generate sizes for the range objects used in the tests
252 | template <typename T> |
253 | size_t default_range_size_generator(T* factor, unsigned index, size_t thread_num) { |
254 | return size_t(factor[index] * thread_num); |
255 | } |
256 | |
257 | size_t shifted_left_range_size_generator(size_t* factor, unsigned index, size_t thread_num) { |
258 | return factor[index] * thread_num - 1; |
259 | } |
260 | |
261 | size_t shifted_right_range_size_generator(size_t* factor, unsigned index, size_t thread_num) { |
262 | return factor[index] * thread_num + 1; |
263 | } |
264 | |
265 | size_t max_range_size_generator(size_t*, unsigned, size_t) { |
266 | return size_t(-1); |
267 | } |
268 | |
269 | size_t simple_size_generator(size_t*, unsigned index, size_t) { |
270 | return index; |
271 | } |
272 | |
273 | namespace uniform_iterations_distribution { |
274 | |
/*
 * The test checks that a range's iterations are distributed uniformly among all tasks
 * right after the work distribution phase has completed and just before the work
 * balancing phase starts.
 */
279 | |
280 | using namespace test_partitioner_utils; |
281 | |
282 | class ParallelTestBody { |
283 | public: |
284 | struct use_case_settings_t; |
285 | |
286 | typedef void (*CheckerFuncType)(const char*, size_t, const use_case_settings_t*, const RangeStatisticData&); |
287 | |
288 | struct use_case_settings_t { |
289 | size_t thread_num; // number of threads used during current use case |
290 | unsigned factors_array_len; // size of 'factors' array |
291 | size_t range_begin; // beginning of range iterations |
292 | bool provide_feedback; // 'true' if range should give feedback |
        bool ensure_non_empty_size;              // 'true' if empty ranges are not allowed

        size_t above_threads_size_tolerance;     // maximum allowed difference between the number of
                                                 // created ranges and the number of threads when the
                                                 // initial range size is greater than or equal to
                                                 // the number of threads

        size_t below_threads_size_tolerance;     // maximum allowed difference between the number of
                                                 // created ranges and the range size when the initial
                                                 // range size is less than the number of threads

        size_t between_min_max_ranges_tolerance; // maximum allowed difference in iteration count
                                                 // between the biggest and the smallest range

        CheckerFuncType checker;                 // checker function for a particular test case
307 | }; |
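        // Illustrative sketch only (the concrete values live in the test translation
        // units that include this header, not here): a use case is typically described
        // by an aggregate initializer such as
        //
        //     use_case_settings_t settings = {
        //         /*thread_num*/ threads, /*factors_array_len*/ len, /*range_begin*/ 0,
        //         /*provide_feedback*/ true, /*ensure_non_empty_size*/ true,
        //         /*above_threads_size_tolerance*/ 1, /*below_threads_size_tolerance*/ 1,
        //         /*between_min_max_ranges_tolerance*/ 1,
        //         &ParallelTestBody::uniform_distribution_checker
        //     };
        //
        // which is then passed to the test<Range, Partitioner>() method below.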
308 | |
309 | ParallelTestBody(size_t parallel_group_thread_starting_index) |
310 | : m_parallel_group_thread_starting_index(parallel_group_thread_starting_index) { } |
311 | |
312 | void operator()(size_t) const { ASSERT( false, "Empty ParallelTestBody called" ); } |
313 | |
314 | static void uniform_distribution_checker(const char* rangeName, size_t rangeSize, const use_case_settings_t* settings, |
315 | const RangeStatisticData& stat) |
316 | { |
317 | // Checking that all threads were given a task |
318 | if (rangeSize >= settings->thread_num) { |
319 | uint64_t disparity = |
320 | max(stat.m_rangeNum, settings->thread_num) - min(stat.m_rangeNum, settings->thread_num); |
321 | if (disparity > settings->above_threads_size_tolerance) { |
322 | REPORT("ERROR: '%s (f=%d|e=%d)': |#ranges(%llu)-#threads(%llu)|=%llu > %llu=tolerance\n" , |
323 | rangeName, int(settings->provide_feedback), int(settings->ensure_non_empty_size), stat.m_rangeNum, |
324 | settings->thread_num, disparity, uint64_t(settings->above_threads_size_tolerance)); |
325 | ASSERT(disparity <= settings->above_threads_size_tolerance, "Incorrect number of range " |
326 | "objects was created before work balancing phase started" ); |
327 | } |
328 | } else if (settings->ensure_non_empty_size && rangeSize != 0) { |
329 | uint64_t disparity = max(stat.m_rangeNum, rangeSize) - min(stat.m_rangeNum, rangeSize); |
330 | if (disparity > settings->below_threads_size_tolerance ) { |
331 | REPORT("ERROR: '%s (f=%d|e=%d)': |#ranges-range size|=%llu > %llu=tolerance\n" , |
332 | rangeName, int(settings->provide_feedback), int(settings->ensure_non_empty_size), |
333 | disparity, uint64_t(settings->below_threads_size_tolerance)); |
334 | ASSERT(disparity <= settings->below_threads_size_tolerance, "Incorrect number of range objects" |
335 | " was created before work balancing phase started" ); |
336 | } |
337 | } |
338 | // Checking difference between min and max number of range iterations |
339 | size_t diff = stat.m_maxRangeSize - stat.m_minRangeSize; |
340 | if (diff > settings->between_min_max_ranges_tolerance) { |
341 | REPORT("ERROR: '%s (f=%d|e=%d)': range size difference=%llu > %llu=tolerance\n" , |
342 | rangeName, int(settings->provide_feedback), int(settings->ensure_non_empty_size), |
343 | uint64_t(diff), uint64_t(settings->between_min_max_ranges_tolerance)); |
344 | ASSERT(diff <= settings->between_min_max_ranges_tolerance, "Uniform iteration distribution error" ); |
345 | } |
346 | } |
    // Checker for test cases where ranges do not provide feedback to the partitioner during a
    // proportional split and whose splitting algorithm differs from the tbb::blocked_range one
349 | static void nonuniform_distribution_checker(const char* rangeName, size_t rangeSize, const use_case_settings_t* settings, |
350 | const RangeStatisticData& stat) |
351 | { |
352 | if (stat.m_rangeNum > settings->thread_num) { |
353 | REPORT("ERROR: '%s (f=%d|e=%d)': %llu=#ranges > #threads=%llu\n" , |
354 | rangeName, int(settings->provide_feedback), int(settings->ensure_non_empty_size), |
355 | uint64_t(stat.m_rangeNum), uint64_t(settings->thread_num)); |
356 | ASSERT(stat.m_rangeNum <= settings->thread_num, |
357 | "Incorrect number of range objects was created before work balancing phase started" ); |
358 | } |
359 | // Checking difference between min and max number of range iterations |
360 | size_t diff = stat.m_maxRangeSize - stat.m_minRangeSize; |
361 | if (diff > rangeSize) { |
362 | REPORT("ERROR: '%s (f=%d|e=%d)': range size difference=%llu > %llu=initial range size\n" , |
363 | rangeName, int(settings->provide_feedback), int(settings->ensure_non_empty_size), |
364 | uint64_t(diff), uint64_t(rangeSize)); |
365 | ASSERT(diff <= rangeSize, "Iteration distribution error" ); |
366 | } |
367 | } |
368 | |
369 | protected: |
    size_t m_parallel_group_thread_starting_index; // index of the first simulated thread in this parallel group
371 | |
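    // Drives one use case: for every factor, generate a range size, run the sequential
    // parallel_for over a fresh range and partitioner while collecting splitting
    // statistics, and hand the results to the settings' checker function.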
372 | template <typename Range, typename Partitioner, typename T> |
373 | void test(use_case_settings_t& settings, T factors[], size_t (*rsgFunc)(T*, unsigned, size_t) |
374 | = &default_range_size_generator<T>) const |
375 | { |
376 | for (unsigned i = 0; i < settings.factors_array_len; ++i) { |
377 | size_t range_end = rsgFunc(factors, i, settings.thread_num); |
378 | RangeStatisticData stat = { /*range num=*/ 0, /*minimal size of range=*/ 0, |
379 | /*maximal size of range=*/ 0, /*minimal size of range was not rewritten yet=*/ false }; |
380 | Range range = Range(settings.range_begin, range_end, &stat, settings.provide_feedback, |
381 | settings.ensure_non_empty_size); |
382 | Partitioner my_partitioner; |
383 | test_case(range, SimpleBody(), my_partitioner, NULL); |
384 | size_t range_size = range_end - settings.range_begin; |
385 | const char* rangeName = typeid(range).name(); |
386 | settings.checker(rangeName, range_size, &settings, stat); |
387 | } |
388 | } |
389 | }; |
390 | |
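// Splits the simulated thread indices into groups of at most hardware_concurrency
// native threads and runs ParallelTestBody for each group via NativeParallelFor,
// so that up to max_simulated_threads concurrency levels get exercised.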
391 | template <typename ParallelTestBody> |
392 | void test() { |
393 | size_t hw_threads_num = tbb::tbb_thread::hardware_concurrency(); |
394 | size_t threadsToRunOn = std::min<size_t>(max_simulated_threads, hw_threads_num); |
395 | |
396 | size_t parallel_group_thread_starting_index = 1; |
397 | while( parallel_group_thread_starting_index <= max_simulated_threads - threadsToRunOn ) { |
398 | NativeParallelFor(threadsToRunOn, ParallelTestBody(parallel_group_thread_starting_index)); |
399 | parallel_group_thread_starting_index += threadsToRunOn; |
400 | } |
401 | NativeParallelFor(max_simulated_threads - parallel_group_thread_starting_index, |
402 | ParallelTestBody(parallel_group_thread_starting_index)); |
403 | } |
404 | |
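// Initial bounds for the range used by the task affinity whitebox test; they are
// shifted after every iteration of test_task_affinity() to vary the range size.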
405 | namespace task_affinity_whitebox { |
406 | size_t range_begin = 0; |
407 | size_t range_end = 20; |
408 | } |
409 | |
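// Validates the splitting tree produced by a particular partitioner: the
// affinity_partitioner must reproduce exactly the same tree on every run, while the
// static_partitioner must hand out exactly one leaf range per simulated thread.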
410 | template<typename Partitioner> |
411 | void check_tree(const test_partitioner_utils::BinaryTree&); |
412 | |
413 | template<> |
414 | void check_tree<tbb::affinity_partitioner>(const test_partitioner_utils::BinaryTree& tree) { |
415 | ASSERT(tree == whitebox_simulation::reference_tree, |
416 | "affinity_partitioner distributes tasks differently from run to run" ); |
417 | } |
418 | |
419 | template<> |
420 | void check_tree<tbb::static_partitioner>(const test_partitioner_utils::BinaryTree& tree) { |
421 | std::vector<test_partitioner_utils::TreeNode* > tree_leafs; |
422 | tree.fill_leafs(tree_leafs); |
423 | typedef std::vector<size_t> Slots; |
424 | Slots affinity_slots(tree_leafs.size() + 1, 0); |
425 | |
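    // Count the leaf ranges per affinity slot. Slot 0 collects leaves without an
    // explicit affinity; such a leaf must be the one starting at range_begin, i.e. the
    // part executed by the calling thread itself.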
426 | for (std::vector<test_partitioner_utils::TreeNode*>::iterator i = tree_leafs.begin(); i != tree_leafs.end(); ++i) { |
427 | affinity_slots[(*i)->m_affinity]++; |
428 | if ((*i)->m_affinity == 0) |
429 | ASSERT((*i)->m_range_begin == task_affinity_whitebox::range_begin, |
430 | "Task with affinity 0 was executed with wrong range" ); |
431 | } |
432 | |
433 | typedef std::iterator_traits<Slots::iterator>::difference_type slots_difference_type; |
434 | ASSERT(std::count(affinity_slots.begin(), affinity_slots.end(), size_t(0)) == slots_difference_type(1), |
435 | "static_partitioner incorrectly distributed tasks by threads" ); |
436 | ASSERT(std::count(affinity_slots.begin(), affinity_slots.end(), size_t(1)) == slots_difference_type(g_threadNums.local()), |
437 | "static_partitioner incorrectly distributed tasks by threads" ); |
438 | ASSERT(affinity_slots[tbb::this_task_arena::current_thread_index() + 1] == 0, |
439 | "static_partitioner incorrectly assigns task with 0 affinity" ); |
440 | ASSERT(std::accumulate(affinity_slots.begin(), affinity_slots.end(), size_t(0)) == g_threadNums.local(), |
441 | "static_partitioner has created more tasks than the number of threads" ); |
442 | } |
443 | |
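// For every simulated concurrency level p, record a reference splitting tree from
// thread index 0, then replay the same parallel_for from every other simulated thread
// index and verify the resulting tree with the partitioner-specific check_tree().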
444 | template<typename Partitioner> |
445 | void test_task_affinity() { |
446 | using namespace task_affinity_whitebox; |
447 | test_partitioner_utils::SimpleBody body; |
448 | for (size_t p = 1; p <= 50; ++p) { |
449 | g_threadNums.local() = p; |
450 | whitebox_simulation::whitebox_thread_index = 0; |
451 | test_partitioner_utils::TestRanges::BlockedRange range(range_begin, range_end, /*statData*/NULL, |
452 | /*provide_feedback*/false, /*ensure_non_empty_size*/false); |
453 | Partitioner partitioner; |
454 | whitebox_simulation::reference_tree = test_partitioner_utils::BinaryTree(); |
455 | whitebox_simulation::parallel_for(range, body, partitioner, &(whitebox_simulation::reference_tree)); |
456 | while (whitebox_simulation::whitebox_thread_index < p) { |
457 | test_partitioner_utils::BinaryTree tree; |
458 | whitebox_simulation::parallel_for(range, body, partitioner, &tree); |
459 | check_tree<Partitioner>(tree); |
460 | whitebox_simulation::whitebox_thread_index++; |
461 | } |
462 | range_begin++; |
463 | range_end += 2; |
464 | } |
465 | } |
466 | |
467 | } /* namespace uniform_iterations_distribution */ |
468 | |