1 | /* |
2 | Copyright (c) 2005-2019 Intel Corporation |
3 | |
4 | Licensed under the Apache License, Version 2.0 (the "License"); |
5 | you may not use this file except in compliance with the License. |
6 | You may obtain a copy of the License at |
7 | |
8 | http://www.apache.org/licenses/LICENSE-2.0 |
9 | |
10 | Unless required by applicable law or agreed to in writing, software |
11 | distributed under the License is distributed on an "AS IS" BASIS, |
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | See the License for the specific language governing permissions and |
14 | limitations under the License. |
15 | */ |
16 | |
17 | /** This test checks the automatic propagation of master thread FPU settings |
18 | into the worker threads. **/ |
19 | |
20 | #include "harness_fp.h" |
21 | #include "harness.h" |
22 | #define private public |
23 | #include "tbb/task.h" |
24 | #undef private |
25 | #include "tbb/parallel_for.h" |
26 | #include "tbb/task_scheduler_init.h" |
27 | |
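// Iteration count for the parallel_for checks; large enough that worker threads
// are drawn in and their FPU state is exercised.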
28 | const int N = 500000; |
29 | |
30 | #if ( __TBB_x86_32 || __TBB_x86_64 ) && __TBB_CPU_CTL_ENV_PRESENT && !defined(__TBB_WIN32_USE_CL_BUILTINS) |
31 | #include "harness_barrier.h" |
32 | |
33 | class CheckNoSseStatusPropagationBody : public NoAssign { |
34 | Harness::SpinBarrier &barrier; |
35 | public: |
36 | CheckNoSseStatusPropagationBody( Harness::SpinBarrier &_barrier ) : barrier(_barrier) {} |
37 | void operator()( const tbb::blocked_range<int>& ) const { |
38 | barrier.wait(); |
39 | tbb::internal::cpu_ctl_env ctl; |
40 | ctl.get_env(); |
41 | ASSERT( (ctl.mxcsr & SSE_STATUS_MASK) == 0, "FPU control status bits have been propagated." ); |
42 | } |
43 | }; |
44 | |
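// Raises all SSE status flags in the master thread and verifies that workers do
// not inherit them: only FPU *control* settings are supposed to propagate, not
// sticky *status* bits.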
45 | void CheckNoSseStatusPropagation() { |
46 | tbb::internal::cpu_ctl_env ctl; |
47 | ctl.get_env(); |
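    // Raise all SSE status flags in the master thread before the scheduler is created.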
48 | ctl.mxcsr |= SSE_STATUS_MASK; |
49 | ctl.set_env(); |
50 | const int num_threads = tbb::task_scheduler_init::default_num_threads(); |
51 | Harness::SpinBarrier barrier(num_threads); |
52 | tbb::task_scheduler_init init(num_threads); |
53 | tbb::parallel_for( tbb::blocked_range<int>(0, num_threads), CheckNoSseStatusPropagationBody(barrier) ); |
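    // Clean up: clear the status flags that were raised above.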
54 | ctl.mxcsr &= ~SSE_STATUS_MASK; |
55 | ctl.set_env(); |
56 | } |
57 | #else /* Other archs */ |
58 | void CheckNoSseStatusPropagation() {} |
59 | #endif /* Other archs */ |
60 | |
61 | class RoundingModeCheckBody { |
62 | int m_mode; |
63 | int m_sseMode; |
64 | public: |
65 | void operator() ( int /*iter*/ ) const { |
66 | ASSERT( GetRoundingMode() == m_mode, "FPU control state has not been propagated." ); |
67 | ASSERT( GetSseMode() == m_sseMode, "SSE control state has not been propagated." ); |
68 | } |
69 | |
70 | RoundingModeCheckBody ( int mode, int sseMode ) : m_mode(mode), m_sseMode(sseMode) {} |
71 | }; |
72 | |
73 | void TestArenaFpuEnvPropagation( int id ) { |
74 | // TBB scheduler instance in a master thread captures the FPU control state |
75 | // at the moment of its initialization and passes it to the workers toiling |
76 | // on its behalf. |
77 | for( int k = 0; k < NumSseModes; ++k ) { |
78 | int sse_mode = SseModes[(k + id) % NumSseModes]; |
79 | SetSseMode( sse_mode ); |
80 | for( int i = 0; i < NumRoundingModes; ++i ) { |
81 | int mode = RoundingModes[(i + id) % NumRoundingModes]; |
82 | SetRoundingMode( mode ); |
83 | // New mode must be set before TBB scheduler is initialized |
84 | tbb::task_scheduler_init init; |
85 | tbb::parallel_for( 0, N, 1, RoundingModeCheckBody(mode, sse_mode) ); |
86 | ASSERT( GetRoundingMode() == mode, NULL ); |
87 | } |
88 | } |
89 | } |
90 | |
91 | #if __TBB_FP_CONTEXT |
92 | void TestArenaFpuEnvPersistence( int id ) { |
93 | // Since the following loop uses auto-initialization, the scheduler instance |
94 | // implicitly created by the first parallel_for invocation will persist |
95 | // until the thread ends, and thus workers will use the mode set by the |
96 | // first iteration. |
97 | int captured_mode = RoundingModes[id % NumRoundingModes]; |
98 | int captured_sse_mode = SseModes[id % NumSseModes]; |
99 | for( int k = 0; k < NumSseModes; ++k ) { |
100 | int sse_mode = SseModes[(k + id) % NumSseModes]; |
101 | SetSseMode( sse_mode ); |
102 | for( int i = 0; i < NumRoundingModes; ++i ) { |
103 | int mode = RoundingModes[(i + id) % NumRoundingModes]; |
104 | SetRoundingMode( mode ); |
105 | tbb::parallel_for( 0, N, 1, RoundingModeCheckBody(captured_mode, captured_sse_mode) ); |
106 | ASSERT( GetRoundingMode() == mode, NULL ); |
107 | } |
108 | } |
109 | } |
110 | #endif |
111 | |
112 | class LauncherBody { |
113 | public: |
114 | void operator() ( int id ) const { |
115 | TestArenaFpuEnvPropagation( id ); |
116 | #if __TBB_FP_CONTEXT |
117 | TestArenaFpuEnvPersistence( id ); |
118 | #endif |
119 | } |
120 | }; |
121 | |
122 | void TestFpuEnvPropagation () { |
123 | const int p = tbb::task_scheduler_init::default_num_threads(); |
    // The test should be run in oversubscription mode: create several threads per
    // hardware thread, but limit the degree of oversubscription on big machines (p>32).
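    // For example, assuming NumRoundingModes == 4, this yields 4*p threads for
    // p <= 32 and 4*32 + (p-32) threads for larger machines.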
126 | const int num_threads = p + (NumRoundingModes-1)*min(p,32); |
127 | NativeParallelFor ( num_threads, LauncherBody() ); |
128 | } |
129 | |
130 | void TestCpuCtlEnvApi () { |
131 | for( int k = 0; k < NumSseModes; ++k ) { |
132 | SetSseMode( SseModes[k] ); |
133 | for( int i = 0; i < NumRoundingModes; ++i ) { |
134 | SetRoundingMode( RoundingModes[i] ); |
135 | ASSERT( GetRoundingMode() == RoundingModes[i], NULL ); |
136 | ASSERT( GetSseMode() == SseModes[k], NULL ); |
137 | } |
138 | } |
139 | } |
140 | |
141 | #if __TBB_FP_CONTEXT |
142 | const int numModes = NumRoundingModes*NumSseModes; |
143 | const int numArenas = 4; |
144 | tbb::task_group_context *contexts[numModes]; |
// The extra numArenas slots at the end hold each arena's default fp mode
// (filled in by TestContextFpuEnv below).
146 | int roundingModes[numModes+numArenas]; |
147 | int sseModes[numModes+numArenas]; |
148 | |
149 | class TestContextFpuEnvBody { |
150 | int arenaNum; |
151 | int mode; |
152 | int depth; |
153 | public: |
154 | TestContextFpuEnvBody( int _arenaNum, int _mode, int _depth = 0 ) : arenaNum(_arenaNum), mode(_mode), depth(_depth) {} |
155 | void operator()( const tbb::blocked_range<int> &r ) const; |
156 | }; |
157 | |
158 | inline void SetMode( int mode ) { |
159 | SetRoundingMode( roundingModes[mode] ); |
160 | SetSseMode( sseModes[mode] ); |
161 | } |
162 | |
163 | inline void AssertMode( int mode ) { |
164 | ASSERT( GetRoundingMode() == roundingModes[mode], "FPU control state has not been set correctly." ); |
165 | ASSERT( GetSseMode() == sseModes[mode], "SSE control state has not been set correctly." ); |
166 | } |
167 | |
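// Advances the fp mode by `step` slots (wrapping around numModes), applies it,
// and returns the new slot index.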
168 | inline int SetNextMode( int mode, int step ) { |
169 | const int nextMode = (mode+step)%numModes; |
170 | SetMode( nextMode ); |
171 | return nextMode; |
172 | } |
173 | |
174 | class TestContextFpuEnvTask : public tbb::task { |
175 | int arenaNum; |
176 | int mode; |
177 | int depth; |
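    // Limit on the recursion depth of the nested sub-tests; presumably kept smaller
    // when the CPU control environment is accessed directly, to bound the run time
    // of the exponentially growing task tree.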
178 | #if __TBB_CPU_CTL_ENV_PRESENT |
179 | static const int MAX_DEPTH = 3; |
180 | #else |
181 | static const int MAX_DEPTH = 4; |
182 | #endif |
183 | public: |
184 | TestContextFpuEnvTask( int _arenaNum, int _mode, int _depth = 0 ) : arenaNum(_arenaNum), mode(_mode), depth(_depth) {} |
185 | tbb::task* execute() __TBB_override { |
186 | AssertMode( mode ); |
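        // Each sub-test below flips the fp mode at key points (before and after
        // creating a context, and around each parallel_for) and then checks which
        // mode the parallel work observed and which mode this thread retained.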
187 | if ( depth < MAX_DEPTH ) { |
188 | // Test default context. |
189 | const int newMode1 = SetNextMode( mode, depth+1 ); |
190 | tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, mode, depth+1 ) ); |
191 | AssertMode( newMode1 ); |
192 | |
193 | // Test user default context. |
194 | const int newMode2 = SetNextMode( newMode1, depth+1 ); |
195 | tbb::task_group_context ctx1; |
196 | const int newMode3 = SetNextMode( newMode2, depth+1 ); |
197 | tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, mode, depth+1 ), ctx1 ); |
198 | AssertMode( newMode3 ); |
199 | |
200 | // Test user context which captured FPU control settings. |
201 | const int newMode4 = SetNextMode( newMode3, depth+1 ); |
202 | // Capture newMode4 |
203 | ctx1.capture_fp_settings(); |
204 | const int newMode5 = SetNextMode( newMode4, depth+1 ); |
205 | tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, newMode4, depth+1 ), ctx1 ); |
206 | AssertMode( newMode5 ); |
207 | |
208 | // And again test user context which captured FPU control settings to check multiple captures. |
209 | const int newMode6 = SetNextMode( newMode5, depth+1 ); |
210 | // Capture newMode6 |
211 | ctx1.capture_fp_settings(); |
212 | const int newMode7 = SetNextMode( newMode6, depth+1 ); |
213 | tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, newMode6, depth+1 ), ctx1 ); |
214 | AssertMode( newMode7 ); |
215 | |
216 | // Test an isolated context. The isolated context should use default FPU control settings. |
217 | const int newMode8 = SetNextMode( newMode7, depth+1 ); |
218 | tbb::task_group_context ctx2( tbb::task_group_context::isolated ); |
219 | const int newMode9 = SetNextMode( newMode8, depth+1 ); |
220 | tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, numModes+arenaNum, depth+1 ), ctx2 ); |
221 | AssertMode( newMode9 ); |
222 | |
            // Binding the context should not overwrite the captured FPU control settings.
224 | const int newMode10 = SetNextMode( newMode9, depth+1 ); |
225 | tbb::task_group_context ctx3; |
226 | ctx3.capture_fp_settings(); |
227 | const int newMode11 = SetNextMode( newMode10, depth+1 ); |
228 | tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, newMode10, depth+1 ), ctx3 ); |
229 | AssertMode( newMode11 ); |
230 | |
231 | // Restore initial mode since user code in tbb::task::execute should not change FPU settings. |
232 | SetMode( mode ); |
233 | } |
234 | |
235 | return NULL; |
236 | } |
237 | }; |
238 | |
239 | void TestContextFpuEnvBody::operator()( const tbb::blocked_range<int> &r ) const { |
240 | AssertMode( mode ); |
241 | |
242 | const int newMode = SetNextMode( mode, depth+2 ); |
243 | |
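    // The extra range index (numModes) designates the default context; all other
    // indices map to the pre-built contexts with captured fp settings.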
244 | int end = r.end(); |
245 | if ( end-1 == numModes ) { |
246 | // For a default context our mode should be inherited. |
247 | tbb::task::spawn_root_and_wait( |
248 | *new( tbb::task::allocate_root() ) TestContextFpuEnvTask( arenaNum, mode, depth ) ); |
249 | AssertMode( newMode ); |
250 | end--; |
251 | } |
252 | for ( int i=r.begin(); i<end; ++i ) { |
253 | tbb::task::spawn_root_and_wait( |
254 | *new( tbb::task::allocate_root(*contexts[i]) ) TestContextFpuEnvTask( arenaNum, i, depth ) ); |
255 | AssertMode( newMode ); |
256 | } |
257 | |
    // Restore the initial mode: user code in a parallel_for body should not change FPU settings.
259 | SetMode( mode ); |
260 | } |
261 | |
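// Each native thread becomes a master with its own arena: it sets that arena's
// default fp mode, initializes a scheduler, and runs the recursive context test.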
262 | class TestContextFpuEnvNativeLoopBody { |
263 | public: |
264 | void operator() ( int arenaNum ) const { |
265 | SetMode(numModes+arenaNum); |
266 | tbb::task_scheduler_init init; |
267 | tbb::task::spawn_root_and_wait( *new (tbb::task::allocate_root() ) TestContextFpuEnvTask( arenaNum, numModes+arenaNum ) ); |
268 | } |
269 | }; |
270 | |
271 | #if TBB_USE_EXCEPTIONS |
272 | const int NUM_ITERS = 1000; |
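// Checks that fp settings are restored correctly when a parallel_for exits by
// throwing an exception through a context with captured fp settings.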
273 | class TestContextFpuEnvEhBody { |
274 | int mode; |
275 | int eh_iter; |
276 | int depth; |
277 | public: |
278 | TestContextFpuEnvEhBody( int _mode, int _eh_iter, int _depth = 0 ) : mode(_mode), eh_iter(_eh_iter), depth(_depth) {} |
279 | void operator()( const tbb::blocked_range<int> &r ) const { |
280 | AssertMode( mode ); |
281 | if ( depth < 1 ) { |
282 | const int newMode1 = SetNextMode( mode, 1 ); |
283 | tbb::task_group_context ctx; |
284 | ctx.capture_fp_settings(); |
285 | const int newMode2 = SetNextMode( newMode1, 1 ); |
286 | try { |
287 | tbb::parallel_for( tbb::blocked_range<int>(0, NUM_ITERS), TestContextFpuEnvEhBody(newMode1,rand()%NUM_ITERS,1), tbb::simple_partitioner(), ctx ); |
288 | } catch (...) { |
289 | AssertMode( newMode2 ); |
290 | if ( r.begin() == eh_iter ) throw; |
291 | } |
292 | AssertMode( newMode2 ); |
293 | SetMode( mode ); |
294 | } else if ( r.begin() == eh_iter ) throw 0; |
295 | } |
296 | }; |
297 | |
298 | class TestContextFpuEnvEhNativeLoopBody { |
299 | public: |
300 | void operator() ( int arenaNum ) const { |
301 | SetMode( arenaNum%numModes ); |
302 | try { |
303 | tbb::parallel_for( tbb::blocked_range<int>(0, NUM_ITERS), TestContextFpuEnvEhBody((arenaNum+1)%numModes,rand()%NUM_ITERS), |
304 | tbb::simple_partitioner(), *contexts[(arenaNum+1)%numModes] ); |
305 | ASSERT( false, "parallel_for has not thrown an exception." ); |
306 | } catch (...) { |
307 | AssertMode( arenaNum%numModes ); |
308 | } |
309 | } |
310 | }; |
311 | #endif /* TBB_USE_EXCEPTIONS */ |
312 | |
313 | void TestContextFpuEnv() { |
314 | // Prepare contexts' fp modes. |
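    // A context constructed with the fp_settings trait captures the FPU control
    // state that is current at construction time, so the desired mode is set first.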
315 | for ( int i = 0, modeNum = 0; i < NumRoundingModes; ++i ) { |
316 | const int roundingMode = RoundingModes[i]; |
317 | SetRoundingMode( roundingMode ); |
318 | for( int j = 0; j < NumSseModes; ++j, ++modeNum ) { |
319 | const int sseMode = SseModes[j]; |
320 | SetSseMode( sseMode ); |
321 | |
322 | contexts[modeNum] = new tbb::task_group_context( tbb::task_group_context::isolated, |
323 | tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings ); |
324 | roundingModes[modeNum] = roundingMode; |
325 | sseModes[modeNum] = sseMode; |
326 | } |
327 | } |
328 | // Prepare arenas' fp modes. |
329 | for ( int arenaNum = 0; arenaNum < numArenas; ++arenaNum ) { |
330 | roundingModes[numModes+arenaNum] = roundingModes[arenaNum%numModes]; |
331 | sseModes[numModes+arenaNum] = sseModes[arenaNum%numModes]; |
332 | } |
333 | NativeParallelFor( numArenas, TestContextFpuEnvNativeLoopBody() ); |
334 | #if TBB_USE_EXCEPTIONS |
335 | NativeParallelFor( numArenas, TestContextFpuEnvEhNativeLoopBody() ); |
336 | #endif |
337 | for ( int modeNum = 0; modeNum < numModes; ++modeNum ) |
338 | delete contexts[modeNum]; |
339 | } |
340 | |
341 | tbb::task_group_context glbIsolatedCtx( tbb::task_group_context::isolated ); |
342 | int glbIsolatedCtxMode = -1; |
343 | |
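// Checks that a global isolated context picks up the fp settings of whichever
// thread is first to bind a task to it (that thread records its id in
// glbIsolatedCtxMode); all other threads must then observe that same mode
// inside the task.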
344 | struct TestGlobalIsolatedContextTask : public tbb::task { |
345 | tbb::task* execute() __TBB_override { |
346 | AssertFPMode( glbIsolatedCtxMode ); |
347 | return NULL; |
348 | } |
349 | }; |
350 | |
351 | #include "tbb/mutex.h" |
352 | |
353 | struct TestGlobalIsolatedContextNativeLoopBody { |
354 | void operator()( int threadId ) const { |
355 | FPModeContext fpGuard( threadId ); |
356 | static tbb::mutex rootAllocMutex; |
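        // Serialize root allocation so that exactly one thread is the first to bind
        // a task to glbIsolatedCtx and thereby determines its fp mode.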
357 | rootAllocMutex.lock(); |
358 | if ( glbIsolatedCtxMode == -1 ) |
359 | glbIsolatedCtxMode = threadId; |
360 | tbb::task &root = *new (tbb::task::allocate_root( glbIsolatedCtx )) TestGlobalIsolatedContextTask(); |
361 | rootAllocMutex.unlock(); |
362 | tbb::task::spawn_root_and_wait( root ); |
363 | } |
364 | }; |
365 | |
366 | void TestGlobalIsolatedContext() { |
367 | ASSERT( numArenas > 1, NULL ); |
368 | NativeParallelFor( numArenas, TestGlobalIsolatedContextNativeLoopBody() ); |
369 | } |
370 | #endif /* __TBB_FP_CONTEXT */ |
371 | |
372 | int TestMain () { |
373 | TestCpuCtlEnvApi(); |
374 | TestFpuEnvPropagation(); |
375 | CheckNoSseStatusPropagation(); |
376 | #if __TBB_FP_CONTEXT |
377 | TestContextFpuEnv(); |
378 | TestGlobalIsolatedContext(); |
379 | #endif |
380 | return Harness::Done; |
381 | } |
382 | |