/*
    Copyright (c) 2005-2019 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/** This test checks the automatic propagation of the master thread's FPU settings
    into the worker threads. **/

#include "harness_fp.h"
#include "harness.h"
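// The test needs direct access to tbb::internal::cpu_ctl_env, a private
// implementation detail of the scheduler. Temporarily remapping 'private' to
// 'public' while including tbb/task.h exposes it; this is a test-only hack,
// not a supported way to use the library.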
#define private public
#include "tbb/task.h"
#undef private
#include "tbb/parallel_for.h"
#include "tbb/task_scheduler_init.h"

const int N = 500000;

#if ( __TBB_x86_32 || __TBB_x86_64 ) && __TBB_CPU_CTL_ENV_PRESENT && !defined(__TBB_WIN32_USE_CL_BUILTINS)
#include "harness_barrier.h"

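// Checks that the sticky SSE status flags (exception bits in MXCSR) raised in
// the master thread do not leak into the parallel region: only FPU control
// settings are supposed to travel with the scheduler. The barrier keeps all
// iterations in flight simultaneously so that every participating thread is
// inspected.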
class CheckNoSseStatusPropagationBody : public NoAssign {
    Harness::SpinBarrier &barrier;
public:
    CheckNoSseStatusPropagationBody( Harness::SpinBarrier &_barrier ) : barrier(_barrier) {}
    void operator()( const tbb::blocked_range<int>& ) const {
        barrier.wait();
        tbb::internal::cpu_ctl_env ctl;
        ctl.get_env();
        ASSERT( (ctl.mxcsr & SSE_STATUS_MASK) == 0, "FPU control status bits have been propagated." );
    }
};

void CheckNoSseStatusPropagation() {
    tbb::internal::cpu_ctl_env ctl;
    ctl.get_env();
    ctl.mxcsr |= SSE_STATUS_MASK;
    ctl.set_env();
    const int num_threads = tbb::task_scheduler_init::default_num_threads();
    Harness::SpinBarrier barrier(num_threads);
    tbb::task_scheduler_init init(num_threads);
    tbb::parallel_for( tbb::blocked_range<int>(0, num_threads), CheckNoSseStatusPropagationBody(barrier) );
    ctl.mxcsr &= ~SSE_STATUS_MASK;
    ctl.set_env();
}
#else /* Other archs */
void CheckNoSseStatusPropagation() {}
#endif /* Other archs */

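// Body for the propagation tests below: each iteration asserts that the
// executing thread observes the rounding mode and SSE mode that the master
// thread had set before the scheduler was initialized.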
class RoundingModeCheckBody {
    int m_mode;
    int m_sseMode;
public:
    void operator() ( int /*iter*/ ) const {
        ASSERT( GetRoundingMode() == m_mode, "FPU control state has not been propagated." );
        ASSERT( GetSseMode() == m_sseMode, "SSE control state has not been propagated." );
    }

    RoundingModeCheckBody ( int mode, int sseMode ) : m_mode(mode), m_sseMode(sseMode) {}
};

void TestArenaFpuEnvPropagation( int id ) {
    // A TBB scheduler instance in a master thread captures the FPU control state
    // at the moment of its initialization and passes it to the workers toiling
    // on its behalf.
    for( int k = 0; k < NumSseModes; ++k ) {
        int sse_mode = SseModes[(k + id) % NumSseModes];
        SetSseMode( sse_mode );
        for( int i = 0; i < NumRoundingModes; ++i ) {
            int mode = RoundingModes[(i + id) % NumRoundingModes];
            SetRoundingMode( mode );
            // The new mode must be set before the TBB scheduler is initialized.
            tbb::task_scheduler_init init;
            tbb::parallel_for( 0, N, 1, RoundingModeCheckBody(mode, sse_mode) );
            ASSERT( GetRoundingMode() == mode, NULL );
        }
    }
}

#if __TBB_FP_CONTEXT
void TestArenaFpuEnvPersistence( int id ) {
    // Since the following loop uses auto-initialization, the scheduler instance
    // implicitly created by the first parallel_for invocation will persist
    // until the thread ends, and thus workers will use the mode set by the
    // first iteration.
    int captured_mode = RoundingModes[id % NumRoundingModes];
    int captured_sse_mode = SseModes[id % NumSseModes];
    for( int k = 0; k < NumSseModes; ++k ) {
        int sse_mode = SseModes[(k + id) % NumSseModes];
        SetSseMode( sse_mode );
        for( int i = 0; i < NumRoundingModes; ++i ) {
            int mode = RoundingModes[(i + id) % NumRoundingModes];
            SetRoundingMode( mode );
            tbb::parallel_for( 0, N, 1, RoundingModeCheckBody(captured_mode, captured_sse_mode) );
            ASSERT( GetRoundingMode() == mode, NULL );
        }
    }
}
#endif

class LauncherBody {
public:
    void operator() ( int id ) const {
        TestArenaFpuEnvPropagation( id );
#if __TBB_FP_CONTEXT
        TestArenaFpuEnvPersistence( id );
#endif
    }
};

void TestFpuEnvPropagation () {
    const int p = tbb::task_scheduler_init::default_num_threads();
    // The test should be run with oversubscription: create 4*p threads, but limit
    // the oversubscription on big machines (p>32) to 4*32+(p-32) threads.
    const int num_threads = p + (NumRoundingModes-1)*min(p,32);
    NativeParallelFor ( num_threads, LauncherBody() );
}

void TestCpuCtlEnvApi () {
    for( int k = 0; k < NumSseModes; ++k ) {
        SetSseMode( SseModes[k] );
        for( int i = 0; i < NumRoundingModes; ++i ) {
            SetRoundingMode( RoundingModes[i] );
            ASSERT( GetRoundingMode() == RoundingModes[i], NULL );
            ASSERT( GetSseMode() == SseModes[k], NULL );
        }
    }
}

#if __TBB_FP_CONTEXT
const int numModes = NumRoundingModes*NumSseModes;
const int numArenas = 4;
tbb::task_group_context *contexts[numModes];
// The extra numArenas entries hold each arena's default FP mode.
int roundingModes[numModes+numArenas];
int sseModes[numModes+numArenas];

class TestContextFpuEnvBody {
    int arenaNum;
    int mode;
    int depth;
public:
    TestContextFpuEnvBody( int _arenaNum, int _mode, int _depth = 0 ) : arenaNum(_arenaNum), mode(_mode), depth(_depth) {}
    void operator()( const tbb::blocked_range<int> &r ) const;
};

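// A "mode" in the helpers below is an index into the roundingModes/sseModes
// tables: indices [0, numModes) enumerate every (rounding mode, SSE mode)
// combination, and indices [numModes, numModes+numArenas) hold the arenas'
// default modes prepared in TestContextFpuEnv().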
inline void SetMode( int mode ) {
    SetRoundingMode( roundingModes[mode] );
    SetSseMode( sseModes[mode] );
}

inline void AssertMode( int mode ) {
    ASSERT( GetRoundingMode() == roundingModes[mode], "FPU control state has not been set correctly." );
    ASSERT( GetSseMode() == sseModes[mode], "SSE control state has not been set correctly." );
}

inline int SetNextMode( int mode, int step ) {
    const int nextMode = (mode+step)%numModes;
    SetMode( nextMode );
    return nextMode;
}

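// Recursively exercises FP settings propagation through task_group_contexts:
// at each nesting level the current mode is changed between context creation,
// fp-settings capture, and parallel_for invocation, to pin down exactly which
// point each kind of context (default, captured, isolated) takes its FP
// settings from.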
class TestContextFpuEnvTask : public tbb::task {
    int arenaNum;
    int mode;
    int depth;
#if __TBB_CPU_CTL_ENV_PRESENT
    static const int MAX_DEPTH = 3;
#else
    static const int MAX_DEPTH = 4;
#endif
public:
    TestContextFpuEnvTask( int _arenaNum, int _mode, int _depth = 0 ) : arenaNum(_arenaNum), mode(_mode), depth(_depth) {}
    tbb::task* execute() __TBB_override {
        AssertMode( mode );
        if ( depth < MAX_DEPTH ) {
            // Test the default context.
            const int newMode1 = SetNextMode( mode, depth+1 );
            tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, mode, depth+1 ) );
            AssertMode( newMode1 );

            // Test a user-created default context.
            const int newMode2 = SetNextMode( newMode1, depth+1 );
            tbb::task_group_context ctx1;
            const int newMode3 = SetNextMode( newMode2, depth+1 );
            tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, mode, depth+1 ), ctx1 );
            AssertMode( newMode3 );

            // Test a user context that captured the FPU control settings.
            const int newMode4 = SetNextMode( newMode3, depth+1 );
            // Capture newMode4.
            ctx1.capture_fp_settings();
            const int newMode5 = SetNextMode( newMode4, depth+1 );
            tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, newMode4, depth+1 ), ctx1 );
            AssertMode( newMode5 );

            // Test the capturing user context again to check that multiple captures work.
            const int newMode6 = SetNextMode( newMode5, depth+1 );
            // Capture newMode6.
            ctx1.capture_fp_settings();
            const int newMode7 = SetNextMode( newMode6, depth+1 );
            tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, newMode6, depth+1 ), ctx1 );
            AssertMode( newMode7 );

            // Test an isolated context. An isolated context should use the default FPU control settings.
            const int newMode8 = SetNextMode( newMode7, depth+1 );
            tbb::task_group_context ctx2( tbb::task_group_context::isolated );
            const int newMode9 = SetNextMode( newMode8, depth+1 );
            tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, numModes+arenaNum, depth+1 ), ctx2 );
            AssertMode( newMode9 );

            // The binding should not overwrite the captured FPU control settings.
            const int newMode10 = SetNextMode( newMode9, depth+1 );
            tbb::task_group_context ctx3;
            ctx3.capture_fp_settings();
            const int newMode11 = SetNextMode( newMode10, depth+1 );
            tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, newMode10, depth+1 ), ctx3 );
            AssertMode( newMode11 );

            // Restore initial mode since user code in tbb::task::execute should not change FPU settings.
            SetMode( mode );
        }

        return NULL;
    }
};

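// The parallel range is (0, numModes+1): iterations [0, numModes) re-spawn the
// task under each of the prepared fp-capturing contexts, while the extra
// iteration numModes re-spawns it under a default context, whose FP settings
// must be inherited from the current one.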
void TestContextFpuEnvBody::operator()( const tbb::blocked_range<int> &r ) const {
    AssertMode( mode );

    const int newMode = SetNextMode( mode, depth+2 );

    int end = r.end();
    if ( end-1 == numModes ) {
        // For a default context our mode should be inherited.
        tbb::task::spawn_root_and_wait(
            *new( tbb::task::allocate_root() ) TestContextFpuEnvTask( arenaNum, mode, depth ) );
        AssertMode( newMode );
        end--;
    }
    for ( int i=r.begin(); i<end; ++i ) {
        tbb::task::spawn_root_and_wait(
            *new( tbb::task::allocate_root(*contexts[i]) ) TestContextFpuEnvTask( arenaNum, i, depth ) );
        AssertMode( newMode );
    }

    // Restore the initial mode: user code executed by the scheduler should not change FPU settings.
    SetMode( mode );
}

class TestContextFpuEnvNativeLoopBody {
public:
    void operator() ( int arenaNum ) const {
        SetMode(numModes+arenaNum);
        tbb::task_scheduler_init init;
        tbb::task::spawn_root_and_wait( *new (tbb::task::allocate_root() ) TestContextFpuEnvTask( arenaNum, numModes+arenaNum ) );
    }
};

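// Exception-safety variant: FP settings captured by a context must be applied
// inside the parallel region and correctly restored in the thread that catches
// the exception, even when the exception unwinds through parallel_for.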
#if TBB_USE_EXCEPTIONS
const int NUM_ITERS = 1000;
class TestContextFpuEnvEhBody {
    int mode;
    int eh_iter;
    int depth;
public:
    TestContextFpuEnvEhBody( int _mode, int _eh_iter, int _depth = 0 ) : mode(_mode), eh_iter(_eh_iter), depth(_depth) {}
    void operator()( const tbb::blocked_range<int> &r ) const {
        AssertMode( mode );
        if ( depth < 1 ) {
            const int newMode1 = SetNextMode( mode, 1 );
            tbb::task_group_context ctx;
            ctx.capture_fp_settings();
            const int newMode2 = SetNextMode( newMode1, 1 );
            try {
                tbb::parallel_for( tbb::blocked_range<int>(0, NUM_ITERS), TestContextFpuEnvEhBody(newMode1,rand()%NUM_ITERS,1), tbb::simple_partitioner(), ctx );
            } catch (...) {
                AssertMode( newMode2 );
                if ( r.begin() == eh_iter ) throw;
            }
            AssertMode( newMode2 );
            SetMode( mode );
        } else if ( r.begin() == eh_iter ) throw 0;
    }
};

class TestContextFpuEnvEhNativeLoopBody {
public:
    void operator() ( int arenaNum ) const {
        SetMode( arenaNum%numModes );
        try {
            tbb::parallel_for( tbb::blocked_range<int>(0, NUM_ITERS), TestContextFpuEnvEhBody((arenaNum+1)%numModes,rand()%NUM_ITERS),
                tbb::simple_partitioner(), *contexts[(arenaNum+1)%numModes] );
            ASSERT( false, "parallel_for has not thrown an exception." );
        } catch (...) {
            AssertMode( arenaNum%numModes );
        }
    }
};
#endif /* TBB_USE_EXCEPTIONS */

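// Prepares one fp-capturing isolated context per (rounding mode, SSE mode)
// combination plus a default mode for each arena, then runs the context and
// exception tests in several concurrent arenas.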
void TestContextFpuEnv() {
    // Prepare contexts' fp modes.
    for ( int i = 0, modeNum = 0; i < NumRoundingModes; ++i ) {
        const int roundingMode = RoundingModes[i];
        SetRoundingMode( roundingMode );
        for( int j = 0; j < NumSseModes; ++j, ++modeNum ) {
            const int sseMode = SseModes[j];
            SetSseMode( sseMode );

            contexts[modeNum] = new tbb::task_group_context( tbb::task_group_context::isolated,
                tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings );
            roundingModes[modeNum] = roundingMode;
            sseModes[modeNum] = sseMode;
        }
    }
    // Prepare arenas' fp modes.
    for ( int arenaNum = 0; arenaNum < numArenas; ++arenaNum ) {
        roundingModes[numModes+arenaNum] = roundingModes[arenaNum%numModes];
        sseModes[numModes+arenaNum] = sseModes[arenaNum%numModes];
    }
    NativeParallelFor( numArenas, TestContextFpuEnvNativeLoopBody() );
#if TBB_USE_EXCEPTIONS
    NativeParallelFor( numArenas, TestContextFpuEnvEhNativeLoopBody() );
#endif
    for ( int modeNum = 0; modeNum < numModes; ++modeNum )
        delete contexts[modeNum];
}

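// A context constructed at global scope (before main) is bound to the
// scheduler lazily, on first use. The test records which native thread first
// allocated a root task in the global isolated context and expects every task
// executed in that context to observe that thread's FP mode.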
tbb::task_group_context glbIsolatedCtx( tbb::task_group_context::isolated );
int glbIsolatedCtxMode = -1;

struct TestGlobalIsolatedContextTask : public tbb::task {
    tbb::task* execute() __TBB_override {
        AssertFPMode( glbIsolatedCtxMode );
        return NULL;
    }
};

#include "tbb/mutex.h"

struct TestGlobalIsolatedContextNativeLoopBody {
    void operator()( int threadId ) const {
        FPModeContext fpGuard( threadId );
        static tbb::mutex rootAllocMutex;
        rootAllocMutex.lock();
        if ( glbIsolatedCtxMode == -1 )
            glbIsolatedCtxMode = threadId;
        tbb::task &root = *new (tbb::task::allocate_root( glbIsolatedCtx )) TestGlobalIsolatedContextTask();
        rootAllocMutex.unlock();
        tbb::task::spawn_root_and_wait( root );
    }
};

void TestGlobalIsolatedContext() {
    ASSERT( numArenas > 1, NULL );
    NativeParallelFor( numArenas, TestGlobalIsolatedContextNativeLoopBody() );
}
#endif /* __TBB_FP_CONTEXT */

int TestMain () {
    TestCpuCtlEnvApi();
    TestFpuEnvPropagation();
    CheckNoSseStatusPropagation();
#if __TBB_FP_CONTEXT
    TestContextFpuEnv();
    TestGlobalIsolatedContext();
#endif
    return Harness::Done;
}