| 1 | /* |
| 2 | Copyright (c) 2005-2019 Intel Corporation |
| 3 | |
| 4 | Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | you may not use this file except in compliance with the License. |
| 6 | You may obtain a copy of the License at |
| 7 | |
| 8 | http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | |
| 10 | Unless required by applicable law or agreed to in writing, software |
| 11 | distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | See the License for the specific language governing permissions and |
| 14 | limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | /** This test checks the automatic propagation of master thread FPU settings |
| 18 | into the worker threads. **/ |
| 19 | |
| 20 | #include "harness_fp.h" |
| 21 | #include "harness.h" |
| 22 | #define private public |
| 23 | #include "tbb/task.h" |
| 24 | #undef private |
| 25 | #include "tbb/parallel_for.h" |
| 26 | #include "tbb/task_scheduler_init.h" |
| 27 | |
// Iteration count for the propagation-checking parallel_for loops: large
// enough that worker threads actually participate and get checked.
const int N = 500000;
| 29 | |
| 30 | #if ( __TBB_x86_32 || __TBB_x86_64 ) && __TBB_CPU_CTL_ENV_PRESENT && !defined(__TBB_WIN32_USE_CL_BUILTINS) |
| 31 | #include "harness_barrier.h" |
| 32 | |
// parallel_for body run once per thread; verifies that the SSE *status*
// flags raised by the master in CheckNoSseStatusPropagation were NOT
// copied into this thread's MXCSR.
class CheckNoSseStatusPropagationBody : public NoAssign {
    Harness::SpinBarrier &barrier;
public:
    CheckNoSseStatusPropagationBody( Harness::SpinBarrier &_barrier ) : barrier(_barrier) {}
    void operator()( const tbb::blocked_range<int>& ) const {
        // Rendezvous so each of the num_threads range chunks is being executed
        // by a distinct thread before the check is made.
        barrier.wait();
        // cpu_ctl_env internals are reachable because of the
        // '#define private public' hack around the tbb/task.h include above.
        tbb::internal::cpu_ctl_env ctl;
        ctl.get_env();
        ASSERT( (ctl.mxcsr & SSE_STATUS_MASK) == 0, "FPU control status bits have been propagated." );
    }
};
| 44 | |
// Raises all SSE status flags in the master thread's MXCSR, creates a
// scheduler, and checks inside a parallel region that workers did not
// inherit them: TBB must propagate only *control* settings (rounding mode,
// exception masks), never the sticky *status* bits.
void CheckNoSseStatusPropagation() {
    tbb::internal::cpu_ctl_env ctl;
    ctl.get_env();
    ctl.mxcsr |= SSE_STATUS_MASK;   // set every SSE status flag in this thread
    ctl.set_env();
    const int num_threads = tbb::task_scheduler_init::default_num_threads();
    Harness::SpinBarrier barrier(num_threads);
    // The scheduler is initialized after the status bits were raised, so the
    // FPU state it captures includes them; workers still must not see them.
    tbb::task_scheduler_init init(num_threads);
    tbb::parallel_for( tbb::blocked_range<int>(0, num_threads), CheckNoSseStatusPropagationBody(barrier) );
    // Clear the status flags again so later tests start from a clean state.
    ctl.mxcsr &= ~SSE_STATUS_MASK;
    ctl.set_env();
}
#else /* Other archs */
// MXCSR status bits only exist on x86/x86-64 with a CPU control environment;
// elsewhere the check is a no-op.
void CheckNoSseStatusPropagation() {}
#endif /* Other archs */
| 60 | |
| 61 | class RoundingModeCheckBody { |
| 62 | int m_mode; |
| 63 | int m_sseMode; |
| 64 | public: |
| 65 | void operator() ( int /*iter*/ ) const { |
| 66 | ASSERT( GetRoundingMode() == m_mode, "FPU control state has not been propagated." ); |
| 67 | ASSERT( GetSseMode() == m_sseMode, "SSE control state has not been propagated." ); |
| 68 | } |
| 69 | |
| 70 | RoundingModeCheckBody ( int mode, int sseMode ) : m_mode(mode), m_sseMode(sseMode) {} |
| 71 | }; |
| 72 | |
// Checks, for every combination of SSE and rounding modes, that the FPU
// control state active in the master thread when a scheduler instance is
// created gets propagated to the workers executing its tasks.
void TestArenaFpuEnvPropagation( int id ) {
    // TBB scheduler instance in a master thread captures the FPU control state
    // at the moment of its initialization and passes it to the workers toiling
    // on its behalf.
    for( int k = 0; k < NumSseModes; ++k ) {
        // Offsetting by the thread id makes concurrently running threads
        // exercise different mode combinations at the same time.
        int sse_mode = SseModes[(k + id) % NumSseModes];
        SetSseMode( sse_mode );
        for( int i = 0; i < NumRoundingModes; ++i ) {
            int mode = RoundingModes[(i + id) % NumRoundingModes];
            SetRoundingMode( mode );
            // New mode must be set before TBB scheduler is initialized
            tbb::task_scheduler_init init;
            tbb::parallel_for( 0, N, 1, RoundingModeCheckBody(mode, sse_mode) );
            // The master's own rounding mode must survive the parallel region.
            ASSERT( GetRoundingMode() == mode, NULL );
        }
    }
}
| 90 | |
| 91 | #if __TBB_FP_CONTEXT |
// Checks that the FP settings captured at (implicit) scheduler creation stick
// for the scheduler's whole lifetime, regardless of later mode changes in the
// master thread.
void TestArenaFpuEnvPersistence( int id ) {
    // Since the following loop uses auto-initialization, the scheduler instance
    // implicitly created by the first parallel_for invocation will persist
    // until the thread ends, and thus workers will use the mode set by the
    // first iteration.
    int captured_mode = RoundingModes[id % NumRoundingModes];
    int captured_sse_mode = SseModes[id % NumSseModes];
    for( int k = 0; k < NumSseModes; ++k ) {
        int sse_mode = SseModes[(k + id) % NumSseModes];
        SetSseMode( sse_mode );
        for( int i = 0; i < NumRoundingModes; ++i ) {
            int mode = RoundingModes[(i + id) % NumRoundingModes];
            SetRoundingMode( mode );
            // Workers must still observe the modes of the first iteration
            // (captured_*), not the master's current ones.
            tbb::parallel_for( 0, N, 1, RoundingModeCheckBody(captured_mode, captured_sse_mode) );
            ASSERT( GetRoundingMode() == mode, NULL );
        }
    }
}
| 110 | #endif |
| 111 | |
// Per-thread body for NativeParallelFor: runs the propagation test (and,
// where FP contexts are supported, the persistence test), using the thread
// index to vary the mode combinations across threads.
class LauncherBody {
public:
    void operator() ( int id ) const {
        TestArenaFpuEnvPropagation( id );
#if __TBB_FP_CONTEXT
        TestArenaFpuEnvPersistence( id );
#endif
    }
};
| 121 | |
// Launches the per-arena FPU propagation tests from many native threads so
// that arenas compete for workers (oversubscription).
void TestFpuEnvPropagation () {
    const int p = tbb::task_scheduler_init::default_num_threads();
    // The test should be run in an oversubscription mode. So create 4*p threads but
    // limit the oversubscription for big machines (p>32) with 4*32+(p-32) threads.
    // (The factor 4 corresponds to NumRoundingModes.)
    const int num_threads = p + (NumRoundingModes-1)*min(p,32);
    NativeParallelFor ( num_threads, LauncherBody() );
}
| 129 | |
| 130 | void TestCpuCtlEnvApi () { |
| 131 | for( int k = 0; k < NumSseModes; ++k ) { |
| 132 | SetSseMode( SseModes[k] ); |
| 133 | for( int i = 0; i < NumRoundingModes; ++i ) { |
| 134 | SetRoundingMode( RoundingModes[i] ); |
| 135 | ASSERT( GetRoundingMode() == RoundingModes[i], NULL ); |
| 136 | ASSERT( GetSseMode() == SseModes[k], NULL ); |
| 137 | } |
| 138 | } |
| 139 | } |
| 140 | |
| 141 | #if __TBB_FP_CONTEXT |
// One isolated context per distinct (rounding mode, SSE mode) combination.
const int numModes = NumRoundingModes*NumSseModes;
const int numArenas = 4;
tbb::task_group_context *contexts[numModes];
// Mode tables: entries [0, numModes) describe the FP settings captured by
// contexts[]; the extra numArenas entries hold each arena's default
// (master-thread) mode, filled in by TestContextFpuEnv.
int roundingModes[numModes+numArenas];
int sseModes[numModes+numArenas];
| 148 | |
// Body of the parallel loops used in the context test; recursively spawns
// root tasks under various contexts. Defined after TestContextFpuEnvTask,
// which it references.
class TestContextFpuEnvBody {
    int arenaNum;   // index of the native thread/arena this body runs in
    int mode;       // mode-table index this body expects to run with
    int depth;      // recursion depth, bounded by TestContextFpuEnvTask::MAX_DEPTH
public:
    TestContextFpuEnvBody( int _arenaNum, int _mode, int _depth = 0 ) : arenaNum(_arenaNum), mode(_mode), depth(_depth) {}
    void operator()( const tbb::blocked_range<int> &r ) const;
};
| 157 | |
| 158 | inline void SetMode( int mode ) { |
| 159 | SetRoundingMode( roundingModes[mode] ); |
| 160 | SetSseMode( sseModes[mode] ); |
| 161 | } |
| 162 | |
| 163 | inline void AssertMode( int mode ) { |
| 164 | ASSERT( GetRoundingMode() == roundingModes[mode], "FPU control state has not been set correctly." ); |
| 165 | ASSERT( GetSseMode() == sseModes[mode], "SSE control state has not been set correctly." ); |
| 166 | } |
| 167 | |
| 168 | inline int SetNextMode( int mode, int step ) { |
| 169 | const int nextMode = (mode+step)%numModes; |
| 170 | SetMode( nextMode ); |
| 171 | return nextMode; |
| 172 | } |
| 173 | |
// Root task exercising every way a task_group_context can acquire FP
// settings: the implicit default context, a user-created context, explicit
// capture_fp_settings() (including re-capture), and an isolated context.
// Recurses through TestContextFpuEnvBody up to MAX_DEPTH.
class TestContextFpuEnvTask : public tbb::task {
    int arenaNum;   // index of the native thread/arena running this task tree
    int mode;       // mode-table index this task must execute with
    int depth;      // current recursion depth
#if __TBB_CPU_CTL_ENV_PRESENT
    static const int MAX_DEPTH = 3;
#else
    static const int MAX_DEPTH = 4;
#endif
public:
    TestContextFpuEnvTask( int _arenaNum, int _mode, int _depth = 0 ) : arenaNum(_arenaNum), mode(_mode), depth(_depth) {}
    tbb::task* execute() __TBB_override {
        AssertMode( mode );
        if ( depth < MAX_DEPTH ) {
            // The mode is changed before each parallel_for below so the
            // AssertMode after the call verifies that the algorithm restored
            // the caller's settings on return.

            // Test default context.
            const int newMode1 = SetNextMode( mode, depth+1 );
            tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, mode, depth+1 ) );
            AssertMode( newMode1 );

            // Test user default context.
            const int newMode2 = SetNextMode( newMode1, depth+1 );
            tbb::task_group_context ctx1;
            const int newMode3 = SetNextMode( newMode2, depth+1 );
            tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, mode, depth+1 ), ctx1 );
            AssertMode( newMode3 );

            // Test user context which captured FPU control settings.
            const int newMode4 = SetNextMode( newMode3, depth+1 );
            // Capture newMode4
            ctx1.capture_fp_settings();
            const int newMode5 = SetNextMode( newMode4, depth+1 );
            tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, newMode4, depth+1 ), ctx1 );
            AssertMode( newMode5 );

            // And again test user context which captured FPU control settings to check multiple captures.
            const int newMode6 = SetNextMode( newMode5, depth+1 );
            // Capture newMode6
            ctx1.capture_fp_settings();
            const int newMode7 = SetNextMode( newMode6, depth+1 );
            tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, newMode6, depth+1 ), ctx1 );
            AssertMode( newMode7 );

            // Test an isolated context. The isolated context should use default FPU control settings
            // (the arena's default mode, table slot numModes+arenaNum).
            const int newMode8 = SetNextMode( newMode7, depth+1 );
            tbb::task_group_context ctx2( tbb::task_group_context::isolated );
            const int newMode9 = SetNextMode( newMode8, depth+1 );
            tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, numModes+arenaNum, depth+1 ), ctx2 );
            AssertMode( newMode9 );

            // The binding should not overwrite captured FPU control settings.
            const int newMode10 = SetNextMode( newMode9, depth+1 );
            tbb::task_group_context ctx3;
            ctx3.capture_fp_settings();
            const int newMode11 = SetNextMode( newMode10, depth+1 );
            tbb::parallel_for( tbb::blocked_range<int>(0, numModes+1), TestContextFpuEnvBody( arenaNum, newMode10, depth+1 ), ctx3 );
            AssertMode( newMode11 );

            // Restore initial mode since user code in tbb::task::execute should not change FPU settings.
            SetMode( mode );
        }

        return NULL;
    }
};
| 238 | |
// Spawns one root task per mode index in the given subrange: the sentinel
// index numModes (last chunk only) is run under a default context and must
// therefore see this body's entry mode; all other indices use the matching
// pre-captured isolated context from contexts[].
void TestContextFpuEnvBody::operator()( const tbb::blocked_range<int> &r ) const {
    AssertMode( mode );

    // Switch modes before spawning so the AssertMode after each wait confirms
    // that this thread's settings survive the child task trees.
    const int newMode = SetNextMode( mode, depth+2 );

    int end = r.end();
    if ( end-1 == numModes ) {
        // For a default context our mode should be inherited.
        tbb::task::spawn_root_and_wait(
            *new( tbb::task::allocate_root() ) TestContextFpuEnvTask( arenaNum, mode, depth ) );
        AssertMode( newMode );
        end--;
    }
    for ( int i=r.begin(); i<end; ++i ) {
        // contexts[i] carries the FP settings of mode-table entry i.
        tbb::task::spawn_root_and_wait(
            *new( tbb::task::allocate_root(*contexts[i]) ) TestContextFpuEnvTask( arenaNum, i, depth ) );
        AssertMode( newMode );
    }

    // Restore initial mode since user code in tbb::task::execute should not change FPU settings.
    SetMode( mode );
}
| 261 | |
// Per-arena driver: installs the arena's default FP mode, creates the
// scheduler while that mode is active (so it becomes the arena's default),
// and runs the recursive context test tree.
class TestContextFpuEnvNativeLoopBody {
public:
    void operator() ( int arenaNum ) const {
        // numModes+arenaNum is the mode-table slot reserved for this arena.
        SetMode(numModes+arenaNum);
        tbb::task_scheduler_init init;
        tbb::task::spawn_root_and_wait( *new (tbb::task::allocate_root() ) TestContextFpuEnvTask( arenaNum, numModes+arenaNum ) );
    }
};
| 270 | |
| 271 | #if TBB_USE_EXCEPTIONS |
const int NUM_ITERS = 1000;
// Verifies that the caller's FP settings are restored when a parallel_for
// exits via an exception instead of completing normally.
class TestContextFpuEnvEhBody {
    int mode;      // mode-table index this body expects on entry
    int eh_iter;   // iteration index designated to (re)throw
    int depth;     // 0 for the outer loop, 1 for the nested (leaf) loop
public:
    TestContextFpuEnvEhBody( int _mode, int _eh_iter, int _depth = 0 ) : mode(_mode), eh_iter(_eh_iter), depth(_depth) {}
    void operator()( const tbb::blocked_range<int> &r ) const {
        AssertMode( mode );
        if ( depth < 1 ) {
            const int newMode1 = SetNextMode( mode, 1 );
            tbb::task_group_context ctx;
            // The nested parallel_for runs with newMode1, captured here.
            ctx.capture_fp_settings();
            const int newMode2 = SetNextMode( newMode1, 1 );
            try {
                tbb::parallel_for( tbb::blocked_range<int>(0, NUM_ITERS), TestContextFpuEnvEhBody(newMode1,rand()%NUM_ITERS,1), tbb::simple_partitioner(), ctx );
            } catch (...) {
                // Even on the exceptional path this thread's mode must be intact.
                AssertMode( newMode2 );
                // Only the designated iteration propagates the exception upward.
                if ( r.begin() == eh_iter ) throw;
            }
            AssertMode( newMode2 );
            SetMode( mode );
        } else if ( r.begin() == eh_iter ) throw 0;   // leaf level: throw from the chosen iteration
    }
};
| 297 | |
// Per-thread driver for the exception test: runs a parallel_for that is
// expected to throw (its body rethrows from a designated iteration) under a
// context with different FP settings, and checks the thread's own settings
// are intact afterwards.
class TestContextFpuEnvEhNativeLoopBody {
public:
    void operator() ( int arenaNum ) const {
        SetMode( arenaNum%numModes );
        try {
            tbb::parallel_for( tbb::blocked_range<int>(0, NUM_ITERS), TestContextFpuEnvEhBody((arenaNum+1)%numModes,rand()%NUM_ITERS),
                               tbb::simple_partitioner(), *contexts[(arenaNum+1)%numModes] );
            ASSERT( false, "parallel_for has not thrown an exception." );
        } catch (...) {
            // This thread's FP settings must be restored after the exception.
            AssertMode( arenaNum%numModes );
        }
    }
};
| 311 | #endif /* TBB_USE_EXCEPTIONS */ |
| 312 | |
// Builds an isolated, FP-settings-capturing context for every combination of
// rounding and SSE modes, fills the mode tables (including one default-mode
// slot per arena), then runs the context propagation test -- and, where
// exceptions are enabled, the exception-path test -- from numArenas threads.
void TestContextFpuEnv() {
    // Prepare contexts' fp modes.
    for ( int i = 0, modeNum = 0; i < NumRoundingModes; ++i ) {
        const int roundingMode = RoundingModes[i];
        SetRoundingMode( roundingMode );
        for( int j = 0; j < NumSseModes; ++j, ++modeNum ) {
            const int sseMode = SseModes[j];
            SetSseMode( sseMode );

            // The fp_settings trait makes the context capture the FP settings
            // that are active at its construction.
            contexts[modeNum] = new tbb::task_group_context( tbb::task_group_context::isolated,
                    tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings );
            roundingModes[modeNum] = roundingMode;
            sseModes[modeNum] = sseMode;
        }
    }
    // Prepare arenas' fp modes.
    for ( int arenaNum = 0; arenaNum < numArenas; ++arenaNum ) {
        roundingModes[numModes+arenaNum] = roundingModes[arenaNum%numModes];
        sseModes[numModes+arenaNum] = sseModes[arenaNum%numModes];
    }
    NativeParallelFor( numArenas, TestContextFpuEnvNativeLoopBody() );
#if TBB_USE_EXCEPTIONS
    NativeParallelFor( numArenas, TestContextFpuEnvEhNativeLoopBody() );
#endif
    for ( int modeNum = 0; modeNum < numModes; ++modeNum )
        delete contexts[modeNum];
}
| 340 | |
// File-scope isolated context shared by all threads of the global-context test.
tbb::task_group_context glbIsolatedCtx( tbb::task_group_context::isolated );
// Id of the thread whose FP mode the context is expected to have latched;
// -1 until the first thread allocates a root task under the context.
int glbIsolatedCtxMode = -1;
| 343 | |
// Task bound to glbIsolatedCtx: regardless of which thread spawned it, it
// must observe the FP mode of the first thread that attached a root to the
// shared isolated context.
struct TestGlobalIsolatedContextTask : public tbb::task {
    tbb::task* execute() __TBB_override {
        AssertFPMode( glbIsolatedCtxMode );
        return NULL;
    }
};
| 350 | |
| 351 | #include "tbb/mutex.h" |
| 352 | |
struct TestGlobalIsolatedContextNativeLoopBody {
    void operator()( int threadId ) const {
        // Install a per-thread FP mode for the duration of this scope.
        FPModeContext fpGuard( threadId );
        static tbb::mutex rootAllocMutex;
        // Serialize root allocation so recording the winning thread id and
        // binding the context to that thread's FP mode happen atomically.
        rootAllocMutex.lock();
        if ( glbIsolatedCtxMode == -1 )
            glbIsolatedCtxMode = threadId;
        tbb::task &root = *new (tbb::task::allocate_root( glbIsolatedCtx )) TestGlobalIsolatedContextTask();
        rootAllocMutex.unlock();
        tbb::task::spawn_root_and_wait( root );
    }
};
| 365 | |
| 366 | void TestGlobalIsolatedContext() { |
| 367 | ASSERT( numArenas > 1, NULL ); |
| 368 | NativeParallelFor( numArenas, TestGlobalIsolatedContextNativeLoopBody() ); |
| 369 | } |
| 370 | #endif /* __TBB_FP_CONTEXT */ |
| 371 | |
// Test driver: exercises the mode helpers, master-to-worker FPU propagation,
// SSE status-bit isolation, and (when supported) FP settings carried by
// task_group_contexts.
int TestMain () {
    TestCpuCtlEnvApi();
    TestFpuEnvPropagation();
    CheckNoSseStatusPropagation();
#if __TBB_FP_CONTEXT
    TestContextFpuEnv();
    TestGlobalIsolatedContext();
#endif
    return Harness::Done;
}
| 382 | |