| 1 | // Licensed to the .NET Foundation under one or more agreements. |
| 2 | // The .NET Foundation licenses this file to you under the MIT license. |
| 3 | // See the LICENSE file in the project root for more information. |
| 4 | //***************************************************************************** |
| 5 | // File: Canary.cpp |
| 6 | // |
| 7 | |
| 8 | // |
| 9 | // Canary for debugger helper thread. This will sniff out if it's safe to take locks. |
| 10 | // |
| 11 | //***************************************************************************** |
| 12 | |
| 13 | #include "stdafx.h" |
| 14 | |
| 15 | |
| 16 | //----------------------------------------------------------------------------- |
| 17 | // Ctor for HelperCanary class |
| 18 | //----------------------------------------------------------------------------- |
| 19 | HelperCanary::HelperCanary() |
| 20 | { |
| 21 | m_hCanaryThread = NULL; |
| 22 | m_CanaryThreadId = 0; |
| 23 | m_RequestCounter = 0; |
| 24 | m_AnswerCounter = 0; |
| 25 | m_fStop = false; |
| 26 | |
| 27 | m_fCachedValid = false; |
| 28 | m_fCachedAnswer = false; |
| 29 | m_initialized = false; |
| 30 | } |
| 31 | |
| 32 | //----------------------------------------------------------------------------- |
| 33 | // Dtor for class |
| 34 | //----------------------------------------------------------------------------- |
| 35 | HelperCanary::~HelperCanary() |
| 36 | { |
| 37 | // Since we're deleting this memory, we need to kill the canary thread. |
| 38 | m_fStop = true; |
| 39 | SetEvent(m_hPingEvent); |
| 40 | |
| 41 | // m_hPingEvent dtor will close handle |
| 42 | WaitForSingleObject(m_hCanaryThread, INFINITE); |
| 43 | } |
| 44 | |
| 45 | //----------------------------------------------------------------------------- |
| 46 | // Clear the cached value for AreLocksAvailable(); |
| 47 | //----------------------------------------------------------------------------- |
| 48 | void HelperCanary::ClearCache() |
| 49 | { |
| 50 | _ASSERTE(ThisIsHelperThreadWorker()); |
| 51 | m_fCachedValid = false; |
| 52 | } |
| 53 | |
| 54 | //----------------------------------------------------------------------------- |
| 55 | // The helper thread can call this to determine if it can safely take a certain |
| 56 | // set of locks (mainly the heap lock(s)). The canary thread will go off and |
| 57 | // try and take these and report back to the helper w/o ever blocking the |
| 58 | // helper. |
| 59 | // |
| 60 | // Returns 'true' if it's safe for helper to take locks; else false. |
| 61 | // We err on the side of safety (returning false). |
| 62 | //----------------------------------------------------------------------------- |
| 63 | bool HelperCanary::AreLocksAvailable() |
| 64 | { |
| 65 | // If we're not on the helper thread, then we're guaranteed safe. |
| 66 | // We check this to support MaybeHelperThread code. |
| 67 | if (!ThisIsHelperThreadWorker()) |
| 68 | { |
| 69 | return true; |
| 70 | } |
| 71 | |
| 72 | if (m_fCachedValid) |
| 73 | { |
| 74 | return m_fCachedAnswer; |
| 75 | } |
| 76 | |
| 77 | // Cache the answer. |
| 78 | m_fCachedAnswer = AreLocksAvailableWorker(); |
| 79 | m_fCachedValid = true; |
| 80 | |
| 81 | #ifdef _DEBUG |
| 82 | // For managed-only debugging, we should always be safe. |
| 83 | if (!g_pRCThread->GetDCB()->m_rightSideIsWin32Debugger) |
| 84 | { |
| 85 | _ASSERTE(m_fCachedAnswer || !"Canary returned false in Managed-debugger" ); |
| 86 | } |
| 87 | |
| 88 | // For debug, nice to be able to enable an assert that tells us if this situation is actually happening. |
| 89 | if (!m_fCachedAnswer) |
| 90 | { |
| 91 | static BOOL shouldBreak = -1; |
| 92 | if (shouldBreak == -1) |
| 93 | { |
| 94 | shouldBreak = UnsafeGetConfigDWORD(CLRConfig::INTERNAL_DbgBreakIfLocksUnavailable); |
| 95 | } |
| 96 | if (shouldBreak) |
| 97 | { |
| 98 | _ASSERTE(!"Potential deadlock detected.\nLocks that the helper thread may need are currently held by other threads." ); |
| 99 | } |
| 100 | } |
| 101 | #endif // _DEBUG |
| 102 | |
| 103 | return m_fCachedAnswer; |
| 104 | } |
| 105 | |
| 106 | //----------------------------------------------------------------------------- |
| 107 | // Creates the canary thread and signaling events. |
| 108 | //----------------------------------------------------------------------------- |
| 109 | void HelperCanary::Init() |
| 110 | { |
| 111 | // You can only run the init code once. The debugger attempts to lazy-init |
| 112 | // the canary at several points but if the canary is already inited then |
| 113 | // we just eagerly return. See issue 841005 for more details. |
| 114 | if(m_initialized) |
| 115 | { |
| 116 | return; |
| 117 | } |
| 118 | else |
| 119 | { |
| 120 | m_initialized = true; |
| 121 | } |
| 122 | |
| 123 | m_hPingEvent = WszCreateEvent(NULL, (BOOL) kAutoResetEvent, FALSE, NULL); |
| 124 | if (m_hPingEvent == NULL) |
| 125 | { |
| 126 | STRESS_LOG1(LF_CORDB, LL_ALWAYS, "Canary failed to create ping event. gle=%d\n" , GetLastError()); |
| 127 | // in the past if we failed to start the thread we just assumed it was unsafe |
| 128 | // so I am preserving that behavior. However I am going to assert that this |
| 129 | // doesn't really happen |
| 130 | _ASSERTE(!"Canary failed to create ping event" ); |
| 131 | return; |
| 132 | } |
| 133 | |
| 134 | m_hWaitEvent = WszCreateEvent(NULL, (BOOL) kManualResetEvent, FALSE, NULL); |
| 135 | if (m_hWaitEvent == NULL) |
| 136 | { |
| 137 | STRESS_LOG1(LF_CORDB, LL_ALWAYS, "Canary failed to create wait event. gle=%d\n" , GetLastError()); |
| 138 | // in the past if we failed to start the thread we just assumed it was unsafe |
| 139 | // so I am preserving that behavior. However I am going to assert that this |
| 140 | // doesn't really happen |
| 141 | _ASSERTE(!"Canary failed to create wait event" ); |
| 142 | return; |
| 143 | } |
| 144 | |
| 145 | // Spin up the canary. This will call dllmain, but that's ok because it just |
| 146 | // degenerates to our timeout case. |
| 147 | const DWORD flags = CREATE_SUSPENDED; |
| 148 | m_hCanaryThread = CreateThread(NULL, 0, |
| 149 | HelperCanary::ThreadProc, this, |
| 150 | flags, &m_CanaryThreadId); |
| 151 | |
| 152 | // in the past if we failed to start the thread we just assumed it was unsafe |
| 153 | // so I am preserving that behavior. However I am going to assert that this |
| 154 | // doesn't really happen |
| 155 | if(m_hCanaryThread == NULL) |
| 156 | { |
| 157 | _ASSERTE(!"CreateThread() failed to create Canary thread" ); |
| 158 | return; |
| 159 | } |
| 160 | |
| 161 | // Capture the Canary thread's TID so that the RS can mark it as a can't-stop region. |
| 162 | // This is essential so that the RS doesn't view it as some external thread to be suspended when we hit |
| 163 | // debug events. |
| 164 | _ASSERTE(g_pRCThread != NULL); |
| 165 | g_pRCThread->GetDCB()->m_CanaryThreadId = m_CanaryThreadId; |
| 166 | |
| 167 | ResumeThread(m_hCanaryThread); |
| 168 | } |
| 169 | |
| 170 | |
| 171 | //----------------------------------------------------------------------------- |
| 172 | // Does real work for AreLocksAvailable(), minus caching. |
| 173 | //----------------------------------------------------------------------------- |
| 174 | bool HelperCanary::AreLocksAvailableWorker() |
| 175 | { |
| 176 | #if _DEBUG |
| 177 | // For debugging, allow a way to force the canary to fail, and thus test our |
| 178 | // failure paths. |
| 179 | static BOOL fShortcut= -1; |
| 180 | if (fShortcut == -1) |
| 181 | { |
| 182 | fShortcut = UnsafeGetConfigDWORD(CLRConfig::INTERNAL_DbgShortcutCanary); |
| 183 | } |
| 184 | if (fShortcut == 1) |
| 185 | { |
| 186 | return false; |
| 187 | } |
| 188 | if (fShortcut == 2) |
| 189 | { |
| 190 | return true; |
| 191 | } |
| 192 | #endif |
| 193 | |
| 194 | // We used to do lazy init but that is dangerous... CreateThread |
| 195 | // allocates some memory which can block on a lock, exactly the |
| 196 | // situation we are attempting to detect and not block on. |
| 197 | // Instead we spin up the canary in advance and if that failed then |
| 198 | // assume unsafe |
| 199 | if(m_CanaryThreadId == 0) |
| 200 | { |
| 201 | _ASSERTE(!"We shouldn't be lazy initing the canary anymore" ); |
| 202 | return false; |
| 203 | } |
| 204 | |
| 205 | // Canary will take the locks of interest and then set the Answer counter equal to our request counter. |
| 206 | m_RequestCounter = m_RequestCounter + 1; |
| 207 | ResetEvent(m_hWaitEvent); |
| 208 | SetEvent(m_hPingEvent); |
| 209 | |
| 210 | // Spin waiting for answer. If canary gets back to us, then the locks must be free and so it's safe for helper-thread. |
| 211 | // If we timeout, then we err on the side of safety and assume canary blocked on a lock and so it's not safe |
| 212 | // for the helper thread to take those locks. |
| 213 | // We explicitly have a simple spin-wait instead of using win32 events because we want something simple and |
| 214 | // provably correct. Since we already need the spin-wait for the counters, adding an extra win32 event |
| 215 | // to get rid of the sleep would be additional complexity and race windows without a clear benefit. |
| 216 | |
| 217 | // We need to track what iteration of "AreLocksAvailable" the helper is on. Say canary sniffs two locks, now Imagine if: |
| 218 | // 1) Helper calls AreLocksAvailable, |
| 219 | // 2) the canary does get blocked on lock #1, |
| 220 | // 3) process resumes, canary now gets + releases lock #1, |
| 221 | // 4) another random thread takes lock #1 |
| 222 | // 5) then helper calls AreLocksAvailable again later |
| 223 | // 6) then the canary finally finishes. Note it's never tested lock #1 on the 2nd iteration. |
| 224 | // We don't want the canary's response initiated from the 1st request to impact the Helper's 2nd request. |
| 225 | // Thus we keep a request / answer counter to make sure that the canary tests all locks on the same iteration. |
| 226 | DWORD retry = 0; |
| 227 | |
| 228 | const DWORD msSleepSteadyState = 150; // sleep time in ms |
| 229 | const DWORD maxRetry = 15; // number of times to try. |
| 230 | DWORD msSleep = 80; // how much to sleep on first iteration. |
| 231 | |
| 232 | while(m_RequestCounter != m_AnswerCounter) |
| 233 | { |
| 234 | retry ++; |
| 235 | if (retry > maxRetry) |
| 236 | { |
| 237 | STRESS_LOG0(LF_CORDB, LL_ALWAYS, "Canary timed out!\n" ); |
| 238 | return false; |
| 239 | } |
| 240 | |
| 241 | // We'll either timeout (in which case it's like a Sleep(), or |
| 242 | // get the event, which shortcuts the sleep. |
| 243 | WaitForSingleObject(m_hWaitEvent, msSleep); |
| 244 | |
| 245 | // In case a stale answer sets the wait event high, reset it now to avoid us doing |
| 246 | // a live spin-lock. |
| 247 | ResetEvent(m_hWaitEvent); |
| 248 | |
| 249 | |
| 250 | msSleep = msSleepSteadyState; |
| 251 | } |
| 252 | |
| 253 | // Canary made it on same Request iteration, so it must be safe! |
| 254 | return true; |
| 255 | } |
| 256 | |
| 257 | //----------------------------------------------------------------------------- |
| 258 | // Real OS thread proc for Canary thread. |
| 259 | // param - 'this' pointer for HelperCanary |
| 260 | // return value - meaningless, but threads need to return something. |
| 261 | //----------------------------------------------------------------------------- |
| 262 | DWORD HelperCanary::ThreadProc(LPVOID param) |
| 263 | { |
| 264 | _ASSERTE(!ThisIsHelperThreadWorker()); |
| 265 | |
| 266 | STRESS_LOG0(LF_CORDB, LL_ALWAYS, "Canary thread spun up\n" ); |
| 267 | HelperCanary * pThis = reinterpret_cast<HelperCanary*> (param); |
| 268 | pThis->ThreadProc(); |
| 269 | _ASSERTE(pThis->m_fStop); |
| 270 | STRESS_LOG0(LF_CORDB, LL_ALWAYS, "Canary thread exiting\n" ); |
| 271 | |
| 272 | return 0; |
| 273 | } |
| 274 | |
| 275 | //----------------------------------------------------------------------------- |
| 276 | // Real implementation of Canary Thread. |
| 277 | // Single canary thread is reused after creation. |
| 278 | //----------------------------------------------------------------------------- |
| 279 | void HelperCanary::ThreadProc() |
| 280 | { |
| 281 | _ASSERTE(m_CanaryThreadId == GetCurrentThreadId()); |
| 282 | |
| 283 | while(true) |
| 284 | { |
| 285 | WaitForSingleObject(m_hPingEvent, INFINITE); |
| 286 | |
| 287 | m_AnswerCounter = 0; |
| 288 | DWORD dwRequest = m_RequestCounter; |
| 289 | |
| 290 | if (m_fStop) |
| 291 | { |
| 292 | return; |
| 293 | } |
| 294 | STRESS_LOG2(LF_CORDB, LL_ALWAYS, "stage:%d,req:%d" , 0, dwRequest); |
| 295 | |
| 296 | // Now take the locks of interest. This could block indefinitely. If this blocks, we may even get multiple requests. |
| 297 | TakeLocks(); |
| 298 | |
| 299 | m_AnswerCounter = dwRequest; |
| 300 | |
| 301 | // Set wait event to let Requesting thread shortcut its spin lock. This is purely an |
| 302 | // optimization because requesting thread will still check Answer/Request counters. |
| 303 | // That protects us from recyling bugs. |
| 304 | SetEvent(m_hWaitEvent); |
| 305 | } |
| 306 | } |
| 307 | |
| 308 | //----------------------------------------------------------------------------- |
| 309 | // Try and take locks. |
| 310 | //----------------------------------------------------------------------------- |
| 311 | void HelperCanary::TakeLocks() |
| 312 | { |
| 313 | _ASSERTE(::GetThread() == NULL); // Canary Thread should always be outside the runtime. |
| 314 | _ASSERTE(m_CanaryThreadId == GetCurrentThreadId()); |
| 315 | |
| 316 | // Call new, which will take whatever standard heap locks there are. |
| 317 | // We don't care about what memory we get; we just want to take the heap lock(s). |
| 318 | DWORD * p = new (nothrow) DWORD(); |
| 319 | delete p; |
| 320 | |
| 321 | STRESS_LOG1(LF_CORDB, LL_ALWAYS, "canary stage:%d\n" , 1); |
| 322 | } |
| 323 | |
| 324 | |
| 325 | |