1 | // Licensed to the .NET Foundation under one or more agreements. |
2 | // The .NET Foundation licenses this file to you under the MIT license. |
3 | // See the LICENSE file in the project root for more information. |
4 | // |
5 | // File: AMD64/VirtualCallStubCpu.hpp |
6 | // |
12 | // See code:VirtualCallStubManager for details |
13 | // |
14 | // ============================================================================ |
15 | |
16 | #ifndef _VIRTUAL_CALL_STUB_AMD64_H |
17 | #define _VIRTUAL_CALL_STUB_AMD64_H |
18 | |
19 | #include "dbginterface.h" |
20 | |
21 | //#define STUB_LOGGING |
22 | |
23 | #pragma pack(push, 1) |
24 | // since we are placing code, we want byte packing of the structs |
25 | |
26 | #define USES_LOOKUP_STUBS 1 |
27 | |
28 | /********************************************************************************************* |
29 | Stubs that contain code are all part of larger structs called Holders. There is a |
Holder for each kind of stub, i.e. XXXStub is contained within XXXHolder. Holders are
essentially an implementation trick that allowed rearranging the code sequences more
easily while trying out different alternatives, and dealing with any alignment
issues in a way that was mostly immune to the actual code sequences. These Holders
34 | should be revisited when the stub code sequences are fixed, since in many cases they |
35 | add extra space to a stub that is not really needed. |
36 | |
37 | Stubs are placed in cache and hash tables. Since unaligned access of data in memory |
38 | is very slow, the keys used in those tables should be aligned. The things used as keys |
39 | typically also occur in the generated code, e.g. a token as an immediate part of an instruction. |
40 | For now, to avoid alignment computations as different code strategies are tried out, the key |
41 | fields are all in the Holders. Eventually, many of these fields should be dropped, and the instruction |
42 | streams aligned so that the immediate fields fall on aligned boundaries. |
43 | */ |
44 | |
45 | #if USES_LOOKUP_STUBS |
46 | |
47 | struct LookupStub; |
48 | struct LookupHolder; |
49 | |
50 | /*LookupStub************************************************************************************** |
Virtual and interface call sites are initially set up to point at LookupStubs.
This is because the runtime type of the <this> pointer is not yet known,
so the target cannot be resolved. Note: if the jit is able to determine the runtime type
of the <this> pointer, it should generate a direct call, not a virtual or interface call.
This stub pushes a lookup token onto the stack to identify the sought-after method, and then
jumps into the EE (VirtualCallStubManager::ResolveWorkerStub) to effect the lookup and
transfer of control to the appropriate target method implementation, perhaps patching the call site
along the way to point to a more appropriate stub. Hence call sites that point to LookupStubs
are quickly changed to point to another kind of stub.
60 | */ |
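/* Illustrative sketch only (not part of the stub itself): what the lookup stub does,
   in C-like pseudocode. It simply hands the dispatch token to the EE's resolve worker,
   which performs the lookup and may patch the call site:

       push(_token);                  // mov rax, _token / push rax
       goto _resolveWorkerAddr;       // mov rax, _resolveWorkerAddr / jmp rax
*/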
61 | struct LookupStub |
62 | { |
63 | inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } |
64 | |
65 | inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } |
66 | inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } |
67 | |
68 | private: |
69 | friend struct LookupHolder; |
70 | |
    // The lookup entry point starts with a nop in order to allow us to quickly tell
    // whether a stub is a lookup stub or a dispatch stub. We can read the first byte
    // of a stub to find out what kind of stub we have.
74 | |
75 | BYTE _entryPoint [3]; // 90 nop |
76 | // 48 B8 mov rax, |
77 | size_t _token; // xx xx xx xx xx xx xx xx 64-bit address |
78 | BYTE part2 [3]; // 50 push rax |
79 | // 48 B8 mov rax, |
80 | size_t _resolveWorkerAddr; // xx xx xx xx xx xx xx xx 64-bit address |
81 | BYTE part3 [2]; // FF E0 jmp rax |
82 | }; |
83 | |
/* LookupHolders are the containers for LookupStubs; they provide for any alignment of
85 | stubs as necessary. In the case of LookupStubs, alignment is necessary since |
86 | LookupStubs are placed in a hash table keyed by token. */ |
87 | struct LookupHolder |
88 | { |
89 | static void InitializeStatic(); |
90 | |
91 | void Initialize(PCODE resolveWorkerTarget, size_t dispatchToken); |
92 | |
93 | LookupStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } |
94 | |
95 | static LookupHolder* FromLookupEntry(PCODE lookupEntry); |
96 | |
97 | private: |
98 | friend struct LookupStub; |
99 | |
100 | LookupStub _stub; |
101 | }; |
102 | |
103 | #endif // USES_LOOKUP_STUBS |
104 | |
105 | struct DispatchStub; |
106 | struct DispatchStubShort; |
107 | struct DispatchStubLong; |
108 | struct DispatchHolder; |
109 | |
110 | /*DispatchStub************************************************************************************** |
111 | The structure of a full dispatch stub in memory is a DispatchStub followed contiguously in memory |
by either a DispatchStubShort or a DispatchStubLong. DispatchStubShort is used when the resolve
stub (failTarget()) is reachable by a rel32 (DISPL) jump. We make a pretty good effort to make sure
that the stub heaps are set up so that this is the case. If we allocate enough stubs that the heap
ends up allocating in a new block that is further away than a DISPL jump can reach, then we end up using
a DispatchStubLong, which is bigger but uses a full 64-bit jump. */
117 | |
118 | /*DispatchStubShort********************************************************************************* |
119 | This is the logical continuation of DispatchStub for the case when the failure target is within |
120 | a rel32 jump (DISPL). */ |
121 | struct DispatchStubShort |
122 | { |
123 | friend struct DispatchHolder; |
124 | friend struct DispatchStub; |
125 | |
126 | static BOOL isShortStub(LPCBYTE pCode); |
127 | inline PCODE implTarget() const { LIMITED_METHOD_CONTRACT; return (PCODE) _implTarget; } |
128 | inline PCODE failTarget() const { LIMITED_METHOD_CONTRACT; return (PCODE) &_failDispl + sizeof(DISPL) + _failDispl; } |
129 | |
130 | private: |
131 | BYTE part1 [2]; // 0f 85 jne |
132 | DISPL _failDispl; // xx xx xx xx failEntry ;must be forward jmp for perf reasons |
133 | BYTE part2 [2]; // 48 B8 mov rax, |
134 | size_t _implTarget; // xx xx xx xx xx xx xx xx 64-bit address |
135 | BYTE part3 [2]; // FF E0 jmp rax |
136 | |
137 | // 31 bytes long, need 1 byte of padding to 8-byte align. |
138 | BYTE alignPad [1]; // cc |
139 | }; |
140 | |
141 | inline BOOL DispatchStubShort::isShortStub(LPCBYTE pCode) |
142 | { |
143 | LIMITED_METHOD_CONTRACT; |
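    // A DispatchStubShort starts with 0x0F, the first byte of the two-byte "jne rel32"
    // encoding (0F 85); a DispatchStubLong starts with the one-byte "jne rel8" (75) instead.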
144 | return reinterpret_cast<DispatchStubShort const *>(pCode)->part1[0] == 0x0f; |
145 | } |
146 | |
147 | |
148 | /*DispatchStubLong********************************************************************************** |
149 | This is the logical continuation of DispatchStub for the case when the failure target is not |
150 | reachable by a rel32 jump (DISPL). */ |
151 | struct DispatchStubLong |
152 | { |
153 | friend struct DispatchHolder; |
154 | friend struct DispatchStub; |
155 | |
156 | static inline BOOL isLongStub(LPCBYTE pCode); |
157 | inline PCODE implTarget() const { LIMITED_METHOD_CONTRACT; return (PCODE) _implTarget; } |
158 | inline PCODE failTarget() const { LIMITED_METHOD_CONTRACT; return (PCODE) _failTarget; } |
159 | |
160 | private: |
161 | BYTE part1 [1]; // 75 jne |
162 | BYTE _failDispl; // xx failLabel |
163 | BYTE part2 [2]; // 48 B8 mov rax, |
164 | size_t _implTarget; // xx xx xx xx xx xx xx xx 64-bit address |
165 | BYTE part3 [2]; // FF E0 jmp rax |
166 | // failLabel: |
167 | BYTE part4 [2]; // 48 B8 mov rax, |
168 | size_t _failTarget; // xx xx xx xx xx xx xx xx 64-bit address |
169 | BYTE part5 [2]; // FF E0 jmp rax |
170 | |
171 | // 39 bytes long, need 1 byte of padding to 8-byte align. |
172 | BYTE alignPad [1]; // cc |
173 | }; |
174 | |
175 | inline BOOL DispatchStubLong::isLongStub(LPCBYTE pCode) |
176 | { |
177 | LIMITED_METHOD_CONTRACT; |
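    // 0x75 is the one-byte "jne rel8" opcode that begins a DispatchStubLong; the short
    // variant begins with the two-byte 0F 85 encoding instead.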
178 | return reinterpret_cast<DispatchStubLong const *>(pCode)->part1[0] == 0x75; |
179 | } |
180 | |
181 | /*DispatchStub************************************************************************************** |
Monomorphic and mostly monomorphic call sites eventually point to DispatchStubs.
A dispatch stub has an expected type (expectedMT), target address (target) and fail address (failure).
If the <this> object in the calling frame is in fact of the expected type, then
control is transferred to the target address, the method implementation. If not,
then control is transferred to the fail address, a fail stub (see below) where a polymorphic
lookup is done to find the correct address to go to.

implementation note: Order, choice of instructions, and branch directions
should be carefully tuned since they can have an inordinate effect on performance. Particular
attention needs to be paid to the effects on the BTB and branch prediction, both in the small
and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions.
Note that since this stub is only used for mostly monomorphic call sites (ones that are not get patched
to something else), the conditional jump "jne failure" is mostly not taken, and hence it is important
that branch prediction statically predict it as not taken, which means it must be a forward jump. The alternative
is to reverse the order of the jumps and make sure that the resulting conditional jump "je implTarget"
is statically predicted as taken, i.e. a backward jump. The current choice was made since it was easier
to control the placement of the stubs than to control the placement of the jitted code relative to the stubs. */
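/* Illustrative sketch only (not part of the stub itself): what the generated dispatch
   stub does, in C-like pseudocode. expectedMT, implTarget and failTarget correspond to
   the fields/accessors below; pObj is the <this> object in the argument register:

       if (*(size_t*)pObj == expectedMT)     // mov rax, expectedMT / cmp [THIS_REG], rax
           goto implTarget;                  // mov rax, implTarget / jmp rax
       else
           goto failTarget;                  // jne to the resolve stub's fail entry
*/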
199 | struct DispatchStub |
200 | { |
201 | friend struct DispatchHolder; |
202 | |
203 | enum DispatchStubType |
204 | { |
205 | e_TYPE_SHORT, |
206 | e_TYPE_LONG, |
207 | }; |
208 | |
209 | inline DispatchStubType type() const |
210 | { |
211 | LIMITED_METHOD_CONTRACT; |
212 | CONSISTENCY_CHECK(DispatchStubShort::isShortStub(reinterpret_cast<LPCBYTE>(this + 1)) |
213 | || DispatchStubLong::isLongStub(reinterpret_cast<LPCBYTE>(this + 1))); |
214 | return DispatchStubShort::isShortStub((BYTE *)(this + 1)) ? e_TYPE_SHORT : e_TYPE_LONG; |
215 | } |
216 | |
217 | inline static size_t size(DispatchStubType type) |
218 | { |
219 | STATIC_CONTRACT_LEAF; |
220 | return sizeof(DispatchStub) + |
221 | ((type == e_TYPE_SHORT) ? sizeof(DispatchStubShort) : sizeof(DispatchStubLong)); |
222 | } |
223 | |
224 | inline PCODE entryPoint() const { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } |
225 | inline size_t expectedMT() const { LIMITED_METHOD_CONTRACT; return _expectedMT; } |
226 | inline size_t size() const { WRAPPER_NO_CONTRACT; return size(type()); } |
227 | |
228 | inline PCODE implTarget() const |
229 | { |
230 | LIMITED_METHOD_CONTRACT; |
231 | if (type() == e_TYPE_SHORT) |
232 | return getShortStub()->implTarget(); |
233 | else |
234 | return getLongStub()->implTarget(); |
235 | } |
236 | |
237 | inline PCODE failTarget() const |
238 | { |
239 | if (type() == e_TYPE_SHORT) |
240 | return getShortStub()->failTarget(); |
241 | else |
242 | return getLongStub()->failTarget(); |
243 | } |
244 | |
245 | private: |
246 | inline DispatchStubShort const *getShortStub() const |
247 | { LIMITED_METHOD_CONTRACT; return reinterpret_cast<DispatchStubShort const *>(this + 1); } |
248 | |
249 | inline DispatchStubLong const *getLongStub() const |
250 | { LIMITED_METHOD_CONTRACT; return reinterpret_cast<DispatchStubLong const *>(this + 1); } |
251 | |
252 | BYTE _entryPoint [2]; // 48 B8 mov rax, |
253 | size_t _expectedMT; // xx xx xx xx xx xx xx xx 64-bit address |
254 | BYTE part1 [3]; // 48 39 XX cmp [THIS_REG], rax |
255 | |
256 | // Followed by either DispatchStubShort or DispatchStubLong, depending |
257 | // on whether we were able to make a rel32 or had to make an abs64 jump |
258 | // to the resolve stub on failure. |
259 | |
260 | }; |
261 | |
262 | /* DispatchHolders are the containers for DispatchStubs, they provide for any alignment of |
263 | stubs as necessary. DispatchStubs are placed in a hashtable and in a cache. The keys for both |
are the pair expectedMT and token. Efficiency of the hash table is not a big issue,
265 | since lookups in it are fairly rare. Efficiency of the cache is paramount since it is accessed frequently |
266 | (see ResolveStub below). Currently we are storing both of these fields in the DispatchHolder to simplify |
267 | alignment issues. If inlineMT in the stub itself was aligned, then it could be the expectedMT field. |
268 | While the token field can be logically gotten by following the failure target to the failEntryPoint |
269 | of the ResolveStub and then to the token over there, for perf reasons of cache access, it is duplicated here. |
270 | This allows us to use DispatchStubs in the cache. The alternative is to provide some other immutable struct |
for the cache composed of the triplet (expectedMT, token, target) and some sort of reclamation scheme when
272 | they are thrown out of the cache via overwrites (since concurrency will make the obvious approaches invalid). |
273 | */ |
274 | |
275 | /* @workaround for ee resolution - Since the EE does not currently have a resolver function that |
276 | does what we want, see notes in implementation of VirtualCallStubManager::Resolver, we are |
using dispatch stubs to simulate what we want. That means that inlineTarget, which should be immutable,
is in fact written. Hence we have moved target out into the holder and aligned it so we can
279 | atomically update it. When we get a resolver function that does what we want, we can drop this field, |
280 | and live with just the inlineTarget field in the stub itself, since immutability will hold.*/ |
281 | struct DispatchHolder |
282 | { |
283 | static void InitializeStatic(); |
284 | |
285 | void Initialize(PCODE implTarget, PCODE failTarget, size_t expectedMT, |
286 | DispatchStub::DispatchStubType type); |
287 | |
288 | static size_t GetHolderSize(DispatchStub::DispatchStubType type) |
289 | { STATIC_CONTRACT_WRAPPER; return DispatchStub::size(type); } |
290 | |
291 | static BOOL CanShortJumpDispatchStubReachFailTarget(PCODE failTarget, LPCBYTE stubMemory) |
292 | { |
293 | STATIC_CONTRACT_WRAPPER; |
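        // The rel32 in the short stub's "jne rel32" is relative to the byte that follows
        // the displacement, i.e. part2 of the DispatchStubShort that is laid out right
        // after the DispatchStub; measure the candidate displacement from there.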
294 | LPCBYTE pFrom = stubMemory + sizeof(DispatchStub) + offsetof(DispatchStubShort, part2[0]); |
295 | size_t cbRelJump = failTarget - (PCODE)pFrom; |
296 | return FitsInI4(cbRelJump); |
297 | } |
298 | |
299 | DispatchStub* stub() { LIMITED_METHOD_CONTRACT; return reinterpret_cast<DispatchStub *>(this); } |
300 | |
301 | static DispatchHolder* FromDispatchEntry(PCODE dispatchEntry); |
302 | |
303 | private: |
304 | // DispatchStub follows here. It is dynamically sized on allocation |
305 | // because it could be a DispatchStubLong or a DispatchStubShort |
306 | }; |
307 | |
308 | struct ResolveStub; |
309 | struct ResolveHolder; |
310 | |
311 | /*ResolveStub************************************************************************************** |
Polymorphic call sites and monomorphic calls that fail end up in a ResolveStub. There is only
313 | one resolver stub built for any given token, even though there may be many call sites that |
314 | use that token and many distinct <this> types that are used in the calling call frames. A resolver stub |
315 | actually has two entry points, one for polymorphic call sites and one for dispatch stubs that fail on their |
316 | expectedMT test. There is a third part of the resolver stub that enters the ee when a decision should |
317 | be made about changing the callsite. Therefore, we have defined the resolver stub as three distinct pieces, |
318 | even though they are actually allocated as a single contiguous block of memory. These pieces are: |
319 | |
320 | A ResolveStub has two entry points: |
321 | |
322 | FailEntry - where the dispatch stub goes if the expected MT test fails. This piece of the stub does |
323 | a check to see how often we are actually failing. If failures are frequent, control transfers to the |
324 | patch piece to cause the call site to be changed from a mostly monomorphic callsite |
(calls dispatch stub) to a polymorphic callsite (calls resolve stub). If failures are rare, control
326 | transfers to the resolve piece (see ResolveStub). The failEntryPoint decrements a counter |
327 | every time it is entered. The ee at various times will add a large chunk to the counter. |
328 | |
ResolveEntry - does a lookup in a cache by hashing the actual type of the calling frame's
<this> and the token identifying the (contract,method) pair desired. If found, control is transferred
331 | to the method implementation. If not found in the cache, the token is pushed and the ee is entered via |
332 | the ResolveWorkerStub to do a full lookup and eventual transfer to the correct method implementation. Since |
333 | there is a different resolve stub for every token, the token can be inlined and the token can be pre-hashed. |
334 | The effectiveness of this approach is highly sensitive to the effectiveness of the hashing algorithm used, |
335 | as well as its speed. It turns out it is very important to make the hash function sensitive to all |
336 | of the bits of the method table, as method tables are laid out in memory in a very non-random way. Before |
337 | making any changes to the code sequences here, it is very important to measure and tune them as perf |
338 | can vary greatly, in unexpected ways, with seeming minor changes. |
339 | |
340 | Implementation note - Order, choice of instructions, and branch directions |
should be carefully tuned since they can have an inordinate effect on performance. Particular
342 | attention needs to be paid to the effects on the BTB and branch prediction, both in the small |
343 | and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. |
Note that this stub is called in highly polymorphic cases, but the cache should have been sized
and the hash function chosen to maximize the cache hit case. Hence the cmp/jcc instructions should
mostly be going down the cache hit route, and it is important that this be statically predicted as such.
Hence the 3 jcc instrs need to be forward jumps. As structured, there is only one jmp/jcc that typically
gets put in the BTB since all the others typically fall straight through. Minimizing potential BTB entries
is important. */
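/* Illustrative sketch only (not part of the stub itself): the cache probe performed by
   the resolve entry point, in C-like pseudocode. mt is the <this> object's MethodTable;
   _hashedToken, _token, _cacheAddress and mask are the fields below; e is a ResolveCacheElem*:

       size_t hash   = (mt + (mt >> CALL_STUB_CACHE_NUM_BITS)) ^ _hashedToken;
       size_t offset = hash & mask;                    // mask = CALL_STUB_CACHE_MASK*sizeof(void*)
       e = *(ResolveCacheElem**)(_cacheAddress + offset);
       if (e->pMT == mt && e->token == _token)
           goto e->target;                             // cache hit
       else
           goto miss;                                  // push cache elem, jump to resolve worker
*/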
350 | |
351 | struct ResolveStub |
352 | { |
353 | inline PCODE failEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_failEntryPoint[0]; } |
354 | inline PCODE resolveEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_resolveEntryPoint[0]; } |
355 | inline PCODE slowEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_slowEntryPoint[0]; } |
356 | |
357 | inline INT32* pCounter() { LIMITED_METHOD_CONTRACT; return _pCounter; } |
358 | inline UINT32 hashedToken() { LIMITED_METHOD_CONTRACT; return _hashedToken >> LOG2_PTRSIZE; } |
359 | inline size_t cacheAddress() { LIMITED_METHOD_CONTRACT; return _cacheAddress; } |
360 | inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } |
    inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(ResolveStub); }
362 | |
363 | private: |
364 | friend struct ResolveHolder; |
365 | |
366 | BYTE _resolveEntryPoint[3];// resolveStub: |
367 | // 52 push rdx |
368 | // 49 BA mov r10, |
369 | size_t _cacheAddress; // xx xx xx xx xx xx xx xx 64-bit address |
370 | BYTE part1 [15]; // 48 8B XX mov rax, [THIS_REG] ; Compute hash = ((MT + MT>>12) ^ prehash) |
371 | // 48 8B D0 mov rdx, rax ; rdx <- current MethodTable |
372 | // 48 C1 E8 0C shr rax, 12 |
373 | // 48 03 C2 add rax, rdx |
374 | // 48 35 xor rax, |
375 | UINT32 _hashedToken; // xx xx xx xx hashedtoken ; xor with pre-hashed token |
376 | BYTE part2 [2]; // 48 25 and rax, |
377 | UINT32 mask; // xx xx xx xx cache_mask ; and with cache mask |
378 | BYTE part3 [6]; // 4A 8B 04 10 mov rax, [r10 + rax] ; get cache entry address |
379 | // 49 BA mov r10, |
380 | size_t _token; // xx xx xx xx xx xx xx xx 64-bit address |
381 | BYTE part4 [3]; // 48 3B 50 cmp rdx, [rax+ ; compare our MT vs. cache MT |
382 | BYTE mtOffset; // xx ResolverCacheElem.pMT] |
383 | BYTE part5 [1]; // 75 jne |
384 | BYTE toMiss1; // xx miss ; must be forward jump, for perf reasons |
385 | BYTE part6 [3]; // 4C 3B 50 cmp r10, [rax+ ; compare our token vs. cache token |
386 | BYTE tokenOffset; // xx ResolverCacheElem.token] |
387 | BYTE part7 [1]; // 75 jne |
388 | BYTE toMiss2; // xx miss ; must be forward jump, for perf reasons |
389 | BYTE part8 [3]; // 48 8B 40 mov rax, [rax+ ; setup rax with method impl address |
390 | BYTE targetOffset; // xx ResolverCacheElem.target] |
391 | BYTE part9 [3]; // 5A pop rdx |
392 | // FF E0 jmp rax |
393 | // failStub: |
394 | BYTE _failEntryPoint [2]; // 48 B8 mov rax, |
395 | INT32* _pCounter; // xx xx xx xx xx xx xx xx 64-bit address |
396 | BYTE part11 [4]; // 83 00 FF add dword ptr [rax], -1 |
397 | // 7d jnl |
398 | BYTE toResolveStub1; // xx resolveStub |
399 | BYTE part12 [4]; // 49 83 CB 01 or r11, 1 |
400 | BYTE _slowEntryPoint [3]; // 52 slow: push rdx |
401 | // 49 BA mov r10, |
402 | size_t _tokenSlow; // xx xx xx xx xx xx xx xx 64-bit address |
403 | // BYTE miss [5]; // 5A miss: pop rdx ; don't pop rdx |
404 | // // 41 52 push r10 ; don't push r10 leave it setup with token |
405 | BYTE miss [3]; // 50 push rax ; push ptr to cache elem |
406 | // 48 B8 mov rax, |
407 | size_t _resolveWorker; // xx xx xx xx xx xx xx xx 64-bit address |
408 | BYTE part10 [2]; // FF E0 jmp rax |
409 | }; |
410 | |
/* ResolveHolders are the containers for ResolveStubs. They provide
for any alignment of the stubs as necessary. The stubs are placed in a hash table keyed by
the token for which they are built. Efficiency of access requires that this token be aligned.
For now, we have copied that field into the ResolveHolder itself; if the resolve stub were arranged such that
any of its inlined tokens (non-prehashed) were aligned, then the token field in the ResolveHolder
would not be needed. */
417 | struct ResolveHolder |
418 | { |
419 | static void InitializeStatic(); |
420 | |
421 | void Initialize(PCODE resolveWorkerTarget, PCODE patcherTarget, |
422 | size_t dispatchToken, UINT32 hashedToken, |
423 | void * cacheAddr, INT32* counterAddr); |
424 | |
425 | ResolveStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } |
426 | |
427 | static ResolveHolder* FromFailEntry(PCODE resolveEntry); |
428 | static ResolveHolder* FromResolveEntry(PCODE resolveEntry); |
429 | |
430 | private: |
431 | ResolveStub _stub; |
432 | }; |
433 | |
434 | /*VTableCallStub************************************************************************************** |
These are jump stubs that perform a vtable-based virtual call. These stubs assume that an object is placed
436 | in the first argument register (this pointer). From there, the stub extracts the MethodTable pointer, followed by the |
437 | vtable pointer, and finally jumps to the target method at a given slot in the vtable. |
438 | */ |
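/* Illustrative sketch only (not part of the stub itself): what the generated stub does,
   in C-like pseudocode, for the non-relative vtable layout that Initialize() below asserts:

       MethodTable *pMT   = *(MethodTable**)pObj;                            // mov rax, [THIS_REG]
       TADDR        chunk = *(TADDR*)((BYTE*)pMT + offsetOfIndirection);     // mov rax, [rax+off1]
       goto *(PCODE*)(chunk + offsetAfterIndirection);                       // jmp qword ptr [rax+off2]
*/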
439 | struct VTableCallStub |
440 | { |
441 | friend struct VTableCallHolder; |
442 | |
443 | inline size_t size() |
444 | { |
445 | LIMITED_METHOD_CONTRACT; |
446 | |
447 | BYTE* pStubCode = (BYTE *)this; |
448 | |
449 | size_t cbSize = 3; // First mov instruction |
450 | cbSize += (pStubCode[cbSize + 2] == 0x80 ? 7 : 4); // Either 48 8B 80 or 48 8B 40: mov rax,[rax+offset] |
451 | cbSize += (pStubCode[cbSize + 1] == 0xa0 ? 6 : 3); // Either FF A0 or FF 60: jmp qword ptr [rax+slot] |
452 | cbSize += 4; // Slot value (data storage, not a real instruction) |
453 | |
454 | return cbSize; |
455 | } |
456 | |
457 | inline PCODE entryPoint() const { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } |
458 | |
459 | inline size_t token() |
460 | { |
461 | LIMITED_METHOD_CONTRACT; |
462 | DWORD slot = *(DWORD*)(reinterpret_cast<BYTE*>(this) + size() - 4); |
463 | return DispatchToken::CreateDispatchToken(slot).To_SIZE_T(); |
464 | } |
465 | |
466 | private: |
467 | BYTE _entryPoint[0]; // Dynamically sized stub. See Initialize() for more details. |
468 | }; |
469 | |
/* VTableCallHolders are the containers for VTableCallStubs; they provide for any alignment of
471 | stubs as necessary. */ |
472 | struct VTableCallHolder |
473 | { |
474 | void Initialize(unsigned slot); |
475 | |
476 | VTableCallStub* stub() { LIMITED_METHOD_CONTRACT; return reinterpret_cast<VTableCallStub *>(this); } |
477 | |
478 | static size_t GetHolderSize(unsigned slot) |
479 | { |
480 | STATIC_CONTRACT_WRAPPER; |
481 | unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE; |
482 | unsigned offsetAfterIndirection = MethodTable::GetIndexAfterVtableIndirection(slot) * TARGET_POINTER_SIZE; |
483 | return 3 + (offsetOfIndirection >= 0x80 ? 7 : 4) + (offsetAfterIndirection >= 0x80 ? 6 : 3) + 4; |
484 | } |
485 | |
    static VTableCallHolder* FromVTableCallEntry(PCODE entry) { LIMITED_METHOD_CONTRACT; return (VTableCallHolder*)entry; }
487 | |
488 | private: |
489 | // VTableCallStub follows here. It is dynamically sized on allocation because it could |
490 | // use short/long instruction sizes for mov/jmp, depending on the slot value. |
491 | }; |
492 | #pragma pack(pop) |
493 | |
494 | #ifdef DECLARE_DATA |
495 | |
496 | LookupStub lookupInit; |
497 | DispatchStub dispatchInit; |
498 | DispatchStubShort dispatchShortInit; |
499 | DispatchStubLong dispatchLongInit; |
500 | ResolveStub resolveInit; |
501 | |
502 | #define INSTR_INT3 0xcc |
503 | #define INSTR_NOP 0x90 |
504 | |
505 | #ifndef DACCESS_COMPILE |
506 | |
507 | #include "asmconstants.h" |
508 | |
509 | #ifdef STUB_LOGGING |
510 | extern size_t g_lookup_inline_counter; |
511 | extern size_t g_call_inline_counter; |
512 | extern size_t g_miss_inline_counter; |
513 | extern size_t g_call_cache_counter; |
514 | extern size_t g_miss_cache_counter; |
515 | #endif |
516 | |
/* Template used to generate the stub. We generate a stub by allocating a block of
memory, copying the template over it, and then updating only the fields that need
to be changed.
520 | */ |
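/* Illustrative sketch only: how a holder is typically populated. Memory for the holder is
   carved out of an executable stub heap (pStubHeapMemory below is a hypothetical name for
   that allocation), the pre-built template is copied over it, and only the stub-specific
   immediates are patched:

       LookupHolder *pHolder = (LookupHolder *)pStubHeapMemory;   // hypothetical allocation
       pHolder->Initialize(resolveWorkerTarget, dispatchToken);   // copies lookupInit, patches fields
*/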
521 | |
522 | void LookupHolder::InitializeStatic() |
523 | { |
524 | static_assert_no_msg((sizeof(LookupHolder) % sizeof(void*)) == 0); |
525 | |
526 | // The first instruction of a LookupStub is nop |
527 | // and we use it in order to differentiate the first two bytes |
528 | // of a LookupStub and a ResolveStub |
529 | lookupInit._entryPoint [0] = INSTR_NOP; |
530 | lookupInit._entryPoint [1] = 0x48; |
531 | lookupInit._entryPoint [2] = 0xB8; |
532 | lookupInit._token = 0xcccccccccccccccc; |
533 | lookupInit.part2 [0] = 0x50; |
534 | lookupInit.part2 [1] = 0x48; |
535 | lookupInit.part2 [2] = 0xB8; |
536 | lookupInit._resolveWorkerAddr = 0xcccccccccccccccc; |
537 | lookupInit.part3 [0] = 0xFF; |
538 | lookupInit.part3 [1] = 0xE0; |
539 | } |
540 | |
541 | void LookupHolder::Initialize(PCODE resolveWorkerTarget, size_t dispatchToken) |
542 | { |
543 | _stub = lookupInit; |
544 | |
545 | //fill in the stub specific fields |
546 | _stub._token = dispatchToken; |
547 | _stub._resolveWorkerAddr = (size_t) resolveWorkerTarget; |
548 | } |
549 | |
/* Template used to generate the stub. We generate a stub by allocating a block of
memory, copying the template over it, and then updating only the fields that need
to be changed.
553 | */ |
554 | |
555 | void DispatchHolder::InitializeStatic() |
556 | { |
557 | // Check that _expectedMT is aligned in the DispatchHolder |
558 | static_assert_no_msg(((sizeof(DispatchStub)+sizeof(DispatchStubShort)) % sizeof(void*)) == 0); |
559 | static_assert_no_msg(((sizeof(DispatchStub)+sizeof(DispatchStubLong)) % sizeof(void*)) == 0); |
560 | CONSISTENCY_CHECK((offsetof(DispatchStubLong, part4[0]) - offsetof(DispatchStubLong, part2[0])) < INT8_MAX); |
561 | |
562 | // Common dispatch stub initialization |
563 | dispatchInit._entryPoint [0] = 0x48; |
564 | dispatchInit._entryPoint [1] = 0xB8; |
565 | dispatchInit._expectedMT = 0xcccccccccccccccc; |
566 | dispatchInit.part1 [0] = 0x48; |
567 | dispatchInit.part1 [1] = 0x39; |
568 | #ifdef UNIX_AMD64_ABI |
569 | dispatchInit.part1 [2] = 0x07; // RDI |
570 | #else |
571 | dispatchInit.part1 [2] = 0x01; // RCX |
572 | #endif |
573 | |
574 | // Short dispatch stub initialization |
575 | dispatchShortInit.part1 [0] = 0x0F; |
576 | dispatchShortInit.part1 [1] = 0x85; |
577 | dispatchShortInit._failDispl = 0xcccccccc; |
578 | dispatchShortInit.part2 [0] = 0x48; |
579 | dispatchShortInit.part2 [1] = 0xb8; |
580 | dispatchShortInit._implTarget = 0xcccccccccccccccc; |
581 | dispatchShortInit.part3 [0] = 0xFF; |
582 | dispatchShortInit.part3 [1] = 0xE0; |
583 | dispatchShortInit.alignPad [0] = INSTR_INT3; |
584 | |
585 | // Long dispatch stub initialization |
586 | dispatchLongInit.part1 [0] = 0x75; |
587 | dispatchLongInit._failDispl = BYTE(&dispatchLongInit.part4[0] - &dispatchLongInit.part2[0]); |
588 | dispatchLongInit.part2 [0] = 0x48; |
589 | dispatchLongInit.part2 [1] = 0xb8; |
590 | dispatchLongInit._implTarget = 0xcccccccccccccccc; |
591 | dispatchLongInit.part3 [0] = 0xFF; |
592 | dispatchLongInit.part3 [1] = 0xE0; |
593 | // failLabel: |
594 | dispatchLongInit.part4 [0] = 0x48; |
595 | dispatchLongInit.part4 [1] = 0xb8; |
596 | dispatchLongInit._failTarget = 0xcccccccccccccccc; |
597 | dispatchLongInit.part5 [0] = 0xFF; |
598 | dispatchLongInit.part5 [1] = 0xE0; |
599 | dispatchLongInit.alignPad [0] = INSTR_INT3; |
}
601 | |
602 | void DispatchHolder::Initialize(PCODE implTarget, PCODE failTarget, size_t expectedMT, |
603 | DispatchStub::DispatchStubType type) |
604 | { |
605 | // |
606 | // Initialize the common area |
607 | // |
608 | |
609 | // initialize the static data |
610 | *stub() = dispatchInit; |
611 | |
612 | // fill in the dynamic data |
613 | stub()->_expectedMT = expectedMT; |
614 | |
615 | // |
616 | // Initialize the short/long areas |
617 | // |
618 | if (type == DispatchStub::e_TYPE_SHORT) |
619 | { |
620 | DispatchStubShort *shortStub = const_cast<DispatchStubShort *>(stub()->getShortStub()); |
621 | |
622 | // initialize the static data |
623 | *shortStub = dispatchShortInit; |
624 | |
625 | // fill in the dynamic data |
626 | size_t displ = (failTarget - ((PCODE) &shortStub->_failDispl + sizeof(DISPL))); |
627 | CONSISTENCY_CHECK(FitsInI4(displ)); |
628 | shortStub->_failDispl = (DISPL) displ; |
629 | shortStub->_implTarget = (size_t) implTarget; |
630 | CONSISTENCY_CHECK((PCODE)&shortStub->_failDispl + sizeof(DISPL) + shortStub->_failDispl == failTarget); |
631 | } |
632 | else |
633 | { |
634 | CONSISTENCY_CHECK(type == DispatchStub::e_TYPE_LONG); |
635 | DispatchStubLong *longStub = const_cast<DispatchStubLong *>(stub()->getLongStub()); |
636 | |
637 | // initialize the static data |
638 | *longStub = dispatchLongInit; |
639 | |
640 | // fill in the dynamic data |
641 | longStub->_implTarget = implTarget; |
642 | longStub->_failTarget = failTarget; |
643 | } |
644 | } |
645 | |
/* Template used to generate the stub. We generate a stub by allocating a block of
memory, copying the template over it, and then updating only the fields that need
to be changed.
649 | */ |
650 | |
651 | void ResolveHolder::InitializeStatic() |
652 | { |
653 | static_assert_no_msg((sizeof(ResolveHolder) % sizeof(void*)) == 0); |
654 | |
655 | resolveInit._resolveEntryPoint [0] = 0x52; |
656 | resolveInit._resolveEntryPoint [1] = 0x49; |
657 | resolveInit._resolveEntryPoint [2] = 0xBA; |
658 | resolveInit._cacheAddress = 0xcccccccccccccccc; |
659 | resolveInit.part1 [ 0] = 0x48; |
660 | resolveInit.part1 [ 1] = 0x8B; |
661 | #ifdef UNIX_AMD64_ABI |
662 | resolveInit.part1 [ 2] = 0x07; // RDI |
663 | #else |
664 | resolveInit.part1 [ 2] = 0x01; // RCX |
665 | #endif |
666 | resolveInit.part1 [ 3] = 0x48; |
667 | resolveInit.part1 [ 4] = 0x8B; |
668 | resolveInit.part1 [ 5] = 0xD0; |
669 | resolveInit.part1 [ 6] = 0x48; |
670 | resolveInit.part1 [ 7] = 0xC1; |
671 | resolveInit.part1 [ 8] = 0xE8; |
672 | resolveInit.part1 [ 9] = CALL_STUB_CACHE_NUM_BITS; |
673 | resolveInit.part1 [10] = 0x48; |
674 | resolveInit.part1 [11] = 0x03; |
675 | resolveInit.part1 [12] = 0xC2; |
676 | resolveInit.part1 [13] = 0x48; |
677 | resolveInit.part1 [14] = 0x35; |
678 | // Review truncation from unsigned __int64 to UINT32 of a constant value. |
679 | #if defined(_MSC_VER) |
680 | #pragma warning(push) |
681 | #pragma warning(disable:4305 4309) |
682 | #endif // defined(_MSC_VER) |
683 | |
684 | resolveInit._hashedToken = 0xcccccccc; |
685 | |
686 | #if defined(_MSC_VER) |
687 | #pragma warning(pop) |
688 | #endif // defined(_MSC_VER) |
689 | |
690 | resolveInit.part2 [ 0] = 0x48; |
691 | resolveInit.part2 [ 1] = 0x25; |
692 | resolveInit.mask = CALL_STUB_CACHE_MASK*sizeof(void *); |
693 | resolveInit.part3 [0] = 0x4A; |
694 | resolveInit.part3 [1] = 0x8B; |
695 | resolveInit.part3 [2] = 0x04; |
696 | resolveInit.part3 [3] = 0x10; |
697 | resolveInit.part3 [4] = 0x49; |
698 | resolveInit.part3 [5] = 0xBA; |
699 | resolveInit._token = 0xcccccccccccccccc; |
700 | resolveInit.part4 [0] = 0x48; |
701 | resolveInit.part4 [1] = 0x3B; |
702 | resolveInit.part4 [2] = 0x50; |
703 | resolveInit.mtOffset = offsetof(ResolveCacheElem,pMT) & 0xFF; |
704 | resolveInit.part5 [0] = 0x75; |
705 | resolveInit.toMiss1 = offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss1)+1) & 0xFF; |
706 | resolveInit.part6 [0] = 0x4C; |
707 | resolveInit.part6 [1] = 0x3B; |
708 | resolveInit.part6 [2] = 0x50; |
709 | resolveInit.tokenOffset = offsetof(ResolveCacheElem,token) & 0xFF; |
710 | resolveInit.part7 [0] = 0x75; |
711 | resolveInit.toMiss2 = offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss2)+1) & 0xFF; |
712 | resolveInit.part8 [0] = 0x48; |
713 | resolveInit.part8 [1] = 0x8B; |
714 | resolveInit.part8 [2] = 0x40; |
715 | resolveInit.targetOffset = offsetof(ResolveCacheElem,target) & 0xFF; |
716 | resolveInit.part9 [0] = 0x5A; |
717 | resolveInit.part9 [1] = 0xFF; |
718 | resolveInit.part9 [2] = 0xE0; |
719 | resolveInit._failEntryPoint [0] = 0x48; |
720 | resolveInit._failEntryPoint [1] = 0xB8; |
721 | resolveInit._pCounter = (INT32*) (size_t) 0xcccccccccccccccc; |
722 | resolveInit.part11 [0] = 0x83; |
723 | resolveInit.part11 [1] = 0x00; |
724 | resolveInit.part11 [2] = 0xFF; |
725 | resolveInit.part11 [3] = 0x7D; |
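    // Backward rel8 to resolveStub: the displacement is negative (the resolve entry point
    // precedes this byte), so the subtraction below is truncated to its two's-complement byte.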
726 | resolveInit.toResolveStub1 = (offsetof(ResolveStub, _resolveEntryPoint) - (offsetof(ResolveStub, toResolveStub1)+1)) & 0xFF; |
727 | resolveInit.part12 [0] = 0x49; |
728 | resolveInit.part12 [1] = 0x83; |
729 | resolveInit.part12 [2] = 0xCB; |
730 | resolveInit.part12 [3] = 0x01; |
731 | resolveInit._slowEntryPoint [0] = 0x52; |
732 | resolveInit._slowEntryPoint [1] = 0x49; |
733 | resolveInit._slowEntryPoint [2] = 0xBA; |
734 | resolveInit._tokenSlow = 0xcccccccccccccccc; |
735 | resolveInit.miss [0] = 0x50; |
736 | resolveInit.miss [1] = 0x48; |
737 | resolveInit.miss [2] = 0xB8; |
738 | resolveInit._resolveWorker = 0xcccccccccccccccc; |
739 | resolveInit.part10 [0] = 0xFF; |
740 | resolveInit.part10 [1] = 0xE0; |
}
742 | |
743 | void ResolveHolder::Initialize(PCODE resolveWorkerTarget, PCODE patcherTarget, |
744 | size_t dispatchToken, UINT32 hashedToken, |
745 | void * cacheAddr, INT32* counterAddr) |
746 | { |
747 | _stub = resolveInit; |
748 | |
749 | //fill in the stub specific fields |
750 | _stub._cacheAddress = (size_t) cacheAddr; |
751 | _stub._hashedToken = hashedToken << LOG2_PTRSIZE; |
752 | _stub._token = dispatchToken; |
753 | _stub._tokenSlow = dispatchToken; |
754 | _stub._resolveWorker = (size_t) resolveWorkerTarget; |
755 | _stub._pCounter = counterAddr; |
756 | } |
757 | |
758 | ResolveHolder* ResolveHolder::FromFailEntry(PCODE failEntry) |
759 | { |
760 | LIMITED_METHOD_CONTRACT; |
761 | ResolveHolder* resolveHolder = (ResolveHolder*) ( failEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _failEntryPoint) ); |
762 | _ASSERTE(resolveHolder->_stub._resolveEntryPoint[1] == resolveInit._resolveEntryPoint[1]); |
763 | return resolveHolder; |
764 | } |
765 | |
766 | #endif // DACCESS_COMPILE |
767 | |
768 | LookupHolder* LookupHolder::FromLookupEntry(PCODE lookupEntry) |
769 | { |
770 | LIMITED_METHOD_CONTRACT; |
771 | LookupHolder* lookupHolder = (LookupHolder*) ( lookupEntry - offsetof(LookupHolder, _stub) - offsetof(LookupStub, _entryPoint) ); |
772 | _ASSERTE(lookupHolder->_stub._entryPoint[2] == lookupInit._entryPoint[2]); |
773 | return lookupHolder; |
774 | } |
775 | |
776 | |
777 | DispatchHolder* DispatchHolder::FromDispatchEntry(PCODE dispatchEntry) |
778 | { |
779 | LIMITED_METHOD_CONTRACT; |
780 | DispatchHolder* dispatchHolder = (DispatchHolder*) ( dispatchEntry - offsetof(DispatchStub, _entryPoint) ); |
781 | _ASSERTE(dispatchHolder->stub()->_entryPoint[1] == dispatchInit._entryPoint[1]); |
782 | return dispatchHolder; |
783 | } |
784 | |
785 | |
786 | ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry) |
787 | { |
788 | LIMITED_METHOD_CONTRACT; |
789 | ResolveHolder* resolveHolder = (ResolveHolder*) ( resolveEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _resolveEntryPoint) ); |
790 | _ASSERTE(resolveHolder->_stub._resolveEntryPoint[1] == resolveInit._resolveEntryPoint[1]); |
791 | return resolveHolder; |
792 | } |
793 | |
794 | void VTableCallHolder::Initialize(unsigned slot) |
795 | { |
796 | unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE; |
797 | unsigned offsetAfterIndirection = MethodTable::GetIndexAfterVtableIndirection(slot) * TARGET_POINTER_SIZE; |
798 | _ASSERTE(MethodTable::VTableIndir_t::isRelative == false /* TODO: NYI */); |
799 | |
800 | VTableCallStub* pStub = stub(); |
801 | BYTE* p = (BYTE*)pStub->entryPoint(); |
802 | |
803 | #ifdef UNIX_AMD64_ABI |
804 | // mov rax,[rdi] : rax = MethodTable pointer |
805 | *(UINT32 *)p = 0x078b48; p += 3; |
806 | #else |
807 | // mov rax,[rcx] : rax = MethodTable pointer |
808 | *(UINT32 *)p = 0x018b48; p += 3; |
809 | #endif |
810 | |
811 | // mov rax,[rax+vtable offset] : rax = vtable pointer |
812 | if (offsetOfIndirection >= 0x80) |
813 | { |
814 | *(UINT32*)p = 0x00808b48; p += 3; |
815 | *(UINT32*)p = offsetOfIndirection; p += 4; |
816 | } |
817 | else |
818 | { |
819 | *(UINT32*)p = 0x00408b48; p += 3; |
820 | *p++ = (BYTE)offsetOfIndirection; |
821 | } |
822 | |
823 | // jmp qword ptr [rax+slot] |
824 | if (offsetAfterIndirection >= 0x80) |
825 | { |
826 | *(UINT32*)p = 0xa0ff; p += 2; |
827 | *(UINT32*)p = offsetAfterIndirection; p += 4; |
828 | } |
829 | else |
830 | { |
831 | *(UINT16*)p = 0x60ff; p += 2; |
832 | *p++ = (BYTE)offsetAfterIndirection; |
833 | } |
834 | |
835 | // Store the slot value here for convenience. Not a real instruction (unreachable anyways) |
836 | *(UINT32*)p = slot; p += 4; |
837 | |
838 | _ASSERT(p == (BYTE*)stub()->entryPoint() + VTableCallHolder::GetHolderSize(slot)); |
839 | _ASSERT(stub()->size() == VTableCallHolder::GetHolderSize(slot)); |
840 | } |
841 | |
842 | VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE stubStartAddress) |
843 | { |
844 | #ifdef DACCESS_COMPILE |
845 | return SK_BREAKPOINT; // Dac always uses the slower lookup |
846 | #else |
847 | StubKind stubKind = SK_UNKNOWN; |
848 | |
849 | EX_TRY |
850 | { |
851 | // If stubStartAddress is completely bogus, then this might AV, |
852 | // so we protect it with SEH. An AV here is OK. |
853 | AVInRuntimeImplOkayHolder AVOkay; |
854 | |
855 | WORD firstWord = *((WORD*) stubStartAddress); |
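        // Note: firstWord is a little-endian 16-bit read, so the stub's first byte is the
        // low byte of each constant below; e.g. 0xB848 matches the bytes 48 B8 (mov rax, imm64)
        // that begin a DispatchStub, and 0x4890 matches 90 48 (the nop that begins a LookupStub).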
856 | |
857 | if (firstWord == 0xB848) |
858 | { |
859 | stubKind = SK_DISPATCH; |
860 | } |
861 | else if (firstWord == 0x4890) |
862 | { |
863 | stubKind = SK_LOOKUP; |
864 | } |
865 | else if (firstWord == 0x4952) |
866 | { |
867 | stubKind = SK_RESOLVE; |
868 | } |
869 | else if (firstWord == 0x48F8) |
870 | { |
871 | stubKind = SK_LOOKUP; |
872 | } |
873 | else if (firstWord == 0x8B48) |
874 | { |
875 | stubKind = SK_VTABLECALL; |
876 | } |
877 | else |
878 | { |
879 | BYTE firstByte = ((BYTE*) stubStartAddress)[0]; |
880 | BYTE secondByte = ((BYTE*) stubStartAddress)[1]; |
881 | |
882 | if ((firstByte == INSTR_INT3) || (secondByte == INSTR_INT3)) |
883 | { |
884 | stubKind = SK_BREAKPOINT; |
885 | } |
886 | } |
887 | } |
888 | EX_CATCH |
889 | { |
890 | stubKind = SK_UNKNOWN; |
891 | } |
892 | EX_END_CATCH(SwallowAllExceptions); |
893 | |
894 | return stubKind; |
895 | |
896 | #endif // DACCESS_COMPILE |
897 | } |
898 | |
899 | #endif //DECLARE_DATA |
900 | |
901 | #endif // _VIRTUAL_CALL_STUB_AMD64_H |
902 | |