1 | //===================================================================== |
2 | // |
3 | // FastMemcpy.c - skywind3000@163.com, 2015 |
4 | // |
5 | // feature: |
6 | // 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) |
7 | // |
8 | //===================================================================== |
9 | #ifndef __FAST_MEMCPY_H__ |
10 | #define __FAST_MEMCPY_H__ |
11 | |
12 | #include <stddef.h> |
13 | #include <stdint.h> |
14 | #include <emmintrin.h> |
15 | |
16 | |
17 | //--------------------------------------------------------------------- |
18 | // force inline for compilers |
19 | //--------------------------------------------------------------------- |
20 | #ifndef INLINE |
21 | #ifdef __GNUC__ |
22 | #if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) |
23 | #define INLINE __inline__ __attribute__((always_inline)) |
24 | #else |
25 | #define INLINE __inline__ |
26 | #endif |
27 | #elif defined(_MSC_VER) |
28 | #define INLINE __forceinline |
29 | #elif (defined(__BORLANDC__) || defined(__WATCOMC__)) |
30 | #define INLINE __inline |
31 | #else |
32 | #define INLINE |
33 | #endif |
34 | #endif |
35 | |
36 | |
// One-byte-aligned aliases of the fixed-width integer types: dereferencing a
// pointer to these performs an unaligned 2/4/8-byte load or store, which the
// copy routines below rely on for the odd-sized tails.
// NOTE(review): __attribute__ is GCC/Clang syntax used unconditionally here,
// so this header is GCC/Clang-only despite the MSVC branch in the INLINE
// macro above — confirm the intended set of target compilers.
typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t;
typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t;
typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t;
40 | |
41 | |
42 | |
43 | //--------------------------------------------------------------------- |
44 | // fast copy for different sizes |
45 | //--------------------------------------------------------------------- |
// Copy exactly 16 bytes from src to dst; both pointers may be unaligned.
static inline void memcpy_sse2_16(void *dst, const void *src) {
	const __m128i *in = (const __m128i*)src;
	__m128i *out = (__m128i*)dst;
	_mm_storeu_si128(out, _mm_loadu_si128(in));
}
50 | |
// Copy exactly 32 bytes (two unaligned 16-byte lanes) from src to dst.
// Both lanes are loaded before either store is issued, as in the original.
static inline void memcpy_sse2_32(void *dst, const void *src) {
	const __m128i *in = (const __m128i*)src;
	__m128i *out = (__m128i*)dst;
	__m128i lo = _mm_loadu_si128(in + 0);
	__m128i hi = _mm_loadu_si128(in + 1);
	_mm_storeu_si128(out + 0, lo);
	_mm_storeu_si128(out + 1, hi);
}
57 | |
// Copy exactly 64 bytes from src to dst; both pointers may be unaligned.
// All four lanes are loaded before any store is issued, as in the original.
static inline void memcpy_sse2_64(void *dst, const void *src) {
	const __m128i *in = (const __m128i*)src;
	__m128i *out = (__m128i*)dst;
	__m128i r[4];
	int i;
	for (i = 0; i < 4; i++) r[i] = _mm_loadu_si128(in + i);
	for (i = 0; i < 4; i++) _mm_storeu_si128(out + i, r[i]);
}
68 | |
// Copy exactly 128 bytes from src to dst; both pointers may be unaligned.
// All eight lanes are loaded before any store is issued, as in the original.
static inline void memcpy_sse2_128(void *dst, const void *src) {
	const __m128i *in = (const __m128i*)src;
	__m128i *out = (__m128i*)dst;
	__m128i r[8];
	int i;
	for (i = 0; i < 8; i++) r[i] = _mm_loadu_si128(in + i);
	for (i = 0; i < 8; i++) _mm_storeu_si128(out + i, r[i]);
}
87 | |
88 | |
89 | //--------------------------------------------------------------------- |
90 | // tiny memory copy with jump table optimized |
91 | //--------------------------------------------------------------------- |
92 | static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { |
93 | unsigned char *dd = ((unsigned char*)dst) + size; |
94 | const unsigned char *ss = ((const unsigned char*)src) + size; |
95 | |
96 | switch (size) { |
97 | case 64: |
98 | memcpy_sse2_64(dd - 64, ss - 64); |
99 | case 0: |
100 | break; |
101 | |
102 | case 65: |
103 | memcpy_sse2_64(dd - 65, ss - 65); |
104 | case 1: |
105 | dd[-1] = ss[-1]; |
106 | break; |
107 | |
108 | case 66: |
109 | memcpy_sse2_64(dd - 66, ss - 66); |
110 | case 2: |
111 | *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); |
112 | break; |
113 | |
114 | case 67: |
115 | memcpy_sse2_64(dd - 67, ss - 67); |
116 | case 3: |
117 | *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); |
118 | dd[-1] = ss[-1]; |
119 | break; |
120 | |
121 | case 68: |
122 | memcpy_sse2_64(dd - 68, ss - 68); |
123 | case 4: |
124 | *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); |
125 | break; |
126 | |
127 | case 69: |
128 | memcpy_sse2_64(dd - 69, ss - 69); |
129 | case 5: |
130 | *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); |
131 | dd[-1] = ss[-1]; |
132 | break; |
133 | |
134 | case 70: |
135 | memcpy_sse2_64(dd - 70, ss - 70); |
136 | case 6: |
137 | *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); |
138 | *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); |
139 | break; |
140 | |
141 | case 71: |
142 | memcpy_sse2_64(dd - 71, ss - 71); |
143 | case 7: |
144 | *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); |
145 | *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); |
146 | break; |
147 | |
148 | case 72: |
149 | memcpy_sse2_64(dd - 72, ss - 72); |
150 | case 8: |
151 | *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); |
152 | break; |
153 | |
154 | case 73: |
155 | memcpy_sse2_64(dd - 73, ss - 73); |
156 | case 9: |
157 | *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); |
158 | dd[-1] = ss[-1]; |
159 | break; |
160 | |
161 | case 74: |
162 | memcpy_sse2_64(dd - 74, ss - 74); |
163 | case 10: |
164 | *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); |
165 | *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); |
166 | break; |
167 | |
168 | case 75: |
169 | memcpy_sse2_64(dd - 75, ss - 75); |
170 | case 11: |
171 | *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11)); |
172 | *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); |
173 | break; |
174 | |
175 | case 76: |
176 | memcpy_sse2_64(dd - 76, ss - 76); |
177 | case 12: |
178 | *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); |
179 | *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); |
180 | break; |
181 | |
182 | case 77: |
183 | memcpy_sse2_64(dd - 77, ss - 77); |
184 | case 13: |
185 | *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); |
186 | *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); |
187 | dd[-1] = ss[-1]; |
188 | break; |
189 | |
190 | case 78: |
191 | memcpy_sse2_64(dd - 78, ss - 78); |
192 | case 14: |
193 | *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); |
194 | *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); |
195 | break; |
196 | |
197 | case 79: |
198 | memcpy_sse2_64(dd - 79, ss - 79); |
199 | case 15: |
200 | *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); |
201 | *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); |
202 | break; |
203 | |
204 | case 80: |
205 | memcpy_sse2_64(dd - 80, ss - 80); |
206 | case 16: |
207 | memcpy_sse2_16(dd - 16, ss - 16); |
208 | break; |
209 | |
210 | case 81: |
211 | memcpy_sse2_64(dd - 81, ss - 81); |
212 | case 17: |
213 | memcpy_sse2_16(dd - 17, ss - 17); |
214 | dd[-1] = ss[-1]; |
215 | break; |
216 | |
217 | case 82: |
218 | memcpy_sse2_64(dd - 82, ss - 82); |
219 | case 18: |
220 | memcpy_sse2_16(dd - 18, ss - 18); |
221 | *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); |
222 | break; |
223 | |
224 | case 83: |
225 | memcpy_sse2_64(dd - 83, ss - 83); |
226 | case 19: |
227 | memcpy_sse2_16(dd - 19, ss - 19); |
228 | *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); |
229 | dd[-1] = ss[-1]; |
230 | break; |
231 | |
232 | case 84: |
233 | memcpy_sse2_64(dd - 84, ss - 84); |
234 | case 20: |
235 | memcpy_sse2_16(dd - 20, ss - 20); |
236 | *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); |
237 | break; |
238 | |
239 | case 85: |
240 | memcpy_sse2_64(dd - 85, ss - 85); |
241 | case 21: |
242 | memcpy_sse2_16(dd - 21, ss - 21); |
243 | *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); |
244 | dd[-1] = ss[-1]; |
245 | break; |
246 | |
247 | case 86: |
248 | memcpy_sse2_64(dd - 86, ss - 86); |
249 | case 22: |
250 | memcpy_sse2_16(dd - 22, ss - 22); |
251 | *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); |
252 | *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); |
253 | break; |
254 | |
255 | case 87: |
256 | memcpy_sse2_64(dd - 87, ss - 87); |
257 | case 23: |
258 | memcpy_sse2_16(dd - 23, ss - 23); |
259 | *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); |
260 | *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); |
261 | break; |
262 | |
263 | case 88: |
264 | memcpy_sse2_64(dd - 88, ss - 88); |
265 | case 24: |
266 | memcpy_sse2_16(dd - 24, ss - 24); |
267 | memcpy_sse2_16(dd - 16, ss - 16); |
268 | break; |
269 | |
270 | case 89: |
271 | memcpy_sse2_64(dd - 89, ss - 89); |
272 | case 25: |
273 | memcpy_sse2_16(dd - 25, ss - 25); |
274 | memcpy_sse2_16(dd - 16, ss - 16); |
275 | break; |
276 | |
277 | case 90: |
278 | memcpy_sse2_64(dd - 90, ss - 90); |
279 | case 26: |
280 | memcpy_sse2_16(dd - 26, ss - 26); |
281 | memcpy_sse2_16(dd - 16, ss - 16); |
282 | break; |
283 | |
284 | case 91: |
285 | memcpy_sse2_64(dd - 91, ss - 91); |
286 | case 27: |
287 | memcpy_sse2_16(dd - 27, ss - 27); |
288 | memcpy_sse2_16(dd - 16, ss - 16); |
289 | break; |
290 | |
291 | case 92: |
292 | memcpy_sse2_64(dd - 92, ss - 92); |
293 | case 28: |
294 | memcpy_sse2_16(dd - 28, ss - 28); |
295 | memcpy_sse2_16(dd - 16, ss - 16); |
296 | break; |
297 | |
298 | case 93: |
299 | memcpy_sse2_64(dd - 93, ss - 93); |
300 | case 29: |
301 | memcpy_sse2_16(dd - 29, ss - 29); |
302 | memcpy_sse2_16(dd - 16, ss - 16); |
303 | break; |
304 | |
305 | case 94: |
306 | memcpy_sse2_64(dd - 94, ss - 94); |
307 | case 30: |
308 | memcpy_sse2_16(dd - 30, ss - 30); |
309 | memcpy_sse2_16(dd - 16, ss - 16); |
310 | break; |
311 | |
312 | case 95: |
313 | memcpy_sse2_64(dd - 95, ss - 95); |
314 | case 31: |
315 | memcpy_sse2_16(dd - 31, ss - 31); |
316 | memcpy_sse2_16(dd - 16, ss - 16); |
317 | break; |
318 | |
319 | case 96: |
320 | memcpy_sse2_64(dd - 96, ss - 96); |
321 | case 32: |
322 | memcpy_sse2_32(dd - 32, ss - 32); |
323 | break; |
324 | |
325 | case 97: |
326 | memcpy_sse2_64(dd - 97, ss - 97); |
327 | case 33: |
328 | memcpy_sse2_32(dd - 33, ss - 33); |
329 | dd[-1] = ss[-1]; |
330 | break; |
331 | |
332 | case 98: |
333 | memcpy_sse2_64(dd - 98, ss - 98); |
334 | case 34: |
335 | memcpy_sse2_32(dd - 34, ss - 34); |
336 | *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); |
337 | break; |
338 | |
339 | case 99: |
340 | memcpy_sse2_64(dd - 99, ss - 99); |
341 | case 35: |
342 | memcpy_sse2_32(dd - 35, ss - 35); |
343 | *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); |
344 | dd[-1] = ss[-1]; |
345 | break; |
346 | |
347 | case 100: |
348 | memcpy_sse2_64(dd - 100, ss - 100); |
349 | case 36: |
350 | memcpy_sse2_32(dd - 36, ss - 36); |
351 | *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); |
352 | break; |
353 | |
354 | case 101: |
355 | memcpy_sse2_64(dd - 101, ss - 101); |
356 | case 37: |
357 | memcpy_sse2_32(dd - 37, ss - 37); |
358 | *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); |
359 | dd[-1] = ss[-1]; |
360 | break; |
361 | |
362 | case 102: |
363 | memcpy_sse2_64(dd - 102, ss - 102); |
364 | case 38: |
365 | memcpy_sse2_32(dd - 38, ss - 38); |
366 | *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); |
367 | *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); |
368 | break; |
369 | |
370 | case 103: |
371 | memcpy_sse2_64(dd - 103, ss - 103); |
372 | case 39: |
373 | memcpy_sse2_32(dd - 39, ss - 39); |
374 | *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); |
375 | *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); |
376 | break; |
377 | |
378 | case 104: |
379 | memcpy_sse2_64(dd - 104, ss - 104); |
380 | case 40: |
381 | memcpy_sse2_32(dd - 40, ss - 40); |
382 | *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); |
383 | break; |
384 | |
385 | case 105: |
386 | memcpy_sse2_64(dd - 105, ss - 105); |
387 | case 41: |
388 | memcpy_sse2_32(dd - 41, ss - 41); |
389 | *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); |
390 | dd[-1] = ss[-1]; |
391 | break; |
392 | |
393 | case 106: |
394 | memcpy_sse2_64(dd - 106, ss - 106); |
395 | case 42: |
396 | memcpy_sse2_32(dd - 42, ss - 42); |
397 | *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); |
398 | *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); |
399 | break; |
400 | |
401 | case 107: |
402 | memcpy_sse2_64(dd - 107, ss - 107); |
403 | case 43: |
404 | memcpy_sse2_32(dd - 43, ss - 43); |
405 | *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11)); |
406 | *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); |
407 | break; |
408 | |
409 | case 108: |
410 | memcpy_sse2_64(dd - 108, ss - 108); |
411 | case 44: |
412 | memcpy_sse2_32(dd - 44, ss - 44); |
413 | *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); |
414 | *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); |
415 | break; |
416 | |
417 | case 109: |
418 | memcpy_sse2_64(dd - 109, ss - 109); |
419 | case 45: |
420 | memcpy_sse2_32(dd - 45, ss - 45); |
421 | *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); |
422 | *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); |
423 | dd[-1] = ss[-1]; |
424 | break; |
425 | |
426 | case 110: |
427 | memcpy_sse2_64(dd - 110, ss - 110); |
428 | case 46: |
429 | memcpy_sse2_32(dd - 46, ss - 46); |
430 | *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); |
431 | *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); |
432 | break; |
433 | |
434 | case 111: |
435 | memcpy_sse2_64(dd - 111, ss - 111); |
436 | case 47: |
437 | memcpy_sse2_32(dd - 47, ss - 47); |
438 | *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); |
439 | *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); |
440 | break; |
441 | |
442 | case 112: |
443 | memcpy_sse2_64(dd - 112, ss - 112); |
444 | case 48: |
445 | memcpy_sse2_32(dd - 48, ss - 48); |
446 | memcpy_sse2_16(dd - 16, ss - 16); |
447 | break; |
448 | |
449 | case 113: |
450 | memcpy_sse2_64(dd - 113, ss - 113); |
451 | case 49: |
452 | memcpy_sse2_32(dd - 49, ss - 49); |
453 | memcpy_sse2_16(dd - 17, ss - 17); |
454 | dd[-1] = ss[-1]; |
455 | break; |
456 | |
457 | case 114: |
458 | memcpy_sse2_64(dd - 114, ss - 114); |
459 | case 50: |
460 | memcpy_sse2_32(dd - 50, ss - 50); |
461 | memcpy_sse2_16(dd - 18, ss - 18); |
462 | *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); |
463 | break; |
464 | |
465 | case 115: |
466 | memcpy_sse2_64(dd - 115, ss - 115); |
467 | case 51: |
468 | memcpy_sse2_32(dd - 51, ss - 51); |
469 | memcpy_sse2_16(dd - 19, ss - 19); |
470 | *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); |
471 | dd[-1] = ss[-1]; |
472 | break; |
473 | |
474 | case 116: |
475 | memcpy_sse2_64(dd - 116, ss - 116); |
476 | case 52: |
477 | memcpy_sse2_32(dd - 52, ss - 52); |
478 | memcpy_sse2_16(dd - 20, ss - 20); |
479 | *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); |
480 | break; |
481 | |
482 | case 117: |
483 | memcpy_sse2_64(dd - 117, ss - 117); |
484 | case 53: |
485 | memcpy_sse2_32(dd - 53, ss - 53); |
486 | memcpy_sse2_16(dd - 21, ss - 21); |
487 | *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); |
488 | dd[-1] = ss[-1]; |
489 | break; |
490 | |
491 | case 118: |
492 | memcpy_sse2_64(dd - 118, ss - 118); |
493 | case 54: |
494 | memcpy_sse2_32(dd - 54, ss - 54); |
495 | memcpy_sse2_16(dd - 22, ss - 22); |
496 | *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); |
497 | *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); |
498 | break; |
499 | |
500 | case 119: |
501 | memcpy_sse2_64(dd - 119, ss - 119); |
502 | case 55: |
503 | memcpy_sse2_32(dd - 55, ss - 55); |
504 | memcpy_sse2_16(dd - 23, ss - 23); |
505 | *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); |
506 | *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); |
507 | break; |
508 | |
509 | case 120: |
510 | memcpy_sse2_64(dd - 120, ss - 120); |
511 | case 56: |
512 | memcpy_sse2_32(dd - 56, ss - 56); |
513 | memcpy_sse2_16(dd - 24, ss - 24); |
514 | memcpy_sse2_16(dd - 16, ss - 16); |
515 | break; |
516 | |
517 | case 121: |
518 | memcpy_sse2_64(dd - 121, ss - 121); |
519 | case 57: |
520 | memcpy_sse2_32(dd - 57, ss - 57); |
521 | memcpy_sse2_16(dd - 25, ss - 25); |
522 | memcpy_sse2_16(dd - 16, ss - 16); |
523 | break; |
524 | |
525 | case 122: |
526 | memcpy_sse2_64(dd - 122, ss - 122); |
527 | case 58: |
528 | memcpy_sse2_32(dd - 58, ss - 58); |
529 | memcpy_sse2_16(dd - 26, ss - 26); |
530 | memcpy_sse2_16(dd - 16, ss - 16); |
531 | break; |
532 | |
533 | case 123: |
534 | memcpy_sse2_64(dd - 123, ss - 123); |
535 | case 59: |
536 | memcpy_sse2_32(dd - 59, ss - 59); |
537 | memcpy_sse2_16(dd - 27, ss - 27); |
538 | memcpy_sse2_16(dd - 16, ss - 16); |
539 | break; |
540 | |
541 | case 124: |
542 | memcpy_sse2_64(dd - 124, ss - 124); |
543 | case 60: |
544 | memcpy_sse2_32(dd - 60, ss - 60); |
545 | memcpy_sse2_16(dd - 28, ss - 28); |
546 | memcpy_sse2_16(dd - 16, ss - 16); |
547 | break; |
548 | |
549 | case 125: |
550 | memcpy_sse2_64(dd - 125, ss - 125); |
551 | case 61: |
552 | memcpy_sse2_32(dd - 61, ss - 61); |
553 | memcpy_sse2_16(dd - 29, ss - 29); |
554 | memcpy_sse2_16(dd - 16, ss - 16); |
555 | break; |
556 | |
557 | case 126: |
558 | memcpy_sse2_64(dd - 126, ss - 126); |
559 | case 62: |
560 | memcpy_sse2_32(dd - 62, ss - 62); |
561 | memcpy_sse2_16(dd - 30, ss - 30); |
562 | memcpy_sse2_16(dd - 16, ss - 16); |
563 | break; |
564 | |
565 | case 127: |
566 | memcpy_sse2_64(dd - 127, ss - 127); |
567 | case 63: |
568 | memcpy_sse2_32(dd - 63, ss - 63); |
569 | memcpy_sse2_16(dd - 31, ss - 31); |
570 | memcpy_sse2_16(dd - 16, ss - 16); |
571 | break; |
572 | |
573 | case 128: |
574 | memcpy_sse2_128(dd - 128, ss - 128); |
575 | break; |
576 | } |
577 | |
578 | return dst; |
579 | } |
580 | |
581 | |
582 | //--------------------------------------------------------------------- |
583 | // main routine |
584 | //--------------------------------------------------------------------- |
585 | static INLINE void* memcpy_fast(void *destination, const void *source, size_t size) |
586 | { |
587 | unsigned char *dst = (unsigned char*)destination; |
588 | const unsigned char *src = (const unsigned char*)source; |
589 | size_t padding; |
590 | |
591 | // small memory copy |
592 | if (size <= 128) { |
593 | return memcpy_tiny(dst, src, size); |
594 | } |
595 | |
596 | // align destination to 16 bytes boundary |
597 | padding = (16 - (((size_t)dst) & 15)) & 15; |
598 | |
599 | if (padding > 0) { |
600 | __m128i head = _mm_loadu_si128((const __m128i*)src); |
601 | _mm_storeu_si128((__m128i*)dst, head); |
602 | dst += padding; |
603 | src += padding; |
604 | size -= padding; |
605 | } |
606 | |
607 | // medium size copy |
608 | if (size <= 0x200000) // something around half of LL-cache size |
609 | { |
610 | __m128i c0, c1, c2, c3, c4, c5, c6, c7; |
611 | |
612 | for (; size >= 128; size -= 128) { |
613 | c0 = _mm_loadu_si128(((const __m128i*)src) + 0); |
614 | c1 = _mm_loadu_si128(((const __m128i*)src) + 1); |
615 | c2 = _mm_loadu_si128(((const __m128i*)src) + 2); |
616 | c3 = _mm_loadu_si128(((const __m128i*)src) + 3); |
617 | c4 = _mm_loadu_si128(((const __m128i*)src) + 4); |
618 | c5 = _mm_loadu_si128(((const __m128i*)src) + 5); |
619 | c6 = _mm_loadu_si128(((const __m128i*)src) + 6); |
620 | c7 = _mm_loadu_si128(((const __m128i*)src) + 7); |
621 | _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); |
622 | src += 128; |
623 | _mm_store_si128((((__m128i*)dst) + 0), c0); |
624 | _mm_store_si128((((__m128i*)dst) + 1), c1); |
625 | _mm_store_si128((((__m128i*)dst) + 2), c2); |
626 | _mm_store_si128((((__m128i*)dst) + 3), c3); |
627 | _mm_store_si128((((__m128i*)dst) + 4), c4); |
628 | _mm_store_si128((((__m128i*)dst) + 5), c5); |
629 | _mm_store_si128((((__m128i*)dst) + 6), c6); |
630 | _mm_store_si128((((__m128i*)dst) + 7), c7); |
631 | dst += 128; |
632 | } |
633 | } |
634 | else { // big memory copy |
635 | __m128i c0, c1, c2, c3, c4, c5, c6, c7; |
636 | |
637 | _mm_prefetch((const char*)(src), _MM_HINT_NTA); |
638 | |
639 | if ((((size_t)src) & 15) == 0) { // source aligned |
640 | for (; size >= 128; size -= 128) { |
641 | c0 = _mm_load_si128(((const __m128i*)src) + 0); |
642 | c1 = _mm_load_si128(((const __m128i*)src) + 1); |
643 | c2 = _mm_load_si128(((const __m128i*)src) + 2); |
644 | c3 = _mm_load_si128(((const __m128i*)src) + 3); |
645 | c4 = _mm_load_si128(((const __m128i*)src) + 4); |
646 | c5 = _mm_load_si128(((const __m128i*)src) + 5); |
647 | c6 = _mm_load_si128(((const __m128i*)src) + 6); |
648 | c7 = _mm_load_si128(((const __m128i*)src) + 7); |
649 | _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); |
650 | src += 128; |
651 | _mm_stream_si128((((__m128i*)dst) + 0), c0); |
652 | _mm_stream_si128((((__m128i*)dst) + 1), c1); |
653 | _mm_stream_si128((((__m128i*)dst) + 2), c2); |
654 | _mm_stream_si128((((__m128i*)dst) + 3), c3); |
655 | _mm_stream_si128((((__m128i*)dst) + 4), c4); |
656 | _mm_stream_si128((((__m128i*)dst) + 5), c5); |
657 | _mm_stream_si128((((__m128i*)dst) + 6), c6); |
658 | _mm_stream_si128((((__m128i*)dst) + 7), c7); |
659 | dst += 128; |
660 | } |
661 | } |
662 | else { // source unaligned |
663 | for (; size >= 128; size -= 128) { |
664 | c0 = _mm_loadu_si128(((const __m128i*)src) + 0); |
665 | c1 = _mm_loadu_si128(((const __m128i*)src) + 1); |
666 | c2 = _mm_loadu_si128(((const __m128i*)src) + 2); |
667 | c3 = _mm_loadu_si128(((const __m128i*)src) + 3); |
668 | c4 = _mm_loadu_si128(((const __m128i*)src) + 4); |
669 | c5 = _mm_loadu_si128(((const __m128i*)src) + 5); |
670 | c6 = _mm_loadu_si128(((const __m128i*)src) + 6); |
671 | c7 = _mm_loadu_si128(((const __m128i*)src) + 7); |
672 | _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); |
673 | src += 128; |
674 | _mm_stream_si128((((__m128i*)dst) + 0), c0); |
675 | _mm_stream_si128((((__m128i*)dst) + 1), c1); |
676 | _mm_stream_si128((((__m128i*)dst) + 2), c2); |
677 | _mm_stream_si128((((__m128i*)dst) + 3), c3); |
678 | _mm_stream_si128((((__m128i*)dst) + 4), c4); |
679 | _mm_stream_si128((((__m128i*)dst) + 5), c5); |
680 | _mm_stream_si128((((__m128i*)dst) + 6), c6); |
681 | _mm_stream_si128((((__m128i*)dst) + 7), c7); |
682 | dst += 128; |
683 | } |
684 | } |
685 | _mm_sfence(); |
686 | } |
687 | |
688 | memcpy_tiny(dst, src, size); |
689 | |
690 | return destination; |
691 | } |
692 | |
693 | |
694 | #endif |
695 | |
696 | |
697 | |
698 | |