1//=====================================================================
2//
3// FastMemcpy.c - skywind3000@163.com, 2015
4//
5// feature:
6// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1)
7//
8//=====================================================================
9#ifndef __FAST_MEMCPY_H__
10#define __FAST_MEMCPY_H__
11
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <emmintrin.h>
15
16
17//---------------------------------------------------------------------
18// force inline for compilers
19//---------------------------------------------------------------------
20#ifndef INLINE
21#ifdef __GNUC__
22#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
23 #define INLINE __inline__ __attribute__((always_inline))
24#else
25 #define INLINE __inline__
26#endif
27#elif defined(_MSC_VER)
28 #define INLINE __forceinline
29#elif (defined(__BORLANDC__) || defined(__WATCOMC__))
30 #define INLINE __inline
31#else
32 #define INLINE
33#endif
34#endif
35
36
37typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t;
38typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t;
39typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t;
40
41
42
43//---------------------------------------------------------------------
44// fast copy for different sizes
45//---------------------------------------------------------------------
46static INLINE void memcpy_sse2_16(void *dst, const void *src) {
47 __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
48 _mm_storeu_si128(((__m128i*)dst) + 0, m0);
49}
50
51static INLINE void memcpy_sse2_32(void *dst, const void *src) {
52 __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
53 __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1);
54 _mm_storeu_si128(((__m128i*)dst) + 0, m0);
55 _mm_storeu_si128(((__m128i*)dst) + 1, m1);
56}
57
58static INLINE void memcpy_sse2_64(void *dst, const void *src) {
59 __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
60 __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1);
61 __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2);
62 __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3);
63 _mm_storeu_si128(((__m128i*)dst) + 0, m0);
64 _mm_storeu_si128(((__m128i*)dst) + 1, m1);
65 _mm_storeu_si128(((__m128i*)dst) + 2, m2);
66 _mm_storeu_si128(((__m128i*)dst) + 3, m3);
67}
68
69static INLINE void memcpy_sse2_128(void *dst, const void *src) {
70 __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
71 __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1);
72 __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2);
73 __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3);
74 __m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4);
75 __m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5);
76 __m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6);
77 __m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7);
78 _mm_storeu_si128(((__m128i*)dst) + 0, m0);
79 _mm_storeu_si128(((__m128i*)dst) + 1, m1);
80 _mm_storeu_si128(((__m128i*)dst) + 2, m2);
81 _mm_storeu_si128(((__m128i*)dst) + 3, m3);
82 _mm_storeu_si128(((__m128i*)dst) + 4, m4);
83 _mm_storeu_si128(((__m128i*)dst) + 5, m5);
84 _mm_storeu_si128(((__m128i*)dst) + 6, m6);
85 _mm_storeu_si128(((__m128i*)dst) + 7, m7);
86}
87
88
89//---------------------------------------------------------------------
90// tiny memory copy with jump table optimized
91//---------------------------------------------------------------------
92static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) {
93 unsigned char *dd = ((unsigned char*)dst) + size;
94 const unsigned char *ss = ((const unsigned char*)src) + size;
95
96 switch (size) {
97 case 64:
98 memcpy_sse2_64(dd - 64, ss - 64);
99 case 0:
100 break;
101
102 case 65:
103 memcpy_sse2_64(dd - 65, ss - 65);
104 case 1:
105 dd[-1] = ss[-1];
106 break;
107
108 case 66:
109 memcpy_sse2_64(dd - 66, ss - 66);
110 case 2:
111 *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
112 break;
113
114 case 67:
115 memcpy_sse2_64(dd - 67, ss - 67);
116 case 3:
117 *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
118 dd[-1] = ss[-1];
119 break;
120
121 case 68:
122 memcpy_sse2_64(dd - 68, ss - 68);
123 case 4:
124 *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
125 break;
126
127 case 69:
128 memcpy_sse2_64(dd - 69, ss - 69);
129 case 5:
130 *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
131 dd[-1] = ss[-1];
132 break;
133
134 case 70:
135 memcpy_sse2_64(dd - 70, ss - 70);
136 case 6:
137 *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
138 *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
139 break;
140
141 case 71:
142 memcpy_sse2_64(dd - 71, ss - 71);
143 case 7:
144 *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
145 *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
146 break;
147
148 case 72:
149 memcpy_sse2_64(dd - 72, ss - 72);
150 case 8:
151 *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
152 break;
153
154 case 73:
155 memcpy_sse2_64(dd - 73, ss - 73);
156 case 9:
157 *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9));
158 dd[-1] = ss[-1];
159 break;
160
161 case 74:
162 memcpy_sse2_64(dd - 74, ss - 74);
163 case 10:
164 *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10));
165 *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
166 break;
167
168 case 75:
169 memcpy_sse2_64(dd - 75, ss - 75);
170 case 11:
171 *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11));
172 *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
173 break;
174
175 case 76:
176 memcpy_sse2_64(dd - 76, ss - 76);
177 case 12:
178 *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12));
179 *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
180 break;
181
182 case 77:
183 memcpy_sse2_64(dd - 77, ss - 77);
184 case 13:
185 *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13));
186 *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
187 dd[-1] = ss[-1];
188 break;
189
190 case 78:
191 memcpy_sse2_64(dd - 78, ss - 78);
192 case 14:
193 *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14));
194 *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
195 break;
196
197 case 79:
198 memcpy_sse2_64(dd - 79, ss - 79);
199 case 15:
200 *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15));
201 *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
202 break;
203
204 case 80:
205 memcpy_sse2_64(dd - 80, ss - 80);
206 case 16:
207 memcpy_sse2_16(dd - 16, ss - 16);
208 break;
209
210 case 81:
211 memcpy_sse2_64(dd - 81, ss - 81);
212 case 17:
213 memcpy_sse2_16(dd - 17, ss - 17);
214 dd[-1] = ss[-1];
215 break;
216
217 case 82:
218 memcpy_sse2_64(dd - 82, ss - 82);
219 case 18:
220 memcpy_sse2_16(dd - 18, ss - 18);
221 *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
222 break;
223
224 case 83:
225 memcpy_sse2_64(dd - 83, ss - 83);
226 case 19:
227 memcpy_sse2_16(dd - 19, ss - 19);
228 *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
229 dd[-1] = ss[-1];
230 break;
231
232 case 84:
233 memcpy_sse2_64(dd - 84, ss - 84);
234 case 20:
235 memcpy_sse2_16(dd - 20, ss - 20);
236 *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
237 break;
238
239 case 85:
240 memcpy_sse2_64(dd - 85, ss - 85);
241 case 21:
242 memcpy_sse2_16(dd - 21, ss - 21);
243 *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
244 dd[-1] = ss[-1];
245 break;
246
247 case 86:
248 memcpy_sse2_64(dd - 86, ss - 86);
249 case 22:
250 memcpy_sse2_16(dd - 22, ss - 22);
251 *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
252 *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
253 break;
254
255 case 87:
256 memcpy_sse2_64(dd - 87, ss - 87);
257 case 23:
258 memcpy_sse2_16(dd - 23, ss - 23);
259 *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
260 *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
261 break;
262
263 case 88:
264 memcpy_sse2_64(dd - 88, ss - 88);
265 case 24:
266 memcpy_sse2_16(dd - 24, ss - 24);
267 memcpy_sse2_16(dd - 16, ss - 16);
268 break;
269
270 case 89:
271 memcpy_sse2_64(dd - 89, ss - 89);
272 case 25:
273 memcpy_sse2_16(dd - 25, ss - 25);
274 memcpy_sse2_16(dd - 16, ss - 16);
275 break;
276
277 case 90:
278 memcpy_sse2_64(dd - 90, ss - 90);
279 case 26:
280 memcpy_sse2_16(dd - 26, ss - 26);
281 memcpy_sse2_16(dd - 16, ss - 16);
282 break;
283
284 case 91:
285 memcpy_sse2_64(dd - 91, ss - 91);
286 case 27:
287 memcpy_sse2_16(dd - 27, ss - 27);
288 memcpy_sse2_16(dd - 16, ss - 16);
289 break;
290
291 case 92:
292 memcpy_sse2_64(dd - 92, ss - 92);
293 case 28:
294 memcpy_sse2_16(dd - 28, ss - 28);
295 memcpy_sse2_16(dd - 16, ss - 16);
296 break;
297
298 case 93:
299 memcpy_sse2_64(dd - 93, ss - 93);
300 case 29:
301 memcpy_sse2_16(dd - 29, ss - 29);
302 memcpy_sse2_16(dd - 16, ss - 16);
303 break;
304
305 case 94:
306 memcpy_sse2_64(dd - 94, ss - 94);
307 case 30:
308 memcpy_sse2_16(dd - 30, ss - 30);
309 memcpy_sse2_16(dd - 16, ss - 16);
310 break;
311
312 case 95:
313 memcpy_sse2_64(dd - 95, ss - 95);
314 case 31:
315 memcpy_sse2_16(dd - 31, ss - 31);
316 memcpy_sse2_16(dd - 16, ss - 16);
317 break;
318
319 case 96:
320 memcpy_sse2_64(dd - 96, ss - 96);
321 case 32:
322 memcpy_sse2_32(dd - 32, ss - 32);
323 break;
324
325 case 97:
326 memcpy_sse2_64(dd - 97, ss - 97);
327 case 33:
328 memcpy_sse2_32(dd - 33, ss - 33);
329 dd[-1] = ss[-1];
330 break;
331
332 case 98:
333 memcpy_sse2_64(dd - 98, ss - 98);
334 case 34:
335 memcpy_sse2_32(dd - 34, ss - 34);
336 *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
337 break;
338
339 case 99:
340 memcpy_sse2_64(dd - 99, ss - 99);
341 case 35:
342 memcpy_sse2_32(dd - 35, ss - 35);
343 *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
344 dd[-1] = ss[-1];
345 break;
346
347 case 100:
348 memcpy_sse2_64(dd - 100, ss - 100);
349 case 36:
350 memcpy_sse2_32(dd - 36, ss - 36);
351 *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
352 break;
353
354 case 101:
355 memcpy_sse2_64(dd - 101, ss - 101);
356 case 37:
357 memcpy_sse2_32(dd - 37, ss - 37);
358 *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
359 dd[-1] = ss[-1];
360 break;
361
362 case 102:
363 memcpy_sse2_64(dd - 102, ss - 102);
364 case 38:
365 memcpy_sse2_32(dd - 38, ss - 38);
366 *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
367 *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
368 break;
369
370 case 103:
371 memcpy_sse2_64(dd - 103, ss - 103);
372 case 39:
373 memcpy_sse2_32(dd - 39, ss - 39);
374 *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
375 *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
376 break;
377
378 case 104:
379 memcpy_sse2_64(dd - 104, ss - 104);
380 case 40:
381 memcpy_sse2_32(dd - 40, ss - 40);
382 *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
383 break;
384
385 case 105:
386 memcpy_sse2_64(dd - 105, ss - 105);
387 case 41:
388 memcpy_sse2_32(dd - 41, ss - 41);
389 *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9));
390 dd[-1] = ss[-1];
391 break;
392
393 case 106:
394 memcpy_sse2_64(dd - 106, ss - 106);
395 case 42:
396 memcpy_sse2_32(dd - 42, ss - 42);
397 *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10));
398 *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
399 break;
400
401 case 107:
402 memcpy_sse2_64(dd - 107, ss - 107);
403 case 43:
404 memcpy_sse2_32(dd - 43, ss - 43);
405 *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11));
406 *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
407 break;
408
409 case 108:
410 memcpy_sse2_64(dd - 108, ss - 108);
411 case 44:
412 memcpy_sse2_32(dd - 44, ss - 44);
413 *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12));
414 *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
415 break;
416
417 case 109:
418 memcpy_sse2_64(dd - 109, ss - 109);
419 case 45:
420 memcpy_sse2_32(dd - 45, ss - 45);
421 *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13));
422 *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
423 dd[-1] = ss[-1];
424 break;
425
426 case 110:
427 memcpy_sse2_64(dd - 110, ss - 110);
428 case 46:
429 memcpy_sse2_32(dd - 46, ss - 46);
430 *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14));
431 *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
432 break;
433
434 case 111:
435 memcpy_sse2_64(dd - 111, ss - 111);
436 case 47:
437 memcpy_sse2_32(dd - 47, ss - 47);
438 *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15));
439 *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
440 break;
441
442 case 112:
443 memcpy_sse2_64(dd - 112, ss - 112);
444 case 48:
445 memcpy_sse2_32(dd - 48, ss - 48);
446 memcpy_sse2_16(dd - 16, ss - 16);
447 break;
448
449 case 113:
450 memcpy_sse2_64(dd - 113, ss - 113);
451 case 49:
452 memcpy_sse2_32(dd - 49, ss - 49);
453 memcpy_sse2_16(dd - 17, ss - 17);
454 dd[-1] = ss[-1];
455 break;
456
457 case 114:
458 memcpy_sse2_64(dd - 114, ss - 114);
459 case 50:
460 memcpy_sse2_32(dd - 50, ss - 50);
461 memcpy_sse2_16(dd - 18, ss - 18);
462 *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
463 break;
464
465 case 115:
466 memcpy_sse2_64(dd - 115, ss - 115);
467 case 51:
468 memcpy_sse2_32(dd - 51, ss - 51);
469 memcpy_sse2_16(dd - 19, ss - 19);
470 *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
471 dd[-1] = ss[-1];
472 break;
473
474 case 116:
475 memcpy_sse2_64(dd - 116, ss - 116);
476 case 52:
477 memcpy_sse2_32(dd - 52, ss - 52);
478 memcpy_sse2_16(dd - 20, ss - 20);
479 *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
480 break;
481
482 case 117:
483 memcpy_sse2_64(dd - 117, ss - 117);
484 case 53:
485 memcpy_sse2_32(dd - 53, ss - 53);
486 memcpy_sse2_16(dd - 21, ss - 21);
487 *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
488 dd[-1] = ss[-1];
489 break;
490
491 case 118:
492 memcpy_sse2_64(dd - 118, ss - 118);
493 case 54:
494 memcpy_sse2_32(dd - 54, ss - 54);
495 memcpy_sse2_16(dd - 22, ss - 22);
496 *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
497 *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
498 break;
499
500 case 119:
501 memcpy_sse2_64(dd - 119, ss - 119);
502 case 55:
503 memcpy_sse2_32(dd - 55, ss - 55);
504 memcpy_sse2_16(dd - 23, ss - 23);
505 *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
506 *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
507 break;
508
509 case 120:
510 memcpy_sse2_64(dd - 120, ss - 120);
511 case 56:
512 memcpy_sse2_32(dd - 56, ss - 56);
513 memcpy_sse2_16(dd - 24, ss - 24);
514 memcpy_sse2_16(dd - 16, ss - 16);
515 break;
516
517 case 121:
518 memcpy_sse2_64(dd - 121, ss - 121);
519 case 57:
520 memcpy_sse2_32(dd - 57, ss - 57);
521 memcpy_sse2_16(dd - 25, ss - 25);
522 memcpy_sse2_16(dd - 16, ss - 16);
523 break;
524
525 case 122:
526 memcpy_sse2_64(dd - 122, ss - 122);
527 case 58:
528 memcpy_sse2_32(dd - 58, ss - 58);
529 memcpy_sse2_16(dd - 26, ss - 26);
530 memcpy_sse2_16(dd - 16, ss - 16);
531 break;
532
533 case 123:
534 memcpy_sse2_64(dd - 123, ss - 123);
535 case 59:
536 memcpy_sse2_32(dd - 59, ss - 59);
537 memcpy_sse2_16(dd - 27, ss - 27);
538 memcpy_sse2_16(dd - 16, ss - 16);
539 break;
540
541 case 124:
542 memcpy_sse2_64(dd - 124, ss - 124);
543 case 60:
544 memcpy_sse2_32(dd - 60, ss - 60);
545 memcpy_sse2_16(dd - 28, ss - 28);
546 memcpy_sse2_16(dd - 16, ss - 16);
547 break;
548
549 case 125:
550 memcpy_sse2_64(dd - 125, ss - 125);
551 case 61:
552 memcpy_sse2_32(dd - 61, ss - 61);
553 memcpy_sse2_16(dd - 29, ss - 29);
554 memcpy_sse2_16(dd - 16, ss - 16);
555 break;
556
557 case 126:
558 memcpy_sse2_64(dd - 126, ss - 126);
559 case 62:
560 memcpy_sse2_32(dd - 62, ss - 62);
561 memcpy_sse2_16(dd - 30, ss - 30);
562 memcpy_sse2_16(dd - 16, ss - 16);
563 break;
564
565 case 127:
566 memcpy_sse2_64(dd - 127, ss - 127);
567 case 63:
568 memcpy_sse2_32(dd - 63, ss - 63);
569 memcpy_sse2_16(dd - 31, ss - 31);
570 memcpy_sse2_16(dd - 16, ss - 16);
571 break;
572
573 case 128:
574 memcpy_sse2_128(dd - 128, ss - 128);
575 break;
576 }
577
578 return dst;
579}
580
581
582//---------------------------------------------------------------------
583// main routine
584//---------------------------------------------------------------------
585static INLINE void* memcpy_fast(void *destination, const void *source, size_t size)
586{
587 unsigned char *dst = (unsigned char*)destination;
588 const unsigned char *src = (const unsigned char*)source;
589 size_t padding;
590
591 // small memory copy
592 if (size <= 128) {
593 return memcpy_tiny(dst, src, size);
594 }
595
596 // align destination to 16 bytes boundary
597 padding = (16 - (((size_t)dst) & 15)) & 15;
598
599 if (padding > 0) {
600 __m128i head = _mm_loadu_si128((const __m128i*)src);
601 _mm_storeu_si128((__m128i*)dst, head);
602 dst += padding;
603 src += padding;
604 size -= padding;
605 }
606
607 // medium size copy
608 if (size <= 0x200000) // something around half of LL-cache size
609 {
610 __m128i c0, c1, c2, c3, c4, c5, c6, c7;
611
612 for (; size >= 128; size -= 128) {
613 c0 = _mm_loadu_si128(((const __m128i*)src) + 0);
614 c1 = _mm_loadu_si128(((const __m128i*)src) + 1);
615 c2 = _mm_loadu_si128(((const __m128i*)src) + 2);
616 c3 = _mm_loadu_si128(((const __m128i*)src) + 3);
617 c4 = _mm_loadu_si128(((const __m128i*)src) + 4);
618 c5 = _mm_loadu_si128(((const __m128i*)src) + 5);
619 c6 = _mm_loadu_si128(((const __m128i*)src) + 6);
620 c7 = _mm_loadu_si128(((const __m128i*)src) + 7);
621 _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
622 src += 128;
623 _mm_store_si128((((__m128i*)dst) + 0), c0);
624 _mm_store_si128((((__m128i*)dst) + 1), c1);
625 _mm_store_si128((((__m128i*)dst) + 2), c2);
626 _mm_store_si128((((__m128i*)dst) + 3), c3);
627 _mm_store_si128((((__m128i*)dst) + 4), c4);
628 _mm_store_si128((((__m128i*)dst) + 5), c5);
629 _mm_store_si128((((__m128i*)dst) + 6), c6);
630 _mm_store_si128((((__m128i*)dst) + 7), c7);
631 dst += 128;
632 }
633 }
634 else { // big memory copy
635 __m128i c0, c1, c2, c3, c4, c5, c6, c7;
636
637 _mm_prefetch((const char*)(src), _MM_HINT_NTA);
638
639 if ((((size_t)src) & 15) == 0) { // source aligned
640 for (; size >= 128; size -= 128) {
641 c0 = _mm_load_si128(((const __m128i*)src) + 0);
642 c1 = _mm_load_si128(((const __m128i*)src) + 1);
643 c2 = _mm_load_si128(((const __m128i*)src) + 2);
644 c3 = _mm_load_si128(((const __m128i*)src) + 3);
645 c4 = _mm_load_si128(((const __m128i*)src) + 4);
646 c5 = _mm_load_si128(((const __m128i*)src) + 5);
647 c6 = _mm_load_si128(((const __m128i*)src) + 6);
648 c7 = _mm_load_si128(((const __m128i*)src) + 7);
649 _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
650 src += 128;
651 _mm_stream_si128((((__m128i*)dst) + 0), c0);
652 _mm_stream_si128((((__m128i*)dst) + 1), c1);
653 _mm_stream_si128((((__m128i*)dst) + 2), c2);
654 _mm_stream_si128((((__m128i*)dst) + 3), c3);
655 _mm_stream_si128((((__m128i*)dst) + 4), c4);
656 _mm_stream_si128((((__m128i*)dst) + 5), c5);
657 _mm_stream_si128((((__m128i*)dst) + 6), c6);
658 _mm_stream_si128((((__m128i*)dst) + 7), c7);
659 dst += 128;
660 }
661 }
662 else { // source unaligned
663 for (; size >= 128; size -= 128) {
664 c0 = _mm_loadu_si128(((const __m128i*)src) + 0);
665 c1 = _mm_loadu_si128(((const __m128i*)src) + 1);
666 c2 = _mm_loadu_si128(((const __m128i*)src) + 2);
667 c3 = _mm_loadu_si128(((const __m128i*)src) + 3);
668 c4 = _mm_loadu_si128(((const __m128i*)src) + 4);
669 c5 = _mm_loadu_si128(((const __m128i*)src) + 5);
670 c6 = _mm_loadu_si128(((const __m128i*)src) + 6);
671 c7 = _mm_loadu_si128(((const __m128i*)src) + 7);
672 _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
673 src += 128;
674 _mm_stream_si128((((__m128i*)dst) + 0), c0);
675 _mm_stream_si128((((__m128i*)dst) + 1), c1);
676 _mm_stream_si128((((__m128i*)dst) + 2), c2);
677 _mm_stream_si128((((__m128i*)dst) + 3), c3);
678 _mm_stream_si128((((__m128i*)dst) + 4), c4);
679 _mm_stream_si128((((__m128i*)dst) + 5), c5);
680 _mm_stream_si128((((__m128i*)dst) + 6), c6);
681 _mm_stream_si128((((__m128i*)dst) + 7), c7);
682 dst += 128;
683 }
684 }
685 _mm_sfence();
686 }
687
688 memcpy_tiny(dst, src, size);
689
690 return destination;
691}
692
693
694#endif
695
696
697
698