1#pragma once
2
3#include <string.h>
4
5#ifdef __SSE2__
6#include <emmintrin.h>
7
8
/** memcpy can perform suboptimally when both of the following conditions are met:
 * 1. The size of the memory region is relatively small (approximately, under 50 bytes).
 * 2. The size of the memory region is not known at compile time.
 *
 * In that case, memcpy is suboptimal for the following reasons:
 * 1. The function is not inlined.
 * 2. Much time and many instructions are spent processing the "tails" of the data.
 *
 * There are cases when the function can be implemented in a more optimal way with the help of some assumptions.
 * One of those assumptions is the ability to read and write some number of bytes past the end of the passed memory regions.
 * Under that assumption, there is no need for complicated code to process the tails of the data: the copy can always be done in big chunks.
 *
 * This case is typical, for example, when many small pieces of data are gathered into a single contiguous piece of memory in a loop,
 * because each subsequent copy overwrites the excess data written after the previous copy.
 *
 * The assumption that the memory region is small enough allows us not to unroll the loop.
 * This is slower when the memory region is actually big.
 *
 * Use with caution.
 */
29
namespace detail
{
    /** Copies 'n' bytes from 'src' to 'dst' in whole 16-byte chunks, using unaligned SSE2 loads and stores.
     *
     * The byte count is rounded up to the next multiple of 16, so up to 15 bytes past the end of 'src'
     * are read and up to 15 bytes past the end of 'dst' are overwritten.
     * Nothing is copied when 'n' is zero or negative.
     *
     * 'n' is signed on purpose: the counter is decremented by a full chunk on every step and is allowed
     * to go below zero on the last one.
     */
    inline void memcpySmallAllowReadWriteOverflow15Impl(char * __restrict dst, const char * __restrict src, ssize_t n)
    {
        for (ssize_t remaining = n; remaining > 0; remaining -= 16)
        {
            /// Load first, then store: one whole 16-byte chunk per iteration, no tail handling.
            const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), chunk);

            src += 16;
            dst += 16;
        }
    }
}
45
46/** Works under assumption, that it's possible to read up to 15 excessive bytes after end of 'src' region
47 * and to write any garbage into up to 15 bytes after end of 'dst' region.
48 */
49inline void memcpySmallAllowReadWriteOverflow15(void * __restrict dst, const void * __restrict src, size_t n)
50{
51 detail::memcpySmallAllowReadWriteOverflow15Impl(reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src), n);
52}
53
/** NOTE There was also a function that assumed that any bytes inside the same memory page as 'src' could be read.
 * That function was unused, and it also required special handling for Valgrind and ASan.
 */
57
58#else /// Implementation for other platforms.
59
/** Fallback for builds without SSE2: copies exactly 'n' bytes from 'src' to 'dst' with the plain memcpy.
 * This variant performs no chunked over-read or over-write; it only keeps the same name and signature.
 */
inline void memcpySmallAllowReadWriteOverflow15(void * __restrict dst, const void * __restrict src, size_t n)
{
    ::memcpy(dst, src, n);
}
64
65#endif
66