1#include "llama-mmap.h"
2
3#include "llama-impl.h"
4
5#include "ggml.h"
6
7#include <cstring>
8#include <climits>
9#include <stdexcept>
10#include <cerrno>
11#include <algorithm>
12
13#ifdef __has_include
14 #if __has_include(<unistd.h>)
15 #include <unistd.h>
16 #if defined(_POSIX_MAPPED_FILES)
17 #include <sys/mman.h>
18 #include <fcntl.h>
19 #endif
20 #if defined(_POSIX_MEMLOCK_RANGE)
21 #include <sys/resource.h>
22 #endif
23 #endif
24#endif
25
26#if defined(_WIN32)
27 #define WIN32_LEAN_AND_MEAN
28 #ifndef NOMINMAX
29 #define NOMINMAX
30 #endif
31 #include <windows.h>
32 #ifndef PATH_MAX
33 #define PATH_MAX MAX_PATH
34 #endif
35 #include <io.h>
36#endif
37
38#if defined(__APPLE__)
39#include <TargetConditionals.h>
40#endif
41
42// TODO: consider moving to llama-impl.h if needed in more places
43#if defined(_WIN32)
44static std::string llama_format_win_err(DWORD err) {
45 LPSTR buf;
46 size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
47 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
48 if (!size) {
49 return "FormatMessageA failed";
50 }
51 std::string ret(buf, size);
52 LocalFree(buf);
53 return ret;
54}
55#endif
56
57// llama_file
58
59struct llama_file::impl {
60#if defined(_WIN32)
61 HANDLE fp_win32;
62 std::string GetErrorMessageWin32(DWORD error_code) const {
63 std::string ret;
64 LPSTR lpMsgBuf = NULL;
65 DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
66 NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
67 if (!bufLen) {
68 ret = format("Win32 error code: %lx", error_code);
69 } else {
70 ret = lpMsgBuf;
71 LocalFree(lpMsgBuf);
72 }
73
74 return ret;
75 }
76
77 impl(const char * fname, const char * mode) {
78 fp = ggml_fopen(fname, mode);
79 if (fp == NULL) {
80 throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
81 }
82 fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
83 seek(0, SEEK_END);
84 size = tell();
85 seek(0, SEEK_SET);
86 }
87
88 size_t tell() const {
89 LARGE_INTEGER li;
90 li.QuadPart = 0;
91 BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
92 if (!ret) {
93 throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
94 }
95
96 return li.QuadPart;
97 }
98
99 void seek(size_t offset, int whence) const {
100 static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
101 static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
102 static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
103
104 LARGE_INTEGER li;
105 li.QuadPart = offset;
106 BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
107 if (!ret) {
108 throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
109 }
110 }
111
112 void read_raw(void * ptr, size_t len) const {
113 size_t bytes_read = 0;
114 while (bytes_read < len) {
115 size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
116 DWORD chunk_read = 0;
117 BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
118 if (!result) {
119 throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
120 }
121 if (chunk_read < chunk_size || chunk_read == 0) {
122 throw std::runtime_error("unexpectedly reached end of file");
123 }
124
125 bytes_read += chunk_read;
126 }
127 }
128
129 uint32_t read_u32() const {
130 uint32_t val;
131 read_raw(&val, sizeof(val));
132 return val;
133 }
134
135 void write_raw(const void * ptr, size_t len) const {
136 size_t bytes_written = 0;
137 while (bytes_written < len) {
138 size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
139 DWORD chunk_written = 0;
140 BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
141 if (!result) {
142 throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
143 }
144 if (chunk_written < chunk_size || chunk_written == 0) {
145 throw std::runtime_error("unexpectedly failed to write bytes");
146 }
147
148 bytes_written += chunk_written;
149 }
150 }
151
152 void write_u32(uint32_t val) const {
153 write_raw(&val, sizeof(val));
154 }
155
156 ~impl() {
157 if (fp) {
158 std::fclose(fp);
159 }
160 }
161#else
162 impl(const char * fname, const char * mode) {
163 fp = ggml_fopen(fname, mode);
164 if (fp == NULL) {
165 throw std::runtime_error(format(fmt: "failed to open %s: %s", fname, strerror(errno)));
166 }
167 seek(offset: 0, SEEK_END);
168 size = tell();
169 seek(offset: 0, SEEK_SET);
170 }
171
172 size_t tell() const {
173// TODO: this ifdef is never true?
174#ifdef _WIN32
175 __int64 ret = _ftelli64(fp);
176#else
177 long ret = std::ftell(stream: fp);
178#endif
179 if (ret == -1) {
180 throw std::runtime_error(format(fmt: "ftell error: %s", strerror(errno)));
181 }
182
183 return (size_t) ret;
184 }
185
186 void seek(size_t offset, int whence) const {
187// TODO: this ifdef is never true?
188#ifdef _WIN32
189 int ret = _fseeki64(fp, (__int64) offset, whence);
190#else
191 int ret = std::fseek(stream: fp, off: (long) offset, whence: whence);
192#endif
193 if (ret != 0) {
194 throw std::runtime_error(format(fmt: "seek error: %s", strerror(errno)));
195 }
196 }
197
198 void read_raw(void * ptr, size_t len) const {
199 if (len == 0) {
200 return;
201 }
202 errno = 0;
203 std::size_t ret = std::fread(ptr: ptr, size: len, n: 1, stream: fp);
204 if (ferror(stream: fp)) {
205 throw std::runtime_error(format(fmt: "read error: %s", strerror(errno)));
206 }
207 if (ret != 1) {
208 throw std::runtime_error("unexpectedly reached end of file");
209 }
210 }
211
212 uint32_t read_u32() const {
213 uint32_t ret;
214 read_raw(ptr: &ret, len: sizeof(ret));
215 return ret;
216 }
217
218 void write_raw(const void * ptr, size_t len) const {
219 if (len == 0) {
220 return;
221 }
222 errno = 0;
223 size_t ret = std::fwrite(ptr: ptr, size: len, n: 1, s: fp);
224 if (ret != 1) {
225 throw std::runtime_error(format(fmt: "write error: %s", strerror(errno)));
226 }
227 }
228
229 void write_u32(uint32_t val) const {
230 write_raw(ptr: &val, len: sizeof(val));
231 }
232
233 ~impl() {
234 if (fp) {
235 std::fclose(stream: fp);
236 }
237 }
238#endif
239
240 FILE * fp;
241 size_t size;
242};
243
244llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(args&: fname, args&: mode)) {}
245llama_file::~llama_file() = default;
246
247size_t llama_file::tell() const { return pimpl->tell(); }
248size_t llama_file::size() const { return pimpl->size; }
249
250int llama_file::file_id() const {
251#ifdef _WIN32
252 return _fileno(pimpl->fp);
253#else
254#if defined(fileno)
255 return fileno(pimpl->fp);
256#else
257 return ::fileno(stream: pimpl->fp);
258#endif
259#endif
260}
261
262void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
263void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
264
265uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
266
267void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
268void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
269
270// llama_mmap
271
272struct llama_mmap::impl {
273#ifdef _POSIX_MAPPED_FILES
274 std::vector<std::pair<size_t, size_t>> mapped_fragments;
275
276 impl(struct llama_file * file, size_t prefetch, bool numa) {
277 size = file->size();
278 int fd = file->file_id();
279 int flags = MAP_SHARED;
280 if (numa) { prefetch = 0; }
281#ifdef __linux__
282 if (posix_fadvise(fd: fd, offset: 0, len: 0, POSIX_FADV_SEQUENTIAL)) {
283 LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
284 strerror(errno));
285 }
286 if (prefetch) { flags |= MAP_POPULATE; }
287#endif
288 addr = mmap(NULL, len: file->size(), PROT_READ, flags: flags, fd: fd, offset: 0);
289 if (addr == MAP_FAILED) {
290 throw std::runtime_error(format(fmt: "mmap failed: %s", strerror(errno)));
291 }
292
293 if (prefetch > 0) {
294 if (posix_madvise(addr: addr, len: std::min(a: file->size(), b: prefetch), POSIX_MADV_WILLNEED)) {
295 LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
296 strerror(errno));
297 }
298 }
299 if (numa) {
300 if (posix_madvise(addr: addr, len: file->size(), POSIX_MADV_RANDOM)) {
301 LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
302 strerror(errno));
303 }
304 }
305
306 mapped_fragments.emplace_back(args: 0, args: file->size());
307 }
308
309 static void align_range(size_t * first, size_t * last, size_t page_size) {
310 size_t offset_in_page = *first & (page_size - 1);
311 size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
312 *first += offset_to_page;
313
314 *last = *last & ~(page_size - 1);
315
316 if (*last <= *first) {
317 *last = *first;
318 }
319 }
320
321 void unmap_fragment(size_t first, size_t last) {
322 int page_size = sysconf(_SC_PAGESIZE);
323 align_range(first: &first, last: &last, page_size);
324 size_t len = last - first;
325
326 if (len == 0) {
327 return;
328 }
329
330 GGML_ASSERT(first % page_size == 0);
331 GGML_ASSERT(last % page_size == 0);
332 GGML_ASSERT(last > first);
333
334 void * next_page_start = (uint8_t *) addr + first;
335
336 if (munmap(addr: next_page_start, len: len)) {
337 LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
338 }
339
340 std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
341 for (const auto & frag : mapped_fragments) {
342 if (frag.first < first && frag.second > last) {
343 new_mapped_fragments.emplace_back(args: frag.first, args&: first);
344 new_mapped_fragments.emplace_back(args&: last, args: frag.second);
345 } else if (frag.first < first && frag.second > first) {
346 new_mapped_fragments.emplace_back(args: frag.first, args&: first);
347 } else if (frag.first < last && frag.second > last) {
348 new_mapped_fragments.emplace_back(args&: last, args: frag.second);
349 } else if (frag.first >= first && frag.second <= last) {
350 } else {
351 new_mapped_fragments.push_back(x: frag);
352 }
353 }
354 mapped_fragments = std::move(new_mapped_fragments);
355 }
356
357 ~impl() {
358 for (const auto & frag : mapped_fragments) {
359 if (munmap(addr: (char *) addr + frag.first, len: frag.second - frag.first)) {
360 LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
361 }
362 }
363 }
364#elif defined(_WIN32)
365 impl(struct llama_file * file, size_t prefetch, bool numa) {
366 GGML_UNUSED(numa);
367
368 size = file->size();
369
370 HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id());
371
372 HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
373
374 if (hMapping == NULL) {
375 DWORD error = GetLastError();
376 throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
377 }
378
379 addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
380 DWORD error = GetLastError();
381 CloseHandle(hMapping);
382
383 if (addr == NULL) {
384 throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
385 }
386
387 if (prefetch > 0) {
388#if _WIN32_WINNT >= 0x602
389 BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
390 HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
391
392 pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
393
394 if (pPrefetchVirtualMemory) {
395 WIN32_MEMORY_RANGE_ENTRY range;
396 range.VirtualAddress = addr;
397 range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
398 if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
399 LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
400 llama_format_win_err(GetLastError()).c_str());
401 }
402 }
403#else
404 LLAMA_LOG_DEBUG("skipping PrefetchVirtualMemory because _WIN32_WINNT < 0x602\n");
405#endif
406 }
407 }
408
409 void unmap_fragment(size_t first, size_t last) {
410 GGML_UNUSED(first);
411 GGML_UNUSED(last);
412 }
413
414 ~impl() {
415 if (!UnmapViewOfFile(addr)) {
416 LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
417 llama_format_win_err(GetLastError()).c_str());
418 }
419 }
420#else
421 impl(struct llama_file * file, size_t prefetch, bool numa) {
422 GGML_UNUSED(file);
423 GGML_UNUSED(prefetch);
424 GGML_UNUSED(numa);
425
426 throw std::runtime_error("mmap not supported");
427 }
428
429 void unmap_fragment(size_t first, size_t last) {
430 GGML_UNUSED(first);
431 GGML_UNUSED(last);
432
433 throw std::runtime_error("mmap not supported");
434 }
435#endif
436
437 void * addr;
438 size_t size;
439};
440
441llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch, bool numa) : pimpl(std::make_unique<impl>(args&: file, args&: prefetch, args&: numa)) {}
442llama_mmap::~llama_mmap() = default;
443
444size_t llama_mmap::size() const { return pimpl->size; }
445void * llama_mmap::addr() const { return pimpl->addr; }
446
447void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragment(first, last); }
448
449#if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32)
450const bool llama_mmap::SUPPORTED = true;
451#else
452const bool llama_mmap::SUPPORTED = false;
453#endif
454
455// llama_mlock
456
457struct llama_mlock::impl {
458#ifdef _POSIX_MEMLOCK_RANGE
459 static size_t lock_granularity() {
460 return (size_t) sysconf(_SC_PAGESIZE);
461 }
462
463 bool raw_lock(const void * addr, size_t size) const {
464 if (!mlock(addr: addr, len: size)) {
465 return true;
466 }
467
468#ifdef __APPLE__
469#define MLOCK_SUGGESTION \
470 "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
471 "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
472#else
473#define MLOCK_SUGGESTION \
474 "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
475#endif
476
477 char* errmsg = std::strerror(errno);
478 bool suggest = (errno == ENOMEM);
479#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
480 // visionOS/tvOS dont't support RLIMIT_MEMLOCK
481 // Skip resource limit checks on visionOS/tvOS
482 suggest = false;
483#else
484 struct rlimit lock_limit;
485 if (suggest && getrlimit(RLIMIT_MEMLOCK, rlimits: &lock_limit)) {
486 suggest = false;
487 }
488 if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
489 suggest = false;
490 }
491#endif
492
493 LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
494 size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
495 return false;
496 }
497
498 static void raw_unlock(void * addr, size_t size) {
499 if (munlock(addr: addr, len: size)) {
500 LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
501 }
502 }
503#elif defined(_WIN32)
504 static size_t lock_granularity() {
505 SYSTEM_INFO si;
506 GetSystemInfo(&si);
507 return (size_t) si.dwPageSize;
508 }
509
510 bool raw_lock(void * ptr, size_t len) const {
511 for (int tries = 1; ; tries++) {
512 if (VirtualLock(ptr, len)) {
513 return true;
514 }
515 if (tries == 2) {
516 LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
517 len, size, llama_format_win_err(GetLastError()).c_str());
518 return false;
519 }
520
521 SIZE_T min_ws_size, max_ws_size;
522 if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
523 LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
524 llama_format_win_err(GetLastError()).c_str());
525 return false;
526 }
527 size_t increment = len + 1048576;
528 min_ws_size += increment;
529 max_ws_size += increment;
530 if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
531 LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
532 llama_format_win_err(GetLastError()).c_str());
533 return false;
534 }
535 }
536 }
537
538 static void raw_unlock(void * ptr, size_t len) {
539 if (!VirtualUnlock(ptr, len)) {
540 LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
541 llama_format_win_err(GetLastError()).c_str());
542 }
543 }
544#else
545 static size_t lock_granularity() {
546 return (size_t) 65536;
547 }
548
549 bool raw_lock(const void * addr, size_t len) const {
550 LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
551 return false;
552 }
553
554 static void raw_unlock(const void * addr, size_t len) {}
555#endif
556
557 impl() : addr(NULL), size(0), failed_already(false) {}
558
559 void init(void * ptr) {
560 GGML_ASSERT(addr == NULL && size == 0);
561 addr = ptr;
562 }
563
564 void grow_to(size_t target_size) {
565 GGML_ASSERT(addr);
566 if (failed_already) {
567 return;
568 }
569 size_t granularity = lock_granularity();
570 target_size = (target_size + granularity - 1) & ~(granularity - 1);
571 if (target_size > size) {
572 if (raw_lock(addr: (uint8_t *) addr + size, size: target_size - size)) {
573 size = target_size;
574 } else {
575 failed_already = true;
576 }
577 }
578 }
579
580 void * addr;
581 size_t size;
582
583 bool failed_already;
584};
585
586llama_mlock::llama_mlock() : pimpl(std::make_unique<impl>()) {}
587llama_mlock::~llama_mlock() = default;
588
589void llama_mlock::init(void * ptr) { pimpl->init(ptr); }
590void llama_mlock::grow_to(size_t target_size) { pimpl->grow_to(target_size); }
591
592#if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32)
593const bool llama_mlock::SUPPORTED = true;
594#else
595const bool llama_mlock::SUPPORTED = false;
596#endif
597
// Returns the platform's PATH_MAX constant as a size_t.
size_t llama_path_max() {
    const size_t max_path_len = PATH_MAX;
    return max_path_len;
}
601