1 | /** @file mdb.c |
2 | * @brief Lightning memory-mapped database library |
3 | * |
4 | * A Btree-based database management library modeled loosely on the |
5 | * BerkeleyDB API, but much simplified. |
6 | */ |
7 | /* |
8 | * Copyright 2011-2019 Howard Chu, Symas Corp. |
9 | * All rights reserved. |
10 | * |
11 | * Redistribution and use in source and binary forms, with or without |
12 | * modification, are permitted only as authorized by the OpenLDAP |
13 | * Public License. |
14 | * |
15 | * A copy of this license is available in the file LICENSE in the |
16 | * top-level directory of the distribution or, alternatively, at |
17 | * <http://www.OpenLDAP.org/license.html>. |
18 | * |
19 | * This code is derived from btree.c written by Martin Hedenfalk. |
20 | * |
21 | * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se> |
22 | * |
23 | * Permission to use, copy, modify, and distribute this software for any |
24 | * purpose with or without fee is hereby granted, provided that the above |
25 | * copyright notice and this permission notice appear in all copies. |
26 | * |
27 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
28 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
29 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
30 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
31 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
32 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
33 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
34 | */ |
35 | #ifndef _GNU_SOURCE |
36 | #define _GNU_SOURCE 1 |
37 | #endif |
38 | #if defined(MDB_VL32) || defined(__WIN64__) |
39 | #define _FILE_OFFSET_BITS 64 |
40 | #endif |
41 | #ifdef _WIN32 |
42 | #include <malloc.h> |
43 | #include <windows.h> |
44 | #include <wchar.h> /* get wcscpy() */ |
45 | |
/* We use native NT APIs to set up the memory map, so that we can
47 | * let the DB file grow incrementally instead of always preallocating |
48 | * the full size. These APIs are defined in <wdm.h> and <ntifs.h> |
49 | * but those headers are meant for driver-level development and |
50 | * conflict with the regular user-level headers, so we explicitly |
51 | * declare them here. We get pointers to these functions from |
 * NTDLL.DLL at runtime, to avoid build-time dependencies on any
53 | * NTDLL import libraries. |
54 | */ |
55 | typedef NTSTATUS (WINAPI NtCreateSectionFunc) |
56 | (OUT PHANDLE sh, IN ACCESS_MASK acc, |
57 | IN void * oa OPTIONAL, |
58 | IN PLARGE_INTEGER ms OPTIONAL, |
59 | IN ULONG pp, IN ULONG aa, IN HANDLE fh OPTIONAL); |
60 | |
61 | static NtCreateSectionFunc *NtCreateSection; |
62 | |
63 | typedef enum _SECTION_INHERIT { |
64 | ViewShare = 1, |
65 | ViewUnmap = 2 |
66 | } SECTION_INHERIT; |
67 | |
68 | typedef NTSTATUS (WINAPI NtMapViewOfSectionFunc) |
69 | (IN PHANDLE sh, IN HANDLE ph, |
70 | IN OUT PVOID *addr, IN ULONG_PTR zbits, |
71 | IN SIZE_T cs, IN OUT PLARGE_INTEGER off OPTIONAL, |
72 | IN OUT PSIZE_T vs, IN SECTION_INHERIT ih, |
73 | IN ULONG at, IN ULONG pp); |
74 | |
75 | static NtMapViewOfSectionFunc *NtMapViewOfSection; |
76 | |
77 | typedef NTSTATUS (WINAPI NtCloseFunc)(HANDLE h); |
78 | |
79 | static NtCloseFunc *NtClose; |
80 | |
/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
 * as int64, which is wrong. MSVC doesn't define it at all, so just
 * don't use pid_t here.
 */
85 | #define MDB_PID_T int |
86 | #define MDB_THR_T DWORD |
87 | #include <sys/types.h> |
88 | #include <sys/stat.h> |
89 | #ifdef __GNUC__ |
90 | # include <sys/param.h> |
91 | #else |
92 | # define LITTLE_ENDIAN 1234 |
93 | # define BIG_ENDIAN 4321 |
94 | # define BYTE_ORDER LITTLE_ENDIAN |
95 | # ifndef SSIZE_MAX |
96 | # define SSIZE_MAX INT_MAX |
97 | # endif |
98 | #endif |
99 | #else |
100 | #include <sys/types.h> |
101 | #include <sys/stat.h> |
102 | #define MDB_PID_T pid_t |
103 | #define MDB_THR_T pthread_t |
104 | #include <sys/param.h> |
105 | #include <sys/uio.h> |
106 | #include <sys/mman.h> |
107 | #ifdef HAVE_SYS_FILE_H |
108 | #include <sys/file.h> |
109 | #endif |
110 | #include <fcntl.h> |
111 | #endif |
112 | |
113 | #if defined(__mips) && defined(__linux) |
/* MIPS has cache coherency issues and requires explicit cache control */
115 | #include <asm/cachectl.h> |
116 | extern int cacheflush(char *addr, int nbytes, int cache); |
117 | #define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache) |
118 | #else |
119 | #define CACHEFLUSH(addr, bytes, cache) |
120 | #endif |
121 | |
122 | #if defined(__linux) && !defined(MDB_FDATASYNC_WORKS) |
123 | /** fdatasync is broken on ext3/ext4fs on older kernels, see |
124 | * description in #mdb_env_open2 comments. You can safely |
125 | * define MDB_FDATASYNC_WORKS if this code will only be run |
126 | * on kernels 3.6 and newer. |
127 | */ |
128 | #define BROKEN_FDATASYNC |
129 | #endif |
130 | |
131 | #include <errno.h> |
132 | #include <limits.h> |
133 | #include <stddef.h> |
134 | #include <inttypes.h> |
135 | #include <stdio.h> |
136 | #include <stdlib.h> |
137 | #include <string.h> |
138 | #include <time.h> |
139 | |
140 | #ifdef _MSC_VER |
141 | #include <io.h> |
142 | typedef SSIZE_T ssize_t; |
143 | #else |
144 | #include <unistd.h> |
145 | #endif |
146 | |
147 | #if defined(__sun) || defined(__ANDROID__) |
/* Most platforms have posix_memalign, older ones may only have memalign */
149 | #define HAVE_MEMALIGN 1 |
150 | #include <malloc.h> |
151 | /* On Solaris, we need the POSIX sigwait function */ |
152 | #if defined (__sun) |
153 | # define _POSIX_PTHREAD_SEMANTICS 1 |
154 | #endif |
155 | #endif |
156 | |
157 | #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) |
158 | #include <netinet/in.h> |
159 | #include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */ |
160 | #endif |
161 | |
162 | #if defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) |
163 | # if !(defined(MDB_USE_POSIX_MUTEX) || defined(MDB_USE_POSIX_SEM)) |
164 | # define MDB_USE_SYSV_SEM 1 |
165 | # endif |
166 | # define MDB_FDATASYNC fsync |
167 | #elif defined(__ANDROID__) |
168 | # define MDB_FDATASYNC fsync |
169 | #endif |
170 | |
171 | #ifndef _WIN32 |
172 | #include <pthread.h> |
173 | #include <signal.h> |
174 | #ifdef MDB_USE_POSIX_SEM |
175 | # define MDB_USE_HASH 1 |
176 | #include <semaphore.h> |
177 | #elif defined(MDB_USE_SYSV_SEM) |
178 | #include <sys/ipc.h> |
179 | #include <sys/sem.h> |
180 | #ifdef _SEM_SEMUN_UNDEFINED |
181 | union semun { |
182 | int val; |
183 | struct semid_ds *buf; |
184 | unsigned short *array; |
185 | }; |
186 | #endif /* _SEM_SEMUN_UNDEFINED */ |
187 | #else |
188 | #define MDB_USE_POSIX_MUTEX 1 |
189 | #endif /* MDB_USE_POSIX_SEM */ |
190 | #endif /* !_WIN32 */ |
191 | |
192 | #if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) + defined(MDB_USE_SYSV_SEM) \ |
193 | + defined(MDB_USE_POSIX_MUTEX) != 1 |
194 | # error "Ambiguous shared-lock implementation" |
195 | #endif |
196 | |
197 | #ifdef USE_VALGRIND |
198 | #include <valgrind/memcheck.h> |
199 | #define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z) |
200 | #define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s) |
201 | #define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a) |
202 | #define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h) |
203 | #define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s) |
204 | #else |
205 | #define VGMEMP_CREATE(h,r,z) |
206 | #define VGMEMP_ALLOC(h,a,s) |
207 | #define VGMEMP_FREE(h,a) |
208 | #define VGMEMP_DESTROY(h) |
209 | #define VGMEMP_DEFINED(a,s) |
210 | #endif |
211 | |
212 | #ifndef BYTE_ORDER |
213 | # if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) |
214 | /* Solaris just defines one or the other */ |
215 | # define LITTLE_ENDIAN 1234 |
216 | # define BIG_ENDIAN 4321 |
217 | # ifdef _LITTLE_ENDIAN |
218 | # define BYTE_ORDER LITTLE_ENDIAN |
219 | # else |
220 | # define BYTE_ORDER BIG_ENDIAN |
221 | # endif |
222 | # else |
223 | # define BYTE_ORDER __BYTE_ORDER |
224 | # endif |
225 | #endif |
226 | |
227 | #ifndef LITTLE_ENDIAN |
228 | #define LITTLE_ENDIAN __LITTLE_ENDIAN |
229 | #endif |
230 | #ifndef BIG_ENDIAN |
231 | #define BIG_ENDIAN __BIG_ENDIAN |
232 | #endif |
233 | |
234 | #if defined(__i386) || defined(__x86_64) || defined(_M_IX86) |
235 | #define MISALIGNED_OK 1 |
236 | #endif |
237 | |
238 | #include "lmdb.h" |
239 | #include "midl.h" |
240 | |
241 | #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) |
242 | # error "Unknown or unsupported endianness (BYTE_ORDER)" |
243 | #elif (-6 & 5) || CHAR_BIT!=8 || UINT_MAX!=0xffffffff || MDB_SIZE_MAX%UINT_MAX |
244 | # error "Two's complement, reasonably sized integer types, please" |
245 | #endif |
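/* Note on the check above: (-6 & 5) is zero only with two's-complement
 * arithmetic; under ones' complement or sign-magnitude representations the
 * expression is nonzero, so the #error fires on such targets.
 */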
246 | |
247 | #ifdef __GNUC__ |
248 | /** Put infrequently used env functions in separate section */ |
249 | # ifdef __APPLE__ |
250 | # define ESECT __attribute__ ((section("__TEXT,text_env"))) |
251 | # else |
252 | # define ESECT __attribute__ ((section("text_env"))) |
253 | # endif |
254 | #else |
255 | #define ESECT |
256 | #endif |
257 | |
258 | #ifdef _WIN32 |
259 | #define CALL_CONV WINAPI |
260 | #else |
261 | #define CALL_CONV |
262 | #endif |
263 | |
264 | /** @defgroup internal LMDB Internals |
265 | * @{ |
266 | */ |
267 | /** @defgroup compat Compatibility Macros |
268 | * A bunch of macros to minimize the amount of platform-specific ifdefs |
269 | * needed throughout the rest of the code. When the features this library |
270 | * needs are similar enough to POSIX to be hidden in a one-or-two line |
271 | * replacement, this macro approach is used. |
272 | * @{ |
273 | */ |
274 | |
275 | /** Features under development */ |
276 | #ifndef MDB_DEVEL |
277 | #define MDB_DEVEL 0 |
278 | #endif |
279 | |
280 | /** Wrapper around __func__, which is a C99 feature */ |
281 | #if __STDC_VERSION__ >= 199901L |
282 | # define mdb_func_ __func__ |
283 | #elif __GNUC__ >= 2 || _MSC_VER >= 1300 |
284 | # define mdb_func_ __FUNCTION__ |
285 | #else |
286 | /* If a debug message says <mdb_unknown>(), update the #if statements above */ |
287 | # define mdb_func_ "<mdb_unknown>" |
288 | #endif |
289 | |
290 | /* Internal error codes, not exposed outside liblmdb */ |
291 | #define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) |
292 | #ifdef _WIN32 |
293 | #define MDB_OWNERDEAD ((int) WAIT_ABANDONED) |
294 | #elif defined MDB_USE_SYSV_SEM |
295 | #define MDB_OWNERDEAD (MDB_LAST_ERRCODE + 11) |
296 | #elif defined(MDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD) |
297 | #define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */ |
298 | #endif |
299 | |
300 | #ifdef __GLIBC__ |
#define GLIBC_VER ((__GLIBC__ << 16) | __GLIBC_MINOR__)
302 | #endif |
/** Some platforms define the EOWNERDEAD error code even though they
 * don't support Robust Mutexes. On such platforms, compile with
 * -DMDB_USE_ROBUST=0, or use some other mechanism such as
 * -DMDB_USE_SYSV_SEM instead of -DMDB_USE_POSIX_MUTEX.
 * (SysV semaphores are also Robust, but some systems don't
 * support them either.)
 */
311 | #ifndef MDB_USE_ROBUST |
312 | /* Android currently lacks Robust Mutex support. So does glibc < 2.4. */ |
313 | # if defined(MDB_USE_POSIX_MUTEX) && (defined(__ANDROID__) || \ |
314 | (defined(__GLIBC__) && GLIBC_VER < 0x020004)) |
315 | # define MDB_USE_ROBUST 0 |
316 | # else |
317 | # define MDB_USE_ROBUST 1 |
318 | # endif |
319 | #endif /* !MDB_USE_ROBUST */ |
320 | |
321 | #if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST) |
322 | /* glibc < 2.12 only provided _np API */ |
323 | # if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \ |
324 | (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST)) |
325 | # define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP |
326 | # define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag) |
327 | # define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) |
328 | # endif |
329 | #endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */ |
330 | |
331 | #if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST) |
332 | #define MDB_ROBUST_SUPPORTED 1 |
333 | #endif |
334 | |
335 | #ifdef _WIN32 |
336 | #define MDB_USE_HASH 1 |
337 | #define MDB_PIDLOCK 0 |
338 | #define THREAD_RET DWORD |
339 | #define pthread_t HANDLE |
340 | #define pthread_mutex_t HANDLE |
341 | #define pthread_cond_t HANDLE |
342 | typedef HANDLE mdb_mutex_t, mdb_mutexref_t; |
343 | #define pthread_key_t DWORD |
344 | #define pthread_self() GetCurrentThreadId() |
345 | #define pthread_key_create(x,y) \ |
346 | ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0) |
347 | #define pthread_key_delete(x) TlsFree(x) |
348 | #define pthread_getspecific(x) TlsGetValue(x) |
349 | #define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode()) |
350 | #define pthread_mutex_unlock(x) ReleaseMutex(*x) |
351 | #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) |
352 | #define pthread_cond_signal(x) SetEvent(*x) |
353 | #define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) |
354 | #define THREAD_CREATE(thr,start,arg) \ |
355 | (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode()) |
356 | #define THREAD_FINISH(thr) \ |
357 | (WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0) |
358 | #define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE) |
359 | #define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex) |
360 | #define mdb_mutex_consistent(mutex) 0 |
361 | #define getpid() GetCurrentProcessId() |
362 | #define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd)) |
363 | #define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) |
364 | #define ErrCode() GetLastError() |
365 | #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} |
366 | #define close(fd) (CloseHandle(fd) ? 0 : -1) |
367 | #define munmap(ptr,len) UnmapViewOfFile(ptr) |
368 | #ifdef PROCESS_QUERY_LIMITED_INFORMATION |
369 | #define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION |
370 | #else |
371 | #define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 |
372 | #endif |
373 | #else |
374 | #define THREAD_RET void * |
375 | #define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg) |
376 | #define THREAD_FINISH(thr) pthread_join(thr,NULL) |
377 | |
378 | /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ |
379 | #define MDB_PIDLOCK 1 |
380 | |
381 | #ifdef MDB_USE_POSIX_SEM |
382 | |
383 | typedef sem_t *mdb_mutex_t, *mdb_mutexref_t; |
384 | #define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex) |
385 | #define UNLOCK_MUTEX(mutex) sem_post(mutex) |
386 | |
387 | static int |
388 | mdb_sem_wait(sem_t *sem) |
389 | { |
390 | int rc; |
391 | while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ; |
392 | return rc; |
393 | } |
394 | |
395 | #elif defined MDB_USE_SYSV_SEM |
396 | |
397 | typedef struct mdb_mutex { |
398 | int semid; |
399 | int semnum; |
400 | int *locked; |
401 | } mdb_mutex_t[1], *mdb_mutexref_t; |
402 | |
403 | #define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex) |
404 | #define UNLOCK_MUTEX(mutex) do { \ |
405 | struct sembuf sb = { 0, 1, SEM_UNDO }; \ |
406 | sb.sem_num = (mutex)->semnum; \ |
407 | *(mutex)->locked = 0; \ |
408 | semop((mutex)->semid, &sb, 1); \ |
409 | } while(0) |
410 | |
411 | static int |
412 | mdb_sem_wait(mdb_mutexref_t sem) |
413 | { |
414 | int rc, *locked = sem->locked; |
415 | struct sembuf sb = { 0, -1, SEM_UNDO }; |
416 | sb.sem_num = sem->semnum; |
417 | do { |
418 | if (!semop(sem->semid, &sb, 1)) { |
419 | rc = *locked ? MDB_OWNERDEAD : MDB_SUCCESS; |
420 | *locked = 1; |
421 | break; |
422 | } |
423 | } while ((rc = errno) == EINTR); |
424 | return rc; |
425 | } |
426 | |
427 | #define mdb_mutex_consistent(mutex) 0 |
428 | |
429 | #else /* MDB_USE_POSIX_MUTEX: */ |
/** Shared mutex/semaphore as it is stored in shared memory (the original
 * object, not a copy).
 *
 * It is not meant to be copied. Instead it can be assigned to an
 * #mdb_mutexref_t. When mdb_mutexref_t is a pointer and mdb_mutex_t is
 * not, mdb_mutex_t is an array[1] so that it can decay to the pointer.
 */
436 | typedef pthread_mutex_t mdb_mutex_t[1]; |
437 | /** Reference to an #mdb_mutex_t */ |
438 | typedef pthread_mutex_t *mdb_mutexref_t; |
439 | /** Lock the reader or writer mutex. |
440 | * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX(). |
441 | */ |
442 | #define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex) |
443 | /** Unlock the reader or writer mutex. |
444 | */ |
445 | #define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex) |
446 | /** Mark mutex-protected data as repaired, after death of previous owner. |
447 | */ |
448 | #define mdb_mutex_consistent(mutex) pthread_mutex_consistent(mutex) |
449 | #endif /* MDB_USE_POSIX_SEM || MDB_USE_SYSV_SEM */ |
450 | |
451 | /** Get the error code for the last failed system function. |
452 | */ |
453 | #define ErrCode() errno |
454 | |
455 | /** An abstraction for a file handle. |
456 | * On POSIX systems file handles are small integers. On Windows |
457 | * they're opaque pointers. |
458 | */ |
459 | #define HANDLE int |
460 | |
461 | /** A value for an invalid file handle. |
462 | * Mainly used to initialize file variables and signify that they are |
463 | * unused. |
464 | */ |
465 | #define INVALID_HANDLE_VALUE (-1) |
466 | |
467 | /** Get the size of a memory page for the system. |
468 | * This is the basic size that the platform's memory manager uses, and is |
469 | * fundamental to the use of memory-mapped files. |
470 | */ |
471 | #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) |
472 | #endif |
473 | |
474 | #define Z MDB_FMT_Z /**< printf/scanf format modifier for size_t */ |
475 | #define Yu MDB_PRIy(u) /**< printf format for #mdb_size_t */ |
476 | #define Yd MDB_PRIy(d) /**< printf format for 'signed #mdb_size_t' */ |
477 | |
478 | #ifdef MDB_USE_SYSV_SEM |
479 | #define MNAME_LEN (sizeof(int)) |
480 | #else |
481 | #define MNAME_LEN (sizeof(pthread_mutex_t)) |
482 | #endif |
483 | |
484 | /** Initial part of #MDB_env.me_mutexname[]. |
485 | * Changes to this code must be reflected in #MDB_LOCK_FORMAT. |
486 | */ |
487 | #ifdef _WIN32 |
488 | #define MUTEXNAME_PREFIX "Global\\MDB" |
489 | #elif defined MDB_USE_POSIX_SEM |
490 | #define MUTEXNAME_PREFIX "/MDB" |
491 | #endif |
492 | |
493 | /** @} */ |
494 | |
495 | #ifdef MDB_ROBUST_SUPPORTED |
496 | /** Lock mutex, handle any error, set rc = result. |
497 | * Return 0 on success, nonzero (not rc) on error. |
498 | */ |
499 | #define LOCK_MUTEX(rc, env, mutex) \ |
500 | (((rc) = LOCK_MUTEX0(mutex)) && \ |
501 | ((rc) = mdb_mutex_failed(env, mutex, rc))) |
502 | static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc); |
503 | #else |
504 | #define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex)) |
505 | #define mdb_mutex_failed(env, mutex, rc) (rc) |
506 | #endif |
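/* Illustrative call pattern (a sketch, not copied from the call sites;
 * env->me_wmutex is assumed here to be the writer mutexref held in MDB_env):
 *
 *	int rc;
 *	if (LOCK_MUTEX(rc, env, env->me_wmutex))
 *		return rc;
 *	... critical section ...
 *	UNLOCK_MUTEX(env->me_wmutex);
 *
 * With #MDB_ROBUST_SUPPORTED, a dead previous owner makes LOCK_MUTEX0()
 * return #MDB_OWNERDEAD, and mdb_mutex_failed() decides whether the lock
 * can be repaired or the error must be passed up to the caller.
 */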
507 | |
508 | #ifndef _WIN32 |
509 | /** A flag for opening a file and requesting synchronous data writes. |
510 | * This is only used when writing a meta page. It's not strictly needed; |
511 | * we could just do a normal write and then immediately perform a flush. |
512 | * But if this flag is available it saves us an extra system call. |
513 | * |
514 | * @note If O_DSYNC is undefined but exists in /usr/include, |
515 | * preferably set some compiler flag to get the definition. |
516 | */ |
517 | #ifndef MDB_DSYNC |
518 | # ifdef O_DSYNC |
519 | # define MDB_DSYNC O_DSYNC |
520 | # else |
521 | # define MDB_DSYNC O_SYNC |
522 | # endif |
523 | #endif |
524 | #endif |
525 | |
526 | /** Function for flushing the data of a file. Define this to fsync |
527 | * if fdatasync() is not supported. |
528 | */ |
529 | #ifndef MDB_FDATASYNC |
530 | # define MDB_FDATASYNC fdatasync |
531 | #endif |
532 | |
533 | #ifndef MDB_MSYNC |
534 | # define MDB_MSYNC(addr,len,flags) msync(addr,len,flags) |
535 | #endif |
536 | |
537 | #ifndef MS_SYNC |
538 | #define MS_SYNC 1 |
539 | #endif |
540 | |
541 | #ifndef MS_ASYNC |
542 | #define MS_ASYNC 0 |
543 | #endif |
544 | |
545 | /** A page number in the database. |
546 | * Note that 64 bit page numbers are overkill, since pages themselves |
547 | * already represent 12-13 bits of addressable memory, and the OS will |
548 | * always limit applications to a maximum of 63 bits of address space. |
549 | * |
550 | * @note In the #MDB_node structure, we only store 48 bits of this value, |
551 | * which thus limits us to only 60 bits of addressable data. |
552 | */ |
553 | typedef MDB_ID pgno_t; |
554 | |
555 | /** A transaction ID. |
556 | * See struct MDB_txn.mt_txnid for details. |
557 | */ |
558 | typedef MDB_ID txnid_t; |
559 | |
560 | /** @defgroup debug Debug Macros |
561 | * @{ |
562 | */ |
563 | #ifndef MDB_DEBUG |
564 | /** Enable debug output. Needs variable argument macros (a C99 feature). |
565 | * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs |
566 | * read from and written to the database (used for free space management). |
567 | */ |
568 | #define MDB_DEBUG 0 |
569 | #endif |
570 | |
571 | #if MDB_DEBUG |
572 | static int mdb_debug; |
573 | static txnid_t mdb_debug_start; |
574 | |
575 | /** Print a debug message with printf formatting. |
576 | * Requires double parenthesis around 2 or more args. |
577 | */ |
578 | # define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args)) |
579 | # define DPRINTF0(fmt, ...) \ |
580 | fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__) |
581 | #else |
582 | # define DPRINTF(args) ((void) 0) |
583 | #endif |
584 | /** Print a debug string. |
585 | * The string is printed literally, with no format processing. |
586 | */ |
587 | #define DPUTS(arg) DPRINTF(("%s", arg)) |
/** Debugging output value of a cursor DBI: Negative in a sub-cursor. */
589 | #define DDBI(mc) \ |
590 | (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) |
591 | /** @} */ |
592 | |
593 | /** @brief The maximum size of a database page. |
594 | * |
595 | * It is 32k or 64k, since value-PAGEBASE must fit in |
596 | * #MDB_page.%mp_upper. |
597 | * |
598 | * LMDB will use database pages < OS pages if needed. |
599 | * That causes more I/O in write transactions: The OS must |
600 | * know (read) the whole page before writing a partial page. |
601 | * |
602 | * Note that we don't currently support Huge pages. On Linux, |
603 | * regular data files cannot use Huge pages, and in general |
604 | * Huge pages aren't actually pageable. We rely on the OS |
605 | * demand-pager to read our data and page it out when memory |
606 | * pressure from other processes is high. So until OSs have |
607 | * actual paging support for Huge pages, they're not viable. |
608 | */ |
609 | #define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) |
610 | |
611 | /** The minimum number of keys required in a database page. |
612 | * Setting this to a larger value will place a smaller bound on the |
613 | * maximum size of a data item. Data items larger than this size will |
614 | * be pushed into overflow pages instead of being stored directly in |
615 | * the B-tree node. This value used to default to 4. With a page size |
616 | * of 4096 bytes that meant that any item larger than 1024 bytes would |
617 | * go into an overflow page. That also meant that on average 2-3KB of |
618 | * each overflow page was wasted space. The value cannot be lower than |
619 | * 2 because then there would no longer be a tree structure. With this |
620 | * value, items larger than 2KB will go into overflow pages, and on |
621 | * average only 1KB will be wasted. |
622 | */ |
623 | #define MDB_MINKEYS 2 |
624 | |
625 | /** A stamp that identifies a file as an LMDB file. |
626 | * There's nothing special about this value other than that it is easily |
627 | * recognizable, and it will reflect any byte order mismatches. |
628 | */ |
629 | #define MDB_MAGIC 0xBEEFC0DE |
630 | |
631 | /** The version number for a database's datafile format. */ |
632 | #define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) |
633 | /** The version number for a database's lockfile format. */ |
634 | #define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 2) |
635 | /** Number of bits representing #MDB_LOCK_VERSION in #MDB_LOCK_FORMAT. |
636 | * The remaining bits must leave room for #MDB_lock_desc. |
637 | */ |
638 | #define MDB_LOCK_VERSION_BITS 12 |
639 | |
640 | /** @brief The max size of a key we can write, or 0 for computed max. |
641 | * |
642 | * This macro should normally be left alone or set to 0. |
643 | * Note that a database with big keys or dupsort data cannot be |
644 | * reliably modified by a liblmdb which uses a smaller max. |
645 | * The default is 511 for backwards compat, or 0 when #MDB_DEVEL. |
646 | * |
647 | * Other values are allowed, for backwards compat. However: |
648 | * A value bigger than the computed max can break if you do not |
649 | * know what you are doing, and liblmdb <= 0.9.10 can break when |
650 | * modifying a DB with keys/dupsort data bigger than its max. |
651 | * |
652 | * Data items in an #MDB_DUPSORT database are also limited to |
653 | * this size, since they're actually keys of a sub-DB. Keys and |
654 | * #MDB_DUPSORT data items must fit on a node in a regular page. |
655 | */ |
656 | #ifndef MDB_MAXKEYSIZE |
657 | #define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511) |
658 | #endif |
659 | |
660 | /** The maximum size of a key we can write to the environment. */ |
661 | #if MDB_MAXKEYSIZE |
662 | #define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) |
663 | #else |
664 | #define ENV_MAXKEY(env) ((env)->me_maxkey) |
665 | #endif |
666 | |
667 | /** @brief The maximum size of a data item. |
668 | * |
669 | * We only store a 32 bit value for node sizes. |
670 | */ |
671 | #define MAXDATASIZE 0xffffffffUL |
672 | |
673 | #if MDB_DEBUG |
674 | /** Key size which fits in a #DKBUF. |
675 | * @ingroup debug |
676 | */ |
677 | #define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) |
678 | /** A key buffer. |
679 | * @ingroup debug |
680 | * This is used for printing a hex dump of a key's contents. |
681 | */ |
682 | #define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1] |
683 | /** Display a key in hex. |
684 | * @ingroup debug |
685 | * Invoke a function to display a key in hex. |
686 | */ |
687 | #define DKEY(x) mdb_dkey(x, kbuf) |
688 | #else |
689 | #define DKBUF |
690 | #define DKEY(x) 0 |
691 | #endif |
692 | |
693 | /** An invalid page number. |
694 | * Mainly used to denote an empty tree. |
695 | */ |
696 | #define P_INVALID (~(pgno_t)0) |
697 | |
698 | /** Test if the flags \b f are set in a flag word \b w. */ |
699 | #define F_ISSET(w, f) (((w) & (f)) == (f)) |
700 | |
701 | /** Round \b n up to an even number. */ |
702 | #define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ |
703 | |
704 | /** Least significant 1-bit of \b n. n must be of an unsigned type. */ |
705 | #define LOW_BIT(n) ((n) & (-(n))) |
706 | |
707 | /** (log2(\b p2) % \b n), for p2 = power of 2 and 0 < n < 8. */ |
708 | #define LOG2_MOD(p2, n) (7 - 86 / ((p2) % ((1U<<(n))-1) + 11)) |
709 | /* Explanation: Let p2 = 2**(n*y + x), x<n and M = (1U<<n)-1. Now p2 = |
710 | * (M+1)**y * 2**x = 2**x (mod M). Finally "/" "happens" to return 7-x. |
711 | */ |
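/* Worked example as a sanity check: LOG2_MOD(64, 5) evaluates as
 * 64 % 31 = 2, 86 / (2 + 11) = 6, 7 - 6 = 1, which is log2(64) % 5.
 */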
712 | |
713 | /** Should be alignment of \b type. Ensure it is a power of 2. */ |
714 | #define ALIGNOF2(type) \ |
715 | LOW_BIT(offsetof(struct { char ch_; type align_; }, align_)) |
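/* For example, on a typical LP64 ABI where a uint64_t placed after a char
 * is padded to an 8-byte boundary, ALIGNOF2(uint64_t) evaluates to 8.
 */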
716 | |
717 | /** Used for offsets within a single page. |
 * Since memory pages are typically 4 or 8KB in size (12-13 bits of
 * offset), 16 bits is plenty.
720 | */ |
721 | typedef uint16_t indx_t; |
722 | |
723 | typedef unsigned long long mdb_hash_t; |
724 | |
725 | /** Default size of memory map. |
726 | * This is certainly too small for any actual applications. Apps should always set |
727 | * the size explicitly using #mdb_env_set_mapsize(). |
728 | */ |
729 | #define DEFAULT_MAPSIZE 1048576 |
730 | |
731 | /** @defgroup readers Reader Lock Table |
732 | * Readers don't acquire any locks for their data access. Instead, they |
733 | * simply record their transaction ID in the reader table. The reader |
734 | * mutex is needed just to find an empty slot in the reader table. The |
735 | * slot's address is saved in thread-specific data so that subsequent read |
736 | * transactions started by the same thread need no further locking to proceed. |
737 | * |
738 | * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data. |
739 | * |
740 | * No reader table is used if the database is on a read-only filesystem, or |
741 | * if #MDB_NOLOCK is set. |
742 | * |
743 | * Since the database uses multi-version concurrency control, readers don't |
744 | * actually need any locking. This table is used to keep track of which |
745 | * readers are using data from which old transactions, so that we'll know |
746 | * when a particular old transaction is no longer in use. Old transactions |
747 | * that have discarded any data pages can then have those pages reclaimed |
748 | * for use by a later write transaction. |
749 | * |
750 | * The lock table is constructed such that reader slots are aligned with the |
751 | * processor's cache line size. Any slot is only ever used by one thread. |
752 | * This alignment guarantees that there will be no contention or cache |
753 | * thrashing as threads update their own slot info, and also eliminates |
754 | * any need for locking when accessing a slot. |
755 | * |
756 | * A writer thread will scan every slot in the table to determine the oldest |
757 | * outstanding reader transaction. Any freed pages older than this will be |
758 | * reclaimed by the writer. The writer doesn't use any locks when scanning |
759 | * this table. This means that there's no guarantee that the writer will |
760 | * see the most up-to-date reader info, but that's not required for correct |
761 | * operation - all we need is to know the upper bound on the oldest reader, |
762 | * we don't care at all about the newest reader. So the only consequence of |
763 | * reading stale information here is that old pages might hang around a |
764 | * while longer before being reclaimed. That's actually good anyway, because |
765 | * the longer we delay reclaiming old pages, the more likely it is that a |
766 | * string of contiguous pages can be found after coalescing old pages from |
767 | * many old transactions together. |
768 | * @{ |
769 | */ |
770 | /** Number of slots in the reader table. |
771 | * This value was chosen somewhat arbitrarily. 126 readers plus a |
772 | * couple mutexes fit exactly into 8KB on my development machine. |
773 | * Applications should set the table size using #mdb_env_set_maxreaders(). |
774 | */ |
775 | #define DEFAULT_READERS 126 |
776 | |
777 | /** The size of a CPU cache line in bytes. We want our lock structures |
778 | * aligned to this size to avoid false cache line sharing in the |
779 | * lock table. |
780 | * This value works for most CPUs. For Itanium this should be 128. |
781 | */ |
782 | #ifndef CACHELINE |
783 | #define CACHELINE 64 |
784 | #endif |
785 | |
786 | /** The information we store in a single slot of the reader table. |
787 | * In addition to a transaction ID, we also record the process and |
788 | * thread ID that owns a slot, so that we can detect stale information, |
789 | * e.g. threads or processes that went away without cleaning up. |
790 | * @note We currently don't check for stale records. We simply re-init |
791 | * the table when we know that we're the only process opening the |
792 | * lock file. |
793 | */ |
794 | typedef struct MDB_rxbody { |
795 | /** Current Transaction ID when this transaction began, or (txnid_t)-1. |
796 | * Multiple readers that start at the same time will probably have the |
797 | * same ID here. Again, it's not important to exclude them from |
798 | * anything; all we need to know is which version of the DB they |
799 | * started from so we can avoid overwriting any data used in that |
800 | * particular version. |
801 | */ |
802 | volatile txnid_t mrb_txnid; |
803 | /** The process ID of the process owning this reader txn. */ |
804 | volatile MDB_PID_T mrb_pid; |
805 | /** The thread ID of the thread owning this txn. */ |
806 | volatile MDB_THR_T mrb_tid; |
807 | } MDB_rxbody; |
808 | |
809 | /** The actual reader record, with cacheline padding. */ |
810 | typedef struct MDB_reader { |
811 | union { |
812 | MDB_rxbody mrx; |
813 | /** shorthand for mrb_txnid */ |
814 | #define mr_txnid mru.mrx.mrb_txnid |
815 | #define mr_pid mru.mrx.mrb_pid |
816 | #define mr_tid mru.mrx.mrb_tid |
817 | /** cache line alignment */ |
818 | char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)]; |
819 | } mru; |
820 | } MDB_reader; |
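/* The pad[] expression rounds sizeof(MDB_rxbody) up to a multiple of
 * CACHELINE; e.g. with the default CACHELINE of 64 and an MDB_rxbody of
 * 16-24 bytes (the exact size varies by platform), each MDB_reader
 * occupies exactly one 64-byte cache line.
 */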
821 | |
822 | /** The header for the reader table. |
823 | * The table resides in a memory-mapped file. (This is a different file |
824 | * than is used for the main database.) |
825 | * |
826 | * For POSIX the actual mutexes reside in the shared memory of this |
827 | * mapped file. On Windows, mutexes are named objects allocated by the |
828 | * kernel; we store the mutex names in this mapped file so that other |
829 | * processes can grab them. This same approach is also used on |
830 | * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support |
831 | * process-shared POSIX mutexes. For these cases where a named object |
832 | * is used, the object name is derived from a 64 bit FNV hash of the |
833 | * environment pathname. As such, naming collisions are extremely |
834 | * unlikely. If a collision occurs, the results are unpredictable. |
835 | */ |
836 | typedef struct MDB_txbody { |
837 | /** Stamp identifying this as an LMDB file. It must be set |
838 | * to #MDB_MAGIC. */ |
839 | uint32_t mtb_magic; |
840 | /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ |
841 | uint32_t mtb_format; |
842 | /** The ID of the last transaction committed to the database. |
843 | * This is recorded here only for convenience; the value can always |
844 | * be determined by reading the main database meta pages. |
845 | */ |
846 | volatile txnid_t mtb_txnid; |
847 | /** The number of slots that have been used in the reader table. |
 * This always records the maximum count; it is not decremented
 * when readers release their slots.
850 | */ |
851 | volatile unsigned mtb_numreaders; |
852 | #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) |
853 | /** Binary form of names of the reader/writer locks */ |
854 | mdb_hash_t mtb_mutexid; |
855 | #elif defined(MDB_USE_SYSV_SEM) |
856 | int mtb_semid; |
857 | int mtb_rlocked; |
858 | #else |
859 | /** Mutex protecting access to this table. |
860 | * This is the reader table lock used with LOCK_MUTEX(). |
861 | */ |
862 | mdb_mutex_t mtb_rmutex; |
863 | #endif |
864 | } MDB_txbody; |
865 | |
866 | /** The actual reader table definition. */ |
867 | typedef struct MDB_txninfo { |
868 | union { |
869 | MDB_txbody mtb; |
870 | #define mti_magic mt1.mtb.mtb_magic |
871 | #define mti_format mt1.mtb.mtb_format |
872 | #define mti_rmutex mt1.mtb.mtb_rmutex |
873 | #define mti_txnid mt1.mtb.mtb_txnid |
874 | #define mti_numreaders mt1.mtb.mtb_numreaders |
875 | #define mti_mutexid mt1.mtb.mtb_mutexid |
876 | #ifdef MDB_USE_SYSV_SEM |
877 | #define mti_semid mt1.mtb.mtb_semid |
878 | #define mti_rlocked mt1.mtb.mtb_rlocked |
879 | #endif |
880 | char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; |
881 | } mt1; |
882 | #if !(defined(_WIN32) || defined(MDB_USE_POSIX_SEM)) |
883 | union { |
884 | #ifdef MDB_USE_SYSV_SEM |
885 | int mt2_wlocked; |
886 | #define mti_wlocked mt2.mt2_wlocked |
887 | #else |
888 | mdb_mutex_t mt2_wmutex; |
889 | #define mti_wmutex mt2.mt2_wmutex |
890 | #endif |
891 | char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)]; |
892 | } mt2; |
893 | #endif |
894 | MDB_reader mti_readers[1]; |
895 | } MDB_txninfo; |
896 | |
897 | /** Lockfile format signature: version, features and field layout */ |
898 | #define MDB_LOCK_FORMAT \ |
899 | ((uint32_t) \ |
900 | (((MDB_LOCK_VERSION) % (1U << MDB_LOCK_VERSION_BITS)) \ |
901 | + MDB_lock_desc * (1U << MDB_LOCK_VERSION_BITS))) |
902 | |
903 | /** Lock type and layout. Values 0-119. _WIN32 implies #MDB_PIDLOCK. |
904 | * Some low values are reserved for future tweaks. |
905 | */ |
906 | #ifdef _WIN32 |
907 | # define MDB_LOCK_TYPE (0 + ALIGNOF2(mdb_hash_t)/8 % 2) |
908 | #elif defined MDB_USE_POSIX_SEM |
909 | # define MDB_LOCK_TYPE (4 + ALIGNOF2(mdb_hash_t)/8 % 2) |
910 | #elif defined MDB_USE_SYSV_SEM |
911 | # define MDB_LOCK_TYPE (8) |
912 | #elif defined MDB_USE_POSIX_MUTEX |
/* We do not know the insides of a POSIX mutex, nor how to check whether
 * mutexes used by two executables are compatible. Just check alignment and size.
915 | */ |
916 | # define MDB_LOCK_TYPE (10 + \ |
917 | LOG2_MOD(ALIGNOF2(pthread_mutex_t), 5) + \ |
918 | sizeof(pthread_mutex_t) / 4U % 22 * 5) |
919 | #endif |
920 | |
921 | enum { |
922 | /** Magic number for lockfile layout and features. |
923 | * |
924 | * This *attempts* to stop liblmdb variants compiled with conflicting |
925 | * options from using the lockfile at the same time and thus breaking |
926 | * it. It describes locking types, and sizes and sometimes alignment |
927 | * of the various lockfile items. |
928 | * |
929 | * The detected ranges are mostly guesswork, or based simply on how |
930 | * big they could be without using more bits. So we can tweak them |
931 | * in good conscience when updating #MDB_LOCK_VERSION. |
932 | */ |
933 | MDB_lock_desc = |
934 | /* Default CACHELINE=64 vs. other values (have seen mention of 32-256) */ |
935 | (CACHELINE==64 ? 0 : 1 + LOG2_MOD(CACHELINE >> (CACHELINE>64), 5)) |
936 | + 6 * (sizeof(MDB_PID_T)/4 % 3) /* legacy(2) to word(4/8)? */ |
937 | + 18 * (sizeof(pthread_t)/4 % 5) /* can be struct{id, active data} */ |
938 | + 90 * (sizeof(MDB_txbody) / CACHELINE % 3) |
939 | + 270 * (MDB_LOCK_TYPE % 120) |
940 | /* The above is < 270*120 < 2**15 */ |
941 | + ((sizeof(txnid_t) == 8) << 15) /* 32bit/64bit */ |
942 | + ((sizeof(MDB_reader) > CACHELINE) << 16) |
943 | /* Not really needed - implied by MDB_LOCK_TYPE != (_WIN32 locking) */ |
944 | + (((MDB_PIDLOCK) != 0) << 17) |
945 | /* 18 bits total: Must be <= (32 - MDB_LOCK_VERSION_BITS). */ |
946 | }; |
947 | /** @} */ |
948 | |
949 | /** Common header for all page types. The page type depends on #mp_flags. |
950 | * |
951 | * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with |
952 | * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages |
953 | * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. |
954 | * |
955 | * #P_OVERFLOW records occupy one or more contiguous pages where only the |
956 | * first has a page header. They hold the real data of #F_BIGDATA nodes. |
957 | * |
958 | * #P_SUBP sub-pages are small leaf "pages" with duplicate data. |
959 | * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. |
960 | * (Duplicate data can also go in sub-databases, which use normal pages.) |
961 | * |
962 | * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. |
963 | * |
964 | * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once |
965 | * in the snapshot: Either used by a database or listed in a freeDB record. |
966 | */ |
967 | typedef struct MDB_page { |
968 | #define mp_pgno mp_p.p_pgno |
969 | #define mp_next mp_p.p_next |
970 | union { |
971 | pgno_t p_pgno; /**< page number */ |
972 | struct MDB_page *p_next; /**< for in-memory list of freed pages */ |
973 | } mp_p; |
974 | uint16_t mp_pad; /**< key size if this is a LEAF2 page */ |
975 | /** @defgroup mdb_page Page Flags |
976 | * @ingroup internal |
977 | * Flags for the page headers. |
978 | * @{ |
979 | */ |
980 | #define P_BRANCH 0x01 /**< branch page */ |
981 | #define P_LEAF 0x02 /**< leaf page */ |
982 | #define P_OVERFLOW 0x04 /**< overflow page */ |
983 | #define P_META 0x08 /**< meta page */ |
984 | #define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ |
985 | #define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ |
986 | #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ |
987 | #define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ |
988 | #define P_KEEP 0x8000 /**< leave this page alone during spill */ |
989 | /** @} */ |
990 | uint16_t mp_flags; /**< @ref mdb_page */ |
991 | #define mp_lower mp_pb.pb.pb_lower |
992 | #define mp_upper mp_pb.pb.pb_upper |
993 | #define mp_pages mp_pb.pb_pages |
994 | union { |
995 | struct { |
996 | indx_t pb_lower; /**< lower bound of free space */ |
997 | indx_t pb_upper; /**< upper bound of free space */ |
998 | } pb; |
999 | uint32_t pb_pages; /**< number of overflow pages */ |
1000 | } mp_pb; |
1001 | indx_t mp_ptrs[1]; /**< dynamic size */ |
1002 | } MDB_page; |
1003 | |
1004 | /** Size of the page header, excluding dynamic data at the end */ |
1005 | #define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs)) |
1006 | |
1007 | /** Address of first usable data byte in a page, after the header */ |
1008 | #define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) |
1009 | |
1010 | /** ITS#7713, change PAGEBASE to handle 65536 byte pages */ |
1011 | #define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0) |
1012 | |
1013 | /** Number of nodes on a page */ |
1014 | #define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1) |
1015 | |
1016 | /** The amount of space remaining in the page */ |
1017 | #define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) |
1018 | |
1019 | /** The percentage of space used in the page, in tenths of a percent. */ |
1020 | #define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ |
1021 | ((env)->me_psize - PAGEHDRSZ)) |
1022 | /** The minimum page fill factor, in tenths of a percent. |
1023 | * Pages emptier than this are candidates for merging. |
1024 | */ |
1025 | #define FILL_THRESHOLD 250 |
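/* Worked example, assuming a 4096-byte page and a 16-byte page header:
 * a page with SIZELEFT(p) == 3060 gives PAGEFILL == 1000*1020/4080 == 250,
 * i.e. right at the FILL_THRESHOLD defined above; anything emptier is a
 * merge candidate.
 */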
1026 | |
1027 | /** Test if a page is a leaf page */ |
1028 | #define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) |
1029 | /** Test if a page is a LEAF2 page */ |
1030 | #define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) |
1031 | /** Test if a page is a branch page */ |
1032 | #define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) |
1033 | /** Test if a page is an overflow page */ |
1034 | #define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) |
1035 | /** Test if a page is a sub page */ |
1036 | #define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) |
1037 | |
1038 | /** The number of overflow pages needed to store the given size. */ |
1039 | #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) |
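/* For instance, assuming PAGEHDRSZ is 16 and a 4096-byte page size, a
 * 5000-byte item needs OVPAGES(5000, 4096) == (15 + 5000)/4096 + 1 == 2
 * overflow pages: 4080 data bytes follow the header in the first page and
 * the remaining 920 go in the second.
 */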
1040 | |
1041 | /** Link in #MDB_txn.%mt_loose_pgs list. |
 * Kept outside the page header, since the header itself is needed when the page is reused.
1043 | */ |
1044 | #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) |
1045 | |
1046 | /** Header for a single key/data pair within a page. |
1047 | * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. |
1048 | * We guarantee 2-byte alignment for 'MDB_node's. |
1049 | * |
1050 | * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child |
1051 | * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used |
1052 | * for pgno. (Branch nodes have no flags). Lo and hi are in host byte |
1053 | * order in case some accesses can be optimized to 32-bit word access. |
1054 | * |
1055 | * Leaf node flags describe node contents. #F_BIGDATA says the node's |
1056 | * data part is the page number of an overflow page with actual data. |
1057 | * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in |
1058 | * a sub-page/sub-database, and named databases (just #F_SUBDATA). |
1059 | */ |
1060 | typedef struct MDB_node { |
1061 | /** part of data size or pgno |
1062 | * @{ */ |
1063 | #if BYTE_ORDER == LITTLE_ENDIAN |
1064 | unsigned short mn_lo, mn_hi; |
1065 | #else |
1066 | unsigned short mn_hi, mn_lo; |
1067 | #endif |
1068 | /** @} */ |
1069 | /** @defgroup mdb_node Node Flags |
1070 | * @ingroup internal |
1071 | * Flags for node headers. |
1072 | * @{ |
1073 | */ |
1074 | #define F_BIGDATA 0x01 /**< data put on overflow page */ |
1075 | #define F_SUBDATA 0x02 /**< data is a sub-database */ |
1076 | #define F_DUPDATA 0x04 /**< data has duplicates */ |
1077 | |
1078 | /** valid flags for #mdb_node_add() */ |
1079 | #define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND) |
1080 | |
1081 | /** @} */ |
1082 | unsigned short mn_flags; /**< @ref mdb_node */ |
1083 | unsigned short mn_ksize; /**< key size */ |
1084 | char mn_data[1]; /**< key and data are appended here */ |
1085 | } MDB_node; |
1086 | |
1087 | /** Size of the node header, excluding dynamic data at the end */ |
1088 | #define NODESIZE offsetof(MDB_node, mn_data) |
1089 | |
1090 | /** Bit position of top word in page number, for shifting mn_flags */ |
1091 | #define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) |
1092 | |
1093 | /** Size of a node in a branch page with a given key. |
1094 | * This is just the node header plus the key, there is no data. |
1095 | */ |
1096 | #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) |
1097 | |
1098 | /** Size of a node in a leaf page with a given key and data. |
1099 | * This is node header plus key plus data size. |
1100 | */ |
1101 | #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) |
1102 | |
1103 | /** Address of node \b i in page \b p */ |
1104 | #define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE)) |
1105 | |
1106 | /** Address of the key for the node */ |
1107 | #define NODEKEY(node) (void *)((node)->mn_data) |
1108 | |
1109 | /** Address of the data for a node */ |
1110 | #define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) |
1111 | |
1112 | /** Get the page number pointed to by a branch node */ |
1113 | #define NODEPGNO(node) \ |
1114 | ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \ |
1115 | (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0)) |
1116 | /** Set the page number in a branch node */ |
1117 | #define SETPGNO(node,pgno) do { \ |
1118 | (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \ |
1119 | if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0) |
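/* Worked example for a build with 64-bit page numbers (PGNO_TOPWORD == 32):
 * SETPGNO(node, 0x123456789A) stores mn_lo = 0x789A, mn_hi = 0x3456 and
 * mn_flags = 0x12, and NODEPGNO(node) reassembles
 * 0x789A | (0x3456 << 16) | (0x12 << 32) == 0x123456789A.
 */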
1120 | |
1121 | /** Get the size of the data in a leaf node */ |
1122 | #define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) |
1123 | /** Set the size of the data for a leaf node */ |
1124 | #define SETDSZ(node,size) do { \ |
1125 | (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0) |
1126 | /** The size of a key in a node */ |
1127 | #define NODEKSZ(node) ((node)->mn_ksize) |
1128 | |
1129 | /** Copy a page number from src to dst */ |
1130 | #ifdef MISALIGNED_OK |
1131 | #define COPY_PGNO(dst,src) dst = src |
1132 | #else |
1133 | #if MDB_SIZE_MAX > 0xffffffffU |
1134 | #define COPY_PGNO(dst,src) do { \ |
1135 | unsigned short *s, *d; \ |
1136 | s = (unsigned short *)&(src); \ |
1137 | d = (unsigned short *)&(dst); \ |
1138 | *d++ = *s++; \ |
1139 | *d++ = *s++; \ |
1140 | *d++ = *s++; \ |
1141 | *d = *s; \ |
1142 | } while (0) |
1143 | #else |
1144 | #define COPY_PGNO(dst,src) do { \ |
1145 | unsigned short *s, *d; \ |
1146 | s = (unsigned short *)&(src); \ |
1147 | d = (unsigned short *)&(dst); \ |
1148 | *d++ = *s++; \ |
1149 | *d = *s; \ |
1150 | } while (0) |
1151 | #endif |
1152 | #endif |
1153 | /** The address of a key in a LEAF2 page. |
1154 | * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs. |
1155 | * There are no node headers, keys are stored contiguously. |
1156 | */ |
1157 | #define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks))) |
1158 | |
1159 | /** Set the \b node's key into \b keyptr, if requested. */ |
1160 | #define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \ |
1161 | (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } } |
1162 | |
1163 | /** Set the \b node's key into \b key. */ |
1164 | #define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); } |
1165 | |
1166 | /** Information about a single database in the environment. */ |
1167 | typedef struct MDB_db { |
1168 | uint32_t md_pad; /**< also ksize for LEAF2 pages */ |
1169 | uint16_t md_flags; /**< @ref mdb_dbi_open */ |
1170 | uint16_t md_depth; /**< depth of this tree */ |
1171 | pgno_t md_branch_pages; /**< number of internal pages */ |
1172 | pgno_t md_leaf_pages; /**< number of leaf pages */ |
1173 | pgno_t md_overflow_pages; /**< number of overflow pages */ |
1174 | mdb_size_t md_entries; /**< number of data items */ |
1175 | pgno_t md_root; /**< the root page of this tree */ |
1176 | } MDB_db; |
1177 | |
1178 | #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ |
1179 | #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) |
1180 | /** #mdb_dbi_open() flags */ |
1181 | #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ |
1182 | MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) |
1183 | |
1184 | /** Handle for the DB used to track free pages. */ |
1185 | #define FREE_DBI 0 |
1186 | /** Handle for the default DB. */ |
1187 | #define MAIN_DBI 1 |
1188 | /** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ |
1189 | #define CORE_DBS 2 |
1190 | |
1191 | /** Number of meta pages - also hardcoded elsewhere */ |
1192 | #define NUM_METAS 2 |
1193 | |
1194 | /** Meta page content. |
1195 | * A meta page is the start point for accessing a database snapshot. |
1196 | * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). |
1197 | */ |
1198 | typedef struct MDB_meta { |
1199 | /** Stamp identifying this as an LMDB file. It must be set |
1200 | * to #MDB_MAGIC. */ |
1201 | uint32_t mm_magic; |
1202 | /** Version number of this file. Must be set to #MDB_DATA_VERSION. */ |
1203 | uint32_t mm_version; |
1204 | #ifdef MDB_VL32 |
1205 | union { /* always zero since we don't support fixed mapping in MDB_VL32 */ |
1206 | MDB_ID mmun_ull; |
1207 | void *mmun_address; |
1208 | } mm_un; |
1209 | #define mm_address mm_un.mmun_address |
1210 | #else |
1211 | void *mm_address; /**< address for fixed mapping */ |
1212 | #endif |
1213 | mdb_size_t mm_mapsize; /**< size of mmap region */ |
1214 | MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ |
1215 | /** The size of pages used in this DB */ |
1216 | #define mm_psize mm_dbs[FREE_DBI].md_pad |
1217 | /** Any persistent environment flags. @ref mdb_env */ |
1218 | #define mm_flags mm_dbs[FREE_DBI].md_flags |
1219 | /** Last used page in the datafile. |
1220 | * Actually the file may be shorter if the freeDB lists the final pages. |
1221 | */ |
1222 | pgno_t mm_last_pg; |
1223 | volatile txnid_t mm_txnid; /**< txnid that committed this page */ |
1224 | } MDB_meta; |
1225 | |
1226 | /** Buffer for a stack-allocated meta page. |
1227 | * The members define size and alignment, and silence type |
1228 | * aliasing warnings. They are not used directly; that could |
1229 | * mean incorrectly using several union members in parallel. |
1230 | */ |
1231 | typedef union MDB_metabuf { |
1232 | MDB_page mb_page; |
1233 | struct { |
1234 | char mm_pad[PAGEHDRSZ]; |
1235 | MDB_meta mm_meta; |
1236 | } mb_metabuf; |
1237 | } MDB_metabuf; |
1238 | |
1239 | /** Auxiliary DB info. |
1240 | * The information here is mostly static/read-only. There is |
1241 | * only a single copy of this record in the environment. |
1242 | */ |
1243 | typedef struct MDB_dbx { |
1244 | MDB_val md_name; /**< name of the database */ |
1245 | MDB_cmp_func *md_cmp; /**< function for comparing keys */ |
1246 | MDB_cmp_func *md_dcmp; /**< function for comparing data items */ |
1247 | MDB_rel_func *md_rel; /**< user relocate function */ |
1248 | void *md_relctx; /**< user-provided context for md_rel */ |
1249 | } MDB_dbx; |
1250 | |
1251 | /** A database transaction. |
1252 | * Every operation requires a transaction handle. |
1253 | */ |
1254 | struct MDB_txn { |
1255 | MDB_txn *mt_parent; /**< parent of a nested txn */ |
1256 | /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ |
1257 | MDB_txn *mt_child; |
1258 | pgno_t mt_next_pgno; /**< next unallocated page */ |
1259 | #ifdef MDB_VL32 |
1260 | pgno_t mt_last_pgno; /**< last written page */ |
1261 | #endif |
1262 | /** The ID of this transaction. IDs are integers incrementing from 1. |
1263 | * Only committed write transactions increment the ID. If a transaction |
1264 | * aborts, the ID may be re-used by the next writer. |
1265 | */ |
1266 | txnid_t mt_txnid; |
1267 | MDB_env *mt_env; /**< the DB environment */ |
1268 | /** The list of pages that became unused during this transaction. |
1269 | */ |
1270 | MDB_IDL mt_free_pgs; |
1271 | /** The list of loose pages that became unused and may be reused |
1272 | * in this transaction, linked through #NEXT_LOOSE_PAGE(page). |
1273 | */ |
1274 | MDB_page *mt_loose_pgs; |
1275 | /** Number of loose pages (#mt_loose_pgs) */ |
1276 | int mt_loose_count; |
1277 | /** The sorted list of dirty pages we temporarily wrote to disk |
 * because the dirty list was full. Page numbers in here are
 * shifted left by 1; deleted slots have the LSB set.
1280 | */ |
1281 | MDB_IDL mt_spill_pgs; |
1282 | union { |
1283 | /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ |
1284 | MDB_ID2L dirty_list; |
1285 | /** For read txns: This thread/txn's reader table slot, or NULL. */ |
1286 | MDB_reader *reader; |
1287 | } mt_u; |
1288 | /** Array of records for each DB known in the environment. */ |
1289 | MDB_dbx *mt_dbxs; |
1290 | /** Array of MDB_db records for each known DB */ |
1291 | MDB_db *mt_dbs; |
1292 | /** Array of sequence numbers for each DB handle */ |
1293 | unsigned int *mt_dbiseqs; |
1294 | /** @defgroup mt_dbflag Transaction DB Flags |
1295 | * @ingroup internal |
1296 | * @{ |
1297 | */ |
1298 | #define DB_DIRTY 0x01 /**< DB was written in this txn */ |
1299 | #define DB_STALE 0x02 /**< Named-DB record is older than txnID */ |
1300 | #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ |
1301 | #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ |
1302 | #define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ |
1303 | #define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ |
1304 | /** @} */ |
1305 | /** In write txns, array of cursors for each DB */ |
1306 | MDB_cursor **mt_cursors; |
1307 | /** Array of flags for each DB */ |
1308 | unsigned char *mt_dbflags; |
1309 | #ifdef MDB_VL32 |
1310 | /** List of read-only pages (actually chunks) */ |
1311 | MDB_ID3L mt_rpages; |
1312 | /** We map chunks of 16 pages. Even though Windows uses 4KB pages, all |
1313 | * mappings must begin on 64KB boundaries. So we round off all pgnos to |
1314 | * a chunk boundary. We do the same on Linux for symmetry, and also to |
1315 | * reduce the frequency of mmap/munmap calls. |
1316 | */ |
1317 | #define MDB_RPAGE_CHUNK 16 |
1318 | #define MDB_TRPAGE_SIZE 4096 /**< size of #mt_rpages array of chunks */ |
1319 | #define MDB_TRPAGE_MAX (MDB_TRPAGE_SIZE-1) /**< maximum chunk index */ |
1320 | unsigned int mt_rpcheck; /**< threshold for reclaiming unref'd chunks */ |
1321 | #endif |
1322 | /** Number of DB records in use, or 0 when the txn is finished. |
1323 | * This number only ever increments until the txn finishes; we |
1324 | * don't decrement it when individual DB handles are closed. |
1325 | */ |
1326 | MDB_dbi mt_numdbs; |
1327 | |
1328 | /** @defgroup mdb_txn Transaction Flags |
1329 | * @ingroup internal |
1330 | * @{ |
1331 | */ |
1332 | /** #mdb_txn_begin() flags */ |
1333 | #define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC|MDB_NOSYNC|MDB_RDONLY) |
1334 | #define MDB_TXN_NOMETASYNC MDB_NOMETASYNC /**< don't sync meta for this txn on commit */ |
1335 | #define MDB_TXN_NOSYNC MDB_NOSYNC /**< don't sync this txn on commit */ |
1336 | #define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ |
1337 | /* internal txn flags */ |
1338 | #define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */ |
1339 | #define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ |
1340 | #define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ |
1341 | #define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ |
1342 | #define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ |
1343 | #define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ |
1344 | /** most operations on the txn are currently illegal */ |
1345 | #define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD) |
1346 | /** @} */ |
1347 | unsigned int mt_flags; /**< @ref mdb_txn */ |
1348 | /** #dirty_list room: Array size - \#dirty pages visible to this txn. |
1349 | * Includes ancestor txns' dirty pages not hidden by other txns' |
1350 | * dirty/spilled pages. Thus commit(nested txn) has room to merge |
1351 | * dirty_list into mt_parent after freeing hidden mt_parent pages. |
1352 | */ |
1353 | unsigned int mt_dirty_room; |
1354 | }; |
1355 | |
1356 | /** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. |
1357 | * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to |
1358 | * raise this on a 64 bit machine. |
1359 | */ |
1360 | #define CURSOR_STACK 32 |
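
/* Illustrative sketch, not part of the library: the stack depth needed to
 * address n nodes with a minimum fanout f is ceil(log_f(n)), so a fanout of
 * 2 reaches 2^32 nodes at depth 32, which is why CURSOR_STACK = 32 suffices.
 * The helper below is hypothetical and compiled out; it only checks that
 * arithmetic.
 */
#if 0
static unsigned
cursor_depth_needed(unsigned long long nodes, unsigned fanout)
{
	unsigned depth;
	if (fanout < 2)
		fanout = 2;	/* a tree cannot have fanout below 2 */
	/* divide down instead of multiplying up, so large inputs cannot overflow */
	for (depth = 0; nodes > 1; depth++)
		nodes = nodes / fanout + (nodes % fanout != 0);	/* ceil(nodes/fanout) */
	return depth;	/* e.g. cursor_depth_needed(1ULL << 32, 2) == 32 */
}
#endif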
1361 | |
1362 | struct MDB_xcursor; |
1363 | |
1364 | /** Cursors are used for all DB operations. |
1365 | * A cursor holds a path of (page pointer, key index) from the DB |
1366 | * root to a position in the DB, plus other state. #MDB_DUPSORT |
1367 | * cursors include an xcursor to the current data item. Write txns |
1368 | * track their cursors and keep them up to date when data moves. |
1369 | * Exception: An xcursor's pointer to a #P_SUBP page can be stale. |
1370 | * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). |
1371 | */ |
1372 | struct MDB_cursor { |
1373 | /** Next cursor on this DB in this txn */ |
1374 | MDB_cursor *mc_next; |
1375 | /** Backup of the original cursor if this cursor is a shadow */ |
1376 | MDB_cursor *mc_backup; |
1377 | /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ |
1378 | struct MDB_xcursor *mc_xcursor; |
1379 | /** The transaction that owns this cursor */ |
1380 | MDB_txn *mc_txn; |
1381 | /** The database handle this cursor operates on */ |
1382 | MDB_dbi mc_dbi; |
1383 | /** The database record for this cursor */ |
1384 | MDB_db *mc_db; |
1385 | /** The database auxiliary record for this cursor */ |
1386 | MDB_dbx *mc_dbx; |
1387 | /** The @ref mt_dbflag for this database */ |
1388 | unsigned char *mc_dbflag; |
1389 | unsigned short mc_snum; /**< number of pushed pages */ |
1390 | unsigned short mc_top; /**< index of top page, normally mc_snum-1 */ |
1391 | /** @defgroup mdb_cursor Cursor Flags |
1392 | * @ingroup internal |
1393 | * Cursor state flags. |
1394 | * @{ |
1395 | */ |
1396 | #define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ |
1397 | #define C_EOF 0x02 /**< No more data */ |
1398 | #define C_SUB 0x04 /**< Cursor is a sub-cursor */ |
1399 | #define C_DEL 0x08 /**< last op was a cursor_del */ |
1400 | #define C_UNTRACK 0x40 /**< Un-track cursor when closing */ |
1401 | #define C_WRITEMAP MDB_TXN_WRITEMAP /**< Copy of txn flag */ |
1402 | /** Read-only cursor into the txn's original snapshot in the map. |
1403 | * Set for read-only txns, and in #mdb_page_alloc() for #FREE_DBI when |
1404 | * #MDB_DEVEL & 2. Only the code paths needed for this case are implemented.
1405 | */ |
1406 | #define C_ORIG_RDONLY MDB_TXN_RDONLY |
1407 | /** @} */ |
1408 | unsigned int mc_flags; /**< @ref mdb_cursor */ |
1409 | MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ |
1410 | indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ |
1411 | #ifdef MDB_VL32 |
1412 | MDB_page *mc_ovpg; /**< a referenced overflow page */ |
1413 | # define MC_OVPG(mc) ((mc)->mc_ovpg) |
1414 | # define MC_SET_OVPG(mc, pg) ((mc)->mc_ovpg = (pg)) |
1415 | #else |
1416 | # define MC_OVPG(mc) ((MDB_page *)0) |
1417 | # define MC_SET_OVPG(mc, pg) ((void)0) |
1418 | #endif |
1419 | }; |
1420 | |
1421 | /** Context for sorted-dup records. |
1422 | * We could have gone to a fully recursive design, with arbitrarily |
1423 | * deep nesting of sub-databases. But for now we only handle these |
1424 | * levels - main DB, optional sub-DB, sorted-duplicate DB. |
1425 | */ |
1426 | typedef struct MDB_xcursor { |
1427 | /** A sub-cursor for traversing the Dup DB */ |
1428 | MDB_cursor mx_cursor; |
1429 | /** The database record for this Dup DB */ |
1430 | MDB_db mx_db; |
1431 | /** The auxiliary DB record for this Dup DB */ |
1432 | MDB_dbx mx_dbx; |
1433 | /** The @ref mt_dbflag for this Dup DB */ |
1434 | unsigned char mx_dbflag; |
1435 | } MDB_xcursor; |
1436 | |
1437 | /** Check if there is an inited xcursor */ |
1438 | #define XCURSOR_INITED(mc) \ |
1439 | ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) |
1440 | |
1441 | /** Update the xcursor's sub-page pointer, if any, in \b mc. Needed |
1442 | * when the node which contains the sub-page may have moved. Called |
1443 | * with leaf page \b mp = mc->mc_pg[\b top]. |
1444 | */ |
1445 | #define XCURSOR_REFRESH(mc, top, mp) do { \ |
1446 | MDB_page *xr_pg = (mp); \ |
1447 | MDB_node *xr_node; \ |
1448 | if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \ |
1449 | xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \ |
1450 | if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \ |
1451 | (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ |
1452 | } while (0) |
1453 | |
1454 | /** State of FreeDB old pages, stored in the MDB_env */ |
1455 | typedef struct MDB_pgstate { |
1456 | pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ |
1457 | txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ |
1458 | } MDB_pgstate; |
1459 | |
1460 | /** The database environment. */ |
1461 | struct MDB_env { |
1462 | HANDLE me_fd; /**< The main data file */ |
1463 | HANDLE me_lfd; /**< The lock file */ |
1464 | HANDLE me_mfd; /**< For writing and syncing the meta pages */ |
1465 | #if defined(MDB_VL32) && defined(_WIN32) |
1466 | HANDLE me_fmh; /**< File Mapping handle */ |
1467 | #endif |
1468 | /** Failed to update the meta page. Probably an I/O error. */ |
1469 | #define MDB_FATAL_ERROR 0x80000000U |
1470 | /** Some fields are initialized. */ |
1471 | #define MDB_ENV_ACTIVE 0x20000000U |
1472 | /** me_txkey is set */ |
1473 | #define MDB_ENV_TXKEY 0x10000000U |
1474 | /** fdatasync is unreliable */ |
1475 | #define MDB_FSYNCONLY 0x08000000U |
1476 | uint32_t me_flags; /**< @ref mdb_env */ |
1477 | unsigned int me_psize; /**< DB page size, inited from me_os_psize */ |
1478 | unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */ |
1479 | unsigned int me_maxreaders; /**< size of the reader table */ |
1480 | /** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */ |
1481 | volatile int me_close_readers; |
1482 | MDB_dbi me_numdbs; /**< number of DBs opened */ |
1483 | MDB_dbi me_maxdbs; /**< size of the DB table */ |
1484 | MDB_PID_T me_pid; /**< process ID of this env */ |
1485 | char *me_path; /**< path to the DB files */ |
1486 | char *me_map; /**< the memory map of the data file */ |
1487 | MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ |
1488 | MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */ |
1489 | void *me_pbuf; /**< scratch area for DUPSORT put() */ |
1490 | MDB_txn *me_txn; /**< current write transaction */ |
1491 | MDB_txn *me_txn0; /**< prealloc'd write transaction */ |
1492 | mdb_size_t me_mapsize; /**< size of the data memory map */ |
1493 | off_t me_size; /**< current file size */ |
1494 | pgno_t me_maxpg; /**< me_mapsize / me_psize */ |
1495 | MDB_dbx *me_dbxs; /**< array of static DB info */ |
1496 | uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ |
1497 | unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */ |
1498 | pthread_key_t me_txkey; /**< thread-key for readers */ |
1499 | txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ |
1500 | MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ |
1501 | # define me_pglast me_pgstate.mf_pglast |
1502 | # define me_pghead me_pgstate.mf_pghead |
1503 | MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ |
1504 | /** IDL of pages that became unused in a write txn */ |
1505 | MDB_IDL me_free_pgs; |
1506 | /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ |
1507 | MDB_ID2L me_dirty_list; |
1508 | /** Max number of freelist items that can fit in a single overflow page */ |
1509 | int me_maxfree_1pg; |
1510 | /** Max size of a node on a page */ |
1511 | unsigned int me_nodemax; |
1512 | #if !(MDB_MAXKEYSIZE) |
1513 | unsigned int me_maxkey; /**< max size of a key */ |
1514 | #endif |
1515 | int me_live_reader; /**< have liveness lock in reader table */ |
1516 | #ifdef _WIN32 |
1517 | int me_pidquery; /**< Used in OpenProcess */ |
1518 | #endif |
1519 | #ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */ |
1520 | # define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */ |
1521 | # define me_wmutex me_txns->mti_wmutex /**< Shared writer lock */ |
1522 | #else |
1523 | mdb_mutex_t me_rmutex; |
1524 | mdb_mutex_t me_wmutex; |
1525 | # if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) |
1526 | /** Half-initialized name of mutexes, to be completed by #MUTEXNAME() */ |
1527 | char me_mutexname[sizeof(MUTEXNAME_PREFIX) + 11]; |
1528 | # endif |
1529 | #endif |
1530 | #ifdef MDB_VL32 |
1531 | MDB_ID3L me_rpages; /**< like #mt_rpages, but global to env */ |
1532 | pthread_mutex_t me_rpmutex; /**< control access to #me_rpages */ |
1533 | #define MDB_ERPAGE_SIZE 16384 |
1534 | #define MDB_ERPAGE_MAX (MDB_ERPAGE_SIZE-1) |
1535 | unsigned int me_rpcheck; |
1536 | #endif |
1537 | void *me_userctx; /**< User-settable context */ |
1538 | MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ |
1539 | }; |
1540 | |
1541 | /** Nested transaction */ |
1542 | typedef struct MDB_ntxn { |
1543 | MDB_txn mnt_txn; /**< the transaction */ |
1544 | MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ |
1545 | } MDB_ntxn; |
1546 | |
1547 | /** max number of pages to commit in one writev() call */ |
1548 | #define MDB_COMMIT_PAGES 64 |
1549 | #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES |
1550 | #undef MDB_COMMIT_PAGES |
1551 | #define MDB_COMMIT_PAGES IOV_MAX |
1552 | #endif |
1553 | |
1554 | /** max bytes to write in one call */ |
1555 | #define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4)) |
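
/* Worked example (illustrative): sizeof(ssize_t)==4 makes the shift count 1,
 * so a 32-bit build caps one write at 0x40000000 >> 1 = 0x20000000 bytes
 * (512MB); a 64-bit build shifts by 0 and keeps the full 0x40000000 bytes
 * (1GB), comfortably below what a signed 32-bit ssize_t could return.
 */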
1556 | |
1557 | /** Check \b txn and \b dbi arguments to a function */ |
1558 | #define TXN_DBI_EXIST(txn, dbi, validity) \ |
1559 | ((txn) && (dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) |
1560 | |
1561 | /** Check for misused \b dbi handles */ |
1562 | #define TXN_DBI_CHANGED(txn, dbi) \ |
1563 | ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) |
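
/* Usage sketch (illustrative, compiled out): the public entry points guard
 * per-DB work with these two checks. Returning EINVAL for a missing/invalid
 * handle and MDB_BAD_DBI for a changed sequence number mirrors the pattern
 * used by functions later in this file; the helper name is hypothetical. It
 * is kept under #if 0 here because struct MDB_env, which TXN_DBI_CHANGED()
 * dereferences, is only completed further down.
 */
#if 0
static int
example_validate_dbi(MDB_txn *txn, MDB_dbi dbi)
{
	if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
		return EINVAL;
	if (TXN_DBI_CHANGED(txn, dbi))
		return MDB_BAD_DBI;
	return MDB_SUCCESS;
}
#endif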
1564 | |
1565 | static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); |
1566 | static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); |
1567 | static int mdb_page_touch(MDB_cursor *mc); |
1568 | |
1569 | #define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \ |
1570 | "reset-tmp", "fail-begin", "fail-beginchild"} |
1571 | enum { |
1572 | /* mdb_txn_end operation number, for logging */ |
1573 | MDB_END_COMMITTED, MDB_END_EMPTY_COMMIT, MDB_END_ABORT, MDB_END_RESET, |
1574 | MDB_END_RESET_TMP, MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD |
1575 | }; |
1576 | #define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */ |
1577 | #define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ |
1578 | #define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ |
1579 | #define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ |
1580 | static void mdb_txn_end(MDB_txn *txn, unsigned mode); |
1581 | |
1582 | static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); |
1583 | static int mdb_page_search_root(MDB_cursor *mc, |
1584 | MDB_val *key, int modify); |
1585 | #define MDB_PS_MODIFY 1 |
1586 | #define MDB_PS_ROOTONLY 2 |
1587 | #define MDB_PS_FIRST 4 |
1588 | #define MDB_PS_LAST 8 |
1589 | static int mdb_page_search(MDB_cursor *mc, |
1590 | MDB_val *key, int flags); |
1591 | static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); |
1592 | |
1593 | #define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ |
1594 | static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, |
1595 | pgno_t newpgno, unsigned int nflags); |
1596 | |
1597 | static int mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta); |
1598 | static MDB_meta *mdb_env_pick_meta(const MDB_env *env); |
1599 | static int mdb_env_write_meta(MDB_txn *txn); |
1600 | #ifdef MDB_USE_POSIX_MUTEX /* Drop unused excl arg */ |
1601 | # define mdb_env_close0(env, excl) mdb_env_close1(env) |
1602 | #endif |
1603 | static void mdb_env_close0(MDB_env *env, int excl); |
1604 | |
1605 | static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); |
1606 | static int mdb_node_add(MDB_cursor *mc, indx_t indx, |
1607 | MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags); |
1608 | static void mdb_node_del(MDB_cursor *mc, int ksize); |
1609 | static void mdb_node_shrink(MDB_page *mp, indx_t indx); |
1610 | static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); |
1611 | static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data); |
1612 | static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); |
1613 | static size_t mdb_branch_size(MDB_env *env, MDB_val *key); |
1614 | |
1615 | static int mdb_rebalance(MDB_cursor *mc); |
1616 | static int mdb_update_key(MDB_cursor *mc, MDB_val *key); |
1617 | |
1618 | static void mdb_cursor_pop(MDB_cursor *mc); |
1619 | static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp); |
1620 | |
1621 | static int mdb_cursor_del0(MDB_cursor *mc); |
1622 | static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); |
1623 | static int mdb_cursor_sibling(MDB_cursor *mc, int move_right); |
1624 | static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); |
1625 | static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); |
1626 | static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, |
1627 | int *exactp); |
1628 | static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); |
1629 | static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); |
1630 | |
1631 | static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); |
1632 | static void mdb_xcursor_init0(MDB_cursor *mc); |
1633 | static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); |
1634 | static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); |
1635 | |
1636 | static int mdb_drop0(MDB_cursor *mc, int subs); |
1637 | static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); |
1638 | static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead); |
1639 | |
1640 | /** @cond */ |
1641 | static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long; |
1642 | /** @endcond */ |
1643 | |
1644 | /** Compare two items pointing at '#mdb_size_t's of unknown alignment. */ |
1645 | #ifdef MISALIGNED_OK |
1646 | # define mdb_cmp_clong mdb_cmp_long |
1647 | #else |
1648 | # define mdb_cmp_clong mdb_cmp_cint |
1649 | #endif |
1650 | |
1651 | /** True if we need #mdb_cmp_clong() instead of \b cmp for #MDB_INTEGERDUP */ |
1652 | #define NEED_CMP_CLONG(cmp, ksize) \ |
1653 | (UINT_MAX < MDB_SIZE_MAX && \ |
1654 | (cmp) == mdb_cmp_int && (ksize) == sizeof(mdb_size_t)) |
1655 | |
1656 | #ifdef _WIN32 |
1657 | static SECURITY_DESCRIPTOR mdb_null_sd; |
1658 | static SECURITY_ATTRIBUTES mdb_all_sa; |
1659 | static int mdb_sec_inited; |
1660 | |
1661 | struct MDB_name; |
1662 | static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra); |
1663 | #endif |
1664 | |
1665 | /** Return the library version info. */ |
1666 | char * ESECT |
1667 | mdb_version(int *major, int *minor, int *patch) |
1668 | { |
1669 | if (major) *major = MDB_VERSION_MAJOR; |
1670 | if (minor) *minor = MDB_VERSION_MINOR; |
1671 | if (patch) *patch = MDB_VERSION_PATCH; |
1672 | return MDB_VERSION_STRING; |
1673 | } |
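
/* Usage sketch (illustrative): any of the output pointers may be NULL if the
 * caller does not need that component; the returned string is a constant and
 * must not be freed. The helper name is hypothetical and compiled out.
 */
#if 0
static void
example_print_version(void)
{
	int major, minor, patch;
	char *ver = mdb_version(&major, &minor, &patch);
	fprintf(stderr, "LMDB %s (%d.%d.%d)\n", ver, major, minor, patch);
}
#endif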
1674 | |
1675 | /** Table of descriptions for LMDB @ref errors */ |
1676 | static char *const mdb_errstr[] = { |
1677 | "MDB_KEYEXIST: Key/data pair already exists" , |
1678 | "MDB_NOTFOUND: No matching key/data pair found" , |
1679 | "MDB_PAGE_NOTFOUND: Requested page not found" , |
1680 | "MDB_CORRUPTED: Located page was wrong type" , |
1681 | "MDB_PANIC: Update of meta page failed or environment had fatal error" , |
1682 | "MDB_VERSION_MISMATCH: Database environment version mismatch" , |
1683 | "MDB_INVALID: File is not an LMDB file" , |
1684 | "MDB_MAP_FULL: Environment mapsize limit reached" , |
1685 | "MDB_DBS_FULL: Environment maxdbs limit reached" , |
1686 | "MDB_READERS_FULL: Environment maxreaders limit reached" , |
1687 | "MDB_TLS_FULL: Thread-local storage keys full - too many environments open" , |
1688 | "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big" , |
1689 | "MDB_CURSOR_FULL: Internal error - cursor stack limit reached" , |
1690 | "MDB_PAGE_FULL: Internal error - page has no more space" , |
1691 | "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize" , |
1692 | "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed" , |
1693 | "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot" , |
1694 | "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid" , |
1695 | "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size" , |
1696 | "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly" , |
1697 | "MDB_PROBLEM: Unexpected problem - txn should abort" , |
1698 | }; |
1699 | |
1700 | char * |
1701 | mdb_strerror(int err) |
1702 | { |
1703 | #ifdef _WIN32 |
1704 | /** HACK: pad 4KB on stack over the buf. Return system msgs in buf. |
1705 | * This works as long as no function between the call to mdb_strerror |
1706 | * and the actual use of the message uses more than 4K of stack. |
1707 | */ |
1708 | #define MSGSIZE 1024 |
1709 | #define PADSIZE 4096 |
1710 | char buf[MSGSIZE+PADSIZE], *ptr = buf; |
1711 | #endif |
1712 | int i; |
1713 | if (!err) |
1714 | return ("Successful return: 0" ); |
1715 | |
1716 | if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { |
1717 | i = err - MDB_KEYEXIST; |
1718 | return mdb_errstr[i]; |
1719 | } |
1720 | |
1721 | #ifdef _WIN32 |
1722 | /* These are the C-runtime error codes we use. The comment indicates |
1723 | * their numeric value, and the Win32 error they would correspond to |
1724 | * if the error actually came from a Win32 API. A major mess, we should |
1725 | * have used LMDB-specific error codes for everything. |
1726 | */ |
1727 | switch(err) { |
1728 | case ENOENT: /* 2, FILE_NOT_FOUND */ |
1729 | case EIO: /* 5, ACCESS_DENIED */ |
1730 | case ENOMEM: /* 12, INVALID_ACCESS */ |
1731 | case EACCES: /* 13, INVALID_DATA */ |
1732 | case EBUSY: /* 16, CURRENT_DIRECTORY */ |
1733 | case EINVAL: /* 22, BAD_COMMAND */ |
1734 | case ENOSPC: /* 28, OUT_OF_PAPER */ |
1735 | return strerror(err); |
1736 | default: |
1737 | ; |
1738 | } |
1739 | buf[0] = 0; |
1740 | FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | |
1741 | FORMAT_MESSAGE_IGNORE_INSERTS, |
1742 | NULL, err, 0, ptr, MSGSIZE, (va_list *)buf+MSGSIZE); |
1743 | return ptr; |
1744 | #else |
1745 | return strerror(err); |
1746 | #endif |
1747 | } |
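
/* Usage sketch (illustrative): LMDB calls return 0 on success and either an
 * errno value or an MDB_* code on failure; both can be passed to
 * mdb_strerror(). The path below is a placeholder and the helper name is
 * hypothetical. On Windows the returned message may live in the caller's
 * stack pad (see the HACK above), so use it before doing stack-hungry work.
 */
#if 0
static void
example_report_error(MDB_env *env)
{
	int rc = mdb_env_open(env, "./testdb", 0, 0664);	/* placeholder path */
	if (rc)
		fprintf(stderr, "mdb_env_open failed: %s (%d)\n", mdb_strerror(rc), rc);
}
#endif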
1748 | |
1749 | /** assert(3) variant in cursor context */ |
1750 | #define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr) |
1751 | /** assert(3) variant in transaction context */ |
1752 | #define mdb_tassert(txn, expr) mdb_assert0((txn)->mt_env, expr, #expr) |
1753 | /** assert(3) variant in environment context */ |
1754 | #define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr) |
1755 | |
1756 | #ifndef NDEBUG |
1757 | # define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \ |
1758 | mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__)) |
1759 | |
1760 | static void ESECT |
1761 | mdb_assert_fail(MDB_env *env, const char *expr_txt, |
1762 | const char *func, const char *file, int line) |
1763 | { |
1764 | char buf[400]; |
1765 | sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()" , |
1766 | file, line, expr_txt, func); |
1767 | if (env->me_assert_func) |
1768 | env->me_assert_func(env, buf); |
1769 | fprintf(stderr, "%s\n" , buf); |
1770 | abort(); |
1771 | } |
1772 | #else |
1773 | # define mdb_assert0(env, expr, expr_txt) ((void) 0) |
1774 | #endif /* NDEBUG */ |
1775 | |
1776 | #if MDB_DEBUG |
1777 | /** Return the page number of \b mp, which may be a sub-page, for debug output */
1778 | static pgno_t |
1779 | mdb_dbg_pgno(MDB_page *mp) |
1780 | { |
1781 | pgno_t ret; |
1782 | COPY_PGNO(ret, mp->mp_pgno); |
1783 | return ret; |
1784 | } |
1785 | |
1786 | /** Display a key in hexadecimal and return the address of the result. |
1787 | * @param[in] key the key to display |
1788 | * @param[in] buf the buffer to write into. Should always be #DKBUF. |
1789 | * @return The key in hexadecimal form. |
1790 | */ |
1791 | char * |
1792 | mdb_dkey(MDB_val *key, char *buf) |
1793 | { |
1794 | char *ptr = buf;
1795 | unsigned char *c;
1796 | unsigned int i;
1797 | 
1798 | if (!key)
1799 | return "" ;
1800 | c = key->mv_data;
1801 | if (key->mv_size > DKBUF_MAXKEYSIZE)
1802 | return "MDB_MAXKEYSIZE" ; |
1803 | /* may want to make this a dynamic check: if the key is mostly |
1804 | * printable characters, print it as-is instead of converting to hex. |
1805 | */ |
1806 | #if 1 |
1807 | buf[0] = '\0'; |
1808 | for (i=0; i<key->mv_size; i++) |
1809 | ptr += sprintf(ptr, "%02x" , *c++); |
1810 | #else |
1811 | sprintf(buf, "%.*s" , key->mv_size, key->mv_data); |
1812 | #endif |
1813 | return buf; |
1814 | } |
1815 | |
1816 | static const char * |
1817 | mdb_leafnode_type(MDB_node *n) |
1818 | { |
1819 | static char *const tp[2][2] = {{"" , ": DB" }, {": sub-page" , ": sub-DB" }}; |
1820 | return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : |
1821 | tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)]; |
1822 | } |
1823 | |
1824 | /** Display all the keys in the page. */ |
1825 | void |
1826 | mdb_page_list(MDB_page *mp) |
1827 | { |
1828 | pgno_t pgno = mdb_dbg_pgno(mp); |
1829 | const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : "" ; |
1830 | MDB_node *node; |
1831 | unsigned int i, nkeys, nsize, total = 0; |
1832 | MDB_val key; |
1833 | DKBUF; |
1834 | |
1835 | switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { |
1836 | case P_BRANCH: type = "Branch page" ; break; |
1837 | case P_LEAF: type = "Leaf page" ; break; |
1838 | case P_LEAF|P_SUBP: type = "Sub-page" ; break; |
1839 | case P_LEAF|P_LEAF2: type = "LEAF2 page" ; break; |
1840 | case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page" ; break; |
1841 | case P_OVERFLOW: |
1842 | fprintf(stderr, "Overflow page %" Yu" pages %u%s\n" , |
1843 | pgno, mp->mp_pages, state); |
1844 | return; |
1845 | case P_META: |
1846 | fprintf(stderr, "Meta-page %" Yu" txnid %" Yu"\n" , |
1847 | pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); |
1848 | return; |
1849 | default: |
1850 | fprintf(stderr, "Bad page %" Yu" flags 0x%X\n" , pgno, mp->mp_flags); |
1851 | return; |
1852 | } |
1853 | |
1854 | nkeys = NUMKEYS(mp); |
1855 | fprintf(stderr, "%s %" Yu" numkeys %d%s\n" , type, pgno, nkeys, state); |
1856 | |
1857 | for (i=0; i<nkeys; i++) { |
1858 | if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ |
1859 | key.mv_size = nsize = mp->mp_pad; |
1860 | key.mv_data = LEAF2KEY(mp, i, nsize); |
1861 | total += nsize; |
1862 | fprintf(stderr, "key %d: nsize %d, %s\n" , i, nsize, DKEY(&key)); |
1863 | continue; |
1864 | } |
1865 | node = NODEPTR(mp, i); |
1866 | key.mv_size = node->mn_ksize; |
1867 | key.mv_data = node->mn_data; |
1868 | nsize = NODESIZE + key.mv_size; |
1869 | if (IS_BRANCH(mp)) { |
1870 | fprintf(stderr, "key %d: page %" Yu", %s\n" , i, NODEPGNO(node), |
1871 | DKEY(&key)); |
1872 | total += nsize; |
1873 | } else { |
1874 | if (F_ISSET(node->mn_flags, F_BIGDATA)) |
1875 | nsize += sizeof(pgno_t); |
1876 | else |
1877 | nsize += NODEDSZ(node); |
1878 | total += nsize; |
1879 | nsize += sizeof(indx_t); |
1880 | fprintf(stderr, "key %d: nsize %d, %s%s\n" , |
1881 | i, nsize, DKEY(&key), mdb_leafnode_type(node)); |
1882 | } |
1883 | total = EVEN(total); |
1884 | } |
1885 | fprintf(stderr, "Total: header %d + contents %d + unused %d\n" , |
1886 | IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp)); |
1887 | } |
1888 | |
1889 | void |
1890 | mdb_cursor_chk(MDB_cursor *mc) |
1891 | { |
1892 | unsigned int i; |
1893 | MDB_node *node; |
1894 | MDB_page *mp; |
1895 | |
1896 | if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; |
1897 | for (i=0; i<mc->mc_top; i++) { |
1898 | mp = mc->mc_pg[i]; |
1899 | node = NODEPTR(mp, mc->mc_ki[i]); |
1900 | if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno) |
1901 | printf("oops!\n" ); |
1902 | } |
1903 | if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) |
1904 | printf("ack!\n" ); |
1905 | if (XCURSOR_INITED(mc)) { |
1906 | node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
1907 | if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && |
1908 | mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { |
1909 | printf("blah!\n" ); |
1910 | } |
1911 | } |
1912 | } |
1913 | #endif |
1914 | |
1915 | #if (MDB_DEBUG) > 2 |
1916 | /** Count all the pages in each DB and in the freelist |
1917 | * and make sure it matches the actual number of pages |
1918 | * being used. |
1919 | * All named DBs must be open for a correct count. |
1920 | */ |
1921 | static void mdb_audit(MDB_txn *txn) |
1922 | { |
1923 | MDB_cursor mc; |
1924 | MDB_val key, data; |
1925 | MDB_ID freecount, count; |
1926 | MDB_dbi i; |
1927 | int rc; |
1928 | |
1929 | freecount = 0; |
1930 | mdb_cursor_init(&mc, txn, FREE_DBI, NULL); |
1931 | while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) |
1932 | freecount += *(MDB_ID *)data.mv_data; |
1933 | mdb_tassert(txn, rc == MDB_NOTFOUND); |
1934 | |
1935 | count = 0; |
1936 | for (i = 0; i<txn->mt_numdbs; i++) { |
1937 | MDB_xcursor mx; |
1938 | if (!(txn->mt_dbflags[i] & DB_VALID)) |
1939 | continue; |
1940 | mdb_cursor_init(&mc, txn, i, &mx); |
1941 | if (txn->mt_dbs[i].md_root == P_INVALID) |
1942 | continue; |
1943 | count += txn->mt_dbs[i].md_branch_pages + |
1944 | txn->mt_dbs[i].md_leaf_pages + |
1945 | txn->mt_dbs[i].md_overflow_pages; |
1946 | if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { |
1947 | rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST); |
1948 | for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) { |
1949 | unsigned j; |
1950 | MDB_page *mp; |
1951 | mp = mc.mc_pg[mc.mc_top]; |
1952 | for (j=0; j<NUMKEYS(mp); j++) { |
1953 | MDB_node *leaf = NODEPTR(mp, j); |
1954 | if (leaf->mn_flags & F_SUBDATA) { |
1955 | MDB_db db; |
1956 | memcpy(&db, NODEDATA(leaf), sizeof(db)); |
1957 | count += db.md_branch_pages + db.md_leaf_pages + |
1958 | db.md_overflow_pages; |
1959 | } |
1960 | } |
1961 | } |
1962 | mdb_tassert(txn, rc == MDB_NOTFOUND); |
1963 | } |
1964 | } |
1965 | if (freecount + count + NUM_METAS != txn->mt_next_pgno) { |
1966 | fprintf(stderr, "audit: %" Yu" freecount: %" Yu" count: %" Yu" total: %" Yu" next_pgno: %" Yu"\n" , |
1967 | txn->mt_txnid, freecount, count+NUM_METAS, |
1968 | freecount+count+NUM_METAS, txn->mt_next_pgno); |
1969 | } |
1970 | } |
1971 | #endif |
1972 | |
1973 | int |
1974 | mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) |
1975 | { |
1976 | return txn->mt_dbxs[dbi].md_cmp(a, b); |
1977 | } |
1978 | |
1979 | int |
1980 | mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) |
1981 | { |
1982 | MDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp; |
1983 | if (NEED_CMP_CLONG(dcmp, a->mv_size)) |
1984 | dcmp = mdb_cmp_clong; |
1985 | return dcmp(a, b); |
1986 | } |
1987 | |
1988 | /** Allocate memory for a page. |
1989 | * Re-use old malloc'd pages first for singletons, otherwise just malloc. |
1990 | * Set #MDB_TXN_ERROR on failure. |
1991 | */ |
1992 | static MDB_page * |
1993 | mdb_page_malloc(MDB_txn *txn, unsigned num) |
1994 | { |
1995 | MDB_env *env = txn->mt_env; |
1996 | MDB_page *ret = env->me_dpages; |
1997 | size_t psize = env->me_psize, sz = psize, off; |
1998 | /* For ! #MDB_NOMEMINIT, psize counts how much to init. |
1999 | * For a single page alloc, we init everything after the page header. |
2000 | * For multi-page, we init the final page; if the caller needed that |
2001 | * many pages they will be filling in at least up to the last page. |
2002 | */ |
2003 | if (num == 1) { |
2004 | if (ret) { |
2005 | VGMEMP_ALLOC(env, ret, sz); |
2006 | VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); |
2007 | env->me_dpages = ret->mp_next; |
2008 | return ret; |
2009 | } |
2010 | psize -= off = PAGEHDRSZ; |
2011 | } else { |
2012 | sz *= num; |
2013 | off = sz - psize; |
2014 | } |
2015 | if ((ret = malloc(sz)) != NULL) { |
2016 | VGMEMP_ALLOC(env, ret, sz); |
2017 | if (!(env->me_flags & MDB_NOMEMINIT)) { |
2018 | memset((char *)ret + off, 0, psize); |
2019 | ret->mp_pad = 0; |
2020 | } |
2021 | } else { |
2022 | txn->mt_flags |= MDB_TXN_ERROR; |
2023 | } |
2024 | return ret; |
2025 | } |
2026 | /** Free a single page. |
2027 | * Saves single pages to a list, for future reuse. |
2028 | * (This is not used for multi-page overflow pages.) |
2029 | */ |
2030 | static void |
2031 | mdb_page_free(MDB_env *env, MDB_page *mp) |
2032 | { |
2033 | mp->mp_next = env->me_dpages; |
2034 | VGMEMP_FREE(env, mp); |
2035 | env->me_dpages = mp; |
2036 | } |
2037 | |
2038 | /** Free a dirty page */ |
2039 | static void |
2040 | mdb_dpage_free(MDB_env *env, MDB_page *dp) |
2041 | { |
2042 | if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { |
2043 | mdb_page_free(env, dp); |
2044 | } else { |
2045 | /* large pages just get freed directly */ |
2046 | VGMEMP_FREE(env, dp); |
2047 | free(dp); |
2048 | } |
2049 | } |
2050 | |
2051 | /** Return all dirty pages to dpage list */ |
2052 | static void |
2053 | mdb_dlist_free(MDB_txn *txn) |
2054 | { |
2055 | MDB_env *env = txn->mt_env; |
2056 | MDB_ID2L dl = txn->mt_u.dirty_list; |
2057 | unsigned i, n = dl[0].mid; |
2058 | |
2059 | for (i = 1; i <= n; i++) { |
2060 | mdb_dpage_free(env, dl[i].mptr); |
2061 | } |
2062 | dl[0].mid = 0; |
2063 | } |
2064 | |
2065 | #ifdef MDB_VL32 |
2066 | static void |
2067 | mdb_page_unref(MDB_txn *txn, MDB_page *mp) |
2068 | { |
2069 | pgno_t pgno; |
2070 | MDB_ID3L tl = txn->mt_rpages; |
2071 | unsigned x, rem; |
2072 | if (mp->mp_flags & (P_SUBP|P_DIRTY)) |
2073 | return; |
2074 | rem = mp->mp_pgno & (MDB_RPAGE_CHUNK-1); |
2075 | pgno = mp->mp_pgno ^ rem; |
2076 | x = mdb_mid3l_search(tl, pgno); |
2077 | if (x != tl[0].mid && tl[x+1].mid == mp->mp_pgno) |
2078 | x++; |
2079 | if (tl[x].mref) |
2080 | tl[x].mref--; |
2081 | } |
2082 | #define MDB_PAGE_UNREF(txn, mp) mdb_page_unref(txn, mp) |
2083 | |
2084 | static void |
2085 | mdb_cursor_unref(MDB_cursor *mc) |
2086 | { |
2087 | int i; |
2088 | if (mc->mc_txn->mt_rpages[0].mid) { |
2089 | if (!mc->mc_snum || !mc->mc_pg[0] || IS_SUBP(mc->mc_pg[0])) |
2090 | return; |
2091 | for (i=0; i<mc->mc_snum; i++) |
2092 | mdb_page_unref(mc->mc_txn, mc->mc_pg[i]); |
2093 | if (mc->mc_ovpg) { |
2094 | mdb_page_unref(mc->mc_txn, mc->mc_ovpg); |
2095 | mc->mc_ovpg = 0; |
2096 | } |
2097 | } |
2098 | mc->mc_snum = mc->mc_top = 0; |
2099 | mc->mc_pg[0] = NULL; |
2100 | mc->mc_flags &= ~C_INITIALIZED; |
2101 | } |
2102 | #define MDB_CURSOR_UNREF(mc, force) \ |
2103 | (((force) || ((mc)->mc_flags & C_INITIALIZED)) \ |
2104 | ? mdb_cursor_unref(mc) \ |
2105 | : (void)0) |
2106 | |
2107 | #else |
2108 | #define MDB_PAGE_UNREF(txn, mp) |
2109 | #define MDB_CURSOR_UNREF(mc, force) ((void)0) |
2110 | #endif /* MDB_VL32 */ |
2111 | |
2112 | /** Loosen or free a single page. |
2113 | * Saves single pages to a list for future reuse |
2114 | * in this same txn. It has been pulled from the freeDB |
2115 | * and already resides on the dirty list, but has been |
2116 | * deleted. Use these pages first before pulling again |
2117 | * from the freeDB. |
2118 | * |
2119 | * If the page wasn't dirtied in this txn, just add it |
2120 | * to this txn's free list. |
2121 | */ |
2122 | static int |
2123 | mdb_page_loose(MDB_cursor *mc, MDB_page *mp) |
2124 | { |
2125 | int loose = 0; |
2126 | pgno_t pgno = mp->mp_pgno; |
2127 | MDB_txn *txn = mc->mc_txn; |
2128 | |
2129 | if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { |
2130 | if (txn->mt_parent) { |
2131 | MDB_ID2 *dl = txn->mt_u.dirty_list; |
2132 | /* If txn has a parent, make sure the page is in our |
2133 | * dirty list. |
2134 | */ |
2135 | if (dl[0].mid) { |
2136 | unsigned x = mdb_mid2l_search(dl, pgno); |
2137 | if (x <= dl[0].mid && dl[x].mid == pgno) { |
2138 | if (mp != dl[x].mptr) { /* bad cursor? */ |
2139 | mc->mc_flags &= ~(C_INITIALIZED|C_EOF); |
2140 | txn->mt_flags |= MDB_TXN_ERROR; |
2141 | return MDB_PROBLEM; |
2142 | } |
2143 | /* ok, it's ours */ |
2144 | loose = 1; |
2145 | } |
2146 | } |
2147 | } else { |
2148 | /* no parent txn, so it's just ours */ |
2149 | loose = 1; |
2150 | } |
2151 | } |
2152 | if (loose) { |
2153 | DPRINTF(("loosen db %d page %" Yu, DDBI(mc), mp->mp_pgno)); |
2154 | NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs; |
2155 | txn->mt_loose_pgs = mp; |
2156 | txn->mt_loose_count++; |
2157 | mp->mp_flags |= P_LOOSE; |
2158 | } else { |
2159 | int rc = mdb_midl_append(&txn->mt_free_pgs, pgno); |
2160 | if (rc) |
2161 | return rc; |
2162 | } |
2163 | |
2164 | return MDB_SUCCESS; |
2165 | } |
2166 | |
2167 | /** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. |
2168 | * @param[in] mc A cursor handle for the current operation. |
2169 | * @param[in] pflags Flags of the pages to update: |
2170 | * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. |
2171 | * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). |
2172 | * @return 0 on success, non-zero on failure. |
2173 | */ |
2174 | static int |
2175 | mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) |
2176 | { |
2177 | enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; |
2178 | MDB_txn *txn = mc->mc_txn; |
2179 | MDB_cursor *m3, *m0 = mc; |
2180 | MDB_xcursor *mx; |
2181 | MDB_page *dp, *mp; |
2182 | MDB_node *leaf; |
2183 | unsigned i, j; |
2184 | int rc = MDB_SUCCESS, level; |
2185 | |
2186 | /* Mark pages seen by cursors: First m0, then tracked cursors */ |
2187 | for (i = txn->mt_numdbs;; ) { |
2188 | if (mc->mc_flags & C_INITIALIZED) { |
2189 | for (m3 = mc;; m3 = &mx->mx_cursor) { |
2190 | mp = NULL; |
2191 | for (j=0; j<m3->mc_snum; j++) { |
2192 | mp = m3->mc_pg[j]; |
2193 | if ((mp->mp_flags & Mask) == pflags) |
2194 | mp->mp_flags ^= P_KEEP; |
2195 | } |
2196 | mx = m3->mc_xcursor; |
2197 | /* Proceed to mx if it is at a sub-database */ |
2198 | if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) |
2199 | break; |
2200 | if (! (mp && (mp->mp_flags & P_LEAF))) |
2201 | break; |
2202 | leaf = NODEPTR(mp, m3->mc_ki[j-1]); |
2203 | if (!(leaf->mn_flags & F_SUBDATA)) |
2204 | break; |
2205 | } |
2206 | } |
2207 | mc = mc->mc_next; |
2208 | for (; !mc || mc == m0; mc = txn->mt_cursors[--i]) |
2209 | if (i == 0) |
2210 | goto mark_done; |
2211 | } |
2212 | |
2213 | mark_done: |
2214 | if (all) { |
2215 | /* Mark dirty root pages */ |
2216 | for (i=0; i<txn->mt_numdbs; i++) { |
2217 | if (txn->mt_dbflags[i] & DB_DIRTY) { |
2218 | pgno_t pgno = txn->mt_dbs[i].md_root; |
2219 | if (pgno == P_INVALID) |
2220 | continue; |
2221 | if ((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS) |
2222 | break; |
2223 | if ((dp->mp_flags & Mask) == pflags && level <= 1) |
2224 | dp->mp_flags ^= P_KEEP; |
2225 | } |
2226 | } |
2227 | } |
2228 | |
2229 | return rc; |
2230 | } |
2231 | |
2232 | static int mdb_page_flush(MDB_txn *txn, int keep); |
2233 | |
2234 | /** Spill pages from the dirty list back to disk. |
2235 | * This is intended to prevent running into #MDB_TXN_FULL situations, |
2236 | * but note that they may still occur in a few cases: |
2237 | * 1) our estimate of the txn size could be too small. Currently this |
2238 | * seems unlikely, except with a large number of #MDB_MULTIPLE items. |
2239 | * 2) child txns may run out of space if their parents dirtied a |
2240 | * lot of pages and never spilled them. TODO: we probably should do |
2241 | * a preemptive spill during #mdb_txn_begin() of a child txn, if |
2242 | * the parent's dirty_room is below a given threshold. |
2243 | * |
2244 | * Otherwise, if not using nested txns, it is expected that apps will |
2245 | * not run into #MDB_TXN_FULL any more. The pages are flushed to disk |
2246 | * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. |
2247 | * If the txn never references them again, they can be left alone. |
2248 | * If the txn only reads them, they can be used without any fuss. |
2249 | * If the txn writes them again, they can be dirtied immediately without |
2250 | * going thru all of the work of #mdb_page_touch(). Such references are |
2251 | * handled by #mdb_page_unspill(). |
2252 | * |
2253 | * Also note, we never spill DB root pages, nor pages of active cursors, |
2254 | * because we'll need these back again soon anyway. And in nested txns, |
2255 | * we can't spill a page in a child txn if it was already spilled in a |
2256 | * parent txn. That would alter the parent txns' data even though |
2257 | * the child hasn't committed yet, and we'd have no way to undo it if |
2258 | * the child aborted. |
2259 | * |
2260 | * @param[in] m0 A cursor handle identifying the transaction and
2261 | * database for which we are checking space.
2262 | * @param[in] key For a put operation, the key being stored. |
2263 | * @param[in] data For a put operation, the data being stored. |
2264 | * @return 0 on success, non-zero on failure. |
2265 | */ |
2266 | static int |
2267 | mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) |
2268 | { |
2269 | MDB_txn *txn = m0->mc_txn; |
2270 | MDB_page *dp; |
2271 | MDB_ID2L dl = txn->mt_u.dirty_list; |
2272 | unsigned int i, j, need; |
2273 | int rc; |
2274 | |
2275 | if (m0->mc_flags & C_SUB) |
2276 | return MDB_SUCCESS; |
2277 | |
2278 | /* Estimate how much space this op will take */ |
2279 | i = m0->mc_db->md_depth; |
2280 | /* Named DBs also dirty the main DB */ |
2281 | if (m0->mc_dbi >= CORE_DBS) |
2282 | i += txn->mt_dbs[MAIN_DBI].md_depth; |
2283 | /* For puts, roughly factor in the key+data size */ |
2284 | if (key) |
2285 | i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; |
2286 | i += i; /* double it for good measure */ |
2287 | need = i; |
2288 | |
2289 | if (txn->mt_dirty_room > i) |
2290 | return MDB_SUCCESS; |
2291 | |
2292 | if (!txn->mt_spill_pgs) { |
2293 | txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); |
2294 | if (!txn->mt_spill_pgs) |
2295 | return ENOMEM; |
2296 | } else { |
2297 | /* purge deleted slots */ |
2298 | MDB_IDL sl = txn->mt_spill_pgs; |
2299 | unsigned int num = sl[0]; |
2300 | j=0; |
2301 | for (i=1; i<=num; i++) { |
2302 | if (!(sl[i] & 1)) |
2303 | sl[++j] = sl[i]; |
2304 | } |
2305 | sl[0] = j; |
2306 | } |
2307 | |
2308 | /* Preserve pages which may soon be dirtied again */ |
2309 | if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) |
2310 | goto done; |
2311 | |
2312 | /* Less aggressive spill - we originally spilled the entire dirty list, |
2313 | * with a few exceptions for cursor pages and DB root pages. But this |
2314 | * turns out to be a lot of wasted effort because in a large txn many |
2315 | * of those pages will need to be used again. So now we spill only 1/8th |
2316 | * of the dirty pages. Testing revealed this to be a good tradeoff, |
2317 | * better than 1/2, 1/4, or 1/10. |
2318 | */ |
2319 | if (need < MDB_IDL_UM_MAX / 8) |
2320 | need = MDB_IDL_UM_MAX / 8; |
2321 | |
2322 | /* Save the page IDs of all the pages we're flushing */ |
2323 | /* flush from the tail forward, this saves a lot of shifting later on. */ |
2324 | for (i=dl[0].mid; i && need; i--) { |
2325 | MDB_ID pn = dl[i].mid << 1; |
2326 | dp = dl[i].mptr; |
2327 | if (dp->mp_flags & (P_LOOSE|P_KEEP)) |
2328 | continue; |
2329 | /* Can't spill twice, make sure it's not already in a parent's |
2330 | * spill list. |
2331 | */ |
2332 | if (txn->mt_parent) { |
2333 | MDB_txn *tx2; |
2334 | for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { |
2335 | if (tx2->mt_spill_pgs) { |
2336 | j = mdb_midl_search(tx2->mt_spill_pgs, pn); |
2337 | if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { |
2338 | dp->mp_flags |= P_KEEP; |
2339 | break; |
2340 | } |
2341 | } |
2342 | } |
2343 | if (tx2) |
2344 | continue; |
2345 | } |
2346 | if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn))) |
2347 | goto done; |
2348 | need--; |
2349 | } |
2350 | mdb_midl_sort(txn->mt_spill_pgs); |
2351 | |
2352 | /* Flush the spilled part of dirty list */ |
2353 | if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) |
2354 | goto done; |
2355 | |
2356 | /* Reset any dirty pages we kept that page_flush didn't see */ |
2357 | rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); |
2358 | |
2359 | done: |
2360 | txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; |
2361 | return rc; |
2362 | } |
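
/* Illustrative sketch, not part of the library: how spill-list entries are
 * encoded. A page number is stored shifted left by one so that the low bit
 * can later mark the slot as deleted; see mt_spill_pgs, the "pn" values
 * above, and mdb_page_unspill() below. Helper names are hypothetical and
 * the block is compiled out.
 */
#if 0
static MDB_ID spill_encode(pgno_t pgno)		{ return (MDB_ID)pgno << 1; }
static pgno_t spill_pgno(MDB_ID pn)		{ return (pgno_t)(pn >> 1); }
static int    spill_is_deleted(MDB_ID pn)	{ return (int)(pn & 1); }
static MDB_ID spill_mark_deleted(MDB_ID pn)	{ return pn | 1; }
#endif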
2363 | |
2364 | /** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */ |
2365 | static txnid_t |
2366 | mdb_find_oldest(MDB_txn *txn) |
2367 | { |
2368 | int i; |
2369 | txnid_t mr, oldest = txn->mt_txnid - 1; |
2370 | if (txn->mt_env->me_txns) { |
2371 | MDB_reader *r = txn->mt_env->me_txns->mti_readers; |
2372 | for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { |
2373 | if (r[i].mr_pid) { |
2374 | mr = r[i].mr_txnid; |
2375 | if (oldest > mr) |
2376 | oldest = mr; |
2377 | } |
2378 | } |
2379 | } |
2380 | return oldest; |
2381 | } |
2382 | |
2383 | /** Add a page to the txn's dirty list */ |
2384 | static void |
2385 | mdb_page_dirty(MDB_txn *txn, MDB_page *mp) |
2386 | { |
2387 | MDB_ID2 mid; |
2388 | int rc, (*insert)(MDB_ID2L, MDB_ID2 *); |
2389 | |
2390 | if (txn->mt_flags & MDB_TXN_WRITEMAP) { |
2391 | insert = mdb_mid2l_append; |
2392 | } else { |
2393 | insert = mdb_mid2l_insert; |
2394 | } |
2395 | mid.mid = mp->mp_pgno; |
2396 | mid.mptr = mp; |
2397 | rc = insert(txn->mt_u.dirty_list, &mid); |
2398 | mdb_tassert(txn, rc == 0); |
2399 | txn->mt_dirty_room--; |
2400 | } |
2401 | |
2402 | /** Allocate page numbers and memory for writing. Maintain me_pglast, |
2403 | * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. |
2404 | * |
2405 | * If there are free pages available from older transactions, they |
2406 | * are re-used first. Otherwise allocate a new page at mt_next_pgno. |
2407 | * Do not modify the freedB, just merge freeDB records into me_pghead[] |
2408 | * and move me_pglast to say which records were consumed. Only this |
2409 | * function can create me_pghead and move me_pglast/mt_next_pgno. |
2410 | * When #MDB_DEVEL & 2, it is not affected by #mdb_freelist_save(): it |
2411 | * then uses the transaction's original snapshot of the freeDB. |
2412 | * @param[in] mc A cursor handle identifying the transaction and
2413 | * database for which we are allocating.
2414 | * @param[in] num the number of pages to allocate. |
2415 | * @param[out] mp Address of the allocated page(s). Requests for multiple pages |
2416 | * will always be satisfied by a single contiguous chunk of memory. |
2417 | * @return 0 on success, non-zero on failure. |
2418 | */ |
2419 | static int |
2420 | mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) |
2421 | { |
2422 | #ifdef MDB_PARANOID /* Seems like we can ignore this now */ |
2423 | /* Get at most <Max_retries> more freeDB records once me_pghead |
2424 | * has enough pages. If not enough, use new pages from the map. |
2425 | * If <Paranoid> and mc is updating the freeDB, only get new |
2426 | * records if me_pghead is empty. Then the freelist cannot play |
2427 | * catch-up with itself by growing while trying to save it. |
2428 | */ |
2429 | enum { Paranoid = 1, Max_retries = 500 }; |
2430 | #else |
2431 | enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ }; |
2432 | #endif |
2433 | int rc, retry = num * 60; |
2434 | MDB_txn *txn = mc->mc_txn; |
2435 | MDB_env *env = txn->mt_env; |
2436 | pgno_t pgno, *mop = env->me_pghead; |
2437 | unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1; |
2438 | MDB_page *np; |
2439 | txnid_t oldest = 0, last; |
2440 | MDB_cursor_op op; |
2441 | MDB_cursor m2; |
2442 | int found_old = 0; |
2443 | |
2444 | /* If there are any loose pages, just use them */ |
2445 | if (num == 1 && txn->mt_loose_pgs) { |
2446 | np = txn->mt_loose_pgs; |
2447 | txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); |
2448 | txn->mt_loose_count--; |
2449 | DPRINTF(("db %d use loose page %" Yu, DDBI(mc), np->mp_pgno)); |
2450 | *mp = np; |
2451 | return MDB_SUCCESS; |
2452 | } |
2453 | |
2454 | *mp = NULL; |
2455 | |
2456 | /* If our dirty list is already full, we can't do anything */ |
2457 | if (txn->mt_dirty_room == 0) { |
2458 | rc = MDB_TXN_FULL; |
2459 | goto fail; |
2460 | } |
2461 | |
2462 | for (op = MDB_FIRST;; op = MDB_NEXT) { |
2463 | MDB_val key, data; |
2464 | MDB_node *leaf; |
2465 | pgno_t *idl; |
2466 | |
2467 | /* Seek a big enough contiguous page range. Prefer |
2468 | * pages at the tail, just truncating the list. |
2469 | */ |
2470 | if (mop_len > n2) { |
2471 | i = mop_len; |
2472 | do { |
2473 | pgno = mop[i]; |
2474 | if (mop[i-n2] == pgno+n2) |
2475 | goto search_done; |
2476 | } while (--i > n2); |
2477 | if (--retry < 0) |
2478 | break; |
2479 | } |
2480 | |
2481 | if (op == MDB_FIRST) { /* 1st iteration */ |
2482 | /* Prepare to fetch more and coalesce */ |
2483 | last = env->me_pglast; |
2484 | oldest = env->me_pgoldest; |
2485 | mdb_cursor_init(&m2, txn, FREE_DBI, NULL); |
2486 | #if (MDB_DEVEL) & 2 /* "& 2" so MDB_DEVEL=1 won't hide bugs breaking freeDB */ |
2487 | /* Use original snapshot. TODO: Should need less care in code |
2488 | * which modifies the database. Maybe we can delete some code? |
2489 | */ |
2490 | m2.mc_flags |= C_ORIG_RDONLY; |
2491 | m2.mc_db = &env->me_metas[(txn->mt_txnid-1) & 1]->mm_dbs[FREE_DBI]; |
2492 | m2.mc_dbflag = (unsigned char *)"" ; /* probably unnecessary */ |
2493 | #endif |
2494 | if (last) { |
2495 | op = MDB_SET_RANGE; |
2496 | key.mv_data = &last; /* will look up last+1 */ |
2497 | key.mv_size = sizeof(last); |
2498 | } |
2499 | if (Paranoid && mc->mc_dbi == FREE_DBI) |
2500 | retry = -1; |
2501 | } |
2502 | if (Paranoid && retry < 0 && mop_len) |
2503 | break; |
2504 | |
2505 | last++; |
2506 | /* Do not fetch more if the record will be too recent */ |
2507 | if (oldest <= last) { |
2508 | if (!found_old) { |
2509 | oldest = mdb_find_oldest(txn); |
2510 | env->me_pgoldest = oldest; |
2511 | found_old = 1; |
2512 | } |
2513 | if (oldest <= last) |
2514 | break; |
2515 | } |
2516 | rc = mdb_cursor_get(&m2, &key, NULL, op); |
2517 | if (rc) { |
2518 | if (rc == MDB_NOTFOUND) |
2519 | break; |
2520 | goto fail; |
2521 | } |
2522 | last = *(txnid_t*)key.mv_data; |
2523 | if (oldest <= last) { |
2524 | if (!found_old) { |
2525 | oldest = mdb_find_oldest(txn); |
2526 | env->me_pgoldest = oldest; |
2527 | found_old = 1; |
2528 | } |
2529 | if (oldest <= last) |
2530 | break; |
2531 | } |
2532 | np = m2.mc_pg[m2.mc_top]; |
2533 | leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); |
2534 | if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS) |
2535 | goto fail; |
2536 | |
2537 | idl = (MDB_ID *) data.mv_data; |
2538 | i = idl[0]; |
2539 | if (!mop) { |
2540 | if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { |
2541 | rc = ENOMEM; |
2542 | goto fail; |
2543 | } |
2544 | } else { |
2545 | if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) |
2546 | goto fail; |
2547 | mop = env->me_pghead; |
2548 | } |
2549 | env->me_pglast = last; |
2550 | #if (MDB_DEBUG) > 1 |
2551 | DPRINTF(("IDL read txn %" Yu" root %" Yu" num %u" , |
2552 | last, txn->mt_dbs[FREE_DBI].md_root, i)); |
2553 | for (j = i; j; j--) |
2554 | DPRINTF(("IDL %" Yu, idl[j])); |
2555 | #endif |
2556 | /* Merge in descending sorted order */ |
2557 | mdb_midl_xmerge(mop, idl); |
2558 | mop_len = mop[0]; |
2559 | } |
2560 | |
2561 | /* Use new pages from the map when nothing suitable in the freeDB */ |
2562 | i = 0; |
2563 | pgno = txn->mt_next_pgno; |
2564 | if (pgno + num >= env->me_maxpg) { |
2565 | DPUTS("DB size maxed out" ); |
2566 | rc = MDB_MAP_FULL; |
2567 | goto fail; |
2568 | } |
2569 | #if defined(_WIN32) && !defined(MDB_VL32) |
2570 | if (!(env->me_flags & MDB_RDONLY)) { |
2571 | void *p; |
2572 | p = (MDB_page *)(env->me_map + env->me_psize * pgno); |
2573 | p = VirtualAlloc(p, env->me_psize * num, MEM_COMMIT, |
2574 | (env->me_flags & MDB_WRITEMAP) ? PAGE_READWRITE: |
2575 | PAGE_READONLY); |
2576 | if (!p) { |
2577 | DPUTS("VirtualAlloc failed" ); |
2578 | rc = ErrCode(); |
2579 | goto fail; |
2580 | } |
2581 | } |
2582 | #endif |
2583 | |
2584 | search_done: |
2585 | if (env->me_flags & MDB_WRITEMAP) { |
2586 | np = (MDB_page *)(env->me_map + env->me_psize * pgno); |
2587 | } else { |
2588 | if (!(np = mdb_page_malloc(txn, num))) { |
2589 | rc = ENOMEM; |
2590 | goto fail; |
2591 | } |
2592 | } |
2593 | if (i) { |
2594 | mop[0] = mop_len -= num; |
2595 | /* Move any stragglers down */ |
2596 | for (j = i-num; j < mop_len; ) |
2597 | mop[++j] = mop[++i]; |
2598 | } else { |
2599 | txn->mt_next_pgno = pgno + num; |
2600 | } |
2601 | np->mp_pgno = pgno; |
2602 | mdb_page_dirty(txn, np); |
2603 | *mp = np; |
2604 | |
2605 | return MDB_SUCCESS; |
2606 | |
2607 | fail: |
2608 | txn->mt_flags |= MDB_TXN_ERROR; |
2609 | return rc; |
2610 | } |
2611 | |
2612 | /** Copy the used portions of a non-overflow page. |
2613 | * @param[in] dst page to copy into |
2614 | * @param[in] src page to copy from |
2615 | * @param[in] psize size of a page |
2616 | */ |
2617 | static void |
2618 | mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) |
2619 | { |
2620 | enum { Align = sizeof(pgno_t) }; |
2621 | indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; |
2622 | |
2623 | /* If page isn't full, just copy the used portion. Adjust |
2624 | * alignment so memcpy may copy words instead of bytes. |
2625 | */ |
2626 | if ((unused &= -Align) && !IS_LEAF2(src)) { |
2627 | upper = (upper + PAGEBASE) & -Align; |
2628 | memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align); |
2629 | memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), |
2630 | psize - upper); |
2631 | } else { |
2632 | memcpy(dst, src, psize - unused); |
2633 | } |
2634 | } |
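
/* Page layout being copied, for reference (illustrative; offsets shown for
 * the usual PAGEBASE == 0 build):
 *
 *  +----------+-------------+................+-------------------+
 *  |  header  |  mp_ptrs[]  |   free space   |    node data      |
 *  +----------+-------------+................+-------------------+
 *  0       PAGEHDRSZ     mp_lower         mp_upper            psize
 *
 * The copy above takes the header plus the pointer array up to mp_lower and
 * the node area from mp_upper to the end, skipping only the unused gap.
 */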
2635 | |
2636 | /** Pull a page off the txn's spill list, if present. |
2637 | * If a page being referenced was spilled to disk in this txn, bring |
2638 | * it back and make it dirty/writable again. |
2639 | * @param[in] txn the transaction handle. |
2640 | * @param[in] mp the page being referenced. It must not be dirty. |
2641 | * @param[out] ret the writable page, if any. ret is unchanged if |
2642 | * mp wasn't spilled. |
2643 | */ |
2644 | static int |
2645 | mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) |
2646 | { |
2647 | MDB_env *env = txn->mt_env; |
2648 | const MDB_txn *tx2; |
2649 | unsigned x; |
2650 | pgno_t pgno = mp->mp_pgno, pn = pgno << 1; |
2651 | |
2652 | for (tx2 = txn; tx2; tx2=tx2->mt_parent) { |
2653 | if (!tx2->mt_spill_pgs) |
2654 | continue; |
2655 | x = mdb_midl_search(tx2->mt_spill_pgs, pn); |
2656 | if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { |
2657 | MDB_page *np; |
2658 | int num; |
2659 | if (txn->mt_dirty_room == 0) |
2660 | return MDB_TXN_FULL; |
2661 | if (IS_OVERFLOW(mp)) |
2662 | num = mp->mp_pages; |
2663 | else |
2664 | num = 1; |
2665 | if (env->me_flags & MDB_WRITEMAP) { |
2666 | np = mp; |
2667 | } else { |
2668 | np = mdb_page_malloc(txn, num); |
2669 | if (!np) |
2670 | return ENOMEM; |
2671 | if (num > 1) |
2672 | memcpy(np, mp, num * env->me_psize); |
2673 | else |
2674 | mdb_page_copy(np, mp, env->me_psize); |
2675 | } |
2676 | if (tx2 == txn) { |
2677 | /* If in current txn, this page is no longer spilled. |
2678 | * If it happens to be the last page, truncate the spill list. |
2679 | * Otherwise mark it as deleted by setting the LSB. |
2680 | */ |
2681 | if (x == txn->mt_spill_pgs[0]) |
2682 | txn->mt_spill_pgs[0]--; |
2683 | else |
2684 | txn->mt_spill_pgs[x] |= 1; |
2685 | } /* otherwise, if belonging to a parent txn, the |
2686 | * page remains spilled until child commits |
2687 | */ |
2688 | |
2689 | mdb_page_dirty(txn, np); |
2690 | np->mp_flags |= P_DIRTY; |
2691 | *ret = np; |
2692 | break; |
2693 | } |
2694 | } |
2695 | return MDB_SUCCESS; |
2696 | } |
2697 | |
2698 | /** Touch a page: make it dirty and re-insert into tree with updated pgno. |
2699 | * Set #MDB_TXN_ERROR on failure. |
2700 | * @param[in] mc cursor pointing to the page to be touched |
2701 | * @return 0 on success, non-zero on failure. |
2702 | */ |
2703 | static int |
2704 | mdb_page_touch(MDB_cursor *mc) |
2705 | { |
2706 | MDB_page *mp = mc->mc_pg[mc->mc_top], *np; |
2707 | MDB_txn *txn = mc->mc_txn; |
2708 | MDB_cursor *m2, *m3; |
2709 | pgno_t pgno; |
2710 | int rc; |
2711 | |
2712 | if (!F_ISSET(mp->mp_flags, P_DIRTY)) { |
2713 | if (txn->mt_flags & MDB_TXN_SPILLS) { |
2714 | np = NULL; |
2715 | rc = mdb_page_unspill(txn, mp, &np); |
2716 | if (rc) |
2717 | goto fail; |
2718 | if (np) |
2719 | goto done; |
2720 | } |
2721 | if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || |
2722 | (rc = mdb_page_alloc(mc, 1, &np))) |
2723 | goto fail; |
2724 | pgno = np->mp_pgno; |
2725 | DPRINTF(("touched db %d page %" Yu" -> %" Yu, DDBI(mc), |
2726 | mp->mp_pgno, pgno)); |
2727 | mdb_cassert(mc, mp->mp_pgno != pgno); |
2728 | mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); |
2729 | /* Update the parent page, if any, to point to the new page */ |
2730 | if (mc->mc_top) { |
2731 | MDB_page *parent = mc->mc_pg[mc->mc_top-1]; |
2732 | MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); |
2733 | SETPGNO(node, pgno); |
2734 | } else { |
2735 | mc->mc_db->md_root = pgno; |
2736 | } |
2737 | } else if (txn->mt_parent && !IS_SUBP(mp)) { |
2738 | MDB_ID2 mid, *dl = txn->mt_u.dirty_list; |
2739 | pgno = mp->mp_pgno; |
2740 | /* If txn has a parent, make sure the page is in our |
2741 | * dirty list. |
2742 | */ |
2743 | if (dl[0].mid) { |
2744 | unsigned x = mdb_mid2l_search(dl, pgno); |
2745 | if (x <= dl[0].mid && dl[x].mid == pgno) { |
2746 | if (mp != dl[x].mptr) { /* bad cursor? */ |
2747 | mc->mc_flags &= ~(C_INITIALIZED|C_EOF); |
2748 | txn->mt_flags |= MDB_TXN_ERROR; |
2749 | return MDB_PROBLEM; |
2750 | } |
2751 | return 0; |
2752 | } |
2753 | } |
2754 | mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); |
2755 | /* No - copy it */ |
2756 | np = mdb_page_malloc(txn, 1); |
2757 | if (!np) |
2758 | return ENOMEM; |
2759 | mid.mid = pgno; |
2760 | mid.mptr = np; |
2761 | rc = mdb_mid2l_insert(dl, &mid); |
2762 | mdb_cassert(mc, rc == 0); |
2763 | } else { |
2764 | return 0; |
2765 | } |
2766 | |
2767 | mdb_page_copy(np, mp, txn->mt_env->me_psize); |
2768 | np->mp_pgno = pgno; |
2769 | np->mp_flags |= P_DIRTY; |
2770 | |
2771 | done: |
2772 | /* Adjust cursors pointing to mp */ |
2773 | mc->mc_pg[mc->mc_top] = np; |
2774 | m2 = txn->mt_cursors[mc->mc_dbi]; |
2775 | if (mc->mc_flags & C_SUB) { |
2776 | for (; m2; m2=m2->mc_next) { |
2777 | m3 = &m2->mc_xcursor->mx_cursor; |
2778 | if (m3->mc_snum < mc->mc_snum) continue; |
2779 | if (m3->mc_pg[mc->mc_top] == mp) |
2780 | m3->mc_pg[mc->mc_top] = np; |
2781 | } |
2782 | } else { |
2783 | for (; m2; m2=m2->mc_next) { |
2784 | if (m2->mc_snum < mc->mc_snum) continue; |
2785 | if (m2 == mc) continue; |
2786 | if (m2->mc_pg[mc->mc_top] == mp) { |
2787 | m2->mc_pg[mc->mc_top] = np; |
2788 | if (IS_LEAF(np)) |
2789 | XCURSOR_REFRESH(m2, mc->mc_top, np); |
2790 | } |
2791 | } |
2792 | } |
2793 | MDB_PAGE_UNREF(mc->mc_txn, mp); |
2794 | return 0; |
2795 | |
2796 | fail: |
2797 | txn->mt_flags |= MDB_TXN_ERROR; |
2798 | return rc; |
2799 | } |
2800 | |
2801 | int |
2802 | mdb_env_sync0(MDB_env *env, int force, pgno_t numpgs) |
2803 | { |
2804 | int rc = 0; |
2805 | if (env->me_flags & MDB_RDONLY) |
2806 | return EACCES; |
2807 | if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { |
2808 | if (env->me_flags & MDB_WRITEMAP) { |
2809 | int flags = ((env->me_flags & MDB_MAPASYNC) && !force) |
2810 | ? MS_ASYNC : MS_SYNC; |
2811 | if (MDB_MSYNC(env->me_map, env->me_psize * numpgs, flags)) |
2812 | rc = ErrCode(); |
2813 | #ifdef _WIN32 |
2814 | else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd)) |
2815 | rc = ErrCode(); |
2816 | #endif |
2817 | } else { |
2818 | #ifdef BROKEN_FDATASYNC |
2819 | if (env->me_flags & MDB_FSYNCONLY) { |
2820 | if (fsync(env->me_fd)) |
2821 | rc = ErrCode(); |
2822 | } else |
2823 | #endif |
2824 | if (MDB_FDATASYNC(env->me_fd)) |
2825 | rc = ErrCode(); |
2826 | } |
2827 | } |
2828 | return rc; |
2829 | } |
2830 | |
2831 | int |
2832 | mdb_env_sync(MDB_env *env, int force) |
2833 | { |
2834 | MDB_meta *m = mdb_env_pick_meta(env); |
2835 | return mdb_env_sync0(env, force, m->mm_last_pg+1); |
2836 | } |
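
/* Illustrative usage sketch (kept out of the build): an application that
 * opened its environment with MDB_NOSYNC can call mdb_env_sync() with
 * force=1 at a checkpoint to flush everything written so far.  The
 * function name below is hypothetical; mdb_env_sync() is the real public
 * API wrapped above.
 */
#if 0
static int checkpoint_example(MDB_env *env)
{
	/* With MDB_NOSYNC, commits skip the flush; force it explicitly. */
	return mdb_env_sync(env, 1);
}
#endif	/* usage sketch */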
2837 | |
2838 | /** Back up parent txn's cursors, then grab the originals for tracking */ |
2839 | static int |
2840 | mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) |
2841 | { |
2842 | MDB_cursor *mc, *bk; |
2843 | MDB_xcursor *mx; |
2844 | size_t size; |
2845 | int i; |
2846 | |
2847 | for (i = src->mt_numdbs; --i >= 0; ) { |
2848 | if ((mc = src->mt_cursors[i]) != NULL) { |
2849 | size = sizeof(MDB_cursor); |
2850 | if (mc->mc_xcursor) |
2851 | size += sizeof(MDB_xcursor); |
2852 | for (; mc; mc = bk->mc_next) { |
2853 | bk = malloc(size); |
2854 | if (!bk) |
2855 | return ENOMEM; |
2856 | *bk = *mc; |
2857 | mc->mc_backup = bk; |
2858 | mc->mc_db = &dst->mt_dbs[i]; |
2859 | /* Kill pointers into src to reduce abuse: The |
2860 | * user may not use mc until dst ends. But we need a valid |
2861 | * txn pointer here for cursor fixups to keep working. |
2862 | */ |
2863 | mc->mc_txn = dst; |
2864 | mc->mc_dbflag = &dst->mt_dbflags[i]; |
2865 | if ((mx = mc->mc_xcursor) != NULL) { |
2866 | *(MDB_xcursor *)(bk+1) = *mx; |
2867 | mx->mx_cursor.mc_txn = dst; |
2868 | } |
2869 | mc->mc_next = dst->mt_cursors[i]; |
2870 | dst->mt_cursors[i] = mc; |
2871 | } |
2872 | } |
2873 | } |
2874 | return MDB_SUCCESS; |
2875 | } |
2876 | |
2877 | /** Close this write txn's cursors, give parent txn's cursors back to parent. |
2878 | * @param[in] txn the transaction handle. |
2879 | * @param[in] merge true to keep changes to parent cursors, false to revert. |
2881 | */ |
2882 | static void |
2883 | mdb_cursors_close(MDB_txn *txn, unsigned merge) |
2884 | { |
2885 | MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; |
2886 | MDB_xcursor *mx; |
2887 | int i; |
2888 | |
2889 | for (i = txn->mt_numdbs; --i >= 0; ) { |
2890 | for (mc = cursors[i]; mc; mc = next) { |
2891 | next = mc->mc_next; |
2892 | if ((bk = mc->mc_backup) != NULL) { |
2893 | if (merge) { |
2894 | /* Commit changes to parent txn */ |
2895 | mc->mc_next = bk->mc_next; |
2896 | mc->mc_backup = bk->mc_backup; |
2897 | mc->mc_txn = bk->mc_txn; |
2898 | mc->mc_db = bk->mc_db; |
2899 | mc->mc_dbflag = bk->mc_dbflag; |
2900 | if ((mx = mc->mc_xcursor) != NULL) |
2901 | mx->mx_cursor.mc_txn = bk->mc_txn; |
2902 | } else { |
2903 | /* Abort nested txn */ |
2904 | *mc = *bk; |
2905 | if ((mx = mc->mc_xcursor) != NULL) |
2906 | *mx = *(MDB_xcursor *)(bk+1); |
2907 | } |
2908 | mc = bk; |
2909 | } |
2910 | /* Only malloced cursors are permanently tracked. */ |
2911 | free(mc); |
2912 | } |
2913 | cursors[i] = NULL; |
2914 | } |
2915 | } |
2916 | |
2917 | #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ |
2918 | enum Pidlock_op { |
2919 | Pidset, Pidcheck |
2920 | }; |
2921 | #else |
2922 | enum Pidlock_op { |
2923 | Pidset = F_SETLK, Pidcheck = F_GETLK |
2924 | }; |
2925 | #endif |
2926 | |
2927 | /** Set or check a pid lock. Set returns 0 on success. |
2928 | * Check returns 0 if the process is certainly dead, nonzero if it may |
2929 | * be alive (the lock exists or an error happened so we do not know). |
2930 | * |
2931 | * On Windows, Pidset is a no-op; we merely check for the existence
2932 | * of the process with the given pid. On POSIX we use a single byte |
2933 | * lock on the lockfile, set at an offset equal to the pid. |
2934 | */ |
2935 | static int |
2936 | mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid) |
2937 | { |
2938 | #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ |
2939 | int ret = 0; |
2940 | HANDLE h; |
2941 | if (op == Pidcheck) { |
2942 | h = OpenProcess(env->me_pidquery, FALSE, pid); |
2943 | /* No documented "no such process" code, but other programs use this: */
2944 | if (!h) |
2945 | return ErrCode() != ERROR_INVALID_PARAMETER; |
2946 | /* A process exists until all handles to it close. Has it exited? */ |
2947 | ret = WaitForSingleObject(h, 0) != 0; |
2948 | CloseHandle(h); |
2949 | } |
2950 | return ret; |
2951 | #else |
2952 | for (;;) { |
2953 | int rc; |
2954 | struct flock lock_info; |
2955 | memset(&lock_info, 0, sizeof(lock_info)); |
2956 | lock_info.l_type = F_WRLCK; |
2957 | lock_info.l_whence = SEEK_SET; |
2958 | lock_info.l_start = pid; |
2959 | lock_info.l_len = 1; |
2960 | if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { |
2961 | if (op == F_GETLK && lock_info.l_type != F_UNLCK) |
2962 | rc = -1; |
2963 | } else if ((rc = ErrCode()) == EINTR) { |
2964 | continue; |
2965 | } |
2966 | return rc; |
2967 | } |
2968 | #endif |
2969 | } |
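
/* Illustrative sketch (kept out of the build) of the POSIX idiom used
 * above: a one-byte write lock at file offset == pid.  Such locks vanish
 * automatically when their owner dies, so F_GETLK reporting F_UNLCK means
 * the pid is certainly gone.  The function and parameter names here are
 * hypothetical; only fcntl() and struct flock are real API.
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

static int pid_byte_is_locked(int lockfd, pid_t pid)
{
	struct flock lk;
	memset(&lk, 0, sizeof(lk));
	lk.l_type = F_WRLCK;
	lk.l_whence = SEEK_SET;
	lk.l_start = pid;	/* one byte per process id */
	lk.l_len = 1;
	if (fcntl(lockfd, F_GETLK, &lk) < 0)
		return -1;	/* unknown; treat as "may be alive" */
	return lk.l_type != F_UNLCK;	/* nonzero: some live owner holds it */
}
#endif	/* usage sketch */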
2970 | |
2971 | /** Common code for #mdb_txn_begin() and #mdb_txn_renew(). |
2972 | * @param[in] txn the transaction handle to initialize |
2973 | * @return 0 on success, non-zero on failure. |
2974 | */ |
2975 | static int |
2976 | mdb_txn_renew0(MDB_txn *txn) |
2977 | { |
2978 | MDB_env *env = txn->mt_env; |
2979 | MDB_txninfo *ti = env->me_txns; |
2980 | MDB_meta *meta; |
2981 | unsigned int i, nr, flags = txn->mt_flags; |
2982 | uint16_t x; |
2983 | int rc, new_notls = 0; |
2984 | |
2985 | if ((flags &= MDB_TXN_RDONLY) != 0) { |
2986 | if (!ti) { |
2987 | meta = mdb_env_pick_meta(env); |
2988 | txn->mt_txnid = meta->mm_txnid; |
2989 | txn->mt_u.reader = NULL; |
2990 | } else { |
2991 | MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader : |
2992 | pthread_getspecific(env->me_txkey); |
2993 | if (r) { |
2994 | if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) |
2995 | return MDB_BAD_RSLOT; |
2996 | } else { |
2997 | MDB_PID_T pid = env->me_pid; |
2998 | MDB_THR_T tid = pthread_self(); |
2999 | mdb_mutexref_t rmutex = env->me_rmutex; |
3000 | |
3001 | if (!env->me_live_reader) { |
3002 | rc = mdb_reader_pid(env, Pidset, pid); |
3003 | if (rc) |
3004 | return rc; |
3005 | env->me_live_reader = 1; |
3006 | } |
3007 | |
3008 | if (LOCK_MUTEX(rc, env, rmutex)) |
3009 | return rc; |
3010 | nr = ti->mti_numreaders; |
3011 | for (i=0; i<nr; i++) |
3012 | if (ti->mti_readers[i].mr_pid == 0) |
3013 | break; |
3014 | if (i == env->me_maxreaders) { |
3015 | UNLOCK_MUTEX(rmutex); |
3016 | return MDB_READERS_FULL; |
3017 | } |
3018 | r = &ti->mti_readers[i]; |
3019 | /* Claim the reader slot, carefully since other code |
3020 | * uses the reader table un-mutexed: First reset the |
3021 | * slot, next publish it in mti_numreaders. After |
3022 | * that, it is safe for mdb_env_close() to touch it. |
3023 | * Only then do we finally claim the slot by setting mr_pid.
3024 | */ |
3025 | r->mr_pid = 0; |
3026 | r->mr_txnid = (txnid_t)-1; |
3027 | r->mr_tid = tid; |
3028 | if (i == nr) |
3029 | ti->mti_numreaders = ++nr; |
3030 | env->me_close_readers = nr; |
3031 | r->mr_pid = pid; |
3032 | UNLOCK_MUTEX(rmutex); |
3033 | |
3034 | new_notls = (env->me_flags & MDB_NOTLS); |
3035 | if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { |
3036 | r->mr_pid = 0; |
3037 | return rc; |
3038 | } |
3039 | } |
3040 | do /* LY: Retry on a race, ITS#7970. */ |
3041 | r->mr_txnid = ti->mti_txnid; |
3042 | while(r->mr_txnid != ti->mti_txnid); |
3043 | txn->mt_txnid = r->mr_txnid; |
3044 | txn->mt_u.reader = r; |
3045 | meta = env->me_metas[txn->mt_txnid & 1]; |
3046 | } |
3047 | |
3048 | } else { |
3049 | /* Not yet touching txn == env->me_txn0, it may be active */ |
3050 | if (ti) { |
3051 | if (LOCK_MUTEX(rc, env, env->me_wmutex)) |
3052 | return rc; |
3053 | txn->mt_txnid = ti->mti_txnid; |
3054 | meta = env->me_metas[txn->mt_txnid & 1]; |
3055 | } else { |
3056 | meta = mdb_env_pick_meta(env); |
3057 | txn->mt_txnid = meta->mm_txnid; |
3058 | } |
3059 | txn->mt_txnid++; |
3060 | #if MDB_DEBUG |
3061 | if (txn->mt_txnid == mdb_debug_start) |
3062 | mdb_debug = 1; |
3063 | #endif |
3064 | txn->mt_child = NULL; |
3065 | txn->mt_loose_pgs = NULL; |
3066 | txn->mt_loose_count = 0; |
3067 | txn->mt_dirty_room = MDB_IDL_UM_MAX; |
3068 | txn->mt_u.dirty_list = env->me_dirty_list; |
3069 | txn->mt_u.dirty_list[0].mid = 0; |
3070 | txn->mt_free_pgs = env->me_free_pgs; |
3071 | txn->mt_free_pgs[0] = 0; |
3072 | txn->mt_spill_pgs = NULL; |
3073 | env->me_txn = txn; |
3074 | memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int)); |
3075 | } |
3076 | |
3077 | /* Copy the DB info and flags */ |
3078 | memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); |
3079 | |
3080 | /* Moved to here to avoid a data race in read TXNs */ |
3081 | txn->mt_next_pgno = meta->mm_last_pg+1; |
3082 | #ifdef MDB_VL32 |
3083 | txn->mt_last_pgno = txn->mt_next_pgno - 1; |
3084 | #endif |
3085 | |
3086 | txn->mt_flags = flags; |
3087 | |
3088 | /* Setup db info */ |
3089 | txn->mt_numdbs = env->me_numdbs; |
3090 | for (i=CORE_DBS; i<txn->mt_numdbs; i++) { |
3091 | x = env->me_dbflags[i]; |
3092 | txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; |
3093 | txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_USRVALID|DB_STALE : 0; |
3094 | } |
3095 | txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID; |
3096 | txn->mt_dbflags[FREE_DBI] = DB_VALID; |
3097 | |
3098 | if (env->me_flags & MDB_FATAL_ERROR) { |
3099 | DPUTS("environment had fatal error, must shutdown!" ); |
3100 | rc = MDB_PANIC; |
3101 | } else if (env->me_maxpg < txn->mt_next_pgno) { |
3102 | rc = MDB_MAP_RESIZED; |
3103 | } else { |
3104 | return MDB_SUCCESS; |
3105 | } |
3106 | mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); |
3107 | return rc; |
3108 | } |
3109 | |
3110 | int |
3111 | mdb_txn_renew(MDB_txn *txn) |
3112 | { |
3113 | int rc; |
3114 | |
3115 | if (!txn || !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED)) |
3116 | return EINVAL; |
3117 | |
3118 | rc = mdb_txn_renew0(txn); |
3119 | if (rc == MDB_SUCCESS) { |
3120 | DPRINTF(("renew txn %" Yu"%c %p on mdbenv %p, root page %" Yu, |
3121 | txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', |
3122 | (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); |
3123 | } |
3124 | return rc; |
3125 | } |
3126 | |
3127 | int |
3128 | mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) |
3129 | { |
3130 | MDB_txn *txn; |
3131 | MDB_ntxn *ntxn; |
3132 | int rc, size, tsize; |
3133 | |
3134 | flags &= MDB_TXN_BEGIN_FLAGS; |
3135 | flags |= env->me_flags & MDB_WRITEMAP; |
3136 | |
3137 | if (env->me_flags & MDB_RDONLY & ~flags) /* write txn in RDONLY env */ |
3138 | return EACCES; |
3139 | |
3140 | if (parent) { |
3141 | /* Nested transactions: Max 1 child, write txns only, no writemap */ |
3142 | flags |= parent->mt_flags; |
3143 | if (flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED)) { |
3144 | return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; |
3145 | } |
3146 | /* Child txns save MDB_pgstate and use own copy of cursors */ |
3147 | size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1); |
3148 | size += tsize = sizeof(MDB_ntxn); |
3149 | } else if (flags & MDB_RDONLY) { |
3150 | size = env->me_maxdbs * (sizeof(MDB_db)+1); |
3151 | size += tsize = sizeof(MDB_txn); |
3152 | } else { |
3153 | /* Reuse preallocated write txn. However, do not touch it until |
3154 | * mdb_txn_renew0() succeeds, since it currently may be active. |
3155 | */ |
3156 | txn = env->me_txn0; |
3157 | goto renew; |
3158 | } |
3159 | if ((txn = calloc(1, size)) == NULL) { |
3160 | DPRINTF(("calloc: %s" , strerror(errno))); |
3161 | return ENOMEM; |
3162 | } |
3163 | #ifdef MDB_VL32 |
3164 | if (!parent) { |
3165 | txn->mt_rpages = malloc(MDB_TRPAGE_SIZE * sizeof(MDB_ID3)); |
3166 | if (!txn->mt_rpages) { |
3167 | free(txn); |
3168 | return ENOMEM; |
3169 | } |
3170 | txn->mt_rpages[0].mid = 0; |
3171 | txn->mt_rpcheck = MDB_TRPAGE_SIZE/2; |
3172 | } |
3173 | #endif |
3174 | txn->mt_dbxs = env->me_dbxs; /* static */ |
3175 | txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); |
3176 | txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; |
3177 | txn->mt_flags = flags; |
3178 | txn->mt_env = env; |
3179 | |
3180 | if (parent) { |
3181 | unsigned int i; |
3182 | txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); |
3183 | txn->mt_dbiseqs = parent->mt_dbiseqs; |
3184 | txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); |
3185 | if (!txn->mt_u.dirty_list || |
3186 | !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) |
3187 | { |
3188 | free(txn->mt_u.dirty_list); |
3189 | free(txn); |
3190 | return ENOMEM; |
3191 | } |
3192 | txn->mt_txnid = parent->mt_txnid; |
3193 | txn->mt_dirty_room = parent->mt_dirty_room; |
3194 | txn->mt_u.dirty_list[0].mid = 0; |
3195 | txn->mt_spill_pgs = NULL; |
3196 | txn->mt_next_pgno = parent->mt_next_pgno; |
3197 | parent->mt_flags |= MDB_TXN_HAS_CHILD; |
3198 | parent->mt_child = txn; |
3199 | txn->mt_parent = parent; |
3200 | txn->mt_numdbs = parent->mt_numdbs; |
3201 | #ifdef MDB_VL32 |
3202 | txn->mt_rpages = parent->mt_rpages; |
3203 | #endif |
3204 | memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); |
3205 | /* Copy parent's mt_dbflags, but clear DB_NEW */ |
3206 | for (i=0; i<txn->mt_numdbs; i++) |
3207 | txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; |
3208 | rc = 0; |
3209 | ntxn = (MDB_ntxn *)txn; |
3210 | ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ |
3211 | if (env->me_pghead) { |
3212 | size = MDB_IDL_SIZEOF(env->me_pghead); |
3213 | env->me_pghead = mdb_midl_alloc(env->me_pghead[0]); |
3214 | if (env->me_pghead) |
3215 | memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); |
3216 | else |
3217 | rc = ENOMEM; |
3218 | } |
3219 | if (!rc) |
3220 | rc = mdb_cursor_shadow(parent, txn); |
3221 | if (rc) |
3222 | mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD); |
3223 | } else { /* MDB_RDONLY */ |
3224 | txn->mt_dbiseqs = env->me_dbiseqs; |
3225 | renew: |
3226 | rc = mdb_txn_renew0(txn); |
3227 | } |
3228 | if (rc) { |
3229 | if (txn != env->me_txn0) { |
3230 | #ifdef MDB_VL32 |
3231 | free(txn->mt_rpages); |
3232 | #endif |
3233 | free(txn); |
3234 | } |
3235 | } else { |
3236 | txn->mt_flags |= flags; /* could not change txn=me_txn0 earlier */ |
3237 | *ret = txn; |
3238 | DPRINTF(("begin txn %" Yu"%c %p on mdbenv %p, root page %" Yu, |
3239 | txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', |
3240 | (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); |
3241 | } |
3242 | |
3243 | return rc; |
3244 | } |
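
/* Illustrative usage sketch (kept out of the build) for the nesting rules
 * enforced above: only write transactions may nest, one child at a time,
 * and not with MDB_WRITEMAP.  Aborting the child discards only its own
 * changes; the parent stays usable.  Function and parameter names are
 * hypothetical; the mdb_* calls are the real public API.
 */
#if 0
static int nested_txn_example(MDB_env *env, MDB_dbi dbi,
	MDB_val *key, MDB_val *val)
{
	MDB_txn *parent, *child;
	int rc;

	if ((rc = mdb_txn_begin(env, NULL, 0, &parent)) != 0)
		return rc;
	if ((rc = mdb_txn_begin(env, parent, 0, &child)) != 0) {
		mdb_txn_abort(parent);
		return rc;
	}
	rc = mdb_put(child, dbi, key, val, 0);
	if (rc)
		mdb_txn_abort(child);		/* parent keeps its own work */
	else
		rc = mdb_txn_commit(child);	/* fold changes into parent */
	if (rc == 0)
		rc = mdb_txn_commit(parent);
	else
		mdb_txn_abort(parent);
	return rc;
}
#endif	/* usage sketch */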
3245 | |
3246 | MDB_env * |
3247 | mdb_txn_env(MDB_txn *txn) |
3248 | { |
3249 | if (!txn) return NULL;
3250 | return txn->mt_env; |
3251 | } |
3252 | |
3253 | mdb_size_t |
3254 | mdb_txn_id(MDB_txn *txn) |
3255 | { |
3256 | if (!txn) return 0;
3257 | return txn->mt_txnid; |
3258 | } |
3259 | |
3260 | /** Export or close DBI handles opened in this txn. */ |
3261 | static void |
3262 | mdb_dbis_update(MDB_txn *txn, int keep) |
3263 | { |
3264 | int i; |
3265 | MDB_dbi n = txn->mt_numdbs; |
3266 | MDB_env *env = txn->mt_env; |
3267 | unsigned char *tdbflags = txn->mt_dbflags; |
3268 | |
3269 | for (i = n; --i >= CORE_DBS;) { |
3270 | if (tdbflags[i] & DB_NEW) { |
3271 | if (keep) { |
3272 | env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; |
3273 | } else { |
3274 | char *ptr = env->me_dbxs[i].md_name.mv_data; |
3275 | if (ptr) { |
3276 | env->me_dbxs[i].md_name.mv_data = NULL; |
3277 | env->me_dbxs[i].md_name.mv_size = 0; |
3278 | env->me_dbflags[i] = 0; |
3279 | env->me_dbiseqs[i]++; |
3280 | free(ptr); |
3281 | } |
3282 | } |
3283 | } |
3284 | } |
3285 | if (keep && env->me_numdbs < n) |
3286 | env->me_numdbs = n; |
3287 | } |
3288 | |
3289 | /** End a transaction, except successful commit of a nested transaction. |
3290 | * May be called twice for readonly txns: First reset it, then abort. |
3291 | * @param[in] txn the transaction handle to end |
3292 | * @param[in] mode why and how to end the transaction |
3293 | */ |
3294 | static void |
3295 | mdb_txn_end(MDB_txn *txn, unsigned mode) |
3296 | { |
3297 | MDB_env *env = txn->mt_env; |
3298 | #if MDB_DEBUG |
3299 | static const char *const names[] = MDB_END_NAMES; |
3300 | #endif |
3301 | |
3302 | /* Export or close DBI handles opened in this txn */ |
3303 | mdb_dbis_update(txn, mode & MDB_END_UPDATE); |
3304 | |
3305 | DPRINTF(("%s txn %" Yu"%c %p on mdbenv %p, root page %" Yu, |
3306 | names[mode & MDB_END_OPMASK], |
3307 | txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', |
3308 | (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); |
3309 | |
3310 | if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { |
3311 | if (txn->mt_u.reader) { |
3312 | txn->mt_u.reader->mr_txnid = (txnid_t)-1; |
3313 | if (!(env->me_flags & MDB_NOTLS)) { |
3314 | txn->mt_u.reader = NULL; /* txn does not own reader */ |
3315 | } else if (mode & MDB_END_SLOT) { |
3316 | txn->mt_u.reader->mr_pid = 0; |
3317 | txn->mt_u.reader = NULL; |
3318 | } /* else txn owns the slot until it does MDB_END_SLOT */ |
3319 | } |
3320 | txn->mt_numdbs = 0; /* prevent further DBI activity */ |
3321 | txn->mt_flags |= MDB_TXN_FINISHED; |
3322 | |
3323 | } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { |
3324 | pgno_t *pghead = env->me_pghead; |
3325 | |
3326 | if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */ |
3327 | mdb_cursors_close(txn, 0); |
3328 | if (!(env->me_flags & MDB_WRITEMAP)) { |
3329 | mdb_dlist_free(txn); |
3330 | } |
3331 | |
3332 | txn->mt_numdbs = 0; |
3333 | txn->mt_flags = MDB_TXN_FINISHED; |
3334 | |
3335 | if (!txn->mt_parent) { |
3336 | mdb_midl_shrink(&txn->mt_free_pgs); |
3337 | env->me_free_pgs = txn->mt_free_pgs; |
3338 | /* me_pgstate: */ |
3339 | env->me_pghead = NULL; |
3340 | env->me_pglast = 0; |
3341 | |
3342 | env->me_txn = NULL; |
3343 | mode = 0; /* txn == env->me_txn0, do not free() it */ |
3344 | |
3345 | /* The writer mutex was locked in mdb_txn_begin. */ |
3346 | if (env->me_txns) |
3347 | UNLOCK_MUTEX(env->me_wmutex); |
3348 | } else { |
3349 | txn->mt_parent->mt_child = NULL; |
3350 | txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; |
3351 | env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; |
3352 | mdb_midl_free(txn->mt_free_pgs); |
3353 | mdb_midl_free(txn->mt_spill_pgs); |
3354 | free(txn->mt_u.dirty_list); |
3355 | } |
3356 | |
3357 | mdb_midl_free(pghead); |
3358 | } |
3359 | #ifdef MDB_VL32 |
3360 | if (!txn->mt_parent) { |
3361 | MDB_ID3L el = env->me_rpages, tl = txn->mt_rpages; |
3362 | unsigned i, x, n = tl[0].mid; |
3363 | pthread_mutex_lock(&env->me_rpmutex); |
3364 | for (i = 1; i <= n; i++) { |
3365 | if (tl[i].mid & (MDB_RPAGE_CHUNK-1)) { |
3366 | /* tmp overflow pages that we didn't share in env */ |
3367 | munmap(tl[i].mptr, tl[i].mcnt * env->me_psize); |
3368 | } else { |
3369 | x = mdb_mid3l_search(el, tl[i].mid); |
3370 | if (tl[i].mptr == el[x].mptr) { |
3371 | el[x].mref--; |
3372 | } else { |
3373 | /* another tmp overflow page */ |
3374 | munmap(tl[i].mptr, tl[i].mcnt * env->me_psize); |
3375 | } |
3376 | } |
3377 | } |
3378 | pthread_mutex_unlock(&env->me_rpmutex); |
3379 | tl[0].mid = 0; |
3380 | if (mode & MDB_END_FREE) |
3381 | free(tl); |
3382 | } |
3383 | #endif |
3384 | if (mode & MDB_END_FREE) |
3385 | free(txn); |
3386 | } |
3387 | |
3388 | void |
3389 | mdb_txn_reset(MDB_txn *txn) |
3390 | { |
3391 | if (txn == NULL) |
3392 | return; |
3393 | |
3394 | /* This call is only valid for read-only txns */ |
3395 | if (!(txn->mt_flags & MDB_TXN_RDONLY)) |
3396 | return; |
3397 | |
3398 | mdb_txn_end(txn, MDB_END_RESET); |
3399 | } |
3400 | |
3401 | void |
3402 | mdb_txn_abort(MDB_txn *txn) |
3403 | { |
3404 | if (txn == NULL) |
3405 | return; |
3406 | |
3407 | if (txn->mt_child) |
3408 | mdb_txn_abort(txn->mt_child); |
3409 | |
3410 | mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); |
3411 | } |
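
/* Illustrative usage sketch (kept out of the build) of the read-only txn
 * lifecycle implied by mdb_txn_reset()/mdb_txn_renew()/mdb_txn_abort():
 * reset releases the snapshot but keeps the reader slot and the handle,
 * renew picks up a fresh snapshot without reallocating, abort frees both.
 * Names other than the mdb_* calls are hypothetical.
 */
#if 0
static int polling_reader_example(MDB_env *env, MDB_dbi dbi, MDB_val *key)
{
	MDB_txn *txn;
	MDB_val data;
	int i, rc;

	if ((rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)) != 0)
		return rc;
	for (i = 0; i < 10; i++) {
		rc = mdb_get(txn, dbi, key, &data);
		if (rc && rc != MDB_NOTFOUND)
			break;
		mdb_txn_reset(txn);			/* drop snapshot, keep slot */
		if ((rc = mdb_txn_renew(txn)) != 0)	/* take a new snapshot */
			break;
	}
	mdb_txn_abort(txn);				/* frees handle and slot */
	return rc == MDB_NOTFOUND ? 0 : rc;
}
#endif	/* usage sketch */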
3412 | |
3413 | /** Save the freelist as of this transaction to the freeDB. |
3414 | * This changes the freelist. Keep trying until it stabilizes. |
3415 | * |
3416 | * When (MDB_DEVEL) & 2, the changes do not affect #mdb_page_alloc(), |
3417 | * it then uses the transaction's original snapshot of the freeDB. |
3418 | */ |
3419 | static int |
3420 | mdb_freelist_save(MDB_txn *txn) |
3421 | { |
3422 | /* env->me_pghead[] can grow and shrink during this call. |
3423 | * env->me_pglast and txn->mt_free_pgs[] can only grow. |
3424 | * Page numbers cannot disappear from txn->mt_free_pgs[]. |
3425 | */ |
3426 | MDB_cursor mc; |
3427 | MDB_env *env = txn->mt_env; |
3428 | int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; |
3429 | txnid_t pglast = 0, head_id = 0; |
3430 | pgno_t freecnt = 0, *free_pgs, *mop; |
3431 | ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; |
3432 | |
3433 | mdb_cursor_init(&mc, txn, FREE_DBI, NULL); |
3434 | |
3435 | if (env->me_pghead) { |
3436 | /* Make sure first page of freeDB is touched and on freelist */ |
3437 | rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); |
3438 | if (rc && rc != MDB_NOTFOUND) |
3439 | return rc; |
3440 | } |
3441 | |
3442 | if (!env->me_pghead && txn->mt_loose_pgs) { |
3443 | /* Put loose page numbers in mt_free_pgs, since |
3444 | * we may be unable to return them to me_pghead. |
3445 | */ |
3446 | MDB_page *mp = txn->mt_loose_pgs; |
3447 | MDB_ID2 *dl = txn->mt_u.dirty_list; |
3448 | unsigned x; |
3449 | if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0) |
3450 | return rc; |
3451 | for (; mp; mp = NEXT_LOOSE_PAGE(mp)) { |
3452 | mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); |
3453 | /* must also remove from dirty list */ |
3454 | if (txn->mt_flags & MDB_TXN_WRITEMAP) { |
3455 | for (x=1; x<=dl[0].mid; x++) |
3456 | if (dl[x].mid == mp->mp_pgno) |
3457 | break; |
3458 | mdb_tassert(txn, x <= dl[0].mid); |
3459 | } else { |
3460 | x = mdb_mid2l_search(dl, mp->mp_pgno); |
3461 | mdb_tassert(txn, dl[x].mid == mp->mp_pgno); |
3462 | mdb_dpage_free(env, mp); |
3463 | } |
3464 | dl[x].mptr = NULL; |
3465 | } |
3466 | { |
3467 | /* squash freed slots out of the dirty list */ |
3468 | unsigned y; |
3469 | for (y=1; dl[y].mptr && y <= dl[0].mid; y++); |
3470 | if (y <= dl[0].mid) { |
3471 | for(x=y, y++;;) { |
3472 | while (!dl[y].mptr && y <= dl[0].mid) y++; |
3473 | if (y > dl[0].mid) break; |
3474 | dl[x++] = dl[y++]; |
3475 | } |
3476 | dl[0].mid = x-1; |
3477 | } else { |
3478 | /* all slots freed */ |
3479 | dl[0].mid = 0; |
3480 | } |
3481 | } |
3482 | txn->mt_loose_pgs = NULL; |
3483 | txn->mt_loose_count = 0; |
3484 | } |
3485 | |
3486 | /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ |
3487 | clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) |
3488 | ? SSIZE_MAX : maxfree_1pg; |
3489 | |
3490 | for (;;) { |
3491 | /* Come back here after each Put() in case freelist changed */ |
3492 | MDB_val key, data; |
3493 | pgno_t *pgs; |
3494 | ssize_t j; |
3495 | |
3496 | /* If using records from freeDB which we have not yet |
3497 | * deleted, delete them and any we reserved for me_pghead. |
3498 | */ |
3499 | while (pglast < env->me_pglast) { |
3500 | rc = mdb_cursor_first(&mc, &key, NULL); |
3501 | if (rc) |
3502 | return rc; |
3503 | pglast = head_id = *(txnid_t *)key.mv_data; |
3504 | total_room = head_room = 0; |
3505 | mdb_tassert(txn, pglast <= env->me_pglast); |
3506 | rc = mdb_cursor_del(&mc, 0); |
3507 | if (rc) |
3508 | return rc; |
3509 | } |
3510 | |
3511 | /* Save the IDL of pages freed by this txn, to a single record */ |
3512 | if (freecnt < txn->mt_free_pgs[0]) { |
3513 | if (!freecnt) { |
3514 | /* Make sure last page of freeDB is touched and on freelist */ |
3515 | rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); |
3516 | if (rc && rc != MDB_NOTFOUND) |
3517 | return rc; |
3518 | } |
3519 | free_pgs = txn->mt_free_pgs; |
3520 | /* Write to last page of freeDB */ |
3521 | key.mv_size = sizeof(txn->mt_txnid); |
3522 | key.mv_data = &txn->mt_txnid; |
3523 | do { |
3524 | freecnt = free_pgs[0]; |
3525 | data.mv_size = MDB_IDL_SIZEOF(free_pgs); |
3526 | rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); |
3527 | if (rc) |
3528 | return rc; |
3529 | /* Retry if mt_free_pgs[] grew during the Put() */ |
3530 | free_pgs = txn->mt_free_pgs; |
3531 | } while (freecnt < free_pgs[0]); |
3532 | mdb_midl_sort(free_pgs); |
3533 | memcpy(data.mv_data, free_pgs, data.mv_size); |
3534 | #if (MDB_DEBUG) > 1 |
3535 | { |
3536 | unsigned int i = free_pgs[0]; |
3537 | DPRINTF(("IDL write txn %" Yu" root %" Yu" num %u" , |
3538 | txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); |
3539 | for (; i; i--) |
3540 | DPRINTF(("IDL %" Yu, free_pgs[i])); |
3541 | } |
3542 | #endif |
3543 | continue; |
3544 | } |
3545 | |
3546 | mop = env->me_pghead; |
3547 | mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count; |
3548 | |
3549 | /* Reserve records for me_pghead[]. Split it if multi-page, |
3550 | * to avoid searching freeDB for a page range. Use keys in |
3551 | * range [1,me_pglast]: Smaller than txnid of oldest reader. |
3552 | */ |
3553 | if (total_room >= mop_len) { |
3554 | if (total_room == mop_len || --more < 0) |
3555 | break; |
3556 | } else if (head_room >= maxfree_1pg && head_id > 1) { |
3557 | /* Keep current record (overflow page), add a new one */ |
3558 | head_id--; |
3559 | head_room = 0; |
3560 | } |
3561 | /* (Re)write {key = head_id, IDL length = head_room} */ |
3562 | total_room -= head_room; |
3563 | head_room = mop_len - total_room; |
3564 | if (head_room > maxfree_1pg && head_id > 1) { |
3565 | /* Overflow multi-page for part of me_pghead */ |
3566 | head_room /= head_id; /* amortize page sizes */ |
3567 | head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); |
3568 | } else if (head_room < 0) { |
3569 | /* Rare case, not bothering to delete this record */ |
3570 | head_room = 0; |
3571 | } |
3572 | key.mv_size = sizeof(head_id); |
3573 | key.mv_data = &head_id; |
3574 | data.mv_size = (head_room + 1) * sizeof(pgno_t); |
3575 | rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); |
3576 | if (rc) |
3577 | return rc; |
3578 | /* IDL is initially empty, zero out at least the length */ |
3579 | pgs = (pgno_t *)data.mv_data; |
3580 | j = head_room > clean_limit ? head_room : 0; |
3581 | do { |
3582 | pgs[j] = 0; |
3583 | } while (--j >= 0); |
3584 | total_room += head_room; |
3585 | } |
3586 | |
3587 | /* Return loose page numbers to me_pghead, though usually none are |
3588 | * left at this point. The pages themselves remain in dirty_list. |
3589 | */ |
3590 | if (txn->mt_loose_pgs) { |
3591 | MDB_page *mp = txn->mt_loose_pgs; |
3592 | unsigned count = txn->mt_loose_count; |
3593 | MDB_IDL loose; |
3594 | /* Room for loose pages + temp IDL with same */ |
3595 | if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0) |
3596 | return rc; |
3597 | mop = env->me_pghead; |
3598 | loose = mop + MDB_IDL_ALLOCLEN(mop) - count; |
3599 | for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) |
3600 | loose[ ++count ] = mp->mp_pgno; |
3601 | loose[0] = count; |
3602 | mdb_midl_sort(loose); |
3603 | mdb_midl_xmerge(mop, loose); |
3604 | txn->mt_loose_pgs = NULL; |
3605 | txn->mt_loose_count = 0; |
3606 | mop_len = mop[0]; |
3607 | } |
3608 | |
3609 | /* Fill in the reserved me_pghead records */ |
3610 | rc = MDB_SUCCESS; |
3611 | if (mop_len) { |
3612 | MDB_val key, data; |
3613 | |
3614 | mop += mop_len; |
3615 | rc = mdb_cursor_first(&mc, &key, &data); |
3616 | for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { |
3617 | txnid_t id = *(txnid_t *)key.mv_data; |
3618 | ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; |
3619 | MDB_ID save; |
3620 | |
3621 | mdb_tassert(txn, len >= 0 && id <= env->me_pglast); |
3622 | key.mv_data = &id; |
3623 | if (len > mop_len) { |
3624 | len = mop_len; |
3625 | data.mv_size = (len + 1) * sizeof(MDB_ID); |
3626 | } |
3627 | data.mv_data = mop -= len; |
3628 | save = mop[0]; |
3629 | mop[0] = len; |
3630 | rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT); |
3631 | mop[0] = save; |
3632 | if (rc || !(mop_len -= len)) |
3633 | break; |
3634 | } |
3635 | } |
3636 | return rc; |
3637 | } |
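
/* Toy illustration (kept out of the build) of the stabilization loop
 * above: the list being written can grow as a side effect of writing it,
 * so the loop re-reads the length on every pass until no new items
 * appear.  Everything here is hypothetical scaffolding; it mirrors only
 * the control flow, not the freeDB record layout.
 */
#if 0
#include <stdio.h>

#define TOY_MAX 64
static int toy_items[TOY_MAX];
static unsigned toy_len;

static void toy_write(unsigned i)
{
	printf("write %d\n", toy_items[i]);
	/* Writing an even item appends one more, like a Put() that dirties
	 * a freeDB page and thereby frees another page.
	 */
	if ((toy_items[i] & 1) == 0 && toy_len < TOY_MAX)
		toy_items[toy_len++] = toy_items[i] + 1;
}

static void save_until_stable(void)
{
	unsigned saved = 0;
	while (saved < toy_len)		/* toy_len may grow inside the loop */
		toy_write(saved++);
}
#endif	/* illustration */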
3638 | |
3639 | /** Flush (some) dirty pages to the map, after clearing their dirty flag. |
3640 | * @param[in] txn the transaction that's being committed |
3641 | * @param[in] keep number of initial pages in dirty_list to keep dirty. |
3642 | * @return 0 on success, non-zero on failure. |
3643 | */ |
3644 | static int |
3645 | mdb_page_flush(MDB_txn *txn, int keep) |
3646 | { |
3647 | MDB_env *env = txn->mt_env; |
3648 | MDB_ID2L dl = txn->mt_u.dirty_list; |
3649 | unsigned psize = env->me_psize, j; |
3650 | int i, pagecount = dl[0].mid, rc; |
3651 | size_t size = 0; |
3652 | off_t pos = 0; |
3653 | pgno_t pgno = 0; |
3654 | MDB_page *dp = NULL; |
3655 | #ifdef _WIN32 |
3656 | OVERLAPPED ov; |
3657 | #else |
3658 | struct iovec iov[MDB_COMMIT_PAGES]; |
3659 | ssize_t wsize = 0, wres; |
3660 | off_t wpos = 0, next_pos = 1; /* impossible pos, so pos != next_pos */ |
3661 | int n = 0; |
3662 | #endif |
3663 | |
3664 | j = i = keep; |
3665 | |
3666 | if (env->me_flags & MDB_WRITEMAP) { |
3667 | /* Clear dirty flags */ |
3668 | while (++i <= pagecount) { |
3669 | dp = dl[i].mptr; |
3670 | /* Don't flush this page yet */ |
3671 | if (dp->mp_flags & (P_LOOSE|P_KEEP)) { |
3672 | dp->mp_flags &= ~P_KEEP; |
3673 | dl[++j] = dl[i]; |
3674 | continue; |
3675 | } |
3676 | dp->mp_flags &= ~P_DIRTY; |
3677 | } |
3678 | goto done; |
3679 | } |
3680 | |
3681 | /* Write the pages */ |
3682 | for (;;) { |
3683 | if (++i <= pagecount) { |
3684 | dp = dl[i].mptr; |
3685 | /* Don't flush this page yet */ |
3686 | if (dp->mp_flags & (P_LOOSE|P_KEEP)) { |
3687 | dp->mp_flags &= ~P_KEEP; |
3688 | dl[i].mid = 0; |
3689 | continue; |
3690 | } |
3691 | pgno = dl[i].mid; |
3692 | /* clear dirty flag */ |
3693 | dp->mp_flags &= ~P_DIRTY; |
3694 | pos = pgno * psize; |
3695 | size = psize; |
3696 | if (IS_OVERFLOW(dp)) size *= dp->mp_pages; |
3697 | } |
3698 | #ifdef _WIN32 |
3699 | else break; |
3700 | |
3701 | /* Windows actually supports scatter/gather I/O, but only on |
3702 | * unbuffered file handles. Since we're relying on the OS page |
3703 | * cache for all our data, that's self-defeating. So we just |
3704 | * write pages one at a time. We use the ov structure to set |
3705 | * the write offset, to at least save the overhead of a Seek |
3706 | * system call. |
3707 | */ |
3708 | DPRINTF(("committing page %" Yu, pgno)); |
3709 | memset(&ov, 0, sizeof(ov)); |
3710 | ov.Offset = pos & 0xffffffff; |
3711 | ov.OffsetHigh = pos >> 16 >> 16; |
3712 | if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { |
3713 | rc = ErrCode(); |
3714 | DPRINTF(("WriteFile: %d" , rc)); |
3715 | return rc; |
3716 | } |
3717 | #else |
3718 | /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ |
3719 | if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { |
3720 | if (n) { |
3721 | retry_write: |
3722 | /* Write previous page(s) */ |
3723 | #ifdef MDB_USE_PWRITEV |
3724 | wres = pwritev(env->me_fd, iov, n, wpos); |
3725 | #else |
3726 | if (n == 1) { |
3727 | wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); |
3728 | } else { |
3729 | retry_seek: |
3730 | if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { |
3731 | rc = ErrCode(); |
3732 | if (rc == EINTR) |
3733 | goto retry_seek; |
3734 | DPRINTF(("lseek: %s" , strerror(rc))); |
3735 | return rc; |
3736 | } |
3737 | wres = writev(env->me_fd, iov, n); |
3738 | } |
3739 | #endif |
3740 | if (wres != wsize) { |
3741 | if (wres < 0) { |
3742 | rc = ErrCode(); |
3743 | if (rc == EINTR) |
3744 | goto retry_write; |
3745 | DPRINTF(("Write error: %s" , strerror(rc))); |
3746 | } else { |
3747 | rc = EIO; /* TODO: Use which error code? */ |
3748 | DPUTS("short write, filesystem full?" ); |
3749 | } |
3750 | return rc; |
3751 | } |
3752 | n = 0; |
3753 | } |
3754 | if (i > pagecount) |
3755 | break; |
3756 | wpos = pos; |
3757 | wsize = 0; |
3758 | } |
3759 | DPRINTF(("committing page %" Yu, pgno)); |
3760 | next_pos = pos + size; |
3761 | iov[n].iov_len = size; |
3762 | iov[n].iov_base = (char *)dp; |
3763 | wsize += size; |
3764 | n++; |
3765 | #endif /* _WIN32 */ |
3766 | } |
3767 | #ifdef MDB_VL32 |
3768 | if (pgno > txn->mt_last_pgno) |
3769 | txn->mt_last_pgno = pgno; |
3770 | #endif |
3771 | |
3772 | /* MIPS has cache coherency issues, this is a no-op everywhere else |
3773 | * Note: for any size >= on-chip cache size, entire on-chip cache is |
3774 | * flushed. |
3775 | */ |
3776 | CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE); |
3777 | |
3778 | for (i = keep; ++i <= pagecount; ) { |
3779 | dp = dl[i].mptr; |
3780 | /* This is a page we skipped above */ |
3781 | if (!dl[i].mid) { |
3782 | dl[++j] = dl[i]; |
3783 | dl[j].mid = dp->mp_pgno; |
3784 | continue; |
3785 | } |
3786 | mdb_dpage_free(env, dp); |
3787 | } |
3788 | |
3789 | done: |
3790 | i--; |
3791 | txn->mt_dirty_room += i - j; |
3792 | dl[0].mid = j; |
3793 | return MDB_SUCCESS; |
3794 | } |
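
/* Illustrative sketch (kept out of the build) of the batched positional
 * write idiom used above: collect a run of contiguous buffers into an
 * iovec, issue one pwritev(), retry on EINTR, and treat a short write as
 * an error.  pwritev() is Linux/BSD-specific; the function name and the
 * MAX_IOV limit are hypothetical.
 */
#if 0
#include <errno.h>
#include <sys/uio.h>
#include <unistd.h>

#define MAX_IOV 16

static int write_run(int fd, char *const bufs[], int n, size_t bufsz, off_t pos)
{
	struct iovec iov[MAX_IOV];
	ssize_t want = 0, wres;
	int i;

	if (n < 1 || n > MAX_IOV)
		return EINVAL;
	for (i = 0; i < n; i++) {
		iov[i].iov_base = bufs[i];
		iov[i].iov_len = bufsz;
		want += bufsz;
	}
	do {
		wres = pwritev(fd, iov, n, pos);
	} while (wres < 0 && errno == EINTR);
	if (wres < 0)
		return errno;
	return wres == want ? 0 : EIO;	/* short write: filesystem full? */
}
#endif	/* usage sketch */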
3795 | |
3796 | static int ESECT mdb_env_share_locks(MDB_env *env, int *excl); |
3797 | |
3798 | int |
3799 | mdb_txn_commit(MDB_txn *txn) |
3800 | { |
3801 | int rc; |
3802 | unsigned int i, end_mode; |
3803 | MDB_env *env; |
3804 | |
3805 | if (txn == NULL) |
3806 | return EINVAL; |
3807 | |
3808 | /* mdb_txn_end() mode for a commit which writes nothing */ |
3809 | end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE; |
3810 | |
3811 | if (txn->mt_child) { |
3812 | rc = mdb_txn_commit(txn->mt_child); |
3813 | if (rc) |
3814 | goto fail; |
3815 | } |
3816 | |
3817 | env = txn->mt_env; |
3818 | |
3819 | if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { |
3820 | goto done; |
3821 | } |
3822 | |
3823 | if (txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR)) { |
3824 | DPUTS("txn has failed/finished, can't commit" ); |
3825 | if (txn->mt_parent) |
3826 | txn->mt_parent->mt_flags |= MDB_TXN_ERROR; |
3827 | rc = MDB_BAD_TXN; |
3828 | goto fail; |
3829 | } |
3830 | |
3831 | if (txn->mt_parent) { |
3832 | MDB_txn *parent = txn->mt_parent; |
3833 | MDB_page **lp; |
3834 | MDB_ID2L dst, src; |
3835 | MDB_IDL pspill; |
3836 | unsigned x, y, len, ps_len; |
3837 | |
3838 | /* Append our free list to parent's */ |
3839 | rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); |
3840 | if (rc) |
3841 | goto fail; |
3842 | mdb_midl_free(txn->mt_free_pgs); |
3843 | /* Failures after this must either undo the changes |
3844 | * to the parent or set MDB_TXN_ERROR in the parent. |
3845 | */ |
3846 | |
3847 | parent->mt_next_pgno = txn->mt_next_pgno; |
3848 | parent->mt_flags = txn->mt_flags; |
3849 | |
3850 | /* Merge our cursors into parent's and close them */ |
3851 | mdb_cursors_close(txn, 1); |
3852 | |
3853 | /* Update parent's DB table. */ |
3854 | memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); |
3855 | parent->mt_numdbs = txn->mt_numdbs; |
3856 | parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; |
3857 | parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; |
3858 | for (i=CORE_DBS; i<txn->mt_numdbs; i++) { |
3859 | /* preserve parent's DB_NEW status */ |
3860 | x = parent->mt_dbflags[i] & DB_NEW; |
3861 | parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; |
3862 | } |
3863 | |
3864 | dst = parent->mt_u.dirty_list; |
3865 | src = txn->mt_u.dirty_list; |
3866 | /* Remove anything in our dirty list from parent's spill list */ |
3867 | if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { |
3868 | x = y = ps_len; |
3869 | pspill[0] = (pgno_t)-1; |
3870 | /* Mark our dirty pages as deleted in parent spill list */ |
3871 | for (i=0, len=src[0].mid; ++i <= len; ) { |
3872 | MDB_ID pn = src[i].mid << 1; |
3873 | while (pn > pspill[x]) |
3874 | x--; |
3875 | if (pn == pspill[x]) { |
3876 | pspill[x] = 1; |
3877 | y = --x; |
3878 | } |
3879 | } |
3880 | /* Squash deleted pagenums if we deleted any */ |
3881 | for (x=y; ++x <= ps_len; ) |
3882 | if (!(pspill[x] & 1)) |
3883 | pspill[++y] = pspill[x]; |
3884 | pspill[0] = y; |
3885 | } |
3886 | |
3887 | /* Remove anything in our spill list from parent's dirty list */ |
3888 | if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { |
3889 | for (i=1; i<=txn->mt_spill_pgs[0]; i++) { |
3890 | MDB_ID pn = txn->mt_spill_pgs[i]; |
3891 | if (pn & 1) |
3892 | continue; /* deleted spillpg */ |
3893 | pn >>= 1; |
3894 | y = mdb_mid2l_search(dst, pn); |
3895 | if (y <= dst[0].mid && dst[y].mid == pn) { |
3896 | free(dst[y].mptr); |
3897 | while (y < dst[0].mid) { |
3898 | dst[y] = dst[y+1]; |
3899 | y++; |
3900 | } |
3901 | dst[0].mid--; |
3902 | } |
3903 | } |
3904 | } |
3905 | |
3906 | /* Find len = length of merging our dirty list with parent's */ |
3907 | x = dst[0].mid; |
3908 | dst[0].mid = 0; /* simplify loops */ |
3909 | if (parent->mt_parent) { |
3910 | len = x + src[0].mid; |
3911 | y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; |
3912 | for (i = x; y && i; y--) { |
3913 | pgno_t yp = src[y].mid; |
3914 | while (yp < dst[i].mid) |
3915 | i--; |
3916 | if (yp == dst[i].mid) { |
3917 | i--; |
3918 | len--; |
3919 | } |
3920 | } |
3921 | } else { /* Simplify the above for single-ancestor case */ |
3922 | len = MDB_IDL_UM_MAX - txn->mt_dirty_room; |
3923 | } |
3924 | /* Merge our dirty list with parent's */ |
3925 | y = src[0].mid; |
3926 | for (i = len; y; dst[i--] = src[y--]) { |
3927 | pgno_t yp = src[y].mid; |
3928 | while (yp < dst[x].mid) |
3929 | dst[i--] = dst[x--]; |
3930 | if (yp == dst[x].mid) |
3931 | free(dst[x--].mptr); |
3932 | } |
3933 | mdb_tassert(txn, i == x); |
3934 | dst[0].mid = len; |
3935 | free(txn->mt_u.dirty_list); |
3936 | parent->mt_dirty_room = txn->mt_dirty_room; |
3937 | if (txn->mt_spill_pgs) { |
3938 | if (parent->mt_spill_pgs) { |
3939 | /* TODO: Prevent failure here, so parent does not fail */ |
3940 | rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); |
3941 | if (rc) |
3942 | parent->mt_flags |= MDB_TXN_ERROR; |
3943 | mdb_midl_free(txn->mt_spill_pgs); |
3944 | mdb_midl_sort(parent->mt_spill_pgs); |
3945 | } else { |
3946 | parent->mt_spill_pgs = txn->mt_spill_pgs; |
3947 | } |
3948 | } |
3949 | |
3950 | /* Append our loose page list to parent's */ |
3951 | for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) |
3952 | ; |
3953 | *lp = txn->mt_loose_pgs; |
3954 | parent->mt_loose_count += txn->mt_loose_count; |
3955 | |
3956 | parent->mt_child = NULL; |
3957 | mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); |
3958 | free(txn); |
3959 | return rc; |
3960 | } |
3961 | |
3962 | if (txn != env->me_txn) { |
3963 | DPUTS("attempt to commit unknown transaction" ); |
3964 | rc = EINVAL; |
3965 | goto fail; |
3966 | } |
3967 | |
3968 | mdb_cursors_close(txn, 0); |
3969 | |
3970 | if (!txn->mt_u.dirty_list[0].mid && |
3971 | !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) |
3972 | goto done; |
3973 | |
3974 | DPRINTF(("committing txn %" Yu" %p on mdbenv %p, root page %" Yu, |
3975 | txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); |
3976 | |
3977 | /* Update DB root pointers */ |
3978 | if (txn->mt_numdbs > CORE_DBS) { |
3979 | MDB_cursor mc; |
3980 | MDB_dbi i; |
3981 | MDB_val data; |
3982 | data.mv_size = sizeof(MDB_db); |
3983 | |
3984 | mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); |
3985 | for (i = CORE_DBS; i < txn->mt_numdbs; i++) { |
3986 | if (txn->mt_dbflags[i] & DB_DIRTY) { |
3987 | if (TXN_DBI_CHANGED(txn, i)) { |
3988 | rc = MDB_BAD_DBI; |
3989 | goto fail; |
3990 | } |
3991 | data.mv_data = &txn->mt_dbs[i]; |
3992 | rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, |
3993 | F_SUBDATA); |
3994 | if (rc) |
3995 | goto fail; |
3996 | } |
3997 | } |
3998 | } |
3999 | |
4000 | rc = mdb_freelist_save(txn); |
4001 | if (rc) |
4002 | goto fail; |
4003 | |
4004 | mdb_midl_free(env->me_pghead); |
4005 | env->me_pghead = NULL; |
4006 | mdb_midl_shrink(&txn->mt_free_pgs); |
4007 | |
4008 | #if (MDB_DEBUG) > 2 |
4009 | mdb_audit(txn); |
4010 | #endif |
4011 | |
4012 | if ((rc = mdb_page_flush(txn, 0))) |
4013 | goto fail; |
4014 | if (!F_ISSET(txn->mt_flags, MDB_TXN_NOSYNC) && |
4015 | (rc = mdb_env_sync0(env, 0, txn->mt_next_pgno))) |
4016 | goto fail; |
4017 | if ((rc = mdb_env_write_meta(txn))) |
4018 | goto fail; |
4019 | end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; |
4020 | if (env->me_flags & MDB_PREVSNAPSHOT) { |
4021 | if (!(env->me_flags & MDB_NOLOCK)) { |
4022 | int excl; |
4023 | rc = mdb_env_share_locks(env, &excl); |
4024 | if (rc) |
4025 | goto fail; |
4026 | } |
4027 | env->me_flags ^= MDB_PREVSNAPSHOT; |
4028 | } |
4029 | |
4030 | done: |
4031 | mdb_txn_end(txn, end_mode); |
4032 | return MDB_SUCCESS; |
4033 | |
4034 | fail: |
4035 | mdb_txn_abort(txn); |
4036 | return rc; |
4037 | } |
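
/* Illustrative usage sketch (kept out of the build) of a top-level write
 * transaction from the caller's side.  Note that on failure
 * mdb_txn_commit() aborts and frees the handle itself, so it must not be
 * aborted again.  Function and parameter names are hypothetical; the
 * mdb_* calls are the real public API.
 */
#if 0
static int put_one_example(MDB_env *env, MDB_dbi dbi,
	MDB_val *key, MDB_val *val)
{
	MDB_txn *txn;
	int rc;

	if ((rc = mdb_txn_begin(env, NULL, 0, &txn)) != 0)
		return rc;
	rc = mdb_put(txn, dbi, key, val, 0);
	if (rc) {
		mdb_txn_abort(txn);
		return rc;
	}
	return mdb_txn_commit(txn);
}
#endif	/* usage sketch */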
4038 | |
4039 | /** Read the environment parameters of a DB environment before |
4040 | * mapping it into memory. |
4041 | * @param[in] env the environment handle |
4042 | * @param[in] prev whether to read the backup meta page |
4043 | * @param[out] meta address of where to store the meta information |
4044 | * @return 0 on success, non-zero on failure. |
4045 | */ |
4046 | static int ESECT |
4047 | mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta)
4048 | { |
4049 | MDB_metabuf pbuf; |
4050 | MDB_page *p; |
4051 | MDB_meta *m; |
4052 | int i, rc, off; |
4053 | enum { Size = sizeof(pbuf) }; |
4054 | |
4055 | /* We don't know the page size yet, so use a minimum value. |
4056 | * Read both meta pages so we can use the latest one. |
4057 | */ |
4058 | |
4059 | for (i=off=0; i<NUM_METAS; i++, off += meta->mm_psize) { |
4060 | #ifdef _WIN32 |
4061 | DWORD len; |
4062 | OVERLAPPED ov; |
4063 | memset(&ov, 0, sizeof(ov)); |
4064 | ov.Offset = off; |
4065 | rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1; |
4066 | if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) |
4067 | rc = 0; |
4068 | #else |
4069 | rc = pread(env->me_fd, &pbuf, Size, off); |
4070 | #endif |
4071 | if (rc != Size) { |
4072 | if (rc == 0 && off == 0) |
4073 | return ENOENT; |
4074 | rc = rc < 0 ? (int) ErrCode() : MDB_INVALID; |
4075 | DPRINTF(("read: %s" , mdb_strerror(rc))); |
4076 | return rc; |
4077 | } |
4078 | |
4079 | p = (MDB_page *)&pbuf; |
4080 | |
4081 | if (!F_ISSET(p->mp_flags, P_META)) { |
4082 | DPRINTF(("page %" Yu" not a meta page" , p->mp_pgno)); |
4083 | return MDB_INVALID; |
4084 | } |
4085 | |
4086 | m = METADATA(p); |
4087 | if (m->mm_magic != MDB_MAGIC) { |
4088 | DPUTS("meta has invalid magic" ); |
4089 | return MDB_INVALID; |
4090 | } |
4091 | |
4092 | if (m->mm_version != MDB_DATA_VERSION) { |
4093 | DPRINTF(("database is version %u, expected version %u" , |
4094 | m->mm_version, MDB_DATA_VERSION)); |
4095 | return MDB_VERSION_MISMATCH; |
4096 | } |
4097 | |
4098 | if (off == 0 || (prev ? m->mm_txnid < meta->mm_txnid : m->mm_txnid > meta->mm_txnid)) |
4099 | *meta = *m; |
4100 | } |
4101 | return 0; |
4102 | } |
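
/* Minimal sketch (kept out of the build) of the "two header slots, keep
 * the newer one" idea implemented above, on a plain file descriptor with
 * a hypothetical header layout.  Only pread() is real API here.
 */
#if 0
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>

struct toy_hdr { uint64_t seq; uint64_t root; };

static int read_newest_hdr(int fd, size_t slot_size, struct toy_hdr *out)
{
	struct toy_hdr h;
	int i, have = 0;

	for (i = 0; i < 2; i++) {
		if (pread(fd, &h, sizeof(h), (off_t)i * slot_size) != (ssize_t)sizeof(h))
			return -1;
		if (!have || h.seq > out->seq)
			*out = h;	/* keep the copy with the larger sequence */
		have = 1;
	}
	return 0;
}
#endif	/* illustration */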
4103 | |
4104 | /** Fill in most of the zeroed #MDB_meta for an empty database environment */ |
4105 | static void ESECT |
4106 | mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) |
4107 | { |
4108 | meta->mm_magic = MDB_MAGIC; |
4109 | meta->mm_version = MDB_DATA_VERSION; |
4110 | meta->mm_mapsize = env->me_mapsize; |
4111 | meta->mm_psize = env->me_psize; |
4112 | meta->mm_last_pg = NUM_METAS-1; |
4113 | meta->mm_flags = env->me_flags & 0xffff; |
4114 | meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ |
4115 | meta->mm_dbs[FREE_DBI].md_root = P_INVALID; |
4116 | meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; |
4117 | } |
4118 | |
4119 | /** Write the environment parameters of a freshly created DB environment. |
4120 | * @param[in] env the environment handle |
4121 | * @param[in] meta the #MDB_meta to write |
4122 | * @return 0 on success, non-zero on failure. |
4123 | */ |
4124 | static int ESECT |
4125 | mdb_env_init_meta(MDB_env *env, MDB_meta *meta) |
4126 | { |
4127 | MDB_page *p, *q; |
4128 | int rc; |
4129 | unsigned int psize; |
4130 | #ifdef _WIN32 |
4131 | DWORD len; |
4132 | OVERLAPPED ov; |
4133 | memset(&ov, 0, sizeof(ov)); |
4134 | #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ |
4135 | ov.Offset = pos; \ |
4136 | rc = WriteFile(fd, ptr, size, &len, &ov); } while(0) |
4137 | #else |
4138 | int len; |
4139 | #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ |
4140 | len = pwrite(fd, ptr, size, pos); \ |
4141 | if (len == -1 && ErrCode() == EINTR) continue; \ |
4142 | rc = (len >= 0); break; } while(1) |
4143 | #endif |
4144 | |
4145 | DPUTS("writing new meta page" ); |
4146 | |
4147 | psize = env->me_psize; |
4148 | |
4149 | p = calloc(NUM_METAS, psize); |
4150 | if (!p) |
4151 | return ENOMEM; |
4152 | p->mp_pgno = 0; |
4153 | p->mp_flags = P_META; |
4154 | *(MDB_meta *)METADATA(p) = *meta; |
4155 | |
4156 | q = (MDB_page *)((char *)p + psize); |
4157 | q->mp_pgno = 1; |
4158 | q->mp_flags = P_META; |
4159 | *(MDB_meta *)METADATA(q) = *meta; |
4160 | |
4161 | DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0); |
4162 | if (!rc) |
4163 | rc = ErrCode(); |
4164 | else if ((unsigned) len == psize * NUM_METAS) |
4165 | rc = MDB_SUCCESS; |
4166 | else |
4167 | rc = ENOSPC; |
4168 | free(p); |
4169 | return rc; |
4170 | } |
4171 | |
4172 | /** Update the environment info to commit a transaction. |
4173 | * @param[in] txn the transaction that's being committed |
4174 | * @return 0 on success, non-zero on failure. |
4175 | */ |
4176 | static int |
4177 | mdb_env_write_meta(MDB_txn *txn) |
4178 | { |
4179 | MDB_env *env; |
4180 | MDB_meta meta, metab, *mp; |
4181 | unsigned flags; |
4182 | mdb_size_t mapsize; |
4183 | off_t off; |
4184 | int rc, len, toggle; |
4185 | char *ptr; |
4186 | HANDLE mfd; |
4187 | #ifdef _WIN32 |
4188 | OVERLAPPED ov; |
4189 | #else |
4190 | int r2; |
4191 | #endif |
4192 | |
4193 | toggle = txn->mt_txnid & 1; |
4194 | DPRINTF(("writing meta page %d for root page %" Yu, |
4195 | toggle, txn->mt_dbs[MAIN_DBI].md_root)); |
4196 | |
4197 | env = txn->mt_env; |
4198 | flags = txn->mt_flags | env->me_flags; |
4199 | mp = env->me_metas[toggle]; |
4200 | mapsize = env->me_metas[toggle ^ 1]->mm_mapsize; |
4201 | /* Persist any increases of mapsize config */ |
4202 | if (mapsize < env->me_mapsize) |
4203 | mapsize = env->me_mapsize; |
4204 | |
4205 | if (flags & MDB_WRITEMAP) { |
4206 | mp->mm_mapsize = mapsize; |
4207 | mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; |
4208 | mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; |
4209 | mp->mm_last_pg = txn->mt_next_pgno - 1; |
4210 | #if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */ \ |
4211 | !(defined(__i386__) || defined(__x86_64__)) |
4212 | /* LY: issue a memory barrier, if not x86. ITS#7969 */ |
4213 | __sync_synchronize(); |
4214 | #endif |
4215 | mp->mm_txnid = txn->mt_txnid; |
4216 | if (!(flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { |
4217 | unsigned meta_size = env->me_psize; |
4218 | rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; |
4219 | ptr = (char *)mp - PAGEHDRSZ; |
4220 | #ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ |
4221 | r2 = (ptr - env->me_map) & (env->me_os_psize - 1); |
4222 | ptr -= r2; |
4223 | meta_size += r2; |
4224 | #endif |
4225 | if (MDB_MSYNC(ptr, meta_size, rc)) { |
4226 | rc = ErrCode(); |
4227 | goto fail; |
4228 | } |
4229 | } |
4230 | goto done; |
4231 | } |
4232 | metab.mm_txnid = mp->mm_txnid; |
4233 | metab.mm_last_pg = mp->mm_last_pg; |
4234 | |
4235 | meta.mm_mapsize = mapsize; |
4236 | meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; |
4237 | meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; |
4238 | meta.mm_last_pg = txn->mt_next_pgno - 1; |
4239 | meta.mm_txnid = txn->mt_txnid; |
4240 | |
4241 | off = offsetof(MDB_meta, mm_mapsize); |
4242 | ptr = (char *)&meta + off; |
4243 | len = sizeof(MDB_meta) - off; |
4244 | off += (char *)mp - env->me_map; |
4245 | |
4246 | /* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC. |
4247 | * (me_mfd goes to the same file as me_fd, but writing to it |
4248 | * also syncs to disk. Avoids a separate fdatasync() call.) |
4249 | */ |
4250 | mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd; |
4251 | #ifdef _WIN32 |
4252 | { |
4253 | memset(&ov, 0, sizeof(ov)); |
4254 | ov.Offset = off; |
4255 | if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) |
4256 | rc = -1; |
4257 | } |
4258 | #else |
4259 | retry_write: |
4260 | rc = pwrite(mfd, ptr, len, off); |
4261 | #endif |
4262 | if (rc != len) { |
4263 | rc = rc < 0 ? ErrCode() : EIO; |
4264 | #ifndef _WIN32 |
4265 | if (rc == EINTR) |
4266 | goto retry_write; |
4267 | #endif |
4268 | DPUTS("write failed, disk error?" ); |
4269 | /* On a failure, the pagecache still contains the new data. |
4270 | * Write some old data back, to prevent it from being used. |
4271 | * Use the non-SYNC fd; we know it will fail anyway. |
4272 | */ |
4273 | meta.mm_last_pg = metab.mm_last_pg; |
4274 | meta.mm_txnid = metab.mm_txnid; |
4275 | #ifdef _WIN32 |
4276 | memset(&ov, 0, sizeof(ov)); |
4277 | ov.Offset = off; |
4278 | WriteFile(env->me_fd, ptr, len, NULL, &ov); |
4279 | #else |
4280 | r2 = pwrite(env->me_fd, ptr, len, off); |
4281 | (void)r2; /* Silence warnings. We don't care about pwrite's return value */ |
4282 | #endif |
4283 | fail: |
4284 | env->me_flags |= MDB_FATAL_ERROR; |
4285 | return rc; |
4286 | } |
4287 | /* MIPS has cache coherency issues, this is a no-op everywhere else */ |
4288 | CACHEFLUSH(env->me_map + off, len, DCACHE); |
4289 | done: |
4290 | /* Memory ordering issues are irrelevant; since the entire writer |
4291 | * is wrapped by wmutex, all of these changes will become visible |
4292 | * after the wmutex is unlocked. Since the DB is multi-version, |
4293 | * readers will get consistent data regardless of how fresh or |
4294 | * how stale their view of these values is. |
4295 | */ |
4296 | if (env->me_txns) |
4297 | env->me_txns->mti_txnid = txn->mt_txnid; |
4298 | |
4299 | return MDB_SUCCESS; |
4300 | } |
4301 | |
4302 | /** Check both meta pages to see which one is newer. |
4303 | * @param[in] env the environment handle |
4304 | * @return newest #MDB_meta. |
4305 | */ |
4306 | static MDB_meta * |
4307 | mdb_env_pick_meta(const MDB_env *env) |
4308 | { |
4309 | MDB_meta *const *metas = env->me_metas; |
4310 | return metas[ (metas[0]->mm_txnid < metas[1]->mm_txnid) ^ |
4311 | ((env->me_flags & MDB_PREVSNAPSHOT) != 0) ]; |
4312 | } |
4313 | |
4314 | int ESECT |
4315 | mdb_env_create(MDB_env **env) |
4316 | { |
4317 | MDB_env *e; |
4318 | |
4319 | e = calloc(1, sizeof(MDB_env)); |
4320 | if (!e) |
4321 | return ENOMEM; |
4322 | |
4323 | e->me_maxreaders = DEFAULT_READERS; |
4324 | e->me_maxdbs = e->me_numdbs = CORE_DBS; |
4325 | e->me_fd = INVALID_HANDLE_VALUE; |
4326 | e->me_lfd = INVALID_HANDLE_VALUE; |
4327 | e->me_mfd = INVALID_HANDLE_VALUE; |
4328 | #ifdef MDB_USE_POSIX_SEM |
4329 | e->me_rmutex = SEM_FAILED; |
4330 | e->me_wmutex = SEM_FAILED; |
4331 | #elif defined MDB_USE_SYSV_SEM |
4332 | e->me_rmutex->semid = -1; |
4333 | e->me_wmutex->semid = -1; |
4334 | #endif |
4335 | e->me_pid = getpid(); |
4336 | GET_PAGESIZE(e->me_os_psize); |
4337 | VGMEMP_CREATE(e,0,0); |
4338 | *env = e; |
4339 | return MDB_SUCCESS; |
4340 | } |
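
/* Illustrative usage sketch (kept out of the build): the usual setup
 * sequence around mdb_env_create().  All limits must be configured before
 * mdb_env_open().  The path, sizes and function name are hypothetical;
 * the mdb_env_* calls are the real public API.
 */
#if 0
static int env_setup_example(const char *path, MDB_env **out)
{
	MDB_env *env;
	int rc;

	if ((rc = mdb_env_create(&env)) != 0)
		return rc;
	if ((rc = mdb_env_set_mapsize(env, (mdb_size_t)1 << 30)) != 0 ||
	    (rc = mdb_env_set_maxdbs(env, 4)) != 0 ||
	    (rc = mdb_env_set_maxreaders(env, 64)) != 0 ||
	    (rc = mdb_env_open(env, path, 0, 0664)) != 0) {
		mdb_env_close(env);
		return rc;
	}
	*out = env;
	return MDB_SUCCESS;
}
#endif	/* usage sketch */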
4341 | |
4342 | #ifdef _WIN32 |
4343 | /** @brief Map a result from an NTAPI call to WIN32. */ |
4344 | static DWORD |
4345 | mdb_nt2win32(NTSTATUS st) |
4346 | { |
4347 | OVERLAPPED o = {0}; |
4348 | DWORD br; |
4349 | o.Internal = st; |
4350 | GetOverlappedResult(NULL, &o, &br, FALSE); |
4351 | return GetLastError(); |
4352 | } |
4353 | #endif |
4354 | |
4355 | static int ESECT |
4356 | mdb_env_map(MDB_env *env, void *addr) |
4357 | { |
4358 | MDB_page *p; |
4359 | unsigned int flags = env->me_flags; |
4360 | #ifdef _WIN32 |
4361 | int rc; |
4362 | int access = SECTION_MAP_READ; |
4363 | HANDLE mh; |
4364 | void *map; |
4365 | SIZE_T msize; |
4366 | ULONG pageprot = PAGE_READONLY, secprot, alloctype; |
4367 | |
4368 | if (flags & MDB_WRITEMAP) { |
4369 | access |= SECTION_MAP_WRITE; |
4370 | pageprot = PAGE_READWRITE; |
4371 | } |
4372 | if (flags & MDB_RDONLY) { |
4373 | secprot = PAGE_READONLY; |
4374 | msize = 0; |
4375 | alloctype = 0; |
4376 | } else { |
4377 | secprot = PAGE_READWRITE; |
4378 | msize = env->me_mapsize; |
4379 | alloctype = MEM_RESERVE; |
4380 | } |
4381 | |
4382 | rc = NtCreateSection(&mh, access, NULL, NULL, secprot, SEC_RESERVE, env->me_fd); |
4383 | if (rc) |
4384 | return mdb_nt2win32(rc); |
4385 | map = addr; |
4386 | #ifdef MDB_VL32 |
4387 | msize = NUM_METAS * env->me_psize; |
4388 | #endif |
4389 | rc = NtMapViewOfSection(mh, GetCurrentProcess(), &map, 0, 0, NULL, &msize, ViewUnmap, alloctype, pageprot); |
4390 | #ifdef MDB_VL32 |
4391 | env->me_fmh = mh; |
4392 | #else |
4393 | NtClose(mh); |
4394 | #endif |
4395 | if (rc) |
4396 | return mdb_nt2win32(rc); |
4397 | env->me_map = map; |
4398 | #else |
4399 | #ifdef MDB_VL32 |
4400 | (void) flags; |
4401 | env->me_map = mmap(addr, NUM_METAS * env->me_psize, PROT_READ, MAP_SHARED, |
4402 | env->me_fd, 0); |
4403 | if (env->me_map == MAP_FAILED) { |
4404 | env->me_map = NULL; |
4405 | return ErrCode(); |
4406 | } |
4407 | #else |
4408 | int prot = PROT_READ; |
4409 | if (flags & MDB_WRITEMAP) { |
4410 | prot |= PROT_WRITE; |
4411 | if (ftruncate(env->me_fd, env->me_mapsize) < 0) |
4412 | return ErrCode(); |
4413 | } |
4414 | env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, |
4415 | env->me_fd, 0); |
4416 | if (env->me_map == MAP_FAILED) { |
4417 | env->me_map = NULL; |
4418 | return ErrCode(); |
4419 | } |
4420 | |
4421 | if (flags & MDB_NORDAHEAD) { |
4422 | /* Turn off readahead. It's harmful when the DB is larger than RAM. */ |
4423 | #ifdef MADV_RANDOM |
4424 | madvise(env->me_map, env->me_mapsize, MADV_RANDOM); |
4425 | #else |
4426 | #ifdef POSIX_MADV_RANDOM |
4427 | posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); |
4428 | #endif /* POSIX_MADV_RANDOM */ |
4429 | #endif /* MADV_RANDOM */ |
4430 | } |
4431 | #endif /* MDB_VL32 */
4432 | |
4433 | /* Can happen because the address argument to mmap() is just a |
4434 | * hint. mmap() can pick another, e.g. if the range is in use. |
4435 | * The MAP_FIXED flag would prevent that, but then mmap could |
4436 | * instead unmap existing pages to make room for the new map. |
4437 | */ |
4438 | if (addr && env->me_map != addr) |
4439 | return EBUSY; /* TODO: Make a new MDB_* error code? */ |
4440 | #endif |
4441 | |
4442 | p = (MDB_page *)env->me_map; |
4443 | env->me_metas[0] = METADATA(p); |
4444 | env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize); |
4445 | |
4446 | return MDB_SUCCESS; |
4447 | } |
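
/* Illustrative sketch (kept out of the build) of why the EBUSY check above
 * exists: mmap() treats a non-NULL addr as a hint and may map elsewhere,
 * so a caller that needs a fixed address must compare and bail out.  The
 * function name is hypothetical; mmap()/munmap() are real API.
 */
#if 0
#include <stddef.h>
#include <sys/mman.h>

static void *map_at_hint(int fd, size_t len, void *hint)
{
	void *p = mmap(hint, len, PROT_READ, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return NULL;
	if (hint && p != hint) {	/* kernel ignored the hint */
		munmap(p, len);
		return NULL;
	}
	return p;
}
#endif	/* usage sketch */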
4448 | |
4449 | int ESECT |
4450 | mdb_env_set_mapsize(MDB_env *env, mdb_size_t size) |
4451 | { |
4452 | /* If env is already open, caller is responsible for making |
4453 | * sure there are no active txns. |
4454 | */ |
4455 | if (env->me_map) { |
4456 | MDB_meta *meta; |
4457 | #ifndef MDB_VL32 |
4458 | void *old; |
4459 | int rc; |
4460 | #endif |
4461 | if (env->me_txn) |
4462 | return EINVAL; |
4463 | meta = mdb_env_pick_meta(env); |
4464 | if (!size) |
4465 | size = meta->mm_mapsize; |
4466 | { |
4467 | /* Silently round up to minimum if the size is too small */ |
4468 | mdb_size_t minsize = (meta->mm_last_pg + 1) * env->me_psize; |
4469 | if (size < minsize) |
4470 | size = minsize; |
4471 | } |
4472 | #ifndef MDB_VL32 |
4473 | /* For MDB_VL32 this bit is a noop since we dynamically remap |
4474 | * chunks of the DB anyway. |
4475 | */ |
4476 | munmap(env->me_map, env->me_mapsize); |
4477 | env->me_mapsize = size; |
4478 | old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; |
4479 | rc = mdb_env_map(env, old); |
4480 | if (rc) |
4481 | return rc; |
4482 | #endif /* !MDB_VL32 */ |
4483 | } |
4484 | env->me_mapsize = size; |
4485 | if (env->me_psize) |
4486 | env->me_maxpg = env->me_mapsize / env->me_psize; |
4487 | return MDB_SUCCESS; |
4488 | } |
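
/* Illustrative usage sketch (kept out of the build): if another process
 * grew the map, mdb_txn_begin() returns MDB_MAP_RESIZED and the caller
 * can adopt the new size by passing 0 here, then retry.  The function
 * name is hypothetical; the mdb_* calls are the real public API.
 */
#if 0
static int begin_with_resize_retry(MDB_env *env, MDB_txn **txn)
{
	int rc = mdb_txn_begin(env, NULL, MDB_RDONLY, txn);
	if (rc == MDB_MAP_RESIZED) {
		rc = mdb_env_set_mapsize(env, 0);	/* adopt on-disk size */
		if (rc == MDB_SUCCESS)
			rc = mdb_txn_begin(env, NULL, MDB_RDONLY, txn);
	}
	return rc;
}
#endif	/* usage sketch */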
4489 | |
4490 | int ESECT |
4491 | mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) |
4492 | { |
4493 | if (env->me_map) |
4494 | return EINVAL; |
4495 | env->me_maxdbs = dbs + CORE_DBS; |
4496 | return MDB_SUCCESS; |
4497 | } |
4498 | |
4499 | int ESECT |
4500 | mdb_env_set_maxreaders(MDB_env *env, unsigned int readers) |
4501 | { |
4502 | if (env->me_map || readers < 1) |
4503 | return EINVAL; |
4504 | env->me_maxreaders = readers; |
4505 | return MDB_SUCCESS; |
4506 | } |
4507 | |
4508 | int ESECT |
4509 | mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) |
4510 | { |
4511 | if (!env || !readers) |
4512 | return EINVAL; |
4513 | *readers = env->me_maxreaders; |
4514 | return MDB_SUCCESS; |
4515 | } |
4516 | |
4517 | static int ESECT |
4518 | mdb_fsize(HANDLE fd, mdb_size_t *size) |
4519 | { |
4520 | #ifdef _WIN32 |
4521 | LARGE_INTEGER fsize; |
4522 | |
4523 | if (!GetFileSizeEx(fd, &fsize)) |
4524 | return ErrCode(); |
4525 | |
4526 | *size = fsize.QuadPart; |
4527 | #else |
4528 | struct stat st; |
4529 | |
4530 | if (fstat(fd, &st)) |
4531 | return ErrCode(); |
4532 | |
4533 | *size = st.st_size; |
4534 | #endif |
4535 | return MDB_SUCCESS; |
4536 | } |
4537 | |
4538 | |
4539 | #ifdef _WIN32 |
4540 | typedef wchar_t mdb_nchar_t; |
4541 | # define MDB_NAME(str) L##str |
4542 | # define mdb_name_cpy wcscpy |
4543 | #else |
4544 | /** Character type for file names: char on Unix, wchar_t on Windows */ |
4545 | typedef char mdb_nchar_t; |
4546 | # define MDB_NAME(str) str /**< #mdb_nchar_t[] string literal */ |
4547 | # define mdb_name_cpy strcpy /**< Copy name (#mdb_nchar_t string) */ |
4548 | #endif |
4549 | |
4550 | /** Filename - string of #mdb_nchar_t[] */ |
4551 | typedef struct MDB_name { |
4552 | int mn_len; /**< Length */ |
4553 | int mn_alloced; /**< True if #mn_val was malloced */ |
4554 | mdb_nchar_t *mn_val; /**< Contents */ |
4555 | } MDB_name; |
4556 | |
4557 | /** Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR] */ |
4558 | static const mdb_nchar_t *const mdb_suffixes[2][2] = { |
4559 | { MDB_NAME("/data.mdb" ), MDB_NAME("" ) }, |
4560 | { MDB_NAME("/lock.mdb" ), MDB_NAME("-lock" ) } |
4561 | }; |
4562 | |
4563 | #define MDB_SUFFLEN 9 /**< Max string length in #mdb_suffixes[] */ |
4564 | |
4565 | /** Set up filename + scratch area for filename suffix, for opening files. |
4566 | * It should be freed with #mdb_fname_destroy(). |
4567 | * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16. |
4568 | * |
4569 | * @param[in] path Pathname for #mdb_env_open(). |
4570 | * @param[in] envflags Whether a subdir and/or lockfile will be used. |
4571 | * @param[out] fname Resulting filename, with room for a suffix if necessary. |
4572 | */ |
4573 | static int ESECT |
4574 | mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname) |
4575 | { |
4576 | int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK); |
4577 | fname->mn_alloced = 0; |
4578 | #ifdef _WIN32 |
4579 | return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN); |
4580 | #else |
4581 | fname->mn_len = strlen(path); |
4582 | if (no_suffix) |
4583 | fname->mn_val = (char *) path; |
4584 | else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) { |
4585 | fname->mn_alloced = 1; |
4586 | strcpy(fname->mn_val, path); |
4587 | } |
4588 | else |
4589 | return ENOMEM; |
4590 | return MDB_SUCCESS; |
4591 | #endif |
4592 | } |
4593 | |
4594 | /** Destroy \b fname from #mdb_fname_init() */ |
4595 | #define mdb_fname_destroy(fname) \ |
4596 | do { if ((fname).mn_alloced) free((fname).mn_val); } while (0) |
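
/* A rough usage sketch (not compiled in) of how the helpers above combine
 * inside this file: build the name once, let mdb_fopen() overwrite the
 * suffix scratch area for each file it opens, then free any allocated copy.
 * The path string here is just a placeholder.
 */
#if 0
	MDB_name fname;
	HANDLE fd;
	int rc = mdb_fname_init("/var/db/myenv", env->me_flags, &fname);
	if (rc == MDB_SUCCESS)
		rc = mdb_fopen(env, &fname, MDB_O_RDWR, 0664, &fd);
	mdb_fname_destroy(fname);
#endif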
4597 | |
4598 | #ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */ |
4599 | # define MDB_CLOEXEC O_CLOEXEC |
4600 | #else |
4601 | # define MDB_CLOEXEC 0 |
4602 | #endif |
4603 | |
4604 | /** File type, access mode etc. for #mdb_fopen() */ |
4605 | enum mdb_fopen_type { |
4606 | #ifdef _WIN32 |
4607 | MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS |
4608 | #else |
4609 | /* A comment in mdb_fopen() explains some O_* flag choices. */ |
4610 | MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */ |
4611 | MDB_O_RDWR = O_RDWR |O_CREAT, /**< for me_fd */ |
4612 | MDB_O_META = O_WRONLY|MDB_DSYNC |MDB_CLOEXEC, /**< for me_mfd */ |
4613 | MDB_O_COPY = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */ |
4614 | /** Bitmask for open() flags in enum #mdb_fopen_type. The other bits |
4615 | * distinguish otherwise-equal MDB_O_* constants from each other. |
4616 | */ |
4617 | MDB_O_MASK = MDB_O_RDWR|MDB_CLOEXEC | MDB_O_RDONLY|MDB_O_META|MDB_O_COPY, |
4618 | MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */ |
4619 | #endif |
4620 | }; |
4621 | |
4622 | /** Open an LMDB file. |
4623 | * @param[in] env The LMDB environment. |
 * @param[in,out] fname Path from #mdb_fname_init(). A suffix is
4625 | * appended if necessary to create the filename, without changing mn_len. |
4626 | * @param[in] which Determines file type, access mode, etc. |
4627 | * @param[in] mode The Unix permissions for the file, if we create it. |
4628 | * @param[out] res Resulting file handle. |
4629 | * @return 0 on success, non-zero on failure. |
4630 | */ |
4631 | static int ESECT |
4632 | mdb_fopen(const MDB_env *env, MDB_name *fname, |
4633 | enum mdb_fopen_type which, mdb_mode_t mode, |
4634 | HANDLE *res) |
4635 | { |
4636 | int rc = MDB_SUCCESS; |
4637 | HANDLE fd; |
4638 | #ifdef _WIN32 |
4639 | DWORD acc, share, disp, attrs; |
4640 | #else |
4641 | int flags; |
4642 | #endif |
4643 | |
4644 | if (fname->mn_alloced) /* modifiable copy */ |
4645 | mdb_name_cpy(fname->mn_val + fname->mn_len, |
4646 | mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]); |
4647 | |
4648 | /* The directory must already exist. Usually the file need not. |
4649 | * MDB_O_META requires the file because we already created it using |
4650 | * MDB_O_RDWR. MDB_O_COPY must not overwrite an existing file. |
4651 | * |
4652 | * With MDB_O_COPY we do not want the OS to cache the writes, since |
4653 | * the source data is already in the OS cache. |
4654 | * |
4655 | * The lockfile needs FD_CLOEXEC (close file descriptor on exec*()) |
4656 | * to avoid the flock() issues noted under Caveats in lmdb.h. |
	 * Also set it for the other filehandles, which the user cannot get at
	 * and close himself, but which should not leak to children after
	 * fork(). I.e. all but me_fd, which programs do use via
	 * mdb_env_get_fd().
4660 | */ |
4661 | |
4662 | #ifdef _WIN32 |
4663 | acc = GENERIC_READ|GENERIC_WRITE; |
4664 | share = FILE_SHARE_READ|FILE_SHARE_WRITE; |
4665 | disp = OPEN_ALWAYS; |
4666 | attrs = FILE_ATTRIBUTE_NORMAL; |
4667 | switch (which) { |
4668 | case MDB_O_RDONLY: /* read-only datafile */ |
4669 | acc = GENERIC_READ; |
4670 | disp = OPEN_EXISTING; |
4671 | break; |
4672 | case MDB_O_META: /* for writing metapages */ |
4673 | acc = GENERIC_WRITE; |
4674 | disp = OPEN_EXISTING; |
4675 | attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH; |
4676 | break; |
4677 | case MDB_O_COPY: /* mdb_env_copy() & co */ |
4678 | acc = GENERIC_WRITE; |
4679 | share = 0; |
4680 | disp = CREATE_NEW; |
4681 | attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH; |
4682 | break; |
4683 | default: break; /* silence gcc -Wswitch (not all enum values handled) */ |
4684 | } |
4685 | fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL); |
4686 | #else |
4687 | fd = open(fname->mn_val, which & MDB_O_MASK, mode); |
4688 | #endif |
4689 | |
4690 | if (fd == INVALID_HANDLE_VALUE) |
4691 | rc = ErrCode(); |
4692 | #ifndef _WIN32 |
4693 | else { |
4694 | if (which != MDB_O_RDONLY && which != MDB_O_RDWR) { |
4695 | /* Set CLOEXEC if we could not pass it to open() */ |
4696 | if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1) |
4697 | (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); |
4698 | } |
4699 | if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) { |
4700 | /* This may require buffer alignment. There is no portable |
4701 | * way to ask how much, so we require OS pagesize alignment. |
4702 | */ |
4703 | # ifdef F_NOCACHE /* __APPLE__ */ |
4704 | (void) fcntl(fd, F_NOCACHE, 1); |
4705 | # elif defined O_DIRECT |
4706 | /* open(...O_DIRECT...) would break on filesystems without |
4707 | * O_DIRECT support (ITS#7682). Try to set it here instead. |
4708 | */ |
4709 | if ((flags = fcntl(fd, F_GETFL)) != -1) |
4710 | (void) fcntl(fd, F_SETFL, flags | O_DIRECT); |
4711 | # endif |
4712 | } |
4713 | } |
4714 | #endif /* !_WIN32 */ |
4715 | |
4716 | *res = fd; |
4717 | return rc; |
4718 | } |
4719 | |
4720 | |
4721 | #ifdef BROKEN_FDATASYNC |
4722 | #include <sys/utsname.h> |
4723 | #include <sys/vfs.h> |
4724 | #endif |
4725 | |
4726 | /** Further setup required for opening an LMDB environment |
4727 | */ |
4728 | static int ESECT |
4729 | mdb_env_open2(MDB_env *env, int prev) |
4730 | { |
4731 | unsigned int flags = env->me_flags; |
4732 | int i, newenv = 0, rc; |
4733 | MDB_meta meta; |
4734 | |
4735 | #ifdef _WIN32 |
4736 | /* See if we should use QueryLimited */ |
4737 | rc = GetVersion(); |
4738 | if ((rc & 0xff) > 5) |
4739 | env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION; |
4740 | else |
4741 | env->me_pidquery = PROCESS_QUERY_INFORMATION; |
4742 | /* Grab functions we need from NTDLL */ |
4743 | if (!NtCreateSection) { |
		HMODULE h = GetModuleHandleW(L"NTDLL.DLL");
4745 | if (!h) |
4746 | return MDB_PROBLEM; |
		NtClose = (NtCloseFunc *)GetProcAddress(h, "NtClose");
4748 | if (!NtClose) |
4749 | return MDB_PROBLEM; |
		NtMapViewOfSection = (NtMapViewOfSectionFunc *)GetProcAddress(h, "NtMapViewOfSection");
4751 | if (!NtMapViewOfSection) |
4752 | return MDB_PROBLEM; |
		NtCreateSection = (NtCreateSectionFunc *)GetProcAddress(h, "NtCreateSection");
4754 | if (!NtCreateSection) |
4755 | return MDB_PROBLEM; |
4756 | } |
4757 | #endif /* _WIN32 */ |
4758 | |
4759 | #ifdef BROKEN_FDATASYNC |
4760 | /* ext3/ext4 fdatasync is broken on some older Linux kernels. |
4761 | * https://lkml.org/lkml/2012/9/3/83 |
4762 | * Kernels after 3.6-rc6 are known good. |
4763 | * https://lkml.org/lkml/2012/9/10/556 |
	 * See if the DB is on ext3/ext4, then check for a new enough kernel.
4765 | * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known |
4766 | * to be patched. |
4767 | */ |
4768 | { |
4769 | struct statfs st; |
4770 | fstatfs(env->me_fd, &st); |
4771 | while (st.f_type == 0xEF53) { |
4772 | struct utsname uts; |
4773 | int i; |
4774 | uname(&uts); |
4775 | if (uts.release[0] < '3') { |
4776 | if (!strncmp(uts.release, "2.6.32." , 7)) { |
4777 | i = atoi(uts.release+7); |
4778 | if (i >= 60) |
4779 | break; /* 2.6.32.60 and newer is OK */ |
4780 | } else if (!strncmp(uts.release, "2.6.34." , 7)) { |
4781 | i = atoi(uts.release+7); |
4782 | if (i >= 15) |
4783 | break; /* 2.6.34.15 and newer is OK */ |
4784 | } |
4785 | } else if (uts.release[0] == '3') { |
4786 | i = atoi(uts.release+2); |
4787 | if (i > 5) |
4788 | break; /* 3.6 and newer is OK */ |
4789 | if (i == 5) { |
4790 | i = atoi(uts.release+4); |
4791 | if (i >= 4) |
4792 | break; /* 3.5.4 and newer is OK */ |
4793 | } else if (i == 2) { |
4794 | i = atoi(uts.release+4); |
4795 | if (i >= 30) |
4796 | break; /* 3.2.30 and newer is OK */ |
4797 | } |
4798 | } else { /* 4.x and newer is OK */ |
4799 | break; |
4800 | } |
4801 | env->me_flags |= MDB_FSYNCONLY; |
4802 | break; |
4803 | } |
4804 | } |
4805 | #endif |
4806 | |
4807 | if ((i = mdb_env_read_header(env, prev, &meta)) != 0) { |
4808 | if (i != ENOENT) |
4809 | return i; |
4810 | DPUTS("new mdbenv" ); |
4811 | newenv = 1; |
4812 | env->me_psize = env->me_os_psize; |
4813 | if (env->me_psize > MAX_PAGESIZE) |
4814 | env->me_psize = MAX_PAGESIZE; |
4815 | memset(&meta, 0, sizeof(meta)); |
4816 | mdb_env_init_meta0(env, &meta); |
4817 | meta.mm_mapsize = DEFAULT_MAPSIZE; |
4818 | } else { |
4819 | env->me_psize = meta.mm_psize; |
4820 | } |
4821 | |
4822 | /* Was a mapsize configured? */ |
4823 | if (!env->me_mapsize) { |
4824 | env->me_mapsize = meta.mm_mapsize; |
4825 | } |
4826 | { |
4827 | /* Make sure mapsize >= committed data size. Even when using |
4828 | * mm_mapsize, which could be broken in old files (ITS#7789). |
4829 | */ |
4830 | mdb_size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; |
4831 | if (env->me_mapsize < minsize) |
4832 | env->me_mapsize = minsize; |
4833 | } |
4834 | meta.mm_mapsize = env->me_mapsize; |
4835 | |
4836 | if (newenv && !(flags & MDB_FIXEDMAP)) { |
4837 | /* mdb_env_map() may grow the datafile. Write the metapages |
4838 | * first, so the file will be valid if initialization fails. |
4839 | * Except with FIXEDMAP, since we do not yet know mm_address. |
4840 | * We could fill in mm_address later, but then a different |
4841 | * program might end up doing that - one with a memory layout |
4842 | * and map address which does not suit the main program. |
4843 | */ |
4844 | rc = mdb_env_init_meta(env, &meta); |
4845 | if (rc) |
4846 | return rc; |
4847 | newenv = 0; |
4848 | } |
4849 | #ifdef _WIN32 |
4850 | /* For FIXEDMAP, make sure the file is non-empty before we attempt to map it */ |
4851 | if (newenv) { |
4852 | char dummy = 0; |
4853 | DWORD len; |
4854 | rc = WriteFile(env->me_fd, &dummy, 1, &len, NULL); |
4855 | if (!rc) { |
4856 | rc = ErrCode(); |
4857 | return rc; |
4858 | } |
4859 | } |
4860 | #endif |
4861 | |
4862 | rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL); |
4863 | if (rc) |
4864 | return rc; |
4865 | |
4866 | if (newenv) { |
4867 | if (flags & MDB_FIXEDMAP) |
4868 | meta.mm_address = env->me_map; |
4869 | i = mdb_env_init_meta(env, &meta); |
4870 | if (i != MDB_SUCCESS) { |
4871 | return i; |
4872 | } |
4873 | } |
4874 | |
4875 | env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; |
4876 | env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) |
4877 | - sizeof(indx_t); |
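	/* Rough worked example (assuming a 4096-byte page, the usual 16-byte
	 * page header, 8-byte pgno_t, 2-byte indx_t and MDB_MINKEYS == 2 on a
	 * 64-bit build): me_maxfree_1pg = 4080/8 - 1 = 509 page numbers per
	 * freelist page, and me_nodemax = ((4080/2) & -2) - 2 = 2038 bytes,
	 * so at least MDB_MINKEYS nodes plus their index slots fit on a page.
	 */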
4878 | #if !(MDB_MAXKEYSIZE) |
4879 | env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); |
4880 | #endif |
4881 | env->me_maxpg = env->me_mapsize / env->me_psize; |
4882 | |
4883 | if (env->me_txns) |
4884 | env->me_txns->mti_txnid = meta.mm_txnid; |
4885 | |
4886 | #if MDB_DEBUG |
4887 | { |
4888 | MDB_meta *meta = mdb_env_pick_meta(env); |
4889 | MDB_db *db = &meta->mm_dbs[MAIN_DBI]; |
4890 | |
4891 | DPRINTF(("opened database version %u, pagesize %u" , |
4892 | meta->mm_version, env->me_psize)); |
4893 | DPRINTF(("using meta page %d" , (int) (meta->mm_txnid & 1))); |
4894 | DPRINTF(("depth: %u" , db->md_depth)); |
4895 | DPRINTF(("entries: %" Yu, db->md_entries)); |
4896 | DPRINTF(("branch pages: %" Yu, db->md_branch_pages)); |
4897 | DPRINTF(("leaf pages: %" Yu, db->md_leaf_pages)); |
4898 | DPRINTF(("overflow pages: %" Yu, db->md_overflow_pages)); |
4899 | DPRINTF(("root: %" Yu, db->md_root)); |
4900 | } |
4901 | #endif |
4902 | |
4903 | return MDB_SUCCESS; |
4904 | } |
4905 | |
4906 | |
4907 | /** Release a reader thread's slot in the reader lock table. |
4908 | * This function is called automatically when a thread exits. |
4909 | * @param[in] ptr This points to the slot in the reader lock table. |
4910 | */ |
4911 | static void |
4912 | mdb_env_reader_dest(void *ptr) |
4913 | { |
4914 | MDB_reader *reader = ptr; |
4915 | |
4916 | #ifndef _WIN32 |
4917 | if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */ |
4918 | #endif |
4919 | /* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */ |
4920 | reader->mr_pid = 0; |
4921 | } |
4922 | |
4923 | #ifdef _WIN32 |
4924 | /** Junk for arranging thread-specific callbacks on Windows. This is |
4925 | * necessarily platform and compiler-specific. Windows supports up |
4926 | * to 1088 keys. Let's assume nobody opens more than 64 environments |
4927 | * in a single process, for now. They can override this if needed. |
4928 | */ |
4929 | #ifndef MAX_TLS_KEYS |
4930 | #define MAX_TLS_KEYS 64 |
4931 | #endif |
4932 | static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS]; |
4933 | static int mdb_tls_nkeys; |
4934 | |
4935 | static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr) |
4936 | { |
4937 | int i; |
4938 | switch(reason) { |
4939 | case DLL_PROCESS_ATTACH: break; |
4940 | case DLL_THREAD_ATTACH: break; |
4941 | case DLL_THREAD_DETACH: |
4942 | for (i=0; i<mdb_tls_nkeys; i++) { |
4943 | MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]); |
4944 | if (r) { |
4945 | mdb_env_reader_dest(r); |
4946 | } |
4947 | } |
4948 | break; |
4949 | case DLL_PROCESS_DETACH: break; |
4950 | } |
4951 | } |
4952 | #ifdef __GNUC__ |
4953 | #ifdef _WIN64 |
4954 | const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB" ))) = mdb_tls_callback; |
4955 | #else |
4956 | PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB" ))) = mdb_tls_callback; |
4957 | #endif |
4958 | #else |
4959 | #ifdef _WIN64 |
4960 | /* Force some symbol references. |
4961 | * _tls_used forces the linker to create the TLS directory if not already done |
4962 | * mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol. |
4963 | */ |
4964 | #pragma comment(linker, "/INCLUDE:_tls_used") |
4965 | #pragma comment(linker, "/INCLUDE:mdb_tls_cbp") |
4966 | #pragma const_seg(".CRT$XLB") |
4967 | extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp; |
4968 | const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback; |
4969 | #pragma const_seg() |
4970 | #else /* _WIN32 */ |
4971 | #pragma comment(linker, "/INCLUDE:__tls_used") |
4972 | #pragma comment(linker, "/INCLUDE:_mdb_tls_cbp") |
4973 | #pragma data_seg(".CRT$XLB") |
4974 | PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback; |
4975 | #pragma data_seg() |
4976 | #endif /* WIN 32/64 */ |
4977 | #endif /* !__GNUC__ */ |
4978 | #endif |
4979 | |
4980 | /** Downgrade the exclusive lock on the region back to shared */ |
4981 | static int ESECT |
4982 | mdb_env_share_locks(MDB_env *env, int *excl) |
4983 | { |
4984 | int rc = 0; |
4985 | |
4986 | #ifdef _WIN32 |
4987 | { |
4988 | OVERLAPPED ov; |
4989 | /* First acquire a shared lock. The Unlock will |
4990 | * then release the existing exclusive lock. |
4991 | */ |
4992 | memset(&ov, 0, sizeof(ov)); |
4993 | if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { |
4994 | rc = ErrCode(); |
4995 | } else { |
4996 | UnlockFile(env->me_lfd, 0, 0, 1, 0); |
4997 | *excl = 0; |
4998 | } |
4999 | } |
5000 | #else |
5001 | { |
5002 | struct flock lock_info; |
5003 | /* The shared lock replaces the existing lock */ |
5004 | memset((void *)&lock_info, 0, sizeof(lock_info)); |
5005 | lock_info.l_type = F_RDLCK; |
5006 | lock_info.l_whence = SEEK_SET; |
5007 | lock_info.l_start = 0; |
5008 | lock_info.l_len = 1; |
5009 | while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && |
5010 | (rc = ErrCode()) == EINTR) ; |
5011 | *excl = rc ? -1 : 0; /* error may mean we lost the lock */ |
5012 | } |
5013 | #endif |
5014 | |
5015 | return rc; |
5016 | } |
5017 | |
5018 | /** Try to get exclusive lock, otherwise shared. |
5019 | * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. |
5020 | */ |
5021 | static int ESECT |
5022 | mdb_env_excl_lock(MDB_env *env, int *excl) |
5023 | { |
5024 | int rc = 0; |
5025 | #ifdef _WIN32 |
5026 | if (LockFile(env->me_lfd, 0, 0, 1, 0)) { |
5027 | *excl = 1; |
5028 | } else { |
5029 | OVERLAPPED ov; |
5030 | memset(&ov, 0, sizeof(ov)); |
5031 | if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { |
5032 | *excl = 0; |
5033 | } else { |
5034 | rc = ErrCode(); |
5035 | } |
5036 | } |
5037 | #else |
5038 | struct flock lock_info; |
5039 | memset((void *)&lock_info, 0, sizeof(lock_info)); |
5040 | lock_info.l_type = F_WRLCK; |
5041 | lock_info.l_whence = SEEK_SET; |
5042 | lock_info.l_start = 0; |
5043 | lock_info.l_len = 1; |
5044 | while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && |
5045 | (rc = ErrCode()) == EINTR) ; |
5046 | if (!rc) { |
5047 | *excl = 1; |
5048 | } else |
5049 | # ifndef MDB_USE_POSIX_MUTEX |
5050 | if (*excl < 0) /* always true when MDB_USE_POSIX_MUTEX */ |
5051 | # endif |
5052 | { |
5053 | lock_info.l_type = F_RDLCK; |
5054 | while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && |
5055 | (rc = ErrCode()) == EINTR) ; |
5056 | if (rc == 0) |
5057 | *excl = 0; |
5058 | } |
5059 | #endif |
5060 | return rc; |
5061 | } |
5062 | |
5063 | #ifdef MDB_USE_HASH |
5064 | /* |
5065 | * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code |
5066 | * |
5067 | * @(#) $Revision: 5.1 $ |
5068 | * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $ |
5069 | * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $ |
5070 | * |
5071 | * http://www.isthe.com/chongo/tech/comp/fnv/index.html |
5072 | * |
5073 | *** |
5074 | * |
5075 | * Please do not copyright this code. This code is in the public domain. |
5076 | * |
5077 | * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, |
5078 | * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO |
5079 | * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR |
5080 | * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF |
5081 | * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR |
5082 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR |
5083 | * PERFORMANCE OF THIS SOFTWARE. |
5084 | * |
5085 | * By: |
5086 | * chongo <Landon Curt Noll> /\oo/\ |
5087 | * http://www.isthe.com/chongo/ |
5088 | * |
5089 | * Share and Enjoy! :-) |
5090 | */ |
5091 | |
5092 | /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer |
5093 | * @param[in] val value to hash |
5094 | * @param[in] len length of value |
5095 | * @return 64 bit hash |
5096 | */ |
5097 | static mdb_hash_t |
5098 | mdb_hash(const void *val, size_t len) |
5099 | { |
5100 | const unsigned char *s = (const unsigned char *) val, *end = s + len; |
5101 | mdb_hash_t hval = 0xcbf29ce484222325ULL; |
5102 | /* |
5103 | * FNV-1a hash each octet of the buffer |
5104 | */ |
5105 | while (s < end) { |
5106 | hval = (hval ^ *s++) * 0x100000001b3ULL; |
5107 | } |
5108 | /* return our new hash value */ |
5109 | return hval; |
5110 | } |
5111 | |
5112 | /** Hash the string and output the encoded hash. |
5113 | * This uses modified RFC1924 Ascii85 encoding to accommodate systems with |
 * very short name limits. We don't care about the encoding being reversible;
5115 | * we just want to preserve as many bits of the input as possible in a |
5116 | * small printable string. |
5117 | * @param[in] str string to hash |
5118 | * @param[out] encbuf an array of 11 chars to hold the hash |
5119 | */ |
static const char mdb_a85[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~";
5121 | |
5122 | static void ESECT |
5123 | mdb_pack85(unsigned long long l, char *out) |
5124 | { |
5125 | int i; |
5126 | |
5127 | for (i=0; i<10 && l; i++) { |
5128 | *out++ = mdb_a85[l % 85]; |
5129 | l /= 85; |
5130 | } |
5131 | *out = '\0'; |
5132 | } |
5133 | |
5134 | /** Init #MDB_env.me_mutexname[] except the char which #MUTEXNAME() will set. |
5135 | * Changes to this code must be reflected in #MDB_LOCK_FORMAT. |
5136 | */ |
5137 | static void ESECT |
5138 | mdb_env_mname_init(MDB_env *env) |
5139 | { |
5140 | char *nm = env->me_mutexname; |
5141 | strcpy(nm, MUTEXNAME_PREFIX); |
5142 | mdb_pack85(env->me_txns->mti_mutexid, nm + sizeof(MUTEXNAME_PREFIX)); |
5143 | } |
5144 | |
5145 | /** Return env->me_mutexname after filling in ch ('r'/'w') for convenience */ |
5146 | #define MUTEXNAME(env, ch) ( \ |
5147 | (void) ((env)->me_mutexname[sizeof(MUTEXNAME_PREFIX)-1] = (ch)), \ |
5148 | (env)->me_mutexname) |
5149 | |
5150 | #endif |
5151 | |
5152 | /** Open and/or initialize the lock region for the environment. |
5153 | * @param[in] env The LMDB environment. |
5154 | * @param[in] fname Filename + scratch area, from #mdb_fname_init(). |
5155 | * @param[in] mode The Unix permissions for the file, if we create it. |
 * @param[in,out] excl On input -1; on output the lock type obtained: -1 none, 0 shared, 1 exclusive
5157 | * @return 0 on success, non-zero on failure. |
5158 | */ |
5159 | static int ESECT |
5160 | mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) |
5161 | { |
5162 | #ifdef _WIN32 |
5163 | # define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT |
5164 | #else |
5165 | # define MDB_ERRCODE_ROFS EROFS |
5166 | #endif |
5167 | #ifdef MDB_USE_SYSV_SEM |
5168 | int semid; |
5169 | union semun semu; |
5170 | #endif |
5171 | int rc; |
5172 | off_t size, rsize; |
5173 | |
5174 | rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd); |
5175 | if (rc) { |
5176 | /* Omit lockfile if read-only env on read-only filesystem */ |
5177 | if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) { |
5178 | return MDB_SUCCESS; |
5179 | } |
5180 | goto fail; |
5181 | } |
5182 | |
5183 | if (!(env->me_flags & MDB_NOTLS)) { |
5184 | rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); |
5185 | if (rc) |
5186 | goto fail; |
5187 | env->me_flags |= MDB_ENV_TXKEY; |
5188 | #ifdef _WIN32 |
5189 | /* Windows TLS callbacks need help finding their TLS info. */ |
5190 | if (mdb_tls_nkeys >= MAX_TLS_KEYS) { |
5191 | rc = MDB_TLS_FULL; |
5192 | goto fail; |
5193 | } |
5194 | mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey; |
5195 | #endif |
5196 | } |
5197 | |
5198 | /* Try to get exclusive lock. If we succeed, then |
5199 | * nobody is using the lock region and we should initialize it. |
5200 | */ |
5201 | if ((rc = mdb_env_excl_lock(env, excl))) goto fail; |
5202 | |
5203 | #ifdef _WIN32 |
5204 | size = GetFileSize(env->me_lfd, NULL); |
5205 | #else |
5206 | size = lseek(env->me_lfd, 0, SEEK_END); |
5207 | if (size == -1) goto fail_errno; |
5208 | #endif |
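	/* The lock region is one MDB_txninfo header, which already embeds the
	 * first reader slot, followed by the remaining reader slots; hence the
	 * (me_maxreaders - 1) below.
	 */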
5209 | rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); |
5210 | if (size < rsize && *excl > 0) { |
5211 | #ifdef _WIN32 |
5212 | if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize |
5213 | || !SetEndOfFile(env->me_lfd)) |
5214 | goto fail_errno; |
5215 | #else |
5216 | if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno; |
5217 | #endif |
5218 | } else { |
5219 | rsize = size; |
5220 | size = rsize - sizeof(MDB_txninfo); |
5221 | env->me_maxreaders = size/sizeof(MDB_reader) + 1; |
5222 | } |
5223 | { |
5224 | #ifdef _WIN32 |
5225 | HANDLE mh; |
5226 | mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE, |
5227 | 0, 0, NULL); |
5228 | if (!mh) goto fail_errno; |
5229 | env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL); |
5230 | CloseHandle(mh); |
5231 | if (!env->me_txns) goto fail_errno; |
5232 | #else |
5233 | void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, |
5234 | env->me_lfd, 0); |
5235 | if (m == MAP_FAILED) goto fail_errno; |
5236 | env->me_txns = m; |
5237 | #endif |
5238 | } |
5239 | if (*excl > 0) { |
5240 | #ifdef _WIN32 |
5241 | BY_HANDLE_FILE_INFORMATION stbuf; |
5242 | struct { |
5243 | DWORD volume; |
5244 | DWORD nhigh; |
5245 | DWORD nlow; |
5246 | } idbuf; |
5247 | |
5248 | if (!mdb_sec_inited) { |
5249 | InitializeSecurityDescriptor(&mdb_null_sd, |
5250 | SECURITY_DESCRIPTOR_REVISION); |
5251 | SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE); |
5252 | mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES); |
5253 | mdb_all_sa.bInheritHandle = FALSE; |
5254 | mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd; |
5255 | mdb_sec_inited = 1; |
5256 | } |
5257 | if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno; |
5258 | idbuf.volume = stbuf.dwVolumeSerialNumber; |
5259 | idbuf.nhigh = stbuf.nFileIndexHigh; |
5260 | idbuf.nlow = stbuf.nFileIndexLow; |
5261 | env->me_txns->mti_mutexid = mdb_hash(&idbuf, sizeof(idbuf)); |
5262 | mdb_env_mname_init(env); |
5263 | env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, MUTEXNAME(env, 'r')); |
5264 | if (!env->me_rmutex) goto fail_errno; |
5265 | env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, MUTEXNAME(env, 'w')); |
5266 | if (!env->me_wmutex) goto fail_errno; |
5267 | #elif defined(MDB_USE_POSIX_SEM) |
5268 | struct stat stbuf; |
5269 | struct { |
5270 | dev_t dev; |
5271 | ino_t ino; |
5272 | } idbuf; |
5273 | |
5274 | #if defined(__NetBSD__) |
5275 | #define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */ |
5276 | #endif |
5277 | if (fstat(env->me_lfd, &stbuf)) goto fail_errno; |
5278 | memset(&idbuf, 0, sizeof(idbuf)); |
5279 | idbuf.dev = stbuf.st_dev; |
5280 | idbuf.ino = stbuf.st_ino; |
5281 | env->me_txns->mti_mutexid = mdb_hash(&idbuf, sizeof(idbuf)) |
5282 | #ifdef MDB_SHORT_SEMNAMES |
5283 | /* Max 9 base85-digits. We truncate here instead of in |
5284 | * mdb_env_mname_init() to keep the latter portable. |
5285 | */ |
5286 | % ((mdb_hash_t)85*85*85*85*85*85*85*85*85) |
5287 | #endif |
5288 | ; |
5289 | mdb_env_mname_init(env); |
5290 | /* Clean up after a previous run, if needed: Try to |
5291 | * remove both semaphores before doing anything else. |
5292 | */ |
5293 | sem_unlink(MUTEXNAME(env, 'r')); |
5294 | sem_unlink(MUTEXNAME(env, 'w')); |
5295 | env->me_rmutex = sem_open(MUTEXNAME(env, 'r'), O_CREAT|O_EXCL, mode, 1); |
5296 | if (env->me_rmutex == SEM_FAILED) goto fail_errno; |
5297 | env->me_wmutex = sem_open(MUTEXNAME(env, 'w'), O_CREAT|O_EXCL, mode, 1); |
5298 | if (env->me_wmutex == SEM_FAILED) goto fail_errno; |
5299 | #elif defined(MDB_USE_SYSV_SEM) |
5300 | unsigned short vals[2] = {1, 1}; |
5301 | key_t key = ftok(fname->mn_val, 'M'); /* fname is lockfile path now */ |
5302 | if (key == -1) |
5303 | goto fail_errno; |
5304 | semid = semget(key, 2, (mode & 0777) | IPC_CREAT); |
5305 | if (semid < 0) |
5306 | goto fail_errno; |
5307 | semu.array = vals; |
5308 | if (semctl(semid, 0, SETALL, semu) < 0) |
5309 | goto fail_errno; |
5310 | env->me_txns->mti_semid = semid; |
5311 | env->me_txns->mti_rlocked = 0; |
5312 | env->me_txns->mti_wlocked = 0; |
5313 | #else /* MDB_USE_POSIX_MUTEX: */ |
5314 | pthread_mutexattr_t mattr; |
5315 | |
5316 | /* Solaris needs this before initing a robust mutex. Otherwise |
5317 | * it may skip the init and return EBUSY "seems someone already |
5318 | * inited" or EINVAL "it was inited differently". |
5319 | */ |
5320 | memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex)); |
5321 | memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex)); |
5322 | |
5323 | if ((rc = pthread_mutexattr_init(&mattr)) != 0) |
5324 | goto fail; |
5325 | rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); |
5326 | #ifdef MDB_ROBUST_SUPPORTED |
5327 | if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); |
5328 | #endif |
5329 | if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr); |
5330 | if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr); |
5331 | pthread_mutexattr_destroy(&mattr); |
5332 | if (rc) |
5333 | goto fail; |
5334 | #endif /* _WIN32 || ... */ |
5335 | |
5336 | env->me_txns->mti_magic = MDB_MAGIC; |
5337 | env->me_txns->mti_format = MDB_LOCK_FORMAT; |
5338 | env->me_txns->mti_txnid = 0; |
5339 | env->me_txns->mti_numreaders = 0; |
5340 | |
5341 | } else { |
5342 | #ifdef MDB_USE_SYSV_SEM |
5343 | struct semid_ds buf; |
5344 | #endif |
5345 | if (env->me_txns->mti_magic != MDB_MAGIC) { |
5346 | DPUTS("lock region has invalid magic" ); |
5347 | rc = MDB_INVALID; |
5348 | goto fail; |
5349 | } |
5350 | if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { |
5351 | DPRINTF(("lock region has format+version 0x%x, expected 0x%x" , |
5352 | env->me_txns->mti_format, MDB_LOCK_FORMAT)); |
5353 | rc = MDB_VERSION_MISMATCH; |
5354 | goto fail; |
5355 | } |
5356 | rc = ErrCode(); |
5357 | if (rc && rc != EACCES && rc != EAGAIN) { |
5358 | goto fail; |
5359 | } |
5360 | #ifdef _WIN32 |
5361 | mdb_env_mname_init(env); |
5362 | env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, MUTEXNAME(env, 'r')); |
5363 | if (!env->me_rmutex) goto fail_errno; |
5364 | env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, MUTEXNAME(env, 'w')); |
5365 | if (!env->me_wmutex) goto fail_errno; |
5366 | #elif defined(MDB_USE_POSIX_SEM) |
5367 | mdb_env_mname_init(env); |
5368 | env->me_rmutex = sem_open(MUTEXNAME(env, 'r'), 0); |
5369 | if (env->me_rmutex == SEM_FAILED) goto fail_errno; |
5370 | env->me_wmutex = sem_open(MUTEXNAME(env, 'w'), 0); |
5371 | if (env->me_wmutex == SEM_FAILED) goto fail_errno; |
5372 | #elif defined(MDB_USE_SYSV_SEM) |
5373 | semid = env->me_txns->mti_semid; |
5374 | semu.buf = &buf; |
5375 | /* check for read access */ |
5376 | if (semctl(semid, 0, IPC_STAT, semu) < 0) |
5377 | goto fail_errno; |
5378 | /* check for write access */ |
5379 | if (semctl(semid, 0, IPC_SET, semu) < 0) |
5380 | goto fail_errno; |
5381 | #endif |
5382 | } |
5383 | #ifdef MDB_USE_SYSV_SEM |
5384 | env->me_rmutex->semid = semid; |
5385 | env->me_wmutex->semid = semid; |
5386 | env->me_rmutex->semnum = 0; |
5387 | env->me_wmutex->semnum = 1; |
5388 | env->me_rmutex->locked = &env->me_txns->mti_rlocked; |
5389 | env->me_wmutex->locked = &env->me_txns->mti_wlocked; |
5390 | #endif |
5391 | |
5392 | return MDB_SUCCESS; |
5393 | |
5394 | fail_errno: |
5395 | rc = ErrCode(); |
5396 | fail: |
5397 | return rc; |
5398 | } |
5399 | |
5400 | /** Only a subset of the @ref mdb_env flags can be changed |
5401 | * at runtime. Changing other flags requires closing the |
5402 | * environment and re-opening it with the new flags. |
5403 | */ |
5404 | #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) |
5405 | #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ |
5406 | MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD|MDB_PREVSNAPSHOT) |
5407 | |
5408 | #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) |
5409 | # error "Persistent DB flags & env flags overlap, but both go in mm_flags" |
5410 | #endif |
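
/* Not part of the library: a minimal caller-side sketch of the usual open
 * sequence. Tunables must be set before mdb_env_open(); the path and sizes
 * below are placeholders.
 */
#if 0
	MDB_env *env;
	int rc = mdb_env_create(&env);
	if (!rc) rc = mdb_env_set_maxdbs(env, 4);
	if (!rc) rc = mdb_env_set_mapsize(env, 1048576UL * 256);
	if (!rc) rc = mdb_env_open(env, "./testdb", 0, 0664);
	if (rc) mdb_env_close(env);	/* env must be closed even if open failed */
#endif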
5411 | |
5412 | int ESECT |
5413 | mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) |
5414 | { |
5415 | int rc, excl = -1; |
5416 | MDB_name fname; |
5417 | |
5418 | if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) |
5419 | return EINVAL; |
5420 | |
5421 | #ifdef MDB_VL32 |
5422 | if (flags & MDB_WRITEMAP) { |
5423 | /* silently ignore WRITEMAP in 32 bit mode */ |
5424 | flags ^= MDB_WRITEMAP; |
5425 | } |
5426 | if (flags & MDB_FIXEDMAP) { |
5427 | /* cannot support FIXEDMAP */ |
5428 | return EINVAL; |
5429 | } |
5430 | #endif |
5431 | flags |= env->me_flags; |
5432 | |
5433 | rc = mdb_fname_init(path, flags, &fname); |
5434 | if (rc) |
5435 | return rc; |
5436 | |
5437 | #ifdef MDB_VL32 |
5438 | #ifdef _WIN32 |
5439 | env->me_rpmutex = CreateMutex(NULL, FALSE, NULL); |
5440 | if (!env->me_rpmutex) { |
5441 | rc = ErrCode(); |
5442 | goto leave; |
5443 | } |
5444 | #else |
5445 | rc = pthread_mutex_init(&env->me_rpmutex, NULL); |
5446 | if (rc) |
5447 | goto leave; |
5448 | #endif |
5449 | #endif |
5450 | flags |= MDB_ENV_ACTIVE; /* tell mdb_env_close0() to clean up */ |
5451 | |
5452 | if (flags & MDB_RDONLY) { |
5453 | /* silently ignore WRITEMAP when we're only getting read access */ |
5454 | flags &= ~MDB_WRITEMAP; |
5455 | } else { |
5456 | if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && |
5457 | (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) |
5458 | rc = ENOMEM; |
5459 | } |
5460 | |
5461 | env->me_flags = flags; |
5462 | if (rc) |
5463 | goto leave; |
5464 | |
5465 | #ifdef MDB_VL32 |
5466 | { |
5467 | env->me_rpages = malloc(MDB_ERPAGE_SIZE * sizeof(MDB_ID3)); |
5468 | if (!env->me_rpages) { |
5469 | rc = ENOMEM; |
5470 | goto leave; |
5471 | } |
5472 | env->me_rpages[0].mid = 0; |
5473 | env->me_rpcheck = MDB_ERPAGE_SIZE/2; |
5474 | } |
5475 | #endif |
5476 | |
5477 | env->me_path = strdup(path); |
5478 | env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); |
5479 | env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); |
5480 | env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int)); |
5481 | if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { |
5482 | rc = ENOMEM; |
5483 | goto leave; |
5484 | } |
5485 | env->me_dbxs[FREE_DBI].md_cmp = mdb_cmp_long; /* aligned MDB_INTEGERKEY */ |
5486 | |
5487 | /* For RDONLY, get lockfile after we know datafile exists */ |
5488 | if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { |
5489 | rc = mdb_env_setup_locks(env, &fname, mode, &excl); |
5490 | if (rc) |
5491 | goto leave; |
5492 | if ((flags & MDB_PREVSNAPSHOT) && !excl) { |
5493 | rc = EAGAIN; |
5494 | goto leave; |
5495 | } |
5496 | } |
5497 | |
5498 | rc = mdb_fopen(env, &fname, |
5499 | (flags & MDB_RDONLY) ? MDB_O_RDONLY : MDB_O_RDWR, |
5500 | mode, &env->me_fd); |
5501 | if (rc) |
5502 | goto leave; |
5503 | |
5504 | if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { |
5505 | rc = mdb_env_setup_locks(env, &fname, mode, &excl); |
5506 | if (rc) |
5507 | goto leave; |
5508 | } |
5509 | |
5510 | if ((rc = mdb_env_open2(env, flags & MDB_PREVSNAPSHOT)) == MDB_SUCCESS) { |
5511 | if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { |
5512 | /* Synchronous fd for meta writes. Needed even with |
5513 | * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. |
5514 | */ |
5515 | rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); |
5516 | if (rc) |
5517 | goto leave; |
5518 | } |
5519 | DPRINTF(("opened dbenv %p" , (void *) env)); |
5520 | if (excl > 0 && !(flags & MDB_PREVSNAPSHOT)) { |
5521 | rc = mdb_env_share_locks(env, &excl); |
5522 | if (rc) |
5523 | goto leave; |
5524 | } |
5525 | if (!(flags & MDB_RDONLY)) { |
5526 | MDB_txn *txn; |
5527 | int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs * |
5528 | (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned int)+1); |
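			/* me_txn0 is one calloc'd block: the MDB_txn struct followed
			 * by the per-DB arrays (mt_dbs, mt_cursors, mt_dbiseqs,
			 * mt_dbflags) which the pointer arithmetic below carves out.
			 */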
5529 | if ((env->me_pbuf = calloc(1, env->me_psize)) && |
5530 | (txn = calloc(1, size))) |
5531 | { |
5532 | txn->mt_dbs = (MDB_db *)((char *)txn + tsize); |
5533 | txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); |
5534 | txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); |
5535 | txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); |
5536 | txn->mt_env = env; |
5537 | #ifdef MDB_VL32 |
5538 | txn->mt_rpages = malloc(MDB_TRPAGE_SIZE * sizeof(MDB_ID3)); |
5539 | if (!txn->mt_rpages) { |
5540 | free(txn); |
5541 | rc = ENOMEM; |
5542 | goto leave; |
5543 | } |
5544 | txn->mt_rpages[0].mid = 0; |
5545 | txn->mt_rpcheck = MDB_TRPAGE_SIZE/2; |
5546 | #endif |
5547 | txn->mt_dbxs = env->me_dbxs; |
5548 | txn->mt_flags = MDB_TXN_FINISHED; |
5549 | env->me_txn0 = txn; |
5550 | } else { |
5551 | rc = ENOMEM; |
5552 | } |
5553 | } |
5554 | } |
5555 | |
5556 | leave: |
5557 | if (rc) { |
5558 | mdb_env_close0(env, excl); |
5559 | } |
5560 | mdb_fname_destroy(fname); |
5561 | return rc; |
5562 | } |
5563 | |
5564 | /** Destroy resources from mdb_env_open(), clear our readers & DBIs */ |
5565 | static void ESECT |
5566 | mdb_env_close0(MDB_env *env, int excl) |
5567 | { |
5568 | int i; |
5569 | |
5570 | if (!(env->me_flags & MDB_ENV_ACTIVE)) |
5571 | return; |
5572 | |
5573 | /* Doing this here since me_dbxs may not exist during mdb_env_close */ |
5574 | if (env->me_dbxs) { |
5575 | for (i = env->me_maxdbs; --i >= CORE_DBS; ) |
5576 | free(env->me_dbxs[i].md_name.mv_data); |
5577 | free(env->me_dbxs); |
5578 | } |
5579 | |
5580 | free(env->me_pbuf); |
5581 | free(env->me_dbiseqs); |
5582 | free(env->me_dbflags); |
5583 | free(env->me_path); |
5584 | free(env->me_dirty_list); |
5585 | #ifdef MDB_VL32 |
5586 | if (env->me_txn0 && env->me_txn0->mt_rpages) |
5587 | free(env->me_txn0->mt_rpages); |
5588 | if (env->me_rpages) { |
5589 | MDB_ID3L el = env->me_rpages; |
5590 | unsigned int x; |
5591 | for (x=1; x<=el[0].mid; x++) |
5592 | munmap(el[x].mptr, el[x].mcnt * env->me_psize); |
5593 | free(el); |
5594 | } |
5595 | #endif |
5596 | free(env->me_txn0); |
5597 | mdb_midl_free(env->me_free_pgs); |
5598 | |
5599 | if (env->me_flags & MDB_ENV_TXKEY) { |
5600 | pthread_key_delete(env->me_txkey); |
5601 | #ifdef _WIN32 |
5602 | /* Delete our key from the global list */ |
5603 | for (i=0; i<mdb_tls_nkeys; i++) |
5604 | if (mdb_tls_keys[i] == env->me_txkey) { |
5605 | mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1]; |
5606 | mdb_tls_nkeys--; |
5607 | break; |
5608 | } |
5609 | #endif |
5610 | } |
5611 | |
5612 | if (env->me_map) { |
5613 | #ifdef MDB_VL32 |
5614 | munmap(env->me_map, NUM_METAS*env->me_psize); |
5615 | #else |
5616 | munmap(env->me_map, env->me_mapsize); |
5617 | #endif |
5618 | } |
5619 | if (env->me_mfd != INVALID_HANDLE_VALUE) |
5620 | (void) close(env->me_mfd); |
5621 | if (env->me_fd != INVALID_HANDLE_VALUE) |
5622 | (void) close(env->me_fd); |
5623 | if (env->me_txns) { |
5624 | MDB_PID_T pid = getpid(); |
5625 | /* Clearing readers is done in this function because |
5626 | * me_txkey with its destructor must be disabled first. |
5627 | * |
		 * We skip the reader mutex, so we touch only
5629 | * data owned by this process (me_close_readers and |
5630 | * our readers), and clear each reader atomically. |
5631 | */ |
5632 | for (i = env->me_close_readers; --i >= 0; ) |
5633 | if (env->me_txns->mti_readers[i].mr_pid == pid) |
5634 | env->me_txns->mti_readers[i].mr_pid = 0; |
5635 | #ifdef _WIN32 |
5636 | if (env->me_rmutex) { |
5637 | CloseHandle(env->me_rmutex); |
5638 | if (env->me_wmutex) CloseHandle(env->me_wmutex); |
5639 | } |
5640 | /* Windows automatically destroys the mutexes when |
5641 | * the last handle closes. |
5642 | */ |
5643 | #elif defined(MDB_USE_POSIX_SEM) |
5644 | if (env->me_rmutex != SEM_FAILED) { |
5645 | sem_close(env->me_rmutex); |
5646 | if (env->me_wmutex != SEM_FAILED) |
5647 | sem_close(env->me_wmutex); |
5648 | /* If we have the filelock: If we are the |
5649 | * only remaining user, clean up semaphores. |
5650 | */ |
5651 | if (excl == 0) |
5652 | mdb_env_excl_lock(env, &excl); |
5653 | if (excl > 0) { |
5654 | sem_unlink(MUTEXNAME(env, 'r')); |
5655 | sem_unlink(MUTEXNAME(env, 'w')); |
5656 | } |
5657 | } |
5658 | #elif defined(MDB_USE_SYSV_SEM) |
5659 | if (env->me_rmutex->semid != -1) { |
5660 | /* If we have the filelock: If we are the |
5661 | * only remaining user, clean up semaphores. |
5662 | */ |
5663 | if (excl == 0) |
5664 | mdb_env_excl_lock(env, &excl); |
5665 | if (excl > 0) |
5666 | semctl(env->me_rmutex->semid, 0, IPC_RMID); |
5667 | } |
5668 | #endif |
5669 | munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); |
5670 | } |
5671 | if (env->me_lfd != INVALID_HANDLE_VALUE) { |
5672 | #ifdef _WIN32 |
5673 | if (excl >= 0) { |
5674 | /* Unlock the lockfile. Windows would have unlocked it |
5675 | * after closing anyway, but not necessarily at once. |
5676 | */ |
5677 | UnlockFile(env->me_lfd, 0, 0, 1, 0); |
5678 | } |
5679 | #endif |
5680 | (void) close(env->me_lfd); |
5681 | } |
5682 | #ifdef MDB_VL32 |
5683 | #ifdef _WIN32 |
5684 | if (env->me_fmh) CloseHandle(env->me_fmh); |
5685 | if (env->me_rpmutex) CloseHandle(env->me_rpmutex); |
5686 | #else |
5687 | pthread_mutex_destroy(&env->me_rpmutex); |
5688 | #endif |
5689 | #endif |
5690 | |
5691 | env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); |
5692 | } |
5693 | |
5694 | void ESECT |
5695 | mdb_env_close(MDB_env *env) |
5696 | { |
5697 | MDB_page *dp; |
5698 | |
5699 | if (env == NULL) |
5700 | return; |
5701 | |
5702 | VGMEMP_DESTROY(env); |
5703 | while ((dp = env->me_dpages) != NULL) { |
5704 | VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); |
5705 | env->me_dpages = dp->mp_next; |
5706 | free(dp); |
5707 | } |
5708 | |
5709 | mdb_env_close0(env, 0); |
5710 | free(env); |
5711 | } |
5712 | |
5713 | /** Compare two items pointing at aligned #mdb_size_t's */ |
5714 | static int |
5715 | mdb_cmp_long(const MDB_val *a, const MDB_val *b) |
5716 | { |
5717 | return (*(mdb_size_t *)a->mv_data < *(mdb_size_t *)b->mv_data) ? -1 : |
5718 | *(mdb_size_t *)a->mv_data > *(mdb_size_t *)b->mv_data; |
5719 | } |
5720 | |
5721 | /** Compare two items pointing at aligned unsigned int's. |
5722 | * |
5723 | * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp, |
5724 | * but #mdb_cmp_clong() is called instead if the data type is #mdb_size_t. |
5725 | */ |
5726 | static int |
5727 | mdb_cmp_int(const MDB_val *a, const MDB_val *b) |
5728 | { |
5729 | return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 : |
5730 | *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data; |
5731 | } |
5732 | |
5733 | /** Compare two items pointing at unsigned ints of unknown alignment. |
5734 | * Nodes and keys are guaranteed to be 2-byte aligned. |
5735 | */ |
5736 | static int |
5737 | mdb_cmp_cint(const MDB_val *a, const MDB_val *b) |
5738 | { |
5739 | #if BYTE_ORDER == LITTLE_ENDIAN |
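	/* Little-endian: the most significant 16-bit word sits at the end of
	 * the buffer, so compare 16-bit words from the tail backward and stop
	 * at the first difference.
	 */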
5740 | unsigned short *u, *c; |
5741 | int x; |
5742 | |
5743 | u = (unsigned short *) ((char *) a->mv_data + a->mv_size); |
5744 | c = (unsigned short *) ((char *) b->mv_data + a->mv_size); |
5745 | do { |
5746 | x = *--u - *--c; |
5747 | } while(!x && u > (unsigned short *)a->mv_data); |
5748 | return x; |
5749 | #else |
5750 | unsigned short *u, *c, *end; |
5751 | int x; |
5752 | |
5753 | end = (unsigned short *) ((char *) a->mv_data + a->mv_size); |
5754 | u = (unsigned short *)a->mv_data; |
5755 | c = (unsigned short *)b->mv_data; |
5756 | do { |
5757 | x = *u++ - *c++; |
5758 | } while(!x && u < end); |
5759 | return x; |
5760 | #endif |
5761 | } |
5762 | |
5763 | /** Compare two items lexically */ |
5764 | static int |
5765 | mdb_cmp_memn(const MDB_val *a, const MDB_val *b) |
5766 | { |
5767 | int diff; |
5768 | ssize_t len_diff; |
5769 | unsigned int len; |
5770 | |
5771 | len = a->mv_size; |
5772 | len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; |
5773 | if (len_diff > 0) { |
5774 | len = b->mv_size; |
5775 | len_diff = 1; |
5776 | } |
5777 | |
5778 | diff = memcmp(a->mv_data, b->mv_data, len); |
5779 | return diff ? diff : len_diff<0 ? -1 : len_diff; |
5780 | } |
5781 | |
5782 | /** Compare two items in reverse byte order */ |
5783 | static int |
5784 | mdb_cmp_memnr(const MDB_val *a, const MDB_val *b) |
5785 | { |
5786 | const unsigned char *p1, *p2, *p1_lim; |
5787 | ssize_t len_diff; |
5788 | int diff; |
5789 | |
5790 | p1_lim = (const unsigned char *)a->mv_data; |
5791 | p1 = (const unsigned char *)a->mv_data + a->mv_size; |
5792 | p2 = (const unsigned char *)b->mv_data + b->mv_size; |
5793 | |
5794 | len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; |
5795 | if (len_diff > 0) { |
5796 | p1_lim += len_diff; |
5797 | len_diff = 1; |
5798 | } |
5799 | |
5800 | while (p1 > p1_lim) { |
5801 | diff = *--p1 - *--p2; |
5802 | if (diff) |
5803 | return diff; |
5804 | } |
5805 | return len_diff<0 ? -1 : len_diff; |
5806 | } |
5807 | |
5808 | /** Search for key within a page, using binary search. |
 * Returns the smallest entry larger than or equal to the key.
5810 | * If exactp is non-null, stores whether the found entry was an exact match |
5811 | * in *exactp (1 or 0). |
5812 | * Updates the cursor index with the index of the found entry. |
5813 | * If no entry larger or equal to the key is found, returns NULL. |
5814 | */ |
5815 | static MDB_node * |
5816 | mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) |
5817 | { |
5818 | unsigned int i = 0, nkeys; |
5819 | int low, high; |
5820 | int rc = 0; |
5821 | MDB_page *mp = mc->mc_pg[mc->mc_top]; |
5822 | MDB_node *node = NULL; |
5823 | MDB_val nodekey; |
5824 | MDB_cmp_func *cmp; |
5825 | DKBUF; |
5826 | |
5827 | nkeys = NUMKEYS(mp); |
5828 | |
5829 | DPRINTF(("searching %u keys in %s %spage %" Yu, |
5830 | nkeys, IS_LEAF(mp) ? "leaf" : "branch" , IS_SUBP(mp) ? "sub-" : "" , |
5831 | mdb_dbg_pgno(mp))); |
5832 | |
5833 | low = IS_LEAF(mp) ? 0 : 1; |
5834 | high = nkeys - 1; |
5835 | cmp = mc->mc_dbx->md_cmp; |
5836 | |
5837 | /* Branch pages have no data, so if using integer keys, |
5838 | * alignment is guaranteed. Use faster mdb_cmp_int. |
5839 | */ |
5840 | if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { |
5841 | if (NODEPTR(mp, 1)->mn_ksize == sizeof(mdb_size_t)) |
5842 | cmp = mdb_cmp_long; |
5843 | else |
5844 | cmp = mdb_cmp_int; |
5845 | } |
5846 | |
5847 | if (IS_LEAF2(mp)) { |
5848 | nodekey.mv_size = mc->mc_db->md_pad; |
5849 | node = NODEPTR(mp, 0); /* fake */ |
5850 | while (low <= high) { |
5851 | i = (low + high) >> 1; |
5852 | nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); |
5853 | rc = cmp(key, &nodekey); |
5854 | DPRINTF(("found leaf index %u [%s], rc = %i" , |
5855 | i, DKEY(&nodekey), rc)); |
5856 | if (rc == 0) |
5857 | break; |
5858 | if (rc > 0) |
5859 | low = i + 1; |
5860 | else |
5861 | high = i - 1; |
5862 | } |
5863 | } else { |
5864 | while (low <= high) { |
5865 | i = (low + high) >> 1; |
5866 | |
5867 | node = NODEPTR(mp, i); |
5868 | nodekey.mv_size = NODEKSZ(node); |
5869 | nodekey.mv_data = NODEKEY(node); |
5870 | |
5871 | rc = cmp(key, &nodekey); |
5872 | #if MDB_DEBUG |
5873 | if (IS_LEAF(mp)) |
5874 | DPRINTF(("found leaf index %u [%s], rc = %i" , |
5875 | i, DKEY(&nodekey), rc)); |
5876 | else |
5877 | DPRINTF(("found branch index %u [%s -> %" Yu"], rc = %i" , |
5878 | i, DKEY(&nodekey), NODEPGNO(node), rc)); |
5879 | #endif |
5880 | if (rc == 0) |
5881 | break; |
5882 | if (rc > 0) |
5883 | low = i + 1; |
5884 | else |
5885 | high = i - 1; |
5886 | } |
5887 | } |
5888 | |
5889 | if (rc > 0) { /* Found entry is less than the key. */ |
		i++; /* Skip to get the smallest entry larger than the key. */
5891 | if (!IS_LEAF2(mp)) |
5892 | node = NODEPTR(mp, i); |
5893 | } |
5894 | if (exactp) |
5895 | *exactp = (rc == 0 && nkeys > 0); |
5896 | /* store the key index */ |
5897 | mc->mc_ki[mc->mc_top] = i; |
5898 | if (i >= nkeys) |
		/* There is no entry larger than or equal to the key. */
5900 | return NULL; |
5901 | |
5902 | /* nodeptr is fake for LEAF2 */ |
5903 | return node; |
5904 | } |
5905 | |
5906 | #if 0 |
5907 | static void |
5908 | mdb_cursor_adjust(MDB_cursor *mc, func) |
5909 | { |
5910 | MDB_cursor *m2; |
5911 | |
5912 | for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { |
5913 | if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { |
5914 | func(mc, m2); |
5915 | } |
5916 | } |
5917 | } |
5918 | #endif |
5919 | |
5920 | /** Pop a page off the top of the cursor's stack. */ |
5921 | static void |
5922 | mdb_cursor_pop(MDB_cursor *mc) |
5923 | { |
5924 | if (mc->mc_snum) { |
5925 | DPRINTF(("popping page %" Yu" off db %d cursor %p" , |
5926 | mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc)); |
5927 | |
5928 | mc->mc_snum--; |
5929 | if (mc->mc_snum) { |
5930 | mc->mc_top--; |
5931 | } else { |
5932 | mc->mc_flags &= ~C_INITIALIZED; |
5933 | } |
5934 | } |
5935 | } |
5936 | |
5937 | /** Push a page onto the top of the cursor's stack. |
5938 | * Set #MDB_TXN_ERROR on failure. |
5939 | */ |
5940 | static int |
5941 | mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) |
5942 | { |
5943 | DPRINTF(("pushing page %" Yu" on db %d cursor %p" , mp->mp_pgno, |
5944 | DDBI(mc), (void *) mc)); |
5945 | |
5946 | if (mc->mc_snum >= CURSOR_STACK) { |
5947 | mc->mc_txn->mt_flags |= MDB_TXN_ERROR; |
5948 | return MDB_CURSOR_FULL; |
5949 | } |
5950 | |
5951 | mc->mc_top = mc->mc_snum++; |
5952 | mc->mc_pg[mc->mc_top] = mp; |
5953 | mc->mc_ki[mc->mc_top] = 0; |
5954 | |
5955 | return MDB_SUCCESS; |
5956 | } |
5957 | |
5958 | #ifdef MDB_VL32 |
5959 | /** Map a read-only page. |
5960 | * There are two levels of tracking in use, a per-txn list and a per-env list. |
5961 | * ref'ing and unref'ing the per-txn list is faster since it requires no |
5962 | * locking. Pages are cached in the per-env list for global reuse, and a lock |
5963 | * is required. Pages are not immediately unmapped when their refcnt goes to |
5964 | * zero; they hang around in case they will be reused again soon. |
5965 | * |
5966 | * When the per-txn list gets full, all pages with refcnt=0 are purged from the |
5967 | * list and their refcnts in the per-env list are decremented. |
5968 | * |
5969 | * When the per-env list gets full, all pages with refcnt=0 are purged from the |
5970 | * list and their pages are unmapped. |
5971 | * |
5972 | * @note "full" means the list has reached its respective rpcheck threshold. |
 * This threshold rises gradually if no pages could be purged on a given
 * check, and returns to its original value once enough pages have been purged.
5975 | * |
5976 | * If purging doesn't free any slots, filling the per-txn list will return |
5977 | * MDB_TXN_FULL, and filling the per-env list returns MDB_MAP_FULL. |
5978 | * |
 * Reference tracking in a txn is imperfect; pages can linger with non-zero
5980 | * refcnt even without active references. It was deemed to be too invasive |
5981 | * to add unrefs in every required location. However, all pages are unref'd |
5982 | * at the end of the transaction. This guarantees that no stale references |
5983 | * linger in the per-env list. |
5984 | * |
5985 | * Usually we map chunks of 16 pages at a time, but if an overflow page begins |
5986 | * at the tail of the chunk we extend the chunk to include the entire overflow |
5987 | * page. Unfortunately, pages can be turned into overflow pages after their |
5988 | * chunk was already mapped. In that case we must remap the chunk if the |
5989 | * overflow page is referenced. If the chunk's refcnt is 0 we can just remap |
5990 | * it, otherwise we temporarily map a new chunk just for the overflow page. |
5991 | * |
5992 | * @note this chunk handling means we cannot guarantee that a data item |
5993 | * returned from the DB will stay alive for the duration of the transaction: |
 * - We unref pages as soon as a cursor moves away from the page.
 * - A subsequent op may cause a purge, which may unmap any unref'd chunks.
 * - The caller must copy the data if it must be used later in the same txn.
5997 | * |
5998 | * Also - our reference counting revolves around cursors, but overflow pages |
5999 | * aren't pointed to by a cursor's page stack. We have to remember them |
6000 | * explicitly, in the added mc_ovpg field. A single cursor can only hold a |
6001 | * reference to one overflow page at a time. |
6002 | * |
6003 | * @param[in] txn the transaction for this access. |
6004 | * @param[in] pgno the page number for the page to retrieve. |
6005 | * @param[out] ret address of a pointer where the page's address will be stored. |
6006 | * @return 0 on success, non-zero on failure. |
6007 | */ |
6008 | static int |
6009 | mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) |
6010 | { |
6011 | MDB_env *env = txn->mt_env; |
6012 | MDB_page *p; |
6013 | MDB_ID3L tl = txn->mt_rpages; |
6014 | MDB_ID3L el = env->me_rpages; |
6015 | MDB_ID3 id3; |
6016 | unsigned x, rem; |
6017 | pgno_t pgno; |
6018 | int rc, retries = 1; |
6019 | #ifdef _WIN32 |
6020 | LARGE_INTEGER off; |
6021 | SIZE_T len; |
6022 | #define SET_OFF(off,val) off.QuadPart = val |
6023 | #define MAP(rc,env,addr,len,off) \ |
6024 | addr = NULL; \ |
6025 | rc = NtMapViewOfSection(env->me_fmh, GetCurrentProcess(), &addr, 0, \ |
6026 | len, &off, &len, ViewUnmap, (env->me_flags & MDB_RDONLY) ? 0 : MEM_RESERVE, PAGE_READONLY); \ |
6027 | if (rc) rc = mdb_nt2win32(rc) |
6028 | #else |
6029 | off_t off; |
6030 | size_t len; |
6031 | #define SET_OFF(off,val) off = val |
6032 | #define MAP(rc,env,addr,len,off) \ |
6033 | addr = mmap(NULL, len, PROT_READ, MAP_SHARED, env->me_fd, off); \ |
6034 | rc = (addr == MAP_FAILED) ? errno : 0 |
6035 | #endif |
6036 | |
6037 | /* remember the offset of the actual page number, so we can |
6038 | * return the correct pointer at the end. |
6039 | */ |
6040 | rem = pg0 & (MDB_RPAGE_CHUNK-1); |
6041 | pgno = pg0 ^ rem; |
6042 | |
6043 | id3.mid = 0; |
6044 | x = mdb_mid3l_search(tl, pgno); |
6045 | if (x <= tl[0].mid && tl[x].mid == pgno) { |
6046 | if (x != tl[0].mid && tl[x+1].mid == pg0) |
6047 | x++; |
6048 | /* check for overflow size */ |
6049 | p = (MDB_page *)((char *)tl[x].mptr + rem * env->me_psize); |
6050 | if (IS_OVERFLOW(p) && p->mp_pages + rem > tl[x].mcnt) { |
6051 | id3.mcnt = p->mp_pages + rem; |
6052 | len = id3.mcnt * env->me_psize; |
6053 | SET_OFF(off, pgno * env->me_psize); |
6054 | MAP(rc, env, id3.mptr, len, off); |
6055 | if (rc) |
6056 | return rc; |
6057 | /* check for local-only page */ |
6058 | if (rem) { |
6059 | mdb_tassert(txn, tl[x].mid != pg0); |
6060 | /* hope there's room to insert this locally. |
6061 | * setting mid here tells later code to just insert |
6062 | * this id3 instead of searching for a match. |
6063 | */ |
6064 | id3.mid = pg0; |
6065 | goto notlocal; |
6066 | } else { |
6067 | /* ignore the mapping we got from env, use new one */ |
6068 | tl[x].mptr = id3.mptr; |
6069 | tl[x].mcnt = id3.mcnt; |
6070 | /* if no active ref, see if we can replace in env */ |
6071 | if (!tl[x].mref) { |
6072 | unsigned i; |
6073 | pthread_mutex_lock(&env->me_rpmutex); |
6074 | i = mdb_mid3l_search(el, tl[x].mid); |
6075 | if (el[i].mref == 1) { |
6076 | /* just us, replace it */ |
6077 | munmap(el[i].mptr, el[i].mcnt * env->me_psize); |
6078 | el[i].mptr = tl[x].mptr; |
6079 | el[i].mcnt = tl[x].mcnt; |
6080 | } else { |
6081 | /* there are others, remove ourself */ |
6082 | el[i].mref--; |
6083 | } |
6084 | pthread_mutex_unlock(&env->me_rpmutex); |
6085 | } |
6086 | } |
6087 | } |
6088 | id3.mptr = tl[x].mptr; |
6089 | id3.mcnt = tl[x].mcnt; |
6090 | tl[x].mref++; |
6091 | goto ok; |
6092 | } |
6093 | |
6094 | notlocal: |
6095 | if (tl[0].mid >= MDB_TRPAGE_MAX - txn->mt_rpcheck) { |
6096 | unsigned i, y; |
6097 | /* purge unref'd pages from our list and unref in env */ |
6098 | pthread_mutex_lock(&env->me_rpmutex); |
6099 | retry: |
6100 | y = 0; |
6101 | for (i=1; i<=tl[0].mid; i++) { |
6102 | if (!tl[i].mref) { |
6103 | if (!y) y = i; |
6104 | /* tmp overflow pages don't go to env */ |
6105 | if (tl[i].mid & (MDB_RPAGE_CHUNK-1)) { |
6106 | munmap(tl[i].mptr, tl[i].mcnt * env->me_psize); |
6107 | continue; |
6108 | } |
6109 | x = mdb_mid3l_search(el, tl[i].mid); |
6110 | el[x].mref--; |
6111 | } |
6112 | } |
6113 | pthread_mutex_unlock(&env->me_rpmutex); |
6114 | if (!y) { |
6115 | /* we didn't find any unref'd chunks. |
6116 | * if we're out of room, fail. |
6117 | */ |
6118 | if (tl[0].mid >= MDB_TRPAGE_MAX) |
6119 | return MDB_TXN_FULL; |
6120 | /* otherwise, raise threshold for next time around |
6121 | * and let this go. |
6122 | */ |
6123 | txn->mt_rpcheck /= 2; |
6124 | } else { |
6125 | /* we found some unused; consolidate the list */ |
6126 | for (i=y+1; i<= tl[0].mid; i++) |
6127 | if (tl[i].mref) |
6128 | tl[y++] = tl[i]; |
6129 | tl[0].mid = y-1; |
6130 | /* decrease the check threshold toward its original value */ |
6131 | if (!txn->mt_rpcheck) |
6132 | txn->mt_rpcheck = 1; |
6133 | while (txn->mt_rpcheck < tl[0].mid && txn->mt_rpcheck < MDB_TRPAGE_SIZE/2) |
6134 | txn->mt_rpcheck *= 2; |
6135 | } |
6136 | } |
6137 | if (tl[0].mid < MDB_TRPAGE_SIZE) { |
6138 | id3.mref = 1; |
6139 | if (id3.mid) |
6140 | goto found; |
6141 | /* don't map past last written page in read-only envs */ |
6142 | if ((env->me_flags & MDB_RDONLY) && pgno + MDB_RPAGE_CHUNK-1 > txn->mt_last_pgno) |
6143 | id3.mcnt = txn->mt_last_pgno + 1 - pgno; |
6144 | else |
6145 | id3.mcnt = MDB_RPAGE_CHUNK; |
6146 | len = id3.mcnt * env->me_psize; |
6147 | id3.mid = pgno; |
6148 | |
6149 | /* search for page in env */ |
6150 | pthread_mutex_lock(&env->me_rpmutex); |
6151 | x = mdb_mid3l_search(el, pgno); |
6152 | if (x <= el[0].mid && el[x].mid == pgno) { |
6153 | id3.mptr = el[x].mptr; |
6154 | id3.mcnt = el[x].mcnt; |
6155 | /* check for overflow size */ |
6156 | p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize); |
6157 | if (IS_OVERFLOW(p) && p->mp_pages + rem > id3.mcnt) { |
6158 | id3.mcnt = p->mp_pages + rem; |
6159 | len = id3.mcnt * env->me_psize; |
6160 | SET_OFF(off, pgno * env->me_psize); |
6161 | MAP(rc, env, id3.mptr, len, off); |
6162 | if (rc) |
6163 | goto fail; |
6164 | if (!el[x].mref) { |
6165 | munmap(el[x].mptr, env->me_psize * el[x].mcnt); |
6166 | el[x].mptr = id3.mptr; |
6167 | el[x].mcnt = id3.mcnt; |
6168 | } else { |
6169 | id3.mid = pg0; |
6170 | pthread_mutex_unlock(&env->me_rpmutex); |
6171 | goto found; |
6172 | } |
6173 | } |
6174 | el[x].mref++; |
6175 | pthread_mutex_unlock(&env->me_rpmutex); |
6176 | goto found; |
6177 | } |
6178 | if (el[0].mid >= MDB_ERPAGE_MAX - env->me_rpcheck) { |
6179 | /* purge unref'd pages */ |
6180 | unsigned i, y = 0; |
6181 | for (i=1; i<=el[0].mid; i++) { |
6182 | if (!el[i].mref) { |
6183 | if (!y) y = i; |
6184 | munmap(el[i].mptr, env->me_psize * el[i].mcnt); |
6185 | } |
6186 | } |
6187 | if (!y) { |
6188 | if (retries) { |
6189 | /* see if we can unref some local pages */ |
6190 | retries--; |
6191 | id3.mid = 0; |
6192 | goto retry; |
6193 | } |
6194 | if (el[0].mid >= MDB_ERPAGE_MAX) { |
6195 | pthread_mutex_unlock(&env->me_rpmutex); |
6196 | return MDB_MAP_FULL; |
6197 | } |
6198 | env->me_rpcheck /= 2; |
6199 | } else { |
6200 | for (i=y+1; i<= el[0].mid; i++) |
6201 | if (el[i].mref) |
6202 | el[y++] = el[i]; |
6203 | el[0].mid = y-1; |
6204 | if (!env->me_rpcheck) |
6205 | env->me_rpcheck = 1; |
6206 | while (env->me_rpcheck < el[0].mid && env->me_rpcheck < MDB_ERPAGE_SIZE/2) |
6207 | env->me_rpcheck *= 2; |
6208 | } |
6209 | } |
6210 | SET_OFF(off, pgno * env->me_psize); |
6211 | MAP(rc, env, id3.mptr, len, off); |
6212 | if (rc) { |
6213 | fail: |
6214 | pthread_mutex_unlock(&env->me_rpmutex); |
6215 | return rc; |
6216 | } |
6217 | /* check for overflow size */ |
6218 | p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize); |
6219 | if (IS_OVERFLOW(p) && p->mp_pages + rem > id3.mcnt) { |
6220 | id3.mcnt = p->mp_pages + rem; |
6221 | munmap(id3.mptr, len); |
6222 | len = id3.mcnt * env->me_psize; |
6223 | MAP(rc, env, id3.mptr, len, off); |
6224 | if (rc) |
6225 | goto fail; |
6226 | } |
6227 | mdb_mid3l_insert(el, &id3); |
6228 | pthread_mutex_unlock(&env->me_rpmutex); |
6229 | found: |
6230 | mdb_mid3l_insert(tl, &id3); |
6231 | } else { |
6232 | return MDB_TXN_FULL; |
6233 | } |
6234 | ok: |
6235 | p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize); |
#if MDB_DEBUG	/* the mapping already covers overflow pages; check only in debug builds */
6237 | if (IS_OVERFLOW(p)) { |
6238 | mdb_tassert(txn, p->mp_pages + rem <= id3.mcnt); |
6239 | } |
6240 | #endif |
6241 | *ret = p; |
6242 | return MDB_SUCCESS; |
6243 | } |
6244 | #endif |
6245 | |
6246 | /** Find the address of the page corresponding to a given page number. |
6247 | * Set #MDB_TXN_ERROR on failure. |
6248 | * @param[in] mc the cursor accessing the page. |
6249 | * @param[in] pgno the page number for the page to retrieve. |
6250 | * @param[out] ret address of a pointer where the page's address will be stored. |
6251 | * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. |
6252 | * @return 0 on success, non-zero on failure. |
6253 | */ |
6254 | static int |
6255 | mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) |
6256 | { |
6257 | MDB_txn *txn = mc->mc_txn; |
6258 | MDB_page *p = NULL; |
6259 | int level; |
6260 | |
6261 | if (! (mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP))) { |
6262 | MDB_txn *tx2 = txn; |
6263 | level = 1; |
6264 | do { |
6265 | MDB_ID2L dl = tx2->mt_u.dirty_list; |
6266 | unsigned x; |
6267 | /* Spilled pages were dirtied in this txn and flushed |
6268 | * because the dirty list got full. Bring this page |
6269 | * back in from the map (but don't unspill it here, |
6270 | * leave that unless page_touch happens again). |
6271 | */ |
6272 | if (tx2->mt_spill_pgs) { |
6273 | MDB_ID pn = pgno << 1; |
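				/* Spill list entries are page numbers shifted left by
				 * one; the low bit marks entries that have since been
				 * unspilled or freed, so they won't match pn here.
				 */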
6274 | x = mdb_midl_search(tx2->mt_spill_pgs, pn); |
6275 | if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { |
6276 | goto mapped; |
6277 | } |
6278 | } |
6279 | if (dl[0].mid) { |
6280 | unsigned x = mdb_mid2l_search(dl, pgno); |
6281 | if (x <= dl[0].mid && dl[x].mid == pgno) { |
6282 | p = dl[x].mptr; |
6283 | goto done; |
6284 | } |
6285 | } |
6286 | level++; |
6287 | } while ((tx2 = tx2->mt_parent) != NULL); |
6288 | } |
6289 | |
6290 | if (pgno >= txn->mt_next_pgno) { |
6291 | DPRINTF(("page %" Yu" not found" , pgno)); |
6292 | txn->mt_flags |= MDB_TXN_ERROR; |
6293 | return MDB_PAGE_NOTFOUND; |
6294 | } |
6295 | |
6296 | level = 0; |
6297 | |
6298 | mapped: |
6299 | { |
6300 | #ifdef MDB_VL32 |
6301 | int rc = mdb_rpage_get(txn, pgno, &p); |
6302 | if (rc) { |
6303 | txn->mt_flags |= MDB_TXN_ERROR; |
6304 | return rc; |
6305 | } |
6306 | #else |
6307 | MDB_env *env = txn->mt_env; |
6308 | p = (MDB_page *)(env->me_map + env->me_psize * pgno); |
6309 | #endif |
6310 | } |
6311 | |
6312 | done: |
6313 | *ret = p; |
6314 | if (lvl) |
6315 | *lvl = level; |
6316 | return MDB_SUCCESS; |
6317 | } |
6318 | |
6319 | /** Finish #mdb_page_search() / #mdb_page_search_lowest(). |
6320 | * The cursor is at the root page, set up the rest of it. |
6321 | */ |
6322 | static int |
6323 | mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) |
6324 | { |
6325 | MDB_page *mp = mc->mc_pg[mc->mc_top]; |
6326 | int rc; |
6327 | DKBUF; |
6328 | |
6329 | while (IS_BRANCH(mp)) { |
6330 | MDB_node *node; |
6331 | indx_t i; |
6332 | |
6333 | DPRINTF(("branch page %" Yu" has %u keys" , mp->mp_pgno, NUMKEYS(mp))); |
6334 | /* Don't assert on branch pages in the FreeDB. We can get here |
6335 | * while in the process of rebalancing a FreeDB branch page; we must |
6336 | * let that proceed. ITS#8336 |
6337 | */ |
6338 | mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); |
6339 | DPRINTF(("found index 0 to page %" Yu, NODEPGNO(NODEPTR(mp, 0)))); |
6340 | |
6341 | if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { |
6342 | i = 0; |
6343 | if (flags & MDB_PS_LAST) { |
6344 | i = NUMKEYS(mp) - 1; |
				/* if already init'd, see if we're already in the right place */
6346 | if (mc->mc_flags & C_INITIALIZED) { |
6347 | if (mc->mc_ki[mc->mc_top] == i) { |
6348 | mc->mc_top = mc->mc_snum++; |
6349 | mp = mc->mc_pg[mc->mc_top]; |
6350 | goto ready; |
6351 | } |
6352 | } |
6353 | } |
6354 | } else { |
6355 | int exact; |
6356 | node = mdb_node_search(mc, key, &exact); |
6357 | if (node == NULL) |
6358 | i = NUMKEYS(mp) - 1; |
6359 | else { |
6360 | i = mc->mc_ki[mc->mc_top]; |
6361 | if (!exact) { |
6362 | mdb_cassert(mc, i > 0); |
6363 | i--; |
6364 | } |
6365 | } |
6366 | DPRINTF(("following index %u for key [%s]" , i, DKEY(key))); |
6367 | } |
6368 | |
6369 | mdb_cassert(mc, i < NUMKEYS(mp)); |
6370 | node = NODEPTR(mp, i); |
6371 | |
6372 | if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) |
6373 | return rc; |
6374 | |
6375 | mc->mc_ki[mc->mc_top] = i; |
6376 | if ((rc = mdb_cursor_push(mc, mp))) |
6377 | return rc; |
6378 | |
6379 | ready: |
6380 | if (flags & MDB_PS_MODIFY) { |
6381 | if ((rc = mdb_page_touch(mc)) != 0) |
6382 | return rc; |
6383 | mp = mc->mc_pg[mc->mc_top]; |
6384 | } |
6385 | } |
6386 | |
6387 | if (!IS_LEAF(mp)) { |
6388 | DPRINTF(("internal error, index points to a %02X page!?" , |
6389 | mp->mp_flags)); |
6390 | mc->mc_txn->mt_flags |= MDB_TXN_ERROR; |
6391 | return MDB_CORRUPTED; |
6392 | } |
6393 | |
6394 | DPRINTF(("found leaf page %" Yu" for key [%s]" , mp->mp_pgno, |
6395 | key ? DKEY(key) : "null" )); |
6396 | mc->mc_flags |= C_INITIALIZED; |
6397 | mc->mc_flags &= ~C_EOF; |
6398 | |
6399 | return MDB_SUCCESS; |
6400 | } |
6401 | |
6402 | /** Search for the lowest key under the current branch page. |
6403 | * This just bypasses a NUMKEYS check in the current page |
6404 | * before calling mdb_page_search_root(), because the callers |
6405 | * are all in situations where the current page is known to |
6406 | * be underfilled. |
6407 | */ |
6408 | static int |
6409 | mdb_page_search_lowest(MDB_cursor *mc) |
6410 | { |
6411 | MDB_page *mp = mc->mc_pg[mc->mc_top]; |
6412 | MDB_node *node = NODEPTR(mp, 0); |
6413 | int rc; |
6414 | |
6415 | if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) |
6416 | return rc; |
6417 | |
6418 | mc->mc_ki[mc->mc_top] = 0; |
6419 | if ((rc = mdb_cursor_push(mc, mp))) |
6420 | return rc; |
6421 | return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); |
6422 | } |
6423 | |
6424 | /** Search for the page a given key should be in. |
6425 | * Push it and its parent pages on the cursor stack. |
6426 | * @param[in,out] mc the cursor for this operation. |
6427 | * @param[in] key the key to search for, or NULL for first/last page. |
6428 | * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB |
6429 | * are touched (updated with new page numbers). |
6430 | * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. |
6431 | * This is used by #mdb_cursor_first() and #mdb_cursor_last(). |
6432 | * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. |
6433 | * @return 0 on success, non-zero on failure. |
6434 | */ |
6435 | static int |
6436 | mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) |
6437 | { |
6438 | int rc; |
6439 | pgno_t root; |
6440 | |
6441 | /* Make sure the txn is still viable, then find the root from |
6442 | * the txn's db table and set it as the root of the cursor's stack. |
6443 | */ |
6444 | if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) { |
6445 | DPUTS("transaction may not be used now" ); |
6446 | return MDB_BAD_TXN; |
6447 | } else { |
6448 | /* Make sure we're using an up-to-date root */ |
6449 | if (*mc->mc_dbflag & DB_STALE) { |
6450 | MDB_cursor mc2; |
6451 | if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) |
6452 | return MDB_BAD_DBI; |
6453 | mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); |
6454 | rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); |
6455 | if (rc) |
6456 | return rc; |
6457 | { |
6458 | MDB_val data; |
6459 | int exact = 0; |
6460 | uint16_t flags; |
6461 | MDB_node *leaf = mdb_node_search(&mc2, |
6462 | &mc->mc_dbx->md_name, &exact); |
6463 | if (!exact) |
6464 | return MDB_NOTFOUND; |
6465 | if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) |
6466 | return MDB_INCOMPATIBLE; /* not a named DB */ |
6467 | rc = mdb_node_read(&mc2, leaf, &data); |
6468 | if (rc) |
6469 | return rc; |
6470 | memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), |
6471 | sizeof(uint16_t)); |
6472 | /* The txn may not know this DBI, or another process may |
6473 | * have dropped and recreated the DB with other flags. |
6474 | */ |
6475 | if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags) |
6476 | return MDB_INCOMPATIBLE; |
6477 | memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); |
6478 | } |
6479 | *mc->mc_dbflag &= ~DB_STALE; |
6480 | } |
6481 | root = mc->mc_db->md_root; |
6482 | |
6483 | if (root == P_INVALID) { /* Tree is empty. */ |
6484 | DPUTS("tree is empty" ); |
6485 | return MDB_NOTFOUND; |
6486 | } |
6487 | } |
6488 | |
6489 | mdb_cassert(mc, root > 1); |
6490 | if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) { |
6491 | #ifdef MDB_VL32 |
6492 | if (mc->mc_pg[0]) |
6493 | MDB_PAGE_UNREF(mc->mc_txn, mc->mc_pg[0]); |
6494 | #endif |
6495 | if ((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0) |
6496 | return rc; |
6497 | } |
6498 | |
6499 | #ifdef MDB_VL32 |
6500 | { |
6501 | int i; |
6502 | for (i=1; i<mc->mc_snum; i++) |
6503 | MDB_PAGE_UNREF(mc->mc_txn, mc->mc_pg[i]); |
6504 | } |
6505 | #endif |
6506 | mc->mc_snum = 1; |
6507 | mc->mc_top = 0; |
6508 | |
6509 | DPRINTF(("db %d root page %" Yu" has flags 0x%X" , |
6510 | DDBI(mc), root, mc->mc_pg[0]->mp_flags)); |
6511 | |
6512 | if (flags & MDB_PS_MODIFY) { |
6513 | if ((rc = mdb_page_touch(mc))) |
6514 | return rc; |
6515 | } |
6516 | |
6517 | if (flags & MDB_PS_ROOTONLY) |
6518 | return MDB_SUCCESS; |
6519 | |
6520 | return mdb_page_search_root(mc, key, flags); |
6521 | } |
6522 | |
6523 | static int |
6524 | mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) |
6525 | { |
6526 | MDB_txn *txn = mc->mc_txn; |
6527 | pgno_t pg = mp->mp_pgno; |
6528 | unsigned x = 0, ovpages = mp->mp_pages; |
6529 | MDB_env *env = txn->mt_env; |
6530 | MDB_IDL sl = txn->mt_spill_pgs; |
6531 | MDB_ID pn = pg << 1; |
6532 | int rc; |
6533 | |
6534 | DPRINTF(("free ov page %" Yu" (%d)" , pg, ovpages)); |
6535 | /* If the page is dirty or on the spill list we just acquired it, |
6536 | * so we should give it back to our current free list, if any. |
6537 | * Otherwise put it onto the list of pages we freed in this txn. |
6538 | * |
6539 | * Won't create me_pghead: me_pglast must be inited along with it. |
6540 | * Unsupported in nested txns: They would need to hide the page |
6541 | * range in ancestor txns' dirty and spilled lists. |
6542 | */ |
6543 | if (env->me_pghead && |
6544 | !txn->mt_parent && |
6545 | ((mp->mp_flags & P_DIRTY) || |
6546 | (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) |
6547 | { |
6548 | unsigned i, j; |
6549 | pgno_t *mop; |
6550 | MDB_ID2 *dl, ix, iy; |
6551 | rc = mdb_midl_need(&env->me_pghead, ovpages); |
6552 | if (rc) |
6553 | return rc; |
6554 | if (!(mp->mp_flags & P_DIRTY)) { |
6555 | /* This page is no longer spilled */ |
6556 | if (x == sl[0]) |
6557 | sl[0]--; |
6558 | else |
6559 | sl[x] |= 1; |
6560 | goto release; |
6561 | } |
6562 | /* Remove from dirty list */ |
6563 | dl = txn->mt_u.dirty_list; |
6564 | x = dl[0].mid--; |
6565 | for (ix = dl[x]; ix.mptr != mp; ix = iy) { |
6566 | if (x > 1) { |
6567 | x--; |
6568 | iy = dl[x]; |
6569 | dl[x] = ix; |
6570 | } else { |
6571 | mdb_cassert(mc, x > 1); |
6572 | j = ++(dl[0].mid); |
6573 | dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */ |
6574 | txn->mt_flags |= MDB_TXN_ERROR; |
6575 | return MDB_PROBLEM; |
6576 | } |
6577 | } |
6578 | txn->mt_dirty_room++; |
6579 | if (!(env->me_flags & MDB_WRITEMAP)) |
6580 | mdb_dpage_free(env, mp); |
6581 | release: |
		/* Insert in me_pghead: shift the entries that belong after the
		 * insertion point, then fill the gap with the run of page
		 * numbers pg .. pg+ovpages-1, keeping the list sorted.
		 */
6583 | mop = env->me_pghead; |
6584 | j = mop[0] + ovpages; |
6585 | for (i = mop[0]; i && mop[i] < pg; i--) |
6586 | mop[j--] = mop[i]; |
6587 | while (j>i) |
6588 | mop[j--] = pg++; |
6589 | mop[0] += ovpages; |
6590 | } else { |
6591 | rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); |
6592 | if (rc) |
6593 | return rc; |
6594 | } |
6595 | #ifdef MDB_VL32 |
6596 | if (mc->mc_ovpg == mp) |
6597 | mc->mc_ovpg = NULL; |
6598 | #endif |
6599 | mc->mc_db->md_overflow_pages -= ovpages; |
6600 | return 0; |
6601 | } |
6602 | |
6603 | /** Return the data associated with a given node. |
6604 | * @param[in] mc The cursor for this operation. |
6605 | * @param[in] leaf The node being read. |
6606 | * @param[out] data Updated to point to the node's data. |
6607 | * @return 0 on success, non-zero on failure. |
6608 | */ |
6609 | static int |
6610 | mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) |
6611 | { |
6612 | MDB_page *omp; /* overflow page */ |
6613 | pgno_t pgno; |
6614 | int rc; |
6615 | |
6616 | if (MC_OVPG(mc)) { |
6617 | MDB_PAGE_UNREF(mc->mc_txn, MC_OVPG(mc)); |
6618 | MC_SET_OVPG(mc, NULL); |
6619 | } |
6620 | if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { |
6621 | data->mv_size = NODEDSZ(leaf); |
6622 | data->mv_data = NODEDATA(leaf); |
6623 | return MDB_SUCCESS; |
6624 | } |
6625 | |
6626 | /* Read overflow data. |
6627 | */ |
6628 | data->mv_size = NODEDSZ(leaf); |
6629 | memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); |
6630 | if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) { |
6631 | DPRINTF(("read overflow page %" Yu" failed" , pgno)); |
6632 | return rc; |
6633 | } |
6634 | data->mv_data = METADATA(omp); |
6635 | MC_SET_OVPG(mc, omp); |
6636 | |
6637 | return MDB_SUCCESS; |
6638 | } |
6639 | |
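/* Illustrative sketch, not part of the library: a typical read using
 * mdb_get() against an already-opened environment. "env" and the key
 * text are placeholders supplied by the caller. The returned data points
 * into the map, so it is only valid until the transaction ends (and under
 * MDB_VL32 the pages are unref'd before mdb_get() returns), hence the
 * caller should copy the value if it is needed later.
 *
 *	MDB_txn *txn;
 *	MDB_dbi dbi;
 *	MDB_val key, data;
 *	int rc;
 *
 *	if ((rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)) != MDB_SUCCESS)
 *		return rc;
 *	if ((rc = mdb_dbi_open(txn, NULL, 0, &dbi)) == MDB_SUCCESS) {
 *		key.mv_size = sizeof("somekey") - 1;
 *		key.mv_data = "somekey";
 *		rc = mdb_get(txn, dbi, &key, &data);
 *	}
 *	mdb_txn_abort(txn);
 *	return rc;
 */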
6640 | int |
6641 | mdb_get(MDB_txn *txn, MDB_dbi dbi, |
6642 | MDB_val *key, MDB_val *data) |
6643 | { |
6644 | MDB_cursor mc; |
6645 | MDB_xcursor mx; |
6646 | int exact = 0, rc; |
6647 | DKBUF; |
6648 | |
6649 | DPRINTF(("===> get db %u key [%s]" , dbi, DKEY(key))); |
6650 | |
6651 | if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
6652 | return EINVAL; |
6653 | |
6654 | if (txn->mt_flags & MDB_TXN_BLOCKED) |
6655 | return MDB_BAD_TXN; |
6656 | |
6657 | mdb_cursor_init(&mc, txn, dbi, &mx); |
6658 | rc = mdb_cursor_set(&mc, key, data, MDB_SET, &exact); |
6659 | /* unref all the pages when MDB_VL32 - caller must copy the data |
6660 | * before doing anything else |
6661 | */ |
6662 | MDB_CURSOR_UNREF(&mc, 1); |
6663 | return rc; |
6664 | } |
6665 | |
6666 | /** Find a sibling for a page. |
6667 | * Replaces the page at the top of the cursor's stack with the |
6668 | * specified sibling, if one exists. |
6669 | * @param[in] mc The cursor for this operation. |
6670 | * @param[in] move_right Non-zero if the right sibling is requested, |
6671 | * otherwise the left sibling. |
6672 | * @return 0 on success, non-zero on failure. |
6673 | */ |
6674 | static int |
6675 | mdb_cursor_sibling(MDB_cursor *mc, int move_right) |
6676 | { |
6677 | int rc; |
6678 | MDB_node *indx; |
6679 | MDB_page *mp; |
6680 | #ifdef MDB_VL32 |
6681 | MDB_page *op; |
6682 | #endif |
6683 | |
6684 | if (mc->mc_snum < 2) { |
6685 | return MDB_NOTFOUND; /* root has no siblings */ |
6686 | } |
6687 | |
6688 | #ifdef MDB_VL32 |
6689 | op = mc->mc_pg[mc->mc_top]; |
6690 | #endif |
6691 | mdb_cursor_pop(mc); |
6692 | DPRINTF(("parent page is page %" Yu", index %u" , |
6693 | mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); |
6694 | |
6695 | if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) |
6696 | : (mc->mc_ki[mc->mc_top] == 0)) { |
6697 | DPRINTF(("no more keys left, moving to %s sibling" , |
6698 | move_right ? "right" : "left" )); |
6699 | if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) { |
6700 | /* undo cursor_pop before returning */ |
6701 | mc->mc_top++; |
6702 | mc->mc_snum++; |
6703 | return rc; |
6704 | } |
6705 | } else { |
6706 | if (move_right) |
6707 | mc->mc_ki[mc->mc_top]++; |
6708 | else |
6709 | mc->mc_ki[mc->mc_top]--; |
6710 | DPRINTF(("just moving to %s index key %u" , |
6711 | move_right ? "right" : "left" , mc->mc_ki[mc->mc_top])); |
6712 | } |
6713 | mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); |
6714 | |
6715 | MDB_PAGE_UNREF(mc->mc_txn, op); |
6716 | |
6717 | indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
6718 | if ((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) { |
6719 | /* mc will be inconsistent if caller does mc_snum++ as above */ |
6720 | mc->mc_flags &= ~(C_INITIALIZED|C_EOF); |
6721 | return rc; |
6722 | } |
6723 | |
6724 | mdb_cursor_push(mc, mp); |
6725 | if (!move_right) |
6726 | mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; |
6727 | |
6728 | return MDB_SUCCESS; |
6729 | } |
6730 | |
6731 | /** Move the cursor to the next data item. */ |
6732 | static int |
6733 | mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) |
6734 | { |
6735 | MDB_page *mp; |
6736 | MDB_node *leaf; |
6737 | int rc; |
6738 | |
6739 | if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP)) |
6740 | return MDB_NOTFOUND; |
6741 | |
6742 | if (!(mc->mc_flags & C_INITIALIZED)) |
6743 | return mdb_cursor_first(mc, key, data); |
6744 | |
6745 | mp = mc->mc_pg[mc->mc_top]; |
6746 | |
6747 | if (mc->mc_flags & C_EOF) { |
6748 | if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1) |
6749 | return MDB_NOTFOUND; |
6750 | mc->mc_flags ^= C_EOF; |
6751 | } |
6752 | |
6753 | if (mc->mc_db->md_flags & MDB_DUPSORT) { |
6754 | leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
6755 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
6756 | if (op == MDB_NEXT || op == MDB_NEXT_DUP) { |
6757 | rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); |
6758 | if (op != MDB_NEXT || rc != MDB_NOTFOUND) { |
6759 | if (rc == MDB_SUCCESS) |
6760 | MDB_GET_KEY(leaf, key); |
6761 | return rc; |
6762 | } |
6763 | } |
6764 | else { |
6765 | MDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0); |
6766 | } |
6767 | } else { |
6768 | mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); |
6769 | if (op == MDB_NEXT_DUP) |
6770 | return MDB_NOTFOUND; |
6771 | } |
6772 | } |
6773 | |
6774 | DPRINTF(("cursor_next: top page is %" Yu" in cursor %p" , |
6775 | mdb_dbg_pgno(mp), (void *) mc)); |
6776 | if (mc->mc_flags & C_DEL) { |
6777 | mc->mc_flags ^= C_DEL; |
6778 | goto skip; |
6779 | } |
6780 | |
6781 | if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { |
6782 | DPUTS("=====> move to next sibling page" ); |
6783 | if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { |
6784 | mc->mc_flags |= C_EOF; |
6785 | return rc; |
6786 | } |
6787 | mp = mc->mc_pg[mc->mc_top]; |
6788 | DPRINTF(("next page is %" Yu", key index %u" , mp->mp_pgno, mc->mc_ki[mc->mc_top])); |
6789 | } else |
6790 | mc->mc_ki[mc->mc_top]++; |
6791 | |
6792 | skip: |
6793 | DPRINTF(("==> cursor points to page %" Yu" with %u keys, key index %u" , |
6794 | mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); |
6795 | |
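	/* LEAF2 pages (used for MDB_DUPFIXED sub-pages and sub-DBs) pack
	 * fixed-size keys with no node headers and carry no separate data;
	 * md_pad holds the key size.
	 */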
6796 | if (IS_LEAF2(mp)) { |
6797 | key->mv_size = mc->mc_db->md_pad; |
6798 | key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); |
6799 | return MDB_SUCCESS; |
6800 | } |
6801 | |
6802 | mdb_cassert(mc, IS_LEAF(mp)); |
6803 | leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
6804 | |
6805 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
6806 | mdb_xcursor_init1(mc, leaf); |
6807 | } |
6808 | if (data) { |
6809 | if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) |
6810 | return rc; |
6811 | |
6812 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
6813 | rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); |
6814 | if (rc != MDB_SUCCESS) |
6815 | return rc; |
6816 | } |
6817 | } |
6818 | |
6819 | MDB_GET_KEY(leaf, key); |
6820 | return MDB_SUCCESS; |
6821 | } |
6822 | |
6823 | /** Move the cursor to the previous data item. */ |
6824 | static int |
6825 | mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) |
6826 | { |
6827 | MDB_page *mp; |
6828 | MDB_node *leaf; |
6829 | int rc; |
6830 | |
6831 | if (!(mc->mc_flags & C_INITIALIZED)) { |
6832 | rc = mdb_cursor_last(mc, key, data); |
6833 | if (rc) |
6834 | return rc; |
6835 | mc->mc_ki[mc->mc_top]++; |
6836 | } |
6837 | |
6838 | mp = mc->mc_pg[mc->mc_top]; |
6839 | |
6840 | if (mc->mc_db->md_flags & MDB_DUPSORT) { |
6841 | leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
6842 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
6843 | if (op == MDB_PREV || op == MDB_PREV_DUP) { |
6844 | rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); |
6845 | if (op != MDB_PREV || rc != MDB_NOTFOUND) { |
6846 | if (rc == MDB_SUCCESS) { |
6847 | MDB_GET_KEY(leaf, key); |
6848 | mc->mc_flags &= ~C_EOF; |
6849 | } |
6850 | return rc; |
6851 | } |
6852 | } |
6853 | else { |
6854 | MDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0); |
6855 | } |
6856 | } else { |
6857 | mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); |
6858 | if (op == MDB_PREV_DUP) |
6859 | return MDB_NOTFOUND; |
6860 | } |
6861 | } |
6862 | |
6863 | DPRINTF(("cursor_prev: top page is %" Yu" in cursor %p" , |
6864 | mdb_dbg_pgno(mp), (void *) mc)); |
6865 | |
6866 | mc->mc_flags &= ~(C_EOF|C_DEL); |
6867 | |
6868 | if (mc->mc_ki[mc->mc_top] == 0) { |
6869 | DPUTS("=====> move to prev sibling page" ); |
6870 | if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { |
6871 | return rc; |
6872 | } |
6873 | mp = mc->mc_pg[mc->mc_top]; |
6874 | mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; |
6875 | DPRINTF(("prev page is %" Yu", key index %u" , mp->mp_pgno, mc->mc_ki[mc->mc_top])); |
6876 | } else |
6877 | mc->mc_ki[mc->mc_top]--; |
6878 | |
6879 | DPRINTF(("==> cursor points to page %" Yu" with %u keys, key index %u" , |
6880 | mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); |
6881 | |
6882 | if (IS_LEAF2(mp)) { |
6883 | key->mv_size = mc->mc_db->md_pad; |
6884 | key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); |
6885 | return MDB_SUCCESS; |
6886 | } |
6887 | |
6888 | mdb_cassert(mc, IS_LEAF(mp)); |
6889 | leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
6890 | |
6891 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
6892 | mdb_xcursor_init1(mc, leaf); |
6893 | } |
6894 | if (data) { |
6895 | if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) |
6896 | return rc; |
6897 | |
6898 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
6899 | rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); |
6900 | if (rc != MDB_SUCCESS) |
6901 | return rc; |
6902 | } |
6903 | } |
6904 | |
6905 | MDB_GET_KEY(leaf, key); |
6906 | return MDB_SUCCESS; |
6907 | } |
6908 | |
6909 | /** Set the cursor on a specific data item. */ |
6910 | static int |
6911 | mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, |
6912 | MDB_cursor_op op, int *exactp) |
6913 | { |
6914 | int rc; |
6915 | MDB_page *mp; |
6916 | MDB_node *leaf = NULL; |
6917 | DKBUF; |
6918 | |
6919 | if (key->mv_size == 0) |
6920 | return MDB_BAD_VALSIZE; |
6921 | |
6922 | if (mc->mc_xcursor) { |
6923 | MDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0); |
6924 | mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); |
6925 | } |
6926 | |
6927 | /* See if we're already on the right page */ |
6928 | if (mc->mc_flags & C_INITIALIZED) { |
6929 | MDB_val nodekey; |
6930 | |
6931 | mp = mc->mc_pg[mc->mc_top]; |
6932 | if (!NUMKEYS(mp)) { |
6933 | mc->mc_ki[mc->mc_top] = 0; |
6934 | return MDB_NOTFOUND; |
6935 | } |
6936 | if (mp->mp_flags & P_LEAF2) { |
6937 | nodekey.mv_size = mc->mc_db->md_pad; |
6938 | nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); |
6939 | } else { |
6940 | leaf = NODEPTR(mp, 0); |
6941 | MDB_GET_KEY2(leaf, nodekey); |
6942 | } |
6943 | rc = mc->mc_dbx->md_cmp(key, &nodekey); |
6944 | if (rc == 0) { |
			/* Probably happens rarely, but the first node on the page
6946 | * was the one we wanted. |
6947 | */ |
6948 | mc->mc_ki[mc->mc_top] = 0; |
6949 | if (exactp) |
6950 | *exactp = 1; |
6951 | goto set1; |
6952 | } |
6953 | if (rc > 0) { |
6954 | unsigned int i; |
6955 | unsigned int nkeys = NUMKEYS(mp); |
6956 | if (nkeys > 1) { |
6957 | if (mp->mp_flags & P_LEAF2) { |
6958 | nodekey.mv_data = LEAF2KEY(mp, |
6959 | nkeys-1, nodekey.mv_size); |
6960 | } else { |
6961 | leaf = NODEPTR(mp, nkeys-1); |
6962 | MDB_GET_KEY2(leaf, nodekey); |
6963 | } |
6964 | rc = mc->mc_dbx->md_cmp(key, &nodekey); |
6965 | if (rc == 0) { |
6966 | /* last node was the one we wanted */ |
6967 | mc->mc_ki[mc->mc_top] = nkeys-1; |
6968 | if (exactp) |
6969 | *exactp = 1; |
6970 | goto set1; |
6971 | } |
6972 | if (rc < 0) { |
6973 | if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { |
6974 | /* This is definitely the right page, skip search_page */ |
6975 | if (mp->mp_flags & P_LEAF2) { |
6976 | nodekey.mv_data = LEAF2KEY(mp, |
6977 | mc->mc_ki[mc->mc_top], nodekey.mv_size); |
6978 | } else { |
6979 | leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
6980 | MDB_GET_KEY2(leaf, nodekey); |
6981 | } |
6982 | rc = mc->mc_dbx->md_cmp(key, &nodekey); |
6983 | if (rc == 0) { |
6984 | /* current node was the one we wanted */ |
6985 | if (exactp) |
6986 | *exactp = 1; |
6987 | goto set1; |
6988 | } |
6989 | } |
6990 | rc = 0; |
6991 | mc->mc_flags &= ~C_EOF; |
6992 | goto set2; |
6993 | } |
6994 | } |
6995 | /* If any parents have right-sibs, search. |
6996 | * Otherwise, there's nothing further. |
6997 | */ |
6998 | for (i=0; i<mc->mc_top; i++) |
6999 | if (mc->mc_ki[i] < |
7000 | NUMKEYS(mc->mc_pg[i])-1) |
7001 | break; |
7002 | if (i == mc->mc_top) { |
7003 | /* There are no other pages */ |
7004 | mc->mc_ki[mc->mc_top] = nkeys; |
7005 | return MDB_NOTFOUND; |
7006 | } |
7007 | } |
7008 | if (!mc->mc_top) { |
7009 | /* There are no other pages */ |
7010 | mc->mc_ki[mc->mc_top] = 0; |
7011 | if (op == MDB_SET_RANGE && !exactp) { |
7012 | rc = 0; |
7013 | goto set1; |
7014 | } else |
7015 | return MDB_NOTFOUND; |
7016 | } |
7017 | } else { |
7018 | mc->mc_pg[0] = 0; |
7019 | } |
7020 | |
7021 | rc = mdb_page_search(mc, key, 0); |
7022 | if (rc != MDB_SUCCESS) |
7023 | return rc; |
7024 | |
7025 | mp = mc->mc_pg[mc->mc_top]; |
7026 | mdb_cassert(mc, IS_LEAF(mp)); |
7027 | |
7028 | set2: |
7029 | leaf = mdb_node_search(mc, key, exactp); |
7030 | if (exactp != NULL && !*exactp) { |
7031 | /* MDB_SET specified and not an exact match. */ |
7032 | return MDB_NOTFOUND; |
7033 | } |
7034 | |
7035 | if (leaf == NULL) { |
7036 | DPUTS("===> inexact leaf not found, goto sibling" ); |
7037 | if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { |
7038 | mc->mc_flags |= C_EOF; |
7039 | return rc; /* no entries matched */ |
7040 | } |
7041 | mp = mc->mc_pg[mc->mc_top]; |
7042 | mdb_cassert(mc, IS_LEAF(mp)); |
7043 | leaf = NODEPTR(mp, 0); |
7044 | } |
7045 | |
7046 | set1: |
7047 | mc->mc_flags |= C_INITIALIZED; |
7048 | mc->mc_flags &= ~C_EOF; |
7049 | |
7050 | if (IS_LEAF2(mp)) { |
7051 | if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { |
7052 | key->mv_size = mc->mc_db->md_pad; |
7053 | key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); |
7054 | } |
7055 | return MDB_SUCCESS; |
7056 | } |
7057 | |
7058 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
7059 | mdb_xcursor_init1(mc, leaf); |
7060 | } |
7061 | if (data) { |
7062 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
7063 | if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { |
7064 | rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); |
7065 | } else { |
7066 | int ex2, *ex2p; |
7067 | if (op == MDB_GET_BOTH) { |
7068 | ex2p = &ex2; |
7069 | ex2 = 0; |
7070 | } else { |
7071 | ex2p = NULL; |
7072 | } |
7073 | rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); |
7074 | if (rc != MDB_SUCCESS) |
7075 | return rc; |
7076 | } |
7077 | } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { |
7078 | MDB_val olddata; |
7079 | MDB_cmp_func *dcmp; |
7080 | if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS) |
7081 | return rc; |
7082 | dcmp = mc->mc_dbx->md_dcmp; |
7083 | if (NEED_CMP_CLONG(dcmp, olddata.mv_size)) |
7084 | dcmp = mdb_cmp_clong; |
7085 | rc = dcmp(data, &olddata); |
7086 | if (rc) { |
7087 | if (op == MDB_GET_BOTH || rc > 0) |
7088 | return MDB_NOTFOUND; |
7089 | rc = 0; |
7090 | } |
7091 | *data = olddata; |
7092 | |
7093 | } else { |
7094 | if (mc->mc_xcursor) |
7095 | mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); |
7096 | if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) |
7097 | return rc; |
7098 | } |
7099 | } |
7100 | |
7101 | /* The key already matches in all other cases */ |
7102 | if (op == MDB_SET_RANGE || op == MDB_SET_KEY) |
7103 | MDB_GET_KEY(leaf, key); |
7104 | DPRINTF(("==> cursor placed on key [%s]" , DKEY(key))); |
7105 | |
7106 | return rc; |
7107 | } |
7108 | |
7109 | /** Move the cursor to the first item in the database. */ |
7110 | static int |
7111 | mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) |
7112 | { |
7113 | int rc; |
7114 | MDB_node *leaf; |
7115 | |
7116 | if (mc->mc_xcursor) { |
7117 | MDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0); |
7118 | mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); |
7119 | } |
7120 | |
7121 | if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { |
7122 | rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); |
7123 | if (rc != MDB_SUCCESS) |
7124 | return rc; |
7125 | } |
7126 | mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); |
7127 | |
7128 | leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); |
7129 | mc->mc_flags |= C_INITIALIZED; |
7130 | mc->mc_flags &= ~C_EOF; |
7131 | |
7132 | mc->mc_ki[mc->mc_top] = 0; |
7133 | |
7134 | if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { |
7135 | key->mv_size = mc->mc_db->md_pad; |
7136 | key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); |
7137 | return MDB_SUCCESS; |
7138 | } |
7139 | |
7140 | if (data) { |
7141 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
7142 | mdb_xcursor_init1(mc, leaf); |
7143 | rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); |
7144 | if (rc) |
7145 | return rc; |
7146 | } else { |
7147 | if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) |
7148 | return rc; |
7149 | } |
7150 | } |
7151 | MDB_GET_KEY(leaf, key); |
7152 | return MDB_SUCCESS; |
7153 | } |
7154 | |
7155 | /** Move the cursor to the last item in the database. */ |
7156 | static int |
7157 | mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) |
7158 | { |
7159 | int rc; |
7160 | MDB_node *leaf; |
7161 | |
7162 | if (mc->mc_xcursor) { |
7163 | MDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0); |
7164 | mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); |
7165 | } |
7166 | |
7167 | if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { |
7168 | rc = mdb_page_search(mc, NULL, MDB_PS_LAST); |
7169 | if (rc != MDB_SUCCESS) |
7170 | return rc; |
7171 | } |
7172 | mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); |
7173 | |
7174 | mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; |
7175 | mc->mc_flags |= C_INITIALIZED|C_EOF; |
7176 | leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
7177 | |
7178 | if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { |
7179 | key->mv_size = mc->mc_db->md_pad; |
7180 | key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); |
7181 | return MDB_SUCCESS; |
7182 | } |
7183 | |
7184 | if (data) { |
7185 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
7186 | mdb_xcursor_init1(mc, leaf); |
7187 | rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); |
7188 | if (rc) |
7189 | return rc; |
7190 | } else { |
7191 | if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) |
7192 | return rc; |
7193 | } |
7194 | } |
7195 | |
7196 | MDB_GET_KEY(leaf, key); |
7197 | return MDB_SUCCESS; |
7198 | } |
7199 | |
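/* Illustrative sketch, not part of the library: iterating a database with
 * a cursor, assuming "txn" and "dbi" were prepared by the caller and
 * use() stands for whatever the application does with each pair. MDB_NEXT
 * on a freshly opened cursor behaves like MDB_FIRST, so the loop visits
 * every item and ends with MDB_NOTFOUND.
 *
 *	MDB_cursor *cursor;
 *	MDB_val key, data;
 *	int rc;
 *
 *	if ((rc = mdb_cursor_open(txn, dbi, &cursor)) != MDB_SUCCESS)
 *		return rc;
 *	while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == MDB_SUCCESS)
 *		use(&key, &data);
 *	if (rc == MDB_NOTFOUND)
 *		rc = MDB_SUCCESS;
 *	mdb_cursor_close(cursor);
 *	return rc;
 */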
7200 | int |
7201 | mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, |
7202 | MDB_cursor_op op) |
7203 | { |
7204 | int rc; |
7205 | int exact = 0; |
7206 | int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); |
7207 | |
7208 | if (mc == NULL) |
7209 | return EINVAL; |
7210 | |
7211 | if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) |
7212 | return MDB_BAD_TXN; |
7213 | |
7214 | switch (op) { |
7215 | case MDB_GET_CURRENT: |
7216 | if (!(mc->mc_flags & C_INITIALIZED)) { |
7217 | rc = EINVAL; |
7218 | } else { |
7219 | MDB_page *mp = mc->mc_pg[mc->mc_top]; |
7220 | int nkeys = NUMKEYS(mp); |
7221 | if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { |
7222 | mc->mc_ki[mc->mc_top] = nkeys; |
7223 | rc = MDB_NOTFOUND; |
7224 | break; |
7225 | } |
7226 | rc = MDB_SUCCESS; |
7227 | if (IS_LEAF2(mp)) { |
7228 | key->mv_size = mc->mc_db->md_pad; |
7229 | key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); |
7230 | } else { |
7231 | MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
7232 | MDB_GET_KEY(leaf, key); |
7233 | if (data) { |
7234 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
7235 | rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); |
7236 | } else { |
7237 | rc = mdb_node_read(mc, leaf, data); |
7238 | } |
7239 | } |
7240 | } |
7241 | } |
7242 | break; |
7243 | case MDB_GET_BOTH: |
7244 | case MDB_GET_BOTH_RANGE: |
7245 | if (data == NULL) { |
7246 | rc = EINVAL; |
7247 | break; |
7248 | } |
7249 | if (mc->mc_xcursor == NULL) { |
7250 | rc = MDB_INCOMPATIBLE; |
7251 | break; |
7252 | } |
7253 | /* FALLTHRU */ |
7254 | case MDB_SET: |
7255 | case MDB_SET_KEY: |
7256 | case MDB_SET_RANGE: |
7257 | if (key == NULL) { |
7258 | rc = EINVAL; |
7259 | } else { |
7260 | rc = mdb_cursor_set(mc, key, data, op, |
7261 | op == MDB_SET_RANGE ? NULL : &exact); |
7262 | } |
7263 | break; |
7264 | case MDB_GET_MULTIPLE: |
7265 | if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { |
7266 | rc = EINVAL; |
7267 | break; |
7268 | } |
7269 | if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { |
7270 | rc = MDB_INCOMPATIBLE; |
7271 | break; |
7272 | } |
7273 | rc = MDB_SUCCESS; |
7274 | if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || |
7275 | (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) |
7276 | break; |
7277 | goto fetchm; |
7278 | case MDB_NEXT_MULTIPLE: |
7279 | if (data == NULL) { |
7280 | rc = EINVAL; |
7281 | break; |
7282 | } |
7283 | if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { |
7284 | rc = MDB_INCOMPATIBLE; |
7285 | break; |
7286 | } |
7287 | rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); |
7288 | if (rc == MDB_SUCCESS) { |
7289 | if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { |
7290 | MDB_cursor *mx; |
7291 | fetchm: |
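			/* Hand back the whole run of fixed-size duplicates on the
			 * current sub-page in one chunk: mv_size covers all of them
			 * and mv_data points at the first item.
			 */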
7292 | mx = &mc->mc_xcursor->mx_cursor; |
7293 | data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * |
7294 | mx->mc_db->md_pad; |
7295 | data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); |
7296 | mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; |
7297 | } else { |
7298 | rc = MDB_NOTFOUND; |
7299 | } |
7300 | } |
7301 | break; |
7302 | case MDB_PREV_MULTIPLE: |
7303 | if (data == NULL) { |
7304 | rc = EINVAL; |
7305 | break; |
7306 | } |
7307 | if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { |
7308 | rc = MDB_INCOMPATIBLE; |
7309 | break; |
7310 | } |
7311 | if (!(mc->mc_flags & C_INITIALIZED)) |
7312 | rc = mdb_cursor_last(mc, key, data); |
7313 | else |
7314 | rc = MDB_SUCCESS; |
7315 | if (rc == MDB_SUCCESS) { |
7316 | MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; |
7317 | if (mx->mc_flags & C_INITIALIZED) { |
7318 | rc = mdb_cursor_sibling(mx, 0); |
7319 | if (rc == MDB_SUCCESS) |
7320 | goto fetchm; |
7321 | } else { |
7322 | rc = MDB_NOTFOUND; |
7323 | } |
7324 | } |
7325 | break; |
7326 | case MDB_NEXT: |
7327 | case MDB_NEXT_DUP: |
7328 | case MDB_NEXT_NODUP: |
7329 | rc = mdb_cursor_next(mc, key, data, op); |
7330 | break; |
7331 | case MDB_PREV: |
7332 | case MDB_PREV_DUP: |
7333 | case MDB_PREV_NODUP: |
7334 | rc = mdb_cursor_prev(mc, key, data, op); |
7335 | break; |
7336 | case MDB_FIRST: |
7337 | rc = mdb_cursor_first(mc, key, data); |
7338 | break; |
7339 | case MDB_FIRST_DUP: |
7340 | mfunc = mdb_cursor_first; |
7341 | mmove: |
7342 | if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { |
7343 | rc = EINVAL; |
7344 | break; |
7345 | } |
7346 | if (mc->mc_xcursor == NULL) { |
7347 | rc = MDB_INCOMPATIBLE; |
7348 | break; |
7349 | } |
7350 | if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) { |
7351 | mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); |
7352 | rc = MDB_NOTFOUND; |
7353 | break; |
7354 | } |
7355 | { |
7356 | MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
7357 | if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
7358 | MDB_GET_KEY(leaf, key); |
7359 | rc = mdb_node_read(mc, leaf, data); |
7360 | break; |
7361 | } |
7362 | } |
7363 | if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { |
7364 | rc = EINVAL; |
7365 | break; |
7366 | } |
7367 | rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); |
7368 | break; |
7369 | case MDB_LAST: |
7370 | rc = mdb_cursor_last(mc, key, data); |
7371 | break; |
7372 | case MDB_LAST_DUP: |
7373 | mfunc = mdb_cursor_last; |
7374 | goto mmove; |
7375 | default: |
7376 | DPRINTF(("unhandled/unimplemented cursor operation %u" , op)); |
7377 | rc = EINVAL; |
7378 | break; |
7379 | } |
7380 | |
7381 | if (mc->mc_flags & C_DEL) |
7382 | mc->mc_flags ^= C_DEL; |
7383 | |
7384 | return rc; |
7385 | } |
7386 | |
7387 | /** Touch all the pages in the cursor stack. Set mc_top. |
7388 | * Makes sure all the pages are writable, before attempting a write operation. |
7389 | * @param[in] mc The cursor to operate on. |
7390 | */ |
7391 | static int |
7392 | mdb_cursor_touch(MDB_cursor *mc) |
7393 | { |
7394 | int rc = MDB_SUCCESS; |
7395 | |
7396 | if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { |
7397 | /* Touch DB record of named DB */ |
7398 | MDB_cursor mc2; |
7399 | MDB_xcursor mcx; |
7400 | if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) |
7401 | return MDB_BAD_DBI; |
7402 | mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); |
7403 | rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); |
7404 | if (rc) |
7405 | return rc; |
7406 | *mc->mc_dbflag |= DB_DIRTY; |
7407 | } |
7408 | mc->mc_top = 0; |
7409 | if (mc->mc_snum) { |
7410 | do { |
7411 | rc = mdb_page_touch(mc); |
7412 | } while (!rc && ++(mc->mc_top) < mc->mc_snum); |
7413 | mc->mc_top = mc->mc_snum-1; |
7414 | } |
7415 | return rc; |
7416 | } |
7417 | |
7418 | /** Do not spill pages to disk if txn is getting full, may fail instead */ |
7419 | #define MDB_NOSPILL 0x8000 |
7420 | |
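/* Illustrative sketch, not part of the library: inserting through a cursor,
 * assuming "txn" and "dbi" were prepared by the caller inside a write
 * transaction. With MDB_NOOVERWRITE the put fails with MDB_KEYEXIST and
 * returns the existing value in data when the key is already present.
 *
 *	MDB_cursor *cursor;
 *	MDB_val key, data;
 *	int value = 1, rc;
 *
 *	if ((rc = mdb_cursor_open(txn, dbi, &cursor)) != MDB_SUCCESS)
 *		return rc;
 *	key.mv_size = sizeof("counter") - 1;
 *	key.mv_data = "counter";
 *	data.mv_size = sizeof(value);
 *	data.mv_data = &value;
 *	rc = mdb_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE);
 *	mdb_cursor_close(cursor);
 *	return rc;
 */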
7421 | int |
7422 | mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, |
7423 | unsigned int flags) |
7424 | { |
7425 | MDB_env *env; |
7426 | MDB_node *leaf = NULL; |
7427 | MDB_page *fp, *mp, *sub_root = NULL; |
7428 | uint16_t fp_flags; |
7429 | MDB_val xdata, *rdata, dkey, olddata; |
7430 | MDB_db dummy; |
7431 | int do_sub = 0, insert_key, insert_data; |
7432 | unsigned int mcount = 0, dcount = 0, nospill; |
7433 | size_t nsize; |
7434 | int rc, rc2; |
7435 | unsigned int nflags; |
7436 | DKBUF; |
7437 | |
7438 | if (mc == NULL || key == NULL) |
7439 | return EINVAL; |
7440 | |
7441 | env = mc->mc_txn->mt_env; |
7442 | |
	/* Check this first so the counter will always be zero on any
7444 | * early failures. |
7445 | */ |
7446 | if (flags & MDB_MULTIPLE) { |
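		/* With MDB_MULTIPLE, data[] is a two-element array: data[0]
		 * gives the size of one fixed-size item and points at the
		 * first of the contiguous items, while data[1].mv_size carries
		 * the item count in and reports how many were written on return.
		 */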
7447 | dcount = data[1].mv_size; |
7448 | data[1].mv_size = 0; |
7449 | if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) |
7450 | return MDB_INCOMPATIBLE; |
7451 | } |
7452 | |
7453 | nospill = flags & MDB_NOSPILL; |
7454 | flags &= ~MDB_NOSPILL; |
7455 | |
7456 | if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) |
7457 | return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; |
7458 | |
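	/* mv_size is unsigned, so the subtraction below also rejects a
	 * zero-length key: 0-1 wraps around and exceeds ENV_MAXKEY.
	 */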
7459 | if (key->mv_size-1 >= ENV_MAXKEY(env)) |
7460 | return MDB_BAD_VALSIZE; |
7461 | |
7462 | #if SIZE_MAX > MAXDATASIZE |
7463 | if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE)) |
7464 | return MDB_BAD_VALSIZE; |
7465 | #else |
7466 | if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) |
7467 | return MDB_BAD_VALSIZE; |
7468 | #endif |
7469 | |
7470 | DPRINTF(("==> put db %d key [%s], size %" Z"u, data size %" Z"u" , |
7471 | DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size)); |
7472 | |
7473 | dkey.mv_size = 0; |
7474 | |
7475 | if (flags & MDB_CURRENT) { |
7476 | if (!(mc->mc_flags & C_INITIALIZED)) |
7477 | return EINVAL; |
7478 | rc = MDB_SUCCESS; |
7479 | } else if (mc->mc_db->md_root == P_INVALID) { |
7480 | /* new database, cursor has nothing to point to */ |
7481 | mc->mc_snum = 0; |
7482 | mc->mc_top = 0; |
7483 | mc->mc_flags &= ~C_INITIALIZED; |
7484 | rc = MDB_NO_ROOT; |
7485 | } else { |
7486 | int exact = 0; |
7487 | MDB_val d2; |
7488 | if (flags & MDB_APPEND) { |
7489 | MDB_val k2; |
7490 | rc = mdb_cursor_last(mc, &k2, &d2); |
7491 | if (rc == 0) { |
7492 | rc = mc->mc_dbx->md_cmp(key, &k2); |
7493 | if (rc > 0) { |
7494 | rc = MDB_NOTFOUND; |
7495 | mc->mc_ki[mc->mc_top]++; |
7496 | } else { |
7497 | /* new key is <= last key */ |
7498 | rc = MDB_KEYEXIST; |
7499 | } |
7500 | } |
7501 | } else { |
7502 | rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); |
7503 | } |
7504 | if ((flags & MDB_NOOVERWRITE) && rc == 0) { |
7505 | DPRINTF(("duplicate key [%s]" , DKEY(key))); |
7506 | *data = d2; |
7507 | return MDB_KEYEXIST; |
7508 | } |
7509 | if (rc && rc != MDB_NOTFOUND) |
7510 | return rc; |
7511 | } |
7512 | |
7513 | if (mc->mc_flags & C_DEL) |
7514 | mc->mc_flags ^= C_DEL; |
7515 | |
7516 | /* Cursor is positioned, check for room in the dirty list */ |
7517 | if (!nospill) { |
7518 | if (flags & MDB_MULTIPLE) { |
7519 | rdata = &xdata; |
7520 | xdata.mv_size = data->mv_size * dcount; |
7521 | } else { |
7522 | rdata = data; |
7523 | } |
7524 | if ((rc2 = mdb_page_spill(mc, key, rdata))) |
7525 | return rc2; |
7526 | } |
7527 | |
7528 | if (rc == MDB_NO_ROOT) { |
7529 | MDB_page *np; |
7530 | /* new database, write a root leaf page */ |
7531 | DPUTS("allocating new root leaf page" ); |
7532 | if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { |
7533 | return rc2; |
7534 | } |
7535 | mdb_cursor_push(mc, np); |
7536 | mc->mc_db->md_root = np->mp_pgno; |
7537 | mc->mc_db->md_depth++; |
7538 | *mc->mc_dbflag |= DB_DIRTY; |
7539 | if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) |
7540 | == MDB_DUPFIXED) |
7541 | np->mp_flags |= P_LEAF2; |
7542 | mc->mc_flags |= C_INITIALIZED; |
7543 | } else { |
7544 | /* make sure all cursor pages are writable */ |
7545 | rc2 = mdb_cursor_touch(mc); |
7546 | if (rc2) |
7547 | return rc2; |
7548 | } |
7549 | |
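	/* rc is zero if the key was found above, otherwise MDB_NOTFOUND or
	 * MDB_NO_ROOT; reuse it as a "this key is new" flag for both the
	 * key and, initially, the data.
	 */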
7550 | insert_key = insert_data = rc; |
7551 | if (insert_key) { |
7552 | /* The key does not exist */ |
7553 | DPRINTF(("inserting key at index %i" , mc->mc_ki[mc->mc_top])); |
7554 | if ((mc->mc_db->md_flags & MDB_DUPSORT) && |
7555 | LEAFSIZE(key, data) > env->me_nodemax) |
7556 | { |
7557 | /* Too big for a node, insert in sub-DB. Set up an empty |
7558 | * "old sub-page" for prep_subDB to expand to a full page. |
7559 | */ |
7560 | fp_flags = P_LEAF|P_DIRTY; |
7561 | fp = env->me_pbuf; |
7562 | fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */ |
7563 | fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE); |
7564 | olddata.mv_size = PAGEHDRSZ; |
7565 | goto prep_subDB; |
7566 | } |
7567 | } else { |
		/* The key already exists. On a LEAF2 page there is only a key
		 * and no separate data, so "updating" it just rewrites the key
		 * bytes in place.
		 */
7569 | if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { |
7570 | char *ptr; |
7571 | unsigned int ksize = mc->mc_db->md_pad; |
7572 | if (key->mv_size != ksize) |
7573 | return MDB_BAD_VALSIZE; |
7574 | ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); |
7575 | memcpy(ptr, key->mv_data, ksize); |
7576 | fix_parent: |
7577 | /* if overwriting slot 0 of leaf, need to |
7578 | * update branch key if there is a parent page |
7579 | */ |
7580 | if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { |
7581 | unsigned short dtop = 1; |
7582 | mc->mc_top--; |
7583 | /* slot 0 is always an empty key, find real slot */ |
7584 | while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { |
7585 | mc->mc_top--; |
7586 | dtop++; |
7587 | } |
7588 | if (mc->mc_ki[mc->mc_top]) |
7589 | rc2 = mdb_update_key(mc, key); |
7590 | else |
7591 | rc2 = MDB_SUCCESS; |
7592 | mc->mc_top += dtop; |
7593 | if (rc2) |
7594 | return rc2; |
7595 | } |
7596 | return MDB_SUCCESS; |
7597 | } |
7598 | |
7599 | more: |
7600 | leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
7601 | olddata.mv_size = NODEDSZ(leaf); |
7602 | olddata.mv_data = NODEDATA(leaf); |
7603 | |
7604 | /* DB has dups? */ |
7605 | if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { |
7606 | /* Prepare (sub-)page/sub-DB to accept the new item, |
7607 | * if needed. fp: old sub-page or a header faking |
7608 | * it. mp: new (sub-)page. offset: growth in page |
7609 | * size. xdata: node data with new page or DB. |
7610 | */ |
7611 | unsigned i, offset = 0; |
7612 | mp = fp = xdata.mv_data = env->me_pbuf; |
7613 | mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; |
7614 | |
7615 | /* Was a single item before, must convert now */ |
7616 | if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
7617 | MDB_cmp_func *dcmp; |
7618 | /* Just overwrite the current item */ |
7619 | if (flags == MDB_CURRENT) |
7620 | goto current; |
7621 | dcmp = mc->mc_dbx->md_dcmp; |
7622 | if (NEED_CMP_CLONG(dcmp, olddata.mv_size)) |
7623 | dcmp = mdb_cmp_clong; |
7624 | /* does data match? */ |
7625 | if (!dcmp(data, &olddata)) { |
7626 | if (flags & (MDB_NODUPDATA|MDB_APPENDDUP)) |
7627 | return MDB_KEYEXIST; |
7628 | /* overwrite it */ |
7629 | goto current; |
7630 | } |
7631 | |
7632 | /* Back up original data item */ |
7633 | dkey.mv_size = olddata.mv_size; |
7634 | dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); |
7635 | |
7636 | /* Make sub-page header for the dup items, with dummy body */ |
7637 | fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; |
7638 | fp->mp_lower = (PAGEHDRSZ-PAGEBASE); |
7639 | xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; |
7640 | if (mc->mc_db->md_flags & MDB_DUPFIXED) { |
7641 | fp->mp_flags |= P_LEAF2; |
7642 | fp->mp_pad = data->mv_size; |
7643 | xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ |
7644 | } else { |
7645 | xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + |
7646 | (dkey.mv_size & 1) + (data->mv_size & 1); |
7647 | } |
7648 | fp->mp_upper = xdata.mv_size - PAGEBASE; |
7649 | olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */ |
7650 | } else if (leaf->mn_flags & F_SUBDATA) { |
7651 | /* Data is on sub-DB, just store it */ |
7652 | flags |= F_DUPDATA|F_SUBDATA; |
7653 | goto put_sub; |
7654 | } else { |
7655 | /* Data is on sub-page */ |
7656 | fp = olddata.mv_data; |
7657 | switch (flags) { |
7658 | default: |
7659 | if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { |
7660 | offset = EVEN(NODESIZE + sizeof(indx_t) + |
7661 | data->mv_size); |
7662 | break; |
7663 | } |
7664 | offset = fp->mp_pad; |
7665 | if (SIZELEFT(fp) < offset) { |
7666 | offset *= 4; /* space for 4 more */ |
7667 | break; |
7668 | } |
7669 | /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */ |
7670 | case MDB_CURRENT: |
7671 | fp->mp_flags |= P_DIRTY; |
7672 | COPY_PGNO(fp->mp_pgno, mp->mp_pgno); |
7673 | mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; |
7674 | flags |= F_DUPDATA; |
7675 | goto put_sub; |
7676 | } |
7677 | xdata.mv_size = olddata.mv_size + offset; |
7678 | } |
7679 | |
7680 | fp_flags = fp->mp_flags; |
7681 | if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { |
7682 | /* Too big for a sub-page, convert to sub-DB */ |
7683 | fp_flags &= ~P_SUBP; |
7684 | prep_subDB: |
7685 | if (mc->mc_db->md_flags & MDB_DUPFIXED) { |
7686 | fp_flags |= P_LEAF2; |
7687 | dummy.md_pad = fp->mp_pad; |
7688 | dummy.md_flags = MDB_DUPFIXED; |
7689 | if (mc->mc_db->md_flags & MDB_INTEGERDUP) |
7690 | dummy.md_flags |= MDB_INTEGERKEY; |
7691 | } else { |
7692 | dummy.md_pad = 0; |
7693 | dummy.md_flags = 0; |
7694 | } |
7695 | dummy.md_depth = 1; |
7696 | dummy.md_branch_pages = 0; |
7697 | dummy.md_leaf_pages = 1; |
7698 | dummy.md_overflow_pages = 0; |
7699 | dummy.md_entries = NUMKEYS(fp); |
7700 | xdata.mv_size = sizeof(MDB_db); |
7701 | xdata.mv_data = &dummy; |
7702 | if ((rc = mdb_page_alloc(mc, 1, &mp))) |
7703 | return rc; |
7704 | offset = env->me_psize - olddata.mv_size; |
7705 | flags |= F_DUPDATA|F_SUBDATA; |
7706 | dummy.md_root = mp->mp_pgno; |
7707 | sub_root = mp; |
7708 | } |
7709 | if (mp != fp) { |
7710 | mp->mp_flags = fp_flags | P_DIRTY; |
7711 | mp->mp_pad = fp->mp_pad; |
7712 | mp->mp_lower = fp->mp_lower; |
7713 | mp->mp_upper = fp->mp_upper + offset; |
7714 | if (fp_flags & P_LEAF2) { |
7715 | memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); |
7716 | } else { |
7717 | memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE, |
7718 | olddata.mv_size - fp->mp_upper - PAGEBASE); |
7719 | memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0])); |
7720 | for (i=0; i<NUMKEYS(fp); i++) |
7721 | mp->mp_ptrs[i] += offset; |
7722 | } |
7723 | } |
7724 | |
7725 | rdata = &xdata; |
7726 | flags |= F_DUPDATA; |
7727 | do_sub = 1; |
7728 | if (!insert_key) |
7729 | mdb_node_del(mc, 0); |
7730 | goto new_sub; |
7731 | } |
7732 | current: |
7733 | /* LMDB passes F_SUBDATA in 'flags' to write a DB record */ |
7734 | if ((leaf->mn_flags ^ flags) & F_SUBDATA) |
7735 | return MDB_INCOMPATIBLE; |
7736 | /* overflow page overwrites need special handling */ |
7737 | if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { |
7738 | MDB_page *omp; |
7739 | pgno_t pg; |
7740 | int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); |
7741 | |
7742 | memcpy(&pg, olddata.mv_data, sizeof(pg)); |
7743 | if ((rc2 = mdb_page_get(mc, pg, &omp, &level)) != 0) |
7744 | return rc2; |
7745 | ovpages = omp->mp_pages; |
7746 | |
7747 | /* Is the ov page large enough? */ |
7748 | if (ovpages >= dpages) { |
7749 | if (!(omp->mp_flags & P_DIRTY) && |
7750 | (level || (env->me_flags & MDB_WRITEMAP))) |
7751 | { |
7752 | rc = mdb_page_unspill(mc->mc_txn, omp, &omp); |
7753 | if (rc) |
7754 | return rc; |
7755 | level = 0; /* dirty in this txn or clean */ |
7756 | } |
7757 | /* Is it dirty? */ |
7758 | if (omp->mp_flags & P_DIRTY) { |
7759 | /* yes, overwrite it. Note in this case we don't |
7760 | * bother to try shrinking the page if the new data |
7761 | * is smaller than the overflow threshold. |
7762 | */ |
7763 | if (level > 1) { |
7764 | /* It is writable only in a parent txn */ |
7765 | size_t sz = (size_t) env->me_psize * ovpages, off; |
7766 | MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); |
7767 | MDB_ID2 id2; |
7768 | if (!np) |
7769 | return ENOMEM; |
7770 | id2.mid = pg; |
7771 | id2.mptr = np; |
7772 | /* Note - this page is already counted in parent's dirty_room */ |
7773 | rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); |
7774 | mdb_cassert(mc, rc2 == 0); |
7775 | /* Currently we make the page look as with put() in the |
7776 | * parent txn, in case the user peeks at MDB_RESERVEd |
7777 | * or unused parts. Some users treat ovpages specially. |
7778 | */ |
7779 | if (!(flags & MDB_RESERVE)) { |
7780 | /* Skip the part where LMDB will put *data. |
7781 | * Copy end of page, adjusting alignment so |
7782 | * compiler may copy words instead of bytes. |
7783 | */ |
7784 | off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); |
7785 | memcpy((size_t *)((char *)np + off), |
7786 | (size_t *)((char *)omp + off), sz - off); |
7787 | sz = PAGEHDRSZ; |
7788 | } |
7789 | memcpy(np, omp, sz); /* Copy beginning of page */ |
7790 | omp = np; |
7791 | } |
7792 | SETDSZ(leaf, data->mv_size); |
7793 | if (F_ISSET(flags, MDB_RESERVE)) |
7794 | data->mv_data = METADATA(omp); |
7795 | else |
7796 | memcpy(METADATA(omp), data->mv_data, data->mv_size); |
7797 | return MDB_SUCCESS; |
7798 | } |
7799 | } |
7800 | if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) |
7801 | return rc2; |
7802 | } else if (data->mv_size == olddata.mv_size) { |
7803 | /* same size, just replace it. Note that we could |
7804 | * also reuse this node if the new data is smaller, |
7805 | * but instead we opt to shrink the node in that case. |
7806 | */ |
7807 | if (F_ISSET(flags, MDB_RESERVE)) |
7808 | data->mv_data = olddata.mv_data; |
7809 | else if (!(mc->mc_flags & C_SUB)) |
7810 | memcpy(olddata.mv_data, data->mv_data, data->mv_size); |
7811 | else { |
7812 | memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); |
7813 | goto fix_parent; |
7814 | } |
7815 | return MDB_SUCCESS; |
7816 | } |
7817 | mdb_node_del(mc, 0); |
7818 | } |
7819 | |
7820 | rdata = data; |
7821 | |
7822 | new_sub: |
7823 | nflags = flags & NODE_ADD_FLAGS; |
7824 | nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); |
7825 | if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { |
7826 | if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) |
7827 | nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ |
7828 | if (!insert_key) |
7829 | nflags |= MDB_SPLIT_REPLACE; |
7830 | rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); |
7831 | } else { |
7832 | /* There is room already in this leaf page. */ |
7833 | rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); |
7834 | if (rc == 0) { |
7835 | /* Adjust other cursors pointing to mp */ |
7836 | MDB_cursor *m2, *m3; |
7837 | MDB_dbi dbi = mc->mc_dbi; |
7838 | unsigned i = mc->mc_top; |
7839 | MDB_page *mp = mc->mc_pg[i]; |
7840 | |
7841 | for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { |
7842 | if (mc->mc_flags & C_SUB) |
7843 | m3 = &m2->mc_xcursor->mx_cursor; |
7844 | else |
7845 | m3 = m2; |
7846 | if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue; |
7847 | if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { |
7848 | m3->mc_ki[i]++; |
7849 | } |
7850 | XCURSOR_REFRESH(m3, i, mp); |
7851 | } |
7852 | } |
7853 | } |
7854 | |
7855 | if (rc == MDB_SUCCESS) { |
7856 | /* Now store the actual data in the child DB. Note that we're |
7857 | * storing the user data in the keys field, so there are strict |
7858 | * size limits on dupdata. The actual data fields of the child |
7859 | * DB are all zero size. |
7860 | */ |
7861 | if (do_sub) { |
7862 | int xflags, new_dupdata; |
7863 | mdb_size_t ecount; |
7864 | put_sub: |
7865 | xdata.mv_size = 0; |
7866 | xdata.mv_data = "" ; |
7867 | leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
7868 | if (flags == MDB_CURRENT) { |
7869 | xflags = MDB_CURRENT|MDB_NOSPILL; |
7870 | } else { |
7871 | mdb_xcursor_init1(mc, leaf); |
7872 | xflags = (flags & MDB_NODUPDATA) ? |
7873 | MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; |
7874 | } |
7875 | if (sub_root) |
7876 | mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; |
7877 | new_dupdata = (int)dkey.mv_size; |
7878 | /* converted, write the original data first */ |
7879 | if (dkey.mv_size) { |
7880 | rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); |
7881 | if (rc) |
7882 | goto bad_sub; |
7883 | /* we've done our job */ |
7884 | dkey.mv_size = 0; |
7885 | } |
7886 | if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { |
7887 | /* Adjust other cursors pointing to mp */ |
7888 | MDB_cursor *m2; |
7889 | MDB_xcursor *mx = mc->mc_xcursor; |
7890 | unsigned i = mc->mc_top; |
7891 | MDB_page *mp = mc->mc_pg[i]; |
7892 | |
7893 | for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { |
7894 | if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; |
7895 | if (!(m2->mc_flags & C_INITIALIZED)) continue; |
7896 | if (m2->mc_pg[i] == mp) { |
7897 | if (m2->mc_ki[i] == mc->mc_ki[i]) { |
7898 | mdb_xcursor_init2(m2, mx, new_dupdata); |
7899 | } else if (!insert_key) { |
7900 | XCURSOR_REFRESH(m2, i, mp); |
7901 | } |
7902 | } |
7903 | } |
7904 | } |
7905 | ecount = mc->mc_xcursor->mx_db.md_entries; |
7906 | if (flags & MDB_APPENDDUP) |
7907 | xflags |= MDB_APPEND; |
7908 | rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); |
7909 | if (flags & F_SUBDATA) { |
7910 | void *db = NODEDATA(leaf); |
7911 | memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); |
7912 | } |
7913 | insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; |
7914 | } |
7915 | /* Increment count unless we just replaced an existing item. */ |
7916 | if (insert_data) |
7917 | mc->mc_db->md_entries++; |
7918 | if (insert_key) { |
7919 | /* Invalidate txn if we created an empty sub-DB */ |
7920 | if (rc) |
7921 | goto bad_sub; |
7922 | /* If we succeeded and the key didn't exist before, |
7923 | * make sure the cursor is marked valid. |
7924 | */ |
7925 | mc->mc_flags |= C_INITIALIZED; |
7926 | } |
7927 | if (flags & MDB_MULTIPLE) { |
7928 | if (!rc) { |
7929 | mcount++; |
7930 | /* let caller know how many succeeded, if any */ |
7931 | data[1].mv_size = mcount; |
7932 | if (mcount < dcount) { |
7933 | data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; |
7934 | insert_key = insert_data = 0; |
7935 | goto more; |
7936 | } |
7937 | } |
7938 | } |
7939 | return rc; |
7940 | bad_sub: |
7941 | if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */ |
7942 | rc = MDB_PROBLEM; |
7943 | } |
7944 | mc->mc_txn->mt_flags |= MDB_TXN_ERROR; |
7945 | return rc; |
7946 | } |
7947 | |
7948 | int |
7949 | mdb_cursor_del(MDB_cursor *mc, unsigned int flags) |
7950 | { |
7951 | MDB_node *leaf; |
7952 | MDB_page *mp; |
7953 | int rc; |
7954 | |
7955 | if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) |
7956 | return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; |
7957 | |
7958 | if (!(mc->mc_flags & C_INITIALIZED)) |
7959 | return EINVAL; |
7960 | |
7961 | if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) |
7962 | return MDB_NOTFOUND; |
7963 | |
7964 | if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) |
7965 | return rc; |
7966 | |
7967 | rc = mdb_cursor_touch(mc); |
7968 | if (rc) |
7969 | return rc; |
7970 | |
7971 | mp = mc->mc_pg[mc->mc_top]; |
7972 | if (IS_LEAF2(mp)) |
7973 | goto del_key; |
7974 | leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
7975 | |
7976 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
7977 | if (flags & MDB_NODUPDATA) { |
7978 | /* mdb_cursor_del0() will subtract the final entry */ |
7979 | mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; |
7980 | mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; |
7981 | } else { |
7982 | if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { |
7983 | mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); |
7984 | } |
7985 | rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); |
7986 | if (rc) |
7987 | return rc; |
7988 | /* If sub-DB still has entries, we're done */ |
7989 | if (mc->mc_xcursor->mx_db.md_entries) { |
7990 | if (leaf->mn_flags & F_SUBDATA) { |
7991 | /* update subDB info */ |
7992 | void *db = NODEDATA(leaf); |
7993 | memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); |
7994 | } else { |
7995 | MDB_cursor *m2; |
7996 | /* shrink fake page */ |
7997 | mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); |
7998 | leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
7999 | mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); |
8000 | /* fix other sub-DB cursors pointed at fake pages on this page */ |
8001 | for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { |
8002 | if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; |
8003 | if (!(m2->mc_flags & C_INITIALIZED)) continue; |
8004 | if (m2->mc_pg[mc->mc_top] == mp) { |
8005 | XCURSOR_REFRESH(m2, mc->mc_top, mp); |
8006 | } |
8007 | } |
8008 | } |
8009 | mc->mc_db->md_entries--; |
8010 | return rc; |
8011 | } else { |
8012 | mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; |
8013 | } |
/* otherwise fall through and delete the sub-DB */
8015 | } |
8016 | |
8017 | if (leaf->mn_flags & F_SUBDATA) { |
8018 | /* add all the child DB's pages to the free list */ |
8019 | rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); |
8020 | if (rc) |
8021 | goto fail; |
8022 | } |
8023 | } |
8024 | /* LMDB passes F_SUBDATA in 'flags' to delete a DB record */ |
8025 | else if ((leaf->mn_flags ^ flags) & F_SUBDATA) { |
8026 | rc = MDB_INCOMPATIBLE; |
8027 | goto fail; |
8028 | } |
8029 | |
8030 | /* add overflow pages to free list */ |
8031 | if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { |
8032 | MDB_page *omp; |
8033 | pgno_t pg; |
8034 | |
8035 | memcpy(&pg, NODEDATA(leaf), sizeof(pg)); |
8036 | if ((rc = mdb_page_get(mc, pg, &omp, NULL)) || |
8037 | (rc = mdb_ovpage_free(mc, omp))) |
8038 | goto fail; |
8039 | } |
8040 | |
8041 | del_key: |
8042 | return mdb_cursor_del0(mc); |
8043 | |
8044 | fail: |
8045 | mc->mc_txn->mt_flags |= MDB_TXN_ERROR; |
8046 | return rc; |
8047 | } |
8048 | |
8049 | /** Allocate and initialize new pages for a database. |
8050 | * Set #MDB_TXN_ERROR on failure. |
8051 | * @param[in] mc a cursor on the database being added to. |
8052 | * @param[in] flags flags defining what type of page is being allocated. |
8053 | * @param[in] num the number of pages to allocate. This is usually 1, |
8054 | * unless allocating overflow pages for a large record. |
8055 | * @param[out] mp Address of a page, or NULL on failure. |
8056 | * @return 0 on success, non-zero on failure. |
8057 | */ |
8058 | static int |
8059 | mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) |
8060 | { |
8061 | MDB_page *np; |
8062 | int rc; |
8063 | |
8064 | if ((rc = mdb_page_alloc(mc, num, &np))) |
8065 | return rc; |
8066 | DPRINTF(("allocated new mpage %" Yu", page size %u" , |
8067 | np->mp_pgno, mc->mc_txn->mt_env->me_psize)); |
8068 | np->mp_flags = flags | P_DIRTY; |
8069 | np->mp_lower = (PAGEHDRSZ-PAGEBASE); |
8070 | np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; |
8071 | |
8072 | if (IS_BRANCH(np)) |
8073 | mc->mc_db->md_branch_pages++; |
8074 | else if (IS_LEAF(np)) |
8075 | mc->mc_db->md_leaf_pages++; |
8076 | else if (IS_OVERFLOW(np)) { |
8077 | mc->mc_db->md_overflow_pages += num; |
8078 | np->mp_pages = num; |
8079 | } |
8080 | *mp = np; |
8081 | |
8082 | return 0; |
8083 | } |
8084 | |
8085 | /** Calculate the size of a leaf node. |
8086 | * The size depends on the environment's page size; if a data item |
8087 | * is too large it will be put onto an overflow page and the node |
8088 | * size will only include the key and not the data. Sizes are always |
8089 | * rounded up to an even number of bytes, to guarantee 2-byte alignment |
8090 | * of the #MDB_node headers. |
8091 | * @param[in] env The environment handle. |
8092 | * @param[in] key The key for the node. |
8093 | * @param[in] data The data for the node. |
8094 | * @return The number of bytes needed to store the node. |
8095 | */ |
8096 | static size_t |
8097 | mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) |
8098 | { |
8099 | size_t sz; |
8100 | |
8101 | sz = LEAFSIZE(key, data); |
8102 | if (sz > env->me_nodemax) { |
8103 | /* put on overflow page */ |
8104 | sz -= data->mv_size - sizeof(pgno_t); |
8105 | } |
8106 | |
8107 | return EVEN(sz + sizeof(indx_t)); |
8108 | } |
8109 | |
8110 | /** Calculate the size of a branch node. |
8111 | * The size should depend on the environment's page size but since |
8112 | * we currently don't support spilling large keys onto overflow |
8113 | * pages, it's simply the size of the #MDB_node header plus the |
8114 | * size of the key. Sizes are always rounded up to an even number |
8115 | * of bytes, to guarantee 2-byte alignment of the #MDB_node headers. |
8116 | * @param[in] env The environment handle. |
8117 | * @param[in] key The key for the node. |
8118 | * @return The number of bytes needed to store the node. |
8119 | */ |
8120 | static size_t |
8121 | mdb_branch_size(MDB_env *env, MDB_val *key) |
8122 | { |
8123 | size_t sz; |
8124 | |
8125 | sz = INDXSIZE(key); |
8126 | if (sz > env->me_nodemax) { |
8127 | /* put on overflow page */ |
8128 | /* not implemented */ |
8129 | /* sz -= key->size - sizeof(pgno_t); */ |
8130 | } |
8131 | |
8132 | return sz + sizeof(indx_t); |
8133 | } |
8134 | |
8135 | /** Add a node to the page pointed to by the cursor. |
8136 | * Set #MDB_TXN_ERROR on failure. |
8137 | * @param[in] mc The cursor for this operation. |
8138 | * @param[in] indx The index on the page where the new node should be added. |
8139 | * @param[in] key The key for the new node. |
8140 | * @param[in] data The data for the new node, if any. |
8141 | * @param[in] pgno The page number, if adding a branch node. |
8142 | * @param[in] flags Flags for the node. |
8143 | * @return 0 on success, non-zero on failure. Possible errors are: |
8144 | * <ul> |
8145 | * <li>ENOMEM - failed to allocate overflow pages for the node. |
8146 | * <li>MDB_PAGE_FULL - there is insufficient room in the page. This error |
8147 | * should never happen since all callers already calculate the |
8148 | * page's free space before calling this function. |
8149 | * </ul> |
8150 | */ |
8151 | static int |
8152 | mdb_node_add(MDB_cursor *mc, indx_t indx, |
8153 | MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags) |
8154 | { |
8155 | unsigned int i; |
8156 | size_t node_size = NODESIZE; |
8157 | ssize_t room; |
8158 | indx_t ofs; |
8159 | MDB_node *node; |
8160 | MDB_page *mp = mc->mc_pg[mc->mc_top]; |
8161 | MDB_page *ofp = NULL; /* overflow page */ |
8162 | void *ndata; |
8163 | DKBUF; |
8164 | |
8165 | mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); |
8166 | |
8167 | DPRINTF(("add to %s %spage %" Yu" index %i, data size %" Z"u key size %" Z"u [%s]" , |
8168 | IS_LEAF(mp) ? "leaf" : "branch" , |
8169 | IS_SUBP(mp) ? "sub-" : "" , |
8170 | mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, |
8171 | key ? key->mv_size : 0, key ? DKEY(key) : "null" )); |
8172 | |
8173 | if (IS_LEAF2(mp)) { |
8174 | /* Move higher keys up one slot. */ |
8175 | int ksize = mc->mc_db->md_pad, dif; |
8176 | char *ptr = LEAF2KEY(mp, indx, ksize); |
8177 | dif = NUMKEYS(mp) - indx; |
8178 | if (dif > 0) |
8179 | memmove(ptr+ksize, ptr, dif*ksize); |
8180 | /* insert new key */ |
8181 | memcpy(ptr, key->mv_data, ksize); |
8182 | |
8183 | /* Just using these for counting */ |
8184 | mp->mp_lower += sizeof(indx_t); |
8185 | mp->mp_upper -= ksize - sizeof(indx_t); |
8186 | return MDB_SUCCESS; |
8187 | } |
8188 | |
8189 | room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); |
8190 | if (key != NULL) |
8191 | node_size += key->mv_size; |
8192 | if (IS_LEAF(mp)) { |
8193 | mdb_cassert(mc, key && data); |
8194 | if (F_ISSET(flags, F_BIGDATA)) { |
8195 | /* Data already on overflow page. */ |
8196 | node_size += sizeof(pgno_t); |
8197 | } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { |
8198 | int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); |
8199 | int rc; |
8200 | /* Put data on overflow page. */ |
8201 | DPRINTF(("data size is %" Z"u, node would be %" Z"u, put data on overflow page" , |
8202 | data->mv_size, node_size+data->mv_size)); |
8203 | node_size = EVEN(node_size + sizeof(pgno_t)); |
8204 | if ((ssize_t)node_size > room) |
8205 | goto full; |
8206 | if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) |
8207 | return rc; |
8208 | DPRINTF(("allocated overflow page %" Yu, ofp->mp_pgno)); |
8209 | flags |= F_BIGDATA; |
8210 | goto update; |
8211 | } else { |
8212 | node_size += data->mv_size; |
8213 | } |
8214 | } |
8215 | node_size = EVEN(node_size); |
8216 | if ((ssize_t)node_size > room) |
8217 | goto full; |
8218 | |
8219 | update: |
8220 | /* Move higher pointers up one slot. */ |
8221 | for (i = NUMKEYS(mp); i > indx; i--) |
8222 | mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; |
8223 | |
8224 | /* Adjust free space offsets. */ |
8225 | ofs = mp->mp_upper - node_size; |
8226 | mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); |
8227 | mp->mp_ptrs[indx] = ofs; |
8228 | mp->mp_upper = ofs; |
8229 | mp->mp_lower += sizeof(indx_t); |
8230 | |
8231 | /* Write the node data. */ |
8232 | node = NODEPTR(mp, indx); |
8233 | node->mn_ksize = (key == NULL) ? 0 : key->mv_size; |
8234 | node->mn_flags = flags; |
8235 | if (IS_LEAF(mp)) |
8236 | SETDSZ(node,data->mv_size); |
8237 | else |
8238 | SETPGNO(node,pgno); |
8239 | |
8240 | if (key) |
8241 | memcpy(NODEKEY(node), key->mv_data, key->mv_size); |
8242 | |
8243 | if (IS_LEAF(mp)) { |
8244 | ndata = NODEDATA(node); |
8245 | if (ofp == NULL) { |
8246 | if (F_ISSET(flags, F_BIGDATA)) |
8247 | memcpy(ndata, data->mv_data, sizeof(pgno_t)); |
8248 | else if (F_ISSET(flags, MDB_RESERVE)) |
8249 | data->mv_data = ndata; |
8250 | else |
8251 | memcpy(ndata, data->mv_data, data->mv_size); |
8252 | } else { |
8253 | memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); |
8254 | ndata = METADATA(ofp); |
8255 | if (F_ISSET(flags, MDB_RESERVE)) |
8256 | data->mv_data = ndata; |
8257 | else |
8258 | memcpy(ndata, data->mv_data, data->mv_size); |
8259 | } |
8260 | } |
8261 | |
8262 | return MDB_SUCCESS; |
8263 | |
8264 | full: |
8265 | DPRINTF(("not enough room in page %" Yu", got %u ptrs" , |
8266 | mdb_dbg_pgno(mp), NUMKEYS(mp))); |
8267 | DPRINTF(("upper-lower = %u - %u = %" Z"d" , mp->mp_upper,mp->mp_lower,room)); |
8268 | DPRINTF(("node size = %" Z"u" , node_size)); |
8269 | mc->mc_txn->mt_flags |= MDB_TXN_ERROR; |
8270 | return MDB_PAGE_FULL; |
8271 | } |
8272 | |
8273 | /** Delete the specified node from a page. |
8274 | * @param[in] mc Cursor pointing to the node to delete. |
 * @param[in] ksize The fixed key size. Only used if the page is
 * part of a #MDB_DUPFIXED (LEAF2) database.
8277 | */ |
8278 | static void |
8279 | mdb_node_del(MDB_cursor *mc, int ksize) |
8280 | { |
8281 | MDB_page *mp = mc->mc_pg[mc->mc_top]; |
8282 | indx_t indx = mc->mc_ki[mc->mc_top]; |
8283 | unsigned int sz; |
8284 | indx_t i, j, numkeys, ptr; |
8285 | MDB_node *node; |
8286 | char *base; |
8287 | |
8288 | DPRINTF(("delete node %u on %s page %" Yu, indx, |
8289 | IS_LEAF(mp) ? "leaf" : "branch" , mdb_dbg_pgno(mp))); |
8290 | numkeys = NUMKEYS(mp); |
8291 | mdb_cassert(mc, indx < numkeys); |
8292 | |
8293 | if (IS_LEAF2(mp)) { |
8294 | int x = numkeys - 1 - indx; |
8295 | base = LEAF2KEY(mp, indx, ksize); |
8296 | if (x) |
8297 | memmove(base, base + ksize, x * ksize); |
8298 | mp->mp_lower -= sizeof(indx_t); |
8299 | mp->mp_upper += ksize - sizeof(indx_t); |
8300 | return; |
8301 | } |
8302 | |
8303 | node = NODEPTR(mp, indx); |
8304 | sz = NODESIZE + node->mn_ksize; |
8305 | if (IS_LEAF(mp)) { |
8306 | if (F_ISSET(node->mn_flags, F_BIGDATA)) |
8307 | sz += sizeof(pgno_t); |
8308 | else |
8309 | sz += NODEDSZ(node); |
8310 | } |
8311 | sz = EVEN(sz); |
8312 | |
8313 | ptr = mp->mp_ptrs[indx]; |
8314 | for (i = j = 0; i < numkeys; i++) { |
8315 | if (i != indx) { |
8316 | mp->mp_ptrs[j] = mp->mp_ptrs[i]; |
8317 | if (mp->mp_ptrs[i] < ptr) |
8318 | mp->mp_ptrs[j] += sz; |
8319 | j++; |
8320 | } |
8321 | } |
8322 | |
8323 | base = (char *)mp + mp->mp_upper + PAGEBASE; |
8324 | memmove(base + sz, base, ptr - mp->mp_upper); |
8325 | |
8326 | mp->mp_lower -= sizeof(indx_t); |
8327 | mp->mp_upper += sz; |
8328 | } |
8329 | |
8330 | /** Compact the main page after deleting a node on a subpage. |
8331 | * @param[in] mp The main page to operate on. |
8332 | * @param[in] indx The index of the subpage on the main page. |
8333 | */ |
8334 | static void |
8335 | mdb_node_shrink(MDB_page *mp, indx_t indx) |
8336 | { |
8337 | MDB_node *node; |
8338 | MDB_page *sp, *xp; |
8339 | char *base; |
8340 | indx_t delta, nsize, len, ptr; |
8341 | int i; |
8342 | |
8343 | node = NODEPTR(mp, indx); |
8344 | sp = (MDB_page *)NODEDATA(node); |
8345 | delta = SIZELEFT(sp); |
8346 | nsize = NODEDSZ(node) - delta; |
8347 | |
8348 | /* Prepare to shift upward, set len = length(subpage part to shift) */ |
8349 | if (IS_LEAF2(sp)) { |
8350 | len = nsize; |
8351 | if (nsize & 1) |
8352 | return; /* do not make the node uneven-sized */ |
8353 | } else { |
8354 | xp = (MDB_page *)((char *)sp + delta); /* destination subpage */ |
8355 | for (i = NUMKEYS(sp); --i >= 0; ) |
8356 | xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; |
8357 | len = PAGEHDRSZ; |
8358 | } |
8359 | sp->mp_upper = sp->mp_lower; |
8360 | COPY_PGNO(sp->mp_pgno, mp->mp_pgno); |
8361 | SETDSZ(node, nsize); |
8362 | |
8363 | /* Shift <lower nodes...initial part of subpage> upward */ |
8364 | base = (char *)mp + mp->mp_upper + PAGEBASE; |
8365 | memmove(base + delta, base, (char *)sp + len - base); |
8366 | |
8367 | ptr = mp->mp_ptrs[indx]; |
8368 | for (i = NUMKEYS(mp); --i >= 0; ) { |
8369 | if (mp->mp_ptrs[i] <= ptr) |
8370 | mp->mp_ptrs[i] += delta; |
8371 | } |
8372 | mp->mp_upper += delta; |
8373 | } |
8374 | |
8375 | /** Initial setup of a sorted-dups cursor. |
8376 | * Sorted duplicates are implemented as a sub-database for the given key. |
8377 | * The duplicate data items are actually keys of the sub-database. |
8378 | * Operations on the duplicate data items are performed using a sub-cursor |
8379 | * initialized when the sub-database is first accessed. This function does |
8380 | * the preliminary setup of the sub-cursor, filling in the fields that |
8381 | * depend only on the parent DB. |
8382 | * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. |
8383 | */ |
8384 | static void |
8385 | mdb_xcursor_init0(MDB_cursor *mc) |
8386 | { |
8387 | MDB_xcursor *mx = mc->mc_xcursor; |
8388 | |
8389 | mx->mx_cursor.mc_xcursor = NULL; |
8390 | mx->mx_cursor.mc_txn = mc->mc_txn; |
8391 | mx->mx_cursor.mc_db = &mx->mx_db; |
8392 | mx->mx_cursor.mc_dbx = &mx->mx_dbx; |
8393 | mx->mx_cursor.mc_dbi = mc->mc_dbi; |
8394 | mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; |
8395 | mx->mx_cursor.mc_snum = 0; |
8396 | mx->mx_cursor.mc_top = 0; |
8397 | MC_SET_OVPG(&mx->mx_cursor, NULL); |
8398 | mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP)); |
8399 | mx->mx_dbx.md_name.mv_size = 0; |
8400 | mx->mx_dbx.md_name.mv_data = NULL; |
8401 | mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; |
8402 | mx->mx_dbx.md_dcmp = NULL; |
8403 | mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; |
8404 | } |
8405 | |
8406 | /** Final setup of a sorted-dups cursor. |
8407 | * Sets up the fields that depend on the data from the main cursor. |
8408 | * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. |
8409 | * @param[in] node The data containing the #MDB_db record for the |
8410 | * sorted-dup database. |
8411 | */ |
8412 | static void |
8413 | mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) |
8414 | { |
8415 | MDB_xcursor *mx = mc->mc_xcursor; |
8416 | |
8417 | mx->mx_cursor.mc_flags &= C_SUB|C_ORIG_RDONLY|C_WRITEMAP; |
8418 | if (node->mn_flags & F_SUBDATA) { |
8419 | memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); |
8420 | mx->mx_cursor.mc_pg[0] = 0; |
8421 | mx->mx_cursor.mc_snum = 0; |
8422 | mx->mx_cursor.mc_top = 0; |
8423 | } else { |
8424 | MDB_page *fp = NODEDATA(node); |
8425 | mx->mx_db.md_pad = 0; |
8426 | mx->mx_db.md_flags = 0; |
8427 | mx->mx_db.md_depth = 1; |
8428 | mx->mx_db.md_branch_pages = 0; |
8429 | mx->mx_db.md_leaf_pages = 1; |
8430 | mx->mx_db.md_overflow_pages = 0; |
8431 | mx->mx_db.md_entries = NUMKEYS(fp); |
8432 | COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); |
8433 | mx->mx_cursor.mc_snum = 1; |
8434 | mx->mx_cursor.mc_top = 0; |
8435 | mx->mx_cursor.mc_flags |= C_INITIALIZED; |
8436 | mx->mx_cursor.mc_pg[0] = fp; |
8437 | mx->mx_cursor.mc_ki[0] = 0; |
8438 | if (mc->mc_db->md_flags & MDB_DUPFIXED) { |
8439 | mx->mx_db.md_flags = MDB_DUPFIXED; |
8440 | mx->mx_db.md_pad = fp->mp_pad; |
8441 | if (mc->mc_db->md_flags & MDB_INTEGERDUP) |
8442 | mx->mx_db.md_flags |= MDB_INTEGERKEY; |
8443 | } |
8444 | } |
8445 | DPRINTF(("Sub-db -%u root page %" Yu, mx->mx_cursor.mc_dbi, |
8446 | mx->mx_db.md_root)); |
8447 | mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; |
8448 | if (NEED_CMP_CLONG(mx->mx_dbx.md_cmp, mx->mx_db.md_pad)) |
8449 | mx->mx_dbx.md_cmp = mdb_cmp_clong; |
8450 | } |
8451 | |
8452 | |
8453 | /** Fixup a sorted-dups cursor due to underlying update. |
8454 | * Sets up some fields that depend on the data from the main cursor. |
8455 | * Almost the same as init1, but skips initialization steps if the |
8456 | * xcursor had already been used. |
8457 | * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up. |
8458 | * @param[in] src_mx The xcursor of an up-to-date cursor. |
8459 | * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item. |
8460 | */ |
8461 | static void |
8462 | mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) |
8463 | { |
8464 | MDB_xcursor *mx = mc->mc_xcursor; |
8465 | |
8466 | if (new_dupdata) { |
8467 | mx->mx_cursor.mc_snum = 1; |
8468 | mx->mx_cursor.mc_top = 0; |
8469 | mx->mx_cursor.mc_flags |= C_INITIALIZED; |
8470 | mx->mx_cursor.mc_ki[0] = 0; |
8471 | mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; |
8472 | #if UINT_MAX < MDB_SIZE_MAX /* matches mdb_xcursor_init1:NEED_CMP_CLONG() */ |
8473 | mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; |
8474 | #endif |
8475 | } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) { |
8476 | return; |
8477 | } |
8478 | mx->mx_db = src_mx->mx_db; |
8479 | mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; |
8480 | DPRINTF(("Sub-db -%u root page %" Yu, mx->mx_cursor.mc_dbi, |
8481 | mx->mx_db.md_root)); |
8482 | } |
8483 | |
8484 | /** Initialize a cursor for a given transaction and database. */ |
8485 | static void |
8486 | mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) |
8487 | { |
8488 | mc->mc_next = NULL; |
8489 | mc->mc_backup = NULL; |
8490 | mc->mc_dbi = dbi; |
8491 | mc->mc_txn = txn; |
8492 | mc->mc_db = &txn->mt_dbs[dbi]; |
8493 | mc->mc_dbx = &txn->mt_dbxs[dbi]; |
8494 | mc->mc_dbflag = &txn->mt_dbflags[dbi]; |
8495 | mc->mc_snum = 0; |
8496 | mc->mc_top = 0; |
8497 | mc->mc_pg[0] = 0; |
8498 | mc->mc_ki[0] = 0; |
8499 | MC_SET_OVPG(mc, NULL); |
8500 | mc->mc_flags = txn->mt_flags & (C_ORIG_RDONLY|C_WRITEMAP); |
8501 | if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { |
8502 | mdb_tassert(txn, mx != NULL); |
8503 | mc->mc_xcursor = mx; |
8504 | mdb_xcursor_init0(mc); |
8505 | } else { |
8506 | mc->mc_xcursor = NULL; |
8507 | } |
8508 | if (*mc->mc_dbflag & DB_STALE) { |
8509 | mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); |
8510 | } |
8511 | } |
8512 | |
8513 | int |
8514 | mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) |
8515 | { |
8516 | MDB_cursor *mc; |
8517 | size_t size = sizeof(MDB_cursor); |
8518 | |
8519 | if (!ret || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) |
8520 | return EINVAL; |
8521 | |
8522 | if (txn->mt_flags & MDB_TXN_BLOCKED) |
8523 | return MDB_BAD_TXN; |
8524 | |
8525 | if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) |
8526 | return EINVAL; |
8527 | |
8528 | if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) |
8529 | size += sizeof(MDB_xcursor); |
8530 | |
8531 | if ((mc = malloc(size)) != NULL) { |
8532 | mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); |
8533 | if (txn->mt_cursors) { |
8534 | mc->mc_next = txn->mt_cursors[dbi]; |
8535 | txn->mt_cursors[dbi] = mc; |
8536 | mc->mc_flags |= C_UNTRACK; |
8537 | } |
8538 | } else { |
8539 | return ENOMEM; |
8540 | } |
8541 | |
8542 | *ret = mc; |
8543 | |
8544 | return MDB_SUCCESS; |
8545 | } |
8546 | |
8547 | int |
8548 | mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) |
8549 | { |
8550 | if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID)) |
8551 | return EINVAL; |
8552 | |
8553 | if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) |
8554 | return EINVAL; |
8555 | |
8556 | if (txn->mt_flags & MDB_TXN_BLOCKED) |
8557 | return MDB_BAD_TXN; |
8558 | |
8559 | mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); |
8560 | return MDB_SUCCESS; |
8561 | } |
8562 | |
8563 | /* Return the count of duplicate data items for the current key */ |
8564 | int |
8565 | mdb_cursor_count(MDB_cursor *mc, mdb_size_t *countp) |
8566 | { |
8567 | MDB_node *leaf; |
8568 | |
8569 | if (mc == NULL || countp == NULL) |
8570 | return EINVAL; |
8571 | |
8572 | if (mc->mc_xcursor == NULL) |
8573 | return MDB_INCOMPATIBLE; |
8574 | |
8575 | if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) |
8576 | return MDB_BAD_TXN; |
8577 | |
8578 | if (!(mc->mc_flags & C_INITIALIZED)) |
8579 | return EINVAL; |
8580 | |
8581 | if (!mc->mc_snum) |
8582 | return MDB_NOTFOUND; |
8583 | |
8584 | if (mc->mc_flags & C_EOF) { |
8585 | if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) |
8586 | return MDB_NOTFOUND; |
8587 | mc->mc_flags ^= C_EOF; |
8588 | } |
8589 | |
8590 | leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
8591 | if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
8592 | *countp = 1; |
8593 | } else { |
8594 | if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) |
8595 | return EINVAL; |
8596 | |
8597 | *countp = mc->mc_xcursor->mx_db.md_entries; |
8598 | } |
8599 | return MDB_SUCCESS; |
8600 | } |
8601 | |
8602 | void |
8603 | mdb_cursor_close(MDB_cursor *mc) |
8604 | { |
8605 | if (mc) { |
8606 | MDB_CURSOR_UNREF(mc, 0); |
8607 | } |
8608 | if (mc && !mc->mc_backup) { |
8609 | /* Remove from txn, if tracked. |
8610 | * A read-only txn (!C_UNTRACK) may have been freed already, |
8611 | * so do not peek inside it. Only write txns track cursors. |
8612 | */ |
8613 | if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { |
8614 | MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; |
8615 | while (*prev && *prev != mc) prev = &(*prev)->mc_next; |
8616 | if (*prev == mc) |
8617 | *prev = mc->mc_next; |
8618 | } |
8619 | free(mc); |
8620 | } |
8621 | } |
8622 | |
8623 | MDB_txn * |
8624 | mdb_cursor_txn(MDB_cursor *mc) |
8625 | { |
8626 | if (!mc) return NULL; |
8627 | return mc->mc_txn; |
8628 | } |
8629 | |
8630 | MDB_dbi |
8631 | mdb_cursor_dbi(MDB_cursor *mc) |
8632 | { |
8633 | return mc->mc_dbi; |
8634 | } |
8635 | |
8636 | /** Replace the key for a branch node with a new key. |
8637 | * Set #MDB_TXN_ERROR on failure. |
8638 | * @param[in] mc Cursor pointing to the node to operate on. |
8639 | * @param[in] key The new key to use. |
8640 | * @return 0 on success, non-zero on failure. |
8641 | */ |
8642 | static int |
8643 | mdb_update_key(MDB_cursor *mc, MDB_val *key) |
8644 | { |
8645 | MDB_page *mp; |
8646 | MDB_node *node; |
8647 | char *base; |
8648 | size_t len; |
8649 | int delta, ksize, oksize; |
8650 | indx_t ptr, i, numkeys, indx; |
8651 | DKBUF; |
8652 | |
8653 | indx = mc->mc_ki[mc->mc_top]; |
8654 | mp = mc->mc_pg[mc->mc_top]; |
8655 | node = NODEPTR(mp, indx); |
8656 | ptr = mp->mp_ptrs[indx]; |
8657 | #if MDB_DEBUG |
8658 | { |
8659 | MDB_val k2; |
8660 | char kbuf2[DKBUF_MAXKEYSIZE*2+1]; |
8661 | k2.mv_data = NODEKEY(node); |
8662 | k2.mv_size = node->mn_ksize; |
8663 | DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %" Yu, |
8664 | indx, ptr, |
8665 | mdb_dkey(&k2, kbuf2), |
8666 | DKEY(key), |
8667 | mp->mp_pgno)); |
8668 | } |
8669 | #endif |
8670 | |
8671 | /* Sizes must be 2-byte aligned. */ |
8672 | ksize = EVEN(key->mv_size); |
8673 | oksize = EVEN(node->mn_ksize); |
8674 | delta = ksize - oksize; |
8675 | |
8676 | /* Shift node contents if EVEN(key length) changed. */ |
8677 | if (delta) { |
8678 | if (delta > 0 && SIZELEFT(mp) < delta) { |
8679 | pgno_t pgno; |
8680 | /* not enough space left, do a delete and split */ |
8681 | DPRINTF(("Not enough room, delta = %d, splitting..." , delta)); |
8682 | pgno = NODEPGNO(node); |
8683 | mdb_node_del(mc, 0); |
8684 | return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); |
8685 | } |
8686 | |
8687 | numkeys = NUMKEYS(mp); |
8688 | for (i = 0; i < numkeys; i++) { |
8689 | if (mp->mp_ptrs[i] <= ptr) |
8690 | mp->mp_ptrs[i] -= delta; |
8691 | } |
8692 | |
8693 | base = (char *)mp + mp->mp_upper + PAGEBASE; |
8694 | len = ptr - mp->mp_upper + NODESIZE; |
8695 | memmove(base - delta, base, len); |
8696 | mp->mp_upper -= delta; |
8697 | |
8698 | node = NODEPTR(mp, indx); |
8699 | } |
8700 | |
8701 | /* But even if no shift was needed, update ksize */ |
8702 | if (node->mn_ksize != key->mv_size) |
8703 | node->mn_ksize = key->mv_size; |
8704 | |
8705 | if (key->mv_size) |
8706 | memcpy(NODEKEY(node), key->mv_data, key->mv_size); |
8707 | |
8708 | return MDB_SUCCESS; |
8709 | } |
8710 | |
8711 | static void |
8712 | mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); |
8713 | |
8714 | /** Perform \b act while tracking temporary cursor \b mn */ |
8715 | #define WITH_CURSOR_TRACKING(mn, act) do { \ |
8716 | MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ |
8717 | if ((mn).mc_flags & C_SUB) { \ |
8718 | dummy.mc_flags = C_INITIALIZED; \ |
8719 | dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ |
8720 | tracked = &dummy; \ |
8721 | } else { \ |
8722 | tracked = &(mn); \ |
8723 | } \ |
8724 | tracked->mc_next = *tp; \ |
8725 | *tp = tracked; \ |
8726 | { act; } \ |
8727 | *tp = tracked->mc_next; \ |
8728 | } while (0) |
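
/* The macro is used below (see mdb_node_move() and mdb_rebalance()) to make
 * a stack-allocated temporary cursor visible to cursor-fixup code for the
 * duration of one tracked statement, e.g.:
 *
 *	WITH_CURSOR_TRACKING(mn,
 *		rc = mdb_update_key(&mn, &key));
 *
 * The temporary is linked into mt_cursors before the statement runs and
 * unlinked again afterwards.
 */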
8729 | |
8730 | /** Move a node from csrc to cdst. |
8731 | */ |
8732 | static int |
8733 | mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) |
8734 | { |
8735 | MDB_node *srcnode; |
8736 | MDB_val key, data; |
8737 | pgno_t srcpg; |
8738 | MDB_cursor mn; |
8739 | int rc; |
8740 | unsigned short flags; |
8741 | |
8742 | DKBUF; |
8743 | |
8744 | /* Mark src and dst as dirty. */ |
8745 | if ((rc = mdb_page_touch(csrc)) || |
8746 | (rc = mdb_page_touch(cdst))) |
8747 | return rc; |
8748 | |
8749 | if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { |
8750 | key.mv_size = csrc->mc_db->md_pad; |
8751 | key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); |
8752 | data.mv_size = 0; |
8753 | data.mv_data = NULL; |
8754 | srcpg = 0; |
8755 | flags = 0; |
8756 | } else { |
8757 | srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); |
8758 | mdb_cassert(csrc, !((size_t)srcnode & 1)); |
8759 | srcpg = NODEPGNO(srcnode); |
8760 | flags = srcnode->mn_flags; |
8761 | if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { |
8762 | unsigned int snum = csrc->mc_snum; |
8763 | MDB_node *s2; |
8764 | /* must find the lowest key below src */ |
8765 | rc = mdb_page_search_lowest(csrc); |
8766 | if (rc) |
8767 | return rc; |
8768 | if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { |
8769 | key.mv_size = csrc->mc_db->md_pad; |
8770 | key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); |
8771 | } else { |
8772 | s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); |
8773 | key.mv_size = NODEKSZ(s2); |
8774 | key.mv_data = NODEKEY(s2); |
8775 | } |
8776 | csrc->mc_snum = snum--; |
8777 | csrc->mc_top = snum; |
8778 | } else { |
8779 | key.mv_size = NODEKSZ(srcnode); |
8780 | key.mv_data = NODEKEY(srcnode); |
8781 | } |
8782 | data.mv_size = NODEDSZ(srcnode); |
8783 | data.mv_data = NODEDATA(srcnode); |
8784 | } |
8785 | mn.mc_xcursor = NULL; |
8786 | if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { |
8787 | unsigned int snum = cdst->mc_snum; |
8788 | MDB_node *s2; |
8789 | MDB_val bkey; |
8790 | /* must find the lowest key below dst */ |
8791 | mdb_cursor_copy(cdst, &mn); |
8792 | rc = mdb_page_search_lowest(&mn); |
8793 | if (rc) |
8794 | return rc; |
8795 | if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { |
8796 | bkey.mv_size = mn.mc_db->md_pad; |
8797 | bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); |
8798 | } else { |
8799 | s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); |
8800 | bkey.mv_size = NODEKSZ(s2); |
8801 | bkey.mv_data = NODEKEY(s2); |
8802 | } |
8803 | mn.mc_snum = snum--; |
8804 | mn.mc_top = snum; |
8805 | mn.mc_ki[snum] = 0; |
8806 | rc = mdb_update_key(&mn, &bkey); |
8807 | if (rc) |
8808 | return rc; |
8809 | } |
8810 | |
8811 | DPRINTF(("moving %s node %u [%s] on page %" Yu" to node %u on page %" Yu, |
8812 | IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch" , |
8813 | csrc->mc_ki[csrc->mc_top], |
8814 | DKEY(&key), |
8815 | csrc->mc_pg[csrc->mc_top]->mp_pgno, |
8816 | cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno)); |
8817 | |
8818 | /* Add the node to the destination page. |
8819 | */ |
8820 | rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); |
8821 | if (rc != MDB_SUCCESS) |
8822 | return rc; |
8823 | |
8824 | /* Delete the node from the source page. |
8825 | */ |
8826 | mdb_node_del(csrc, key.mv_size); |
8827 | |
8828 | { |
8829 | /* Adjust other cursors pointing to mp */ |
8830 | MDB_cursor *m2, *m3; |
8831 | MDB_dbi dbi = csrc->mc_dbi; |
8832 | MDB_page *mpd, *mps; |
8833 | |
8834 | mps = csrc->mc_pg[csrc->mc_top]; |
8835 | /* If we're adding on the left, bump others up */ |
8836 | if (fromleft) { |
8837 | mpd = cdst->mc_pg[csrc->mc_top]; |
8838 | for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { |
8839 | if (csrc->mc_flags & C_SUB) |
8840 | m3 = &m2->mc_xcursor->mx_cursor; |
8841 | else |
8842 | m3 = m2; |
8843 | if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) |
8844 | continue; |
8845 | if (m3 != cdst && |
8846 | m3->mc_pg[csrc->mc_top] == mpd && |
8847 | m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { |
8848 | m3->mc_ki[csrc->mc_top]++; |
8849 | } |
if (m3 != csrc &&
8851 | m3->mc_pg[csrc->mc_top] == mps && |
8852 | m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { |
8853 | m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; |
8854 | m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; |
8855 | m3->mc_ki[csrc->mc_top-1]++; |
8856 | } |
8857 | if (IS_LEAF(mps)) |
8858 | XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); |
8859 | } |
8860 | } else |
8861 | /* Adding on the right, bump others down */ |
8862 | { |
8863 | for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { |
8864 | if (csrc->mc_flags & C_SUB) |
8865 | m3 = &m2->mc_xcursor->mx_cursor; |
8866 | else |
8867 | m3 = m2; |
8868 | if (m3 == csrc) continue; |
8869 | if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) |
8870 | continue; |
8871 | if (m3->mc_pg[csrc->mc_top] == mps) { |
8872 | if (!m3->mc_ki[csrc->mc_top]) { |
8873 | m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; |
8874 | m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; |
8875 | m3->mc_ki[csrc->mc_top-1]--; |
8876 | } else { |
8877 | m3->mc_ki[csrc->mc_top]--; |
8878 | } |
8879 | if (IS_LEAF(mps)) |
8880 | XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); |
8881 | } |
8882 | } |
8883 | } |
8884 | } |
8885 | |
8886 | /* Update the parent separators. |
8887 | */ |
8888 | if (csrc->mc_ki[csrc->mc_top] == 0) { |
8889 | if (csrc->mc_ki[csrc->mc_top-1] != 0) { |
8890 | if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { |
8891 | key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); |
8892 | } else { |
8893 | srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); |
8894 | key.mv_size = NODEKSZ(srcnode); |
8895 | key.mv_data = NODEKEY(srcnode); |
8896 | } |
8897 | DPRINTF(("update separator for source page %" Yu" to [%s]" , |
8898 | csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); |
8899 | mdb_cursor_copy(csrc, &mn); |
8900 | mn.mc_snum--; |
8901 | mn.mc_top--; |
8902 | /* We want mdb_rebalance to find mn when doing fixups */ |
8903 | WITH_CURSOR_TRACKING(mn, |
8904 | rc = mdb_update_key(&mn, &key)); |
8905 | if (rc) |
8906 | return rc; |
8907 | } |
8908 | if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { |
8909 | MDB_val nullkey; |
8910 | indx_t ix = csrc->mc_ki[csrc->mc_top]; |
8911 | nullkey.mv_size = 0; |
8912 | csrc->mc_ki[csrc->mc_top] = 0; |
8913 | rc = mdb_update_key(csrc, &nullkey); |
8914 | csrc->mc_ki[csrc->mc_top] = ix; |
8915 | mdb_cassert(csrc, rc == MDB_SUCCESS); |
8916 | } |
8917 | } |
8918 | |
8919 | if (cdst->mc_ki[cdst->mc_top] == 0) { |
8920 | if (cdst->mc_ki[cdst->mc_top-1] != 0) { |
8921 | if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { |
8922 | key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); |
8923 | } else { |
8924 | srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); |
8925 | key.mv_size = NODEKSZ(srcnode); |
8926 | key.mv_data = NODEKEY(srcnode); |
8927 | } |
8928 | DPRINTF(("update separator for destination page %" Yu" to [%s]" , |
8929 | cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); |
8930 | mdb_cursor_copy(cdst, &mn); |
8931 | mn.mc_snum--; |
8932 | mn.mc_top--; |
8933 | /* We want mdb_rebalance to find mn when doing fixups */ |
8934 | WITH_CURSOR_TRACKING(mn, |
8935 | rc = mdb_update_key(&mn, &key)); |
8936 | if (rc) |
8937 | return rc; |
8938 | } |
8939 | if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { |
8940 | MDB_val nullkey; |
8941 | indx_t ix = cdst->mc_ki[cdst->mc_top]; |
8942 | nullkey.mv_size = 0; |
8943 | cdst->mc_ki[cdst->mc_top] = 0; |
8944 | rc = mdb_update_key(cdst, &nullkey); |
8945 | cdst->mc_ki[cdst->mc_top] = ix; |
8946 | mdb_cassert(cdst, rc == MDB_SUCCESS); |
8947 | } |
8948 | } |
8949 | |
8950 | return MDB_SUCCESS; |
8951 | } |
8952 | |
8953 | /** Merge one page into another. |
8954 | * The nodes from the page pointed to by \b csrc will |
8955 | * be copied to the page pointed to by \b cdst and then |
8956 | * the \b csrc page will be freed. |
8957 | * @param[in] csrc Cursor pointing to the source page. |
8958 | * @param[in] cdst Cursor pointing to the destination page. |
8959 | * @return 0 on success, non-zero on failure. |
8960 | */ |
8961 | static int |
8962 | mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) |
8963 | { |
8964 | MDB_page *psrc, *pdst; |
8965 | MDB_node *srcnode; |
8966 | MDB_val key, data; |
8967 | unsigned nkeys; |
8968 | int rc; |
8969 | indx_t i, j; |
8970 | |
8971 | psrc = csrc->mc_pg[csrc->mc_top]; |
8972 | pdst = cdst->mc_pg[cdst->mc_top]; |
8973 | |
8974 | DPRINTF(("merging page %" Yu" into %" Yu, psrc->mp_pgno, pdst->mp_pgno)); |
8975 | |
8976 | mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ |
8977 | mdb_cassert(csrc, cdst->mc_snum > 1); |
8978 | |
8979 | /* Mark dst as dirty. */ |
8980 | if ((rc = mdb_page_touch(cdst))) |
8981 | return rc; |
8982 | |
8983 | /* get dst page again now that we've touched it. */ |
8984 | pdst = cdst->mc_pg[cdst->mc_top]; |
8985 | |
8986 | /* Move all nodes from src to dst. |
8987 | */ |
8988 | j = nkeys = NUMKEYS(pdst); |
8989 | if (IS_LEAF2(psrc)) { |
8990 | key.mv_size = csrc->mc_db->md_pad; |
8991 | key.mv_data = METADATA(psrc); |
8992 | for (i = 0; i < NUMKEYS(psrc); i++, j++) { |
8993 | rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); |
8994 | if (rc != MDB_SUCCESS) |
8995 | return rc; |
8996 | key.mv_data = (char *)key.mv_data + key.mv_size; |
8997 | } |
8998 | } else { |
8999 | for (i = 0; i < NUMKEYS(psrc); i++, j++) { |
9000 | srcnode = NODEPTR(psrc, i); |
9001 | if (i == 0 && IS_BRANCH(psrc)) { |
9002 | MDB_cursor mn; |
9003 | MDB_node *s2; |
9004 | mdb_cursor_copy(csrc, &mn); |
9005 | mn.mc_xcursor = NULL; |
9006 | /* must find the lowest key below src */ |
9007 | rc = mdb_page_search_lowest(&mn); |
9008 | if (rc) |
9009 | return rc; |
9010 | if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { |
9011 | key.mv_size = mn.mc_db->md_pad; |
9012 | key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); |
9013 | } else { |
9014 | s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); |
9015 | key.mv_size = NODEKSZ(s2); |
9016 | key.mv_data = NODEKEY(s2); |
9017 | } |
9018 | } else { |
9019 | key.mv_size = srcnode->mn_ksize; |
9020 | key.mv_data = NODEKEY(srcnode); |
9021 | } |
9022 | |
9023 | data.mv_size = NODEDSZ(srcnode); |
9024 | data.mv_data = NODEDATA(srcnode); |
9025 | rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); |
9026 | if (rc != MDB_SUCCESS) |
9027 | return rc; |
9028 | } |
9029 | } |
9030 | |
9031 | DPRINTF(("dst page %" Yu" now has %u keys (%.1f%% filled)" , |
9032 | pdst->mp_pgno, NUMKEYS(pdst), |
9033 | (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10)); |
9034 | |
9035 | /* Unlink the src page from parent and add to free list. |
9036 | */ |
9037 | csrc->mc_top--; |
9038 | mdb_node_del(csrc, 0); |
9039 | if (csrc->mc_ki[csrc->mc_top] == 0) { |
9040 | key.mv_size = 0; |
9041 | rc = mdb_update_key(csrc, &key); |
9042 | if (rc) { |
9043 | csrc->mc_top++; |
9044 | return rc; |
9045 | } |
9046 | } |
9047 | csrc->mc_top++; |
9048 | |
9049 | psrc = csrc->mc_pg[csrc->mc_top]; |
9050 | /* If not operating on FreeDB, allow this page to be reused |
9051 | * in this txn. Otherwise just add to free list. |
9052 | */ |
9053 | rc = mdb_page_loose(csrc, psrc); |
9054 | if (rc) |
9055 | return rc; |
9056 | if (IS_LEAF(psrc)) |
9057 | csrc->mc_db->md_leaf_pages--; |
9058 | else |
9059 | csrc->mc_db->md_branch_pages--; |
9060 | { |
9061 | /* Adjust other cursors pointing to mp */ |
9062 | MDB_cursor *m2, *m3; |
9063 | MDB_dbi dbi = csrc->mc_dbi; |
9064 | unsigned int top = csrc->mc_top; |
9065 | |
9066 | for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { |
9067 | if (csrc->mc_flags & C_SUB) |
9068 | m3 = &m2->mc_xcursor->mx_cursor; |
9069 | else |
9070 | m3 = m2; |
9071 | if (m3 == csrc) continue; |
9072 | if (m3->mc_snum < csrc->mc_snum) continue; |
9073 | if (m3->mc_pg[top] == psrc) { |
9074 | m3->mc_pg[top] = pdst; |
9075 | m3->mc_ki[top] += nkeys; |
9076 | m3->mc_ki[top-1] = cdst->mc_ki[top-1]; |
9077 | } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] && |
9078 | m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { |
9079 | m3->mc_ki[top-1]--; |
9080 | } |
9081 | if (IS_LEAF(psrc)) |
9082 | XCURSOR_REFRESH(m3, top, m3->mc_pg[top]); |
9083 | } |
9084 | } |
9085 | { |
9086 | unsigned int snum = cdst->mc_snum; |
9087 | uint16_t depth = cdst->mc_db->md_depth; |
9088 | mdb_cursor_pop(cdst); |
9089 | rc = mdb_rebalance(cdst); |
9090 | /* Did the tree height change? */ |
9091 | if (depth != cdst->mc_db->md_depth) |
9092 | snum += cdst->mc_db->md_depth - depth; |
9093 | cdst->mc_snum = snum; |
9094 | cdst->mc_top = snum-1; |
9095 | } |
9096 | return rc; |
9097 | } |
9098 | |
9099 | /** Copy the contents of a cursor. |
9100 | * @param[in] csrc The cursor to copy from. |
9101 | * @param[out] cdst The cursor to copy to. |
9102 | */ |
9103 | static void |
9104 | mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) |
9105 | { |
9106 | unsigned int i; |
9107 | |
9108 | cdst->mc_txn = csrc->mc_txn; |
9109 | cdst->mc_dbi = csrc->mc_dbi; |
9110 | cdst->mc_db = csrc->mc_db; |
9111 | cdst->mc_dbx = csrc->mc_dbx; |
9112 | cdst->mc_snum = csrc->mc_snum; |
9113 | cdst->mc_top = csrc->mc_top; |
9114 | cdst->mc_flags = csrc->mc_flags; |
9115 | MC_SET_OVPG(cdst, MC_OVPG(csrc)); |
9116 | |
9117 | for (i=0; i<csrc->mc_snum; i++) { |
9118 | cdst->mc_pg[i] = csrc->mc_pg[i]; |
9119 | cdst->mc_ki[i] = csrc->mc_ki[i]; |
9120 | } |
9121 | } |
9122 | |
9123 | /** Rebalance the tree after a delete operation. |
9124 | * @param[in] mc Cursor pointing to the page where rebalancing |
9125 | * should begin. |
9126 | * @return 0 on success, non-zero on failure. |
9127 | */ |
9128 | static int |
9129 | mdb_rebalance(MDB_cursor *mc) |
9130 | { |
9131 | MDB_node *node; |
9132 | int rc, fromleft; |
9133 | unsigned int ptop, minkeys, thresh; |
9134 | MDB_cursor mn; |
9135 | indx_t oldki; |
9136 | |
9137 | if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { |
9138 | minkeys = 2; |
9139 | thresh = 1; |
9140 | } else { |
9141 | minkeys = 1; |
9142 | thresh = FILL_THRESHOLD; |
9143 | } |
9144 | DPRINTF(("rebalancing %s page %" Yu" (has %u keys, %.1f%% full)" , |
9145 | IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch" , |
9146 | mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), |
9147 | (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); |
9148 | |
9149 | if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && |
9150 | NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { |
9151 | DPRINTF(("no need to rebalance page %" Yu", above fill threshold" , |
9152 | mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); |
9153 | return MDB_SUCCESS; |
9154 | } |
9155 | |
9156 | if (mc->mc_snum < 2) { |
9157 | MDB_page *mp = mc->mc_pg[0]; |
9158 | if (IS_SUBP(mp)) { |
9159 | DPUTS("Can't rebalance a subpage, ignoring" ); |
9160 | return MDB_SUCCESS; |
9161 | } |
9162 | if (NUMKEYS(mp) == 0) { |
9163 | DPUTS("tree is completely empty" ); |
9164 | mc->mc_db->md_root = P_INVALID; |
9165 | mc->mc_db->md_depth = 0; |
9166 | mc->mc_db->md_leaf_pages = 0; |
9167 | rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); |
9168 | if (rc) |
9169 | return rc; |
9170 | /* Adjust cursors pointing to mp */ |
9171 | mc->mc_snum = 0; |
9172 | mc->mc_top = 0; |
9173 | mc->mc_flags &= ~C_INITIALIZED; |
9174 | { |
9175 | MDB_cursor *m2, *m3; |
9176 | MDB_dbi dbi = mc->mc_dbi; |
9177 | |
9178 | for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { |
9179 | if (mc->mc_flags & C_SUB) |
9180 | m3 = &m2->mc_xcursor->mx_cursor; |
9181 | else |
9182 | m3 = m2; |
9183 | if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) |
9184 | continue; |
9185 | if (m3->mc_pg[0] == mp) { |
9186 | m3->mc_snum = 0; |
9187 | m3->mc_top = 0; |
9188 | m3->mc_flags &= ~C_INITIALIZED; |
9189 | } |
9190 | } |
9191 | } |
9192 | } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { |
9193 | int i; |
9194 | DPUTS("collapsing root page!" ); |
9195 | rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); |
9196 | if (rc) |
9197 | return rc; |
9198 | mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); |
9199 | rc = mdb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL); |
9200 | if (rc) |
9201 | return rc; |
9202 | mc->mc_db->md_depth--; |
9203 | mc->mc_db->md_branch_pages--; |
9204 | mc->mc_ki[0] = mc->mc_ki[1]; |
9205 | for (i = 1; i<mc->mc_db->md_depth; i++) { |
9206 | mc->mc_pg[i] = mc->mc_pg[i+1]; |
9207 | mc->mc_ki[i] = mc->mc_ki[i+1]; |
9208 | } |
9209 | { |
9210 | /* Adjust other cursors pointing to mp */ |
9211 | MDB_cursor *m2, *m3; |
9212 | MDB_dbi dbi = mc->mc_dbi; |
9213 | |
9214 | for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { |
9215 | if (mc->mc_flags & C_SUB) |
9216 | m3 = &m2->mc_xcursor->mx_cursor; |
9217 | else |
9218 | m3 = m2; |
9219 | if (m3 == mc) continue; |
9220 | if (!(m3->mc_flags & C_INITIALIZED)) |
9221 | continue; |
9222 | if (m3->mc_pg[0] == mp) { |
9223 | for (i=0; i<mc->mc_db->md_depth; i++) { |
9224 | m3->mc_pg[i] = m3->mc_pg[i+1]; |
9225 | m3->mc_ki[i] = m3->mc_ki[i+1]; |
9226 | } |
9227 | m3->mc_snum--; |
9228 | m3->mc_top--; |
9229 | } |
9230 | } |
9231 | } |
9232 | } else |
9233 | DPUTS("root page doesn't need rebalancing" ); |
9234 | return MDB_SUCCESS; |
9235 | } |
9236 | |
9237 | /* The parent (branch page) must have at least 2 pointers, |
9238 | * otherwise the tree is invalid. |
9239 | */ |
9240 | ptop = mc->mc_top-1; |
9241 | mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); |
9242 | |
9243 | /* Leaf page fill factor is below the threshold. |
9244 | * Try to move keys from left or right neighbor, or |
9245 | * merge with a neighbor page. |
9246 | */ |
9247 | |
9248 | /* Find neighbors. |
9249 | */ |
9250 | mdb_cursor_copy(mc, &mn); |
9251 | mn.mc_xcursor = NULL; |
9252 | |
9253 | oldki = mc->mc_ki[mc->mc_top]; |
9254 | if (mc->mc_ki[ptop] == 0) { |
9255 | /* We're the leftmost leaf in our parent. |
9256 | */ |
9257 | DPUTS("reading right neighbor" ); |
9258 | mn.mc_ki[ptop]++; |
9259 | node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); |
9260 | rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); |
9261 | if (rc) |
9262 | return rc; |
9263 | mn.mc_ki[mn.mc_top] = 0; |
9264 | mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); |
9265 | fromleft = 0; |
9266 | } else { |
9267 | /* There is at least one neighbor to the left. |
9268 | */ |
9269 | DPUTS("reading left neighbor" ); |
9270 | mn.mc_ki[ptop]--; |
9271 | node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); |
9272 | rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); |
9273 | if (rc) |
9274 | return rc; |
9275 | mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; |
9276 | mc->mc_ki[mc->mc_top] = 0; |
9277 | fromleft = 1; |
9278 | } |
9279 | |
9280 | DPRINTF(("found neighbor page %" Yu" (%u keys, %.1f%% full)" , |
9281 | mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), |
9282 | (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); |
9283 | |
9284 | /* If the neighbor page is above threshold and has enough keys, |
9285 | * move one key from it. Otherwise we should try to merge them. |
9286 | * (A branch page must never have less than 2 keys.) |
9287 | */ |
9288 | if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { |
9289 | rc = mdb_node_move(&mn, mc, fromleft); |
9290 | if (fromleft) { |
9291 | /* if we inserted on left, bump position up */ |
9292 | oldki++; |
9293 | } |
9294 | } else { |
9295 | if (!fromleft) { |
9296 | rc = mdb_page_merge(&mn, mc); |
9297 | } else { |
9298 | oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); |
9299 | mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; |
9300 | /* We want mdb_rebalance to find mn when doing fixups */ |
9301 | WITH_CURSOR_TRACKING(mn, |
9302 | rc = mdb_page_merge(mc, &mn)); |
9303 | mdb_cursor_copy(&mn, mc); |
9304 | } |
9305 | mc->mc_flags &= ~C_EOF; |
9306 | } |
9307 | mc->mc_ki[mc->mc_top] = oldki; |
9308 | return rc; |
9309 | } |
9310 | |
9311 | /** Complete a delete operation started by #mdb_cursor_del(). */ |
9312 | static int |
9313 | mdb_cursor_del0(MDB_cursor *mc) |
9314 | { |
9315 | int rc; |
9316 | MDB_page *mp; |
9317 | indx_t ki; |
9318 | unsigned int nkeys; |
9319 | MDB_cursor *m2, *m3; |
9320 | MDB_dbi dbi = mc->mc_dbi; |
9321 | |
9322 | ki = mc->mc_ki[mc->mc_top]; |
9323 | mp = mc->mc_pg[mc->mc_top]; |
9324 | mdb_node_del(mc, mc->mc_db->md_pad); |
9325 | mc->mc_db->md_entries--; |
9326 | { |
9327 | /* Adjust other cursors pointing to mp */ |
9328 | for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { |
9329 | m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; |
9330 | if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) |
9331 | continue; |
9332 | if (m3 == mc || m3->mc_snum < mc->mc_snum) |
9333 | continue; |
9334 | if (m3->mc_pg[mc->mc_top] == mp) { |
9335 | if (m3->mc_ki[mc->mc_top] == ki) { |
9336 | m3->mc_flags |= C_DEL; |
9337 | if (mc->mc_db->md_flags & MDB_DUPSORT) { |
9338 | /* Sub-cursor referred into dataset which is gone */ |
9339 | m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); |
9340 | } |
9341 | continue; |
9342 | } else if (m3->mc_ki[mc->mc_top] > ki) { |
9343 | m3->mc_ki[mc->mc_top]--; |
9344 | } |
9345 | XCURSOR_REFRESH(m3, mc->mc_top, mp); |
9346 | } |
9347 | } |
9348 | } |
9349 | rc = mdb_rebalance(mc); |
9350 | |
9351 | if (rc == MDB_SUCCESS) { |
9352 | /* DB is totally empty now, just bail out. |
 * Other cursor adjustments were already done
9354 | * by mdb_rebalance and aren't needed here. |
9355 | */ |
9356 | if (!mc->mc_snum) |
9357 | return rc; |
9358 | |
9359 | mp = mc->mc_pg[mc->mc_top]; |
9360 | nkeys = NUMKEYS(mp); |
9361 | |
9362 | /* Adjust other cursors pointing to mp */ |
9363 | for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { |
9364 | m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; |
9365 | if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) |
9366 | continue; |
9367 | if (m3->mc_snum < mc->mc_snum) |
9368 | continue; |
9369 | if (m3->mc_pg[mc->mc_top] == mp) { |
9370 | /* if m3 points past last node in page, find next sibling */ |
9371 | if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { |
9372 | if (m3->mc_ki[mc->mc_top] >= nkeys) { |
9373 | rc = mdb_cursor_sibling(m3, 1); |
9374 | if (rc == MDB_NOTFOUND) { |
9375 | m3->mc_flags |= C_EOF; |
9376 | rc = MDB_SUCCESS; |
9377 | continue; |
9378 | } |
9379 | } |
9380 | if (mc->mc_db->md_flags & MDB_DUPSORT) { |
9381 | MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); |
/* If this node has dupdata, it may need to be reinitialized
 * because its data has moved.
 * If the xcursor was not initialized, it must be reinitialized.
 * Else if the node points to a sub-DB, nothing is needed.
 * Else (xcursor initialized, not a sub-DB) mc_pg[0] needs resetting.
 */
9388 | if (node->mn_flags & F_DUPDATA) { |
9389 | if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { |
9390 | if (!(node->mn_flags & F_SUBDATA)) |
9391 | m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); |
9392 | } else { |
9393 | mdb_xcursor_init1(m3, node); |
9394 | m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; |
9395 | } |
9396 | } |
9397 | } |
9398 | } |
9399 | } |
9400 | } |
9401 | mc->mc_flags |= C_DEL; |
9402 | } |
9403 | |
9404 | if (rc) |
9405 | mc->mc_txn->mt_flags |= MDB_TXN_ERROR; |
9406 | return rc; |
9407 | } |
9408 | |
9409 | int |
9410 | mdb_del(MDB_txn *txn, MDB_dbi dbi, |
9411 | MDB_val *key, MDB_val *data) |
9412 | { |
9413 | if (!key || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
9414 | return EINVAL; |
9415 | |
9416 | if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) |
9417 | return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; |
9418 | |
9419 | if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { |
9420 | /* must ignore any data */ |
9421 | data = NULL; |
9422 | } |
9423 | |
9424 | return mdb_del0(txn, dbi, key, data, 0); |
9425 | } |
9426 | |
9427 | static int |
9428 | mdb_del0(MDB_txn *txn, MDB_dbi dbi, |
9429 | MDB_val *key, MDB_val *data, unsigned flags) |
9430 | { |
9431 | MDB_cursor mc; |
9432 | MDB_xcursor mx; |
9433 | MDB_cursor_op op; |
9434 | MDB_val rdata, *xdata; |
9435 | int rc, exact = 0; |
9436 | DKBUF; |
9437 | |
9438 | DPRINTF(("====> delete db %u key [%s]" , dbi, DKEY(key))); |
9439 | |
9440 | mdb_cursor_init(&mc, txn, dbi, &mx); |
9441 | |
9442 | if (data) { |
9443 | op = MDB_GET_BOTH; |
9444 | rdata = *data; |
9445 | xdata = &rdata; |
9446 | } else { |
9447 | op = MDB_SET; |
9448 | xdata = NULL; |
9449 | flags |= MDB_NODUPDATA; |
9450 | } |
9451 | rc = mdb_cursor_set(&mc, key, xdata, op, &exact); |
9452 | if (rc == 0) { |
9453 | /* let mdb_page_split know about this cursor if needed: |
9454 | * delete will trigger a rebalance; if it needs to move |
9455 | * a node from one page to another, it will have to |
9456 | * update the parent's separator key(s). If the new sepkey |
9457 | * is larger than the current one, the parent page may |
9458 | * run out of space, triggering a split. We need this |
9459 | * cursor to be consistent until the end of the rebalance. |
9460 | */ |
9461 | mc.mc_next = txn->mt_cursors[dbi]; |
9462 | txn->mt_cursors[dbi] = &mc; |
9463 | rc = mdb_cursor_del(&mc, flags); |
9464 | txn->mt_cursors[dbi] = mc.mc_next; |
9465 | } |
9466 | return rc; |
9467 | } |
9468 | |
9469 | /** Split a page and insert a new node. |
9470 | * Set #MDB_TXN_ERROR on failure. |
9471 | * @param[in,out] mc Cursor pointing to the page and desired insertion index. |
9472 | * The cursor will be updated to point to the actual page and index where |
9473 | * the node got inserted after the split. |
9474 | * @param[in] newkey The key for the newly inserted node. |
9475 | * @param[in] newdata The data for the newly inserted node. |
9476 | * @param[in] newpgno The page number, if the new node is a branch node. |
9477 | * @param[in] nflags The #NODE_ADD_FLAGS for the new node. |
9478 | * @return 0 on success, non-zero on failure. |
9479 | */ |
9480 | static int |
9481 | mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, |
9482 | unsigned int nflags) |
9483 | { |
9484 | unsigned int flags; |
9485 | int rc = MDB_SUCCESS, new_root = 0, did_split = 0; |
9486 | indx_t newindx; |
9487 | pgno_t pgno = 0; |
9488 | int i, j, split_indx, nkeys, pmax; |
9489 | MDB_env *env = mc->mc_txn->mt_env; |
9490 | MDB_node *node; |
9491 | MDB_val sepkey, rkey, xdata, *rdata = &xdata; |
9492 | MDB_page *copy = NULL; |
9493 | MDB_page *mp, *rp, *pp; |
9494 | int ptop; |
9495 | MDB_cursor mn; |
9496 | DKBUF; |
9497 | |
9498 | mp = mc->mc_pg[mc->mc_top]; |
9499 | newindx = mc->mc_ki[mc->mc_top]; |
9500 | nkeys = NUMKEYS(mp); |
9501 | |
9502 | DPRINTF(("-----> splitting %s page %" Yu" and adding [%s] at index %i/%i",
9503 | IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
9504 | DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); |
9505 | |
9506 | /* Create a right sibling. */ |
9507 | if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) |
9508 | return rc; |
9509 | rp->mp_pad = mp->mp_pad; |
9510 | DPRINTF(("new right sibling: page %" Yu, rp->mp_pgno)); |
9511 | |
9512 | /* Usually when splitting the root page, the cursor |
9513 | * height is 1. But when called from mdb_update_key, |
9514 | * the cursor height may be greater because it walks |
9515 | * up the stack while finding the branch slot to update. |
9516 | */ |
9517 | if (mc->mc_top < 1) { |
9518 | if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) |
9519 | goto done; |
9520 | /* shift current top to make room for new parent */ |
9521 | for (i=mc->mc_snum; i>0; i--) { |
9522 | mc->mc_pg[i] = mc->mc_pg[i-1]; |
9523 | mc->mc_ki[i] = mc->mc_ki[i-1]; |
9524 | } |
9525 | mc->mc_pg[0] = pp; |
9526 | mc->mc_ki[0] = 0; |
9527 | mc->mc_db->md_root = pp->mp_pgno; |
9528 | DPRINTF(("root split! new root = %" Yu, pp->mp_pgno)); |
9529 | new_root = mc->mc_db->md_depth++; |
9530 | |
9531 | /* Add left (implicit) pointer. */ |
9532 | if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { |
9533 | /* undo the pre-push */ |
9534 | mc->mc_pg[0] = mc->mc_pg[1]; |
9535 | mc->mc_ki[0] = mc->mc_ki[1]; |
9536 | mc->mc_db->md_root = mp->mp_pgno; |
9537 | mc->mc_db->md_depth--; |
9538 | goto done; |
9539 | } |
9540 | mc->mc_snum++; |
9541 | mc->mc_top++; |
9542 | ptop = 0; |
9543 | } else { |
9544 | ptop = mc->mc_top-1; |
9545 | DPRINTF(("parent branch page is %" Yu, mc->mc_pg[ptop]->mp_pgno)); |
9546 | } |
9547 | |
9548 | mdb_cursor_copy(mc, &mn); |
9549 | mn.mc_xcursor = NULL; |
9550 | mn.mc_pg[mn.mc_top] = rp; |
9551 | mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; |
9552 | |
9553 | if (nflags & MDB_APPEND) { |
9554 | mn.mc_ki[mn.mc_top] = 0; |
9555 | sepkey = *newkey; |
9556 | split_indx = newindx; |
9557 | nkeys = 0; |
9558 | } else { |
9559 | |
9560 | split_indx = (nkeys+1) / 2; |
9561 | |
9562 | if (IS_LEAF2(rp)) { |
9563 | char *split, *ins; |
9564 | int x; |
9565 | unsigned int lsize, rsize, ksize; |
9566 | /* Move half of the keys to the right sibling */ |
9567 | x = mc->mc_ki[mc->mc_top] - split_indx; |
9568 | ksize = mc->mc_db->md_pad; |
9569 | split = LEAF2KEY(mp, split_indx, ksize); |
9570 | rsize = (nkeys - split_indx) * ksize; |
9571 | lsize = (nkeys - split_indx) * sizeof(indx_t); |
9572 | mp->mp_lower -= lsize; |
9573 | rp->mp_lower += lsize; |
9574 | mp->mp_upper += rsize - lsize; |
9575 | rp->mp_upper -= rsize - lsize; |
9576 | sepkey.mv_size = ksize; |
9577 | if (newindx == split_indx) { |
9578 | sepkey.mv_data = newkey->mv_data; |
9579 | } else { |
9580 | sepkey.mv_data = split; |
9581 | } |
9582 | if (x<0) { |
9583 | ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); |
9584 | memcpy(rp->mp_ptrs, split, rsize); |
9585 | sepkey.mv_data = rp->mp_ptrs; |
9586 | memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); |
9587 | memcpy(ins, newkey->mv_data, ksize); |
9588 | mp->mp_lower += sizeof(indx_t); |
9589 | mp->mp_upper -= ksize - sizeof(indx_t); |
9590 | } else { |
9591 | if (x) |
9592 | memcpy(rp->mp_ptrs, split, x * ksize); |
9593 | ins = LEAF2KEY(rp, x, ksize); |
9594 | memcpy(ins, newkey->mv_data, ksize); |
9595 | memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); |
9596 | rp->mp_lower += sizeof(indx_t); |
9597 | rp->mp_upper -= ksize - sizeof(indx_t); |
9598 | mc->mc_ki[mc->mc_top] = x; |
9599 | } |
9600 | } else { |
9601 | int psize, nsize, k; |
9602 | /* Maximum free space in an empty page */ |
9603 | pmax = env->me_psize - PAGEHDRSZ; |
9604 | if (IS_LEAF(mp)) |
9605 | nsize = mdb_leaf_size(env, newkey, newdata); |
9606 | else |
9607 | nsize = mdb_branch_size(env, newkey); |
9608 | nsize = EVEN(nsize); |
9609 | |
9610 | /* grab a page to hold a temporary copy */ |
9611 | copy = mdb_page_malloc(mc->mc_txn, 1); |
9612 | if (copy == NULL) { |
9613 | rc = ENOMEM; |
9614 | goto done; |
9615 | } |
9616 | copy->mp_pgno = mp->mp_pgno; |
9617 | copy->mp_flags = mp->mp_flags; |
9618 | copy->mp_lower = (PAGEHDRSZ-PAGEBASE); |
9619 | copy->mp_upper = env->me_psize - PAGEBASE; |
9620 | |
9621 | /* prepare to insert */ |
9622 | for (i=0, j=0; i<nkeys; i++) { |
9623 | if (i == newindx) { |
9624 | copy->mp_ptrs[j++] = 0; |
9625 | } |
9626 | copy->mp_ptrs[j++] = mp->mp_ptrs[i]; |
9627 | } |
9628 | |
9629 | /* When items are relatively large the split point needs |
9630 | * to be checked, because being off-by-one will make the |
9631 | * difference between success or failure in mdb_node_add. |
9632 | * |
9633 | * It's also relevant if a page happens to be laid out |
9634 | * such that one half of its nodes are all "small" and |
9635 | * the other half of its nodes are "large." If the new |
9636 | * item is also "large" and falls on the half with |
9637 | * "large" nodes, it also may not fit. |
9638 | * |
9639 | * As a final tweak, if the new item goes on the last |
9640 | * spot on the page (and thus, onto the new page), bias |
9641 | * the split so the new page is emptier than the old page. |
9642 | * This yields better packing during sequential inserts. |
9643 | */ |
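/* Informal trace of the bias above: during a sequential load without
 * MDB_APPEND, each insert has newindx == nkeys, so the scan below walks
 * i = 0..nkeys-1 accumulating the sizes of the existing nodes. Those
 * nodes already fit in the page, so the loop typically ends at i == k-j
 * and split_indx lands near nkeys: mp stays densely packed while rp
 * starts out nearly empty, ready for the next sequential inserts.
 */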
9644 | if (nkeys < 32 || nsize > pmax/16 || newindx >= nkeys) { |
9645 | /* Find split point */ |
9646 | psize = 0; |
9647 | if (newindx <= split_indx || newindx >= nkeys) { |
9648 | i = 0; j = 1; |
9649 | k = newindx >= nkeys ? nkeys : split_indx+1+IS_LEAF(mp); |
9650 | } else { |
9651 | i = nkeys; j = -1; |
9652 | k = split_indx-1; |
9653 | } |
9654 | for (; i!=k; i+=j) { |
9655 | if (i == newindx) { |
9656 | psize += nsize; |
9657 | node = NULL; |
9658 | } else { |
9659 | node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); |
9660 | psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); |
9661 | if (IS_LEAF(mp)) { |
9662 | if (F_ISSET(node->mn_flags, F_BIGDATA)) |
9663 | psize += sizeof(pgno_t); |
9664 | else |
9665 | psize += NODEDSZ(node); |
9666 | } |
9667 | psize = EVEN(psize); |
9668 | } |
9669 | if (psize > pmax || i == k-j) { |
9670 | split_indx = i + (j<0); |
9671 | break; |
9672 | } |
9673 | } |
9674 | } |
9675 | if (split_indx == newindx) { |
9676 | sepkey.mv_size = newkey->mv_size; |
9677 | sepkey.mv_data = newkey->mv_data; |
9678 | } else { |
9679 | node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); |
9680 | sepkey.mv_size = node->mn_ksize; |
9681 | sepkey.mv_data = NODEKEY(node); |
9682 | } |
9683 | } |
9684 | } |
9685 | |
9686 | DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey)));
9687 | |
9688 | /* Copy separator key to the parent. |
9689 | */ |
9690 | if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { |
9691 | int snum = mc->mc_snum; |
9692 | mn.mc_snum--; |
9693 | mn.mc_top--; |
9694 | did_split = 1; |
9695 | /* We want other splits to find mn when doing fixups */ |
9696 | WITH_CURSOR_TRACKING(mn, |
9697 | rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); |
9698 | if (rc) |
9699 | goto done; |
9700 | |
9701 | /* root split? */ |
9702 | if (mc->mc_snum > snum) { |
9703 | ptop++; |
9704 | } |
9705 | /* Right page might now have changed parent. |
9706 | * Check if left page also changed parent. |
9707 | */ |
9708 | if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && |
9709 | mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { |
9710 | for (i=0; i<ptop; i++) { |
9711 | mc->mc_pg[i] = mn.mc_pg[i]; |
9712 | mc->mc_ki[i] = mn.mc_ki[i]; |
9713 | } |
9714 | mc->mc_pg[ptop] = mn.mc_pg[ptop]; |
9715 | if (mn.mc_ki[ptop]) { |
9716 | mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; |
9717 | } else { |
9718 | /* find right page's left sibling */ |
9719 | mc->mc_ki[ptop] = mn.mc_ki[ptop]; |
9720 | rc = mdb_cursor_sibling(mc, 0); |
9721 | } |
9722 | } |
9723 | } else { |
9724 | mn.mc_top--; |
9725 | rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); |
9726 | mn.mc_top++; |
9727 | } |
9728 | if (rc != MDB_SUCCESS) { |
9729 | if (rc == MDB_NOTFOUND) /* improper mdb_cursor_sibling() result */ |
9730 | rc = MDB_PROBLEM; |
9731 | goto done; |
9732 | } |
9733 | if (nflags & MDB_APPEND) { |
9734 | mc->mc_pg[mc->mc_top] = rp; |
9735 | mc->mc_ki[mc->mc_top] = 0; |
9736 | rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); |
9737 | if (rc) |
9738 | goto done; |
9739 | for (i=0; i<mc->mc_top; i++) |
9740 | mc->mc_ki[i] = mn.mc_ki[i]; |
9741 | } else if (!IS_LEAF2(mp)) { |
9742 | /* Move nodes */ |
9743 | mc->mc_pg[mc->mc_top] = rp; |
9744 | i = split_indx; |
9745 | j = 0; |
9746 | do { |
9747 | if (i == newindx) { |
9748 | rkey.mv_data = newkey->mv_data; |
9749 | rkey.mv_size = newkey->mv_size; |
9750 | if (IS_LEAF(mp)) { |
9751 | rdata = newdata; |
9752 | } else |
9753 | pgno = newpgno; |
9754 | flags = nflags; |
9755 | /* Update index for the new key. */ |
9756 | mc->mc_ki[mc->mc_top] = j; |
9757 | } else { |
9758 | node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); |
9759 | rkey.mv_data = NODEKEY(node); |
9760 | rkey.mv_size = node->mn_ksize; |
9761 | if (IS_LEAF(mp)) { |
9762 | xdata.mv_data = NODEDATA(node); |
9763 | xdata.mv_size = NODEDSZ(node); |
9764 | rdata = &xdata; |
9765 | } else |
9766 | pgno = NODEPGNO(node); |
9767 | flags = node->mn_flags; |
9768 | } |
9769 | |
9770 | if (!IS_LEAF(mp) && j == 0) { |
9771 | /* First branch index doesn't need key data. */ |
9772 | rkey.mv_size = 0; |
9773 | } |
9774 | |
9775 | rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); |
9776 | if (rc) |
9777 | goto done; |
9778 | if (i == nkeys) { |
9779 | i = 0; |
9780 | j = 0; |
9781 | mc->mc_pg[mc->mc_top] = copy; |
9782 | } else { |
9783 | i++; |
9784 | j++; |
9785 | } |
9786 | } while (i != split_indx); |
9787 | |
9788 | nkeys = NUMKEYS(copy); |
9789 | for (i=0; i<nkeys; i++) |
9790 | mp->mp_ptrs[i] = copy->mp_ptrs[i]; |
9791 | mp->mp_lower = copy->mp_lower; |
9792 | mp->mp_upper = copy->mp_upper; |
9793 | memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), |
9794 | env->me_psize - copy->mp_upper - PAGEBASE); |
9795 | |
9796 | /* reset back to original page */ |
9797 | if (newindx < split_indx) { |
9798 | mc->mc_pg[mc->mc_top] = mp; |
9799 | } else { |
9800 | mc->mc_pg[mc->mc_top] = rp; |
9801 | mc->mc_ki[ptop]++; |
9802 | /* Make sure mc_ki is still valid. |
9803 | */ |
9804 | if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && |
9805 | mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { |
9806 | for (i=0; i<=ptop; i++) { |
9807 | mc->mc_pg[i] = mn.mc_pg[i]; |
9808 | mc->mc_ki[i] = mn.mc_ki[i]; |
9809 | } |
9810 | } |
9811 | } |
9812 | if (nflags & MDB_RESERVE) { |
9813 | node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
9814 | if (!(node->mn_flags & F_BIGDATA)) |
9815 | newdata->mv_data = NODEDATA(node); |
9816 | } |
9817 | } else { |
9818 | if (newindx >= split_indx) { |
9819 | mc->mc_pg[mc->mc_top] = rp; |
9820 | mc->mc_ki[ptop]++; |
9821 | /* Make sure mc_ki is still valid. |
9822 | */ |
9823 | if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && |
9824 | mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { |
9825 | for (i=0; i<=ptop; i++) { |
9826 | mc->mc_pg[i] = mn.mc_pg[i]; |
9827 | mc->mc_ki[i] = mn.mc_ki[i]; |
9828 | } |
9829 | } |
9830 | } |
9831 | } |
9832 | |
9833 | { |
9834 | /* Adjust other cursors pointing to mp */ |
9835 | MDB_cursor *m2, *m3; |
9836 | MDB_dbi dbi = mc->mc_dbi; |
9837 | nkeys = NUMKEYS(mp); |
9838 | |
9839 | for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { |
9840 | if (mc->mc_flags & C_SUB) |
9841 | m3 = &m2->mc_xcursor->mx_cursor; |
9842 | else |
9843 | m3 = m2; |
9844 | if (m3 == mc) |
9845 | continue; |
9846 | if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) |
9847 | continue; |
9848 | if (new_root) { |
9849 | int k; |
9850 | /* sub cursors may be on different DB */ |
9851 | if (m3->mc_pg[0] != mp) |
9852 | continue; |
9853 | /* root split */ |
9854 | for (k=new_root; k>=0; k--) { |
9855 | m3->mc_ki[k+1] = m3->mc_ki[k]; |
9856 | m3->mc_pg[k+1] = m3->mc_pg[k]; |
9857 | } |
9858 | if (m3->mc_ki[0] >= nkeys) { |
9859 | m3->mc_ki[0] = 1; |
9860 | } else { |
9861 | m3->mc_ki[0] = 0; |
9862 | } |
9863 | m3->mc_pg[0] = mc->mc_pg[0]; |
9864 | m3->mc_snum++; |
9865 | m3->mc_top++; |
9866 | } |
9867 | if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { |
9868 | if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) |
9869 | m3->mc_ki[mc->mc_top]++; |
9870 | if (m3->mc_ki[mc->mc_top] >= nkeys) { |
9871 | m3->mc_pg[mc->mc_top] = rp; |
9872 | m3->mc_ki[mc->mc_top] -= nkeys; |
9873 | for (i=0; i<mc->mc_top; i++) { |
9874 | m3->mc_ki[i] = mn.mc_ki[i]; |
9875 | m3->mc_pg[i] = mn.mc_pg[i]; |
9876 | } |
9877 | } |
9878 | } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && |
9879 | m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { |
9880 | m3->mc_ki[ptop]++; |
9881 | } |
9882 | if (IS_LEAF(mp)) |
9883 | XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]); |
9884 | } |
9885 | } |
9886 | DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
9887 | |
9888 | done: |
9889 | if (copy) /* tmp page */ |
9890 | mdb_page_free(env, copy); |
9891 | if (rc) |
9892 | mc->mc_txn->mt_flags |= MDB_TXN_ERROR; |
9893 | return rc; |
9894 | } |
9895 | |
9896 | int |
9897 | mdb_put(MDB_txn *txn, MDB_dbi dbi, |
9898 | MDB_val *key, MDB_val *data, unsigned int flags) |
9899 | { |
9900 | MDB_cursor mc; |
9901 | MDB_xcursor mx; |
9902 | int rc; |
9903 | |
9904 | if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
9905 | return EINVAL; |
9906 | |
9907 | if (flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) |
9908 | return EINVAL; |
9909 | |
9910 | if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) |
9911 | return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; |
9912 | |
9913 | mdb_cursor_init(&mc, txn, dbi, &mx); |
9914 | mc.mc_next = txn->mt_cursors[dbi]; |
9915 | txn->mt_cursors[dbi] = &mc; |
9916 | rc = mdb_cursor_put(&mc, key, data, flags); |
9917 | txn->mt_cursors[dbi] = mc.mc_next; |
9918 | return rc; |
9919 | } |
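
/* Illustrative sketch, compiled out: a minimal mdb_put()/mdb_get() round
 * trip against the main DB, assuming the environment was created and
 * opened elsewhere. The function and key/value names are invented.
 */
#if 0
static int example_put_get(MDB_env *env)
{
	MDB_txn *txn;
	MDB_dbi dbi;
	MDB_val key, data, out;
	int rc;

	if ((rc = mdb_txn_begin(env, NULL, 0, &txn)) != 0)
		return rc;
	if ((rc = mdb_dbi_open(txn, NULL, 0, &dbi)) != 0)
		goto fail;

	key.mv_size = sizeof("greeting")-1;
	key.mv_data = "greeting";
	data.mv_size = sizeof("hello")-1;
	data.mv_data = "hello";
	/* MDB_NOOVERWRITE returns MDB_KEYEXIST instead of replacing
	 * an existing value.
	 */
	if ((rc = mdb_put(txn, dbi, &key, &data, MDB_NOOVERWRITE)) != 0)
		goto fail;
	if ((rc = mdb_get(txn, dbi, &key, &out)) != 0)
		goto fail;
	return mdb_txn_commit(txn);
fail:
	mdb_txn_abort(txn);
	return rc;
}
#endif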
9920 | |
9921 | #ifndef MDB_WBUF |
9922 | #define MDB_WBUF (1024*1024) |
9923 | #endif |
9924 | #define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ |
9925 | |
9926 | /** State needed for a double-buffering compacting copy. */ |
9927 | typedef struct mdb_copy { |
9928 | MDB_env *mc_env; |
9929 | MDB_txn *mc_txn; |
9930 | pthread_mutex_t mc_mutex; |
9931 | pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ |
9932 | char *mc_wbuf[2]; |
9933 | char *mc_over[2]; |
9934 | int mc_wlen[2]; |
9935 | int mc_olen[2]; |
9936 | pgno_t mc_next_pgno; |
9937 | HANDLE mc_fd; |
9938 | int mc_toggle; /**< Buffer number in provider */ |
9939 | int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ |
9940 | /** Error code. Never cleared if set. Both threads can set nonzero |
9941 | * to fail the copy. Not mutex-protected, LMDB expects atomic int. |
9942 | */ |
9943 | volatile int mc_error; |
9944 | } mdb_copy; |
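
/* Informal summary of the handshake between the two threads: the
 * tree-walking (provider) thread fills mc_wbuf[mc_toggle], then calls
 * mdb_env_cthr_toggle(my, 1), which bumps mc_new, signals mc_cond and
 * blocks while (mc_new & 2), i.e. while both buffers are still queued.
 * mdb_env_copythr() drains one buffer (plus any overflow-page tail in
 * mc_over/mc_olen), decrements mc_new and signals back, so at most two
 * buffers are ever outstanding. A final mdb_env_cthr_toggle(my, 1|MDB_EOF)
 * flushes the last buffer and tells the writer to exit.
 */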
9945 | |
9946 | /** Dedicated writer thread for compacting copy. */ |
9947 | static THREAD_RET ESECT CALL_CONV |
9948 | mdb_env_copythr(void *arg) |
9949 | { |
9950 | mdb_copy *my = arg; |
9951 | char *ptr; |
9952 | int toggle = 0, wsize, rc; |
9953 | #ifdef _WIN32 |
9954 | DWORD len; |
9955 | #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) |
9956 | #else |
9957 | int len; |
9958 | #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) |
9959 | #ifdef SIGPIPE |
9960 | sigset_t set; |
9961 | sigemptyset(&set); |
9962 | sigaddset(&set, SIGPIPE); |
9963 | if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) |
9964 | my->mc_error = rc; |
9965 | #endif |
9966 | #endif |
9967 | |
9968 | pthread_mutex_lock(&my->mc_mutex); |
9969 | for(;;) { |
9970 | while (!my->mc_new) |
9971 | pthread_cond_wait(&my->mc_cond, &my->mc_mutex); |
9972 | if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ |
9973 | break; |
9974 | wsize = my->mc_wlen[toggle]; |
9975 | ptr = my->mc_wbuf[toggle]; |
9976 | again: |
9977 | rc = MDB_SUCCESS; |
9978 | while (wsize > 0 && !my->mc_error) { |
9979 | DO_WRITE(rc, my->mc_fd, ptr, wsize, len); |
9980 | if (!rc) { |
9981 | rc = ErrCode(); |
9982 | #if defined(SIGPIPE) && !defined(_WIN32) |
9983 | if (rc == EPIPE) { |
9984 | /* Collect the pending SIGPIPE, otherwise at least OS X |
9985 | * gives it to the process on thread-exit (ITS#8504). |
9986 | */ |
9987 | int tmp; |
9988 | sigwait(&set, &tmp); |
9989 | } |
9990 | #endif |
9991 | break; |
9992 | } else if (len > 0) { |
9993 | rc = MDB_SUCCESS; |
9994 | ptr += len; |
9995 | wsize -= len; |
9996 | continue; |
9997 | } else { |
9998 | rc = EIO; |
9999 | break; |
10000 | } |
10001 | } |
10002 | if (rc) { |
10003 | my->mc_error = rc; |
10004 | } |
10005 | /* If there's an overflow page tail, write it too */ |
10006 | if (my->mc_olen[toggle]) { |
10007 | wsize = my->mc_olen[toggle]; |
10008 | ptr = my->mc_over[toggle]; |
10009 | my->mc_olen[toggle] = 0; |
10010 | goto again; |
10011 | } |
10012 | my->mc_wlen[toggle] = 0; |
10013 | toggle ^= 1; |
10014 | /* Return the empty buffer to provider */ |
10015 | my->mc_new--; |
10016 | pthread_cond_signal(&my->mc_cond); |
10017 | } |
10018 | pthread_mutex_unlock(&my->mc_mutex); |
10019 | return (THREAD_RET)0; |
10020 | #undef DO_WRITE |
10021 | } |
10022 | |
10023 | /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. |
10024 | * |
10025 | * @param[in] my control structure. |
10026 | * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). |
10027 | */ |
10028 | static int ESECT |
10029 | mdb_env_cthr_toggle(mdb_copy *my, int adjust) |
10030 | { |
10031 | pthread_mutex_lock(&my->mc_mutex); |
10032 | my->mc_new += adjust; |
10033 | pthread_cond_signal(&my->mc_cond); |
10034 | while (my->mc_new & 2) /* both buffers in use */ |
10035 | pthread_cond_wait(&my->mc_cond, &my->mc_mutex); |
10036 | pthread_mutex_unlock(&my->mc_mutex); |
10037 | |
10038 | my->mc_toggle ^= (adjust & 1); |
10039 | /* Both threads reset mc_wlen, to be safe from threading errors */ |
10040 | my->mc_wlen[my->mc_toggle] = 0; |
10041 | return my->mc_error; |
10042 | } |
10043 | |
10044 | /** Depth-first tree traversal for compacting copy. |
10045 | * @param[in] my control structure. |
10046 | * @param[in,out] pg database root. |
10047 | * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. |
10048 | */ |
10049 | static int ESECT |
10050 | mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) |
10051 | { |
10052 | MDB_cursor mc = {0}; |
10053 | MDB_node *ni; |
10054 | MDB_page *mo, *mp, *leaf; |
10055 | char *buf, *ptr; |
10056 | int rc, toggle; |
10057 | unsigned int i; |
10058 | |
10059 | /* Empty DB, nothing to do */ |
10060 | if (*pg == P_INVALID) |
10061 | return MDB_SUCCESS; |
10062 | |
10063 | mc.mc_snum = 1; |
10064 | mc.mc_txn = my->mc_txn; |
10065 | mc.mc_flags = my->mc_txn->mt_flags & (C_ORIG_RDONLY|C_WRITEMAP); |
10066 | |
10067 | rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); |
10068 | if (rc) |
10069 | return rc; |
10070 | rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); |
10071 | if (rc) |
10072 | return rc; |
10073 | |
10074 | /* Make cursor pages writable */ |
10075 | buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum); |
10076 | if (buf == NULL) |
10077 | return ENOMEM; |
10078 | |
10079 | for (i=0; i<mc.mc_top; i++) { |
10080 | mdb_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize); |
10081 | mc.mc_pg[i] = (MDB_page *)ptr; |
10082 | ptr += my->mc_env->me_psize; |
10083 | } |
10084 | |
10085 | /* This is writable space for a leaf page. Usually not needed. */ |
10086 | leaf = (MDB_page *)ptr; |
10087 | |
10088 | toggle = my->mc_toggle; |
10089 | while (mc.mc_snum > 0) { |
10090 | unsigned n; |
10091 | mp = mc.mc_pg[mc.mc_top]; |
10092 | n = NUMKEYS(mp); |
10093 | |
10094 | if (IS_LEAF(mp)) { |
10095 | if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { |
10096 | for (i=0; i<n; i++) { |
10097 | ni = NODEPTR(mp, i); |
10098 | if (ni->mn_flags & F_BIGDATA) { |
10099 | MDB_page *omp; |
10100 | pgno_t pg; |
10101 | |
10102 | /* Need writable leaf */ |
10103 | if (mp != leaf) { |
10104 | mc.mc_pg[mc.mc_top] = leaf; |
10105 | mdb_page_copy(leaf, mp, my->mc_env->me_psize); |
10106 | mp = leaf; |
10107 | ni = NODEPTR(mp, i); |
10108 | } |
10109 | |
10110 | memcpy(&pg, NODEDATA(ni), sizeof(pg)); |
10111 | memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); |
10112 | rc = mdb_page_get(&mc, pg, &omp, NULL); |
10113 | if (rc) |
10114 | goto done; |
10115 | if (my->mc_wlen[toggle] >= MDB_WBUF) { |
10116 | rc = mdb_env_cthr_toggle(my, 1); |
10117 | if (rc) |
10118 | goto done; |
10119 | toggle = my->mc_toggle; |
10120 | } |
10121 | mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); |
10122 | memcpy(mo, omp, my->mc_env->me_psize); |
10123 | mo->mp_pgno = my->mc_next_pgno; |
10124 | my->mc_next_pgno += omp->mp_pages; |
10125 | my->mc_wlen[toggle] += my->mc_env->me_psize; |
10126 | if (omp->mp_pages > 1) { |
10127 | my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1); |
10128 | my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; |
10129 | rc = mdb_env_cthr_toggle(my, 1); |
10130 | if (rc) |
10131 | goto done; |
10132 | toggle = my->mc_toggle; |
10133 | } |
10134 | } else if (ni->mn_flags & F_SUBDATA) { |
10135 | MDB_db db; |
10136 | |
10137 | /* Need writable leaf */ |
10138 | if (mp != leaf) { |
10139 | mc.mc_pg[mc.mc_top] = leaf; |
10140 | mdb_page_copy(leaf, mp, my->mc_env->me_psize); |
10141 | mp = leaf; |
10142 | ni = NODEPTR(mp, i); |
10143 | } |
10144 | |
10145 | memcpy(&db, NODEDATA(ni), sizeof(db)); |
10146 | my->mc_toggle = toggle; |
10147 | rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA); |
10148 | if (rc) |
10149 | goto done; |
10150 | toggle = my->mc_toggle; |
10151 | memcpy(NODEDATA(ni), &db, sizeof(db)); |
10152 | } |
10153 | } |
10154 | } |
10155 | } else { |
10156 | mc.mc_ki[mc.mc_top]++; |
10157 | if (mc.mc_ki[mc.mc_top] < n) { |
10158 | pgno_t pg; |
10159 | again: |
10160 | ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); |
10161 | pg = NODEPGNO(ni); |
10162 | rc = mdb_page_get(&mc, pg, &mp, NULL); |
10163 | if (rc) |
10164 | goto done; |
10165 | mc.mc_top++; |
10166 | mc.mc_snum++; |
10167 | mc.mc_ki[mc.mc_top] = 0; |
10168 | if (IS_BRANCH(mp)) { |
10169 | /* Whenever we advance to a sibling branch page, |
10170 | * we must proceed all the way down to its first leaf. |
10171 | */ |
10172 | mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize); |
10173 | goto again; |
10174 | } else |
10175 | mc.mc_pg[mc.mc_top] = mp; |
10176 | continue; |
10177 | } |
10178 | } |
10179 | if (my->mc_wlen[toggle] >= MDB_WBUF) { |
10180 | rc = mdb_env_cthr_toggle(my, 1); |
10181 | if (rc) |
10182 | goto done; |
10183 | toggle = my->mc_toggle; |
10184 | } |
10185 | mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); |
10186 | mdb_page_copy(mo, mp, my->mc_env->me_psize); |
10187 | mo->mp_pgno = my->mc_next_pgno++; |
10188 | my->mc_wlen[toggle] += my->mc_env->me_psize; |
10189 | if (mc.mc_top) { |
10190 | /* Update parent if there is one */ |
10191 | ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]); |
10192 | SETPGNO(ni, mo->mp_pgno); |
10193 | mdb_cursor_pop(&mc); |
10194 | } else { |
10195 | /* Otherwise we're done */ |
10196 | *pg = mo->mp_pgno; |
10197 | break; |
10198 | } |
10199 | } |
10200 | done: |
10201 | free(buf); |
10202 | return rc; |
10203 | } |
10204 | |
10205 | /** Copy environment with compaction. */ |
10206 | static int ESECT |
10207 | mdb_env_copyfd1(MDB_env *env, HANDLE fd) |
10208 | { |
10209 | MDB_meta *mm; |
10210 | MDB_page *mp; |
10211 | mdb_copy my = {0}; |
10212 | MDB_txn *txn = NULL; |
10213 | pthread_t thr; |
10214 | pgno_t root, new_root; |
10215 | int rc = MDB_SUCCESS; |
10216 | |
10217 | #ifdef _WIN32 |
10218 | if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) || |
10219 | !(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) { |
10220 | rc = ErrCode(); |
10221 | goto done; |
10222 | } |
10223 | my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize); |
10224 | if (my.mc_wbuf[0] == NULL) { |
10225 | /* _aligned_malloc() sets errno, but we use Windows error codes */ |
10226 | rc = ERROR_NOT_ENOUGH_MEMORY; |
10227 | goto done; |
10228 | } |
10229 | #else |
10230 | if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0) |
10231 | return rc; |
10232 | if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0) |
10233 | goto done2; |
10234 | #ifdef HAVE_MEMALIGN |
10235 | my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); |
10236 | if (my.mc_wbuf[0] == NULL) { |
10237 | rc = errno; |
10238 | goto done; |
10239 | } |
10240 | #else |
10241 | { |
10242 | void *p; |
10243 | if ((rc = posix_memalign(&p, env->me_os_psize, MDB_WBUF*2)) != 0) |
10244 | goto done; |
10245 | my.mc_wbuf[0] = p; |
10246 | } |
10247 | #endif |
10248 | #endif |
10249 | memset(my.mc_wbuf[0], 0, MDB_WBUF*2); |
10250 | my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; |
10251 | my.mc_next_pgno = NUM_METAS; |
10252 | my.mc_env = env; |
10253 | my.mc_fd = fd; |
10254 | rc = THREAD_CREATE(thr, mdb_env_copythr, &my); |
10255 | if (rc) |
10256 | goto done; |
10257 | |
10258 | rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); |
10259 | if (rc) |
10260 | goto finish; |
10261 | |
10262 | mp = (MDB_page *)my.mc_wbuf[0]; |
10263 | memset(mp, 0, NUM_METAS * env->me_psize); |
10264 | mp->mp_pgno = 0; |
10265 | mp->mp_flags = P_META; |
10266 | mm = (MDB_meta *)METADATA(mp); |
10267 | mdb_env_init_meta0(env, mm); |
10268 | mm->mm_address = env->me_metas[0]->mm_address; |
10269 | |
10270 | mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize); |
10271 | mp->mp_pgno = 1; |
10272 | mp->mp_flags = P_META; |
10273 | *(MDB_meta *)METADATA(mp) = *mm; |
10274 | mm = (MDB_meta *)METADATA(mp); |
10275 | |
10276 | /* Set metapage 1 with current main DB */ |
10277 | root = new_root = txn->mt_dbs[MAIN_DBI].md_root; |
10278 | if (root != P_INVALID) { |
10279 | /* Count free pages + freeDB pages. Subtract from last_pg |
10280 | * to find the new last_pg, which also becomes the new root. |
10281 | */ |
10282 | MDB_ID freecount = 0; |
10283 | MDB_cursor mc; |
10284 | MDB_val key, data; |
10285 | mdb_cursor_init(&mc, txn, FREE_DBI, NULL); |
10286 | while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) |
10287 | freecount += *(MDB_ID *)data.mv_data; |
10288 | if (rc != MDB_NOTFOUND) |
10289 | goto finish; |
10290 | freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + |
10291 | txn->mt_dbs[FREE_DBI].md_leaf_pages + |
10292 | txn->mt_dbs[FREE_DBI].md_overflow_pages; |
10293 | |
10294 | new_root = txn->mt_next_pgno - 1 - freecount; |
10295 | mm->mm_last_pg = new_root; |
10296 | mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; |
10297 | mm->mm_dbs[MAIN_DBI].md_root = new_root; |
10298 | } else { |
10299 | /* When the DB is empty, handle it specially to |
10300 | * fix any breakage like page leaks from ITS#8174. |
10301 | */ |
10302 | mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; |
10303 | } |
10304 | if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { |
10305 | mm->mm_txnid = 1; /* use metapage 1 */ |
10306 | } |
10307 | |
10308 | my.mc_wlen[0] = env->me_psize * NUM_METAS; |
10309 | my.mc_txn = txn; |
10310 | rc = mdb_env_cwalk(&my, &root, 0); |
10311 | if (rc == MDB_SUCCESS && root != new_root) { |
10312 | rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */ |
10313 | } |
10314 | |
10315 | finish: |
10316 | if (rc) |
10317 | my.mc_error = rc; |
10318 | mdb_env_cthr_toggle(&my, 1 | MDB_EOF); |
10319 | rc = THREAD_FINISH(thr); |
10320 | mdb_txn_abort(txn); |
10321 | |
10322 | done: |
10323 | #ifdef _WIN32 |
10324 | if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]); |
10325 | if (my.mc_cond) CloseHandle(my.mc_cond); |
10326 | if (my.mc_mutex) CloseHandle(my.mc_mutex); |
10327 | #else |
10328 | free(my.mc_wbuf[0]); |
10329 | pthread_cond_destroy(&my.mc_cond); |
10330 | done2: |
10331 | pthread_mutex_destroy(&my.mc_mutex); |
10332 | #endif |
10333 | return rc ? rc : my.mc_error; |
10334 | } |
10335 | |
10336 | /** Copy environment as-is. */ |
10337 | static int ESECT |
10338 | mdb_env_copyfd0(MDB_env *env, HANDLE fd) |
10339 | { |
10340 | MDB_txn *txn = NULL; |
10341 | mdb_mutexref_t wmutex = NULL; |
10342 | int rc; |
10343 | mdb_size_t wsize, w3; |
10344 | char *ptr; |
10345 | #ifdef _WIN32 |
10346 | DWORD len, w2; |
10347 | #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) |
10348 | #else |
10349 | ssize_t len; |
10350 | size_t w2; |
10351 | #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) |
10352 | #endif |
10353 | |
10354 | /* Do the lock/unlock of the reader mutex before starting the |
10355 | * write txn. Otherwise other read txns could block writers. |
10356 | */ |
10357 | rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); |
10358 | if (rc) |
10359 | return rc; |
10360 | |
10361 | if (env->me_txns) { |
10362 | /* We must start the actual read txn after blocking writers */ |
10363 | mdb_txn_end(txn, MDB_END_RESET_TMP); |
10364 | |
10365 | /* Temporarily block writers until we snapshot the meta pages */ |
10366 | wmutex = env->me_wmutex; |
10367 | if (LOCK_MUTEX(rc, env, wmutex)) |
10368 | goto leave; |
10369 | |
10370 | rc = mdb_txn_renew0(txn); |
10371 | if (rc) { |
10372 | UNLOCK_MUTEX(wmutex); |
10373 | goto leave; |
10374 | } |
10375 | } |
10376 | |
10377 | wsize = env->me_psize * NUM_METAS; |
10378 | ptr = env->me_map; |
10379 | w2 = wsize; |
10380 | while (w2 > 0) { |
10381 | DO_WRITE(rc, fd, ptr, w2, len); |
10382 | if (!rc) { |
10383 | rc = ErrCode(); |
10384 | break; |
10385 | } else if (len > 0) { |
10386 | rc = MDB_SUCCESS; |
10387 | ptr += len; |
10388 | w2 -= len; |
10389 | continue; |
10390 | } else { |
10391 | /* Non-blocking or async handles are not supported */ |
10392 | rc = EIO; |
10393 | break; |
10394 | } |
10395 | } |
10396 | if (wmutex) |
10397 | UNLOCK_MUTEX(wmutex); |
10398 | |
10399 | if (rc) |
10400 | goto leave; |
10401 | |
10402 | w3 = txn->mt_next_pgno * env->me_psize; |
10403 | { |
10404 | mdb_size_t fsize = 0; |
10405 | if ((rc = mdb_fsize(env->me_fd, &fsize))) |
10406 | goto leave; |
10407 | if (w3 > fsize) |
10408 | w3 = fsize; |
10409 | } |
10410 | wsize = w3 - wsize; |
10411 | while (wsize > 0) { |
10412 | if (wsize > MAX_WRITE) |
10413 | w2 = MAX_WRITE; |
10414 | else |
10415 | w2 = wsize; |
10416 | DO_WRITE(rc, fd, ptr, w2, len); |
10417 | if (!rc) { |
10418 | rc = ErrCode(); |
10419 | break; |
10420 | } else if (len > 0) { |
10421 | rc = MDB_SUCCESS; |
10422 | ptr += len; |
10423 | wsize -= len; |
10424 | continue; |
10425 | } else { |
10426 | rc = EIO; |
10427 | break; |
10428 | } |
10429 | } |
10430 | |
10431 | leave: |
10432 | mdb_txn_abort(txn); |
10433 | return rc; |
10434 | } |
10435 | |
10436 | int ESECT |
10437 | mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags) |
10438 | { |
10439 | if (flags & MDB_CP_COMPACT) |
10440 | return mdb_env_copyfd1(env, fd); |
10441 | else |
10442 | return mdb_env_copyfd0(env, fd); |
10443 | } |
10444 | |
10445 | int ESECT |
10446 | mdb_env_copyfd(MDB_env *env, HANDLE fd) |
10447 | { |
10448 | return mdb_env_copyfd2(env, fd, 0); |
10449 | } |
10450 | |
10451 | int ESECT |
10452 | mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) |
10453 | { |
10454 | int rc; |
10455 | MDB_name fname; |
10456 | HANDLE newfd = INVALID_HANDLE_VALUE; |
10457 | |
10458 | rc = mdb_fname_init(path, env->me_flags | MDB_NOLOCK, &fname); |
10459 | if (rc == MDB_SUCCESS) { |
10460 | rc = mdb_fopen(env, &fname, MDB_O_COPY, 0666, &newfd); |
10461 | mdb_fname_destroy(fname); |
10462 | } |
10463 | if (rc == MDB_SUCCESS) { |
10464 | rc = mdb_env_copyfd2(env, newfd, flags); |
10465 | if (close(newfd) < 0 && rc == MDB_SUCCESS) |
10466 | rc = ErrCode(); |
10467 | } |
10468 | return rc; |
10469 | } |
10470 | |
10471 | int ESECT |
10472 | mdb_env_copy(MDB_env *env, const char *path) |
10473 | { |
10474 | return mdb_env_copy2(env, path, 0); |
10475 | } |
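
/* Illustrative sketch, compiled out: a plain backup next to a compacting
 * one. The destination paths are invented; the destination follows the
 * same MDB_NOSUBDIR convention as the source environment (a directory by
 * default, a single file with MDB_NOSUBDIR).
 */
#if 0
static int example_backup(MDB_env *env)
{
	int rc;

	/* Byte-for-byte snapshot of the environment */
	if ((rc = mdb_env_copy(env, "/tmp/lmdb-backup")) != 0)
		return rc;
	/* Compacting copy: free pages are omitted and the remaining
	 * pages renumbered, so the output can be much smaller.
	 */
	return mdb_env_copy2(env, "/tmp/lmdb-backup-compact", MDB_CP_COMPACT);
}
#endif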
10476 | |
10477 | int ESECT |
10478 | mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) |
10479 | { |
10480 | if (flag & ~CHANGEABLE) |
10481 | return EINVAL; |
10482 | if (onoff) |
10483 | env->me_flags |= flag; |
10484 | else |
10485 | env->me_flags &= ~flag; |
10486 | return MDB_SUCCESS; |
10487 | } |
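
/* Illustrative sketch, compiled out: toggling a runtime-changeable flag.
 * MDB_NOSYNC is one of the CHANGEABLE flags; anything outside CHANGEABLE
 * makes mdb_env_set_flags() return EINVAL. The function name is invented.
 */
#if 0
static int example_bulk_load_flags(MDB_env *env)
{
	unsigned int flags;
	int rc;

	/* Relax durability for a bulk load... */
	if ((rc = mdb_env_set_flags(env, MDB_NOSYNC, 1)) != 0)
		return rc;
	/* ... load data here ... */
	/* ...then restore durability and verify */
	if ((rc = mdb_env_set_flags(env, MDB_NOSYNC, 0)) != 0)
		return rc;
	if ((rc = mdb_env_get_flags(env, &flags)) != 0)
		return rc;
	return (flags & MDB_NOSYNC) ? EINVAL : MDB_SUCCESS;
}
#endif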
10488 | |
10489 | int ESECT |
10490 | mdb_env_get_flags(MDB_env *env, unsigned int *arg) |
10491 | { |
10492 | if (!env || !arg) |
10493 | return EINVAL; |
10494 | |
10495 | *arg = env->me_flags & (CHANGEABLE|CHANGELESS); |
10496 | return MDB_SUCCESS; |
10497 | } |
10498 | |
10499 | int ESECT |
10500 | mdb_env_set_userctx(MDB_env *env, void *ctx) |
10501 | { |
10502 | if (!env) |
10503 | return EINVAL; |
10504 | env->me_userctx = ctx; |
10505 | return MDB_SUCCESS; |
10506 | } |
10507 | |
10508 | void * ESECT |
10509 | mdb_env_get_userctx(MDB_env *env) |
10510 | { |
10511 | return env ? env->me_userctx : NULL; |
10512 | } |
10513 | |
10514 | int ESECT |
10515 | mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) |
10516 | { |
10517 | if (!env) |
10518 | return EINVAL; |
10519 | #ifndef NDEBUG |
10520 | env->me_assert_func = func; |
10521 | #endif |
10522 | return MDB_SUCCESS; |
10523 | } |
10524 | |
10525 | int ESECT |
10526 | mdb_env_get_path(MDB_env *env, const char **arg) |
10527 | { |
10528 | if (!env || !arg) |
10529 | return EINVAL; |
10530 | |
10531 | *arg = env->me_path; |
10532 | return MDB_SUCCESS; |
10533 | } |
10534 | |
10535 | int ESECT |
10536 | mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) |
10537 | { |
10538 | if (!env || !arg) |
10539 | return EINVAL; |
10540 | |
10541 | *arg = env->me_fd; |
10542 | return MDB_SUCCESS; |
10543 | } |
10544 | |
10545 | /** Common code for #mdb_stat() and #mdb_env_stat(). |
10546 | * @param[in] env the environment to operate in. |
10547 | * @param[in] db the #MDB_db record containing the stats to return. |
10548 | * @param[out] arg the address of an #MDB_stat structure to receive the stats. |
10549 | * @return 0, this function always succeeds. |
10550 | */ |
10551 | static int ESECT |
10552 | mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) |
10553 | { |
10554 | arg->ms_psize = env->me_psize; |
10555 | arg->ms_depth = db->md_depth; |
10556 | arg->ms_branch_pages = db->md_branch_pages; |
10557 | arg->ms_leaf_pages = db->md_leaf_pages; |
10558 | arg->ms_overflow_pages = db->md_overflow_pages; |
10559 | arg->ms_entries = db->md_entries; |
10560 | |
10561 | return MDB_SUCCESS; |
10562 | } |
10563 | |
10564 | int ESECT |
10565 | mdb_env_stat(MDB_env *env, MDB_stat *arg) |
10566 | { |
10567 | MDB_meta *meta; |
10568 | |
10569 | if (env == NULL || arg == NULL) |
10570 | return EINVAL; |
10571 | |
10572 | meta = mdb_env_pick_meta(env); |
10573 | |
10574 | return mdb_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); |
10575 | } |
10576 | |
10577 | int ESECT |
10578 | mdb_env_info(MDB_env *env, MDB_envinfo *arg) |
10579 | { |
10580 | MDB_meta *meta; |
10581 | |
10582 | if (env == NULL || arg == NULL) |
10583 | return EINVAL; |
10584 | |
10585 | meta = mdb_env_pick_meta(env); |
10586 | arg->me_mapaddr = meta->mm_address; |
10587 | arg->me_last_pgno = meta->mm_last_pg; |
10588 | arg->me_last_txnid = meta->mm_txnid; |
10589 | |
10590 | arg->me_mapsize = env->me_mapsize; |
10591 | arg->me_maxreaders = env->me_maxreaders; |
10592 | arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0; |
10593 | return MDB_SUCCESS; |
10594 | } |
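
/* Illustrative sketch, compiled out: reading back the values filled in
 * by mdb_stat0() and mdb_env_info() above. The function name is invented;
 * the fields are the public MDB_stat/MDB_envinfo members.
 */
#if 0
static void example_print_stats(MDB_env *env)
{
	MDB_stat st;
	MDB_envinfo info;

	if (mdb_env_stat(env, &st) == MDB_SUCCESS)
		fprintf(stderr, "depth %u, entries %zu, pagesize %u\n",
			st.ms_depth, (size_t)st.ms_entries, st.ms_psize);
	if (mdb_env_info(env, &info) == MDB_SUCCESS)
		fprintf(stderr, "mapsize %zu, last pgno %zu, readers %u/%u\n",
			(size_t)info.me_mapsize, (size_t)info.me_last_pgno,
			info.me_numreaders, info.me_maxreaders);
}
#endif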
10595 | |
10596 | /** Set the default comparison functions for a database. |
10597 | * Called immediately after a database is opened to set the defaults. |
10598 | * The user can then override them with #mdb_set_compare() or |
10599 | * #mdb_set_dupsort(). |
10600 | * @param[in] txn A transaction handle returned by #mdb_txn_begin() |
10601 | * @param[in] dbi A database handle returned by #mdb_dbi_open() |
10602 | */ |
10603 | static void |
10604 | mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi) |
10605 | { |
10606 | uint16_t f = txn->mt_dbs[dbi].md_flags; |
10607 | |
10608 | txn->mt_dbxs[dbi].md_cmp = |
10609 | (f & MDB_REVERSEKEY) ? mdb_cmp_memnr : |
10610 | (f & MDB_INTEGERKEY) ? mdb_cmp_cint : mdb_cmp_memn; |
10611 | |
10612 | txn->mt_dbxs[dbi].md_dcmp = |
10613 | !(f & MDB_DUPSORT) ? 0 : |
10614 | ((f & MDB_INTEGERDUP) |
10615 | ? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint) |
10616 | : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn)); |
10617 | } |
10618 | |
10619 | int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) |
10620 | { |
10621 | MDB_val key, data; |
10622 | MDB_dbi i; |
10623 | MDB_cursor mc; |
10624 | MDB_db dummy; |
10625 | int rc, dbflag, exact; |
10626 | unsigned int unused = 0, seq; |
10627 | char *namedup; |
10628 | size_t len; |
10629 | |
10630 | if (flags & ~VALID_FLAGS) |
10631 | return EINVAL; |
10632 | if (txn->mt_flags & MDB_TXN_BLOCKED) |
10633 | return MDB_BAD_TXN; |
10634 | |
10635 | /* main DB? */ |
10636 | if (!name) { |
10637 | *dbi = MAIN_DBI; |
10638 | if (flags & PERSISTENT_FLAGS) { |
10639 | uint16_t f2 = flags & PERSISTENT_FLAGS; |
10640 | /* make sure flag changes get committed */ |
10641 | if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { |
10642 | txn->mt_dbs[MAIN_DBI].md_flags |= f2; |
10643 | txn->mt_flags |= MDB_TXN_DIRTY; |
10644 | } |
10645 | } |
10646 | mdb_default_cmp(txn, MAIN_DBI); |
10647 | return MDB_SUCCESS; |
10648 | } |
10649 | |
10650 | if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { |
10651 | mdb_default_cmp(txn, MAIN_DBI); |
10652 | } |
10653 | |
10654 | /* Is the DB already open? */ |
10655 | len = strlen(name); |
10656 | for (i=CORE_DBS; i<txn->mt_numdbs; i++) { |
10657 | if (!txn->mt_dbxs[i].md_name.mv_size) { |
10658 | /* Remember this free slot */ |
10659 | if (!unused) unused = i; |
10660 | continue; |
10661 | } |
10662 | if (len == txn->mt_dbxs[i].md_name.mv_size && |
10663 | !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { |
10664 | *dbi = i; |
10665 | return MDB_SUCCESS; |
10666 | } |
10667 | } |
10668 | |
10669 | /* If no free slot and max hit, fail */ |
10670 | if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) |
10671 | return MDB_DBS_FULL; |
10672 | |
10673 | /* Cannot mix named databases with some mainDB flags */ |
10674 | if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) |
10675 | return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; |
10676 | |
10677 | /* Find the DB info */ |
10678 | dbflag = DB_NEW|DB_VALID|DB_USRVALID; |
10679 | exact = 0; |
10680 | key.mv_size = len; |
10681 | key.mv_data = (void *)name; |
10682 | mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); |
10683 | rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); |
10684 | if (rc == MDB_SUCCESS) { |
10685 | /* make sure this is actually a DB */ |
10686 | MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); |
10687 | if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) |
10688 | return MDB_INCOMPATIBLE; |
10689 | } else { |
10690 | if (rc != MDB_NOTFOUND || !(flags & MDB_CREATE)) |
10691 | return rc; |
10692 | if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) |
10693 | return EACCES; |
10694 | } |
10695 | |
10696 | /* Done here so we cannot fail after creating a new DB */ |
10697 | if ((namedup = strdup(name)) == NULL) |
10698 | return ENOMEM; |
10699 | |
10700 | if (rc) { |
10701 | /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ |
10702 | data.mv_size = sizeof(MDB_db); |
10703 | data.mv_data = &dummy; |
10704 | memset(&dummy, 0, sizeof(dummy)); |
10705 | dummy.md_root = P_INVALID; |
10706 | dummy.md_flags = flags & PERSISTENT_FLAGS; |
10707 | WITH_CURSOR_TRACKING(mc, |
10708 | rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA)); |
10709 | dbflag |= DB_DIRTY; |
10710 | } |
10711 | |
10712 | if (rc) { |
10713 | free(namedup); |
10714 | } else { |
10715 | /* Got info, register DBI in this txn */ |
10716 | unsigned int slot = unused ? unused : txn->mt_numdbs; |
10717 | txn->mt_dbxs[slot].md_name.mv_data = namedup; |
10718 | txn->mt_dbxs[slot].md_name.mv_size = len; |
10719 | txn->mt_dbxs[slot].md_rel = NULL; |
10720 | txn->mt_dbflags[slot] = dbflag; |
10721 | /* txn-> and env-> are the same in read txns; use a tmp
10722 | * variable to avoid an assignment whose behavior would be undefined
10723 | */
10724 | seq = ++txn->mt_env->me_dbiseqs[slot]; |
10725 | txn->mt_dbiseqs[slot] = seq; |
10726 | |
10727 | memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); |
10728 | *dbi = slot; |
10729 | mdb_default_cmp(txn, slot); |
10730 | if (!unused) { |
10731 | txn->mt_numdbs++; |
10732 | } |
10733 | } |
10734 | |
10735 | return rc; |
10736 | } |
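
/* Illustrative sketch, compiled out: opening (and creating if needed) a
 * named sub-database. The "accounts" name and the function name are
 * invented. mdb_env_set_maxdbs() must have been called before
 * mdb_env_open() or named databases cannot be opened at all.
 */
#if 0
static int example_open_named(MDB_env *env, MDB_dbi *out)
{
	MDB_txn *txn;
	int rc;

	if ((rc = mdb_txn_begin(env, NULL, 0, &txn)) != 0)
		return rc;
	rc = mdb_dbi_open(txn, "accounts", MDB_CREATE, out);
	if (rc) {
		mdb_txn_abort(txn);
		return rc;
	}
	/* Committing makes the new sub-DB record durable; the MDB_dbi
	 * handle itself can then be reused by later transactions.
	 */
	return mdb_txn_commit(txn);
}
#endif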
10737 | |
10738 | int ESECT |
10739 | mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) |
10740 | { |
10741 | if (!arg || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) |
10742 | return EINVAL; |
10743 | |
10744 | if (txn->mt_flags & MDB_TXN_BLOCKED) |
10745 | return MDB_BAD_TXN; |
10746 | |
10747 | if (txn->mt_dbflags[dbi] & DB_STALE) { |
10748 | MDB_cursor mc; |
10749 | MDB_xcursor mx; |
10750 | /* Stale, must read the DB's root. cursor_init does it for us. */ |
10751 | mdb_cursor_init(&mc, txn, dbi, &mx); |
10752 | } |
10753 | return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); |
10754 | } |
10755 | |
10756 | void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) |
10757 | { |
10758 | char *ptr; |
10759 | if (dbi < CORE_DBS || dbi >= env->me_maxdbs) |
10760 | return; |
10761 | ptr = env->me_dbxs[dbi].md_name.mv_data; |
10762 | /* If there was no name, this was already closed */ |
10763 | if (ptr) { |
10764 | env->me_dbxs[dbi].md_name.mv_data = NULL; |
10765 | env->me_dbxs[dbi].md_name.mv_size = 0; |
10766 | env->me_dbflags[dbi] = 0; |
10767 | env->me_dbiseqs[dbi]++; |
10768 | free(ptr); |
10769 | } |
10770 | } |
10771 | |
10772 | int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags) |
10773 | { |
10774 | /* We could return the flags for the FREE_DBI too but what's the point? */ |
10775 | if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
10776 | return EINVAL; |
10777 | *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; |
10778 | return MDB_SUCCESS; |
10779 | } |
10780 | |
10781 | /** Add all the DB's pages to the free list. |
10782 | * @param[in] mc Cursor on the DB to free. |
10783 | * @param[in] subs non-zero to check for sub-DBs in this DB.
10784 | * @return 0 on success, non-zero on failure. |
10785 | */ |
10786 | static int |
10787 | mdb_drop0(MDB_cursor *mc, int subs) |
10788 | { |
10789 | int rc; |
10790 | |
10791 | rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); |
10792 | if (rc == MDB_SUCCESS) { |
10793 | MDB_txn *txn = mc->mc_txn; |
10794 | MDB_node *ni; |
10795 | MDB_cursor mx; |
10796 | unsigned int i; |
10797 | |
10798 | /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. |
10799 | * This also avoids any P_LEAF2 pages, which have no nodes. |
10800 | * Also if the DB doesn't have sub-DBs and has no overflow |
10801 | * pages, omit scanning leaves. |
10802 | */ |
10803 | if ((mc->mc_flags & C_SUB) || |
10804 | (!subs && !mc->mc_db->md_overflow_pages)) |
10805 | mdb_cursor_pop(mc); |
10806 | |
10807 | mdb_cursor_copy(mc, &mx); |
10808 | #ifdef MDB_VL32 |
10809 | /* bump refcount for mx's pages */ |
10810 | for (i=0; i<mc->mc_snum; i++) |
10811 | mdb_page_get(&mx, mc->mc_pg[i]->mp_pgno, &mx.mc_pg[i], NULL); |
10812 | #endif |
10813 | while (mc->mc_snum > 0) { |
10814 | MDB_page *mp = mc->mc_pg[mc->mc_top]; |
10815 | unsigned n = NUMKEYS(mp); |
10816 | if (IS_LEAF(mp)) { |
10817 | for (i=0; i<n; i++) { |
10818 | ni = NODEPTR(mp, i); |
10819 | if (ni->mn_flags & F_BIGDATA) { |
10820 | MDB_page *omp; |
10821 | pgno_t pg; |
10822 | memcpy(&pg, NODEDATA(ni), sizeof(pg)); |
10823 | rc = mdb_page_get(mc, pg, &omp, NULL); |
10824 | if (rc != 0) |
10825 | goto done; |
10826 | mdb_cassert(mc, IS_OVERFLOW(omp)); |
10827 | rc = mdb_midl_append_range(&txn->mt_free_pgs, |
10828 | pg, omp->mp_pages); |
10829 | if (rc) |
10830 | goto done; |
10831 | mc->mc_db->md_overflow_pages -= omp->mp_pages; |
10832 | if (!mc->mc_db->md_overflow_pages && !subs) |
10833 | break; |
10834 | } else if (subs && (ni->mn_flags & F_SUBDATA)) { |
10835 | mdb_xcursor_init1(mc, ni); |
10836 | rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); |
10837 | if (rc) |
10838 | goto done; |
10839 | } |
10840 | } |
10841 | if (!subs && !mc->mc_db->md_overflow_pages) |
10842 | goto pop; |
10843 | } else { |
10844 | if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) |
10845 | goto done; |
10846 | for (i=0; i<n; i++) { |
10847 | pgno_t pg; |
10848 | ni = NODEPTR(mp, i); |
10849 | pg = NODEPGNO(ni); |
10850 | /* free it */ |
10851 | mdb_midl_xappend(txn->mt_free_pgs, pg); |
10852 | } |
10853 | } |
10854 | if (!mc->mc_top) |
10855 | break; |
10856 | mc->mc_ki[mc->mc_top] = i; |
10857 | rc = mdb_cursor_sibling(mc, 1); |
10858 | if (rc) { |
10859 | if (rc != MDB_NOTFOUND) |
10860 | goto done; |
10861 | /* no more siblings, go back to beginning |
10862 | * of previous level. |
10863 | */ |
10864 | pop: |
10865 | mdb_cursor_pop(mc); |
10866 | mc->mc_ki[0] = 0; |
10867 | for (i=1; i<mc->mc_snum; i++) { |
10868 | mc->mc_ki[i] = 0; |
10869 | mc->mc_pg[i] = mx.mc_pg[i]; |
10870 | } |
10871 | } |
10872 | } |
10873 | /* free it */ |
10874 | rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); |
10875 | done: |
10876 | if (rc) |
10877 | txn->mt_flags |= MDB_TXN_ERROR; |
10878 | /* drop refcount for mx's pages */ |
10879 | MDB_CURSOR_UNREF(&mx, 0); |
10880 | } else if (rc == MDB_NOTFOUND) { |
10881 | rc = MDB_SUCCESS; |
10882 | } |
10883 | mc->mc_flags &= ~C_INITIALIZED; |
10884 | return rc; |
10885 | } |
10886 | |
10887 | int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) |
10888 | { |
10889 | MDB_cursor *mc, *m2; |
10890 | int rc; |
10891 | |
10892 | if ((unsigned)del > 1 || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
10893 | return EINVAL; |
10894 | |
10895 | if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) |
10896 | return EACCES; |
10897 | |
10898 | if (TXN_DBI_CHANGED(txn, dbi)) |
10899 | return MDB_BAD_DBI; |
10900 | |
10901 | rc = mdb_cursor_open(txn, dbi, &mc); |
10902 | if (rc) |
10903 | return rc; |
10904 | |
10905 | rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); |
10906 | /* Invalidate the dropped DB's cursors */ |
10907 | for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) |
10908 | m2->mc_flags &= ~(C_INITIALIZED|C_EOF); |
10909 | if (rc) |
10910 | goto leave; |
10911 | |
10912 | /* Can't delete the main DB */ |
10913 | if (del && dbi >= CORE_DBS) { |
10914 | rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); |
10915 | if (!rc) { |
10916 | txn->mt_dbflags[dbi] = DB_STALE; |
10917 | mdb_dbi_close(txn->mt_env, dbi); |
10918 | } else { |
10919 | txn->mt_flags |= MDB_TXN_ERROR; |
10920 | } |
10921 | } else { |
10922 | /* reset the DB record, mark it dirty */ |
10923 | txn->mt_dbflags[dbi] |= DB_DIRTY; |
10924 | txn->mt_dbs[dbi].md_depth = 0; |
10925 | txn->mt_dbs[dbi].md_branch_pages = 0; |
10926 | txn->mt_dbs[dbi].md_leaf_pages = 0; |
10927 | txn->mt_dbs[dbi].md_overflow_pages = 0; |
10928 | txn->mt_dbs[dbi].md_entries = 0; |
10929 | txn->mt_dbs[dbi].md_root = P_INVALID; |
10930 | |
10931 | txn->mt_flags |= MDB_TXN_DIRTY; |
10932 | } |
10933 | leave: |
10934 | mdb_cursor_close(mc); |
10935 | return rc; |
10936 | } |
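
/* Illustrative sketch, compiled out: the two mdb_drop() modes. With
 * del==0 the database is emptied but the handle stays usable; with
 * del==1 its record is also deleted from the main DB and the handle is
 * closed, so it must not be used afterwards. The function name is invented.
 */
#if 0
static int example_drop(MDB_txn *txn, MDB_dbi dbi, int delete_it)
{
	return mdb_drop(txn, dbi, delete_it ? 1 : 0);
}
#endif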
10937 | |
10938 | int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) |
10939 | { |
10940 | if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
10941 | return EINVAL; |
10942 | |
10943 | txn->mt_dbxs[dbi].md_cmp = cmp; |
10944 | return MDB_SUCCESS; |
10945 | } |
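
/* Illustrative sketch, compiled out: installing a custom key comparator.
 * The comparator and wrapper names are invented. Per the public
 * documentation the same function should be set before any data access
 * and every time the database is opened.
 */
#if 0
/* Order keys by length first, then bytewise */
static int example_cmp_len(const MDB_val *a, const MDB_val *b)
{
	if (a->mv_size != b->mv_size)
		return (a->mv_size < b->mv_size) ? -1 : 1;
	return memcmp(a->mv_data, b->mv_data, a->mv_size);
}

static int example_use_cmp(MDB_txn *txn, MDB_dbi dbi)
{
	return mdb_set_compare(txn, dbi, example_cmp_len);
}
#endif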
10946 | |
10947 | int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) |
10948 | { |
10949 | if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
10950 | return EINVAL; |
10951 | |
10952 | txn->mt_dbxs[dbi].md_dcmp = cmp; |
10953 | return MDB_SUCCESS; |
10954 | } |
10955 | |
10956 | int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) |
10957 | { |
10958 | if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
10959 | return EINVAL; |
10960 | |
10961 | txn->mt_dbxs[dbi].md_rel = rel; |
10962 | return MDB_SUCCESS; |
10963 | } |
10964 | |
10965 | int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) |
10966 | { |
10967 | if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
10968 | return EINVAL; |
10969 | |
10970 | txn->mt_dbxs[dbi].md_relctx = ctx; |
10971 | return MDB_SUCCESS; |
10972 | } |
10973 | |
10974 | int ESECT |
10975 | mdb_env_get_maxkeysize(MDB_env *env) |
10976 | { |
10977 | return ENV_MAXKEY(env); |
10978 | } |
10979 | |
10980 | int ESECT |
10981 | mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) |
10982 | { |
10983 | unsigned int i, rdrs; |
10984 | MDB_reader *mr; |
10985 | char buf[64]; |
10986 | int rc = 0, first = 1; |
10987 | |
10988 | if (!env || !func) |
10989 | return -1; |
10990 | if (!env->me_txns) { |
10991 | return func("(no reader locks)\n", ctx);
10992 | } |
10993 | rdrs = env->me_txns->mti_numreaders; |
10994 | mr = env->me_txns->mti_readers; |
10995 | for (i=0; i<rdrs; i++) { |
10996 | if (mr[i].mr_pid) { |
10997 | txnid_t txnid = mr[i].mr_txnid; |
10998 | sprintf(buf, txnid == (txnid_t)-1 ? |
10999 | "%10d %" Z"x -\n" : "%10d %" Z"x %" Yu"\n",
11000 | (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid); |
11001 | if (first) { |
11002 | first = 0; |
11003 | rc = func(" pid thread txnid\n", ctx);
11004 | if (rc < 0) |
11005 | break; |
11006 | } |
11007 | rc = func(buf, ctx); |
11008 | if (rc < 0) |
11009 | break; |
11010 | } |
11011 | } |
11012 | if (first) { |
11013 | rc = func("(no active readers)\n", ctx);
11014 | } |
11015 | return rc; |
11016 | } |
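
/* Illustrative sketch, compiled out: dumping the reader table and then
 * clearing any slots left behind by dead processes. The callback and
 * function names are invented; a negative return from the callback
 * stops the listing early.
 */
#if 0
static int example_msg(const char *msg, void *ctx)
{
	(void)ctx;
	return fputs(msg, stderr) < 0 ? -1 : 0;
}

static int example_reader_maintenance(MDB_env *env)
{
	int dead = 0, rc;

	mdb_reader_list(env, example_msg, NULL);
	rc = mdb_reader_check(env, &dead);
	if (rc == MDB_SUCCESS && dead)
		fprintf(stderr, "cleared %d stale reader slot(s)\n", dead);
	return rc;
}
#endif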
11017 | |
11018 | /** Insert pid into list if not already present. |
11019 | * return -1 if already present. |
11020 | */ |
11021 | static int ESECT |
11022 | mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid) |
11023 | { |
11024 | /* binary search of pid in list */ |
11025 | unsigned base = 0; |
11026 | unsigned cursor = 1; |
11027 | int val = 0; |
11028 | unsigned n = ids[0]; |
11029 | |
11030 | while( 0 < n ) { |
11031 | unsigned pivot = n >> 1; |
11032 | cursor = base + pivot + 1; |
11033 | val = pid - ids[cursor]; |
11034 | |
11035 | if( val < 0 ) { |
11036 | n = pivot; |
11037 | |
11038 | } else if ( val > 0 ) { |
11039 | base = cursor; |
11040 | n -= pivot + 1; |
11041 | |
11042 | } else { |
11043 | /* found, so it's a duplicate */ |
11044 | return -1; |
11045 | } |
11046 | } |
11047 | |
11048 | if( val > 0 ) { |
11049 | ++cursor; |
11050 | } |
11051 | ids[0]++; |
11052 | for (n = ids[0]; n > cursor; n--) |
11053 | ids[n] = ids[n-1]; |
11054 | ids[n] = pid; |
11055 | return 0; |
11056 | } |
11057 | |
11058 | int ESECT |
11059 | mdb_reader_check(MDB_env *env, int *dead) |
11060 | { |
11061 | if (!env) |
11062 | return EINVAL; |
11063 | if (dead) |
11064 | *dead = 0; |
11065 | return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS; |
11066 | } |
11067 | |
11068 | /** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ |
11069 | static int ESECT |
11070 | mdb_reader_check0(MDB_env *env, int rlocked, int *dead) |
11071 | { |
11072 | mdb_mutexref_t rmutex = rlocked ? NULL : env->me_rmutex; |
11073 | unsigned int i, j, rdrs; |
11074 | MDB_reader *mr; |
11075 | MDB_PID_T *pids, pid; |
11076 | int rc = MDB_SUCCESS, count = 0; |
11077 | |
11078 | rdrs = env->me_txns->mti_numreaders; |
11079 | pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); |
11080 | if (!pids) |
11081 | return ENOMEM; |
11082 | pids[0] = 0; |
11083 | mr = env->me_txns->mti_readers; |
11084 | for (i=0; i<rdrs; i++) { |
11085 | pid = mr[i].mr_pid; |
11086 | if (pid && pid != env->me_pid) { |
11087 | if (mdb_pid_insert(pids, pid) == 0) { |
11088 | if (!mdb_reader_pid(env, Pidcheck, pid)) { |
11089 | /* Stale reader found */ |
11090 | j = i; |
11091 | if (rmutex) { |
11092 | if ((rc = LOCK_MUTEX0(rmutex)) != 0) { |
11093 | if ((rc = mdb_mutex_failed(env, rmutex, rc))) |
11094 | break; |
11095 | rdrs = 0; /* the above checked all readers */ |
11096 | } else { |
11097 | /* Recheck, a new process may have reused pid */ |
11098 | if (mdb_reader_pid(env, Pidcheck, pid)) |
11099 | j = rdrs; |
11100 | } |
11101 | } |
11102 | for (; j<rdrs; j++) |
11103 | if (mr[j].mr_pid == pid) { |
11104 | DPRINTF(("clear stale reader pid %u txn %" Yd, |
11105 | (unsigned) pid, mr[j].mr_txnid)); |
11106 | mr[j].mr_pid = 0; |
11107 | count++; |
11108 | } |
11109 | if (rmutex) |
11110 | UNLOCK_MUTEX(rmutex); |
11111 | } |
11112 | } |
11113 | } |
11114 | } |
11115 | free(pids); |
11116 | if (dead) |
11117 | *dead = count; |
11118 | return rc; |
11119 | } |
11120 | |
11121 | #ifdef MDB_ROBUST_SUPPORTED |
11122 | /** Handle #LOCK_MUTEX0() failure. |
11123 | * Try to repair the lock file if the mutex owner died. |
11124 | * @param[in] env the environment handle |
11125 | * @param[in] mutex LOCK_MUTEX0() mutex |
11126 | * @param[in] rc LOCK_MUTEX0() error (nonzero) |
11127 | * @return 0 on success with the mutex locked, or an error code on failure. |
11128 | */ |
11129 | static int ESECT |
11130 | mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc) |
11131 | { |
11132 | int rlocked, rc2; |
11133 | MDB_meta *meta; |
11134 | |
11135 | if (rc == MDB_OWNERDEAD) { |
11136 | /* We own the mutex. Clean up after dead previous owner. */ |
11137 | rc = MDB_SUCCESS; |
11138 | rlocked = (mutex == env->me_rmutex); |
11139 | if (!rlocked) { |
11140 | /* Keep mti_txnid updated, otherwise the next writer can
11141 | * overwrite data which the latest meta page refers to.
11142 | */
11143 | meta = mdb_env_pick_meta(env); |
11144 | env->me_txns->mti_txnid = meta->mm_txnid; |
11145 | /* env is hosed if the dead thread was ours */ |
11146 | if (env->me_txn) { |
11147 | env->me_flags |= MDB_FATAL_ERROR; |
11148 | env->me_txn = NULL; |
11149 | rc = MDB_PANIC; |
11150 | } |
11151 | } |
11152 | DPRINTF(("%cmutex owner died, %s", (rlocked ? 'r' : 'w'),
11153 | (rc ? "this process' env is hosed" : "recovering")));
11154 | rc2 = mdb_reader_check0(env, rlocked, NULL); |
11155 | if (rc2 == 0) |
11156 | rc2 = mdb_mutex_consistent(mutex); |
11157 | if (rc || (rc = rc2)) { |
11158 | DPRINTF(("LOCK_MUTEX recovery failed, %s", mdb_strerror(rc)));
11159 | UNLOCK_MUTEX(mutex); |
11160 | } |
11161 | } else { |
11162 | #ifdef _WIN32 |
11163 | rc = ErrCode(); |
11164 | #endif |
11165 | DPRINTF(("LOCK_MUTEX failed, %s", mdb_strerror(rc)));
11166 | } |
11167 | |
11168 | return rc; |
11169 | } |
11170 | #endif /* MDB_ROBUST_SUPPORTED */ |
11171 | |
11172 | #if defined(_WIN32) |
11173 | /** Convert \b src to new wchar_t[] string with room for \b xtra extra chars */ |
11174 | static int ESECT |
11175 | utf8_to_utf16(const char *src, MDB_name *dst, int xtra) |
11176 | { |
11177 | int rc, need = 0; |
11178 | wchar_t *result = NULL; |
11179 | for (;;) { /* malloc result, then fill it in */ |
11180 | need = MultiByteToWideChar(CP_UTF8, 0, src, -1, result, need); |
11181 | if (!need) { |
11182 | rc = ErrCode(); |
11183 | free(result); |
11184 | return rc; |
11185 | } |
11186 | if (!result) { |
11187 | result = malloc(sizeof(wchar_t) * (need + xtra)); |
11188 | if (!result) |
11189 | return ENOMEM; |
11190 | continue; |
11191 | } |
11192 | dst->mn_alloced = 1; |
11193 | dst->mn_len = need - 1; |
11194 | dst->mn_val = result; |
11195 | return MDB_SUCCESS; |
11196 | } |
11197 | } |
11198 | #endif /* defined(_WIN32) */ |
11199 | /** @} */ |
11200 | |