1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * fsmpage.c |
4 | * routines to search and manipulate one FSM page. |
5 | * |
6 | * |
7 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
8 | * Portions Copyright (c) 1994, Regents of the University of California |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/storage/freespace/fsmpage.c |
12 | * |
13 | * NOTES: |
14 | * |
15 | * The public functions in this file form an API that hides the internal |
16 | * structure of a FSM page. This allows freespace.c to treat each FSM page |
17 | * as a black box with SlotsPerPage "slots". fsm_set_avail() and |
18 | * fsm_get_avail() let you get/set the value of a slot, and |
19 | * fsm_search_avail() lets you search for a slot with value >= X. |
20 | * |
21 | *------------------------------------------------------------------------- |
22 | */ |
23 | #include "postgres.h" |
24 | |
25 | #include "storage/bufmgr.h" |
26 | #include "storage/fsm_internals.h" |
27 | |
28 | /* Macros to navigate the tree within a page. Root has index zero. */ |
29 | #define leftchild(x) (2 * (x) + 1) |
30 | #define rightchild(x) (2 * (x) + 2) |
31 | #define parentof(x) (((x) - 1) / 2) |
32 | |
33 | /* |
34 | * Find right neighbor of x, wrapping around within the level |
35 | */ |
36 | static int |
37 | rightneighbor(int x) |
38 | { |
39 | /* |
40 | * Move right. This might wrap around, stepping to the leftmost node at |
41 | * the next level. |
42 | */ |
43 | x++; |
44 | |
45 | /* |
46 | * Check if we stepped to the leftmost node at next level, and correct if |
47 | * so. The leftmost nodes at each level are numbered x = 2^level - 1, so |
48 | * check if (x + 1) is a power of two, using a standard |
49 | * twos-complement-arithmetic trick. |
50 | */ |
51 | if (((x + 1) & x) == 0) |
52 | x = parentof(x); |
53 | |
54 | return x; |
55 | } |
56 | |
57 | /* |
58 | * Sets the value of a slot on page. Returns true if the page was modified. |
59 | * |
60 | * The caller must hold an exclusive lock on the page. |
61 | */ |
62 | bool |
63 | fsm_set_avail(Page page, int slot, uint8 value) |
64 | { |
65 | int nodeno = NonLeafNodesPerPage + slot; |
66 | FSMPage fsmpage = (FSMPage) PageGetContents(page); |
67 | uint8 oldvalue; |
68 | |
69 | Assert(slot < LeafNodesPerPage); |
70 | |
71 | oldvalue = fsmpage->fp_nodes[nodeno]; |
72 | |
73 | /* If the value hasn't changed, we don't need to do anything */ |
74 | if (oldvalue == value && value <= fsmpage->fp_nodes[0]) |
75 | return false; |
76 | |
77 | fsmpage->fp_nodes[nodeno] = value; |
78 | |
79 | /* |
80 | * Propagate up, until we hit the root or a node that doesn't need to be |
81 | * updated. |
82 | */ |
83 | do |
84 | { |
85 | uint8 newvalue = 0; |
86 | int lchild; |
87 | int rchild; |
88 | |
89 | nodeno = parentof(nodeno); |
90 | lchild = leftchild(nodeno); |
91 | rchild = lchild + 1; |
92 | |
93 | newvalue = fsmpage->fp_nodes[lchild]; |
94 | if (rchild < NodesPerPage) |
95 | newvalue = Max(newvalue, |
96 | fsmpage->fp_nodes[rchild]); |
97 | |
98 | oldvalue = fsmpage->fp_nodes[nodeno]; |
99 | if (oldvalue == newvalue) |
100 | break; |
101 | |
102 | fsmpage->fp_nodes[nodeno] = newvalue; |
103 | } while (nodeno > 0); |
104 | |
105 | /* |
106 | * sanity check: if the new value is (still) higher than the value at the |
107 | * top, the tree is corrupt. If so, rebuild. |
108 | */ |
109 | if (value > fsmpage->fp_nodes[0]) |
110 | fsm_rebuild_page(page); |
111 | |
112 | return true; |
113 | } |
114 | |
115 | /* |
116 | * Returns the value of given slot on page. |
117 | * |
118 | * Since this is just a read-only access of a single byte, the page doesn't |
119 | * need to be locked. |
120 | */ |
121 | uint8 |
122 | fsm_get_avail(Page page, int slot) |
123 | { |
124 | FSMPage fsmpage = (FSMPage) PageGetContents(page); |
125 | |
126 | Assert(slot < LeafNodesPerPage); |
127 | |
128 | return fsmpage->fp_nodes[NonLeafNodesPerPage + slot]; |
129 | } |
130 | |
131 | /* |
132 | * Returns the value at the root of a page. |
133 | * |
134 | * Since this is just a read-only access of a single byte, the page doesn't |
135 | * need to be locked. |
136 | */ |
137 | uint8 |
138 | fsm_get_max_avail(Page page) |
139 | { |
140 | FSMPage fsmpage = (FSMPage) PageGetContents(page); |
141 | |
142 | return fsmpage->fp_nodes[0]; |
143 | } |
144 | |
145 | /* |
146 | * Searches for a slot with category at least minvalue. |
147 | * Returns slot number, or -1 if none found. |
148 | * |
149 | * The caller must hold at least a shared lock on the page, and this |
150 | * function can unlock and lock the page again in exclusive mode if it |
151 | * needs to be updated. exclusive_lock_held should be set to true if the |
152 | * caller is already holding an exclusive lock, to avoid extra work. |
153 | * |
154 | * If advancenext is false, fp_next_slot is set to point to the returned |
155 | * slot, and if it's true, to the slot after the returned slot. |
156 | */ |
157 | int |
158 | fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, |
159 | bool exclusive_lock_held) |
160 | { |
161 | Page page = BufferGetPage(buf); |
162 | FSMPage fsmpage = (FSMPage) PageGetContents(page); |
163 | int nodeno; |
164 | int target; |
165 | uint16 slot; |
166 | |
167 | restart: |
168 | |
169 | /* |
170 | * Check the root first, and exit quickly if there's no leaf with enough |
171 | * free space |
172 | */ |
173 | if (fsmpage->fp_nodes[0] < minvalue) |
174 | return -1; |
175 | |
176 | /* |
177 | * Start search using fp_next_slot. It's just a hint, so check that it's |
178 | * sane. (This also handles wrapping around when the prior call returned |
179 | * the last slot on the page.) |
180 | */ |
181 | target = fsmpage->fp_next_slot; |
182 | if (target < 0 || target >= LeafNodesPerPage) |
183 | target = 0; |
184 | target += NonLeafNodesPerPage; |
185 | |
186 | /*---------- |
187 | * Start the search from the target slot. At every step, move one |
188 | * node to the right, then climb up to the parent. Stop when we reach |
189 | * a node with enough free space (as we must, since the root has enough |
190 | * space). |
191 | * |
192 | * The idea is to gradually expand our "search triangle", that is, all |
193 | * nodes covered by the current node, and to be sure we search to the |
194 | * right from the start point. At the first step, only the target slot |
195 | * is examined. When we move up from a left child to its parent, we are |
196 | * adding the right-hand subtree of that parent to the search triangle. |
197 | * When we move right then up from a right child, we are dropping the |
198 | * current search triangle (which we know doesn't contain any suitable |
199 | * page) and instead looking at the next-larger-size triangle to its |
200 | * right. So we never look left from our original start point, and at |
201 | * each step the size of the search triangle doubles, ensuring it takes |
202 | * only log2(N) work to search N pages. |
203 | * |
204 | * The "move right" operation will wrap around if it hits the right edge |
205 | * of the tree, so the behavior is still good if we start near the right. |
206 | * Note also that the move-and-climb behavior ensures that we can't end |
207 | * up on one of the missing nodes at the right of the leaf level. |
208 | * |
209 | * For example, consider this tree: |
210 | * |
211 | * 7 |
212 | * 7 6 |
213 | * 5 7 6 5 |
214 | * 4 5 5 7 2 6 5 2 |
215 | * T |
216 | * |
217 | * Assume that the target node is the node indicated by the letter T, |
218 | * and we're searching for a node with value of 6 or higher. The search |
219 | * begins at T. At the first iteration, we move to the right, then to the |
220 | * parent, arriving at the rightmost 5. At the second iteration, we move |
221 | * to the right, wrapping around, then climb up, arriving at the 7 on the |
222 | * third level. 7 satisfies our search, so we descend down to the bottom, |
223 | * following the path of sevens. This is in fact the first suitable page |
224 | * to the right of (allowing for wraparound) our start point. |
225 | *---------- |
226 | */ |
227 | nodeno = target; |
228 | while (nodeno > 0) |
229 | { |
230 | if (fsmpage->fp_nodes[nodeno] >= minvalue) |
231 | break; |
232 | |
233 | /* |
234 | * Move to the right, wrapping around on same level if necessary, then |
235 | * climb up. |
236 | */ |
237 | nodeno = parentof(rightneighbor(nodeno)); |
238 | } |
239 | |
240 | /* |
241 | * We're now at a node with enough free space, somewhere in the middle of |
242 | * the tree. Descend to the bottom, following a path with enough free |
243 | * space, preferring to move left if there's a choice. |
244 | */ |
245 | while (nodeno < NonLeafNodesPerPage) |
246 | { |
247 | int childnodeno = leftchild(nodeno); |
248 | |
249 | if (childnodeno < NodesPerPage && |
250 | fsmpage->fp_nodes[childnodeno] >= minvalue) |
251 | { |
252 | nodeno = childnodeno; |
253 | continue; |
254 | } |
255 | childnodeno++; /* point to right child */ |
256 | if (childnodeno < NodesPerPage && |
257 | fsmpage->fp_nodes[childnodeno] >= minvalue) |
258 | { |
259 | nodeno = childnodeno; |
260 | } |
261 | else |
262 | { |
263 | /* |
264 | * Oops. The parent node promised that either left or right child |
265 | * has enough space, but neither actually did. This can happen in |
266 | * case of a "torn page", IOW if we crashed earlier while writing |
267 | * the page to disk, and only part of the page made it to disk. |
268 | * |
269 | * Fix the corruption and restart. |
270 | */ |
271 | RelFileNode rnode; |
272 | ForkNumber forknum; |
273 | BlockNumber blknum; |
274 | |
275 | BufferGetTag(buf, &rnode, &forknum, &blknum); |
276 | elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u" , |
277 | blknum, rnode.spcNode, rnode.dbNode, rnode.relNode); |
278 | |
279 | /* make sure we hold an exclusive lock */ |
280 | if (!exclusive_lock_held) |
281 | { |
282 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
283 | LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
284 | exclusive_lock_held = true; |
285 | } |
286 | fsm_rebuild_page(page); |
287 | MarkBufferDirtyHint(buf, false); |
288 | goto restart; |
289 | } |
290 | } |
291 | |
292 | /* We're now at the bottom level, at a node with enough space. */ |
293 | slot = nodeno - NonLeafNodesPerPage; |
294 | |
295 | /* |
296 | * Update the next-target pointer. Note that we do this even if we're only |
297 | * holding a shared lock, on the grounds that it's better to use a shared |
298 | * lock and get a garbled next pointer every now and then, than take the |
299 | * concurrency hit of an exclusive lock. |
300 | * |
301 | * Wrap-around is handled at the beginning of this function. |
302 | */ |
303 | fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); |
304 | |
305 | return slot; |
306 | } |
307 | |
308 | /* |
309 | * Sets the available space to zero for all slots numbered >= nslots. |
310 | * Returns true if the page was modified. |
311 | */ |
312 | bool |
313 | fsm_truncate_avail(Page page, int nslots) |
314 | { |
315 | FSMPage fsmpage = (FSMPage) PageGetContents(page); |
316 | uint8 *ptr; |
317 | bool changed = false; |
318 | |
319 | Assert(nslots >= 0 && nslots < LeafNodesPerPage); |
320 | |
321 | /* Clear all truncated leaf nodes */ |
322 | ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots]; |
323 | for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++) |
324 | { |
325 | if (*ptr != 0) |
326 | changed = true; |
327 | *ptr = 0; |
328 | } |
329 | |
330 | /* Fix upper nodes. */ |
331 | if (changed) |
332 | fsm_rebuild_page(page); |
333 | |
334 | return changed; |
335 | } |
336 | |
337 | /* |
338 | * Reconstructs the upper levels of a page. Returns true if the page |
339 | * was modified. |
340 | */ |
341 | bool |
342 | fsm_rebuild_page(Page page) |
343 | { |
344 | FSMPage fsmpage = (FSMPage) PageGetContents(page); |
345 | bool changed = false; |
346 | int nodeno; |
347 | |
348 | /* |
349 | * Start from the lowest non-leaf level, at last node, working our way |
350 | * backwards, through all non-leaf nodes at all levels, up to the root. |
351 | */ |
352 | for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--) |
353 | { |
354 | int lchild = leftchild(nodeno); |
355 | int rchild = lchild + 1; |
356 | uint8 newvalue = 0; |
357 | |
358 | /* The first few nodes we examine might have zero or one child. */ |
359 | if (lchild < NodesPerPage) |
360 | newvalue = fsmpage->fp_nodes[lchild]; |
361 | |
362 | if (rchild < NodesPerPage) |
363 | newvalue = Max(newvalue, |
364 | fsmpage->fp_nodes[rchild]); |
365 | |
366 | if (fsmpage->fp_nodes[nodeno] != newvalue) |
367 | { |
368 | fsmpage->fp_nodes[nodeno] = newvalue; |
369 | changed = true; |
370 | } |
371 | } |
372 | |
373 | return changed; |
374 | } |
375 | |