/*-------------------------------------------------------------------------
 *
 * syncscan.c
 *    heap scan synchronization support
 *
 * When multiple backends run a sequential scan on the same table, we try
 * to keep them synchronized to reduce the overall I/O needed.  The goal is
 * to read each page into shared buffer cache only once, and let all backends
 * that take part in the shared scan process the page before it falls out of
 * the cache.
 *
 * Since the "leader" in a pack of backends doing a seqscan will have to wait
 * for I/O, while the "followers" don't, there is a strong self-synchronizing
 * effect once we can get the backends examining approximately the same part
 * of the table at the same time.  Hence all that is really needed is to get
 * a new backend beginning a seqscan to begin it close to where other backends
 * are reading.  We can scan the table circularly, from block X up to the
 * end and then from block 0 to X-1, to ensure we visit all rows while still
 * participating in the common scan.
 *
 * To accomplish that, we keep track of the scan position of each table, and
 * start new scans close to where the previous scan(s) are.  We don't try to
 * do any extra synchronization to keep the scans together afterwards; some
 * scans might progress much more slowly than others, for example if the
 * results need to be transferred to the client over a slow network, and we
 * don't want such queries to slow down others.
 *
 * There can realistically only be a few large sequential scans on different
 * tables in progress at any time.  Therefore we just keep the scan positions
 * in a small LRU list which we scan every time we need to look up or update a
 * scan position.  The whole mechanism is only applied for tables exceeding
 * a threshold size (but that is not the concern of this module).
 *
 * INTERFACE ROUTINES
 *    ss_get_location    - return current scan location of a relation
 *    ss_report_location - update current scan location
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/heap/syncscan.c
 *
 *-------------------------------------------------------------------------
 */
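
/*
 * Usage sketch (illustrative only, not lifted from heapam.c): a scan that
 * wants to participate in a synchronized scan asks for a starting block,
 * walks the table circularly from there, and reports its position as it
 * goes.  The helper process_page() is hypothetical, and nblocks is assumed
 * to be greater than zero.
 *
 *    BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
 *    BlockNumber start = ss_get_location(rel, nblocks);
 *    BlockNumber blkno = start;
 *
 *    do
 *    {
 *        process_page(rel, blkno);
 *        ss_report_location(rel, blkno);
 *        blkno = (blkno + 1 < nblocks) ? blkno + 1 : 0;
 *    } while (blkno != start);
 */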
#include "postgres.h"

#include "access/heapam.h"
#include "miscadmin.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/rel.h"


/* GUC variables */
#ifdef TRACE_SYNCSCAN
bool        trace_syncscan = false;
#endif


/*
 * Size of the LRU list.
 *
 * Note: the code assumes that SYNC_SCAN_NELEM > 1.
 *
 * XXX: What's a good value? It should be large enough to hold the
 * maximum number of large tables scanned simultaneously.  But a larger value
 * means more traversing of the LRU list when starting a new scan.
 */
#define SYNC_SCAN_NELEM 20

/*
 * Interval between reports of the location of the current scan, in pages.
 *
 * Note: This should be smaller than the ring size (see buffer/freelist.c)
 * we use for bulk reads.  Otherwise a scan joining other scans might start
 * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
 * there's no guarantee that the new scan will read the page before it leaves
 * the buffer cache anyway, and on the other hand the page is most likely
 * still in the OS cache.
 */
#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
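
/*
 * With the default 8 kB BLCKSZ, for example, this works out to
 * 128 * 1024 / 8192 = 16 pages, i.e. a position report roughly every
 * 128 kB of table read.
 */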


/*
 * The scan locations structure is essentially a doubly-linked LRU with head
 * and tail pointer, but designed to hold a fixed maximum number of elements in
 * fixed-size shared memory.
 */
typedef struct ss_scan_location_t
{
    RelFileNode relfilenode;    /* identity of a relation */
    BlockNumber location;       /* last-reported location in the relation */
} ss_scan_location_t;

typedef struct ss_lru_item_t
{
    struct ss_lru_item_t *prev;
    struct ss_lru_item_t *next;
    ss_scan_location_t location;
} ss_lru_item_t;

typedef struct ss_scan_locations_t
{
    ss_lru_item_t *head;
    ss_lru_item_t *tail;
    ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER];    /* SYNC_SCAN_NELEM items */
} ss_scan_locations_t;

#define SizeOfScanLocations(N) \
    (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
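
/*
 * A sketch of what the macro accounts for (exact byte counts depend on
 * pointer size and struct padding): the fixed part holding the head and
 * tail pointers, plus N array elements of two pointers and an
 * ss_scan_location_t each.  SyncScanShmemSize() below simply evaluates it
 * for SYNC_SCAN_NELEM.
 */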

/* Pointer to struct in shared memory */
static ss_scan_locations_t *scan_locations;

/* prototypes for internal functions */
static BlockNumber ss_search(RelFileNode relfilenode,
                             BlockNumber location, bool set);


/*
 * SyncScanShmemSize --- report amount of shared memory space needed
 */
Size
SyncScanShmemSize(void)
{
    return SizeOfScanLocations(SYNC_SCAN_NELEM);
}

/*
 * SyncScanShmemInit --- initialize this module's shared memory
 */
void
SyncScanShmemInit(void)
{
    int         i;
    bool        found;

    scan_locations = (ss_scan_locations_t *)
        ShmemInitStruct("Sync Scan Locations List",
                        SizeOfScanLocations(SYNC_SCAN_NELEM),
                        &found);

    if (!IsUnderPostmaster)
    {
        /* Initialize shared memory area */
        Assert(!found);

        scan_locations->head = &scan_locations->items[0];
        scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];

        for (i = 0; i < SYNC_SCAN_NELEM; i++)
        {
            ss_lru_item_t *item = &scan_locations->items[i];

            /*
             * Initialize all slots with invalid values. As scans are started,
             * these invalid entries will fall off the LRU list and get
             * replaced with real entries.
             */
            item->location.relfilenode.spcNode = InvalidOid;
            item->location.relfilenode.dbNode = InvalidOid;
            item->location.relfilenode.relNode = InvalidOid;
            item->location.location = InvalidBlockNumber;

            item->prev = (i > 0) ?
                (&scan_locations->items[i - 1]) : NULL;
            item->next = (i < SYNC_SCAN_NELEM - 1) ?
                (&scan_locations->items[i + 1]) : NULL;
        }
    }
    else
        Assert(found);
}
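
/*
 * After initialization the list is a simple chain over the items array, with
 * the head being the most recently used slot and the tail the least recently
 * used one (a sketch for SYNC_SCAN_NELEM = 4):
 *
 *    head -> items[0] <-> items[1] <-> items[2] <-> items[3] <- tail
 *
 * All slots start out invalid; ss_search() below recycles the tail slot
 * whenever it is asked about a relfilenode that is not already in the list.
 */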

/*
 * ss_search --- search the scan_locations structure for an entry with the
 *      given relfilenode.
 *
 * If "set" is true, the location is updated to the given location.  If no
 * entry for the given relfilenode is found, it will be created at the head
 * of the list with the given location, even if "set" is false.
 *
 * In any case, the location after possible update is returned.
 *
 * Caller is responsible for having acquired suitable lock on the shared
 * data structure.
 */
static BlockNumber
ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
{
    ss_lru_item_t *item;

    item = scan_locations->head;
    for (;;)
    {
        bool        match;

        match = RelFileNodeEquals(item->location.relfilenode, relfilenode);

        if (match || item->next == NULL)
        {
            /*
             * If we reached the end of list and no match was found, take over
             * the last entry
             */
            if (!match)
            {
                item->location.relfilenode = relfilenode;
                item->location.location = location;
            }
            else if (set)
                item->location.location = location;

            /* Move the entry to the front of the LRU list */
            if (item != scan_locations->head)
            {
                /* unlink */
                if (item == scan_locations->tail)
                    scan_locations->tail = item->prev;
                item->prev->next = item->next;
                if (item->next)
                    item->next->prev = item->prev;

                /* link */
                item->prev = NULL;
                item->next = scan_locations->head;
                scan_locations->head->prev = item;
                scan_locations->head = item;
            }

            return item->location.location;
        }

        item = item->next;
    }

    /* not reached */
}
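
/*
 * ss_search() is only called while holding a suitable lock (in practice
 * SyncScanLock).  A minimal sketch of the two call patterns used by the
 * routines below ("blkno" is an illustrative variable only):
 *
 *    LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
 *    blkno = ss_search(rel->rd_node, 0, false);      (look up only)
 *    (void) ss_search(rel->rd_node, blkno, true);    (update, or insert)
 *    LWLockRelease(SyncScanLock);
 *
 * Note that ss_report_location() acquires the lock only conditionally, so
 * it may skip the update entirely.
 */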

/*
 * ss_get_location --- get the optimal starting location for scan
 *
 * Returns the last-reported location of a sequential scan on the
 * relation, or 0 if no valid location is found.
 *
 * We expect the caller has just done RelationGetNumberOfBlocks(), and
 * so that number is passed in rather than computing it again.  The result
 * is guaranteed less than relnblocks (assuming that's > 0).
 */
BlockNumber
ss_get_location(Relation rel, BlockNumber relnblocks)
{
    BlockNumber startloc;

    LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
    startloc = ss_search(rel->rd_node, 0, false);
    LWLockRelease(SyncScanLock);

    /*
     * If the location is not a valid block number for this scan, start at 0.
     *
     * This can happen if for instance a VACUUM truncated the table since the
     * location was saved.
     */
    if (startloc >= relnblocks)
        startloc = 0;

#ifdef TRACE_SYNCSCAN
    if (trace_syncscan)
        elog(LOG,
             "SYNC_SCAN: start \"%s\" (size %u) at %u",
             RelationGetRelationName(rel), relnblocks, startloc);
#endif

    return startloc;
}

/*
 * ss_report_location --- update the current scan location
 *
 * Writes an entry into the shared Sync Scan state of the form
 * (relfilenode, blocknumber), overwriting any existing entry for the
 * same relfilenode.
 */
void
ss_report_location(Relation rel, BlockNumber location)
{
#ifdef TRACE_SYNCSCAN
    if (trace_syncscan)
    {
        if ((location % 1024) == 0)
            elog(LOG,
                 "SYNC_SCAN: scanning \"%s\" at %u",
                 RelationGetRelationName(rel), location);
    }
#endif

    /*
     * To reduce lock contention, only report scan progress every N pages. For
     * the same reason, don't block if the lock isn't immediately available.
     * Missing a few updates isn't critical, it just means that a new scan
     * that wants to join the pack will start a little bit behind the head of
     * the scan.  Hopefully the pages are still in OS cache and the scan
     * catches up quickly.
     */
    if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
    {
        if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
        {
            (void) ss_search(rel->rd_node, location, true);
            LWLockRelease(SyncScanLock);
        }
#ifdef TRACE_SYNCSCAN
        else if (trace_syncscan)
            elog(LOG,
                 "SYNC_SCAN: missed update for \"%s\" at %u",
                 RelationGetRelationName(rel), location);
#endif
    }
}