1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * buffile.c |
4 | * Management of large buffered temporary files. |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | * |
9 | * IDENTIFICATION |
10 | * src/backend/storage/file/buffile.c |
11 | * |
12 | * NOTES: |
13 | * |
14 | * BufFiles provide a very incomplete emulation of stdio atop virtual Files |
15 | * (as managed by fd.c). Currently, we only support the buffered-I/O |
16 | * aspect of stdio: a read or write of the low-level File occurs only |
17 | * when the buffer is filled or emptied. This is an even bigger win |
18 | * for virtual Files than for ordinary kernel files, since reducing the |
19 | * frequency with which a virtual File is touched reduces "thrashing" |
20 | * of opening/closing file descriptors. |
21 | * |
22 | * Note that BufFile structs are allocated with palloc(), and therefore |
23 | * will go away automatically at query/transaction end. Since the underlying |
24 | * virtual Files are made with OpenTemporaryFile, all resources for |
25 | * the file are certain to be cleaned up even if processing is aborted |
26 | * by ereport(ERROR). The data structures required are made in the |
27 | * palloc context that was current when the BufFile was created, and |
28 | * any external resources such as temp files are owned by the ResourceOwner |
29 | * that was current at that time. |
30 | * |
31 | * BufFile also supports temporary files that exceed the OS file size limit |
32 | * (by opening multiple fd.c temporary files). This is an essential feature |
33 | * for sorts and hashjoins on large amounts of data. |
34 | * |
35 | * BufFile supports temporary files that can be made read-only and shared with |
36 | * other backends, as infrastructure for parallel execution. Such files need |
37 | * to be created as a member of a SharedFileSet that all participants are |
38 | * attached to. |
39 | *------------------------------------------------------------------------- |
40 | */ |
41 | |
42 | #include "postgres.h" |
43 | |
44 | #include "commands/tablespace.h" |
45 | #include "executor/instrument.h" |
46 | #include "miscadmin.h" |
47 | #include "pgstat.h" |
48 | #include "storage/fd.h" |
49 | #include "storage/buffile.h" |
50 | #include "storage/buf_internals.h" |
51 | #include "utils/resowner.h" |
52 | |
/*
 * We break BufFiles into gigabyte-sized segments, regardless of RELSEG_SIZE.
 * The reason is that we'd like large BufFiles to be spread across multiple
 * tablespaces when available.
 *
 * MAX_PHYSICAL_FILESIZE is the maximum size in bytes of one segment file
 * (0x40000000 bytes, i.e. 1 GiB).  BUFFILE_SEG_SIZE is the same limit
 * expressed as a count of BLCKSZ-sized blocks.
 */
#define MAX_PHYSICAL_FILESIZE	0x40000000
#define BUFFILE_SEG_SIZE		(MAX_PHYSICAL_FILESIZE / BLCKSZ)
60 | |
/*
 * This data structure represents a buffered file that consists of one or
 * more physical files (each accessed through a virtual file descriptor
 * managed by fd.c).
 *
 * The struct embeds a single block-sized buffer; reads and writes of the
 * underlying segment files happen only when that buffer has to be refilled
 * or dumped.
 */
struct BufFile
{
	int			numFiles;		/* number of physical files in set */
	/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
	File	   *files;			/* palloc'd array with numFiles entries */

	bool		isInterXact;	/* keep open over transactions? */
	bool		dirty;			/* does buffer need to be written? */
	bool		readOnly;		/* has the file been set to read only? */

	/*
	 * fileset and name are both NULL for a private temp file (see
	 * makeBufFile); for a shared BufFile, name is a pstrdup'd copy of the
	 * caller-supplied name.
	 */
	SharedFileSet *fileset;		/* space for segment files if shared */
	const char *name;			/* name of this BufFile if shared */

	/*
	 * resowner is the ResourceOwner to use for underlying temp files.  (We
	 * don't need to remember the memory context we're using explicitly,
	 * because after creation we only repalloc our arrays larger.)
	 */
	ResourceOwner resowner;

	/*
	 * "current pos" is position of start of buffer within the logical file.
	 * Position as seen by user of BufFile is (curFile, curOffset + pos).
	 */
	int			curFile;		/* file index (0..n) part of current pos */
	off_t		curOffset;		/* offset part of current pos */
	int			pos;			/* next read/write position in buffer */
	int			nbytes;			/* total # of valid bytes in buffer */
	PGAlignedBlock buffer;		/* the I/O buffer itself */
};
96 | |
/* Forward declarations of the file-local helper routines defined below. */
static BufFile *makeBufFileCommon(int nfiles);
static BufFile *makeBufFile(File firstfile);
static void extendBufFile(BufFile *file);
static void BufFileLoadBuffer(BufFile *file);
static void BufFileDumpBuffer(BufFile *file);
static int	BufFileFlush(BufFile *file);
static File MakeNewSharedSegment(BufFile *file, int segment);
104 | |
105 | /* |
106 | * Create BufFile and perform the common initialization. |
107 | */ |
108 | static BufFile * |
109 | makeBufFileCommon(int nfiles) |
110 | { |
111 | BufFile *file = (BufFile *) palloc(sizeof(BufFile)); |
112 | |
113 | file->numFiles = nfiles; |
114 | file->isInterXact = false; |
115 | file->dirty = false; |
116 | file->resowner = CurrentResourceOwner; |
117 | file->curFile = 0; |
118 | file->curOffset = 0L; |
119 | file->pos = 0; |
120 | file->nbytes = 0; |
121 | |
122 | return file; |
123 | } |
124 | |
125 | /* |
126 | * Create a BufFile given the first underlying physical file. |
127 | * NOTE: caller must set isInterXact if appropriate. |
128 | */ |
129 | static BufFile * |
130 | makeBufFile(File firstfile) |
131 | { |
132 | BufFile *file = makeBufFileCommon(1); |
133 | |
134 | file->files = (File *) palloc(sizeof(File)); |
135 | file->files[0] = firstfile; |
136 | file->readOnly = false; |
137 | file->fileset = NULL; |
138 | file->name = NULL; |
139 | |
140 | return file; |
141 | } |
142 | |
143 | /* |
144 | * Add another component temp file. |
145 | */ |
146 | static void |
147 | extendBufFile(BufFile *file) |
148 | { |
149 | File pfile; |
150 | ResourceOwner oldowner; |
151 | |
152 | /* Be sure to associate the file with the BufFile's resource owner */ |
153 | oldowner = CurrentResourceOwner; |
154 | CurrentResourceOwner = file->resowner; |
155 | |
156 | if (file->fileset == NULL) |
157 | pfile = OpenTemporaryFile(file->isInterXact); |
158 | else |
159 | pfile = MakeNewSharedSegment(file, file->numFiles); |
160 | |
161 | Assert(pfile >= 0); |
162 | |
163 | CurrentResourceOwner = oldowner; |
164 | |
165 | file->files = (File *) repalloc(file->files, |
166 | (file->numFiles + 1) * sizeof(File)); |
167 | file->files[file->numFiles] = pfile; |
168 | file->numFiles++; |
169 | } |
170 | |
171 | /* |
172 | * Create a BufFile for a new temporary file (which will expand to become |
173 | * multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are |
174 | * written to it). |
175 | * |
176 | * If interXact is true, the temp file will not be automatically deleted |
177 | * at end of transaction. |
178 | * |
179 | * Note: if interXact is true, the caller had better be calling us in a |
180 | * memory context, and with a resource owner, that will survive across |
181 | * transaction boundaries. |
182 | */ |
183 | BufFile * |
184 | BufFileCreateTemp(bool interXact) |
185 | { |
186 | BufFile *file; |
187 | File pfile; |
188 | |
189 | /* |
190 | * Ensure that temp tablespaces are set up for OpenTemporaryFile to use. |
191 | * Possibly the caller will have done this already, but it seems useful to |
192 | * double-check here. Failure to do this at all would result in the temp |
193 | * files always getting placed in the default tablespace, which is a |
194 | * pretty hard-to-detect bug. Callers may prefer to do it earlier if they |
195 | * want to be sure that any required catalog access is done in some other |
196 | * resource context. |
197 | */ |
198 | PrepareTempTablespaces(); |
199 | |
200 | pfile = OpenTemporaryFile(interXact); |
201 | Assert(pfile >= 0); |
202 | |
203 | file = makeBufFile(pfile); |
204 | file->isInterXact = interXact; |
205 | |
206 | return file; |
207 | } |
208 | |
209 | /* |
210 | * Build the name for a given segment of a given BufFile. |
211 | */ |
212 | static void |
213 | SharedSegmentName(char *name, const char *buffile_name, int segment) |
214 | { |
215 | snprintf(name, MAXPGPATH, "%s.%d" , buffile_name, segment); |
216 | } |
217 | |
218 | /* |
219 | * Create a new segment file backing a shared BufFile. |
220 | */ |
221 | static File |
222 | MakeNewSharedSegment(BufFile *buffile, int segment) |
223 | { |
224 | char name[MAXPGPATH]; |
225 | File file; |
226 | |
227 | /* |
228 | * It is possible that there are files left over from before a crash |
229 | * restart with the same name. In order for BufFileOpenShared() not to |
230 | * get confused about how many segments there are, we'll unlink the next |
231 | * segment number if it already exists. |
232 | */ |
233 | SharedSegmentName(name, buffile->name, segment + 1); |
234 | SharedFileSetDelete(buffile->fileset, name, true); |
235 | |
236 | /* Create the new segment. */ |
237 | SharedSegmentName(name, buffile->name, segment); |
238 | file = SharedFileSetCreate(buffile->fileset, name); |
239 | |
240 | /* SharedFileSetCreate would've errored out */ |
241 | Assert(file > 0); |
242 | |
243 | return file; |
244 | } |
245 | |
246 | /* |
247 | * Create a BufFile that can be discovered and opened read-only by other |
248 | * backends that are attached to the same SharedFileSet using the same name. |
249 | * |
250 | * The naming scheme for shared BufFiles is left up to the calling code. The |
251 | * name will appear as part of one or more filenames on disk, and might |
252 | * provide clues to administrators about which subsystem is generating |
253 | * temporary file data. Since each SharedFileSet object is backed by one or |
254 | * more uniquely named temporary directory, names don't conflict with |
255 | * unrelated SharedFileSet objects. |
256 | */ |
257 | BufFile * |
258 | BufFileCreateShared(SharedFileSet *fileset, const char *name) |
259 | { |
260 | BufFile *file; |
261 | |
262 | file = makeBufFileCommon(1); |
263 | file->fileset = fileset; |
264 | file->name = pstrdup(name); |
265 | file->files = (File *) palloc(sizeof(File)); |
266 | file->files[0] = MakeNewSharedSegment(file, 0); |
267 | file->readOnly = false; |
268 | |
269 | return file; |
270 | } |
271 | |
272 | /* |
273 | * Open a file that was previously created in another backend (or this one) |
274 | * with BufFileCreateShared in the same SharedFileSet using the same name. |
275 | * The backend that created the file must have called BufFileClose() or |
276 | * BufFileExportShared() to make sure that it is ready to be opened by other |
277 | * backends and render it read-only. |
278 | */ |
279 | BufFile * |
280 | BufFileOpenShared(SharedFileSet *fileset, const char *name) |
281 | { |
282 | BufFile *file; |
283 | char segment_name[MAXPGPATH]; |
284 | Size capacity = 16; |
285 | File *files; |
286 | int nfiles = 0; |
287 | |
288 | files = palloc(sizeof(File) * capacity); |
289 | |
290 | /* |
291 | * We don't know how many segments there are, so we'll probe the |
292 | * filesystem to find out. |
293 | */ |
294 | for (;;) |
295 | { |
296 | /* See if we need to expand our file segment array. */ |
297 | if (nfiles + 1 > capacity) |
298 | { |
299 | capacity *= 2; |
300 | files = repalloc(files, sizeof(File) * capacity); |
301 | } |
302 | /* Try to load a segment. */ |
303 | SharedSegmentName(segment_name, name, nfiles); |
304 | files[nfiles] = SharedFileSetOpen(fileset, segment_name); |
305 | if (files[nfiles] <= 0) |
306 | break; |
307 | ++nfiles; |
308 | |
309 | CHECK_FOR_INTERRUPTS(); |
310 | } |
311 | |
312 | /* |
313 | * If we didn't find any files at all, then no BufFile exists with this |
314 | * name. |
315 | */ |
316 | if (nfiles == 0) |
317 | ereport(ERROR, |
318 | (errcode_for_file_access(), |
319 | errmsg("could not open temporary file \"%s\" from BufFile \"%s\": %m" , |
320 | segment_name, name))); |
321 | |
322 | file = makeBufFileCommon(nfiles); |
323 | file->files = files; |
324 | file->readOnly = true; /* Can't write to files opened this way */ |
325 | file->fileset = fileset; |
326 | file->name = pstrdup(name); |
327 | |
328 | return file; |
329 | } |
330 | |
331 | /* |
332 | * Delete a BufFile that was created by BufFileCreateShared in the given |
333 | * SharedFileSet using the given name. |
334 | * |
335 | * It is not necessary to delete files explicitly with this function. It is |
336 | * provided only as a way to delete files proactively, rather than waiting for |
337 | * the SharedFileSet to be cleaned up. |
338 | * |
339 | * Only one backend should attempt to delete a given name, and should know |
340 | * that it exists and has been exported or closed. |
341 | */ |
342 | void |
343 | BufFileDeleteShared(SharedFileSet *fileset, const char *name) |
344 | { |
345 | char segment_name[MAXPGPATH]; |
346 | int segment = 0; |
347 | bool found = false; |
348 | |
349 | /* |
350 | * We don't know how many segments the file has. We'll keep deleting |
351 | * until we run out. If we don't manage to find even an initial segment, |
352 | * raise an error. |
353 | */ |
354 | for (;;) |
355 | { |
356 | SharedSegmentName(segment_name, name, segment); |
357 | if (!SharedFileSetDelete(fileset, segment_name, true)) |
358 | break; |
359 | found = true; |
360 | ++segment; |
361 | |
362 | CHECK_FOR_INTERRUPTS(); |
363 | } |
364 | |
365 | if (!found) |
366 | elog(ERROR, "could not delete unknown shared BufFile \"%s\"" , name); |
367 | } |
368 | |
369 | /* |
370 | * BufFileExportShared --- flush and make read-only, in preparation for sharing. |
371 | */ |
372 | void |
373 | BufFileExportShared(BufFile *file) |
374 | { |
375 | /* Must be a file belonging to a SharedFileSet. */ |
376 | Assert(file->fileset != NULL); |
377 | |
378 | /* It's probably a bug if someone calls this twice. */ |
379 | Assert(!file->readOnly); |
380 | |
381 | BufFileFlush(file); |
382 | file->readOnly = true; |
383 | } |
384 | |
385 | /* |
386 | * Close a BufFile |
387 | * |
388 | * Like fclose(), this also implicitly FileCloses the underlying File. |
389 | */ |
390 | void |
391 | BufFileClose(BufFile *file) |
392 | { |
393 | int i; |
394 | |
395 | /* flush any unwritten data */ |
396 | BufFileFlush(file); |
397 | /* close and delete the underlying file(s) */ |
398 | for (i = 0; i < file->numFiles; i++) |
399 | FileClose(file->files[i]); |
400 | /* release the buffer space */ |
401 | pfree(file->files); |
402 | pfree(file); |
403 | } |
404 | |
405 | /* |
406 | * BufFileLoadBuffer |
407 | * |
408 | * Load some data into buffer, if possible, starting from curOffset. |
409 | * At call, must have dirty = false, pos and nbytes = 0. |
410 | * On exit, nbytes is number of bytes loaded. |
411 | */ |
412 | static void |
413 | BufFileLoadBuffer(BufFile *file) |
414 | { |
415 | File thisfile; |
416 | |
417 | /* |
418 | * Advance to next component file if necessary and possible. |
419 | */ |
420 | if (file->curOffset >= MAX_PHYSICAL_FILESIZE && |
421 | file->curFile + 1 < file->numFiles) |
422 | { |
423 | file->curFile++; |
424 | file->curOffset = 0L; |
425 | } |
426 | |
427 | /* |
428 | * Read whatever we can get, up to a full bufferload. |
429 | */ |
430 | thisfile = file->files[file->curFile]; |
431 | file->nbytes = FileRead(thisfile, |
432 | file->buffer.data, |
433 | sizeof(file->buffer), |
434 | file->curOffset, |
435 | WAIT_EVENT_BUFFILE_READ); |
436 | if (file->nbytes < 0) |
437 | file->nbytes = 0; |
438 | /* we choose not to advance curOffset here */ |
439 | |
440 | if (file->nbytes > 0) |
441 | pgBufferUsage.temp_blks_read++; |
442 | } |
443 | |
444 | /* |
445 | * BufFileDumpBuffer |
446 | * |
447 | * Dump buffer contents starting at curOffset. |
448 | * At call, should have dirty = true, nbytes > 0. |
449 | * On exit, dirty is cleared if successful write, and curOffset is advanced. |
450 | */ |
451 | static void |
452 | BufFileDumpBuffer(BufFile *file) |
453 | { |
454 | int wpos = 0; |
455 | int bytestowrite; |
456 | File thisfile; |
457 | |
458 | /* |
459 | * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it |
460 | * crosses a component-file boundary; so we need a loop. |
461 | */ |
462 | while (wpos < file->nbytes) |
463 | { |
464 | off_t availbytes; |
465 | |
466 | /* |
467 | * Advance to next component file if necessary and possible. |
468 | */ |
469 | if (file->curOffset >= MAX_PHYSICAL_FILESIZE) |
470 | { |
471 | while (file->curFile + 1 >= file->numFiles) |
472 | extendBufFile(file); |
473 | file->curFile++; |
474 | file->curOffset = 0L; |
475 | } |
476 | |
477 | /* |
478 | * Determine how much we need to write into this file. |
479 | */ |
480 | bytestowrite = file->nbytes - wpos; |
481 | availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset; |
482 | |
483 | if ((off_t) bytestowrite > availbytes) |
484 | bytestowrite = (int) availbytes; |
485 | |
486 | thisfile = file->files[file->curFile]; |
487 | bytestowrite = FileWrite(thisfile, |
488 | file->buffer.data + wpos, |
489 | bytestowrite, |
490 | file->curOffset, |
491 | WAIT_EVENT_BUFFILE_WRITE); |
492 | if (bytestowrite <= 0) |
493 | return; /* failed to write */ |
494 | file->curOffset += bytestowrite; |
495 | wpos += bytestowrite; |
496 | |
497 | pgBufferUsage.temp_blks_written++; |
498 | } |
499 | file->dirty = false; |
500 | |
501 | /* |
502 | * At this point, curOffset has been advanced to the end of the buffer, |
503 | * ie, its original value + nbytes. We need to make it point to the |
504 | * logical file position, ie, original value + pos, in case that is less |
505 | * (as could happen due to a small backwards seek in a dirty buffer!) |
506 | */ |
507 | file->curOffset -= (file->nbytes - file->pos); |
508 | if (file->curOffset < 0) /* handle possible segment crossing */ |
509 | { |
510 | file->curFile--; |
511 | Assert(file->curFile >= 0); |
512 | file->curOffset += MAX_PHYSICAL_FILESIZE; |
513 | } |
514 | |
515 | /* |
516 | * Now we can set the buffer empty without changing the logical position |
517 | */ |
518 | file->pos = 0; |
519 | file->nbytes = 0; |
520 | } |
521 | |
522 | /* |
523 | * BufFileRead |
524 | * |
525 | * Like fread() except we assume 1-byte element size. |
526 | */ |
527 | size_t |
528 | BufFileRead(BufFile *file, void *ptr, size_t size) |
529 | { |
530 | size_t nread = 0; |
531 | size_t nthistime; |
532 | |
533 | if (file->dirty) |
534 | { |
535 | if (BufFileFlush(file) != 0) |
536 | return 0; /* could not flush... */ |
537 | Assert(!file->dirty); |
538 | } |
539 | |
540 | while (size > 0) |
541 | { |
542 | if (file->pos >= file->nbytes) |
543 | { |
544 | /* Try to load more data into buffer. */ |
545 | file->curOffset += file->pos; |
546 | file->pos = 0; |
547 | file->nbytes = 0; |
548 | BufFileLoadBuffer(file); |
549 | if (file->nbytes <= 0) |
550 | break; /* no more data available */ |
551 | } |
552 | |
553 | nthistime = file->nbytes - file->pos; |
554 | if (nthistime > size) |
555 | nthistime = size; |
556 | Assert(nthistime > 0); |
557 | |
558 | memcpy(ptr, file->buffer.data + file->pos, nthistime); |
559 | |
560 | file->pos += nthistime; |
561 | ptr = (void *) ((char *) ptr + nthistime); |
562 | size -= nthistime; |
563 | nread += nthistime; |
564 | } |
565 | |
566 | return nread; |
567 | } |
568 | |
569 | /* |
570 | * BufFileWrite |
571 | * |
572 | * Like fwrite() except we assume 1-byte element size. |
573 | */ |
574 | size_t |
575 | BufFileWrite(BufFile *file, void *ptr, size_t size) |
576 | { |
577 | size_t nwritten = 0; |
578 | size_t nthistime; |
579 | |
580 | Assert(!file->readOnly); |
581 | |
582 | while (size > 0) |
583 | { |
584 | if (file->pos >= BLCKSZ) |
585 | { |
586 | /* Buffer full, dump it out */ |
587 | if (file->dirty) |
588 | { |
589 | BufFileDumpBuffer(file); |
590 | if (file->dirty) |
591 | break; /* I/O error */ |
592 | } |
593 | else |
594 | { |
595 | /* Hmm, went directly from reading to writing? */ |
596 | file->curOffset += file->pos; |
597 | file->pos = 0; |
598 | file->nbytes = 0; |
599 | } |
600 | } |
601 | |
602 | nthistime = BLCKSZ - file->pos; |
603 | if (nthistime > size) |
604 | nthistime = size; |
605 | Assert(nthistime > 0); |
606 | |
607 | memcpy(file->buffer.data + file->pos, ptr, nthistime); |
608 | |
609 | file->dirty = true; |
610 | file->pos += nthistime; |
611 | if (file->nbytes < file->pos) |
612 | file->nbytes = file->pos; |
613 | ptr = (void *) ((char *) ptr + nthistime); |
614 | size -= nthistime; |
615 | nwritten += nthistime; |
616 | } |
617 | |
618 | return nwritten; |
619 | } |
620 | |
621 | /* |
622 | * BufFileFlush |
623 | * |
624 | * Like fflush() |
625 | */ |
626 | static int |
627 | BufFileFlush(BufFile *file) |
628 | { |
629 | if (file->dirty) |
630 | { |
631 | BufFileDumpBuffer(file); |
632 | if (file->dirty) |
633 | return EOF; |
634 | } |
635 | |
636 | return 0; |
637 | } |
638 | |
/*
 * BufFileSeek
 *
 * Like fseek(), except that target position needs two values in order to
 * work when logical filesize exceeds maximum value representable by off_t.
 * We do not support relative seeks across more than that, however.
 *
 * whence may be SEEK_SET (absolute position given by fileno and offset) or
 * SEEK_CUR (offset relative to the current logical position; fileno is
 * ignored).  Any other value raises an ERROR.
 *
 * Result is 0 if OK, EOF if not.  Logical position is not moved if an
 * impossible seek is attempted.
 */
int
BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
{
	int			newFile;
	off_t		newOffset;

	/* Step 1: compute the requested target as a (file, offset) pair. */
	switch (whence)
	{
		case SEEK_SET:
			if (fileno < 0)
				return EOF;
			newFile = fileno;
			newOffset = offset;
			break;
		case SEEK_CUR:

			/*
			 * Relative seek considers only the signed offset, ignoring
			 * fileno. Note that large offsets (> 1 gig) risk overflow in this
			 * add, unless we have 64-bit off_t.
			 */
			newFile = file->curFile;
			newOffset = (file->curOffset + file->pos) + offset;
			break;
#ifdef NOT_USED
		case SEEK_END:
			/* could be implemented, not needed currently */
			break;
#endif
		default:
			elog(ERROR, "invalid whence: %d", whence);
			return EOF;
	}

	/*
	 * Step 2: normalize a negative offset by borrowing whole segments from
	 * the file number; running out of segments means the target lies before
	 * the start of the file.
	 */
	while (newOffset < 0)
	{
		if (--newFile < 0)
			return EOF;
		newOffset += MAX_PHYSICAL_FILESIZE;
	}

	/* Step 3: fast path when the target falls inside the current buffer. */
	if (newFile == file->curFile &&
		newOffset >= file->curOffset &&
		newOffset <= file->curOffset + file->nbytes)
	{
		/*
		 * Seek is to a point within existing buffer; we can just adjust
		 * pos-within-buffer, without flushing buffer.  Note this is OK
		 * whether reading or writing, but buffer remains dirty if we were
		 * writing.
		 */
		file->pos = (int) (newOffset - file->curOffset);
		return 0;
	}
	/* Otherwise, must reposition buffer, so flush any dirty data */
	if (BufFileFlush(file) != 0)
		return EOF;

	/*
	 * At this point and no sooner, check for seek past last segment.  The
	 * above flush could have created a new segment, so checking sooner would
	 * not work (at least not with this code).
	 */

	/* convert seek to "start of next seg" to "end of last seg" */
	if (newFile == file->numFiles && newOffset == 0)
	{
		newFile--;
		newOffset = MAX_PHYSICAL_FILESIZE;
	}

	/*
	 * Normalize an offset larger than one segment by carrying whole segments
	 * into the file number; fail if that runs past the last existing segment.
	 */
	while (newOffset > MAX_PHYSICAL_FILESIZE)
	{
		if (++newFile >= file->numFiles)
			return EOF;
		newOffset -= MAX_PHYSICAL_FILESIZE;
	}
	if (newFile >= file->numFiles)
		return EOF;
	/* Seek is OK! */
	/* Install the new position and discard the (already flushed) buffer. */
	file->curFile = newFile;
	file->curOffset = newOffset;
	file->pos = 0;
	file->nbytes = 0;
	return 0;
}
732 | |
733 | void |
734 | BufFileTell(BufFile *file, int *fileno, off_t *offset) |
735 | { |
736 | *fileno = file->curFile; |
737 | *offset = file->curOffset + file->pos; |
738 | } |
739 | |
740 | /* |
741 | * BufFileSeekBlock --- block-oriented seek |
742 | * |
743 | * Performs absolute seek to the start of the n'th BLCKSZ-sized block of |
744 | * the file. Note that users of this interface will fail if their files |
745 | * exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work |
746 | * with tables bigger than that, either... |
747 | * |
748 | * Result is 0 if OK, EOF if not. Logical position is not moved if an |
749 | * impossible seek is attempted. |
750 | */ |
751 | int |
752 | BufFileSeekBlock(BufFile *file, long blknum) |
753 | { |
754 | return BufFileSeek(file, |
755 | (int) (blknum / BUFFILE_SEG_SIZE), |
756 | (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ, |
757 | SEEK_SET); |
758 | } |
759 | |
#ifdef NOT_USED
/*
 * BufFileTellBlock --- block-oriented tell
 *
 * Returns the number of the BLCKSZ-sized block containing the current
 * logical position.  Any fractional part of a block in the current seek
 * position is ignored.
 */
long
BufFileTellBlock(BufFile *file)
{
	long		blknum;

	blknum = (file->curOffset + file->pos) / BLCKSZ;

	/*
	 * Widen before multiplying: both operands are int, so the product would
	 * otherwise be computed in int and could overflow for BufFiles with very
	 * many segments before being added to the long result.
	 */
	blknum += (long) file->curFile * BUFFILE_SEG_SIZE;
	return blknum;
}

#endif
777 | |
778 | /* |
779 | * Return the current shared BufFile size. |
780 | * |
781 | * Counts any holes left behind by BufFileAppend as part of the size. |
782 | * ereport()s on failure. |
783 | */ |
784 | int64 |
785 | BufFileSize(BufFile *file) |
786 | { |
787 | int64 lastFileSize; |
788 | |
789 | Assert(file->fileset != NULL); |
790 | |
791 | /* Get the size of the last physical file. */ |
792 | lastFileSize = FileSize(file->files[file->numFiles - 1]); |
793 | if (lastFileSize < 0) |
794 | ereport(ERROR, |
795 | (errcode_for_file_access(), |
796 | errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m" , |
797 | FilePathName(file->files[file->numFiles - 1]), |
798 | file->name))); |
799 | |
800 | return ((file->numFiles - 1) * (int64) MAX_PHYSICAL_FILESIZE) + |
801 | lastFileSize; |
802 | } |
803 | |
804 | /* |
805 | * Append the contents of source file (managed within shared fileset) to |
806 | * end of target file (managed within same shared fileset). |
807 | * |
808 | * Note that operation subsumes ownership of underlying resources from |
809 | * "source". Caller should never call BufFileClose against source having |
810 | * called here first. Resource owners for source and target must match, |
811 | * too. |
812 | * |
813 | * This operation works by manipulating lists of segment files, so the |
814 | * file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned |
815 | * boundary, typically creating empty holes before the boundary. These |
816 | * areas do not contain any interesting data, and cannot be read from by |
817 | * caller. |
818 | * |
819 | * Returns the block number within target where the contents of source |
820 | * begins. Caller should apply this as an offset when working off block |
821 | * positions that are in terms of the original BufFile space. |
822 | */ |
823 | long |
824 | BufFileAppend(BufFile *target, BufFile *source) |
825 | { |
826 | long startBlock = target->numFiles * BUFFILE_SEG_SIZE; |
827 | int newNumFiles = target->numFiles + source->numFiles; |
828 | int i; |
829 | |
830 | Assert(target->fileset != NULL); |
831 | Assert(source->readOnly); |
832 | Assert(!source->dirty); |
833 | Assert(source->fileset != NULL); |
834 | |
835 | if (target->resowner != source->resowner) |
836 | elog(ERROR, "could not append BufFile with non-matching resource owner" ); |
837 | |
838 | target->files = (File *) |
839 | repalloc(target->files, sizeof(File) * newNumFiles); |
840 | for (i = target->numFiles; i < newNumFiles; i++) |
841 | target->files[i] = source->files[i - target->numFiles]; |
842 | target->numFiles = newNumFiles; |
843 | |
844 | return startBlock; |
845 | } |
846 | |