/*-------------------------------------------------------------------------
 *
 * tuplesort.h
 *	  Generalized tuple sorting routines.
 *
 * This module handles sorting of heap tuples, index tuples, or single
 * Datums (and could easily support other kinds of sortable objects,
 * if necessary).  It works efficiently for both small and large amounts
 * of data.  Small amounts are sorted in-memory using qsort().  Large
 * amounts are sorted using temporary files and a standard external sort
 * algorithm.  Parallel sorts use a variant of this external sort
 * algorithm, and are typically only used for large amounts of data.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/utils/tuplesort.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef TUPLESORT_H
#define TUPLESORT_H

#include "access/itup.h"
#include "executor/tuptable.h"
#include "fmgr.h"
#include "storage/dsm.h"
#include "utils/relcache.h"


/*
 * Tuplesortstate and Sharedsort are opaque types whose details are not
 * known outside tuplesort.c.
 */
typedef struct Tuplesortstate Tuplesortstate;
typedef struct Sharedsort Sharedsort;

/*
 * Tuplesort parallel coordination state, allocated by each participant in
 * local memory.  Participant caller initializes everything.  See usage notes
 * below.
 */
typedef struct SortCoordinateData
{
	/* Worker process?  If not, must be leader. */
	bool		isWorker;

	/*
	 * Leader-process-passed number of participants known launched (workers
	 * set this to -1).  Includes state within leader needed for it to
	 * participate as a worker, if any.
	 */
	int			nParticipants;

	/* Private opaque state (points to shared memory) */
	Sharedsort *sharedsort;
} SortCoordinateData;

typedef struct SortCoordinateData *SortCoordinate;
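
/*
 * Illustrative sketch (not part of the API): one way a leader process might
 * fill in SortCoordinateData before calling a tuplesort_begin* routine.  The
 * variables "nworkers_launched" and "shared" are hypothetical stand-ins for
 * the caller's launched-worker count and its Sharedsort pointer in dynamic
 * shared memory.
 *
 *		SortCoordinate coordinate = palloc0(sizeof(SortCoordinateData));
 *
 *		coordinate->isWorker = false;
 *		coordinate->nParticipants = nworkers_launched;
 *		coordinate->sharedsort = shared;
 *
 * A worker fills in the same struct, but with isWorker = true and
 * nParticipants = -1.
 */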

/*
 * Data structures for reporting sort statistics.  Note that
 * TuplesortInstrumentation can't contain any pointers because we
 * sometimes put it in shared memory.
 */
typedef enum
{
	SORT_TYPE_STILL_IN_PROGRESS = 0,
	SORT_TYPE_TOP_N_HEAPSORT,
	SORT_TYPE_QUICKSORT,
	SORT_TYPE_EXTERNAL_SORT,
	SORT_TYPE_EXTERNAL_MERGE
} TuplesortMethod;

typedef enum
{
	SORT_SPACE_TYPE_DISK,
	SORT_SPACE_TYPE_MEMORY
} TuplesortSpaceType;

typedef struct TuplesortInstrumentation
{
	TuplesortMethod sortMethod; /* sort algorithm used */
	TuplesortSpaceType spaceType;	/* type of space spaceUsed represents */
	long		spaceUsed;		/* space consumption, in kB */
} TuplesortInstrumentation;
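
/*
 * Illustrative sketch: reporting sort statistics once a sort has been
 * performed.  "state" stands for a Tuplesortstate on which
 * tuplesort_performsort() has already been called; elog() is just a
 * placeholder for whatever the caller does with the numbers.
 *
 *		TuplesortInstrumentation stats;
 *
 *		tuplesort_get_stats(state, &stats);
 *		elog(DEBUG1, "sort method: %s, space used: %ld kB (%s)",
 *			 tuplesort_method_name(stats.sortMethod),
 *			 stats.spaceUsed,
 *			 tuplesort_space_type_name(stats.spaceType));
 *
 * spaceUsed is always reported in kilobytes; spaceType says whether that
 * figure refers to memory or to temporary disk files.
 */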


/*
 * We provide multiple interfaces to what is essentially the same code,
 * since different callers have different data to be sorted and want to
 * specify the sort key information differently.  There are two APIs for
 * sorting HeapTuples and two more for sorting IndexTuples.  Yet another
 * API supports sorting bare Datums.
 *
 * Serial sort callers should pass NULL for their coordinate argument.
 *
 * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't
 * preserve the system columns (tuple identity and transaction visibility
 * info).  The sort keys are specified by column numbers within the tuples
 * and sort operator OIDs.  We save some cycles by passing and returning the
 * tuples in TupleTableSlots, rather than forming actual HeapTuples (which'd
 * have to be converted to MinimalTuples).  This API works well for sorts
 * executed as parts of plan trees.
 *
 * The "cluster" API stores/sorts full HeapTuples including all visibility
 * info.  The sort keys are specified by reference to a btree index that is
 * defined on the relation to be sorted.  Note that putheaptuple/getheaptuple
 * go with this API, not the "begin_heap" one!
 *
 * The "index_btree" API stores/sorts IndexTuples (preserving all their
 * header fields).  The sort keys are specified by a btree index definition.
 *
 * The "index_hash" API is similar to index_btree, but the tuples are
 * actually sorted by their hash codes not the raw data.
 *
 * Parallel sort callers are required to coordinate multiple tuplesort states
 * in a leader process and one or more worker processes.  The leader process
 * must launch workers, and have each perform an independent "partial"
 * tuplesort, typically fed by the parallel heap interface.  The leader later
 * produces the final output (internally, it merges runs output by workers).
 *
 * Callers must do the following to perform a sort in parallel using multiple
 * worker processes (an illustrative sketch of these steps appears after the
 * shared-memory routine declarations below):
 *
 * 1. Request tuplesort-private shared memory for n workers.  Use
 *    tuplesort_estimate_shared() to get the required size.
 * 2. Have leader process initialize allocated shared memory using
 *    tuplesort_initialize_shared().  Launch workers.
 * 3. Initialize a coordinate argument within both the leader process and
 *    each worker process.  This has a pointer to the shared
 *    tuplesort-private structure, as well as some caller-initialized fields.
 *    Leader's coordinate argument reliably indicates number of workers
 *    launched (this is unused by workers).
 * 4. Begin a tuplesort using some appropriate tuplesort_begin* routine
 *    (passing the coordinate argument) within each worker.  The workMem
 *    arguments need not be identical.  All other arguments should match
 *    exactly, though.
 * 5. tuplesort_attach_shared() should be called by all workers.  Feed tuples
 *    to each worker, and call tuplesort_performsort() within each when input
 *    is exhausted.
 * 6. Call tuplesort_end() in each worker process.  Worker processes can shut
 *    down once tuplesort_end() returns.
 * 7. Begin a tuplesort in the leader using the same tuplesort_begin* routine,
 *    passing a leader-appropriate coordinate argument (this can happen as
 *    early as during step 3, actually, since we only need to know the
 *    number of workers successfully launched).  The leader must now wait
 *    for workers to finish.  Caller must use own mechanism for ensuring that
 *    next step isn't reached until all workers have called and returned from
 *    tuplesort_performsort().  (Note that it's okay if workers have already
 *    also called tuplesort_end() by then.)
 * 8. Call tuplesort_performsort() in leader.  Consume output using the
 *    appropriate tuplesort_get* routine.  Leader can skip this step if
 *    tuplesort turns out to be unnecessary.
 * 9. Call tuplesort_end() in leader.
 *
 * This division of labor assumes nothing about how input tuples are produced,
 * but does require that caller combine the state of multiple tuplesorts for
 * any purpose other than producing the final output.  For example, callers
 * must consider that tuplesort_get_stats() reports on only one worker's role
 * in a sort (or the leader's role), and not statistics for the sort as a
 * whole.
 *
 * Note that callers may use the leader process to sort runs as if it were an
 * independent worker process (prior to the process performing a leader sort
 * to produce the final sorted output).  Doing so only requires a second
 * "partial" tuplesort within the leader process, initialized like that of a
 * worker process.  The steps above don't touch on this directly.  The only
 * difference is that the tuplesort_attach_shared() call is never needed within
 * the leader process, because the backend as a whole holds the shared fileset
 * reference.  A worker Tuplesortstate in the leader is expected to do exactly
 * the same amount of total initial processing work as a worker process
 * Tuplesortstate, since the leader process has nothing else to do before
 * workers finish.
 *
 * Note that only a very small amount of memory will be allocated prior to
 * the leader state first consuming input, and that workers will free the
 * vast majority of their memory upon returning from tuplesort_performsort().
 * Callers can rely on this to arrange for memory to be used in a way that
 * respects a workMem-style budget across an entire parallel sort operation.
 *
 * Callers are responsible for parallel safety in general.  However, they
 * can at least rely on there being no parallel safety hazards within
 * tuplesort, because tuplesort thinks of the sort as several independent
 * sorts whose results are combined.  Since, in general, the behavior of
 * sort operators is immutable, caller need only worry about the parallel
 * safety of whatever the process is through which input tuples are
 * generated (typically, caller uses a parallel heap scan).
 */
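
/*
 * Illustrative sketch (not a complete function): a serial sort over the
 * "heap" API, roughly as a sort node in a plan tree would use it.  The
 * input- and output-handling loops are elided, and tupDesc, nkeys, attNums,
 * sortOperators, sortCollations, nullsFirstFlags, slot, and work_mem stand
 * for whatever the caller has on hand.  The NULL coordinate means a serial
 * sort; randomAccess is false because the output is read just once, front
 * to back.
 *
 *		Tuplesortstate *state;
 *
 *		state = tuplesort_begin_heap(tupDesc, nkeys, attNums,
 *									 sortOperators, sortCollations,
 *									 nullsFirstFlags, work_mem,
 *									 NULL, false);
 *
 *		while (... next input tuple placed in slot ...)
 *			tuplesort_puttupleslot(state, slot);
 *
 *		tuplesort_performsort(state);
 *
 *		while (tuplesort_gettupleslot(state, true, false, slot, NULL))
 *			... process the sorted tuple now in slot ...
 *
 *		tuplesort_end(state);
 *
 * Passing copy = false to tuplesort_gettupleslot() avoids copying each
 * returned tuple, but the tuple is then only guaranteed valid until the
 * sort's state is next manipulated; pass true if the caller needs to keep
 * it around longer.
 */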

extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
											int nkeys, AttrNumber *attNums,
											Oid *sortOperators, Oid *sortCollations,
											bool *nullsFirstFlags,
											int workMem, SortCoordinate coordinate,
											bool randomAccess);
extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
											   Relation indexRel, int workMem,
											   SortCoordinate coordinate, bool randomAccess);
extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel,
												   Relation indexRel,
												   bool enforceUnique,
												   int workMem, SortCoordinate coordinate,
												   bool randomAccess);
extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel,
												  Relation indexRel,
												  uint32 high_mask,
												  uint32 low_mask,
												  uint32 max_buckets,
												  int workMem, SortCoordinate coordinate,
												  bool randomAccess);
extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
											 Oid sortOperator, Oid sortCollation,
											 bool nullsFirstFlag,
											 int workMem, SortCoordinate coordinate,
											 bool randomAccess);

extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);

extern void tuplesort_puttupleslot(Tuplesortstate *state,
								   TupleTableSlot *slot);
extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup);
extern void tuplesort_putindextuplevalues(Tuplesortstate *state,
										  Relation rel, ItemPointer self,
										  Datum *values, bool *isnull);
extern void tuplesort_putdatum(Tuplesortstate *state, Datum val,
							   bool isNull);

extern void tuplesort_performsort(Tuplesortstate *state);

extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
								   bool copy, TupleTableSlot *slot, Datum *abbrev);
extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward);
extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward);
extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
							   Datum *val, bool *isNull, Datum *abbrev);

extern bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples,
								 bool forward);

extern void tuplesort_end(Tuplesortstate *state);

extern void tuplesort_get_stats(Tuplesortstate *state,
								TuplesortInstrumentation *stats);
extern const char *tuplesort_method_name(TuplesortMethod m);
extern const char *tuplesort_space_type_name(TuplesortSpaceType t);

extern int	tuplesort_merge_order(int64 allowedMem);

extern Size tuplesort_estimate_shared(int nworkers);
extern void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers,
										dsm_segment *seg);
extern void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg);
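
/*
 * Illustrative sketch of the parallel-sort steps listed near the top of this
 * file.  Everything outside the tuplesort_* calls (DSM setup, worker launch,
 * and waiting for workers) is the caller's responsibility and is only hinted
 * at here; "seg", "nworkers", "nworkers_launched", "shared", and "work_mem"
 * are hypothetical caller-side variables.
 *
 * Leader, before launching workers (steps 1 and 2):
 *
 *		Size		size = tuplesort_estimate_shared(nworkers);
 *
 *		... allocate "size" bytes in the DSM segment "seg" (e.g. with
 *		shm_toc_allocate()) and let "shared" point at that space ...
 *
 *		tuplesort_initialize_shared(shared, nworkers, seg);
 *
 *		... launch workers ...
 *
 * Each worker (steps 3 through 6):
 *
 *		SortCoordinateData coordinate;
 *		Tuplesortstate *state;
 *
 *		coordinate.isWorker = true;
 *		coordinate.nParticipants = -1;
 *		coordinate.sharedsort = shared;
 *
 *		state = tuplesort_begin_heap(..., work_mem, &coordinate, false);
 *		tuplesort_attach_shared(shared, seg);
 *		... feed this worker's share of the input via tuplesort_puttupleslot() ...
 *		tuplesort_performsort(state);
 *		tuplesort_end(state);
 *
 * Leader (steps 7 through 9):
 *
 *		SortCoordinateData coordinate;
 *		Tuplesortstate *state;
 *
 *		coordinate.isWorker = false;
 *		coordinate.nParticipants = nworkers_launched;
 *		coordinate.sharedsort = shared;
 *
 *		state = tuplesort_begin_heap(..., work_mem, &coordinate, false);
 *		... wait until every worker has returned from tuplesort_performsort(),
 *		using the caller's own synchronization mechanism ...
 *		tuplesort_performsort(state);
 *		... consume the merged output via tuplesort_gettupleslot() ...
 *		tuplesort_end(state);
 *
 * The "..." in the tuplesort_begin_heap() calls stands for the sort-key
 * arguments, which must match across all participants (only workMem may
 * differ).
 */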

/*
 * These routines may only be called if randomAccess was specified 'true'.
 * Likewise, backwards scan in gettuple/getdatum is only allowed if
 * randomAccess was specified.  Note that parallel sorts do not support
 * randomAccess.
 */

extern void tuplesort_rescan(Tuplesortstate *state);
extern void tuplesort_markpos(Tuplesortstate *state);
extern void tuplesort_restorepos(Tuplesortstate *state);
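
/*
 * Illustrative sketch: random access over a completed sort.  This requires
 * that the sort was begun with randomAccess = true (and so cannot be a
 * parallel sort); "state" and "slot" are placeholders for the caller's
 * Tuplesortstate and TupleTableSlot.
 *
 *		tuplesort_performsort(state);
 *
 *		tuplesort_markpos(state);
 *		... read some tuples with tuplesort_gettupleslot(state, ...) ...
 *		tuplesort_restorepos(state);
 *		... re-read the same tuples ...
 *
 *		tuplesort_rescan(state);
 *		... start reading again from the very beginning ...
 */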

#endif							/* TUPLESORT_H */