1/*-------------------------------------------------------------------------
2 *
3 * tuplesort.h
4 * Generalized tuple sorting routines.
5 *
6 * This module handles sorting of heap tuples, index tuples, or single
7 * Datums (and could easily support other kinds of sortable objects,
8 * if necessary). It works efficiently for both small and large amounts
9 * of data. Small amounts are sorted in-memory using qsort(). Large
10 * amounts are sorted using temporary files and a standard external sort
11 * algorithm. Parallel sorts use a variant of this external sort
12 * algorithm, and are typically only used for large amounts of data.
13 *
14 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
15 * Portions Copyright (c) 1994, Regents of the University of California
16 *
17 * src/include/utils/tuplesort.h
18 *
19 *-------------------------------------------------------------------------
20 */
21#ifndef TUPLESORT_H
22#define TUPLESORT_H
23
24#include "access/itup.h"
25#include "executor/tuptable.h"
26#include "fmgr.h"
27#include "storage/dsm.h"
28#include "utils/relcache.h"
29
30
31/*
32 * Tuplesortstate and Sharedsort are opaque types whose details are not
33 * known outside tuplesort.c.
34 */
35typedef struct Tuplesortstate Tuplesortstate;
36typedef struct Sharedsort Sharedsort;
37
38/*
39 * Tuplesort parallel coordination state, allocated by each participant in
40 * local memory. Participant caller initializes everything. See usage notes
41 * below.
42 */
43typedef struct SortCoordinateData
44{
45 /* Worker process? If not, must be leader. */
46 bool isWorker;
47
48 /*
49 * Leader-process-passed number of participants known launched (workers
50 * set this to -1). Includes state within leader needed for it to
51 * participate as a worker, if any.
52 */
53 int nParticipants;
54
55 /* Private opaque state (points to shared memory) */
56 Sharedsort *sharedsort;
57} SortCoordinateData;
58
59typedef struct SortCoordinateData *SortCoordinate;
60
61/*
62 * Data structures for reporting sort statistics. Note that
63 * TuplesortInstrumentation can't contain any pointers because we
64 * sometimes put it in shared memory.
65 */
66typedef enum
67{
68 SORT_TYPE_STILL_IN_PROGRESS = 0,
69 SORT_TYPE_TOP_N_HEAPSORT,
70 SORT_TYPE_QUICKSORT,
71 SORT_TYPE_EXTERNAL_SORT,
72 SORT_TYPE_EXTERNAL_MERGE
73} TuplesortMethod;
74
75typedef enum
76{
77 SORT_SPACE_TYPE_DISK,
78 SORT_SPACE_TYPE_MEMORY
79} TuplesortSpaceType;
80
81typedef struct TuplesortInstrumentation
82{
83 TuplesortMethod sortMethod; /* sort algorithm used */
84 TuplesortSpaceType spaceType; /* type of space spaceUsed represents */
85 long spaceUsed; /* space consumption, in kB */
86} TuplesortInstrumentation;
87
88
89/*
90 * We provide multiple interfaces to what is essentially the same code,
91 * since different callers have different data to be sorted and want to
92 * specify the sort key information differently. There are two APIs for
93 * sorting HeapTuples and two more for sorting IndexTuples. Yet another
94 * API supports sorting bare Datums.
95 *
96 * Serial sort callers should pass NULL for their coordinate argument.
97 *
98 * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't
99 * preserve the system columns (tuple identity and transaction visibility
100 * info). The sort keys are specified by column numbers within the tuples
101 * and sort operator OIDs. We save some cycles by passing and returning the
102 * tuples in TupleTableSlots, rather than forming actual HeapTuples (which'd
103 * have to be converted to MinimalTuples). This API works well for sorts
104 * executed as parts of plan trees.
105 *
106 * The "cluster" API stores/sorts full HeapTuples including all visibility
107 * info. The sort keys are specified by reference to a btree index that is
108 * defined on the relation to be sorted. Note that putheaptuple/getheaptuple
109 * go with this API, not the "begin_heap" one!
110 *
111 * The "index_btree" API stores/sorts IndexTuples (preserving all their
112 * header fields). The sort keys are specified by a btree index definition.
113 *
114 * The "index_hash" API is similar to index_btree, but the tuples are
115 * actually sorted by their hash codes not the raw data.
116 *
117 * Parallel sort callers are required to coordinate multiple tuplesort states
118 * in a leader process and one or more worker processes. The leader process
119 * must launch workers, and have each perform an independent "partial"
120 * tuplesort, typically fed by the parallel heap interface. The leader later
121 * produces the final output (internally, it merges runs output by workers).
122 *
123 * Callers must do the following to perform a sort in parallel using multiple
124 * worker processes:
125 *
126 * 1. Request tuplesort-private shared memory for n workers. Use
127 * tuplesort_estimate_shared() to get the required size.
128 * 2. Have leader process initialize allocated shared memory using
129 * tuplesort_initialize_shared(). Launch workers.
130 * 3. Initialize a coordinate argument within both the leader process, and
131 * for each worker process. This has a pointer to the shared
132 * tuplesort-private structure, as well as some caller-initialized fields.
133 * Leader's coordinate argument reliably indicates number of workers
134 * launched (this is unused by workers).
135 * 4. Begin a tuplesort using some appropriate tuplesort_begin* routine,
136 * (passing the coordinate argument) within each worker. The workMem
137 * arguments need not be identical. All other arguments should match
138 * exactly, though.
139 * 5. tuplesort_attach_shared() should be called by all workers. Feed tuples
140 * to each worker, and call tuplesort_performsort() within each when input
141 * is exhausted.
142 * 6. Call tuplesort_end() in each worker process. Worker processes can shut
143 * down once tuplesort_end() returns.
144 * 7. Begin a tuplesort in the leader using the same tuplesort_begin*
145 * routine, passing a leader-appropriate coordinate argument (this can
146 * happen as early as during step 3, actually, since we only need to know
147 * the number of workers successfully launched). The leader must now wait
148 * for workers to finish. Caller must use own mechanism for ensuring that
149 * next step isn't reached until all workers have called and returned from
150 * tuplesort_performsort(). (Note that it's okay if workers have already
151 * also called tuplesort_end() by then.)
152 * 8. Call tuplesort_performsort() in leader. Consume output using the
153 * appropriate tuplesort_get* routine. Leader can skip this step if
154 * tuplesort turns out to be unnecessary.
155 * 9. Call tuplesort_end() in leader.
156 *
157 * This division of labor assumes nothing about how input tuples are produced,
158 * but does require that caller combine the state of multiple tuplesorts for
159 * any purpose other than producing the final output. For example, callers
160 * must consider that tuplesort_get_stats() reports on only one worker's role
161 * in a sort (or the leader's role), and not statistics for the sort as a
162 * whole.
163 *
164 * Note that callers may use the leader process to sort runs as if it was an
165 * independent worker process (prior to the process performing a leader sort
166 * to produce the final sorted output). Doing so only requires a second
167 * "partial" tuplesort within the leader process, initialized like that of a
168 * worker process. The steps above don't touch on this directly. The only
169 * difference is that the tuplesort_attach_shared() call is never needed within
170 * leader process, because the backend as a whole holds the shared fileset
171 * reference. A worker Tuplesortstate in leader is expected to do exactly the
172 * same amount of total initial processing work as a worker process
173 * Tuplesortstate, since the leader process has nothing else to do before
174 * workers finish.
175 *
176 * Note that only a very small amount of memory will be allocated prior to
177 * the leader state first consuming input, and that workers will free the
178 * vast majority of their memory upon returning from tuplesort_performsort().
179 * Callers can rely on this to arrange for memory to be used in a way that
180 * respects a workMem-style budget across an entire parallel sort operation.
181 *
182 * Callers are responsible for parallel safety in general. However, they
183 * can at least rely on there being no parallel safety hazards within
184 * tuplesort, because tuplesort thinks of the sort as several independent
185 * sorts whose results are combined. Since, in general, the behavior of
186 * sort operators is immutable, caller need only worry about the parallel
187 * safety of whatever the process is through which input tuples are
188 * generated (typically, caller uses a parallel heap scan).
189 */
190
191extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
192 int nkeys, AttrNumber *attNums,
193 Oid *sortOperators, Oid *sortCollations,
194 bool *nullsFirstFlags,
195 int workMem, SortCoordinate coordinate,
196 bool randomAccess);
197extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
198 Relation indexRel, int workMem,
199 SortCoordinate coordinate, bool randomAccess);
200extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel,
201 Relation indexRel,
202 bool enforceUnique,
203 int workMem, SortCoordinate coordinate,
204 bool randomAccess);
205extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel,
206 Relation indexRel,
207 uint32 high_mask,
208 uint32 low_mask,
209 uint32 max_buckets,
210 int workMem, SortCoordinate coordinate,
211 bool randomAccess);
212extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
213 Oid sortOperator, Oid sortCollation,
214 bool nullsFirstFlag,
215 int workMem, SortCoordinate coordinate,
216 bool randomAccess);
217
218extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
219
220extern void tuplesort_puttupleslot(Tuplesortstate *state,
221 TupleTableSlot *slot);
222extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup);
223extern void tuplesort_putindextuplevalues(Tuplesortstate *state,
224 Relation rel, ItemPointer self,
225 Datum *values, bool *isnull);
226extern void tuplesort_putdatum(Tuplesortstate *state, Datum val,
227 bool isNull);
228
229extern void tuplesort_performsort(Tuplesortstate *state);
230
231extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
232 bool copy, TupleTableSlot *slot, Datum *abbrev);
233extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward);
234extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward);
235extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
236 Datum *val, bool *isNull, Datum *abbrev);
237
238extern bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples,
239 bool forward);
240
241extern void tuplesort_end(Tuplesortstate *state);
242
243extern void tuplesort_get_stats(Tuplesortstate *state,
244 TuplesortInstrumentation *stats);
245extern const char *tuplesort_method_name(TuplesortMethod m);
246extern const char *tuplesort_space_type_name(TuplesortSpaceType t);
247
248extern int tuplesort_merge_order(int64 allowedMem);
249
250extern Size tuplesort_estimate_shared(int nworkers);
251extern void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers,
252 dsm_segment *seg);
253extern void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg);
254
255/*
256 * These routines may only be called if randomAccess was specified 'true'.
257 * Likewise, backwards scan in gettuple/getdatum is only allowed if
258 * randomAccess was specified. Note that parallel sorts do not support
259 * randomAccess.
260 */
261
262extern void tuplesort_rescan(Tuplesortstate *state);
263extern void tuplesort_markpos(Tuplesortstate *state);
264extern void tuplesort_restorepos(Tuplesortstate *state);
265
266#endif /* TUPLESORT_H */
267