| 1 | /*------------------------------------------------------------------------- | 
| 2 |  * | 
| 3 |  * tuplesort.h | 
| 4 |  *	  Generalized tuple sorting routines. | 
| 5 |  * | 
| 6 |  * This module handles sorting of heap tuples, index tuples, or single | 
| 7 |  * Datums (and could easily support other kinds of sortable objects, | 
| 8 |  * if necessary).  It works efficiently for both small and large amounts | 
| 9 |  * of data.  Small amounts are sorted in-memory using qsort().  Large | 
| 10 |  * amounts are sorted using temporary files and a standard external sort | 
| 11 |  * algorithm.  Parallel sorts use a variant of this external sort | 
| 12 |  * algorithm, and are typically only used for large amounts of data. | 
| 13 |  * | 
| 14 |  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group | 
| 15 |  * Portions Copyright (c) 1994, Regents of the University of California | 
| 16 |  * | 
| 17 |  * src/include/utils/tuplesort.h | 
| 18 |  * | 
| 19 |  *------------------------------------------------------------------------- | 
| 20 |  */ | 
| 21 | #ifndef TUPLESORT_H | 
| 22 | #define TUPLESORT_H | 
| 23 |  | 
| 24 | #include "access/itup.h" | 
| 25 | #include "executor/tuptable.h" | 
| 26 | #include "fmgr.h" | 
| 27 | #include "storage/dsm.h" | 
| 28 | #include "utils/relcache.h" | 
| 29 |  | 
| 30 |  | 
| 31 | /* | 
| 32 |  * Tuplesortstate and Sharedsort are opaque types whose details are not | 
| 33 |  * known outside tuplesort.c. | 
| 34 |  */ | 
| 35 | typedef struct Tuplesortstate Tuplesortstate; | 
| 36 | typedef struct Sharedsort Sharedsort; | 
| 37 |  | 
| 38 | /* | 
| 39 |  * Tuplesort parallel coordination state, allocated by each participant in | 
| 40 |  * local memory.  Participant caller initializes everything.  See usage notes | 
| 41 |  * below. | 
| 42 |  */ | 
| 43 | typedef struct SortCoordinateData | 
| 44 | { | 
| 45 | 	/* Worker process?  If not, must be leader. */ | 
| 46 | 	bool		isWorker; | 
| 47 |  | 
| 48 | 	/* | 
| 49 | 	 * Leader-process-passed number of participants known launched (workers | 
| 50 | 	 * set this to -1).  Includes state within leader needed for it to | 
| 51 | 	 * participate as a worker, if any. | 
| 52 | 	 */ | 
| 53 | 	int			nParticipants; | 
| 54 |  | 
| 55 | 	/* Private opaque state (points to shared memory) */ | 
| 56 | 	Sharedsort *sharedsort; | 
| 57 | }			SortCoordinateData; | 
| 58 |  | 
| 59 | typedef struct SortCoordinateData *SortCoordinate; | 
| 60 |  | 
| 61 | /* | 
| 62 |  * Data structures for reporting sort statistics.  Note that | 
| 63 |  * TuplesortInstrumentation can't contain any pointers because we | 
| 64 |  * sometimes put it in shared memory. | 
| 65 |  */ | 
| 66 | typedef enum | 
| 67 | { | 
| 68 | 	SORT_TYPE_STILL_IN_PROGRESS = 0, | 
| 69 | 	SORT_TYPE_TOP_N_HEAPSORT, | 
| 70 | 	SORT_TYPE_QUICKSORT, | 
| 71 | 	SORT_TYPE_EXTERNAL_SORT, | 
| 72 | 	SORT_TYPE_EXTERNAL_MERGE | 
| 73 | } TuplesortMethod; | 
| 74 |  | 
| 75 | typedef enum | 
| 76 | { | 
| 77 | 	SORT_SPACE_TYPE_DISK, | 
| 78 | 	SORT_SPACE_TYPE_MEMORY | 
| 79 | } TuplesortSpaceType; | 
| 80 |  | 
| 81 | typedef struct TuplesortInstrumentation | 
| 82 | { | 
| 83 | 	TuplesortMethod sortMethod; /* sort algorithm used */ | 
| 84 | 	TuplesortSpaceType spaceType;	/* type of space spaceUsed represents */ | 
| 85 | 	long		spaceUsed;		/* space consumption, in kB */ | 
| 86 | } TuplesortInstrumentation; | 
| 87 |  | 
| 88 |  | 
| 89 | /* | 
| 90 |  * We provide multiple interfaces to what is essentially the same code, | 
| 91 |  * since different callers have different data to be sorted and want to | 
| 92 |  * specify the sort key information differently.  There are two APIs for | 
| 93 |  * sorting HeapTuples and two more for sorting IndexTuples.  Yet another | 
| 94 |  * API supports sorting bare Datums. | 
| 95 |  * | 
| 96 |  * Serial sort callers should pass NULL for their coordinate argument. | 
| 97 |  * | 
| 98 |  * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't | 
| 99 |  * preserve the system columns (tuple identity and transaction visibility | 
| 100 |  * info).  The sort keys are specified by column numbers within the tuples | 
| 101 |  * and sort operator OIDs.  We save some cycles by passing and returning the | 
| 102 |  * tuples in TupleTableSlots, rather than forming actual HeapTuples (which'd | 
| 103 |  * have to be converted to MinimalTuples).  This API works well for sorts | 
| 104 |  * executed as parts of plan trees. | 
| 105 |  * | 
| 106 |  * The "cluster" API stores/sorts full HeapTuples including all visibility | 
| 107 |  * info. The sort keys are specified by reference to a btree index that is | 
| 108 |  * defined on the relation to be sorted.  Note that putheaptuple/getheaptuple | 
| 109 |  * go with this API, not the "begin_heap" one! | 
| 110 |  * | 
| 111 |  * The "index_btree" API stores/sorts IndexTuples (preserving all their | 
| 112 |  * header fields).  The sort keys are specified by a btree index definition. | 
| 113 |  * | 
| 114 |  * The "index_hash" API is similar to index_btree, but the tuples are | 
| 115 |  * actually sorted by their hash codes not the raw data. | 
| 116 |  * | 
| 117 |  * Parallel sort callers are required to coordinate multiple tuplesort states | 
| 118 |  * in a leader process and one or more worker processes.  The leader process | 
| 119 |  * must launch workers, and have each perform an independent "partial" | 
| 120 |  * tuplesort, typically fed by the parallel heap interface.  The leader later | 
| 121 |  * produces the final output (internally, it merges runs output by workers). | 
| 122 |  * | 
| 123 |  * Callers must do the following to perform a sort in parallel using multiple | 
| 124 |  * worker processes: | 
| 125 |  * | 
| 126 |  * 1. Request tuplesort-private shared memory for n workers.  Use | 
| 127 |  *    tuplesort_estimate_shared() to get the required size. | 
| 128 |  * 2. Have leader process initialize allocated shared memory using | 
| 129 |  *    tuplesort_initialize_shared().  Launch workers. | 
| 130 |  * 3. Initialize a coordinate argument within both the leader process, and | 
| 131 |  *    for each worker process.  This has a pointer to the shared | 
| 132 |  *    tuplesort-private structure, as well as some caller-initialized fields. | 
| 133 |  *    Leader's coordinate argument reliably indicates number of workers | 
| 134 |  *    launched (this is unused by workers). | 
| 135 |  * 4. Begin a tuplesort using some appropriate tuplesort_begin* routine, | 
| 136 |  *    (passing the coordinate argument) within each worker.  The workMem | 
| 137 |  *    arguments need not be identical.  All other arguments should match | 
| 138 |  *    exactly, though. | 
| 139 |  * 5. tuplesort_attach_shared() should be called by all workers.  Feed tuples | 
| 140 |  *    to each worker, and call tuplesort_performsort() within each when input | 
| 141 |  *    is exhausted. | 
| 142 |  * 6. Call tuplesort_end() in each worker process.  Worker processes can shut | 
| 143 |  *    down once tuplesort_end() returns. | 
| 144 |  * 7. Begin a tuplesort in the leader using the same tuplesort_begin* | 
| 145 |  *    routine, passing a leader-appropriate coordinate argument (this can | 
| 146 |  *    happen as early as during step 3, actually, since we only need to know | 
| 147 |  *    the number of workers successfully launched).  The leader must now wait | 
| 148 |  *    for workers to finish.  Caller must use own mechanism for ensuring that | 
| 149 |  *    next step isn't reached until all workers have called and returned from | 
| 150 |  *    tuplesort_performsort().  (Note that it's okay if workers have already | 
| 151 |  *    also called tuplesort_end() by then.) | 
| 152 |  * 8. Call tuplesort_performsort() in leader.  Consume output using the | 
| 153 |  *    appropriate tuplesort_get* routine.  Leader can skip this step if | 
| 154 |  *    tuplesort turns out to be unnecessary. | 
| 155 |  * 9. Call tuplesort_end() in leader. | 
| 156 |  * | 
| 157 |  * This division of labor assumes nothing about how input tuples are produced, | 
| 158 |  * but does require that caller combine the state of multiple tuplesorts for | 
| 159 |  * any purpose other than producing the final output.  For example, callers | 
| 160 |  * must consider that tuplesort_get_stats() reports on only one worker's role | 
| 161 |  * in a sort (or the leader's role), and not statistics for the sort as a | 
| 162 |  * whole. | 
| 163 |  * | 
| 164 |  * Note that callers may use the leader process to sort runs as if it was an | 
| 165 |  * independent worker process (prior to the process performing a leader sort | 
| 166 |  * to produce the final sorted output).  Doing so only requires a second | 
| 167 |  * "partial" tuplesort within the leader process, initialized like that of a | 
| 168 |  * worker process.  The steps above don't touch on this directly.  The only | 
| 169 |  * difference is that the tuplesort_attach_shared() call is never needed within | 
| 170 |  * leader process, because the backend as a whole holds the shared fileset | 
| 171 |  * reference.  A worker Tuplesortstate in leader is expected to do exactly the | 
| 172 |  * same amount of total initial processing work as a worker process | 
| 173 |  * Tuplesortstate, since the leader process has nothing else to do before | 
| 174 |  * workers finish. | 
| 175 |  * | 
| 176 |  * Note that only a very small amount of memory will be allocated prior to | 
| 177 |  * the leader state first consuming input, and that workers will free the | 
| 178 |  * vast majority of their memory upon returning from tuplesort_performsort(). | 
| 179 |  * Callers can rely on this to arrange for memory to be used in a way that | 
| 180 |  * respects a workMem-style budget across an entire parallel sort operation. | 
| 181 |  * | 
| 182 |  * Callers are responsible for parallel safety in general.  However, they | 
| 183 |  * can at least rely on there being no parallel safety hazards within | 
| 184 |  * tuplesort, because tuplesort thinks of the sort as several independent | 
| 185 |  * sorts whose results are combined.  Since, in general, the behavior of | 
| 186 |  * sort operators is immutable, caller need only worry about the parallel | 
| 187 |  * safety of whatever the process is through which input tuples are | 
| 188 |  * generated (typically, caller uses a parallel heap scan). | 
| 189 |  */ | 
| 190 |  | 
| 191 | extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc, | 
| 192 | 											int nkeys, AttrNumber *attNums, | 
| 193 | 											Oid *sortOperators, Oid *sortCollations, | 
| 194 | 											bool *nullsFirstFlags, | 
| 195 | 											int workMem, SortCoordinate coordinate, | 
| 196 | 											bool randomAccess); | 
| 197 | extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc, | 
| 198 | 											   Relation indexRel, int workMem, | 
| 199 | 											   SortCoordinate coordinate, bool randomAccess); | 
| 200 | extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel, | 
| 201 | 												   Relation indexRel, | 
| 202 | 												   bool enforceUnique, | 
| 203 | 												   int workMem, SortCoordinate coordinate, | 
| 204 | 												   bool randomAccess); | 
| 205 | extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, | 
| 206 | 												  Relation indexRel, | 
| 207 | 												  uint32 high_mask, | 
| 208 | 												  uint32 low_mask, | 
| 209 | 												  uint32 max_buckets, | 
| 210 | 												  int workMem, SortCoordinate coordinate, | 
| 211 | 												  bool randomAccess); | 
| 212 | extern Tuplesortstate *tuplesort_begin_datum(Oid datumType, | 
| 213 | 											 Oid sortOperator, Oid sortCollation, | 
| 214 | 											 bool nullsFirstFlag, | 
| 215 | 											 int workMem, SortCoordinate coordinate, | 
| 216 | 											 bool randomAccess); | 
| 217 |  | 
| 218 | extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound); | 
| 219 |  | 
| 220 | extern void tuplesort_puttupleslot(Tuplesortstate *state, | 
| 221 | 								   TupleTableSlot *slot); | 
| 222 | extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup); | 
| 223 | extern void tuplesort_putindextuplevalues(Tuplesortstate *state, | 
| 224 | 										  Relation rel, ItemPointer self, | 
| 225 | 										  Datum *values, bool *isnull); | 
| 226 | extern void tuplesort_putdatum(Tuplesortstate *state, Datum val, | 
| 227 | 							   bool isNull); | 
| 228 |  | 
| 229 | extern void tuplesort_performsort(Tuplesortstate *state); | 
| 230 |  | 
| 231 | extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward, | 
| 232 | 								   bool copy, TupleTableSlot *slot, Datum *abbrev); | 
| 233 | extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward); | 
| 234 | extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward); | 
| 235 | extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward, | 
| 236 | 							   Datum *val, bool *isNull, Datum *abbrev); | 
| 237 |  | 
| 238 | extern bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, | 
| 239 | 								 bool forward); | 
| 240 |  | 
| 241 | extern void tuplesort_end(Tuplesortstate *state); | 
| 242 |  | 
| 243 | extern void tuplesort_get_stats(Tuplesortstate *state, | 
| 244 | 								TuplesortInstrumentation *stats); | 
| 245 | extern const char *tuplesort_method_name(TuplesortMethod m); | 
| 246 | extern const char *tuplesort_space_type_name(TuplesortSpaceType t); | 
| 247 |  | 
| 248 | extern int	tuplesort_merge_order(int64 allowedMem); | 
| 249 |  | 
| 250 | extern Size tuplesort_estimate_shared(int nworkers); | 
| 251 | extern void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, | 
| 252 | 										dsm_segment *seg); | 
| 253 | extern void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg); | 
| 254 |  | 
| 255 | /* | 
| 256 |  * These routines may only be called if randomAccess was specified 'true'. | 
| 257 |  * Likewise, backwards scan in gettuple/getdatum is only allowed if | 
| 258 |  * randomAccess was specified.  Note that parallel sorts do not support | 
| 259 |  * randomAccess. | 
| 260 |  */ | 
| 261 |  | 
| 262 | extern void tuplesort_rescan(Tuplesortstate *state); | 
| 263 | extern void tuplesort_markpos(Tuplesortstate *state); | 
| 264 | extern void tuplesort_restorepos(Tuplesortstate *state); | 
| 265 |  | 
| 266 | #endif							/* TUPLESORT_H */ | 
| 267 |  |