| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * jsonb.h |
| 4 | * Declarations for jsonb data type support. |
| 5 | * |
| 6 | * Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 7 | * |
| 8 | * src/include/utils/jsonb.h |
| 9 | * |
| 10 | *------------------------------------------------------------------------- |
| 11 | */ |
| 12 | #ifndef __JSONB_H__ |
| 13 | #define __JSONB_H__ |
| 14 | |
| 15 | #include "lib/stringinfo.h" |
| 16 | #include "utils/array.h" |
| 17 | #include "utils/numeric.h" |
| 18 | |
| 19 | /* Tokens used when sequentially processing a jsonb value */ |
| 20 | typedef enum |
| 21 | { |
| 22 | WJB_DONE, |
| 23 | WJB_KEY, |
| 24 | WJB_VALUE, |
| 25 | WJB_ELEM, |
| 26 | WJB_BEGIN_ARRAY, |
| 27 | WJB_END_ARRAY, |
| 28 | WJB_BEGIN_OBJECT, |
| 29 | WJB_END_OBJECT |
| 30 | } JsonbIteratorToken; |
| 31 | |
| 32 | /* Strategy numbers for GIN index opclasses */ |
| 33 | #define JsonbContainsStrategyNumber 7 |
| 34 | #define JsonbExistsStrategyNumber 9 |
| 35 | #define JsonbExistsAnyStrategyNumber 10 |
| 36 | #define JsonbExistsAllStrategyNumber 11 |
| 37 | #define JsonbJsonpathExistsStrategyNumber 15 |
| 38 | #define JsonbJsonpathPredicateStrategyNumber 16 |
| 39 | |
| 40 | |
| 41 | /* |
| 42 | * In the standard jsonb_ops GIN opclass for jsonb, we choose to index both |
| 43 | * keys and values. The storage format is text. The first byte of the text |
| 44 | * string distinguishes whether this is a key (always a string), null value, |
| 45 | * boolean value, numeric value, or string value. However, array elements |
| 46 | * that are strings are marked as though they were keys; this imprecision |
| 47 | * supports the definition of the "exists" operator, which treats array |
| 48 | * elements like keys. The remainder of the text string is empty for a null |
| 49 | * value, "t" or "f" for a boolean value, a normalized print representation of |
| 50 | * a numeric value, or the text of a string value. However, if the length of |
| 51 | * this text representation would exceed JGIN_MAXLENGTH bytes, we instead hash |
| 52 | * the text representation and store an 8-hex-digit representation of the |
| 53 | * uint32 hash value, marking the prefix byte with an additional bit to |
| 54 | * distinguish that this has happened. Hashing long strings saves space and |
| 55 | * ensures that we won't overrun the maximum entry length for a GIN index. |
| 56 | * (But JGIN_MAXLENGTH is quite a bit shorter than GIN's limit. It's chosen |
| 57 | * to ensure that the on-disk text datum will have a short varlena header.) |
| 58 | * Note that when any hashed item appears in a query, we must recheck index |
| 59 | * matches against the heap tuple; currently, this costs nothing because we |
| 60 | * must always recheck for other reasons. |
| 61 | */ |
| 62 | #define JGINFLAG_KEY 0x01 /* key (or string array element) */ |
| 63 | #define JGINFLAG_NULL 0x02 /* null value */ |
| 64 | #define JGINFLAG_BOOL 0x03 /* boolean value */ |
| 65 | #define JGINFLAG_NUM 0x04 /* numeric value */ |
| 66 | #define JGINFLAG_STR 0x05 /* string value (if not an array element) */ |
| 67 | #define JGINFLAG_HASHED 0x10 /* OR'd into flag if value was hashed */ |
| 68 | #define JGIN_MAXLENGTH 125 /* max length of text part before hashing */ |
| 69 | |
| 70 | /* Convenience macros */ |
| 71 | #define DatumGetJsonbP(d) ((Jsonb *) PG_DETOAST_DATUM(d)) |
| 72 | #define DatumGetJsonbPCopy(d) ((Jsonb *) PG_DETOAST_DATUM_COPY(d)) |
| 73 | #define JsonbPGetDatum(p) PointerGetDatum(p) |
| 74 | #define PG_GETARG_JSONB_P(x) DatumGetJsonbP(PG_GETARG_DATUM(x)) |
| 75 | #define PG_GETARG_JSONB_P_COPY(x) DatumGetJsonbPCopy(PG_GETARG_DATUM(x)) |
| 76 | #define PG_RETURN_JSONB_P(x) PG_RETURN_POINTER(x) |
| 77 | |
| 78 | typedef struct JsonbPair JsonbPair; |
| 79 | typedef struct JsonbValue JsonbValue; |
| 80 | |
| 81 | /* |
| 82 | * Jsonbs are varlena objects, so must meet the varlena convention that the |
| 83 | * first int32 of the object contains the total object size in bytes. Be sure |
| 84 | * to use VARSIZE() and SET_VARSIZE() to access it, though! |
| 85 | * |
| 86 | * Jsonb is the on-disk representation, in contrast to the in-memory JsonbValue |
| 87 | * representation. Often, JsonbValues are just shims through which a Jsonb |
| 88 | * buffer is accessed, but they can also be deep copied and passed around. |
| 89 | * |
| 90 | * Jsonb is a tree structure. Each node in the tree consists of a JEntry |
| 91 | * header and a variable-length content (possibly of zero size). The JEntry |
| 92 | * header indicates what kind of a node it is, e.g. a string or an array, |
| 93 | * and provides the length of its variable-length portion. |
| 94 | * |
| 95 | * The JEntry and the content of a node are not stored physically together. |
| 96 | * Instead, the container array or object has an array that holds the JEntrys |
| 97 | * of all the child nodes, followed by their variable-length portions. |
| 98 | * |
| 99 | * The root node is an exception; it has no parent array or object that could |
| 100 | * hold its JEntry. Hence, no JEntry header is stored for the root node. It |
| 101 | * is implicitly known that the root node must be an array or an object, |
| 102 | * so we can get away without the type indicator as long as we can distinguish |
| 103 | * the two. For that purpose, both an array and an object begin with a uint32 |
| 104 | * header field, which contains an JB_FOBJECT or JB_FARRAY flag. When a naked |
| 105 | * scalar value needs to be stored as a Jsonb value, what we actually store is |
| 106 | * an array with one element, with the flags in the array's header field set |
| 107 | * to JB_FSCALAR | JB_FARRAY. |
| 108 | * |
| 109 | * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct, |
| 110 | * the variable-length portion of some node types is aligned to a 4-byte |
| 111 | * boundary, while others are not. When alignment is needed, the padding is |
| 112 | * in the beginning of the node that requires it. For example, if a numeric |
| 113 | * node is stored after a string node, so that the numeric node begins at |
| 114 | * offset 3, the variable-length portion of the numeric node will begin with |
| 115 | * one padding byte so that the actual numeric data is 4-byte aligned. |
| 116 | */ |
| 117 | |
| 118 | /* |
| 119 | * JEntry format. |
| 120 | * |
| 121 | * The least significant 28 bits store either the data length of the entry, |
| 122 | * or its end+1 offset from the start of the variable-length portion of the |
| 123 | * containing object. The next three bits store the type of the entry, and |
| 124 | * the high-order bit tells whether the least significant bits store a length |
| 125 | * or an offset. |
| 126 | * |
| 127 | * The reason for the offset-or-length complication is to compromise between |
| 128 | * access speed and data compressibility. In the initial design each JEntry |
| 129 | * always stored an offset, but this resulted in JEntry arrays with horrible |
| 130 | * compressibility properties, so that TOAST compression of a JSONB did not |
| 131 | * work well. Storing only lengths would greatly improve compressibility, |
| 132 | * but it makes random access into large arrays expensive (O(N) not O(1)). |
| 133 | * So what we do is store an offset in every JB_OFFSET_STRIDE'th JEntry and |
| 134 | * a length in the rest. This results in reasonably compressible data (as |
| 135 | * long as the stride isn't too small). We may have to examine as many as |
| 136 | * JB_OFFSET_STRIDE JEntrys in order to find out the offset or length of any |
| 137 | * given item, but that's still O(1) no matter how large the container is. |
| 138 | * |
| 139 | * We could avoid eating a flag bit for this purpose if we were to store |
| 140 | * the stride in the container header, or if we were willing to treat the |
| 141 | * stride as an unchangeable constant. Neither of those options is very |
| 142 | * attractive though. |
| 143 | */ |
| 144 | typedef uint32 JEntry; |
| 145 | |
| 146 | #define JENTRY_OFFLENMASK 0x0FFFFFFF |
| 147 | #define JENTRY_TYPEMASK 0x70000000 |
| 148 | #define JENTRY_HAS_OFF 0x80000000 |
| 149 | |
| 150 | /* values stored in the type bits */ |
| 151 | #define JENTRY_ISSTRING 0x00000000 |
| 152 | #define JENTRY_ISNUMERIC 0x10000000 |
| 153 | #define JENTRY_ISBOOL_FALSE 0x20000000 |
| 154 | #define JENTRY_ISBOOL_TRUE 0x30000000 |
| 155 | #define JENTRY_ISNULL 0x40000000 |
| 156 | #define JENTRY_ISCONTAINER 0x50000000 /* array or object */ |
| 157 | |
| 158 | /* Access macros. Note possible multiple evaluations */ |
| 159 | #define JBE_OFFLENFLD(je_) ((je_) & JENTRY_OFFLENMASK) |
| 160 | #define JBE_HAS_OFF(je_) (((je_) & JENTRY_HAS_OFF) != 0) |
| 161 | #define JBE_ISSTRING(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING) |
| 162 | #define JBE_ISNUMERIC(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC) |
| 163 | #define JBE_ISCONTAINER(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER) |
| 164 | #define JBE_ISNULL(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNULL) |
| 165 | #define JBE_ISBOOL_TRUE(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_TRUE) |
| 166 | #define JBE_ISBOOL_FALSE(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_FALSE) |
| 167 | #define JBE_ISBOOL(je_) (JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_)) |
| 168 | |
| 169 | /* Macro for advancing an offset variable to the next JEntry */ |
| 170 | #define JBE_ADVANCE_OFFSET(offset, je) \ |
| 171 | do { \ |
| 172 | JEntry je_ = (je); \ |
| 173 | if (JBE_HAS_OFF(je_)) \ |
| 174 | (offset) = JBE_OFFLENFLD(je_); \ |
| 175 | else \ |
| 176 | (offset) += JBE_OFFLENFLD(je_); \ |
| 177 | } while(0) |
| 178 | |
| 179 | /* |
| 180 | * We store an offset, not a length, every JB_OFFSET_STRIDE children. |
| 181 | * Caution: this macro should only be referenced when creating a JSONB |
| 182 | * value. When examining an existing value, pay attention to the HAS_OFF |
| 183 | * bits instead. This allows changes in the offset-placement heuristic |
| 184 | * without breaking on-disk compatibility. |
| 185 | */ |
| 186 | #define JB_OFFSET_STRIDE 32 |
| 187 | |
| 188 | /* |
| 189 | * A jsonb array or object node, within a Jsonb Datum. |
| 190 | * |
| 191 | * An array has one child for each element, stored in array order. |
| 192 | * |
| 193 | * An object has two children for each key/value pair. The keys all appear |
| 194 | * first, in key sort order; then the values appear, in an order matching the |
| 195 | * key order. This arrangement keeps the keys compact in memory, making a |
| 196 | * search for a particular key more cache-friendly. |
| 197 | */ |
| 198 | typedef struct JsonbContainer |
| 199 | { |
| 200 | uint32 ; /* number of elements or key/value pairs, and |
| 201 | * flags */ |
| 202 | JEntry children[FLEXIBLE_ARRAY_MEMBER]; |
| 203 | |
| 204 | /* the data for each child node follows. */ |
| 205 | } JsonbContainer; |
| 206 | |
| 207 | /* flags for the header-field in JsonbContainer */ |
| 208 | #define JB_CMASK 0x0FFFFFFF /* mask for count field */ |
| 209 | #define JB_FSCALAR 0x10000000 /* flag bits */ |
| 210 | #define JB_FOBJECT 0x20000000 |
| 211 | #define JB_FARRAY 0x40000000 |
| 212 | |
| 213 | /* convenience macros for accessing a JsonbContainer struct */ |
| 214 | #define JsonContainerSize(jc) ((jc)->header & JB_CMASK) |
| 215 | #define JsonContainerIsScalar(jc) (((jc)->header & JB_FSCALAR) != 0) |
| 216 | #define JsonContainerIsObject(jc) (((jc)->header & JB_FOBJECT) != 0) |
| 217 | #define JsonContainerIsArray(jc) (((jc)->header & JB_FARRAY) != 0) |
| 218 | |
| 219 | /* The top-level on-disk format for a jsonb datum. */ |
| 220 | typedef struct |
| 221 | { |
| 222 | int32 vl_len_; /* varlena header (do not touch directly!) */ |
| 223 | JsonbContainer root; |
| 224 | } Jsonb; |
| 225 | |
| 226 | /* convenience macros for accessing the root container in a Jsonb datum */ |
| 227 | #define JB_ROOT_COUNT(jbp_) (*(uint32 *) VARDATA(jbp_) & JB_CMASK) |
| 228 | #define JB_ROOT_IS_SCALAR(jbp_) ((*(uint32 *) VARDATA(jbp_) & JB_FSCALAR) != 0) |
| 229 | #define JB_ROOT_IS_OBJECT(jbp_) ((*(uint32 *) VARDATA(jbp_) & JB_FOBJECT) != 0) |
| 230 | #define JB_ROOT_IS_ARRAY(jbp_) ((*(uint32 *) VARDATA(jbp_) & JB_FARRAY) != 0) |
| 231 | |
| 232 | |
| 233 | enum jbvType |
| 234 | { |
| 235 | /* Scalar types */ |
| 236 | jbvNull = 0x0, |
| 237 | jbvString, |
| 238 | jbvNumeric, |
| 239 | jbvBool, |
| 240 | /* Composite types */ |
| 241 | jbvArray = 0x10, |
| 242 | jbvObject, |
| 243 | /* Binary (i.e. struct Jsonb) jbvArray/jbvObject */ |
| 244 | jbvBinary |
| 245 | }; |
| 246 | |
| 247 | /* |
| 248 | * JsonbValue: In-memory representation of Jsonb. This is a convenient |
| 249 | * deserialized representation, that can easily support using the "val" |
| 250 | * union across underlying types during manipulation. The Jsonb on-disk |
| 251 | * representation has various alignment considerations. |
| 252 | */ |
| 253 | struct JsonbValue |
| 254 | { |
| 255 | enum jbvType type; /* Influences sort order */ |
| 256 | |
| 257 | union |
| 258 | { |
| 259 | Numeric numeric; |
| 260 | bool boolean; |
| 261 | struct |
| 262 | { |
| 263 | int len; |
| 264 | char *val; /* Not necessarily null-terminated */ |
| 265 | } string; /* String primitive type */ |
| 266 | |
| 267 | struct |
| 268 | { |
| 269 | int nElems; |
| 270 | JsonbValue *elems; |
| 271 | bool rawScalar; /* Top-level "raw scalar" array? */ |
| 272 | } array; /* Array container type */ |
| 273 | |
| 274 | struct |
| 275 | { |
| 276 | int nPairs; /* 1 pair, 2 elements */ |
| 277 | JsonbPair *pairs; |
| 278 | } object; /* Associative container type */ |
| 279 | |
| 280 | struct |
| 281 | { |
| 282 | int len; |
| 283 | JsonbContainer *data; |
| 284 | } binary; /* Array or object, in on-disk format */ |
| 285 | } val; |
| 286 | }; |
| 287 | |
| 288 | #define IsAJsonbScalar(jsonbval) ((jsonbval)->type >= jbvNull && \ |
| 289 | (jsonbval)->type <= jbvBool) |
| 290 | |
| 291 | /* |
| 292 | * Key/value pair within an Object. |
| 293 | * |
| 294 | * This struct type is only used briefly while constructing a Jsonb; it is |
| 295 | * *not* the on-disk representation. |
| 296 | * |
| 297 | * Pairs with duplicate keys are de-duplicated. We store the originally |
| 298 | * observed pair ordering for the purpose of removing duplicates in a |
| 299 | * well-defined way (which is "last observed wins"). |
| 300 | */ |
| 301 | struct JsonbPair |
| 302 | { |
| 303 | JsonbValue key; /* Must be a jbvString */ |
| 304 | JsonbValue value; /* May be of any type */ |
| 305 | uint32 order; /* Pair's index in original sequence */ |
| 306 | }; |
| 307 | |
| 308 | /* Conversion state used when parsing Jsonb from text, or for type coercion */ |
| 309 | typedef struct JsonbParseState |
| 310 | { |
| 311 | JsonbValue contVal; |
| 312 | Size size; |
| 313 | struct JsonbParseState *next; |
| 314 | } JsonbParseState; |
| 315 | |
| 316 | /* |
| 317 | * JsonbIterator holds details of the type for each iteration. It also stores a |
| 318 | * Jsonb varlena buffer, which can be directly accessed in some contexts. |
| 319 | */ |
| 320 | typedef enum |
| 321 | { |
| 322 | JBI_ARRAY_START, |
| 323 | JBI_ARRAY_ELEM, |
| 324 | JBI_OBJECT_START, |
| 325 | JBI_OBJECT_KEY, |
| 326 | JBI_OBJECT_VALUE |
| 327 | } JsonbIterState; |
| 328 | |
| 329 | typedef struct JsonbIterator |
| 330 | { |
| 331 | /* Container being iterated */ |
| 332 | JsonbContainer *container; |
| 333 | uint32 nElems; /* Number of elements in children array (will |
| 334 | * be nPairs for objects) */ |
| 335 | bool isScalar; /* Pseudo-array scalar value? */ |
| 336 | JEntry *children; /* JEntrys for child nodes */ |
| 337 | /* Data proper. This points to the beginning of the variable-length data */ |
| 338 | char *dataProper; |
| 339 | |
| 340 | /* Current item in buffer (up to nElems) */ |
| 341 | int curIndex; |
| 342 | |
| 343 | /* Data offset corresponding to current item */ |
| 344 | uint32 curDataOffset; |
| 345 | |
| 346 | /* |
| 347 | * If the container is an object, we want to return keys and values |
| 348 | * alternately; so curDataOffset points to the current key, and |
| 349 | * curValueOffset points to the current value. |
| 350 | */ |
| 351 | uint32 curValueOffset; |
| 352 | |
| 353 | /* Private state */ |
| 354 | JsonbIterState state; |
| 355 | |
| 356 | struct JsonbIterator *parent; |
| 357 | } JsonbIterator; |
| 358 | |
| 359 | |
| 360 | /* Support functions */ |
| 361 | extern uint32 getJsonbOffset(const JsonbContainer *jc, int index); |
| 362 | extern uint32 getJsonbLength(const JsonbContainer *jc, int index); |
| 363 | extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b); |
| 364 | extern JsonbValue *findJsonbValueFromContainer(JsonbContainer *, |
| 365 | uint32 flags, |
| 366 | JsonbValue *key); |
| 367 | extern JsonbValue *getIthJsonbValueFromContainer(JsonbContainer *, |
| 368 | uint32 i); |
| 369 | extern JsonbValue *pushJsonbValue(JsonbParseState **pstate, |
| 370 | JsonbIteratorToken seq, JsonbValue *jbVal); |
| 371 | extern JsonbIterator *JsonbIteratorInit(JsonbContainer *container); |
| 372 | extern JsonbIteratorToken JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, |
| 373 | bool skipNested); |
| 374 | extern Jsonb *JsonbValueToJsonb(JsonbValue *val); |
| 375 | extern bool JsonbDeepContains(JsonbIterator **val, |
| 376 | JsonbIterator **mContained); |
| 377 | extern void JsonbHashScalarValue(const JsonbValue *scalarVal, uint32 *hash); |
| 378 | extern void JsonbHashScalarValueExtended(const JsonbValue *scalarVal, |
| 379 | uint64 *hash, uint64 seed); |
| 380 | |
| 381 | /* jsonb.c support functions */ |
| 382 | extern char *JsonbToCString(StringInfo out, JsonbContainer *in, |
| 383 | int estimated_len); |
| 384 | extern char *JsonbToCStringIndent(StringInfo out, JsonbContainer *in, |
| 385 | int estimated_len); |
| 386 | extern bool (JsonbContainer *jbc, JsonbValue *res); |
| 387 | extern const char *JsonbTypeName(JsonbValue *jb); |
| 388 | |
| 389 | |
| 390 | #endif /* __JSONB_H__ */ |
| 391 | |