| 1 | // Copyright 2001 and onwards Google Inc. |
| 2 | |
| 3 | #ifndef BASE_DOCID_H_ |
| 4 | #define BASE_DOCID_H_ |
| 5 | |
| 6 | #include <assert.h> |
| 7 | |
| 8 | // sys/types.h is only needed by this file if !defined(NDEBUG), |
| 9 | // but including it conditionally can cause hard-to-track-down |
| 10 | // compilation failures that only happen when doing optimized builds. |
| 11 | // Let's include it unconditionally to reduce the number of such breakages |
| 12 | // we need to track down. |
| 13 | // Note that uint is defined by sys/types.h only if _GNU_SOURCE or its ilk is |
| 14 | // defined, and that mysql.h seems to define _GNU_SOURCE, so the problem |
| 15 | // may only happen if sys/types.h is included after mysql.h. |
| 16 | // We now define uint in base/port.h if needed, and in a way that won't clash. |
| 17 | #include <sys/types.h> // for size_t |
| 18 | |
| 19 | #include "base/basictypes.h" |
| 20 | |
| 21 | #define DOCID_FORMAT "%llu" |
| 22 | #define DOCID32BIT_FORMAT "%u" |
| 23 | |
// The following help ensure type safety for docids.  Use them whenever
// you printf/scanf a docid.  Note that even so, when the type of docid
// changes you will still need to change the format arguments of all
// these printf/scanf statements by hand (to be %qu, say).
//
// We provide two kinds of docids to facilitate the transition from
// 32-bit docids to 64-bit docids.  Both are needed during the transition
// period, because the base index will use 32-bit docids while the
// incremental index will use 64-bit docids.
| 33 | |
| 34 | #if defined NDEBUG |
| 35 | typedef uint64 DocId; |
| 36 | const DocId kMaxDocId = DocId(GG_ULONGLONG(0xFFFFFFFFFFFFFFFF)); |
| 37 | |
| 38 | static inline uint64 DocidForPrintf(const DocId& d) { return d; } |
| 39 | static inline uint64* DocidForScanf(DocId* d) { return d; } |
| 40 | // Used with the SparseBitmap class, primarily |
| 41 | static inline uint64 DocidForBitmap(const DocId& d) { return d; } |
| 42 | // Used with the docservercache stuff, for odd but reasonable reasons |
| 43 | static inline uint64 DocidForFingerprinting(const DocId& d) { return d; } |
| 44 | // Used with the protocol-buffer stuff |
| 45 | static inline uint64 DocidForProtocolBuffer(const DocId& d) { return d; } |
| 46 | // Used during anchor processing |
| 47 | static inline uint64 DocidForAnchorPosition(const DocId& d) { return d; } |
| 48 | // Used for checkpointing |
| 49 | static inline uint64 DocidForCheckpointing(const DocId& d) { return d; } |
| 50 | // Used for approximate dups detection |
| 51 | static inline uint64 DocidForApproxDups(const DocId& d) { return d; } |
| 52 | // Used for SWIG |
| 53 | static inline uint64* DocidForSWIG(DocId* d) { return d; } |
| 54 | |
| 55 | // Used for miscellaneous arithmetic. |
| 56 | static inline uint64 DocIdAsNumber(const DocId& d) { return d; } |
| 57 | |
| 58 | // ------------------------------------------------------------------ |
| 59 | // 32 bit docids |
| 60 | // ------------------------------------------------------------------ |
| 61 | typedef uint32 DocId32Bit; |
| 62 | const DocId32Bit kMaxDocId32Bit = static_cast<DocId32Bit>(0xFFFFFFFFul); |
| 63 | static inline uint32 Docid32BitForPrintf(const DocId32Bit& d) { return d; } |
| 64 | static inline uint32* Docid32BitForScanf(DocId32Bit* d) { return d; } |
| 65 | static inline uint32 Docid32BitForBitmap(const DocId32Bit& d) { return d; } |
| 66 | static inline uint32 Docid32BitForFingerprinting(const DocId32Bit& d) { |
| 67 | return d; |
| 68 | } |
| 69 | static inline uint32 Docid32BitForEncryption(const DocId32Bit &d) { |
| 70 | return d; |
| 71 | } |
| 72 | static inline uint32 Docid32BitForProtocolBuffer(const DocId32Bit& d) { |
| 73 | return d; |
| 74 | } |
| 75 | static inline uint32 Docid32BitForAnchorPosition(const DocId32Bit& d) { |
| 76 | return d; |
| 77 | } |
| 78 | static inline uint32 Docid32BitForCheckpointing(const DocId32Bit& d) { |
| 79 | return d; |
| 80 | } |
| 81 | static inline uint32 Docid32BitForApproxDups(const DocId32Bit& d) { return d; } |
| 82 | |
| 83 | static inline uint32 DocId32BitAsNumber(const DocId32Bit& d) { return d; } |
| 84 | |
| 85 | // return true if the value cannot fit in DocId32Bit |
| 86 | static inline bool Is64BitDocId(const DocId& d) { return d > kMaxDocId32Bit; } |
| 87 | |
| 88 | #else |
| 89 | // In debug mode, we make docid its own type so we can be sure of |
| 90 | // type-safety. In particular, we're concerned with people who treat |
| 91 | // docids as if they were signed, or who assume they're 32 bits long. |
| 92 | // We don't define this always because it results in 14% bigger executables. |
| 93 | // |
| 94 | // DocId is thread-compatible. |
| 95 | #define BINPRED_(pred) \ |
| 96 | bool operator pred (const DocId& x) const { return docid_ pred x.docid_; } |
| 97 | class DocId { |
| 98 | protected: |
| 99 | typedef uint64 value_type; |
| 100 | value_type docid_; |
| 101 | |
| 102 | public: |
| 103 | DocId() { docid_ = 0; } |
| 104 | explicit DocId(value_type docid) { docid_ = docid; } |
| 105 | value_type docid() const { return docid_; } // ONLY use here |
| 106 | value_type* docidptr() { return &docid_; } // ie in this file |
| 107 | DocId& operator=(const DocId& x) { |
| 108 | docid_ = x.docid_; |
| 109 | return *this; |
| 110 | } |
| 111 | // These two are required for delta encoding |
| 112 | value_type operator+(const DocId& x) const { return docid_ + x.docid_; } |
| 113 | value_type operator-(const DocId& x) const { return docid_ - x.docid_; } |
| 114 | |
| 115 | // needed in incrementalpr.cc |
| 116 | DocId operator+(uint64 x) const { return DocId(docid_ + x); } |
| 117 | DocId operator-(uint64 x) const { return DocId(docid_ - x); } |
| 118 | // Required for hashing, typically |
| 119 | value_type operator%(value_type m) const { return docid_ % m; } |
| 120 | // Required for sortedrepptrs.cc and partialindexreader.cc |
| 121 | DocId& operator+=(uint64 x) { |
| 122 | docid_ += x; |
| 123 | return *this; |
| 124 | } |
| 125 | // Required for moreoverreposwriter.cc & sortpageranks.cc |
| 126 | DocId& operator++() { |
| 127 | ++docid_; |
| 128 | return *this; |
| 129 | } |
| 130 | DocId& operator--() { |
| 131 | --docid_; |
| 132 | return *this; |
| 133 | } |
| 134 | // Required by dup_docid_categorizer.cc and some other twiddlers, who needs |
| 135 | // to convert 64-bit doc id into 32-bit category id. Now the only intended use |
| 136 | // of this operator is to mask off the top bits of a docid. |
| 137 | value_type operator&(value_type x) const { return docid_ & x; } |
| 138 | // Comparators |
| 139 | BINPRED_(==) BINPRED_(!=) BINPRED_(<) BINPRED_(>) BINPRED_(<=) BINPRED_(>=) |
| 140 | }; |
| 141 | DECLARE_POD(DocId); |
| 142 | #undef BINPRED_ |
| 143 | |
| 144 | // Required for anchorbucket.cc |
| 145 | inline double operator/(double a, const DocId& b) { |
| 146 | return a / static_cast<double>(b.docid()); } |
| 147 | // Required for restrict-tool.cc |
| 148 | inline uint64 operator/(const DocId& a, uint64 b) { return a.docid() / b; } |
| 149 | // Required for index-request.cc |
| 150 | inline double operator*(const DocId& a, double b) { |
| 151 | return static_cast<double>(a.docid()) * b; } |
| 152 | // Required for linksreader.cc |
| 153 | inline uint64 operator*(const DocId& a, uint64 b) { return a.docid() * b; } |
| 154 | // Required for indexdefs.h (really should be HitPosition, not uint64) |
| 155 | inline uint64 operator+(uint64 a, const DocId& b) { return a + b.docid(); } |
| 156 | |
| 157 | // Required for hashing docids. docservercache.cc needs to fingerprint them. |
| 158 | #if !defined __SGI_STL_HASH_FUN_H // taken from stl_decl.h |
| 159 | HASH_NAMESPACE_DECLARATION_START |
| 160 | template <class Key> struct hash; |
| 161 | HASH_NAMESPACE_DECLARATION_END |
| 162 | #endif |
| 163 | HASH_NAMESPACE_DECLARATION_START |
| 164 | template<> struct hash<DocId> { |
| 165 | size_t operator()(const DocId& d) const { |
| 166 | return static_cast<size_t>(d.docid()); |
| 167 | } |
| 168 | }; |
| 169 | HASH_NAMESPACE_DECLARATION_END |
| 170 | |
| 171 | const DocId kMaxDocId = DocId(GG_ULONGLONG(0xFFFFFFFFFFFFFFFF)); |
| 172 | |
| 173 | // These are defined in non-debug mode too; they're always-on type-safety |
| 174 | static inline uint64 DocidForPrintf(const DocId& d) { return d.docid(); } |
| 175 | static inline uint64* DocidForScanf(DocId* d) { return d->docidptr(); } |
| 176 | static inline uint64 DocidForBitmap(const DocId& d) { return d.docid(); } |
| 177 | static inline uint64 DocidForFingerprinting(const DocId& d) |
| 178 | { return d.docid(); } |
| 179 | static inline uint64 DocidForProtocolBuffer(const DocId& d) |
| 180 | { return d.docid(); } |
| 181 | static inline uint64 DocidForAnchorPosition(const DocId& d) |
| 182 | { return d.docid(); } |
| 183 | static inline uint64 DocidForCheckpointing(const DocId& d) |
| 184 | { return d.docid(); } |
| 185 | static inline uint64 DocidForApproxDups(const DocId& d) { return d.docid(); } |
| 186 | static inline uint64 DocIdAsNumber(const DocId& d) { return d.docid(); } |
| 187 | static inline uint64* DocidForSWIG(DocId* d) { return d->docidptr(); } |
| 188 | |
| 189 | // --------------------------------------------------------------------- |
| 190 | // 32 bit docids |
| 191 | // |
| 192 | // --------------------------------------------------------------------- |
| 193 | |
| 194 | #define BINPRED32Bit_(pred) \ |
| 195 | bool operator pred(const DocId32Bit& x) const { \ |
| 196 | return docid_ pred x.docid_; \ |
| 197 | } |
| 198 | |
| 199 | class DocId32Bit { |
| 200 | protected: |
| 201 | typedef uint32 value_type; |
| 202 | value_type docid_; |
| 203 | |
| 204 | public: |
| 205 | DocId32Bit() { docid_ = 0; } |
| 206 | explicit DocId32Bit(value_type docid) { docid_ = docid; } |
| 207 | explicit DocId32Bit(DocId docid) { |
| 208 | // we can't use kMaxDocId32Bit as it is not yet defined |
| 209 | assert(DocIdAsNumber(docid) <= 0xFFFFFFFFul); |
| 210 | docid_ = static_cast<uint32> (DocIdAsNumber(docid)); |
| 211 | } |
| 212 | value_type docid() const { return docid_; } // ONLY use here |
| 213 | value_type* docidptr() { return &docid_; } // ie in this file |
| 214 | DocId32Bit& operator=(const DocId32Bit& x) { |
| 215 | docid_ = x.docid_; |
| 216 | return *this; |
| 217 | } |
| 218 | // These two are required for delta encoding |
| 219 | value_type operator+(const DocId32Bit& x) const { return docid_ + x.docid_; } |
| 220 | value_type operator-(const DocId32Bit& x) const { return docid_ - x.docid_; } |
| 221 | |
| 222 | // needed in incrementalpr.cc |
| 223 | DocId32Bit operator+(uint32 x) const { return DocId32Bit(docid_ + x); } |
| 224 | DocId32Bit operator-(uint32 x) const { return DocId32Bit(docid_ - x); } |
| 225 | // Required for hashing, typically |
| 226 | value_type operator%(value_type m) const { return docid_ % m; } |
| 227 | // Required for sortedrepptrs.cc and partialindexreader.cc |
| 228 | DocId32Bit& operator+=(uint32 x) { |
| 229 | docid_ += x; |
| 230 | return *this; |
| 231 | } |
| 232 | // Required for moreoverreposwriter.cc & sortpageranks.cc |
| 233 | DocId32Bit& operator++() { |
| 234 | ++docid_; |
| 235 | return *this; |
| 236 | } |
| 237 | DocId32Bit& operator--() { |
| 238 | --docid_; |
| 239 | return *this; |
| 240 | } |
| 241 | // Comparators |
| 242 | BINPRED32Bit_(==) BINPRED32Bit_(!=) BINPRED32Bit_(<) BINPRED32Bit_(>) |
| 243 | BINPRED32Bit_(<=) BINPRED32Bit_(>=) |
| 244 | }; |
| 245 | DECLARE_POD(DocId32Bit); |
| 246 | #undef BINPRED32Bit_ |
| 247 | |
| 248 | const DocId32Bit kMaxDocId32Bit = DocId32Bit(0xFFFFFFFFul); |
| 249 | static inline uint32 Docid32BitForBitmap(const DocId32Bit& d) |
| 250 | { return d.docid(); } |
| 251 | static inline uint32 Docid32BitForPrintf(const DocId32Bit& d) |
| 252 | { return d.docid(); } |
| 253 | static inline uint32* Docid32BitForScanf(DocId32Bit* d) |
| 254 | { return d->docidptr(); } |
| 255 | static inline uint32 Docid32BitForFingerprinting(const DocId32Bit& d) |
| 256 | { return d.docid(); } |
| 257 | static inline uint32 Docid32BitForEncryption(const DocId32Bit& d) |
| 258 | { return d.docid(); } |
| 259 | static inline int32 Docid32BitForProtocolBuffer(const DocId32Bit& d) |
| 260 | { return d.docid(); } |
| 261 | static inline uint32 Docid32BitForAnchorPosition(const DocId32Bit& d) |
| 262 | { return d.docid(); } |
| 263 | static inline uint32 Docid32BitForCheckpointing(const DocId32Bit& d) |
| 264 | { return d.docid(); } |
| 265 | static inline uint32 Docid32BitForApproxDups(const DocId32Bit& d) |
| 266 | { return d.docid(); } |
| 267 | static inline uint32 DocId32BitAsNumber(const DocId32Bit& d) |
| 268 | { return d.docid(); } |
| 269 | |
| 270 | static inline bool Is64BitDocId(const DocId& d) |
| 271 | { return d.docid() > DocId32BitAsNumber(kMaxDocId32Bit); } |
| 272 | |
| 273 | inline double operator/(double a, const DocId32Bit& b) |
| 274 | { return a / b.docid(); } |
| 275 | inline int operator/(const DocId32Bit& a, int b) { return a.docid() / b; } |
| 276 | inline double operator*(const DocId32Bit& a, double b) |
| 277 | { return a.docid() * b; } |
| 278 | inline uint64 operator*(const DocId32Bit& a, uint64 b) |
| 279 | { return a.docid() * b; } |
| 280 | inline uint32 operator+(uint32 a, const DocId32Bit& b) |
| 281 | { return a + b.docid(); } |
| 282 | |
| 283 | HASH_NAMESPACE_DECLARATION_START |
| 284 | template<> struct hash<DocId32Bit> { |
| 285 | size_t operator()(const DocId32Bit& d) const { return d.docid(); } |
| 286 | }; |
| 287 | HASH_NAMESPACE_DECLARATION_END |
| 288 | |
| 289 | #endif |
| 290 | |
| 291 | // some definitions for 64 bit docids |
| 292 | const DocId kIllegalDocId = DocId(0); |
| 293 | const DocId kInitialDocId = DocId(1); |
| 294 | inline DocId NextDocIdOnShard(DocId d, int shard, int num_shards) { |
| 295 | return DocId((((DocIdAsNumber(d) + num_shards - 1 - shard) / |
| 296 | num_shards) * num_shards) + shard); |
| 297 | } |
| 298 | |
| 299 | // some definitions for 32 bit docids |
| 300 | const DocId32Bit kIllegalDocId32Bit = DocId32Bit(0); |
| 301 | const DocId32Bit kInitialDocId32Bit = DocId32Bit(1); |
| 302 | |
| 303 | // Type for index-localized docids (normal docid divided by # of |
| 304 | // shards). This type is signed because 0 is a valid local docid (for |
| 305 | // shard > 0), and we need an illegal value. |
| 306 | typedef int32 LocalDocId; |
| 307 | const LocalDocId kMaxLocalDocId = static_cast<LocalDocId>(0x7FFFFFFF); |
| 308 | const LocalDocId kIllegalLocalDocId = static_cast<LocalDocId>(~0); |
| 309 | |
| 310 | // Conversion between local and global docids. |
| 311 | // REQUIRES: the arguments should be neither Illegal nor Max. |
| 312 | inline LocalDocId GlobalToLocalDocId(DocId d, int /*shard*/, int num_shards) { |
| 313 | assert(d > kIllegalDocId); |
| 314 | assert(d < kMaxDocId); |
| 315 | return static_cast<LocalDocId>(DocIdAsNumber(d) / num_shards); |
| 316 | } |
| 317 | inline DocId LocalToGlobalDocId(LocalDocId d, int shard, int num_shards) { |
| 318 | assert(d > kIllegalLocalDocId); |
| 319 | assert(d < kMaxLocalDocId); |
| 320 | return DocId((static_cast<uint32>(d)*num_shards) + shard); |
| 321 | } |
| 322 | |
| 323 | // Equivalent to GlobalToLocalDocId(NextDocIdOnShard(d, sh, ns), sh, ns) |
| 324 | inline LocalDocId NextLocalDocIdOnShard(DocId d, int shard, int num_shards) { |
| 325 | return GlobalToLocalDocId(NextDocIdOnShard(d, shard, num_shards), |
| 326 | shard, num_shards); |
| 327 | } |
| 328 | |
| 329 | inline DocId DocIdFromUrlfp(Fprint urlfp) { |
| 330 | return DocId(urlfp); |
| 331 | } |
| 332 | |
| 333 | // DocVersionId |
| 334 | |
| 335 | // For real time cache. A higher value means more recent. |
| 336 | typedef uint32 DocVersionIdVal; |
| 337 | const DocVersionIdVal kIllegalDocVersionId = static_cast<DocVersionIdVal>(0); |
| 338 | |
| 339 | #endif // BASE_DOCID_H_ |
| 340 | |