| 1 | // Copyright 2001 and onwards Google Inc. | 
|---|
| 2 |  | 
|---|
| 3 | #ifndef BASE_DOCID_H_ | 
|---|
| 4 | #define BASE_DOCID_H_ | 
|---|
| 5 |  | 
|---|
| 6 | #include <assert.h> | 
|---|
| 7 |  | 
|---|
| 8 | // sys/types.h is only needed by this file if !defined(NDEBUG), | 
|---|
| 9 | // but including it conditionally can cause hard-to-track-down | 
|---|
| 10 | // compilation failures that only happen when doing optimized builds. | 
|---|
| 11 | // Let's include it unconditionally to reduce the number of such breakages | 
|---|
| 12 | // we need to track down. | 
|---|
| 13 | // Note that uint is defined by sys/types.h only if _GNU_SOURCE or its ilk is | 
|---|
| 14 | // defined, and that mysql.h seems to define _GNU_SOURCE, so the problem | 
|---|
| 15 | // may only happen if sys/types.h is included after mysql.h. | 
|---|
| 16 | // We now define uint in base/port.h if needed, and in a way that won't clash. | 
|---|
| 17 | #include <sys/types.h>       // for size_t | 
|---|
| 18 |  | 
|---|
| 19 | #include "base/basictypes.h" | 
|---|
| 20 |  | 
|---|
// printf/scanf conversion specification to pair with DocidForPrintf() and
// DocidForScanf().
#define DOCID_FORMAT "%llu"
// printf/scanf conversion specification to pair with Docid32BitForPrintf()
// and Docid32BitForScanf().
#define DOCID32BIT_FORMAT "%u"
|---|
| 23 |  | 
|---|
| 24 | // The following help ensure type safety for docids.  Use them when | 
|---|
| 25 | // you want to printf/scanf a docid.  Note that even so, when the | 
|---|
| 26 | // type of docid changes you'll need to change all the format args | 
|---|
| 27 | // for these printf/scanf statements by hand (to be %qu, say). | 
|---|
| 28 | // | 
|---|
| 29 | // We do two kinds of docids to facilitate the transition from 32-bit | 
|---|
| 30 | // docids to 64-bit docids. We need both in the transition period | 
|---|
| 31 | // as the base index will use 32 bit docids and the incremental index | 
|---|
| 32 | // will use 64-bit docids. | 
|---|
| 33 |  | 
|---|
| 34 | #if defined NDEBUG | 
|---|
| 35 | typedef uint64 DocId; | 
|---|
| 36 | const DocId kMaxDocId = DocId(GG_ULONGLONG(0xFFFFFFFFFFFFFFFF)); | 
|---|
| 37 |  | 
|---|
| 38 | static inline uint64 DocidForPrintf(const DocId& d)   { return d; } | 
|---|
| 39 | static inline uint64* DocidForScanf(DocId* d)         { return d; } | 
|---|
| 40 | // Used with the SparseBitmap class, primarily | 
|---|
| 41 | static inline uint64 DocidForBitmap(const DocId& d)  { return d; } | 
|---|
| 42 | // Used with the docservercache stuff, for odd but reasonable reasons | 
|---|
| 43 | static inline uint64 DocidForFingerprinting(const DocId& d)  { return d; } | 
|---|
| 44 | // Used with the protocol-buffer stuff | 
|---|
| 45 | static inline uint64 DocidForProtocolBuffer(const DocId& d)  { return d; } | 
|---|
| 46 | // Used during anchor processing | 
|---|
| 47 | static inline uint64 DocidForAnchorPosition(const DocId& d) { return d; } | 
|---|
| 48 | // Used for checkpointing | 
|---|
| 49 | static inline uint64 DocidForCheckpointing(const DocId& d) { return d; } | 
|---|
| 50 | // Used for approximate dups detection | 
|---|
| 51 | static inline uint64 DocidForApproxDups(const DocId& d) { return d; } | 
|---|
| 52 | // Used for SWIG | 
|---|
| 53 | static inline uint64* DocidForSWIG(DocId* d)         { return d; } | 
|---|
| 54 |  | 
|---|
| 55 | // Used for miscellaneous arithmetic. | 
|---|
| 56 | static inline uint64 DocIdAsNumber(const DocId& d) { return d; } | 
|---|
| 57 |  | 
|---|
| 58 | // ------------------------------------------------------------------ | 
|---|
| 59 | // 32 bit docids | 
|---|
| 60 | // ------------------------------------------------------------------ | 
|---|
| 61 | typedef uint32 DocId32Bit; | 
|---|
| 62 | const DocId32Bit kMaxDocId32Bit = static_cast<DocId32Bit>(0xFFFFFFFFul); | 
|---|
| 63 | static inline uint32 Docid32BitForPrintf(const DocId32Bit& d) { return d; } | 
|---|
| 64 | static inline uint32* Docid32BitForScanf(DocId32Bit* d) { return d; } | 
|---|
| 65 | static inline uint32 Docid32BitForBitmap(const DocId32Bit& d)  { return d; } | 
|---|
| 66 | static inline uint32 Docid32BitForFingerprinting(const DocId32Bit& d) { | 
|---|
| 67 | return d; | 
|---|
| 68 | } | 
|---|
| 69 | static inline uint32 Docid32BitForEncryption(const DocId32Bit &d) { | 
|---|
| 70 | return d; | 
|---|
| 71 | } | 
|---|
| 72 | static inline uint32 Docid32BitForProtocolBuffer(const DocId32Bit& d) { | 
|---|
| 73 | return d; | 
|---|
| 74 | } | 
|---|
| 75 | static inline uint32 Docid32BitForAnchorPosition(const DocId32Bit& d) { | 
|---|
| 76 | return d; | 
|---|
| 77 | } | 
|---|
| 78 | static inline uint32 Docid32BitForCheckpointing(const DocId32Bit& d) { | 
|---|
| 79 | return d; | 
|---|
| 80 | } | 
|---|
| 81 | static inline uint32 Docid32BitForApproxDups(const DocId32Bit& d) { return d; } | 
|---|
| 82 |  | 
|---|
| 83 | static inline uint32 DocId32BitAsNumber(const DocId32Bit& d) { return d; } | 
|---|
| 84 |  | 
|---|
| 85 | // return true if the value cannot fit in DocId32Bit | 
|---|
| 86 | static inline bool Is64BitDocId(const DocId& d) { return d > kMaxDocId32Bit; } | 
|---|
| 87 |  | 
|---|
| 88 | #else | 
|---|
| 89 | // In debug mode, we make docid its own type so we can be sure of | 
|---|
| 90 | // type-safety.  In particular, we're concerned with people who treat | 
|---|
| 91 | // docids as if they were signed, or who assume they're 32 bits long. | 
|---|
| 92 | // We don't define this always because it results in 14% bigger executables. | 
|---|
| 93 | // | 
|---|
| 94 | // DocId is thread-compatible. | 
|---|
| 95 | #define BINPRED_(pred)   \ | 
|---|
| 96 | bool operator pred (const DocId& x) const { return docid_ pred x.docid_; } | 
|---|
| 97 | class DocId { | 
|---|
| 98 | protected: | 
|---|
| 99 | typedef uint64 value_type; | 
|---|
| 100 | value_type docid_; | 
|---|
| 101 |  | 
|---|
| 102 | public: | 
|---|
| 103 | DocId()                               { docid_ = 0; } | 
|---|
| 104 | explicit DocId(value_type docid)      { docid_ = docid; } | 
|---|
| 105 | value_type docid() const              { return docid_; }   // ONLY use here | 
|---|
| 106 | value_type* docidptr()                { return &docid_; }  // ie in this file | 
|---|
| 107 | DocId& operator=(const DocId& x) { | 
|---|
| 108 | docid_ = x.docid_; | 
|---|
| 109 | return *this; | 
|---|
| 110 | } | 
|---|
| 111 | // These two are required for delta encoding | 
|---|
| 112 | value_type operator+(const DocId& x) const { return docid_ + x.docid_; } | 
|---|
| 113 | value_type operator-(const DocId& x) const { return docid_ - x.docid_; } | 
|---|
| 114 |  | 
|---|
| 115 | // needed in incrementalpr.cc | 
|---|
| 116 | DocId operator+(uint64 x) const      { return DocId(docid_ + x); } | 
|---|
| 117 | DocId operator-(uint64 x) const      { return DocId(docid_ - x); } | 
|---|
| 118 | // Required for hashing, typically | 
|---|
| 119 | value_type operator%(value_type m) const   { return docid_ % m; } | 
|---|
| 120 | // Required for sortedrepptrs.cc and partialindexreader.cc | 
|---|
| 121 | DocId& operator+=(uint64 x) { | 
|---|
| 122 | docid_ += x; | 
|---|
| 123 | return *this; | 
|---|
| 124 | } | 
|---|
| 125 | // Required for moreoverreposwriter.cc & sortpageranks.cc | 
|---|
| 126 | DocId& operator++() { | 
|---|
| 127 | ++docid_; | 
|---|
| 128 | return *this; | 
|---|
| 129 | } | 
|---|
| 130 | DocId& operator--() { | 
|---|
| 131 | --docid_; | 
|---|
| 132 | return *this; | 
|---|
| 133 | } | 
|---|
| 134 | // Required by dup_docid_categorizer.cc and some other twiddlers, who needs | 
|---|
| 135 | // to convert 64-bit doc id into 32-bit category id. Now the only intended use | 
|---|
| 136 | // of this operator is to mask off the top bits of a docid. | 
|---|
| 137 | value_type operator&(value_type x) const { return docid_ & x; } | 
|---|
| 138 | // Comparators | 
|---|
| 139 | BINPRED_(==) BINPRED_(!=)  BINPRED_(<) BINPRED_(>)  BINPRED_(<=) BINPRED_(>=) | 
|---|
| 140 | }; | 
|---|
| 141 | DECLARE_POD(DocId); | 
|---|
| 142 | #undef BINPRED_ | 
|---|
| 143 |  | 
|---|
| 144 | // Required for anchorbucket.cc | 
|---|
| 145 | inline double operator/(double a, const DocId& b) { | 
|---|
| 146 | return a / static_cast<double>(b.docid()); } | 
|---|
| 147 | // Required for restrict-tool.cc | 
|---|
| 148 | inline uint64 operator/(const DocId& a, uint64 b) { return a.docid() / b; } | 
|---|
| 149 | // Required for index-request.cc | 
|---|
| 150 | inline double operator*(const DocId& a, double b) { | 
|---|
| 151 | return static_cast<double>(a.docid()) * b; } | 
|---|
| 152 | // Required for linksreader.cc | 
|---|
| 153 | inline uint64 operator*(const DocId& a, uint64 b) { return a.docid() * b; } | 
|---|
| 154 | // Required for indexdefs.h (really should be HitPosition, not uint64) | 
|---|
| 155 | inline uint64 operator+(uint64 a, const DocId& b) { return a + b.docid(); } | 
|---|
| 156 |  | 
|---|
| 157 | // Required for hashing docids.  docservercache.cc needs to fingerprint them. | 
|---|
| 158 | #if !defined __SGI_STL_HASH_FUN_H       // taken from stl_decl.h | 
|---|
| 159 | HASH_NAMESPACE_DECLARATION_START | 
|---|
| 160 | template <class Key> struct hash; | 
|---|
| 161 | HASH_NAMESPACE_DECLARATION_END | 
|---|
| 162 | #endif | 
|---|
| 163 | HASH_NAMESPACE_DECLARATION_START | 
|---|
| 164 | template<> struct hash<DocId> { | 
|---|
| 165 | size_t operator()(const DocId& d) const { | 
|---|
| 166 | return static_cast<size_t>(d.docid()); | 
|---|
| 167 | } | 
|---|
| 168 | }; | 
|---|
| 169 | HASH_NAMESPACE_DECLARATION_END | 
|---|
| 170 |  | 
|---|
| 171 | const DocId kMaxDocId = DocId(GG_ULONGLONG(0xFFFFFFFFFFFFFFFF)); | 
|---|
| 172 |  | 
|---|
| 173 | // These are defined in non-debug mode too; they're always-on type-safety | 
|---|
| 174 | static inline uint64 DocidForPrintf(const DocId& d)  { return d.docid(); } | 
|---|
| 175 | static inline uint64* DocidForScanf(DocId* d)        { return d->docidptr(); } | 
|---|
| 176 | static inline uint64 DocidForBitmap(const DocId& d)  { return d.docid(); } | 
|---|
| 177 | static inline uint64 DocidForFingerprinting(const DocId& d) | 
|---|
| 178 | { return d.docid(); } | 
|---|
| 179 | static inline uint64 DocidForProtocolBuffer(const DocId& d) | 
|---|
| 180 | { return d.docid(); } | 
|---|
| 181 | static inline uint64 DocidForAnchorPosition(const DocId& d) | 
|---|
| 182 | { return d.docid(); } | 
|---|
| 183 | static inline uint64 DocidForCheckpointing(const DocId& d) | 
|---|
| 184 | { return d.docid(); } | 
|---|
| 185 | static inline uint64 DocidForApproxDups(const DocId& d) { return d.docid(); } | 
|---|
| 186 | static inline uint64 DocIdAsNumber(const DocId& d) { return d.docid(); } | 
|---|
| 187 | static inline uint64* DocidForSWIG(DocId* d)        { return d->docidptr(); } | 
|---|
| 188 |  | 
|---|
| 189 | // --------------------------------------------------------------------- | 
|---|
| 190 | // 32 bit docids | 
|---|
| 191 | // | 
|---|
| 192 | // --------------------------------------------------------------------- | 
|---|
| 193 |  | 
|---|
| 194 | #define BINPRED32Bit_(pred)   \ | 
|---|
| 195 | bool operator pred(const DocId32Bit& x) const { \ | 
|---|
| 196 | return docid_ pred x.docid_; \ | 
|---|
| 197 | } | 
|---|
| 198 |  | 
|---|
| 199 | class DocId32Bit { | 
|---|
| 200 | protected: | 
|---|
| 201 | typedef uint32 value_type; | 
|---|
| 202 | value_type docid_; | 
|---|
| 203 |  | 
|---|
| 204 | public: | 
|---|
| 205 | DocId32Bit()                          { docid_ = 0; } | 
|---|
| 206 | explicit DocId32Bit(value_type docid) { docid_ = docid; } | 
|---|
| 207 | explicit DocId32Bit(DocId docid) { | 
|---|
| 208 | // we can't use kMaxDocId32Bit as it is not yet defined | 
|---|
| 209 | assert(DocIdAsNumber(docid) <= 0xFFFFFFFFul); | 
|---|
| 210 | docid_ = static_cast<uint32> (DocIdAsNumber(docid)); | 
|---|
| 211 | } | 
|---|
| 212 | value_type docid() const              { return docid_; }   // ONLY use here | 
|---|
| 213 | value_type* docidptr()                { return &docid_; }  // ie in this file | 
|---|
| 214 | DocId32Bit& operator=(const DocId32Bit& x) { | 
|---|
| 215 | docid_ = x.docid_; | 
|---|
| 216 | return *this; | 
|---|
| 217 | } | 
|---|
| 218 | // These two are required for delta encoding | 
|---|
| 219 | value_type operator+(const DocId32Bit& x) const { return docid_ + x.docid_; } | 
|---|
| 220 | value_type operator-(const DocId32Bit& x) const { return docid_ - x.docid_; } | 
|---|
| 221 |  | 
|---|
| 222 | // needed in incrementalpr.cc | 
|---|
| 223 | DocId32Bit operator+(uint32 x) const      { return DocId32Bit(docid_ + x); } | 
|---|
| 224 | DocId32Bit operator-(uint32 x) const      { return DocId32Bit(docid_ - x); } | 
|---|
| 225 | // Required for hashing, typically | 
|---|
| 226 | value_type operator%(value_type m) const   { return docid_ % m; } | 
|---|
| 227 | // Required for sortedrepptrs.cc and partialindexreader.cc | 
|---|
| 228 | DocId32Bit& operator+=(uint32 x) { | 
|---|
| 229 | docid_ += x; | 
|---|
| 230 | return *this; | 
|---|
| 231 | } | 
|---|
| 232 | // Required for moreoverreposwriter.cc & sortpageranks.cc | 
|---|
| 233 | DocId32Bit& operator++() { | 
|---|
| 234 | ++docid_; | 
|---|
| 235 | return *this; | 
|---|
| 236 | } | 
|---|
| 237 | DocId32Bit& operator--() { | 
|---|
| 238 | --docid_; | 
|---|
| 239 | return *this; | 
|---|
| 240 | } | 
|---|
| 241 | // Comparators | 
|---|
| 242 | BINPRED32Bit_(==) BINPRED32Bit_(!=)  BINPRED32Bit_(<) BINPRED32Bit_(>) | 
|---|
| 243 | BINPRED32Bit_(<=) BINPRED32Bit_(>=) | 
|---|
| 244 | }; | 
|---|
| 245 | DECLARE_POD(DocId32Bit); | 
|---|
| 246 | #undef BINPRED32Bit_ | 
|---|
| 247 |  | 
|---|
| 248 | const DocId32Bit kMaxDocId32Bit = DocId32Bit(0xFFFFFFFFul); | 
|---|
| 249 | static inline uint32 Docid32BitForBitmap(const DocId32Bit& d) | 
|---|
| 250 | { return d.docid(); } | 
|---|
| 251 | static inline uint32 Docid32BitForPrintf(const DocId32Bit& d) | 
|---|
| 252 | { return d.docid(); } | 
|---|
| 253 | static inline uint32* Docid32BitForScanf(DocId32Bit* d) | 
|---|
| 254 | { return d->docidptr(); } | 
|---|
| 255 | static inline uint32 Docid32BitForFingerprinting(const DocId32Bit& d) | 
|---|
| 256 | { return d.docid(); } | 
|---|
| 257 | static inline uint32 Docid32BitForEncryption(const DocId32Bit& d) | 
|---|
| 258 | { return d.docid(); } | 
|---|
| 259 | static inline int32 Docid32BitForProtocolBuffer(const DocId32Bit& d) | 
|---|
| 260 | { return d.docid(); } | 
|---|
| 261 | static inline uint32 Docid32BitForAnchorPosition(const DocId32Bit& d) | 
|---|
| 262 | { return d.docid(); } | 
|---|
| 263 | static inline uint32 Docid32BitForCheckpointing(const DocId32Bit& d) | 
|---|
| 264 | { return d.docid(); } | 
|---|
| 265 | static inline uint32 Docid32BitForApproxDups(const DocId32Bit& d) | 
|---|
| 266 | { return d.docid(); } | 
|---|
| 267 | static inline uint32 DocId32BitAsNumber(const DocId32Bit& d) | 
|---|
| 268 | { return d.docid(); } | 
|---|
| 269 |  | 
|---|
| 270 | static inline bool Is64BitDocId(const DocId& d) | 
|---|
| 271 | { return d.docid() > DocId32BitAsNumber(kMaxDocId32Bit); } | 
|---|
| 272 |  | 
|---|
| 273 | inline double operator/(double a, const DocId32Bit& b) | 
|---|
| 274 | { return a / b.docid(); } | 
|---|
| 275 | inline int operator/(const DocId32Bit& a, int b)  { return a.docid() / b; } | 
|---|
| 276 | inline double operator*(const DocId32Bit& a, double b) | 
|---|
| 277 | { return a.docid() * b; } | 
|---|
| 278 | inline uint64 operator*(const DocId32Bit& a, uint64 b) | 
|---|
| 279 | { return a.docid() * b; } | 
|---|
| 280 | inline uint32 operator+(uint32 a, const DocId32Bit& b) | 
|---|
| 281 | { return a + b.docid(); } | 
|---|
| 282 |  | 
|---|
| 283 | HASH_NAMESPACE_DECLARATION_START | 
|---|
| 284 | template<> struct hash<DocId32Bit> { | 
|---|
| 285 | size_t operator()(const DocId32Bit& d) const { return d.docid(); } | 
|---|
| 286 | }; | 
|---|
| 287 | HASH_NAMESPACE_DECLARATION_END | 
|---|
| 288 |  | 
|---|
| 289 | #endif | 
|---|
| 290 |  | 
|---|
| 291 | // some definitions for 64 bit docids | 
|---|
| 292 | const DocId kIllegalDocId = DocId(0); | 
|---|
| 293 | const DocId kInitialDocId = DocId(1); | 
|---|
| 294 | inline DocId NextDocIdOnShard(DocId d, int shard, int num_shards) { | 
|---|
| 295 | return DocId((((DocIdAsNumber(d) + num_shards - 1 - shard) / | 
|---|
| 296 | num_shards) * num_shards) + shard); | 
|---|
| 297 | } | 
|---|
| 298 |  | 
|---|
| 299 | // some definitions for 32 bit docids | 
|---|
| 300 | const DocId32Bit kIllegalDocId32Bit = DocId32Bit(0); | 
|---|
| 301 | const DocId32Bit kInitialDocId32Bit = DocId32Bit(1); | 
|---|
| 302 |  | 
|---|
| 303 | // Type for index-localized docids (normal docid divided by # of | 
|---|
| 304 | // shards).  This type is signed because 0 is a valid local docid (for | 
|---|
| 305 | // shard > 0), and we need an illegal value. | 
|---|
| 306 | typedef int32 LocalDocId; | 
|---|
| 307 | const LocalDocId kMaxLocalDocId = static_cast<LocalDocId>(0x7FFFFFFF); | 
|---|
| 308 | const LocalDocId kIllegalLocalDocId = static_cast<LocalDocId>(~0); | 
|---|
| 309 |  | 
|---|
| 310 | // Conversion between local and global docids. | 
|---|
| 311 | // REQUIRES: the arguments should be neither Illegal nor Max. | 
|---|
| 312 | inline LocalDocId GlobalToLocalDocId(DocId d, int /*shard*/, int num_shards) { | 
|---|
| 313 | assert(d > kIllegalDocId); | 
|---|
| 314 | assert(d < kMaxDocId); | 
|---|
| 315 | return static_cast<LocalDocId>(DocIdAsNumber(d) / num_shards); | 
|---|
| 316 | } | 
|---|
| 317 | inline DocId LocalToGlobalDocId(LocalDocId d, int shard, int num_shards) { | 
|---|
| 318 | assert(d > kIllegalLocalDocId); | 
|---|
| 319 | assert(d < kMaxLocalDocId); | 
|---|
| 320 | return DocId((static_cast<uint32>(d)*num_shards) + shard); | 
|---|
| 321 | } | 
|---|
| 322 |  | 
|---|
| 323 | // Equivalent to GlobalToLocalDocId(NextDocIdOnShard(d, sh, ns), sh, ns) | 
|---|
| 324 | inline LocalDocId NextLocalDocIdOnShard(DocId d, int shard, int num_shards) { | 
|---|
| 325 | return GlobalToLocalDocId(NextDocIdOnShard(d, shard, num_shards), | 
|---|
| 326 | shard, num_shards); | 
|---|
| 327 | } | 
|---|
| 328 |  | 
|---|
| 329 | inline DocId DocIdFromUrlfp(Fprint urlfp) { | 
|---|
| 330 | return DocId(urlfp); | 
|---|
| 331 | } | 
|---|
| 332 |  | 
|---|
| 333 | // DocVersionId | 
|---|
| 334 |  | 
|---|
| 335 | // For real time cache.  A higher value means more recent. | 
|---|
| 336 | typedef uint32 DocVersionIdVal; | 
|---|
| 337 | const DocVersionIdVal kIllegalDocVersionId = static_cast<DocVersionIdVal>(0); | 
|---|
| 338 |  | 
|---|
| 339 | #endif  // BASE_DOCID_H_ | 
|---|
| 340 |  | 
|---|