1 | // Copyright 2001 and onwards Google Inc. |
2 | |
3 | #ifndef BASE_DOCID_H_ |
4 | #define BASE_DOCID_H_ |
5 | |
6 | #include <assert.h> |
7 | |
8 | // sys/types.h is only needed by this file if !defined(NDEBUG), |
9 | // but including it conditionally can cause hard-to-track-down |
10 | // compilation failures that only happen when doing optimized builds. |
11 | // Let's include it unconditionally to reduce the number of such breakages |
12 | // we need to track down. |
13 | // Note that uint is defined by sys/types.h only if _GNU_SOURCE or its ilk is |
14 | // defined, and that mysql.h seems to define _GNU_SOURCE, so the problem |
15 | // may only happen if sys/types.h is included after mysql.h. |
16 | // We now define uint in base/port.h if needed, and in a way that won't clash. |
17 | #include <sys/types.h> // for size_t |
18 | |
19 | #include "base/basictypes.h" |
20 | |
// printf-style conversion specifications for docid values.
// Presumably meant to be paired with DocidForPrintf() (which returns uint64)
// and Docid32BitForPrintf() (which returns uint32) respectively.
// NOTE(review): "%llu" expects an unsigned long long argument; this assumes
// uint64 (declared outside this file) is that type on every platform --
// confirm.
#define DOCID_FORMAT "%llu"
#define DOCID32BIT_FORMAT "%u"
23 | |
24 | // The following help ensure type safety for docids. Use them when |
25 | // you want to printf/scanf a docid. Note that even so, when the |
26 | // type of docid changes you'll need to change all the format args |
27 | // for these printf/scanf statements by hand (to be %qu, say). |
28 | // |
29 | // We do two kinds of docids to facilitate the transition from 32-bit |
30 | // docids to 64-bit docids. We need both in the transition period |
31 | // as the base index will use 32 bit docids and the incremental index |
32 | // will use 64-bit docids. |
33 | |
34 | #if defined NDEBUG |
35 | typedef uint64 DocId; |
36 | const DocId kMaxDocId = DocId(GG_ULONGLONG(0xFFFFFFFFFFFFFFFF)); |
37 | |
38 | static inline uint64 DocidForPrintf(const DocId& d) { return d; } |
39 | static inline uint64* DocidForScanf(DocId* d) { return d; } |
40 | // Used with the SparseBitmap class, primarily |
41 | static inline uint64 DocidForBitmap(const DocId& d) { return d; } |
42 | // Used with the docservercache stuff, for odd but reasonable reasons |
43 | static inline uint64 DocidForFingerprinting(const DocId& d) { return d; } |
44 | // Used with the protocol-buffer stuff |
45 | static inline uint64 DocidForProtocolBuffer(const DocId& d) { return d; } |
46 | // Used during anchor processing |
47 | static inline uint64 DocidForAnchorPosition(const DocId& d) { return d; } |
48 | // Used for checkpointing |
49 | static inline uint64 DocidForCheckpointing(const DocId& d) { return d; } |
50 | // Used for approximate dups detection |
51 | static inline uint64 DocidForApproxDups(const DocId& d) { return d; } |
52 | // Used for SWIG |
53 | static inline uint64* DocidForSWIG(DocId* d) { return d; } |
54 | |
55 | // Used for miscellaneous arithmetic. |
56 | static inline uint64 DocIdAsNumber(const DocId& d) { return d; } |
57 | |
58 | // ------------------------------------------------------------------ |
59 | // 32 bit docids |
60 | // ------------------------------------------------------------------ |
61 | typedef uint32 DocId32Bit; |
62 | const DocId32Bit kMaxDocId32Bit = static_cast<DocId32Bit>(0xFFFFFFFFul); |
63 | static inline uint32 Docid32BitForPrintf(const DocId32Bit& d) { return d; } |
64 | static inline uint32* Docid32BitForScanf(DocId32Bit* d) { return d; } |
65 | static inline uint32 Docid32BitForBitmap(const DocId32Bit& d) { return d; } |
66 | static inline uint32 Docid32BitForFingerprinting(const DocId32Bit& d) { |
67 | return d; |
68 | } |
69 | static inline uint32 Docid32BitForEncryption(const DocId32Bit &d) { |
70 | return d; |
71 | } |
72 | static inline uint32 Docid32BitForProtocolBuffer(const DocId32Bit& d) { |
73 | return d; |
74 | } |
75 | static inline uint32 Docid32BitForAnchorPosition(const DocId32Bit& d) { |
76 | return d; |
77 | } |
78 | static inline uint32 Docid32BitForCheckpointing(const DocId32Bit& d) { |
79 | return d; |
80 | } |
81 | static inline uint32 Docid32BitForApproxDups(const DocId32Bit& d) { return d; } |
82 | |
83 | static inline uint32 DocId32BitAsNumber(const DocId32Bit& d) { return d; } |
84 | |
85 | // return true if the value cannot fit in DocId32Bit |
86 | static inline bool Is64BitDocId(const DocId& d) { return d > kMaxDocId32Bit; } |
87 | |
88 | #else |
// In debug mode, we make docid its own type so we can be sure of
// type-safety.  In particular, we're concerned with people who treat
// docids as if they were signed, or who assume they're 32 bits long.
// We don't do this in every build because it results in 14% bigger
// executables.
//
// DocId is thread-compatible.
95 | #define BINPRED_(pred) \ |
96 | bool operator pred (const DocId& x) const { return docid_ pred x.docid_; } |
97 | class DocId { |
98 | protected: |
99 | typedef uint64 value_type; |
100 | value_type docid_; |
101 | |
102 | public: |
103 | DocId() { docid_ = 0; } |
104 | explicit DocId(value_type docid) { docid_ = docid; } |
105 | value_type docid() const { return docid_; } // ONLY use here |
106 | value_type* docidptr() { return &docid_; } // ie in this file |
107 | DocId& operator=(const DocId& x) { |
108 | docid_ = x.docid_; |
109 | return *this; |
110 | } |
111 | // These two are required for delta encoding |
112 | value_type operator+(const DocId& x) const { return docid_ + x.docid_; } |
113 | value_type operator-(const DocId& x) const { return docid_ - x.docid_; } |
114 | |
115 | // needed in incrementalpr.cc |
116 | DocId operator+(uint64 x) const { return DocId(docid_ + x); } |
117 | DocId operator-(uint64 x) const { return DocId(docid_ - x); } |
118 | // Required for hashing, typically |
119 | value_type operator%(value_type m) const { return docid_ % m; } |
120 | // Required for sortedrepptrs.cc and partialindexreader.cc |
121 | DocId& operator+=(uint64 x) { |
122 | docid_ += x; |
123 | return *this; |
124 | } |
125 | // Required for moreoverreposwriter.cc & sortpageranks.cc |
126 | DocId& operator++() { |
127 | ++docid_; |
128 | return *this; |
129 | } |
130 | DocId& operator--() { |
131 | --docid_; |
132 | return *this; |
133 | } |
134 | // Required by dup_docid_categorizer.cc and some other twiddlers, who needs |
135 | // to convert 64-bit doc id into 32-bit category id. Now the only intended use |
136 | // of this operator is to mask off the top bits of a docid. |
137 | value_type operator&(value_type x) const { return docid_ & x; } |
138 | // Comparators |
139 | BINPRED_(==) BINPRED_(!=) BINPRED_(<) BINPRED_(>) BINPRED_(<=) BINPRED_(>=) |
140 | }; |
141 | DECLARE_POD(DocId); |
142 | #undef BINPRED_ |
143 | |
144 | // Required for anchorbucket.cc |
145 | inline double operator/(double a, const DocId& b) { |
146 | return a / static_cast<double>(b.docid()); } |
147 | // Required for restrict-tool.cc |
148 | inline uint64 operator/(const DocId& a, uint64 b) { return a.docid() / b; } |
149 | // Required for index-request.cc |
150 | inline double operator*(const DocId& a, double b) { |
151 | return static_cast<double>(a.docid()) * b; } |
152 | // Required for linksreader.cc |
153 | inline uint64 operator*(const DocId& a, uint64 b) { return a.docid() * b; } |
154 | // Required for indexdefs.h (really should be HitPosition, not uint64) |
155 | inline uint64 operator+(uint64 a, const DocId& b) { return a + b.docid(); } |
156 | |
157 | // Required for hashing docids. docservercache.cc needs to fingerprint them. |
158 | #if !defined __SGI_STL_HASH_FUN_H // taken from stl_decl.h |
159 | HASH_NAMESPACE_DECLARATION_START |
160 | template <class Key> struct hash; |
161 | HASH_NAMESPACE_DECLARATION_END |
162 | #endif |
163 | HASH_NAMESPACE_DECLARATION_START |
164 | template<> struct hash<DocId> { |
165 | size_t operator()(const DocId& d) const { |
166 | return static_cast<size_t>(d.docid()); |
167 | } |
168 | }; |
169 | HASH_NAMESPACE_DECLARATION_END |
170 | |
171 | const DocId kMaxDocId = DocId(GG_ULONGLONG(0xFFFFFFFFFFFFFFFF)); |
172 | |
173 | // These are defined in non-debug mode too; they're always-on type-safety |
174 | static inline uint64 DocidForPrintf(const DocId& d) { return d.docid(); } |
175 | static inline uint64* DocidForScanf(DocId* d) { return d->docidptr(); } |
176 | static inline uint64 DocidForBitmap(const DocId& d) { return d.docid(); } |
177 | static inline uint64 DocidForFingerprinting(const DocId& d) |
178 | { return d.docid(); } |
179 | static inline uint64 DocidForProtocolBuffer(const DocId& d) |
180 | { return d.docid(); } |
181 | static inline uint64 DocidForAnchorPosition(const DocId& d) |
182 | { return d.docid(); } |
183 | static inline uint64 DocidForCheckpointing(const DocId& d) |
184 | { return d.docid(); } |
185 | static inline uint64 DocidForApproxDups(const DocId& d) { return d.docid(); } |
186 | static inline uint64 DocIdAsNumber(const DocId& d) { return d.docid(); } |
187 | static inline uint64* DocidForSWIG(DocId* d) { return d->docidptr(); } |
188 | |
189 | // --------------------------------------------------------------------- |
190 | // 32 bit docids |
191 | // |
192 | // --------------------------------------------------------------------- |
193 | |
194 | #define BINPRED32Bit_(pred) \ |
195 | bool operator pred(const DocId32Bit& x) const { \ |
196 | return docid_ pred x.docid_; \ |
197 | } |
198 | |
199 | class DocId32Bit { |
200 | protected: |
201 | typedef uint32 value_type; |
202 | value_type docid_; |
203 | |
204 | public: |
205 | DocId32Bit() { docid_ = 0; } |
206 | explicit DocId32Bit(value_type docid) { docid_ = docid; } |
207 | explicit DocId32Bit(DocId docid) { |
208 | // we can't use kMaxDocId32Bit as it is not yet defined |
209 | assert(DocIdAsNumber(docid) <= 0xFFFFFFFFul); |
210 | docid_ = static_cast<uint32> (DocIdAsNumber(docid)); |
211 | } |
212 | value_type docid() const { return docid_; } // ONLY use here |
213 | value_type* docidptr() { return &docid_; } // ie in this file |
214 | DocId32Bit& operator=(const DocId32Bit& x) { |
215 | docid_ = x.docid_; |
216 | return *this; |
217 | } |
218 | // These two are required for delta encoding |
219 | value_type operator+(const DocId32Bit& x) const { return docid_ + x.docid_; } |
220 | value_type operator-(const DocId32Bit& x) const { return docid_ - x.docid_; } |
221 | |
222 | // needed in incrementalpr.cc |
223 | DocId32Bit operator+(uint32 x) const { return DocId32Bit(docid_ + x); } |
224 | DocId32Bit operator-(uint32 x) const { return DocId32Bit(docid_ - x); } |
225 | // Required for hashing, typically |
226 | value_type operator%(value_type m) const { return docid_ % m; } |
227 | // Required for sortedrepptrs.cc and partialindexreader.cc |
228 | DocId32Bit& operator+=(uint32 x) { |
229 | docid_ += x; |
230 | return *this; |
231 | } |
232 | // Required for moreoverreposwriter.cc & sortpageranks.cc |
233 | DocId32Bit& operator++() { |
234 | ++docid_; |
235 | return *this; |
236 | } |
237 | DocId32Bit& operator--() { |
238 | --docid_; |
239 | return *this; |
240 | } |
241 | // Comparators |
242 | BINPRED32Bit_(==) BINPRED32Bit_(!=) BINPRED32Bit_(<) BINPRED32Bit_(>) |
243 | BINPRED32Bit_(<=) BINPRED32Bit_(>=) |
244 | }; |
245 | DECLARE_POD(DocId32Bit); |
246 | #undef BINPRED32Bit_ |
247 | |
248 | const DocId32Bit kMaxDocId32Bit = DocId32Bit(0xFFFFFFFFul); |
249 | static inline uint32 Docid32BitForBitmap(const DocId32Bit& d) |
250 | { return d.docid(); } |
251 | static inline uint32 Docid32BitForPrintf(const DocId32Bit& d) |
252 | { return d.docid(); } |
253 | static inline uint32* Docid32BitForScanf(DocId32Bit* d) |
254 | { return d->docidptr(); } |
255 | static inline uint32 Docid32BitForFingerprinting(const DocId32Bit& d) |
256 | { return d.docid(); } |
257 | static inline uint32 Docid32BitForEncryption(const DocId32Bit& d) |
258 | { return d.docid(); } |
259 | static inline int32 Docid32BitForProtocolBuffer(const DocId32Bit& d) |
260 | { return d.docid(); } |
261 | static inline uint32 Docid32BitForAnchorPosition(const DocId32Bit& d) |
262 | { return d.docid(); } |
263 | static inline uint32 Docid32BitForCheckpointing(const DocId32Bit& d) |
264 | { return d.docid(); } |
265 | static inline uint32 Docid32BitForApproxDups(const DocId32Bit& d) |
266 | { return d.docid(); } |
267 | static inline uint32 DocId32BitAsNumber(const DocId32Bit& d) |
268 | { return d.docid(); } |
269 | |
270 | static inline bool Is64BitDocId(const DocId& d) |
271 | { return d.docid() > DocId32BitAsNumber(kMaxDocId32Bit); } |
272 | |
273 | inline double operator/(double a, const DocId32Bit& b) |
274 | { return a / b.docid(); } |
275 | inline int operator/(const DocId32Bit& a, int b) { return a.docid() / b; } |
276 | inline double operator*(const DocId32Bit& a, double b) |
277 | { return a.docid() * b; } |
278 | inline uint64 operator*(const DocId32Bit& a, uint64 b) |
279 | { return a.docid() * b; } |
280 | inline uint32 operator+(uint32 a, const DocId32Bit& b) |
281 | { return a + b.docid(); } |
282 | |
283 | HASH_NAMESPACE_DECLARATION_START |
284 | template<> struct hash<DocId32Bit> { |
285 | size_t operator()(const DocId32Bit& d) const { return d.docid(); } |
286 | }; |
287 | HASH_NAMESPACE_DECLARATION_END |
288 | |
289 | #endif |
290 | |
291 | // some definitions for 64 bit docids |
292 | const DocId kIllegalDocId = DocId(0); |
293 | const DocId kInitialDocId = DocId(1); |
294 | inline DocId NextDocIdOnShard(DocId d, int shard, int num_shards) { |
295 | return DocId((((DocIdAsNumber(d) + num_shards - 1 - shard) / |
296 | num_shards) * num_shards) + shard); |
297 | } |
298 | |
299 | // some definitions for 32 bit docids |
300 | const DocId32Bit kIllegalDocId32Bit = DocId32Bit(0); |
301 | const DocId32Bit kInitialDocId32Bit = DocId32Bit(1); |
302 | |
303 | // Type for index-localized docids (normal docid divided by # of |
304 | // shards). This type is signed because 0 is a valid local docid (for |
305 | // shard > 0), and we need an illegal value. |
306 | typedef int32 LocalDocId; |
307 | const LocalDocId kMaxLocalDocId = static_cast<LocalDocId>(0x7FFFFFFF); |
308 | const LocalDocId kIllegalLocalDocId = static_cast<LocalDocId>(~0); |
309 | |
310 | // Conversion between local and global docids. |
311 | // REQUIRES: the arguments should be neither Illegal nor Max. |
312 | inline LocalDocId GlobalToLocalDocId(DocId d, int /*shard*/, int num_shards) { |
313 | assert(d > kIllegalDocId); |
314 | assert(d < kMaxDocId); |
315 | return static_cast<LocalDocId>(DocIdAsNumber(d) / num_shards); |
316 | } |
317 | inline DocId LocalToGlobalDocId(LocalDocId d, int shard, int num_shards) { |
318 | assert(d > kIllegalLocalDocId); |
319 | assert(d < kMaxLocalDocId); |
320 | return DocId((static_cast<uint32>(d)*num_shards) + shard); |
321 | } |
322 | |
323 | // Equivalent to GlobalToLocalDocId(NextDocIdOnShard(d, sh, ns), sh, ns) |
324 | inline LocalDocId NextLocalDocIdOnShard(DocId d, int shard, int num_shards) { |
325 | return GlobalToLocalDocId(NextDocIdOnShard(d, shard, num_shards), |
326 | shard, num_shards); |
327 | } |
328 | |
329 | inline DocId DocIdFromUrlfp(Fprint urlfp) { |
330 | return DocId(urlfp); |
331 | } |
332 | |
333 | // DocVersionId |
334 | |
335 | // For real time cache. A higher value means more recent. |
336 | typedef uint32 DocVersionIdVal; |
337 | const DocVersionIdVal kIllegalDocVersionId = static_cast<DocVersionIdVal>(0); |
338 | |
339 | #endif // BASE_DOCID_H_ |
340 | |