1// Copyright 2001 and onwards Google Inc.
2
3#ifndef BASE_DOCID_H_
4#define BASE_DOCID_H_
5
6#include <assert.h>
7
8// sys/types.h is only needed by this file if !defined(NDEBUG),
9// but including it conditionally can cause hard-to-track-down
10// compilation failures that only happen when doing optimized builds.
11// Let's include it unconditionally to reduce the number of such breakages
12// we need to track down.
13// Note that uint is defined by sys/types.h only if _GNU_SOURCE or its ilk is
14// defined, and that mysql.h seems to define _GNU_SOURCE, so the problem
15// may only happen if sys/types.h is included after mysql.h.
16// We now define uint in base/port.h if needed, and in a way that won't clash.
17#include <sys/types.h> // for size_t
18
19#include "base/basictypes.h"
20
// printf/scanf conversion specifier for a 64-bit docid; pair it with
// DocidForPrintf() / DocidForScanf().
#define DOCID_FORMAT "%llu"
// printf/scanf conversion specifier for a 32-bit docid; pair it with
// Docid32BitForPrintf() / Docid32BitForScanf().
#define DOCID32BIT_FORMAT "%u"
23
24// The following help ensure type safety for docids. Use them when
25// you want to printf/scanf a docid. Note that even so, when the
26// type of docid changes you'll need to change all the format args
27// for these printf/scanf statements by hand (to be %qu, say).
28//
29// We do two kinds of docids to facilitate the transition from 32-bit
30// docids to 64-bit docids. We need both in the transition period
31// as the base index will use 32 bit docids and the incremental index
32// will use 64-bit docids.
33
34#if defined NDEBUG
35typedef uint64 DocId;
36const DocId kMaxDocId = DocId(GG_ULONGLONG(0xFFFFFFFFFFFFFFFF));
37
38static inline uint64 DocidForPrintf(const DocId& d) { return d; }
39static inline uint64* DocidForScanf(DocId* d) { return d; }
40// Used with the SparseBitmap class, primarily
41static inline uint64 DocidForBitmap(const DocId& d) { return d; }
42// Used with the docservercache stuff, for odd but reasonable reasons
43static inline uint64 DocidForFingerprinting(const DocId& d) { return d; }
44// Used with the protocol-buffer stuff
45static inline uint64 DocidForProtocolBuffer(const DocId& d) { return d; }
46// Used during anchor processing
47static inline uint64 DocidForAnchorPosition(const DocId& d) { return d; }
48// Used for checkpointing
49static inline uint64 DocidForCheckpointing(const DocId& d) { return d; }
50// Used for approximate dups detection
51static inline uint64 DocidForApproxDups(const DocId& d) { return d; }
52// Used for SWIG
53static inline uint64* DocidForSWIG(DocId* d) { return d; }
54
55// Used for miscellaneous arithmetic.
56static inline uint64 DocIdAsNumber(const DocId& d) { return d; }
57
58// ------------------------------------------------------------------
59// 32 bit docids
60// ------------------------------------------------------------------
61typedef uint32 DocId32Bit;
62const DocId32Bit kMaxDocId32Bit = static_cast<DocId32Bit>(0xFFFFFFFFul);
63static inline uint32 Docid32BitForPrintf(const DocId32Bit& d) { return d; }
64static inline uint32* Docid32BitForScanf(DocId32Bit* d) { return d; }
65static inline uint32 Docid32BitForBitmap(const DocId32Bit& d) { return d; }
66static inline uint32 Docid32BitForFingerprinting(const DocId32Bit& d) {
67 return d;
68}
69static inline uint32 Docid32BitForEncryption(const DocId32Bit &d) {
70 return d;
71}
72static inline uint32 Docid32BitForProtocolBuffer(const DocId32Bit& d) {
73 return d;
74}
75static inline uint32 Docid32BitForAnchorPosition(const DocId32Bit& d) {
76 return d;
77}
78static inline uint32 Docid32BitForCheckpointing(const DocId32Bit& d) {
79 return d;
80}
81static inline uint32 Docid32BitForApproxDups(const DocId32Bit& d) { return d; }
82
83static inline uint32 DocId32BitAsNumber(const DocId32Bit& d) { return d; }
84
85// return true if the value cannot fit in DocId32Bit
86static inline bool Is64BitDocId(const DocId& d) { return d > kMaxDocId32Bit; }
87
88#else
89// In debug mode, we make docid its own type so we can be sure of
90// type-safety. In particular, we're concerned with people who treat
91// docids as if they were signed, or who assume they're 32 bits long.
92// We don't define this always because it results in 14% bigger executables.
93//
94// DocId is thread-compatible.
95#define BINPRED_(pred) \
96 bool operator pred (const DocId& x) const { return docid_ pred x.docid_; }
97class DocId {
98 protected:
99 typedef uint64 value_type;
100 value_type docid_;
101
102 public:
103 DocId() { docid_ = 0; }
104 explicit DocId(value_type docid) { docid_ = docid; }
105 value_type docid() const { return docid_; } // ONLY use here
106 value_type* docidptr() { return &docid_; } // ie in this file
107 DocId& operator=(const DocId& x) {
108 docid_ = x.docid_;
109 return *this;
110 }
111 // These two are required for delta encoding
112 value_type operator+(const DocId& x) const { return docid_ + x.docid_; }
113 value_type operator-(const DocId& x) const { return docid_ - x.docid_; }
114
115 // needed in incrementalpr.cc
116 DocId operator+(uint64 x) const { return DocId(docid_ + x); }
117 DocId operator-(uint64 x) const { return DocId(docid_ - x); }
118 // Required for hashing, typically
119 value_type operator%(value_type m) const { return docid_ % m; }
120 // Required for sortedrepptrs.cc and partialindexreader.cc
121 DocId& operator+=(uint64 x) {
122 docid_ += x;
123 return *this;
124 }
125 // Required for moreoverreposwriter.cc & sortpageranks.cc
126 DocId& operator++() {
127 ++docid_;
128 return *this;
129 }
130 DocId& operator--() {
131 --docid_;
132 return *this;
133 }
134 // Required by dup_docid_categorizer.cc and some other twiddlers, who needs
135 // to convert 64-bit doc id into 32-bit category id. Now the only intended use
136 // of this operator is to mask off the top bits of a docid.
137 value_type operator&(value_type x) const { return docid_ & x; }
138 // Comparators
139 BINPRED_(==) BINPRED_(!=) BINPRED_(<) BINPRED_(>) BINPRED_(<=) BINPRED_(>=)
140};
141DECLARE_POD(DocId);
142#undef BINPRED_
143
144// Required for anchorbucket.cc
145inline double operator/(double a, const DocId& b) {
146 return a / static_cast<double>(b.docid()); }
147// Required for restrict-tool.cc
148inline uint64 operator/(const DocId& a, uint64 b) { return a.docid() / b; }
149// Required for index-request.cc
150inline double operator*(const DocId& a, double b) {
151 return static_cast<double>(a.docid()) * b; }
152// Required for linksreader.cc
153inline uint64 operator*(const DocId& a, uint64 b) { return a.docid() * b; }
154// Required for indexdefs.h (really should be HitPosition, not uint64)
155inline uint64 operator+(uint64 a, const DocId& b) { return a + b.docid(); }
156
157// Required for hashing docids. docservercache.cc needs to fingerprint them.
158#if !defined __SGI_STL_HASH_FUN_H // taken from stl_decl.h
159HASH_NAMESPACE_DECLARATION_START
160template <class Key> struct hash;
161HASH_NAMESPACE_DECLARATION_END
162#endif
163HASH_NAMESPACE_DECLARATION_START
164template<> struct hash<DocId> {
165 size_t operator()(const DocId& d) const {
166 return static_cast<size_t>(d.docid());
167 }
168};
169HASH_NAMESPACE_DECLARATION_END
170
171const DocId kMaxDocId = DocId(GG_ULONGLONG(0xFFFFFFFFFFFFFFFF));
172
173// These are defined in non-debug mode too; they're always-on type-safety
174static inline uint64 DocidForPrintf(const DocId& d) { return d.docid(); }
175static inline uint64* DocidForScanf(DocId* d) { return d->docidptr(); }
176static inline uint64 DocidForBitmap(const DocId& d) { return d.docid(); }
177static inline uint64 DocidForFingerprinting(const DocId& d)
178 { return d.docid(); }
179static inline uint64 DocidForProtocolBuffer(const DocId& d)
180 { return d.docid(); }
181static inline uint64 DocidForAnchorPosition(const DocId& d)
182 { return d.docid(); }
183static inline uint64 DocidForCheckpointing(const DocId& d)
184 { return d.docid(); }
185static inline uint64 DocidForApproxDups(const DocId& d) { return d.docid(); }
186static inline uint64 DocIdAsNumber(const DocId& d) { return d.docid(); }
187static inline uint64* DocidForSWIG(DocId* d) { return d->docidptr(); }
188
189// ---------------------------------------------------------------------
190// 32 bit docids
191//
192// ---------------------------------------------------------------------
193
194#define BINPRED32Bit_(pred) \
195 bool operator pred(const DocId32Bit& x) const { \
196 return docid_ pred x.docid_; \
197 }
198
199class DocId32Bit {
200 protected:
201 typedef uint32 value_type;
202 value_type docid_;
203
204 public:
205 DocId32Bit() { docid_ = 0; }
206 explicit DocId32Bit(value_type docid) { docid_ = docid; }
207 explicit DocId32Bit(DocId docid) {
208 // we can't use kMaxDocId32Bit as it is not yet defined
209 assert(DocIdAsNumber(docid) <= 0xFFFFFFFFul);
210 docid_ = static_cast<uint32> (DocIdAsNumber(docid));
211 }
212 value_type docid() const { return docid_; } // ONLY use here
213 value_type* docidptr() { return &docid_; } // ie in this file
214 DocId32Bit& operator=(const DocId32Bit& x) {
215 docid_ = x.docid_;
216 return *this;
217 }
218 // These two are required for delta encoding
219 value_type operator+(const DocId32Bit& x) const { return docid_ + x.docid_; }
220 value_type operator-(const DocId32Bit& x) const { return docid_ - x.docid_; }
221
222 // needed in incrementalpr.cc
223 DocId32Bit operator+(uint32 x) const { return DocId32Bit(docid_ + x); }
224 DocId32Bit operator-(uint32 x) const { return DocId32Bit(docid_ - x); }
225 // Required for hashing, typically
226 value_type operator%(value_type m) const { return docid_ % m; }
227 // Required for sortedrepptrs.cc and partialindexreader.cc
228 DocId32Bit& operator+=(uint32 x) {
229 docid_ += x;
230 return *this;
231 }
232 // Required for moreoverreposwriter.cc & sortpageranks.cc
233 DocId32Bit& operator++() {
234 ++docid_;
235 return *this;
236 }
237 DocId32Bit& operator--() {
238 --docid_;
239 return *this;
240 }
241 // Comparators
242 BINPRED32Bit_(==) BINPRED32Bit_(!=) BINPRED32Bit_(<) BINPRED32Bit_(>)
243 BINPRED32Bit_(<=) BINPRED32Bit_(>=)
244};
245DECLARE_POD(DocId32Bit);
246#undef BINPRED32Bit_
247
248const DocId32Bit kMaxDocId32Bit = DocId32Bit(0xFFFFFFFFul);
249static inline uint32 Docid32BitForBitmap(const DocId32Bit& d)
250 { return d.docid(); }
251static inline uint32 Docid32BitForPrintf(const DocId32Bit& d)
252 { return d.docid(); }
253static inline uint32* Docid32BitForScanf(DocId32Bit* d)
254 { return d->docidptr(); }
255static inline uint32 Docid32BitForFingerprinting(const DocId32Bit& d)
256 { return d.docid(); }
257static inline uint32 Docid32BitForEncryption(const DocId32Bit& d)
258 { return d.docid(); }
259static inline int32 Docid32BitForProtocolBuffer(const DocId32Bit& d)
260 { return d.docid(); }
261static inline uint32 Docid32BitForAnchorPosition(const DocId32Bit& d)
262 { return d.docid(); }
263static inline uint32 Docid32BitForCheckpointing(const DocId32Bit& d)
264 { return d.docid(); }
265static inline uint32 Docid32BitForApproxDups(const DocId32Bit& d)
266 { return d.docid(); }
267static inline uint32 DocId32BitAsNumber(const DocId32Bit& d)
268 { return d.docid(); }
269
270static inline bool Is64BitDocId(const DocId& d)
271 { return d.docid() > DocId32BitAsNumber(kMaxDocId32Bit); }
272
273inline double operator/(double a, const DocId32Bit& b)
274 { return a / b.docid(); }
275inline int operator/(const DocId32Bit& a, int b) { return a.docid() / b; }
276inline double operator*(const DocId32Bit& a, double b)
277 { return a.docid() * b; }
278inline uint64 operator*(const DocId32Bit& a, uint64 b)
279 { return a.docid() * b; }
280inline uint32 operator+(uint32 a, const DocId32Bit& b)
281 { return a + b.docid(); }
282
283HASH_NAMESPACE_DECLARATION_START
284template<> struct hash<DocId32Bit> {
285 size_t operator()(const DocId32Bit& d) const { return d.docid(); }
286};
287HASH_NAMESPACE_DECLARATION_END
288
289#endif
290
291// some definitions for 64 bit docids
292const DocId kIllegalDocId = DocId(0);
293const DocId kInitialDocId = DocId(1);
294inline DocId NextDocIdOnShard(DocId d, int shard, int num_shards) {
295 return DocId((((DocIdAsNumber(d) + num_shards - 1 - shard) /
296 num_shards) * num_shards) + shard);
297}
298
299// some definitions for 32 bit docids
300const DocId32Bit kIllegalDocId32Bit = DocId32Bit(0);
301const DocId32Bit kInitialDocId32Bit = DocId32Bit(1);
302
303// Type for index-localized docids (normal docid divided by # of
304// shards). This type is signed because 0 is a valid local docid (for
305// shard > 0), and we need an illegal value.
306typedef int32 LocalDocId;
307const LocalDocId kMaxLocalDocId = static_cast<LocalDocId>(0x7FFFFFFF);
308const LocalDocId kIllegalLocalDocId = static_cast<LocalDocId>(~0);
309
310// Conversion between local and global docids.
311// REQUIRES: the arguments should be neither Illegal nor Max.
312inline LocalDocId GlobalToLocalDocId(DocId d, int /*shard*/, int num_shards) {
313 assert(d > kIllegalDocId);
314 assert(d < kMaxDocId);
315 return static_cast<LocalDocId>(DocIdAsNumber(d) / num_shards);
316}
317inline DocId LocalToGlobalDocId(LocalDocId d, int shard, int num_shards) {
318 assert(d > kIllegalLocalDocId);
319 assert(d < kMaxLocalDocId);
320 return DocId((static_cast<uint32>(d)*num_shards) + shard);
321}
322
323// Equivalent to GlobalToLocalDocId(NextDocIdOnShard(d, sh, ns), sh, ns)
324inline LocalDocId NextLocalDocIdOnShard(DocId d, int shard, int num_shards) {
325 return GlobalToLocalDocId(NextDocIdOnShard(d, shard, num_shards),
326 shard, num_shards);
327}
328
329inline DocId DocIdFromUrlfp(Fprint urlfp) {
330 return DocId(urlfp);
331}
332
333// DocVersionId
334
335// For real time cache. A higher value means more recent.
336typedef uint32 DocVersionIdVal;
337const DocVersionIdVal kIllegalDocVersionId = static_cast<DocVersionIdVal>(0);
338
339#endif // BASE_DOCID_H_
340