BucketedDataFile.h source code [DuckDB/third_party/tpce-tool/include/input/BucketedDataFile.h]

1	#ifndef BUCKETED_DATA_FILE_H
2	#define BUCKETED_DATA_FILE_H
3
4	/*
5	* Legal Notice
6	*
7	* This document and associated source code (the "Work") is a part of a
8	* benchmark specification maintained by the TPC.
9	*
10	* The TPC reserves all right, title, and interest to the Work as provided
11	* under U.S. and international laws, including without limitation all patent
12	* and trademark rights therein.
13	*
14	* No Warranty
15	*
16	* 1.1 TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THE INFORMATION
17	* CONTAINED HEREIN IS PROVIDED "AS IS" AND WITH ALL FAULTS, AND THE
18	* AUTHORS AND DEVELOPERS OF THE WORK HEREBY DISCLAIM ALL OTHER
19	* WARRANTIES AND CONDITIONS, EITHER EXPRESS, IMPLIED OR STATUTORY,
20	* INCLUDING, BUT NOT LIMITED TO, ANY (IF ANY) IMPLIED WARRANTIES,
21	* DUTIES OR CONDITIONS OF MERCHANTABILITY, OF FITNESS FOR A PARTICULAR
22	* PURPOSE, OF ACCURACY OR COMPLETENESS OF RESPONSES, OF RESULTS, OF
23	* WORKMANLIKE EFFORT, OF LACK OF VIRUSES, AND OF LACK OF NEGLIGENCE.
24	* ALSO, THERE IS NO WARRANTY OR CONDITION OF TITLE, QUIET ENJOYMENT,
25	* QUIET POSSESSION, CORRESPONDENCE TO DESCRIPTION OR NON-INFRINGEMENT
26	* WITH REGARD TO THE WORK.
27	* 1.2 IN NO EVENT WILL ANY AUTHOR OR DEVELOPER OF THE WORK BE LIABLE TO
28	* ANY OTHER PARTY FOR ANY DAMAGES, INCLUDING BUT NOT LIMITED TO THE
29	* COST OF PROCURING SUBSTITUTE GOODS OR SERVICES, LOST PROFITS, LOSS
30	* OF USE, LOSS OF DATA, OR ANY INCIDENTAL, CONSEQUENTIAL, DIRECT,
31	* INDIRECT, OR SPECIAL DAMAGES WHETHER UNDER CONTRACT, TORT, WARRANTY,
32	* OR OTHERWISE, ARISING IN ANY WAY OUT OF THIS OR ANY OTHER AGREEMENT
33	* RELATING TO THE WORK, WHETHER OR NOT SUCH AUTHOR OR DEVELOPER HAD
34	* ADVANCE NOTICE OF THE POSSIBILITY OF SUCH DAMAGES.
35	*
36	* Contributors
37	* - Doug Johnson
38	*/
39
40	#include <vector>
41
42	//#include <string> // for stoi C++11
43	#include <cstdlib> // for atoi
44
45	#include "ITextSplitter.h"
46	#include "ShrinkToFit.h"
47
48	namespace TPCE {
49	//
50	// Description:
51	// A template class for converting a series of text records into a
52	// bucketed binary in-memory structure for quick easy access.
53	//
54	// Exception Safety:
55	// The Basic guarantee is provided.
56	//
57	// Copy Behavior:
58	// Copying is allowed.
59	//
60	//
61	// Assumptions:
62	// - bucket IDs start at 1.
63	// - records are sorted by bucket ID smallest to largest.
64	//
65	template <class T> class BucketedDataFile {
66	public:
67	// Leverage the size type of our underlying storage container but
68	// insulate clients from the implementation particulars by creating
69	// our own type.
70	// Set this first so we can use it for recordCount.
71	typedef typename std::vector<T>::size_type size_type;
72
73	private:
74	std::vector<std::vector<T>> buckets;
75	size_type recordCount;
76
77	public:
78	enum SizeFilter { AllRecords, BucketsOnly };
79
80	explicit BucketedDataFile(ITextSplitter &splitter) : recordCount(`0`) {
81	// eof only returns true after trying to read the end, so
82	// "prime the pump" by doing an initial read.
83	std::deque<std::string> fields = splitter.getNextRecord();
84
85	// Process each record.
86	while (!splitter.eof()) {
87	if (`1` == fields.size() && "" == fields [`0`]) {
88	// We found a blank line so skip it and move on.
89	fields = splitter.getNextRecord();
90	continue;
91	}
92
93	// The first field is the bucket ID for this record.
94	// int bucketID = std::stoi(fields[0]); // C++11
95	unsigned int bucketID = std::atoi(fields [`0`].c_str());
96	fields.pop_front();
97
98	if (buckets.size() == bucketID - `1`) {
99	// First record of a new bucket so add the bucket.
100	buckets.push_back(std::vector<T>());
101	}
102
103	// Now we know the bucket exists so go ahead and add the record.
104	buckets[bucketID - `1`].push_back(T(fields));
105	++recordCount;
106
107	// Move on to the next record.
108	fields = splitter.getNextRecord();
109	}
110
111	// Now that everything has been loaded tighten up our storage.
112	// NOTE: shrinking the outer bucket vector has the side effect of
113	// shrinking all the internal bucket vectors.
114	shrink_to_fit<std::vector<std::vector<T>>>(buckets);
115	// buckets.shrink_to_fit(); // C++11
116	}
117
118	//
119	// Default copies and destructor are ok.
120	//
121	// ~BucketedDataFile();
122	// BucketedDataFile(const BucketedDataFile&);
123	// BucketedDataFile& operator=(const BucketedDataFile&);
124	//
125
126	size_type size(SizeFilter filter = AllRecords) const {
127	return (filter == AllRecords ? recordCount : buckets.size());
128	}
129
130	// Provide 0-based access to the buckets.
131	const std::vector<T> &operator[](size_type idx) const {
132	return buckets[idx];
133	}
134
135	// Provide range-checked 0-based access to the buckets.
136	const std::vector<T> &at(size_type idx) const {
137	return buckets.at(idx);
138	}
139
140	// Provide range-checked bucket-ID-based access by to the buckets.
141	const std::vector<T> &getBucket(size_type bucketID, bool rangeCheckedAccess = false) const {
142	size_type idx = bucketID - `1`;
143	return (rangeCheckedAccess ? buckets.at(idx) : buckets[idx]);
144	}
145	};
146
147	} // namespace TPCE
148	#endif // BUCKETED_DATA_FILE_H
149

Browse the source code of DuckDB/third_party/tpce-tool/include/input/BucketedDataFile.h