1 | /* |
2 | * Copyright 2013-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | /** |
18 | * RecordIO: self-synchronizing stream of variable length records |
19 | * |
20 | * RecordIO gives you the ability to write a stream of variable length records |
21 | * and read them later even in the face of data corruption -- randomly inserted |
22 | * or deleted chunks of the file, or modified data. When reading, you may lose |
23 | * corrupted records, but the stream will resynchronize automatically. |
24 | */ |
25 | |
26 | #pragma once |
27 | #define FOLLY_IO_RECORDIO_H_ |
28 | |
29 | #include <atomic> |
30 | #include <memory> |
31 | #include <mutex> |
32 | |
33 | #include <folly/File.h> |
34 | #include <folly/Range.h> |
35 | #include <folly/io/IOBuf.h> |
36 | #include <folly/system/MemoryMapping.h> |
37 | |
38 | namespace folly { |
39 | |
40 | /** |
41 | * Class to write a stream of RecordIO records to a file. |
42 | * |
43 | * RecordIOWriter is thread-safe |
44 | */ |
45 | class RecordIOWriter { |
46 | public: |
47 | /** |
48 | * Create a RecordIOWriter around a file; will append to the end of |
49 | * file if it exists. |
50 | * |
51 | * Each file must have a non-zero file id, which is embedded in all |
52 | * record headers. Readers will only return records with the requested |
53 | * file id (or, if the reader is created with fileId=0 in the constructor, |
54 | * the reader will return all records). File ids are only used to allow |
55 | * resynchronization if you store RecordIO records (with headers) inside |
56 | * other RecordIO records (for example, if a record consists of a fragment |
57 | * from another RecordIO file). If you're not planning to do that, |
58 | * the defaults are fine. |
59 | */ |
60 | explicit RecordIOWriter(File file, uint32_t fileId = 1); |
61 | |
62 | /** |
63 | * Write a record. We will use at most headerSize() bytes of headroom, |
64 | * you might want to arrange that before copying your data into it. |
65 | */ |
66 | void write(std::unique_ptr<IOBuf> buf); |
67 | |
68 | /** |
69 | * Return the position in the file where the next byte will be written. |
70 | * Conservative, as stuff can be written at any time from another thread. |
71 | */ |
72 | off_t filePos() const { |
73 | return filePos_; |
74 | } |
75 | |
76 | private: |
77 | File file_; |
78 | uint32_t fileId_; |
79 | std::unique_lock<File> writeLock_; |
80 | std::atomic<off_t> filePos_; |
81 | }; |
82 | |
83 | /** |
84 | * Class to read from a RecordIO file. Will skip invalid records. |
85 | */ |
86 | class RecordIOReader { |
87 | public: |
88 | class Iterator; |
89 | |
90 | /** |
91 | * RecordIOReader is iterable, returning pairs of ByteRange (record content) |
92 | * and position in file where the record (including header) begins. |
93 | * Note that the position includes the header, that is, it can be passed back |
94 | * to seek(). |
95 | */ |
96 | typedef Iterator iterator; |
97 | typedef Iterator const_iterator; |
98 | typedef std::pair<ByteRange, off_t> value_type; |
99 | typedef value_type& reference; |
100 | typedef const value_type& const_reference; |
101 | |
102 | /** |
103 | * A record reader with a fileId of 0 will return all records. |
104 | * A record reader with a non-zero fileId will only return records where |
105 | * the fileId matches. |
106 | */ |
107 | explicit RecordIOReader(File file, uint32_t fileId = 0); |
108 | |
109 | Iterator cbegin() const; |
110 | Iterator begin() const; |
111 | Iterator cend() const; |
112 | Iterator end() const; |
113 | |
114 | /** |
115 | * Create an iterator to the first valid record after pos. |
116 | */ |
117 | Iterator seek(off_t pos) const; |
118 | |
119 | private: |
120 | MemoryMapping map_; |
121 | uint32_t fileId_; |
122 | }; |
123 | |
124 | namespace recordio_helpers { |
125 | |
126 | // We're exposing the guts of the RecordIO implementation for two reasons: |
127 | // 1. It makes unit testing easier, and |
128 | // 2. It allows you to build different RecordIO readers / writers that use |
129 | // different storage systems underneath (not standard files) |
130 | |
131 | /** |
132 | * Header size. |
133 | */ |
134 | constexpr size_t (); // defined in RecordIO-inl.h |
135 | |
136 | /** |
137 | * Write a header in the buffer. We will prepend the header to the front |
138 | * of the chain. Do not write the buffer if empty (we don't allow empty |
139 | * records). Returns the total length, including header (0 if empty) |
140 | * (same as buf->computeChainDataLength(), but likely faster) |
141 | * |
142 | * The fileId should be unique per stream and allows you to have RecordIO |
143 | * headers stored inside the data (for example, have an entire RecordIO |
144 | * file stored as a record inside another RecordIO file). The fileId may |
145 | * not be 0. |
146 | */ |
147 | size_t (std::unique_ptr<IOBuf>& buf, uint32_t fileId = 1); |
148 | |
149 | /** |
150 | * Search for the first valid record that begins in searchRange (which must be |
151 | * a subrange of wholeRange). Returns the record data (not the header) if |
152 | * found, ByteRange() otherwise. |
153 | * |
154 | * The fileId may be 0, in which case we'll return the first valid record for |
155 | * *any* fileId, or non-zero, in which case we'll only look for records with |
156 | * the requested fileId. |
157 | */ |
158 | struct RecordInfo { |
159 | uint32_t fileId; |
160 | ByteRange record; |
161 | }; |
162 | RecordInfo |
163 | findRecord(ByteRange searchRange, ByteRange wholeRange, uint32_t fileId); |
164 | |
165 | /** |
166 | * Search for the first valid record in range. |
167 | */ |
168 | RecordInfo findRecord(ByteRange range, uint32_t fileId); |
169 | |
170 | /** |
171 | * Check if there is a valid record at the beginning of range. Returns the |
172 | * record data (not the header) if the record is valid, ByteRange() otherwise. |
173 | */ |
174 | RecordInfo validateRecord(ByteRange range, uint32_t fileId); |
175 | |
176 | } // namespace recordio_helpers |
177 | |
178 | } // namespace folly |
179 | |
180 | #include <folly/io/RecordIO-inl.h> |
181 | |