1/*
2 * Copyright 2013-present Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/**
18 * RecordIO: self-synchronizing stream of variable length records
19 *
20 * RecordIO gives you the ability to write a stream of variable length records
21 * and read them later even in the face of data corruption -- randomly inserted
22 * or deleted chunks of the file, or modified data. When reading, you may lose
23 * corrupted records, but the stream will resynchronize automatically.
24 */
25
26#pragma once
27#define FOLLY_IO_RECORDIO_H_
28
29#include <atomic>
30#include <memory>
31#include <mutex>
32
33#include <folly/File.h>
34#include <folly/Range.h>
35#include <folly/io/IOBuf.h>
36#include <folly/system/MemoryMapping.h>
37
38namespace folly {
39
40/**
41 * Class to write a stream of RecordIO records to a file.
42 *
43 * RecordIOWriter is thread-safe
44 */
45class RecordIOWriter {
46 public:
47 /**
48 * Create a RecordIOWriter around a file; will append to the end of
49 * file if it exists.
50 *
51 * Each file must have a non-zero file id, which is embedded in all
52 * record headers. Readers will only return records with the requested
53 * file id (or, if the reader is created with fileId=0 in the constructor,
54 * the reader will return all records). File ids are only used to allow
55 * resynchronization if you store RecordIO records (with headers) inside
56 * other RecordIO records (for example, if a record consists of a fragment
57 * from another RecordIO file). If you're not planning to do that,
58 * the defaults are fine.
59 */
60 explicit RecordIOWriter(File file, uint32_t fileId = 1);
61
62 /**
63 * Write a record. We will use at most headerSize() bytes of headroom,
64 * you might want to arrange that before copying your data into it.
65 */
66 void write(std::unique_ptr<IOBuf> buf);
67
68 /**
69 * Return the position in the file where the next byte will be written.
70 * Conservative, as stuff can be written at any time from another thread.
71 */
72 off_t filePos() const {
73 return filePos_;
74 }
75
76 private:
77 File file_;
78 uint32_t fileId_;
79 std::unique_lock<File> writeLock_;
80 std::atomic<off_t> filePos_;
81};
82
83/**
84 * Class to read from a RecordIO file. Will skip invalid records.
85 */
86class RecordIOReader {
87 public:
88 class Iterator;
89
90 /**
91 * RecordIOReader is iterable, returning pairs of ByteRange (record content)
92 * and position in file where the record (including header) begins.
93 * Note that the position includes the header, that is, it can be passed back
94 * to seek().
95 */
96 typedef Iterator iterator;
97 typedef Iterator const_iterator;
98 typedef std::pair<ByteRange, off_t> value_type;
99 typedef value_type& reference;
100 typedef const value_type& const_reference;
101
102 /**
103 * A record reader with a fileId of 0 will return all records.
104 * A record reader with a non-zero fileId will only return records where
105 * the fileId matches.
106 */
107 explicit RecordIOReader(File file, uint32_t fileId = 0);
108
109 Iterator cbegin() const;
110 Iterator begin() const;
111 Iterator cend() const;
112 Iterator end() const;
113
114 /**
115 * Create an iterator to the first valid record after pos.
116 */
117 Iterator seek(off_t pos) const;
118
119 private:
120 MemoryMapping map_;
121 uint32_t fileId_;
122};
123
124namespace recordio_helpers {
125
126// We're exposing the guts of the RecordIO implementation for two reasons:
127// 1. It makes unit testing easier, and
128// 2. It allows you to build different RecordIO readers / writers that use
129// different storage systems underneath (not standard files)
130
/**
 * Size of a RecordIO record header. This is the maximum amount of IOBuf
 * headroom, in bytes, that writing a record will consume.
 */
constexpr size_t headerSize(); // defined in RecordIO-inl.h
135
/**
 * Write a record header into the buffer by prepending it to the front of
 * the IOBuf chain.
 *
 * Returns the total length of the chain including the header (the same value
 * as buf->computeChainDataLength(), but likely faster to obtain). If the
 * buffer is empty, 0 is returned; in that case do not write the buffer, as
 * empty records are not allowed.
 *
 * The fileId should be unique per stream and allows you to have RecordIO
 * headers stored inside the data (for example, have an entire RecordIO
 * file stored as a record inside another RecordIO file). The fileId may
 * not be 0.
 *
 * NOTE(review): buf is taken by non-const reference, presumably so the head
 * of the chain can be replaced when the header is prepended -- confirm
 * against the implementation.
 */
size_t prependHeader(std::unique_ptr<IOBuf>& buf, uint32_t fileId = 1);
148
/**
 * Result type of the record search / validation helpers below: a file id
 * together with a range of record bytes.
 */
struct RecordInfo {
  // File id associated with the record.
  // NOTE(review): presumably the id read from the record's header -- confirm.
  uint32_t fileId;
  // The record data (not the header).
  ByteRange record;
};

/**
 * Search for the first valid record that begins in searchRange (which must be
 * a subrange of wholeRange). If one is found, the `record` member of the
 * returned RecordInfo holds the record data (not the header).
 *
 * NOTE(review): the earlier wording of this comment described the not-found
 * result as ByteRange(); presumably `record` is an empty ByteRange in that
 * case -- confirm against the implementation.
 *
 * The fileId may be 0, in which case we'll return the first valid record for
 * *any* fileId, or non-zero, in which case we'll only look for records with
 * the requested fileId.
 */
RecordInfo
findRecord(ByteRange searchRange, ByteRange wholeRange, uint32_t fileId);
164
/**
 * Search for the first valid record in range. The result is returned as a
 * RecordInfo (file id plus record bytes).
 *
 * NOTE(review): presumably equivalent to calling the three-argument overload
 * with range as both the search range and the whole range -- confirm in
 * RecordIO-inl.h.
 */
RecordInfo findRecord(ByteRange range, uint32_t fileId);
169
/**
 * Check if there is a valid record at the beginning of range. If the record
 * is valid, the `record` member of the returned RecordInfo holds the record
 * data (not the header).
 *
 * NOTE(review): the earlier wording of this comment described the invalid
 * case as returning ByteRange(); presumably `record` is an empty ByteRange
 * then -- confirm against the implementation.
 */
RecordInfo validateRecord(ByteRange range, uint32_t fileId);
175
176} // namespace recordio_helpers
177
178} // namespace folly
179
180#include <folly/io/RecordIO-inl.h>
181