1 | #include <Processors/Transforms/MergingSortedTransform.h> |
2 | #include <DataStreams/ColumnGathererStream.h> |
3 | #include <IO/WriteBuffer.h> |
4 | #include <DataStreams/materializeBlock.h> |
5 | |
6 | namespace DB |
7 | { |
8 | |
9 | MergingSortedTransform::MergingSortedTransform( |
10 | const Block & , |
11 | size_t num_inputs, |
12 | const SortDescription & description_, |
13 | size_t max_block_size_, |
14 | UInt64 limit_, |
15 | bool quiet_, |
16 | bool have_all_inputs_) |
17 | : IProcessor(InputPorts(num_inputs, header), {materializeBlock(header)}) |
18 | , description(description_), max_block_size(max_block_size_), limit(limit_), quiet(quiet_) |
19 | , have_all_inputs(have_all_inputs_) |
20 | , merged_data(header), source_chunks(num_inputs), cursors(num_inputs) |
21 | { |
22 | auto & sample = outputs.front().getHeader(); |
23 | /// Replace column names in description to positions. |
24 | for (auto & column_description : description) |
25 | { |
26 | has_collation |= column_description.collator != nullptr; |
27 | if (!column_description.column_name.empty()) |
28 | { |
29 | column_description.column_number = sample.getPositionByName(column_description.column_name); |
30 | column_description.column_name.clear(); |
31 | } |
32 | } |
33 | } |
34 | |
35 | void MergingSortedTransform::addInput() |
36 | { |
37 | if (have_all_inputs) |
38 | throw Exception("MergingSortedTransform already have all inputs." , ErrorCodes::LOGICAL_ERROR); |
39 | |
40 | inputs.emplace_back(outputs.front().getHeader(), this); |
41 | source_chunks.emplace_back(); |
42 | cursors.emplace_back(); |
43 | } |
44 | |
45 | void MergingSortedTransform::setHaveAllInputs() |
46 | { |
47 | if (have_all_inputs) |
48 | throw Exception("MergingSortedTransform already have all inputs." , ErrorCodes::LOGICAL_ERROR); |
49 | |
50 | have_all_inputs = true; |
51 | } |
52 | |
53 | IProcessor::Status MergingSortedTransform::prepare() |
54 | { |
55 | if (!have_all_inputs) |
56 | return Status::NeedData; |
57 | |
58 | auto & output = outputs.front(); |
59 | |
60 | /// Special case for no inputs. |
61 | if (inputs.empty()) |
62 | { |
63 | output.finish(); |
64 | return Status::Finished; |
65 | } |
66 | |
67 | /// Check can output. |
68 | |
69 | if (output.isFinished()) |
70 | { |
71 | for (auto & in : inputs) |
72 | in.close(); |
73 | |
74 | return Status::Finished; |
75 | } |
76 | |
77 | if (!output.isNeeded()) |
78 | { |
79 | for (auto & in : inputs) |
80 | in.setNotNeeded(); |
81 | |
82 | return Status::PortFull; |
83 | } |
84 | |
85 | if (output.hasData()) |
86 | return Status::PortFull; |
87 | |
88 | /// Special case for single input. |
89 | if (inputs.size() == 1) |
90 | { |
91 | auto & input = inputs.front(); |
92 | if (input.isFinished()) |
93 | { |
94 | output.finish(); |
95 | return Status::Finished; |
96 | } |
97 | |
98 | input.setNeeded(); |
99 | if (input.hasData()) |
100 | output.push(input.pull()); |
101 | |
102 | return Status::NeedData; |
103 | } |
104 | |
105 | /// Push if has data. |
106 | if (merged_data.mergedRows()) |
107 | output.push(merged_data.pull()); |
108 | |
109 | if (!is_initialized) |
110 | { |
111 | /// Check for inputs we need. |
112 | bool all_inputs_has_data = true; |
113 | auto it = inputs.begin(); |
114 | for (size_t i = 0; it != inputs.end(); ++i, ++it) |
115 | { |
116 | auto & input = *it; |
117 | if (input.isFinished()) |
118 | continue; |
119 | |
120 | if (!cursors[i].empty()) |
121 | { |
122 | input.setNotNeeded(); |
123 | continue; |
124 | } |
125 | |
126 | input.setNeeded(); |
127 | |
128 | if (!input.hasData()) |
129 | { |
130 | all_inputs_has_data = false; |
131 | continue; |
132 | } |
133 | |
134 | auto chunk = input.pull(); |
135 | if (!chunk.hasRows()) |
136 | { |
137 | |
138 | if (!input.isFinished()) |
139 | all_inputs_has_data = false; |
140 | |
141 | continue; |
142 | } |
143 | |
144 | updateCursor(std::move(chunk), i); |
145 | } |
146 | |
147 | if (!all_inputs_has_data) |
148 | return Status::NeedData; |
149 | |
150 | if (has_collation) |
151 | initQueue(queue_with_collation); |
152 | else |
153 | initQueue(queue_without_collation); |
154 | |
155 | is_initialized = true; |
156 | return Status::Ready; |
157 | } |
158 | else |
159 | { |
160 | if (is_finished) |
161 | { |
162 | for (auto & input : inputs) |
163 | input.close(); |
164 | |
165 | outputs.front().finish(); |
166 | |
167 | return Status::Finished; |
168 | } |
169 | |
170 | if (need_data) |
171 | { |
172 | |
173 | auto & input = *std::next(inputs.begin(), next_input_to_read); |
174 | if (!input.isFinished()) |
175 | { |
176 | input.setNeeded(); |
177 | |
178 | if (!input.hasData()) |
179 | return Status::NeedData; |
180 | |
181 | auto chunk = input.pull(); |
182 | if (!chunk.hasRows() && !input.isFinished()) |
183 | return Status::NeedData; |
184 | |
185 | updateCursor(std::move(chunk), next_input_to_read); |
186 | pushToQueue(next_input_to_read); |
187 | } |
188 | |
189 | need_data = false; |
190 | } |
191 | |
192 | return Status::Ready; |
193 | } |
194 | } |
195 | |
196 | void MergingSortedTransform::work() |
197 | { |
198 | if (has_collation) |
199 | merge(queue_with_collation); |
200 | else |
201 | merge(queue_without_collation); |
202 | } |
203 | |
204 | template <typename TSortCursor> |
205 | void MergingSortedTransform::merge(std::priority_queue<TSortCursor> & queue) |
206 | { |
207 | /// Returns MergeStatus which we should return if we are going to finish now. |
208 | auto can_read_another_row = [&, this]() |
209 | { |
210 | if (limit && merged_data.totalMergedRows() >= limit) |
211 | { |
212 | //std::cerr << "Limit reached\n"; |
213 | is_finished = true; |
214 | return false; |
215 | } |
216 | |
217 | if (merged_data.mergedRows() >= max_block_size) |
218 | { |
219 | //std::cerr << "max_block_size reached\n"; |
220 | return false; |
221 | } |
222 | |
223 | return true; |
224 | }; |
225 | |
226 | /// Take rows in required order and put them into `merged_data`, while the rows are no more than `max_block_size` |
227 | while (!queue.empty()) |
228 | { |
229 | /// Shouldn't happen at first iteration, but check just in case. |
230 | if (!can_read_another_row()) |
231 | return; |
232 | |
233 | TSortCursor current = queue.top(); |
234 | queue.pop(); |
235 | bool first_iteration = true; |
236 | |
237 | while (true) |
238 | { |
239 | if (!first_iteration && !can_read_another_row()) |
240 | { |
241 | queue.push(current); |
242 | return; |
243 | } |
244 | first_iteration = false; |
245 | |
246 | /** And what if the block is totally less or equal than the rest for the current cursor? |
247 | * Or is there only one data source left in the queue? Then you can take the entire block on current cursor. |
248 | */ |
249 | if (current.impl->isFirst() && (queue.empty() || current.totallyLessOrEquals(queue.top()))) |
250 | { |
251 | //std::cerr << "current block is totally less or equals\n"; |
252 | |
253 | /// If there are already data in the current block, we first return it. We'll get here again the next time we call the merge function. |
254 | if (merged_data.mergedRows() != 0) |
255 | { |
256 | //std::cerr << "merged rows is non-zero\n"; |
257 | queue.push(current); |
258 | return; |
259 | } |
260 | |
261 | /// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl) |
262 | size_t source_num = current.impl->order; |
263 | insertFromChunk(source_num); |
264 | return; |
265 | } |
266 | |
267 | //std::cerr << "total_merged_rows: " << total_merged_rows << ", merged_rows: " << merged_rows << "\n"; |
268 | //std::cerr << "Inserting row\n"; |
269 | merged_data.insertRow(current->all_columns, current->pos); |
270 | |
271 | if (out_row_sources_buf) |
272 | { |
273 | /// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl) |
274 | RowSourcePart row_source(current.impl->order); |
275 | out_row_sources_buf->write(row_source.data); |
276 | } |
277 | |
278 | if (current->isLast()) |
279 | { |
280 | need_data = true; |
281 | next_input_to_read = current.impl->order; |
282 | |
283 | if (limit && merged_data.totalMergedRows() >= limit) |
284 | is_finished = true; |
285 | |
286 | return; |
287 | } |
288 | |
289 | //std::cerr << "moving to next row\n"; |
290 | current->next(); |
291 | |
292 | if (!queue.empty() && current.greater(queue.top())) |
293 | { |
294 | //std::cerr << "next row is not least, pushing back to queue\n"; |
295 | queue.push(current); |
296 | break; |
297 | } |
298 | } |
299 | } |
300 | is_finished = true; |
301 | } |
302 | |
303 | void MergingSortedTransform::insertFromChunk(size_t source_num) |
304 | { |
305 | if (source_num >= cursors.size()) |
306 | throw Exception("Logical error in MergingSortedTrandform" , ErrorCodes::LOGICAL_ERROR); |
307 | |
308 | //std::cerr << "copied columns\n"; |
309 | |
310 | auto num_rows = source_chunks[source_num]->getNumRows(); |
311 | |
312 | UInt64 total_merged_rows_after_insertion = merged_data.mergedRows() + num_rows; |
313 | if (limit && total_merged_rows_after_insertion > limit) |
314 | { |
315 | num_rows = total_merged_rows_after_insertion - limit; |
316 | merged_data.insertFromChunk(std::move(*source_chunks[source_num]), num_rows); |
317 | is_finished = true; |
318 | } |
319 | else |
320 | { |
321 | merged_data.insertFromChunk(std::move(*source_chunks[source_num]), 0); |
322 | need_data = true; |
323 | next_input_to_read = source_num; |
324 | } |
325 | |
326 | if (out_row_sources_buf) |
327 | { |
328 | RowSourcePart row_source(source_num); |
329 | for (size_t i = 0; i < num_rows; ++i) |
330 | out_row_sources_buf->write(row_source.data); |
331 | } |
332 | } |
333 | |
334 | |
335 | } |
336 | |