1 | /* |
2 | * L2/refcount table cache for the QCOW2 format |
3 | * |
4 | * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com> |
5 | * |
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | * of this software and associated documentation files (the "Software"), to deal |
8 | * in the Software without restriction, including without limitation the rights |
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | * copies of the Software, and to permit persons to whom the Software is |
11 | * furnished to do so, subject to the following conditions: |
12 | * |
13 | * The above copyright notice and this permission notice shall be included in |
14 | * all copies or substantial portions of the Software. |
15 | * |
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
22 | * THE SOFTWARE. |
23 | */ |
24 | |
25 | #include "qemu/osdep.h" |
26 | #include "qcow2.h" |
27 | #include "trace.h" |
28 | |
29 | typedef struct Qcow2CachedTable { |
30 | int64_t offset; |
31 | uint64_t lru_counter; |
32 | int ref; |
33 | bool dirty; |
34 | } Qcow2CachedTable; |
35 | |
36 | struct Qcow2Cache { |
37 | Qcow2CachedTable *entries; |
38 | struct Qcow2Cache *depends; |
39 | int size; |
40 | int table_size; |
41 | bool depends_on_flush; |
42 | void *table_array; |
43 | uint64_t lru_counter; |
44 | uint64_t cache_clean_lru_counter; |
45 | }; |
46 | |
47 | static inline void *qcow2_cache_get_table_addr(Qcow2Cache *c, int table) |
48 | { |
49 | return (uint8_t *) c->table_array + (size_t) table * c->table_size; |
50 | } |
51 | |
52 | static inline int qcow2_cache_get_table_idx(Qcow2Cache *c, void *table) |
53 | { |
54 | ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array; |
55 | int idx = table_offset / c->table_size; |
56 | assert(idx >= 0 && idx < c->size && table_offset % c->table_size == 0); |
57 | return idx; |
58 | } |
59 | |
60 | static inline const char *qcow2_cache_get_name(BDRVQcow2State *s, Qcow2Cache *c) |
61 | { |
62 | if (c == s->refcount_block_cache) { |
63 | return "refcount block" ; |
64 | } else if (c == s->l2_table_cache) { |
65 | return "L2 table" ; |
66 | } else { |
67 | /* Do not abort, because this is not critical */ |
68 | return "unknown" ; |
69 | } |
70 | } |
71 | |
72 | static void qcow2_cache_table_release(Qcow2Cache *c, int i, int num_tables) |
73 | { |
74 | /* Using MADV_DONTNEED to discard memory is a Linux-specific feature */ |
75 | #ifdef CONFIG_LINUX |
76 | void *t = qcow2_cache_get_table_addr(c, i); |
77 | int align = getpagesize(); |
78 | size_t mem_size = (size_t) c->table_size * num_tables; |
79 | size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t; |
80 | size_t length = QEMU_ALIGN_DOWN(mem_size - offset, align); |
81 | if (mem_size > offset && length > 0) { |
82 | madvise((uint8_t *) t + offset, length, MADV_DONTNEED); |
83 | } |
84 | #endif |
85 | } |
86 | |
87 | static inline bool can_clean_entry(Qcow2Cache *c, int i) |
88 | { |
89 | Qcow2CachedTable *t = &c->entries[i]; |
90 | return t->ref == 0 && !t->dirty && t->offset != 0 && |
91 | t->lru_counter <= c->cache_clean_lru_counter; |
92 | } |
93 | |
94 | void qcow2_cache_clean_unused(Qcow2Cache *c) |
95 | { |
96 | int i = 0; |
97 | while (i < c->size) { |
98 | int to_clean = 0; |
99 | |
100 | /* Skip the entries that we don't need to clean */ |
101 | while (i < c->size && !can_clean_entry(c, i)) { |
102 | i++; |
103 | } |
104 | |
105 | /* And count how many we can clean in a row */ |
106 | while (i < c->size && can_clean_entry(c, i)) { |
107 | c->entries[i].offset = 0; |
108 | c->entries[i].lru_counter = 0; |
109 | i++; |
110 | to_clean++; |
111 | } |
112 | |
113 | if (to_clean > 0) { |
114 | qcow2_cache_table_release(c, i - to_clean, to_clean); |
115 | } |
116 | } |
117 | |
118 | c->cache_clean_lru_counter = c->lru_counter; |
119 | } |
120 | |
121 | Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables, |
122 | unsigned table_size) |
123 | { |
124 | BDRVQcow2State *s = bs->opaque; |
125 | Qcow2Cache *c; |
126 | |
127 | assert(num_tables > 0); |
128 | assert(is_power_of_2(table_size)); |
129 | assert(table_size >= (1 << MIN_CLUSTER_BITS)); |
130 | assert(table_size <= s->cluster_size); |
131 | |
132 | c = g_new0(Qcow2Cache, 1); |
133 | c->size = num_tables; |
134 | c->table_size = table_size; |
135 | c->entries = g_try_new0(Qcow2CachedTable, num_tables); |
136 | c->table_array = qemu_try_blockalign(bs->file->bs, |
137 | (size_t) num_tables * c->table_size); |
138 | |
139 | if (!c->entries || !c->table_array) { |
140 | qemu_vfree(c->table_array); |
141 | g_free(c->entries); |
142 | g_free(c); |
143 | c = NULL; |
144 | } |
145 | |
146 | return c; |
147 | } |
148 | |
149 | int qcow2_cache_destroy(Qcow2Cache *c) |
150 | { |
151 | int i; |
152 | |
153 | for (i = 0; i < c->size; i++) { |
154 | assert(c->entries[i].ref == 0); |
155 | } |
156 | |
157 | qemu_vfree(c->table_array); |
158 | g_free(c->entries); |
159 | g_free(c); |
160 | |
161 | return 0; |
162 | } |
163 | |
164 | static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c) |
165 | { |
166 | int ret; |
167 | |
168 | ret = qcow2_cache_flush(bs, c->depends); |
169 | if (ret < 0) { |
170 | return ret; |
171 | } |
172 | |
173 | c->depends = NULL; |
174 | c->depends_on_flush = false; |
175 | |
176 | return 0; |
177 | } |
178 | |
179 | static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) |
180 | { |
181 | BDRVQcow2State *s = bs->opaque; |
182 | int ret = 0; |
183 | |
184 | if (!c->entries[i].dirty || !c->entries[i].offset) { |
185 | return 0; |
186 | } |
187 | |
188 | trace_qcow2_cache_entry_flush(qemu_coroutine_self(), |
189 | c == s->l2_table_cache, i); |
190 | |
191 | if (c->depends) { |
192 | ret = qcow2_cache_flush_dependency(bs, c); |
193 | } else if (c->depends_on_flush) { |
194 | ret = bdrv_flush(bs->file->bs); |
195 | if (ret >= 0) { |
196 | c->depends_on_flush = false; |
197 | } |
198 | } |
199 | |
200 | if (ret < 0) { |
201 | return ret; |
202 | } |
203 | |
204 | if (c == s->refcount_block_cache) { |
205 | ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_BLOCK, |
206 | c->entries[i].offset, c->table_size, false); |
207 | } else if (c == s->l2_table_cache) { |
208 | ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, |
209 | c->entries[i].offset, c->table_size, false); |
210 | } else { |
211 | ret = qcow2_pre_write_overlap_check(bs, 0, |
212 | c->entries[i].offset, c->table_size, false); |
213 | } |
214 | |
215 | if (ret < 0) { |
216 | return ret; |
217 | } |
218 | |
219 | if (c == s->refcount_block_cache) { |
220 | BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); |
221 | } else if (c == s->l2_table_cache) { |
222 | BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); |
223 | } |
224 | |
225 | ret = bdrv_pwrite(bs->file, c->entries[i].offset, |
226 | qcow2_cache_get_table_addr(c, i), c->table_size); |
227 | if (ret < 0) { |
228 | return ret; |
229 | } |
230 | |
231 | c->entries[i].dirty = false; |
232 | |
233 | return 0; |
234 | } |
235 | |
236 | int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c) |
237 | { |
238 | BDRVQcow2State *s = bs->opaque; |
239 | int result = 0; |
240 | int ret; |
241 | int i; |
242 | |
243 | trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache); |
244 | |
245 | for (i = 0; i < c->size; i++) { |
246 | ret = qcow2_cache_entry_flush(bs, c, i); |
247 | if (ret < 0 && result != -ENOSPC) { |
248 | result = ret; |
249 | } |
250 | } |
251 | |
252 | return result; |
253 | } |
254 | |
255 | int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) |
256 | { |
257 | int result = qcow2_cache_write(bs, c); |
258 | |
259 | if (result == 0) { |
260 | int ret = bdrv_flush(bs->file->bs); |
261 | if (ret < 0) { |
262 | result = ret; |
263 | } |
264 | } |
265 | |
266 | return result; |
267 | } |
268 | |
269 | int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, |
270 | Qcow2Cache *dependency) |
271 | { |
272 | int ret; |
273 | |
274 | if (dependency->depends) { |
275 | ret = qcow2_cache_flush_dependency(bs, dependency); |
276 | if (ret < 0) { |
277 | return ret; |
278 | } |
279 | } |
280 | |
281 | if (c->depends && (c->depends != dependency)) { |
282 | ret = qcow2_cache_flush_dependency(bs, c); |
283 | if (ret < 0) { |
284 | return ret; |
285 | } |
286 | } |
287 | |
288 | c->depends = dependency; |
289 | return 0; |
290 | } |
291 | |
292 | void qcow2_cache_depends_on_flush(Qcow2Cache *c) |
293 | { |
294 | c->depends_on_flush = true; |
295 | } |
296 | |
297 | int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c) |
298 | { |
299 | int ret, i; |
300 | |
301 | ret = qcow2_cache_flush(bs, c); |
302 | if (ret < 0) { |
303 | return ret; |
304 | } |
305 | |
306 | for (i = 0; i < c->size; i++) { |
307 | assert(c->entries[i].ref == 0); |
308 | c->entries[i].offset = 0; |
309 | c->entries[i].lru_counter = 0; |
310 | } |
311 | |
312 | qcow2_cache_table_release(c, 0, c->size); |
313 | |
314 | c->lru_counter = 0; |
315 | |
316 | return 0; |
317 | } |
318 | |
319 | static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, |
320 | uint64_t offset, void **table, bool read_from_disk) |
321 | { |
322 | BDRVQcow2State *s = bs->opaque; |
323 | int i; |
324 | int ret; |
325 | int lookup_index; |
326 | uint64_t min_lru_counter = UINT64_MAX; |
327 | int min_lru_index = -1; |
328 | |
329 | assert(offset != 0); |
330 | |
331 | trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache, |
332 | offset, read_from_disk); |
333 | |
334 | if (!QEMU_IS_ALIGNED(offset, c->table_size)) { |
335 | qcow2_signal_corruption(bs, true, -1, -1, "Cannot get entry from %s " |
336 | "cache: Offset %#" PRIx64 " is unaligned" , |
337 | qcow2_cache_get_name(s, c), offset); |
338 | return -EIO; |
339 | } |
340 | |
341 | /* Check if the table is already cached */ |
342 | i = lookup_index = (offset / c->table_size * 4) % c->size; |
343 | do { |
344 | const Qcow2CachedTable *t = &c->entries[i]; |
345 | if (t->offset == offset) { |
346 | goto found; |
347 | } |
348 | if (t->ref == 0 && t->lru_counter < min_lru_counter) { |
349 | min_lru_counter = t->lru_counter; |
350 | min_lru_index = i; |
351 | } |
352 | if (++i == c->size) { |
353 | i = 0; |
354 | } |
355 | } while (i != lookup_index); |
356 | |
357 | if (min_lru_index == -1) { |
358 | /* This can't happen in current synchronous code, but leave the check |
359 | * here as a reminder for whoever starts using AIO with the cache */ |
360 | abort(); |
361 | } |
362 | |
363 | /* Cache miss: write a table back and replace it */ |
364 | i = min_lru_index; |
365 | trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), |
366 | c == s->l2_table_cache, i); |
367 | |
368 | ret = qcow2_cache_entry_flush(bs, c, i); |
369 | if (ret < 0) { |
370 | return ret; |
371 | } |
372 | |
373 | trace_qcow2_cache_get_read(qemu_coroutine_self(), |
374 | c == s->l2_table_cache, i); |
375 | c->entries[i].offset = 0; |
376 | if (read_from_disk) { |
377 | if (c == s->l2_table_cache) { |
378 | BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); |
379 | } |
380 | |
381 | ret = bdrv_pread(bs->file, offset, |
382 | qcow2_cache_get_table_addr(c, i), |
383 | c->table_size); |
384 | if (ret < 0) { |
385 | return ret; |
386 | } |
387 | } |
388 | |
389 | c->entries[i].offset = offset; |
390 | |
391 | /* And return the right table */ |
392 | found: |
393 | c->entries[i].ref++; |
394 | *table = qcow2_cache_get_table_addr(c, i); |
395 | |
396 | trace_qcow2_cache_get_done(qemu_coroutine_self(), |
397 | c == s->l2_table_cache, i); |
398 | |
399 | return 0; |
400 | } |
401 | |
402 | int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, |
403 | void **table) |
404 | { |
405 | return qcow2_cache_do_get(bs, c, offset, table, true); |
406 | } |
407 | |
408 | int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, |
409 | void **table) |
410 | { |
411 | return qcow2_cache_do_get(bs, c, offset, table, false); |
412 | } |
413 | |
414 | void qcow2_cache_put(Qcow2Cache *c, void **table) |
415 | { |
416 | int i = qcow2_cache_get_table_idx(c, *table); |
417 | |
418 | c->entries[i].ref--; |
419 | *table = NULL; |
420 | |
421 | if (c->entries[i].ref == 0) { |
422 | c->entries[i].lru_counter = ++c->lru_counter; |
423 | } |
424 | |
425 | assert(c->entries[i].ref >= 0); |
426 | } |
427 | |
428 | void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table) |
429 | { |
430 | int i = qcow2_cache_get_table_idx(c, table); |
431 | assert(c->entries[i].offset != 0); |
432 | c->entries[i].dirty = true; |
433 | } |
434 | |
435 | void *qcow2_cache_is_table_offset(Qcow2Cache *c, uint64_t offset) |
436 | { |
437 | int i; |
438 | |
439 | for (i = 0; i < c->size; i++) { |
440 | if (c->entries[i].offset == offset) { |
441 | return qcow2_cache_get_table_addr(c, i); |
442 | } |
443 | } |
444 | return NULL; |
445 | } |
446 | |
447 | void qcow2_cache_discard(Qcow2Cache *c, void *table) |
448 | { |
449 | int i = qcow2_cache_get_table_idx(c, table); |
450 | |
451 | assert(c->entries[i].ref == 0); |
452 | |
453 | c->entries[i].offset = 0; |
454 | c->entries[i].lru_counter = 0; |
455 | c->entries[i].dirty = false; |
456 | |
457 | qcow2_cache_table_release(c, i, 1); |
458 | } |
459 | |