diff options
author | Anand Avati <avati@redhat.com> | 2013-03-06 01:11:59 -0800 |
---|---|---|
committer | Anand Avati <avati@redhat.com> | 2013-09-03 11:25:33 -0700 |
commit | 0d60175bd684cf6a14f750579d82dbd1ba97fcbc (patch) | |
tree | 1571f530548196006526442f3fc027cb623bb6fa /contrib | |
parent | 7dbfbfd3694e02b90e8f3ce509f5279da1523a02 (diff) |
contrib/qemu: Import qemu block source code
This qemu block format source code and its minimal
dependency files will be used in the next patch to implement
a qemu-block format translator.
Change-Id: Ic87638972f7ea9b3df84d7a0539512a250c11c1c
BUG: 986775
Signed-off-by: Anand Avati <avati@redhat.com>
Reviewed-on: http://review.gluster.org/5366
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Diffstat (limited to 'contrib')
107 files changed, 39119 insertions, 0 deletions
diff --git a/contrib/qemu/block.c b/contrib/qemu/block.c new file mode 100644 index 000000000..b56024113 --- /dev/null +++ b/contrib/qemu/block.c @@ -0,0 +1,4604 @@ +/* + * QEMU System Emulator block driver + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "config-host.h" +#include "qemu-common.h" +#include "trace.h" +#include "monitor/monitor.h" +#include "block/block_int.h" +#include "block/blockjob.h" +#include "qemu/module.h" +#include "qapi/qmp/qjson.h" +#include "sysemu/sysemu.h" +#include "qemu/notify.h" +#include "block/coroutine.h" +#include "qmp-commands.h" +#include "qemu/timer.h" + +#ifdef CONFIG_BSD +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/queue.h> +#ifndef __DragonFly__ +#include <sys/disk.h> +#endif +#endif + +#ifdef _WIN32 +#include <windows.h> +#endif + +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ + +typedef enum { + BDRV_REQ_COPY_ON_READ = 0x1, + BDRV_REQ_ZERO_WRITE = 0x2, +} BdrvRequestFlags; + +static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load); +static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque); +static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque); +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, + bool is_write); +static void coroutine_fn bdrv_co_do_rw(void *opaque); +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors); + +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, double elapsed_time, uint64_t *wait); +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, + double elapsed_time, uint64_t *wait); +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, int64_t *wait); + +static QTAILQ_HEAD(, BlockDriverState) bdrv_states = + QTAILQ_HEAD_INITIALIZER(bdrv_states); + +static QLIST_HEAD(, BlockDriver) bdrv_drivers = + QLIST_HEAD_INITIALIZER(bdrv_drivers); + +/* If non-zero, use only whitelisted block drivers */ +static int use_bdrv_whitelist; + +#ifdef _WIN32 +static int is_windows_drive_prefix(const char *filename) +{ + return (((filename[0] >= 'a' && filename[0] <= 'z') || + (filename[0] >= 'A' && filename[0] <= 'Z')) && + filename[1] == ':'); +} + +int is_windows_drive(const char *filename) +{ + if (is_windows_drive_prefix(filename) && + filename[2] == '\0') + return 1; + if (strstart(filename, "\\\\.\\", NULL) || + strstart(filename, "//./", NULL)) + return 1; + return 0; +} +#endif + +/* throttling disk I/O limits */ +void bdrv_io_limits_disable(BlockDriverState *bs) +{ + bs->io_limits_enabled = false; + + while (qemu_co_queue_next(&bs->throttled_reqs)); + + if (bs->block_timer) { + qemu_del_timer(bs->block_timer); + qemu_free_timer(bs->block_timer); + bs->block_timer = NULL; + } + + bs->slice_start = 0; + bs->slice_end = 0; +} + +static void bdrv_block_timer(void *opaque) +{ + BlockDriverState *bs = opaque; + + qemu_co_queue_next(&bs->throttled_reqs); +} + +void bdrv_io_limits_enable(BlockDriverState *bs) +{ + qemu_co_queue_init(&bs->throttled_reqs); + bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs); + bs->io_limits_enabled = true; +} + +bool bdrv_io_limits_enabled(BlockDriverState *bs) +{ + BlockIOLimit *io_limits = &bs->io_limits; + return io_limits->bps[BLOCK_IO_LIMIT_READ] + || io_limits->bps[BLOCK_IO_LIMIT_WRITE] + || io_limits->bps[BLOCK_IO_LIMIT_TOTAL] + || io_limits->iops[BLOCK_IO_LIMIT_READ] + || io_limits->iops[BLOCK_IO_LIMIT_WRITE] + || io_limits->iops[BLOCK_IO_LIMIT_TOTAL]; +} + +static void bdrv_io_limits_intercept(BlockDriverState *bs, + bool is_write, int nb_sectors) +{ + int64_t wait_time = -1; + + if (!qemu_co_queue_empty(&bs->throttled_reqs)) { + qemu_co_queue_wait(&bs->throttled_reqs); + } + + /* In fact, we hope to keep each request's timing, in FIFO mode. The next + * throttled requests will not be dequeued until the current request is + * allowed to be serviced. So if the current request still exceeds the + * limits, it will be inserted to the head. All requests followed it will + * be still in throttled_reqs queue. + */ + + while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) { + qemu_mod_timer(bs->block_timer, + wait_time + qemu_get_clock_ns(vm_clock)); + qemu_co_queue_wait_insert_head(&bs->throttled_reqs); + } + + qemu_co_queue_next(&bs->throttled_reqs); +} + +/* check if the path starts with "<protocol>:" */ +static int path_has_protocol(const char *path) +{ + const char *p; + +#ifdef _WIN32 + if (is_windows_drive(path) || + is_windows_drive_prefix(path)) { + return 0; + } + p = path + strcspn(path, ":/\\"); +#else + p = path + strcspn(path, ":/"); +#endif + + return *p == ':'; +} + +int path_is_absolute(const char *path) +{ +#ifdef _WIN32 + /* specific case for names like: "\\.\d:" */ + if (is_windows_drive(path) || is_windows_drive_prefix(path)) { + return 1; + } + return (*path == '/' || *path == '\\'); +#else + return (*path == '/'); +#endif +} + +/* if filename is absolute, just copy it to dest. Otherwise, build a + path to it by considering it is relative to base_path. URL are + supported. */ +void path_combine(char *dest, int dest_size, + const char *base_path, + const char *filename) +{ + const char *p, *p1; + int len; + + if (dest_size <= 0) + return; + if (path_is_absolute(filename)) { + pstrcpy(dest, dest_size, filename); + } else { + p = strchr(base_path, ':'); + if (p) + p++; + else + p = base_path; + p1 = strrchr(base_path, '/'); +#ifdef _WIN32 + { + const char *p2; + p2 = strrchr(base_path, '\\'); + if (!p1 || p2 > p1) + p1 = p2; + } +#endif + if (p1) + p1++; + else + p1 = base_path; + if (p1 > p) + p = p1; + len = p - base_path; + if (len > dest_size - 1) + len = dest_size - 1; + memcpy(dest, base_path, len); + dest[len] = '\0'; + pstrcat(dest, dest_size, filename); + } +} + +void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz) +{ + if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) { + pstrcpy(dest, sz, bs->backing_file); + } else { + path_combine(dest, sz, bs->filename, bs->backing_file); + } +} + +void bdrv_register(BlockDriver *bdrv) +{ + /* Block drivers without coroutine functions need emulation */ + if (!bdrv->bdrv_co_readv) { + bdrv->bdrv_co_readv = bdrv_co_readv_em; + bdrv->bdrv_co_writev = bdrv_co_writev_em; + + /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if + * the block driver lacks aio we need to emulate that too. + */ + if (!bdrv->bdrv_aio_readv) { + /* add AIO emulation layer */ + bdrv->bdrv_aio_readv = bdrv_aio_readv_em; + bdrv->bdrv_aio_writev = bdrv_aio_writev_em; + } + } + + QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list); +} + +/* create a new block device (by default it is empty) */ +BlockDriverState *bdrv_new(const char *device_name) +{ + BlockDriverState *bs; + + bs = g_malloc0(sizeof(BlockDriverState)); + pstrcpy(bs->device_name, sizeof(bs->device_name), device_name); + if (device_name[0] != '\0') { + QTAILQ_INSERT_TAIL(&bdrv_states, bs, list); + } + bdrv_iostatus_disable(bs); + notifier_list_init(&bs->close_notifiers); + notifier_with_return_list_init(&bs->before_write_notifiers); + + return bs; +} + +void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify) +{ + notifier_list_add(&bs->close_notifiers, notify); +} + +BlockDriver *bdrv_find_format(const char *format_name) +{ + BlockDriver *drv1; + QLIST_FOREACH(drv1, &bdrv_drivers, list) { + if (!strcmp(drv1->format_name, format_name)) { + return drv1; + } + } + return NULL; +} + +static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only) +{ + static const char *whitelist_rw[] = { + CONFIG_BDRV_RW_WHITELIST + }; + static const char *whitelist_ro[] = { + CONFIG_BDRV_RO_WHITELIST + }; + const char **p; + + if (!whitelist_rw[0] && !whitelist_ro[0]) { + return 1; /* no whitelist, anything goes */ + } + + for (p = whitelist_rw; *p; p++) { + if (!strcmp(drv->format_name, *p)) { + return 1; + } + } + if (read_only) { + for (p = whitelist_ro; *p; p++) { + if (!strcmp(drv->format_name, *p)) { + return 1; + } + } + } + return 0; +} + +BlockDriver *bdrv_find_whitelisted_format(const char *format_name, + bool read_only) +{ + BlockDriver *drv = bdrv_find_format(format_name); + return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL; +} + +typedef struct CreateCo { + BlockDriver *drv; + char *filename; + QEMUOptionParameter *options; + int ret; +} CreateCo; + +static void coroutine_fn bdrv_create_co_entry(void *opaque) +{ + CreateCo *cco = opaque; + assert(cco->drv); + + cco->ret = cco->drv->bdrv_create(cco->filename, cco->options); +} + +int bdrv_create(BlockDriver *drv, const char* filename, + QEMUOptionParameter *options) +{ + int ret; + + Coroutine *co; + CreateCo cco = { + .drv = drv, + .filename = g_strdup(filename), + .options = options, + .ret = NOT_DONE, + }; + + if (!drv->bdrv_create) { + ret = -ENOTSUP; + goto out; + } + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_create_co_entry(&cco); + } else { + co = qemu_coroutine_create(bdrv_create_co_entry); + qemu_coroutine_enter(co, &cco); + while (cco.ret == NOT_DONE) { + qemu_aio_wait(); + } + } + + ret = cco.ret; + +out: + g_free(cco.filename); + return ret; +} + +int bdrv_create_file(const char* filename, QEMUOptionParameter *options) +{ + BlockDriver *drv; + + drv = bdrv_find_protocol(filename, true); + if (drv == NULL) { + return -ENOENT; + } + + return bdrv_create(drv, filename, options); +} + +/* + * Create a uniquely-named empty temporary file. + * Return 0 upon success, otherwise a negative errno value. + */ +int get_tmp_filename(char *filename, int size) +{ +#ifdef _WIN32 + char temp_dir[MAX_PATH]; + /* GetTempFileName requires that its output buffer (4th param) + have length MAX_PATH or greater. */ + assert(size >= MAX_PATH); + return (GetTempPath(MAX_PATH, temp_dir) + && GetTempFileName(temp_dir, "qem", 0, filename) + ? 0 : -GetLastError()); +#else + int fd; + const char *tmpdir; + tmpdir = getenv("TMPDIR"); + if (!tmpdir) + tmpdir = "/tmp"; + if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) { + return -EOVERFLOW; + } + fd = mkstemp(filename); + if (fd < 0) { + return -errno; + } + if (close(fd) != 0) { + unlink(filename); + return -errno; + } + return 0; +#endif +} + +/* + * Detect host devices. By convention, /dev/cdrom[N] is always + * recognized as a host CDROM. + */ +static BlockDriver *find_hdev_driver(const char *filename) +{ + int score_max = 0, score; + BlockDriver *drv = NULL, *d; + + QLIST_FOREACH(d, &bdrv_drivers, list) { + if (d->bdrv_probe_device) { + score = d->bdrv_probe_device(filename); + if (score > score_max) { + score_max = score; + drv = d; + } + } + } + + return drv; +} + +BlockDriver *bdrv_find_protocol(const char *filename, + bool allow_protocol_prefix) +{ + BlockDriver *drv1; + char protocol[128]; + int len; + const char *p; + + /* TODO Drivers without bdrv_file_open must be specified explicitly */ + + /* + * XXX(hch): we really should not let host device detection + * override an explicit protocol specification, but moving this + * later breaks access to device names with colons in them. + * Thanks to the brain-dead persistent naming schemes on udev- + * based Linux systems those actually are quite common. + */ + drv1 = find_hdev_driver(filename); + if (drv1) { + return drv1; + } + + if (!path_has_protocol(filename) || !allow_protocol_prefix) { + return bdrv_find_format("file"); + } + + p = strchr(filename, ':'); + assert(p != NULL); + len = p - filename; + if (len > sizeof(protocol) - 1) + len = sizeof(protocol) - 1; + memcpy(protocol, filename, len); + protocol[len] = '\0'; + QLIST_FOREACH(drv1, &bdrv_drivers, list) { + if (drv1->protocol_name && + !strcmp(drv1->protocol_name, protocol)) { + return drv1; + } + } + return NULL; +} + +static int find_image_format(BlockDriverState *bs, const char *filename, + BlockDriver **pdrv) +{ + int score, score_max; + BlockDriver *drv1, *drv; + uint8_t buf[2048]; + int ret = 0; + + /* Return the raw BlockDriver * to scsi-generic devices or empty drives */ + if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) { + drv = bdrv_find_format("raw"); + if (!drv) { + ret = -ENOENT; + } + *pdrv = drv; + return ret; + } + + ret = bdrv_pread(bs, 0, buf, sizeof(buf)); + if (ret < 0) { + *pdrv = NULL; + return ret; + } + + score_max = 0; + drv = NULL; + QLIST_FOREACH(drv1, &bdrv_drivers, list) { + if (drv1->bdrv_probe) { + score = drv1->bdrv_probe(buf, ret, filename); + if (score > score_max) { + score_max = score; + drv = drv1; + } + } + } + if (!drv) { + ret = -ENOENT; + } + *pdrv = drv; + return ret; +} + +/** + * Set the current 'total_sectors' value + */ +static int refresh_total_sectors(BlockDriverState *bs, int64_t hint) +{ + BlockDriver *drv = bs->drv; + + /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */ + if (bs->sg) + return 0; + + /* query actual device if possible, otherwise just trust the hint */ + if (drv->bdrv_getlength) { + int64_t length = drv->bdrv_getlength(bs); + if (length < 0) { + return length; + } + hint = length >> BDRV_SECTOR_BITS; + } + + bs->total_sectors = hint; + return 0; +} + +/** + * Set open flags for a given discard mode + * + * Return 0 on success, -1 if the discard mode was invalid. + */ +int bdrv_parse_discard_flags(const char *mode, int *flags) +{ + *flags &= ~BDRV_O_UNMAP; + + if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) { + /* do nothing */ + } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) { + *flags |= BDRV_O_UNMAP; + } else { + return -1; + } + + return 0; +} + +/** + * Set open flags for a given cache mode + * + * Return 0 on success, -1 if the cache mode was invalid. + */ +int bdrv_parse_cache_flags(const char *mode, int *flags) +{ + *flags &= ~BDRV_O_CACHE_MASK; + + if (!strcmp(mode, "off") || !strcmp(mode, "none")) { + *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB; + } else if (!strcmp(mode, "directsync")) { + *flags |= BDRV_O_NOCACHE; + } else if (!strcmp(mode, "writeback")) { + *flags |= BDRV_O_CACHE_WB; + } else if (!strcmp(mode, "unsafe")) { + *flags |= BDRV_O_CACHE_WB; + *flags |= BDRV_O_NO_FLUSH; + } else if (!strcmp(mode, "writethrough")) { + /* this is the default */ + } else { + return -1; + } + + return 0; +} + +/** + * The copy-on-read flag is actually a reference count so multiple users may + * use the feature without worrying about clobbering its previous state. + * Copy-on-read stays enabled until all users have called to disable it. + */ +void bdrv_enable_copy_on_read(BlockDriverState *bs) +{ + bs->copy_on_read++; +} + +void bdrv_disable_copy_on_read(BlockDriverState *bs) +{ + assert(bs->copy_on_read > 0); + bs->copy_on_read--; +} + +static int bdrv_open_flags(BlockDriverState *bs, int flags) +{ + int open_flags = flags | BDRV_O_CACHE_WB; + + /* + * Clear flags that are internal to the block layer before opening the + * image. + */ + open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING); + + /* + * Snapshots should be writable. + */ + if (bs->is_temporary) { + open_flags |= BDRV_O_RDWR; + } + + return open_flags; +} + +/* + * Common part for opening disk images and files + * + * Removes all processed options from *options. + */ +static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, + QDict *options, int flags, BlockDriver *drv) +{ + int ret, open_flags; + const char *filename; + + assert(drv != NULL); + assert(bs->file == NULL); + assert(options != NULL && bs->options != options); + + if (file != NULL) { + filename = file->filename; + } else { + filename = qdict_get_try_str(options, "filename"); + } + + trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name); + + /* bdrv_open() with directly using a protocol as drv. This layer is already + * opened, so assign it to bs (while file becomes a closed BlockDriverState) + * and return immediately. */ + if (file != NULL && drv->bdrv_file_open) { + bdrv_swap(file, bs); + return 0; + } + + bs->open_flags = flags; + bs->buffer_alignment = 512; + open_flags = bdrv_open_flags(bs, flags); + bs->read_only = !(open_flags & BDRV_O_RDWR); + + if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) { + return -ENOTSUP; + } + + assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */ + if (!bs->read_only && (flags & BDRV_O_COPY_ON_READ)) { + bdrv_enable_copy_on_read(bs); + } + + if (filename != NULL) { + pstrcpy(bs->filename, sizeof(bs->filename), filename); + } else { + bs->filename[0] = '\0'; + } + + bs->drv = drv; + bs->opaque = g_malloc0(drv->instance_size); + + bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB); + + /* Open the image, either directly or using a protocol */ + if (drv->bdrv_file_open) { + assert(file == NULL); + assert(drv->bdrv_parse_filename || filename != NULL); + ret = drv->bdrv_file_open(bs, options, open_flags); + } else { + if (file == NULL) { + qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't use '%s' as a " + "block driver for the protocol level", + drv->format_name); + ret = -EINVAL; + goto free_and_fail; + } + assert(file != NULL); + bs->file = file; + ret = drv->bdrv_open(bs, options, open_flags); + } + + if (ret < 0) { + goto free_and_fail; + } + + ret = refresh_total_sectors(bs, bs->total_sectors); + if (ret < 0) { + goto free_and_fail; + } + +#ifndef _WIN32 + if (bs->is_temporary) { + assert(filename != NULL); + unlink(filename); + } +#endif + return 0; + +free_and_fail: + bs->file = NULL; + g_free(bs->opaque); + bs->opaque = NULL; + bs->drv = NULL; + return ret; +} + +/* + * Opens a file using a protocol (file, host_device, nbd, ...) + * + * options is a QDict of options to pass to the block drivers, or NULL for an + * empty set of options. The reference to the QDict belongs to the block layer + * after the call (even on failure), so if the caller intends to reuse the + * dictionary, it needs to use QINCREF() before calling bdrv_file_open. + */ +int bdrv_file_open(BlockDriverState **pbs, const char *filename, + QDict *options, int flags) +{ + BlockDriverState *bs; + BlockDriver *drv; + const char *drvname; + bool allow_protocol_prefix = false; + int ret; + + /* NULL means an empty set of options */ + if (options == NULL) { + options = qdict_new(); + } + + bs = bdrv_new(""); + bs->options = options; + options = qdict_clone_shallow(options); + + /* Fetch the file name from the options QDict if necessary */ + if (!filename) { + filename = qdict_get_try_str(options, "filename"); + } else if (filename && !qdict_haskey(options, "filename")) { + qdict_put(options, "filename", qstring_from_str(filename)); + allow_protocol_prefix = true; + } else { + qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't specify 'file' and " + "'filename' options at the same time"); + ret = -EINVAL; + goto fail; + } + + /* Find the right block driver */ + drvname = qdict_get_try_str(options, "driver"); + if (drvname) { + drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR)); + qdict_del(options, "driver"); + } else if (filename) { + drv = bdrv_find_protocol(filename, allow_protocol_prefix); + if (!drv) { + qerror_report(ERROR_CLASS_GENERIC_ERROR, "Unknown protocol"); + } + } else { + qerror_report(ERROR_CLASS_GENERIC_ERROR, + "Must specify either driver or file"); + drv = NULL; + } + + if (!drv) { + ret = -ENOENT; + goto fail; + } + + /* Parse the filename and open it */ + if (drv->bdrv_parse_filename && filename) { + Error *local_err = NULL; + drv->bdrv_parse_filename(filename, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + ret = -EINVAL; + goto fail; + } + qdict_del(options, "filename"); + } else if (!drv->bdrv_parse_filename && !filename) { + qerror_report(ERROR_CLASS_GENERIC_ERROR, + "The '%s' block driver requires a file name", + drv->format_name); + ret = -EINVAL; + goto fail; + } + + ret = bdrv_open_common(bs, NULL, options, flags, drv); + if (ret < 0) { + goto fail; + } + + /* Check if any unknown options were used */ + if (qdict_size(options) != 0) { + const QDictEntry *entry = qdict_first(options); + qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block protocol '%s' doesn't " + "support the option '%s'", + drv->format_name, entry->key); + ret = -EINVAL; + goto fail; + } + QDECREF(options); + + bs->growable = 1; + *pbs = bs; + return 0; + +fail: + QDECREF(options); + if (!bs->drv) { + QDECREF(bs->options); + } + bdrv_delete(bs); + return ret; +} + +/* + * Opens the backing file for a BlockDriverState if not yet open + * + * options is a QDict of options to pass to the block drivers, or NULL for an + * empty set of options. The reference to the QDict is transferred to this + * function (even on failure), so if the caller intends to reuse the dictionary, + * it needs to use QINCREF() before calling bdrv_file_open. + */ +int bdrv_open_backing_file(BlockDriverState *bs, QDict *options) +{ + char backing_filename[PATH_MAX]; + int back_flags, ret; + BlockDriver *back_drv = NULL; + + if (bs->backing_hd != NULL) { + QDECREF(options); + return 0; + } + + /* NULL means an empty set of options */ + if (options == NULL) { + options = qdict_new(); + } + + bs->open_flags &= ~BDRV_O_NO_BACKING; + if (qdict_haskey(options, "file.filename")) { + backing_filename[0] = '\0'; + } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) { + QDECREF(options); + return 0; + } + + bs->backing_hd = bdrv_new(""); + bdrv_get_full_backing_filename(bs, backing_filename, + sizeof(backing_filename)); + + if (bs->backing_format[0] != '\0') { + back_drv = bdrv_find_format(bs->backing_format); + } + + /* backing files always opened read-only */ + back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT); + + ret = bdrv_open(bs->backing_hd, + *backing_filename ? backing_filename : NULL, options, + back_flags, back_drv); + if (ret < 0) { + bdrv_delete(bs->backing_hd); + bs->backing_hd = NULL; + bs->open_flags |= BDRV_O_NO_BACKING; + return ret; + } + return 0; +} + +static void extract_subqdict(QDict *src, QDict **dst, const char *start) +{ + const QDictEntry *entry, *next; + const char *p; + + *dst = qdict_new(); + entry = qdict_first(src); + + while (entry != NULL) { + next = qdict_next(src, entry); + if (strstart(entry->key, start, &p)) { + qobject_incref(entry->value); + qdict_put_obj(*dst, p, entry->value); + qdict_del(src, entry->key); + } + entry = next; + } +} + +/* + * Opens a disk image (raw, qcow2, vmdk, ...) + * + * options is a QDict of options to pass to the block drivers, or NULL for an + * empty set of options. The reference to the QDict belongs to the block layer + * after the call (even on failure), so if the caller intends to reuse the + * dictionary, it needs to use QINCREF() before calling bdrv_open. + */ +int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, + int flags, BlockDriver *drv) +{ + int ret; + /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */ + char tmp_filename[PATH_MAX + 1]; + BlockDriverState *file = NULL; + QDict *file_options = NULL; + + /* NULL means an empty set of options */ + if (options == NULL) { + options = qdict_new(); + } + + bs->options = options; + options = qdict_clone_shallow(options); + + /* For snapshot=on, create a temporary qcow2 overlay */ + if (flags & BDRV_O_SNAPSHOT) { + BlockDriverState *bs1; + int64_t total_size; + BlockDriver *bdrv_qcow2; + QEMUOptionParameter *create_options; + char backing_filename[PATH_MAX]; + + if (qdict_size(options) != 0) { + error_report("Can't use snapshot=on with driver-specific options"); + ret = -EINVAL; + goto fail; + } + assert(filename != NULL); + + /* if snapshot, we create a temporary backing file and open it + instead of opening 'filename' directly */ + + /* if there is a backing file, use it */ + bs1 = bdrv_new(""); + ret = bdrv_open(bs1, filename, NULL, 0, drv); + if (ret < 0) { + bdrv_delete(bs1); + goto fail; + } + total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK; + + bdrv_delete(bs1); + + ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename)); + if (ret < 0) { + goto fail; + } + + /* Real path is meaningless for protocols */ + if (path_has_protocol(filename)) { + snprintf(backing_filename, sizeof(backing_filename), + "%s", filename); + } else if (!realpath(filename, backing_filename)) { + ret = -errno; + goto fail; + } + + bdrv_qcow2 = bdrv_find_format("qcow2"); + create_options = parse_option_parameters("", bdrv_qcow2->create_options, + NULL); + + set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size); + set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE, + backing_filename); + if (drv) { + set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT, + drv->format_name); + } + + ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options); + free_option_parameters(create_options); + if (ret < 0) { + goto fail; + } + + filename = tmp_filename; + drv = bdrv_qcow2; + bs->is_temporary = 1; + } + + /* Open image file without format layer */ + if (flags & BDRV_O_RDWR) { + flags |= BDRV_O_ALLOW_RDWR; + } + + extract_subqdict(options, &file_options, "file."); + + ret = bdrv_file_open(&file, filename, file_options, + bdrv_open_flags(bs, flags | BDRV_O_UNMAP)); + if (ret < 0) { + goto fail; + } + + /* Find the right image format driver */ + if (!drv) { + ret = find_image_format(file, filename, &drv); + } + + if (!drv) { + goto unlink_and_fail; + } + + /* Open the image */ + ret = bdrv_open_common(bs, file, options, flags, drv); + if (ret < 0) { + goto unlink_and_fail; + } + + if (bs->file != file) { + bdrv_delete(file); + file = NULL; + } + + /* If there is a backing file, use it */ + if ((flags & BDRV_O_NO_BACKING) == 0) { + QDict *backing_options; + + extract_subqdict(options, &backing_options, "backing."); + ret = bdrv_open_backing_file(bs, backing_options); + if (ret < 0) { + goto close_and_fail; + } + } + + /* Check if any unknown options were used */ + if (qdict_size(options) != 0) { + const QDictEntry *entry = qdict_first(options); + qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by " + "device '%s' doesn't support the option '%s'", + drv->format_name, bs->device_name, entry->key); + + ret = -EINVAL; + goto close_and_fail; + } + QDECREF(options); + + if (!bdrv_key_required(bs)) { + bdrv_dev_change_media_cb(bs, true); + } + + /* throttling disk I/O limits */ + if (bs->io_limits_enabled) { + bdrv_io_limits_enable(bs); + } + + return 0; + +unlink_and_fail: + if (file != NULL) { + bdrv_delete(file); + } + if (bs->is_temporary) { + unlink(filename); + } +fail: + QDECREF(bs->options); + QDECREF(options); + bs->options = NULL; + return ret; + +close_and_fail: + bdrv_close(bs); + QDECREF(options); + return ret; +} + +typedef struct BlockReopenQueueEntry { + bool prepared; + BDRVReopenState state; + QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry; +} BlockReopenQueueEntry; + +/* + * Adds a BlockDriverState to a simple queue for an atomic, transactional + * reopen of multiple devices. + * + * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT + * already performed, or alternatively may be NULL a new BlockReopenQueue will + * be created and initialized. This newly created BlockReopenQueue should be + * passed back in for subsequent calls that are intended to be of the same + * atomic 'set'. + * + * bs is the BlockDriverState to add to the reopen queue. + * + * flags contains the open flags for the associated bs + * + * returns a pointer to bs_queue, which is either the newly allocated + * bs_queue, or the existing bs_queue being used. + * + */ +BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, + BlockDriverState *bs, int flags) +{ + assert(bs != NULL); + + BlockReopenQueueEntry *bs_entry; + if (bs_queue == NULL) { + bs_queue = g_new0(BlockReopenQueue, 1); + QSIMPLEQ_INIT(bs_queue); + } + + if (bs->file) { + bdrv_reopen_queue(bs_queue, bs->file, flags); + } + + bs_entry = g_new0(BlockReopenQueueEntry, 1); + QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry); + + bs_entry->state.bs = bs; + bs_entry->state.flags = flags; + + return bs_queue; +} + +/* + * Reopen multiple BlockDriverStates atomically & transactionally. + * + * The queue passed in (bs_queue) must have been built up previous + * via bdrv_reopen_queue(). + * + * Reopens all BDS specified in the queue, with the appropriate + * flags. All devices are prepared for reopen, and failure of any + * device will cause all device changes to be abandonded, and intermediate + * data cleaned up. + * + * If all devices prepare successfully, then the changes are committed + * to all devices. + * + */ +int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) +{ + int ret = -1; + BlockReopenQueueEntry *bs_entry, *next; + Error *local_err = NULL; + + assert(bs_queue != NULL); + + bdrv_drain_all(); + + QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) { + if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) { + error_propagate(errp, local_err); + goto cleanup; + } + bs_entry->prepared = true; + } + + /* If we reach this point, we have success and just need to apply the + * changes + */ + QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) { + bdrv_reopen_commit(&bs_entry->state); + } + + ret = 0; + +cleanup: + QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { + if (ret && bs_entry->prepared) { + bdrv_reopen_abort(&bs_entry->state); + } + g_free(bs_entry); + } + g_free(bs_queue); + return ret; +} + + +/* Reopen a single BlockDriverState with the specified flags. */ +int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp) +{ + int ret = -1; + Error *local_err = NULL; + BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags); + + ret = bdrv_reopen_multiple(queue, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + } + return ret; +} + + +/* + * Prepares a BlockDriverState for reopen. All changes are staged in the + * 'opaque' field of the BDRVReopenState, which is used and allocated by + * the block driver layer .bdrv_reopen_prepare() + * + * bs is the BlockDriverState to reopen + * flags are the new open flags + * queue is the reopen queue + * + * Returns 0 on success, non-zero on error. On error errp will be set + * as well. + * + * On failure, bdrv_reopen_abort() will be called to clean up any data. + * It is the responsibility of the caller to then call the abort() or + * commit() for any other BDS that have been left in a prepare() state + * + */ +int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, + Error **errp) +{ + int ret = -1; + Error *local_err = NULL; + BlockDriver *drv; + + assert(reopen_state != NULL); + assert(reopen_state->bs->drv != NULL); + drv = reopen_state->bs->drv; + + /* if we are to stay read-only, do not allow permission change + * to r/w */ + if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) && + reopen_state->flags & BDRV_O_RDWR) { + error_set(errp, QERR_DEVICE_IS_READ_ONLY, + reopen_state->bs->device_name); + goto error; + } + + + ret = bdrv_flush(reopen_state->bs); + if (ret) { + error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive", + strerror(-ret)); + goto error; + } + + if (drv->bdrv_reopen_prepare) { + ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err); + if (ret) { + if (local_err != NULL) { + error_propagate(errp, local_err); + } else { + error_setg(errp, "failed while preparing to reopen image '%s'", + reopen_state->bs->filename); + } + goto error; + } + } else { + /* It is currently mandatory to have a bdrv_reopen_prepare() + * handler for each supported drv. */ + error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + drv->format_name, reopen_state->bs->device_name, + "reopening of file"); + ret = -1; + goto error; + } + + ret = 0; + +error: + return ret; +} + +/* + * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and + * makes them final by swapping the staging BlockDriverState contents into + * the active BlockDriverState contents. + */ +void bdrv_reopen_commit(BDRVReopenState *reopen_state) +{ + BlockDriver *drv; + + assert(reopen_state != NULL); + drv = reopen_state->bs->drv; + assert(drv != NULL); + + /* If there are any driver level actions to take */ + if (drv->bdrv_reopen_commit) { + drv->bdrv_reopen_commit(reopen_state); + } + + /* set BDS specific flags now */ + reopen_state->bs->open_flags = reopen_state->flags; + reopen_state->bs->enable_write_cache = !!(reopen_state->flags & + BDRV_O_CACHE_WB); + reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR); +} + +/* + * Abort the reopen, and delete and free the staged changes in + * reopen_state + */ +void bdrv_reopen_abort(BDRVReopenState *reopen_state) +{ + BlockDriver *drv; + + assert(reopen_state != NULL); + drv = reopen_state->bs->drv; + assert(drv != NULL); + + if (drv->bdrv_reopen_abort) { + drv->bdrv_reopen_abort(reopen_state); + } +} + + +void bdrv_close(BlockDriverState *bs) +{ + if (bs->job) { + block_job_cancel_sync(bs->job); + } + bdrv_drain_all(); /* complete I/O */ + bdrv_flush(bs); + bdrv_drain_all(); /* in case flush left pending I/O */ + notifier_list_notify(&bs->close_notifiers, bs); + + if (bs->drv) { + if (bs->backing_hd) { + bdrv_delete(bs->backing_hd); + bs->backing_hd = NULL; + } + bs->drv->bdrv_close(bs); + g_free(bs->opaque); +#ifdef _WIN32 + if (bs->is_temporary) { + unlink(bs->filename); + } +#endif + bs->opaque = NULL; + bs->drv = NULL; + bs->copy_on_read = 0; + bs->backing_file[0] = '\0'; + bs->backing_format[0] = '\0'; + bs->total_sectors = 0; + bs->encrypted = 0; + bs->valid_key = 0; + bs->sg = 0; + bs->growable = 0; + QDECREF(bs->options); + bs->options = NULL; + + if (bs->file != NULL) { + bdrv_delete(bs->file); + bs->file = NULL; + } + } + + bdrv_dev_change_media_cb(bs, false); + + /*throttling disk I/O limits*/ + if (bs->io_limits_enabled) { + bdrv_io_limits_disable(bs); + } +} + +void bdrv_close_all(void) +{ + BlockDriverState *bs; + + QTAILQ_FOREACH(bs, &bdrv_states, list) { + bdrv_close(bs); + } +} + +/* + * Wait for pending requests to complete across all BlockDriverStates + * + * This function does not flush data to disk, use bdrv_flush_all() for that + * after calling this function. + * + * Note that completion of an asynchronous I/O operation can trigger any + * number of other I/O operations on other devices---for example a coroutine + * can be arbitrarily complex and a constant flow of I/O can come until the + * coroutine is complete. Because of this, it is not possible to have a + * function to drain a single device's I/O queue. + */ +void bdrv_drain_all(void) +{ + BlockDriverState *bs; + bool busy; + + do { + busy = qemu_aio_wait(); + + /* FIXME: We do not have timer support here, so this is effectively + * a busy wait. + */ + QTAILQ_FOREACH(bs, &bdrv_states, list) { + if (!qemu_co_queue_empty(&bs->throttled_reqs)) { + qemu_co_queue_restart_all(&bs->throttled_reqs); + busy = true; + } + } + } while (busy); + + /* If requests are still pending there is a bug somewhere */ + QTAILQ_FOREACH(bs, &bdrv_states, list) { + assert(QLIST_EMPTY(&bs->tracked_requests)); + assert(qemu_co_queue_empty(&bs->throttled_reqs)); + } +} + +/* make a BlockDriverState anonymous by removing from bdrv_state list. + Also, NULL terminate the device_name to prevent double remove */ +void bdrv_make_anon(BlockDriverState *bs) +{ + if (bs->device_name[0] != '\0') { + QTAILQ_REMOVE(&bdrv_states, bs, list); + } + bs->device_name[0] = '\0'; +} + +static void bdrv_rebind(BlockDriverState *bs) +{ + if (bs->drv && bs->drv->bdrv_rebind) { + bs->drv->bdrv_rebind(bs); + } +} + +static void bdrv_move_feature_fields(BlockDriverState *bs_dest, + BlockDriverState *bs_src) +{ + /* move some fields that need to stay attached to the device */ + bs_dest->open_flags = bs_src->open_flags; + + /* dev info */ + bs_dest->dev_ops = bs_src->dev_ops; + bs_dest->dev_opaque = bs_src->dev_opaque; + bs_dest->dev = bs_src->dev; + bs_dest->buffer_alignment = bs_src->buffer_alignment; + bs_dest->copy_on_read = bs_src->copy_on_read; + + bs_dest->enable_write_cache = bs_src->enable_write_cache; + + /* i/o timing parameters */ + bs_dest->slice_start = bs_src->slice_start; + bs_dest->slice_end = bs_src->slice_end; + bs_dest->slice_submitted = bs_src->slice_submitted; + bs_dest->io_limits = bs_src->io_limits; + bs_dest->throttled_reqs = bs_src->throttled_reqs; + bs_dest->block_timer = bs_src->block_timer; + bs_dest->io_limits_enabled = bs_src->io_limits_enabled; + + /* r/w error */ + bs_dest->on_read_error = bs_src->on_read_error; + bs_dest->on_write_error = bs_src->on_write_error; + + /* i/o status */ + bs_dest->iostatus_enabled = bs_src->iostatus_enabled; + bs_dest->iostatus = bs_src->iostatus; + + /* dirty bitmap */ + bs_dest->dirty_bitmap = bs_src->dirty_bitmap; + + /* job */ + bs_dest->in_use = bs_src->in_use; + bs_dest->job = bs_src->job; + + /* keep the same entry in bdrv_states */ + pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name), + bs_src->device_name); + bs_dest->list = bs_src->list; +} + +/* + * Swap bs contents for two image chains while they are live, + * while keeping required fields on the BlockDriverState that is + * actually attached to a device. + * + * This will modify the BlockDriverState fields, and swap contents + * between bs_new and bs_old. Both bs_new and bs_old are modified. + * + * bs_new is required to be anonymous. + * + * This function does not create any image files. + */ +void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old) +{ + BlockDriverState tmp; + + /* bs_new must be anonymous and shouldn't have anything fancy enabled */ + assert(bs_new->device_name[0] == '\0'); + assert(bs_new->dirty_bitmap == NULL); + assert(bs_new->job == NULL); + assert(bs_new->dev == NULL); + assert(bs_new->in_use == 0); + assert(bs_new->io_limits_enabled == false); + assert(bs_new->block_timer == NULL); + + tmp = *bs_new; + *bs_new = *bs_old; + *bs_old = tmp; + + /* there are some fields that should not be swapped, move them back */ + bdrv_move_feature_fields(&tmp, bs_old); + bdrv_move_feature_fields(bs_old, bs_new); + bdrv_move_feature_fields(bs_new, &tmp); + + /* bs_new shouldn't be in bdrv_states even after the swap! */ + assert(bs_new->device_name[0] == '\0'); + + /* Check a few fields that should remain attached to the device */ + assert(bs_new->dev == NULL); + assert(bs_new->job == NULL); + assert(bs_new->in_use == 0); + assert(bs_new->io_limits_enabled == false); + assert(bs_new->block_timer == NULL); + + bdrv_rebind(bs_new); + bdrv_rebind(bs_old); +} + +/* + * Add new bs contents at the top of an image chain while the chain is + * live, while keeping required fields on the top layer. + * + * This will modify the BlockDriverState fields, and swap contents + * between bs_new and bs_top. Both bs_new and bs_top are modified. + * + * bs_new is required to be anonymous. + * + * This function does not create any image files. + */ +void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top) +{ + bdrv_swap(bs_new, bs_top); + + /* The contents of 'tmp' will become bs_top, as we are + * swapping bs_new and bs_top contents. */ + bs_top->backing_hd = bs_new; + bs_top->open_flags &= ~BDRV_O_NO_BACKING; + pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file), + bs_new->filename); + pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format), + bs_new->drv ? bs_new->drv->format_name : ""); +} + +void bdrv_delete(BlockDriverState *bs) +{ + assert(!bs->dev); + assert(!bs->job); + assert(!bs->in_use); + + /* remove from list, if necessary */ + bdrv_make_anon(bs); + + bdrv_close(bs); + + g_free(bs); +} + +int bdrv_attach_dev(BlockDriverState *bs, void *dev) +/* TODO change to DeviceState *dev when all users are qdevified */ +{ + if (bs->dev) { + return -EBUSY; + } + bs->dev = dev; + bdrv_iostatus_reset(bs); + return 0; +} + +/* TODO qdevified devices don't use this, remove when devices are qdevified */ +void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev) +{ + if (bdrv_attach_dev(bs, dev) < 0) { + abort(); + } +} + +void bdrv_detach_dev(BlockDriverState *bs, void *dev) +/* TODO change to DeviceState *dev when all users are qdevified */ +{ + assert(bs->dev == dev); + bs->dev = NULL; + bs->dev_ops = NULL; + bs->dev_opaque = NULL; + bs->buffer_alignment = 512; +} + +/* TODO change to return DeviceState * when all users are qdevified */ +void *bdrv_get_attached_dev(BlockDriverState *bs) +{ + return bs->dev; +} + +void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops, + void *opaque) +{ + bs->dev_ops = ops; + bs->dev_opaque = opaque; +} + +void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv, + enum MonitorEvent ev, + BlockErrorAction action, bool is_read) +{ + QObject *data; + const char *action_str; + + switch (action) { + case BDRV_ACTION_REPORT: + action_str = "report"; + break; + case BDRV_ACTION_IGNORE: + action_str = "ignore"; + break; + case BDRV_ACTION_STOP: + action_str = "stop"; + break; + default: + abort(); + } + + data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }", + bdrv->device_name, + action_str, + is_read ? "read" : "write"); + monitor_protocol_event(ev, data); + + qobject_decref(data); +} + +static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected) +{ + QObject *data; + + data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }", + bdrv_get_device_name(bs), ejected); + monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data); + + qobject_decref(data); +} + +static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load) +{ + if (bs->dev_ops && bs->dev_ops->change_media_cb) { + bool tray_was_closed = !bdrv_dev_is_tray_open(bs); + bs->dev_ops->change_media_cb(bs->dev_opaque, load); + if (tray_was_closed) { + /* tray open */ + bdrv_emit_qmp_eject_event(bs, true); + } + if (load) { + /* tray close */ + bdrv_emit_qmp_eject_event(bs, false); + } + } +} + +bool bdrv_dev_has_removable_media(BlockDriverState *bs) +{ + return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb); +} + +void bdrv_dev_eject_request(BlockDriverState *bs, bool force) +{ + if (bs->dev_ops && bs->dev_ops->eject_request_cb) { + bs->dev_ops->eject_request_cb(bs->dev_opaque, force); + } +} + +bool bdrv_dev_is_tray_open(BlockDriverState *bs) +{ + if (bs->dev_ops && bs->dev_ops->is_tray_open) { + return bs->dev_ops->is_tray_open(bs->dev_opaque); + } + return false; +} + +static void bdrv_dev_resize_cb(BlockDriverState *bs) +{ + if (bs->dev_ops && bs->dev_ops->resize_cb) { + bs->dev_ops->resize_cb(bs->dev_opaque); + } +} + +bool bdrv_dev_is_medium_locked(BlockDriverState *bs) +{ + if (bs->dev_ops && bs->dev_ops->is_medium_locked) { + return bs->dev_ops->is_medium_locked(bs->dev_opaque); + } + return false; +} + +/* + * Run consistency checks on an image + * + * Returns 0 if the check could be completed (it doesn't mean that the image is + * free of errors) or -errno when an internal error occurred. The results of the + * check are stored in res. + */ +int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) +{ + if (bs->drv->bdrv_check == NULL) { + return -ENOTSUP; + } + + memset(res, 0, sizeof(*res)); + return bs->drv->bdrv_check(bs, res, fix); +} + +#define COMMIT_BUF_SECTORS 2048 + +/* commit COW file into the raw image */ +int bdrv_commit(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + int64_t sector, total_sectors; + int n, ro, open_flags; + int ret = 0; + uint8_t *buf; + char filename[PATH_MAX]; + + if (!drv) + return -ENOMEDIUM; + + if (!bs->backing_hd) { + return -ENOTSUP; + } + + if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) { + return -EBUSY; + } + + ro = bs->backing_hd->read_only; + /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */ + pstrcpy(filename, sizeof(filename), bs->backing_hd->filename); + open_flags = bs->backing_hd->open_flags; + + if (ro) { + if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) { + return -EACCES; + } + } + + total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; + buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); + + for (sector = 0; sector < total_sectors; sector += n) { + if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) { + + if (bdrv_read(bs, sector, buf, n) != 0) { + ret = -EIO; + goto ro_cleanup; + } + + if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) { + ret = -EIO; + goto ro_cleanup; + } + } + } + + if (drv->bdrv_make_empty) { + ret = drv->bdrv_make_empty(bs); + bdrv_flush(bs); + } + + /* + * Make sure all data we wrote to the backing device is actually + * stable on disk. + */ + if (bs->backing_hd) + bdrv_flush(bs->backing_hd); + +ro_cleanup: + g_free(buf); + + if (ro) { + /* ignoring error return here */ + bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL); + } + + return ret; +} + +int bdrv_commit_all(void) +{ + BlockDriverState *bs; + + QTAILQ_FOREACH(bs, &bdrv_states, list) { + if (bs->drv && bs->backing_hd) { + int ret = bdrv_commit(bs); + if (ret < 0) { + return ret; + } + } + } + return 0; +} + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. + */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ + QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, + BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, bool is_write) +{ + *req = (BdrvTrackedRequest){ + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .is_write = is_write, + .co = qemu_coroutine_self(), + }; + + qemu_co_queue_init(&req->wait_queue); + + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + +/** + * Round a region to cluster boundaries + */ +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_sector_num = sector_num; + *cluster_nb_sectors = nb_sectors; + } else { + int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; + *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); + *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + + nb_sectors, c); + } +} + +static bool tracked_request_overlaps(BdrvTrackedRequest *req, + int64_t sector_num, int nb_sectors) { + /* aaaa bbbb */ + if (sector_num >= req->sector_num + req->nb_sectors) { + return false; + } + /* bbbb aaaa */ + if (req->sector_num >= sector_num + nb_sectors) { + return false; + } + return true; +} + +static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + BdrvTrackedRequest *req; + int64_t cluster_sector_num; + int cluster_nb_sectors; + bool retry; + + /* If we touch the same cluster it counts as an overlap. This guarantees + * that allocating writes will be serialized and not race with each other + * for the same cluster. For example, in copy-on-read it ensures that the + * CoR read and write operations are atomic and guest writes cannot + * interleave between them. + */ + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + do { + retry = false; + QLIST_FOREACH(req, &bs->tracked_requests, list) { + if (tracked_request_overlaps(req, cluster_sector_num, + cluster_nb_sectors)) { + /* Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. + */ + assert(qemu_coroutine_self() != req->co); + + qemu_co_queue_wait(&req->wait_queue); + retry = true; + break; + } + } + } while (retry); +} + +/* + * Return values: + * 0 - success + * -EINVAL - backing format specified, but no file + * -ENOSPC - can't update the backing file because no space is left in the + * image file header + * -ENOTSUP - format driver doesn't support changing the backing file + */ +int bdrv_change_backing_file(BlockDriverState *bs, + const char *backing_file, const char *backing_fmt) +{ + BlockDriver *drv = bs->drv; + int ret; + + /* Backing file format doesn't make sense without a backing file */ + if (backing_fmt && !backing_file) { + return -EINVAL; + } + + if (drv->bdrv_change_backing_file != NULL) { + ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt); + } else { + ret = -ENOTSUP; + } + + if (ret == 0) { + pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); + pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); + } + return ret; +} + +/* + * Finds the image layer in the chain that has 'bs' as its backing file. + * + * active is the current topmost image. + * + * Returns NULL if bs is not found in active's image chain, + * or if active == bs. + */ +BlockDriverState *bdrv_find_overlay(BlockDriverState *active, + BlockDriverState *bs) +{ + BlockDriverState *overlay = NULL; + BlockDriverState *intermediate; + + assert(active != NULL); + assert(bs != NULL); + + /* if bs is the same as active, then by definition it has no overlay + */ + if (active == bs) { + return NULL; + } + + intermediate = active; + while (intermediate->backing_hd) { + if (intermediate->backing_hd == bs) { + overlay = intermediate; + break; + } + intermediate = intermediate->backing_hd; + } + + return overlay; +} + +typedef struct BlkIntermediateStates { + BlockDriverState *bs; + QSIMPLEQ_ENTRY(BlkIntermediateStates) entry; +} BlkIntermediateStates; + + +/* + * Drops images above 'base' up to and including 'top', and sets the image + * above 'top' to have base as its backing file. + * + * Requires that the overlay to 'top' is opened r/w, so that the backing file + * information in 'bs' can be properly updated. + * + * E.g., this will convert the following chain: + * bottom <- base <- intermediate <- top <- active + * + * to + * + * bottom <- base <- active + * + * It is allowed for bottom==base, in which case it converts: + * + * base <- intermediate <- top <- active + * + * to + * + * base <- active + * + * Error conditions: + * if active == top, that is considered an error + * + */ +int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, + BlockDriverState *base) +{ + BlockDriverState *intermediate; + BlockDriverState *base_bs = NULL; + BlockDriverState *new_top_bs = NULL; + BlkIntermediateStates *intermediate_state, *next; + int ret = -EIO; + + QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete; + QSIMPLEQ_INIT(&states_to_delete); + + if (!top->drv || !base->drv) { + goto exit; + } + + new_top_bs = bdrv_find_overlay(active, top); + + if (new_top_bs == NULL) { + /* we could not find the image above 'top', this is an error */ + goto exit; + } + + /* special case of new_top_bs->backing_hd already pointing to base - nothing + * to do, no intermediate images */ + if (new_top_bs->backing_hd == base) { + ret = 0; + goto exit; + } + + intermediate = top; + + /* now we will go down through the list, and add each BDS we find + * into our deletion queue, until we hit the 'base' + */ + while (intermediate) { + intermediate_state = g_malloc0(sizeof(BlkIntermediateStates)); + intermediate_state->bs = intermediate; + QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry); + + if (intermediate->backing_hd == base) { + base_bs = intermediate->backing_hd; + break; + } + intermediate = intermediate->backing_hd; + } + if (base_bs == NULL) { + /* something went wrong, we did not end at the base. safely + * unravel everything, and exit with error */ + goto exit; + } + + /* success - we can delete the intermediate states, and link top->base */ + ret = bdrv_change_backing_file(new_top_bs, base_bs->filename, + base_bs->drv ? base_bs->drv->format_name : ""); + if (ret) { + goto exit; + } + new_top_bs->backing_hd = base_bs; + + + QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { + /* so that bdrv_close() does not recursively close the chain */ + intermediate_state->bs->backing_hd = NULL; + bdrv_delete(intermediate_state->bs); + } + ret = 0; + +exit: + QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { + g_free(intermediate_state); + } + return ret; +} + + +static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, + size_t size) +{ + int64_t len; + + if (!bdrv_is_inserted(bs)) + return -ENOMEDIUM; + + if (bs->growable) + return 0; + + len = bdrv_getlength(bs); + + if (offset < 0) + return -EIO; + + if ((offset > len) || (len - offset < size)) + return -EIO; + + return 0; +} + +static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE); +} + +typedef struct RwCo { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + QEMUIOVector *qiov; + bool is_write; + int ret; +} RwCo; + +static void coroutine_fn bdrv_rw_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + if (!rwco->is_write) { + rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num, + rwco->nb_sectors, rwco->qiov, 0); + } else { + rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num, + rwco->nb_sectors, rwco->qiov, 0); + } +} + +/* + * Process a vectored synchronous request using coroutines + */ +static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, bool is_write) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = qiov->size >> BDRV_SECTOR_BITS, + .qiov = qiov, + .is_write = is_write, + .ret = NOT_DONE, + }; + assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0); + + /** + * In sync call context, when the vcpu is blocked, this throttling timer + * will not fire; so the I/O throttling function has to be disabled here + * if it has been enabled. + */ + if (bs->io_limits_enabled) { + fprintf(stderr, "Disabling I/O throttling on '%s' due " + "to synchronous I/O.\n", bdrv_get_device_name(bs)); + bdrv_io_limits_disable(bs); + } + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_rw_co_entry(&rwco); + } else { + co = qemu_coroutine_create(bdrv_rw_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + qemu_aio_wait(); + } + } + return rwco.ret; +} + +/* + * Process a synchronous request using coroutines + */ +static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, + int nb_sectors, bool is_write) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_rwv_co(bs, sector_num, &qiov, is_write); +} + +/* return < 0 if error. See bdrv_write() for the return codes */ +int bdrv_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false); +} + +/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ +int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + bool enabled; + int ret; + + enabled = bs->io_limits_enabled; + bs->io_limits_enabled = false; + ret = bdrv_read(bs, 0, buf, 1); + bs->io_limits_enabled = enabled; + return ret; +} + +/* Return < 0 if error. Important errors are: + -EIO generic I/O error (may happen for all errors) + -ENOMEDIUM No media inserted. + -EINVAL Invalid sector number or nb_sectors + -EACCES Trying to write a read-only device +*/ +int bdrv_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true); +} + +int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov) +{ + return bdrv_rwv_co(bs, sector_num, qiov, true); +} + +int bdrv_pread(BlockDriverState *bs, int64_t offset, + void *buf, int count1) +{ + uint8_t tmp_buf[BDRV_SECTOR_SIZE]; + int len, nb_sectors, count; + int64_t sector_num; + int ret; + + count = count1; + /* first read to align to sector start */ + len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); + if (len > count) + len = count; + sector_num = offset >> BDRV_SECTOR_BITS; + if (len > 0) { + if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) + return ret; + memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len); + count -= len; + if (count == 0) + return count1; + sector_num++; + buf += len; + } + + /* read the sectors "in place" */ + nb_sectors = count >> BDRV_SECTOR_BITS; + if (nb_sectors > 0) { + if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0) + return ret; + sector_num += nb_sectors; + len = nb_sectors << BDRV_SECTOR_BITS; + buf += len; + count -= len; + } + + /* add data from the last sector */ + if (count > 0) { + if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) + return ret; + memcpy(buf, tmp_buf, count); + } + return count1; +} + +int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +{ + uint8_t tmp_buf[BDRV_SECTOR_SIZE]; + int len, nb_sectors, count; + int64_t sector_num; + int ret; + + count = qiov->size; + + /* first write to align to sector start */ + len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); + if (len > count) + len = count; + sector_num = offset >> BDRV_SECTOR_BITS; + if (len > 0) { + if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) + return ret; + qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), + len); + if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) + return ret; + count -= len; + if (count == 0) + return qiov->size; + sector_num++; + } + + /* write the sectors "in place" */ + nb_sectors = count >> BDRV_SECTOR_BITS; + if (nb_sectors > 0) { + QEMUIOVector qiov_inplace; + + qemu_iovec_init(&qiov_inplace, qiov->niov); + qemu_iovec_concat(&qiov_inplace, qiov, len, + nb_sectors << BDRV_SECTOR_BITS); + ret = bdrv_writev(bs, sector_num, &qiov_inplace); + qemu_iovec_destroy(&qiov_inplace); + if (ret < 0) { + return ret; + } + + sector_num += nb_sectors; + len = nb_sectors << BDRV_SECTOR_BITS; + count -= len; + } + + /* add data from the last sector */ + if (count > 0) { + if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) + return ret; + qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count); + if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) + return ret; + } + return qiov->size; +} + +int bdrv_pwrite(BlockDriverState *bs, int64_t offset, + const void *buf, int count1) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = count1, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_pwritev(bs, offset, &qiov); +} + +/* + * Writes to the file and ensures that no writes are reordered across this + * request (acts as a barrier) + * + * Returns 0 on success, -errno in error cases. + */ +int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, + const void *buf, int count) +{ + int ret; + + ret = bdrv_pwrite(bs, offset, buf, count); + if (ret < 0) { + return ret; + } + + /* No flush needed for cache modes that already do it */ + if (bs->enable_write_cache) { + bdrv_flush(bs); + } + + return 0; +} + +static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + /* Perform I/O through a temporary buffer so that users who scribble over + * their read buffer while the operation is in progress do not end up + * modifying the image file. This is critical for zero-copy guest I/O + * where anything might happen inside guest memory. + */ + void *bounce_buffer; + + BlockDriver *drv = bs->drv; + struct iovec iov; + QEMUIOVector bounce_qiov; + int64_t cluster_sector_num; + int cluster_nb_sectors; + size_t skip_bytes; + int ret; + + /* Cover entire cluster so no additional backing file I/O is required when + * allocating cluster in the image file. + */ + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, + cluster_sector_num, cluster_nb_sectors); + + iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len); + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + goto err; + } + + if (drv->bdrv_co_write_zeroes && + buffer_is_zero(bounce_buffer, iov.iov_len)) { + ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, + cluster_nb_sectors); + } else { + /* This does not change the data on the disk, it is not necessary + * to flush even in cache=writethrough mode. + */ + ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + } + + if (ret < 0) { + /* It might be okay to ignore write errors for guest requests. If this + * is a deliberate copy-on-read then we don't want to ignore the error. + * Simply report it in all cases. + */ + goto err; + } + + skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; + qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, + nb_sectors * BDRV_SECTOR_SIZE); + +err: + qemu_vfree(bounce_buffer); + return ret; +} + +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (bdrv_check_request(bs, sector_num, nb_sectors)) { + return -EIO; + } + + /* throttling disk read I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, false, nb_sectors); + } + + if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + if (flags & BDRV_REQ_COPY_ON_READ) { + bs->copy_on_read_in_flight++; + } + + if (bs->copy_on_read_in_flight) { + wait_for_overlapping_requests(bs, sector_num, nb_sectors); + } + + tracked_request_begin(&req, bs, sector_num, nb_sectors, false); + + if (flags & BDRV_REQ_COPY_ON_READ) { + int pnum; + + ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum); + if (ret < 0) { + goto out; + } + + if (!ret || pnum != nb_sectors) { + ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); + goto out; + } + } + + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + +out: + tracked_request_end(&req); + + if (flags & BDRV_REQ_COPY_ON_READ) { + bs->copy_on_read_in_flight--; + } + + return ret; +} + +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, + BDRV_REQ_COPY_ON_READ); +} + +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + BlockDriver *drv = bs->drv; + QEMUIOVector qiov; + struct iovec iov; + int ret; + + /* TODO Emulate only part of misaligned requests instead of letting block + * drivers return -ENOTSUP and emulate everything */ + + /* First try the efficient write zeroes operation */ + if (drv->bdrv_co_write_zeroes) { + ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors); + if (ret != -ENOTSUP) { + return ret; + } + } + + /* Fall back to bounce buffer if write zeroes is unsupported */ + iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = qemu_blockalign(bs, iov.iov_len); + memset(iov.iov_base, 0, iov.iov_len); + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov); + + qemu_vfree(iov.iov_base); + return ret; +} + +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + int ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EACCES; + } + if (bdrv_check_request(bs, sector_num, nb_sectors)) { + return -EIO; + } + + /* throttling disk write I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, true, nb_sectors); + } + + if (bs->copy_on_read_in_flight) { + wait_for_overlapping_requests(bs, sector_num, nb_sectors); + } + + tracked_request_begin(&req, bs, sector_num, nb_sectors, true); + + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); + + if (ret < 0) { + /* Do nothing, write notifier decided to fail this request */ + } else if (flags & BDRV_REQ_ZERO_WRITE) { + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors); + } else { + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } + + if (ret == 0 && !bs->enable_write_cache) { + ret = bdrv_co_flush(bs); + } + + if (bs->dirty_bitmap) { + bdrv_set_dirty(bs, sector_num, nb_sectors); + } + + if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { + bs->wr_highest_sector = sector_num + nb_sectors - 1; + } + + tracked_request_end(&req); + + return ret; +} + +int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_writev(bs, sector_num, nb_sectors); + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors); + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, + BDRV_REQ_ZERO_WRITE); +} + +/** + * Truncate file to 'offset' bytes (needed only for file protocols) + */ +int bdrv_truncate(BlockDriverState *bs, int64_t offset) +{ + BlockDriver *drv = bs->drv; + int ret; + if (!drv) + return -ENOMEDIUM; + if (!drv->bdrv_truncate) + return -ENOTSUP; + if (bs->read_only) + return -EACCES; + if (bdrv_in_use(bs)) + return -EBUSY; + ret = drv->bdrv_truncate(bs, offset); + if (ret == 0) { + ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); + bdrv_dev_resize_cb(bs); + } + return ret; +} + +/** + * Length of a allocated file in bytes. Sparse files are counted by actual + * allocated space. Return < 0 if error or unknown. + */ +int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_get_allocated_file_size) { + return drv->bdrv_get_allocated_file_size(bs); + } + if (bs->file) { + return bdrv_get_allocated_file_size(bs->file); + } + return -ENOTSUP; +} + +/** + * Length of a file in bytes. Return < 0 if error or unknown. + */ +int64_t bdrv_getlength(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + + if (bs->growable || bdrv_dev_has_removable_media(bs)) { + if (drv->bdrv_getlength) { + return drv->bdrv_getlength(bs); + } + } + return bs->total_sectors * BDRV_SECTOR_SIZE; +} + +/* return 0 as number of sectors if no device present or error */ +void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) +{ + int64_t length; + length = bdrv_getlength(bs); + if (length < 0) + length = 0; + else + length = length >> BDRV_SECTOR_BITS; + *nb_sectors_ptr = length; +} + +/* throttling disk io limits */ +void bdrv_set_io_limits(BlockDriverState *bs, + BlockIOLimit *io_limits) +{ + bs->io_limits = *io_limits; + bs->io_limits_enabled = bdrv_io_limits_enabled(bs); +} + +void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, + BlockdevOnError on_write_error) +{ + bs->on_read_error = on_read_error; + bs->on_write_error = on_write_error; +} + +BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read) +{ + return is_read ? bs->on_read_error : bs->on_write_error; +} + +BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error) +{ + BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error; + + switch (on_err) { + case BLOCKDEV_ON_ERROR_ENOSPC: + return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_STOP: + return BDRV_ACTION_STOP; + case BLOCKDEV_ON_ERROR_REPORT: + return BDRV_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_IGNORE: + return BDRV_ACTION_IGNORE; + default: + abort(); + } +} + +/* This is done by device models because, while the block layer knows + * about the error, it does not know whether an operation comes from + * the device or the block layer (from a job, for example). + */ +void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, + bool is_read, int error) +{ + assert(error >= 0); + bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read); + if (action == BDRV_ACTION_STOP) { + vm_stop(RUN_STATE_IO_ERROR); + bdrv_iostatus_set_err(bs, error); + } +} + +int bdrv_is_read_only(BlockDriverState *bs) +{ + return bs->read_only; +} + +int bdrv_is_sg(BlockDriverState *bs) +{ + return bs->sg; +} + +int bdrv_enable_write_cache(BlockDriverState *bs) +{ + return bs->enable_write_cache; +} + +void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce) +{ + bs->enable_write_cache = wce; + + /* so a reopen() will preserve wce */ + if (wce) { + bs->open_flags |= BDRV_O_CACHE_WB; + } else { + bs->open_flags &= ~BDRV_O_CACHE_WB; + } +} + +int bdrv_is_encrypted(BlockDriverState *bs) +{ + if (bs->backing_hd && bs->backing_hd->encrypted) + return 1; + return bs->encrypted; +} + +int bdrv_key_required(BlockDriverState *bs) +{ + BlockDriverState *backing_hd = bs->backing_hd; + + if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) + return 1; + return (bs->encrypted && !bs->valid_key); +} + +int bdrv_set_key(BlockDriverState *bs, const char *key) +{ + int ret; + if (bs->backing_hd && bs->backing_hd->encrypted) { + ret = bdrv_set_key(bs->backing_hd, key); + if (ret < 0) + return ret; + if (!bs->encrypted) + return 0; + } + if (!bs->encrypted) { + return -EINVAL; + } else if (!bs->drv || !bs->drv->bdrv_set_key) { + return -ENOMEDIUM; + } + ret = bs->drv->bdrv_set_key(bs, key); + if (ret < 0) { + bs->valid_key = 0; + } else if (!bs->valid_key) { + bs->valid_key = 1; + /* call the change callback now, we skipped it on open */ + bdrv_dev_change_media_cb(bs, true); + } + return ret; +} + +const char *bdrv_get_format_name(BlockDriverState *bs) +{ + return bs->drv ? bs->drv->format_name : NULL; +} + +void bdrv_iterate_format(void (*it)(void *opaque, const char *name), + void *opaque) +{ + BlockDriver *drv; + + QLIST_FOREACH(drv, &bdrv_drivers, list) { + it(opaque, drv->format_name); + } +} + +BlockDriverState *bdrv_find(const char *name) +{ + BlockDriverState *bs; + + QTAILQ_FOREACH(bs, &bdrv_states, list) { + if (!strcmp(name, bs->device_name)) { + return bs; + } + } + return NULL; +} + +BlockDriverState *bdrv_next(BlockDriverState *bs) +{ + if (!bs) { + return QTAILQ_FIRST(&bdrv_states); + } + return QTAILQ_NEXT(bs, list); +} + +void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque) +{ + BlockDriverState *bs; + + QTAILQ_FOREACH(bs, &bdrv_states, list) { + it(opaque, bs); + } +} + +const char *bdrv_get_device_name(BlockDriverState *bs) +{ + return bs->device_name; +} + +int bdrv_get_flags(BlockDriverState *bs) +{ + return bs->open_flags; +} + +int bdrv_flush_all(void) +{ + BlockDriverState *bs; + int result = 0; + + QTAILQ_FOREACH(bs, &bdrv_states, list) { + int ret = bdrv_flush(bs); + if (ret < 0 && !result) { + result = ret; + } + } + + return result; +} + +int bdrv_has_zero_init_1(BlockDriverState *bs) +{ + return 1; +} + +int bdrv_has_zero_init(BlockDriverState *bs) +{ + assert(bs->drv); + + if (bs->drv->bdrv_has_zero_init) { + return bs->drv->bdrv_has_zero_init(bs); + } + + /* safe default */ + return 0; +} + +typedef struct BdrvCoIsAllocatedData { + BlockDriverState *bs; + BlockDriverState *base; + int64_t sector_num; + int nb_sectors; + int *pnum; + int ret; + bool done; +} BdrvCoIsAllocatedData; + +/* + * Returns true iff the specified sector is present in the disk image. Drivers + * not implementing the functionality are assumed to not support backing files, + * hence all their sectors are reported as allocated. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. + */ +int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t n; + + if (sector_num >= bs->total_sectors) { + *pnum = 0; + return 0; + } + + n = bs->total_sectors - sector_num; + if (n < nb_sectors) { + nb_sectors = n; + } + + if (!bs->drv->bdrv_co_is_allocated) { + *pnum = nb_sectors; + return 1; + } + + return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum); +} + +/* Coroutine wrapper for bdrv_is_allocated() */ +static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque) +{ + BdrvCoIsAllocatedData *data = opaque; + BlockDriverState *bs = data->bs; + + data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors, + data->pnum); + data->done = true; +} + +/* + * Synchronous wrapper around bdrv_co_is_allocated(). + * + * See bdrv_co_is_allocated() for details. + */ +int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + int *pnum) +{ + Coroutine *co; + BdrvCoIsAllocatedData data = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; + + co = qemu_coroutine_create(bdrv_is_allocated_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + qemu_aio_wait(); + } + return data.ret; +} + +/* + * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] + * + * Return true if the given sector is allocated in any image between + * BASE and TOP (inclusive). BASE can be NULL to check if the given + * sector is allocated in any image of the chain. Return false otherwise. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + */ +int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + BlockDriverState *intermediate; + int ret, n = nb_sectors; + + intermediate = top; + while (intermediate && intermediate != base) { + int pnum_inter; + ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors, + &pnum_inter); + if (ret < 0) { + return ret; + } else if (ret) { + *pnum = pnum_inter; + return 1; + } + + /* + * [sector_num, nb_sectors] is unallocated on top but intermediate + * might have + * + * [sector_num+x, nr_sectors] allocated. + */ + if (n > pnum_inter && + (intermediate == top || + sector_num + pnum_inter < intermediate->total_sectors)) { + n = pnum_inter; + } + + intermediate = intermediate->backing_hd; + } + + *pnum = n; + return 0; +} + +/* Coroutine wrapper for bdrv_is_allocated_above() */ +static void coroutine_fn bdrv_is_allocated_above_co_entry(void *opaque) +{ + BdrvCoIsAllocatedData *data = opaque; + BlockDriverState *top = data->bs; + BlockDriverState *base = data->base; + + data->ret = bdrv_co_is_allocated_above(top, base, data->sector_num, + data->nb_sectors, data->pnum); + data->done = true; +} + +/* + * Synchronous wrapper around bdrv_co_is_allocated_above(). + * + * See bdrv_co_is_allocated_above() for details. + */ +int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, + int64_t sector_num, int nb_sectors, int *pnum) +{ + Coroutine *co; + BdrvCoIsAllocatedData data = { + .bs = top, + .base = base, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; + + co = qemu_coroutine_create(bdrv_is_allocated_above_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + qemu_aio_wait(); + } + return data.ret; +} + +const char *bdrv_get_encrypted_filename(BlockDriverState *bs) +{ + if (bs->backing_hd && bs->backing_hd->encrypted) + return bs->backing_file; + else if (bs->encrypted) + return bs->filename; + else + return NULL; +} + +void bdrv_get_backing_filename(BlockDriverState *bs, + char *filename, int filename_size) +{ + pstrcpy(filename, filename_size, bs->backing_file); +} + +int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (!drv->bdrv_write_compressed) + return -ENOTSUP; + if (bdrv_check_request(bs, sector_num, nb_sectors)) + return -EIO; + + assert(!bs->dirty_bitmap); + + return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); +} + +int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (!drv->bdrv_get_info) + return -ENOTSUP; + memset(bdi, 0, sizeof(*bdi)); + return drv->bdrv_get_info(bs, bdi); +} + +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = size, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_writev_vmstate(bs, &qiov, pos); +} + +int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) +{ + BlockDriver *drv = bs->drv; + + if (!drv) { + return -ENOMEDIUM; + } else if (drv->bdrv_save_vmstate) { + return drv->bdrv_save_vmstate(bs, qiov, pos); + } else if (bs->file) { + return bdrv_writev_vmstate(bs->file, qiov, pos); + } + + return -ENOTSUP; +} + +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (drv->bdrv_load_vmstate) + return drv->bdrv_load_vmstate(bs, buf, pos, size); + if (bs->file) + return bdrv_load_vmstate(bs->file, buf, pos, size); + return -ENOTSUP; +} + +void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) +{ + if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { + return; + } + + bs->drv->bdrv_debug_event(bs, event); +} + +int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, + const char *tag) +{ + while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { + bs = bs->file; + } + + if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { + return bs->drv->bdrv_debug_breakpoint(bs, event, tag); + } + + return -ENOTSUP; +} + +int bdrv_debug_resume(BlockDriverState *bs, const char *tag) +{ + while (bs && bs->drv && !bs->drv->bdrv_debug_resume) { + bs = bs->file; + } + + if (bs && bs->drv && bs->drv->bdrv_debug_resume) { + return bs->drv->bdrv_debug_resume(bs, tag); + } + + return -ENOTSUP; +} + +bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) +{ + while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { + bs = bs->file; + } + + if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { + return bs->drv->bdrv_debug_is_suspended(bs, tag); + } + + return false; +} + +int bdrv_is_snapshot(BlockDriverState *bs) +{ + return !!(bs->open_flags & BDRV_O_SNAPSHOT); +} + +/* backing_file can either be relative, or absolute, or a protocol. If it is + * relative, it must be relative to the chain. So, passing in bs->filename + * from a BDS as backing_file should not be done, as that may be relative to + * the CWD rather than the chain. */ +BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, + const char *backing_file) +{ + char *filename_full = NULL; + char *backing_file_full = NULL; + char *filename_tmp = NULL; + int is_protocol = 0; + BlockDriverState *curr_bs = NULL; + BlockDriverState *retval = NULL; + + if (!bs || !bs->drv || !backing_file) { + return NULL; + } + + filename_full = g_malloc(PATH_MAX); + backing_file_full = g_malloc(PATH_MAX); + filename_tmp = g_malloc(PATH_MAX); + + is_protocol = path_has_protocol(backing_file); + + for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { + + /* If either of the filename paths is actually a protocol, then + * compare unmodified paths; otherwise make paths relative */ + if (is_protocol || path_has_protocol(curr_bs->backing_file)) { + if (strcmp(backing_file, curr_bs->backing_file) == 0) { + retval = curr_bs->backing_hd; + break; + } + } else { + /* If not an absolute filename path, make it relative to the current + * image's filename path */ + path_combine(filename_tmp, PATH_MAX, curr_bs->filename, + backing_file); + + /* We are going to compare absolute pathnames */ + if (!realpath(filename_tmp, filename_full)) { + continue; + } + + /* We need to make sure the backing filename we are comparing against + * is relative to the current image filename (or absolute) */ + path_combine(filename_tmp, PATH_MAX, curr_bs->filename, + curr_bs->backing_file); + + if (!realpath(filename_tmp, backing_file_full)) { + continue; + } + + if (strcmp(backing_file_full, filename_full) == 0) { + retval = curr_bs->backing_hd; + break; + } + } + } + + g_free(filename_full); + g_free(backing_file_full); + g_free(filename_tmp); + return retval; +} + +int bdrv_get_backing_file_depth(BlockDriverState *bs) +{ + if (!bs->drv) { + return 0; + } + + if (!bs->backing_hd) { + return 0; + } + + return 1 + bdrv_get_backing_file_depth(bs->backing_hd); +} + +BlockDriverState *bdrv_find_base(BlockDriverState *bs) +{ + BlockDriverState *curr_bs = NULL; + + if (!bs) { + return NULL; + } + + curr_bs = bs; + + while (curr_bs->backing_hd) { + curr_bs = curr_bs->backing_hd; + } + return curr_bs; +} + +/**************************************************************/ +/* async I/Os */ + +BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, + cb, opaque, false); +} + +BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, + cb, opaque, true); +} + + +typedef struct MultiwriteCB { + int error; + int num_requests; + int num_callbacks; + struct { + BlockDriverCompletionFunc *cb; + void *opaque; + QEMUIOVector *free_qiov; + } callbacks[]; +} MultiwriteCB; + +static void multiwrite_user_cb(MultiwriteCB *mcb) +{ + int i; + + for (i = 0; i < mcb->num_callbacks; i++) { + mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); + if (mcb->callbacks[i].free_qiov) { + qemu_iovec_destroy(mcb->callbacks[i].free_qiov); + } + g_free(mcb->callbacks[i].free_qiov); + } +} + +static void multiwrite_cb(void *opaque, int ret) +{ + MultiwriteCB *mcb = opaque; + + trace_multiwrite_cb(mcb, ret); + + if (ret < 0 && !mcb->error) { + mcb->error = ret; + } + + mcb->num_requests--; + if (mcb->num_requests == 0) { + multiwrite_user_cb(mcb); + g_free(mcb); + } +} + +static int multiwrite_req_compare(const void *a, const void *b) +{ + const BlockRequest *req1 = a, *req2 = b; + + /* + * Note that we can't simply subtract req2->sector from req1->sector + * here as that could overflow the return value. + */ + if (req1->sector > req2->sector) { + return 1; + } else if (req1->sector < req2->sector) { + return -1; + } else { + return 0; + } +} + +/* + * Takes a bunch of requests and tries to merge them. Returns the number of + * requests that remain after merging. + */ +static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, + int num_reqs, MultiwriteCB *mcb) +{ + int i, outidx; + + // Sort requests by start sector + qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); + + // Check if adjacent requests touch the same clusters. If so, combine them, + // filling up gaps with zero sectors. + outidx = 0; + for (i = 1; i < num_reqs; i++) { + int merge = 0; + int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; + + // Handle exactly sequential writes and overlapping writes. + if (reqs[i].sector <= oldreq_last) { + merge = 1; + } + + if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { + merge = 0; + } + + if (merge) { + size_t size; + QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); + qemu_iovec_init(qiov, + reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); + + // Add the first request to the merged one. If the requests are + // overlapping, drop the last sectors of the first request. + size = (reqs[i].sector - reqs[outidx].sector) << 9; + qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); + + // We should need to add any zeros between the two requests + assert (reqs[i].sector <= oldreq_last); + + // Add the second request + qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); + + reqs[outidx].nb_sectors = qiov->size >> 9; + reqs[outidx].qiov = qiov; + + mcb->callbacks[i].free_qiov = reqs[outidx].qiov; + } else { + outidx++; + reqs[outidx].sector = reqs[i].sector; + reqs[outidx].nb_sectors = reqs[i].nb_sectors; + reqs[outidx].qiov = reqs[i].qiov; + } + } + + return outidx + 1; +} + +/* + * Submit multiple AIO write requests at once. + * + * On success, the function returns 0 and all requests in the reqs array have + * been submitted. In error case this function returns -1, and any of the + * requests may or may not be submitted yet. In particular, this means that the + * callback will be called for some of the requests, for others it won't. The + * caller must check the error field of the BlockRequest to wait for the right + * callbacks (if error != 0, no callback will be called). + * + * The implementation may modify the contents of the reqs array, e.g. to merge + * requests. However, the fields opaque and error are left unmodified as they + * are used to signal failure for a single request to the caller. + */ +int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +{ + MultiwriteCB *mcb; + int i; + + /* don't submit writes if we don't have a medium */ + if (bs->drv == NULL) { + for (i = 0; i < num_reqs; i++) { + reqs[i].error = -ENOMEDIUM; + } + return -1; + } + + if (num_reqs == 0) { + return 0; + } + + // Create MultiwriteCB structure + mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); + mcb->num_requests = 0; + mcb->num_callbacks = num_reqs; + + for (i = 0; i < num_reqs; i++) { + mcb->callbacks[i].cb = reqs[i].cb; + mcb->callbacks[i].opaque = reqs[i].opaque; + } + + // Check for mergable requests + num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); + + trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + + /* Run the aio requests. */ + mcb->num_requests = num_reqs; + for (i = 0; i < num_reqs; i++) { + bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov, + reqs[i].nb_sectors, multiwrite_cb, mcb); + } + + return 0; +} + +void bdrv_aio_cancel(BlockDriverAIOCB *acb) +{ + acb->aiocb_info->cancel(acb); +} + +/* block I/O throttling */ +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, double elapsed_time, uint64_t *wait) +{ + uint64_t bps_limit = 0; + uint64_t extension; + double bytes_limit, bytes_base, bytes_res; + double slice_time, wait_time; + + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; + } else if (bs->io_limits.bps[is_write]) { + bps_limit = bs->io_limits.bps[is_write]; + } else { + if (wait) { + *wait = 0; + } + + return false; + } + + slice_time = bs->slice_end - bs->slice_start; + slice_time /= (NANOSECONDS_PER_SECOND); + bytes_limit = bps_limit * slice_time; + bytes_base = bs->slice_submitted.bytes[is_write]; + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { + bytes_base += bs->slice_submitted.bytes[!is_write]; + } + + /* bytes_base: the bytes of data which have been read/written; and + * it is obtained from the history statistic info. + * bytes_res: the remaining bytes of data which need to be read/written. + * (bytes_base + bytes_res) / bps_limit: used to calcuate + * the total time for completing reading/writting all data. + */ + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE; + + if (bytes_base + bytes_res <= bytes_limit) { + if (wait) { + *wait = 0; + } + + return false; + } + + /* Calc approx time to dispatch */ + wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time; + + /* When the I/O rate at runtime exceeds the limits, + * bs->slice_end need to be extended in order that the current statistic + * info can be kept until the timer fire, so it is increased and tuned + * based on the result of experiment. + */ + extension = wait_time * NANOSECONDS_PER_SECOND; + extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) * + BLOCK_IO_SLICE_TIME; + bs->slice_end += extension; + if (wait) { + *wait = wait_time * NANOSECONDS_PER_SECOND; + } + + return true; +} + +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, + double elapsed_time, uint64_t *wait) +{ + uint64_t iops_limit = 0; + double ios_limit, ios_base; + double slice_time, wait_time; + + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; + } else if (bs->io_limits.iops[is_write]) { + iops_limit = bs->io_limits.iops[is_write]; + } else { + if (wait) { + *wait = 0; + } + + return false; + } + + slice_time = bs->slice_end - bs->slice_start; + slice_time /= (NANOSECONDS_PER_SECOND); + ios_limit = iops_limit * slice_time; + ios_base = bs->slice_submitted.ios[is_write]; + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + ios_base += bs->slice_submitted.ios[!is_write]; + } + + if (ios_base + 1 <= ios_limit) { + if (wait) { + *wait = 0; + } + + return false; + } + + /* Calc approx time to dispatch, in seconds */ + wait_time = (ios_base + 1) / iops_limit; + if (wait_time > elapsed_time) { + wait_time = wait_time - elapsed_time; + } else { + wait_time = 0; + } + + /* Exceeded current slice, extend it by another slice time */ + bs->slice_end += BLOCK_IO_SLICE_TIME; + if (wait) { + *wait = wait_time * NANOSECONDS_PER_SECOND; + } + + return true; +} + +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, int64_t *wait) +{ + int64_t now, max_wait; + uint64_t bps_wait = 0, iops_wait = 0; + double elapsed_time; + int bps_ret, iops_ret; + + now = qemu_get_clock_ns(vm_clock); + if (now > bs->slice_end) { + bs->slice_start = now; + bs->slice_end = now + BLOCK_IO_SLICE_TIME; + memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted)); + } + + elapsed_time = now - bs->slice_start; + elapsed_time /= (NANOSECONDS_PER_SECOND); + + bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors, + is_write, elapsed_time, &bps_wait); + iops_ret = bdrv_exceed_iops_limits(bs, is_write, + elapsed_time, &iops_wait); + if (bps_ret || iops_ret) { + max_wait = bps_wait > iops_wait ? bps_wait : iops_wait; + if (wait) { + *wait = max_wait; + } + + now = qemu_get_clock_ns(vm_clock); + if (bs->slice_end < now + max_wait) { + bs->slice_end = now + max_wait; + } + + return true; + } + + if (wait) { + *wait = 0; + } + + bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors * + BDRV_SECTOR_SIZE; + bs->slice_submitted.ios[is_write]++; + + return false; +} + +/**************************************************************/ +/* async block device emulation */ + +typedef struct BlockDriverAIOCBSync { + BlockDriverAIOCB common; + QEMUBH *bh; + int ret; + /* vector translation state */ + QEMUIOVector *qiov; + uint8_t *bounce; + int is_write; +} BlockDriverAIOCBSync; + +static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb) +{ + BlockDriverAIOCBSync *acb = + container_of(blockacb, BlockDriverAIOCBSync, common); + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_aio_release(acb); +} + +static const AIOCBInfo bdrv_em_aiocb_info = { + .aiocb_size = sizeof(BlockDriverAIOCBSync), + .cancel = bdrv_aio_cancel_em, +}; + +static void bdrv_aio_bh_cb(void *opaque) +{ + BlockDriverAIOCBSync *acb = opaque; + + if (!acb->is_write) + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_aio_release(acb); +} + +static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, + int is_write) + +{ + BlockDriverAIOCBSync *acb; + + acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); + acb->is_write = is_write; + acb->qiov = qiov; + acb->bounce = qemu_blockalign(bs, qiov->size); + acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb); + + if (is_write) { + qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); + acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); + } else { + acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); + } + + qemu_bh_schedule(acb->bh); + + return &acb->common; +} + +static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); +} + + +typedef struct BlockDriverAIOCBCoroutine { + BlockDriverAIOCB common; + BlockRequest req; + bool is_write; + bool *done; + QEMUBH* bh; +} BlockDriverAIOCBCoroutine; + +static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb) +{ + BlockDriverAIOCBCoroutine *acb = + container_of(blockacb, BlockDriverAIOCBCoroutine, common); + bool done = false; + + acb->done = &done; + while (!done) { + qemu_aio_wait(); + } +} + +static const AIOCBInfo bdrv_em_co_aiocb_info = { + .aiocb_size = sizeof(BlockDriverAIOCBCoroutine), + .cancel = bdrv_aio_co_cancel_em, +}; + +static void bdrv_co_em_bh(void *opaque) +{ + BlockDriverAIOCBCoroutine *acb = opaque; + + acb->common.cb(acb->common.opaque, acb->req.error); + + if (acb->done) { + *acb->done = true; + } + + qemu_bh_delete(acb->bh); + qemu_aio_release(acb); +} + +/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ +static void coroutine_fn bdrv_co_do_rw(void *opaque) +{ + BlockDriverAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + if (!acb->is_write) { + acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, 0); + } else { + acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, 0); + } + + acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); +} + +static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, + bool is_write) +{ + Coroutine *co; + BlockDriverAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + acb->req.qiov = qiov; + acb->is_write = is_write; + acb->done = NULL; + + co = qemu_coroutine_create(bdrv_co_do_rw); + qemu_coroutine_enter(co, acb); + + return &acb->common; +} + +static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) +{ + BlockDriverAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_flush(bs); + acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); +} + +BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_flush(bs, opaque); + + Coroutine *co; + BlockDriverAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->done = NULL; + + co = qemu_coroutine_create(bdrv_aio_flush_co_entry); + qemu_coroutine_enter(co, acb); + + return &acb->common; +} + +static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +{ + BlockDriverAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); +} + +BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + Coroutine *co; + BlockDriverAIOCBCoroutine *acb; + + trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + acb->done = NULL; + co = qemu_coroutine_create(bdrv_aio_discard_co_entry); + qemu_coroutine_enter(co, acb); + + return &acb->common; +} + +void bdrv_init(void) +{ + module_call_init(MODULE_INIT_BLOCK); +} + +void bdrv_init_with_whitelist(void) +{ + use_bdrv_whitelist = 1; + bdrv_init(); +} + +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BlockDriverAIOCB *acb; + + acb = g_slice_alloc(aiocb_info->aiocb_size); + acb->aiocb_info = aiocb_info; + acb->bs = bs; + acb->cb = cb; + acb->opaque = opaque; + return acb; +} + +void qemu_aio_release(void *p) +{ + BlockDriverAIOCB *acb = p; + g_slice_free1(acb->aiocb_info->aiocb_size, acb); +} + +/**************************************************************/ +/* Coroutine block device emulation */ + +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine, NULL); +} + +static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *iov, + bool is_write) +{ + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockDriverAIOCB *acb; + + if (is_write) { + acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } else { + acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } + + trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); + if (!acb) { + return -EIO; + } + qemu_coroutine_yield(); + + return co.ret; +} + +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); +} + +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); +} + +static void coroutine_fn bdrv_flush_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + rwco->ret = bdrv_co_flush(rwco->bs); +} + +int coroutine_fn bdrv_co_flush(BlockDriverState *bs) +{ + int ret; + + if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { + return 0; + } + + /* Write back cached data to the OS even with cache=unsafe */ + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); + if (bs->drv->bdrv_co_flush_to_os) { + ret = bs->drv->bdrv_co_flush_to_os(bs); + if (ret < 0) { + return ret; + } + } + + /* But don't actually force it to the disk with cache=unsafe */ + if (bs->open_flags & BDRV_O_NO_FLUSH) { + goto flush_parent; + } + + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); + if (bs->drv->bdrv_co_flush_to_disk) { + ret = bs->drv->bdrv_co_flush_to_disk(bs); + } else if (bs->drv->bdrv_aio_flush) { + BlockDriverAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } else { + /* + * Some block drivers always operate in either writethrough or unsafe + * mode and don't support bdrv_flush therefore. Usually qemu doesn't + * know how the server works (because the behaviour is hardcoded or + * depends on server-side configuration), so we can't ensure that + * everything is safe on disk. Returning an error doesn't work because + * that would break guests even if the server operates in writethrough + * mode. + * + * Let's hope the user knows what he's doing. + */ + ret = 0; + } + if (ret < 0) { + return ret; + } + + /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH + * in the case of cache=unsafe, so there are no useless flushes. + */ +flush_parent: + return bdrv_co_flush(bs->file); +} + +void bdrv_invalidate_cache(BlockDriverState *bs) +{ + if (bs->drv && bs->drv->bdrv_invalidate_cache) { + bs->drv->bdrv_invalidate_cache(bs); + } +} + +void bdrv_invalidate_cache_all(void) +{ + BlockDriverState *bs; + + QTAILQ_FOREACH(bs, &bdrv_states, list) { + bdrv_invalidate_cache(bs); + } +} + +void bdrv_clear_incoming_migration_all(void) +{ + BlockDriverState *bs; + + QTAILQ_FOREACH(bs, &bdrv_states, list) { + bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING); + } +} + +int bdrv_flush(BlockDriverState *bs) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_flush_co_entry(&rwco); + } else { + co = qemu_coroutine_create(bdrv_flush_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + qemu_aio_wait(); + } + } + + return rwco.ret; +} + +static void coroutine_fn bdrv_discard_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); +} + +int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + if (!bs->drv) { + return -ENOMEDIUM; + } else if (bdrv_check_request(bs, sector_num, nb_sectors)) { + return -EIO; + } else if (bs->read_only) { + return -EROFS; + } + + if (bs->dirty_bitmap) { + bdrv_reset_dirty(bs, sector_num, nb_sectors); + } + + /* Do nothing if disabled. */ + if (!(bs->open_flags & BDRV_O_UNMAP)) { + return 0; + } + + if (bs->drv->bdrv_co_discard) { + return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors); + } else if (bs->drv->bdrv_aio_discard) { + BlockDriverAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + return co.ret; + } + } else { + return 0; + } +} + +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_discard_co_entry(&rwco); + } else { + co = qemu_coroutine_create(bdrv_discard_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + qemu_aio_wait(); + } + } + + return rwco.ret; +} + +/**************************************************************/ +/* removable device support */ + +/** + * Return TRUE if the media is present + */ +int bdrv_is_inserted(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + + if (!drv) + return 0; + if (!drv->bdrv_is_inserted) + return 1; + return drv->bdrv_is_inserted(bs); +} + +/** + * Return whether the media changed since the last call to this + * function, or -ENOTSUP if we don't know. Most drivers don't know. + */ +int bdrv_media_changed(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_media_changed) { + return drv->bdrv_media_changed(bs); + } + return -ENOTSUP; +} + +/** + * If eject_flag is TRUE, eject the media. Otherwise, close the tray + */ +void bdrv_eject(BlockDriverState *bs, bool eject_flag) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_eject) { + drv->bdrv_eject(bs, eject_flag); + } + + if (bs->device_name[0] != '\0') { + bdrv_emit_qmp_eject_event(bs, eject_flag); + } +} + +/** + * Lock or unlock the media (if it is locked, the user won't be able + * to eject it manually). + */ +void bdrv_lock_medium(BlockDriverState *bs, bool locked) +{ + BlockDriver *drv = bs->drv; + + trace_bdrv_lock_medium(bs, locked); + + if (drv && drv->bdrv_lock_medium) { + drv->bdrv_lock_medium(bs, locked); + } +} + +/* needed for generic scsi interface */ + +int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_ioctl) + return drv->bdrv_ioctl(bs, req, buf); + return -ENOTSUP; +} + +BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_aio_ioctl) + return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); + return NULL; +} + +void bdrv_set_buffer_alignment(BlockDriverState *bs, int align) +{ + bs->buffer_alignment = align; +} + +void *qemu_blockalign(BlockDriverState *bs, size_t size) +{ + return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size); +} + +/* + * Check if all memory in this vector is sector aligned. + */ +bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) +{ + int i; + + for (i = 0; i < qiov->niov; i++) { + if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) { + return false; + } + } + + return true; +} + +void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity) +{ + int64_t bitmap_size; + + assert((granularity & (granularity - 1)) == 0); + + if (granularity) { + granularity >>= BDRV_SECTOR_BITS; + assert(!bs->dirty_bitmap); + bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS); + bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); + } else { + if (bs->dirty_bitmap) { + hbitmap_free(bs->dirty_bitmap); + bs->dirty_bitmap = NULL; + } + } +} + +int bdrv_get_dirty(BlockDriverState *bs, int64_t sector) +{ + if (bs->dirty_bitmap) { + return hbitmap_get(bs->dirty_bitmap, sector); + } else { + return 0; + } +} + +void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi) +{ + hbitmap_iter_init(hbi, bs->dirty_bitmap, 0); +} + +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors) +{ + hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors); +} + +void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors) +{ + hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors); +} + +int64_t bdrv_get_dirty_count(BlockDriverState *bs) +{ + if (bs->dirty_bitmap) { + return hbitmap_count(bs->dirty_bitmap); + } else { + return 0; + } +} + +void bdrv_set_in_use(BlockDriverState *bs, int in_use) +{ + assert(bs->in_use != in_use); + bs->in_use = in_use; +} + +int bdrv_in_use(BlockDriverState *bs) +{ + return bs->in_use; +} + +void bdrv_iostatus_enable(BlockDriverState *bs) +{ + bs->iostatus_enabled = true; + bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK; +} + +/* The I/O status is only enabled if the drive explicitly + * enables it _and_ the VM is configured to stop on errors */ +bool bdrv_iostatus_is_enabled(const BlockDriverState *bs) +{ + return (bs->iostatus_enabled && + (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC || + bs->on_write_error == BLOCKDEV_ON_ERROR_STOP || + bs->on_read_error == BLOCKDEV_ON_ERROR_STOP)); +} + +void bdrv_iostatus_disable(BlockDriverState *bs) +{ + bs->iostatus_enabled = false; +} + +void bdrv_iostatus_reset(BlockDriverState *bs) +{ + if (bdrv_iostatus_is_enabled(bs)) { + bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK; + if (bs->job) { + block_job_iostatus_reset(bs->job); + } + } +} + +void bdrv_iostatus_set_err(BlockDriverState *bs, int error) +{ + assert(bdrv_iostatus_is_enabled(bs)); + if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) { + bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE : + BLOCK_DEVICE_IO_STATUS_FAILED; + } +} + +void +bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes, + enum BlockAcctType type) +{ + assert(type < BDRV_MAX_IOTYPE); + + cookie->bytes = bytes; + cookie->start_time_ns = get_clock(); + cookie->type = type; +} + +void +bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie) +{ + assert(cookie->type < BDRV_MAX_IOTYPE); + + bs->nr_bytes[cookie->type] += cookie->bytes; + bs->nr_ops[cookie->type]++; + bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns; +} + +void bdrv_img_create(const char *filename, const char *fmt, + const char *base_filename, const char *base_fmt, + char *options, uint64_t img_size, int flags, + Error **errp, bool quiet) +{ + QEMUOptionParameter *param = NULL, *create_options = NULL; + QEMUOptionParameter *backing_fmt, *backing_file, *size; + BlockDriverState *bs = NULL; + BlockDriver *drv, *proto_drv; + BlockDriver *backing_drv = NULL; + int ret = 0; + + /* Find driver and parse its options */ + drv = bdrv_find_format(fmt); + if (!drv) { + error_setg(errp, "Unknown file format '%s'", fmt); + return; + } + + proto_drv = bdrv_find_protocol(filename, true); + if (!proto_drv) { + error_setg(errp, "Unknown protocol '%s'", filename); + return; + } + + create_options = append_option_parameters(create_options, + drv->create_options); + create_options = append_option_parameters(create_options, + proto_drv->create_options); + + /* Create parameter list with default values */ + param = parse_option_parameters("", create_options, param); + + set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size); + + /* Parse -o options */ + if (options) { + param = parse_option_parameters(options, create_options, param); + if (param == NULL) { + error_setg(errp, "Invalid options for file format '%s'.", fmt); + goto out; + } + } + + if (base_filename) { + if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE, + base_filename)) { + error_setg(errp, "Backing file not supported for file format '%s'", + fmt); + goto out; + } + } + + if (base_fmt) { + if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) { + error_setg(errp, "Backing file format not supported for file " + "format '%s'", fmt); + goto out; + } + } + + backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE); + if (backing_file && backing_file->value.s) { + if (!strcmp(filename, backing_file->value.s)) { + error_setg(errp, "Error: Trying to create an image with the " + "same filename as the backing file"); + goto out; + } + } + + backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT); + if (backing_fmt && backing_fmt->value.s) { + backing_drv = bdrv_find_format(backing_fmt->value.s); + if (!backing_drv) { + error_setg(errp, "Unknown backing file format '%s'", + backing_fmt->value.s); + goto out; + } + } + + // The size for the image must always be specified, with one exception: + // If we are using a backing file, we can obtain the size from there + size = get_option_parameter(param, BLOCK_OPT_SIZE); + if (size && size->value.n == -1) { + if (backing_file && backing_file->value.s) { + uint64_t size; + char buf[32]; + int back_flags; + + /* backing files always opened read-only */ + back_flags = + flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING); + + bs = bdrv_new(""); + + ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags, + backing_drv); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not open '%s'", + backing_file->value.s); + goto out; + } + bdrv_get_geometry(bs, &size); + size *= 512; + + snprintf(buf, sizeof(buf), "%" PRId64, size); + set_option_parameter(param, BLOCK_OPT_SIZE, buf); + } else { + error_setg(errp, "Image creation needs a size parameter"); + goto out; + } + } + + if (!quiet) { + printf("Formatting '%s', fmt=%s ", filename, fmt); + print_option_parameters(param); + puts(""); + } + ret = bdrv_create(drv, filename, param); + if (ret < 0) { + if (ret == -ENOTSUP) { + error_setg(errp,"Formatting or formatting option not supported for " + "file format '%s'", fmt); + } else if (ret == -EFBIG) { + const char *cluster_size_hint = ""; + if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) { + cluster_size_hint = " (try using a larger cluster size)"; + } + error_setg(errp, "The image size is too large for file format '%s'%s", + fmt, cluster_size_hint); + } else { + error_setg(errp, "%s: error while creating %s: %s", filename, fmt, + strerror(-ret)); + } + } + +out: + free_option_parameters(create_options); + free_option_parameters(param); + + if (bs) { + bdrv_delete(bs); + } +} + +AioContext *bdrv_get_aio_context(BlockDriverState *bs) +{ + /* Currently BlockDriverState always uses the main loop AioContext */ + return qemu_get_aio_context(); +} + +void bdrv_add_before_write_notifier(BlockDriverState *bs, + NotifierWithReturn *notifier) +{ + notifier_with_return_list_add(&bs->before_write_notifiers, notifier); +} diff --git a/contrib/qemu/block/qcow.c b/contrib/qemu/block/qcow.c new file mode 100644 index 000000000..5239bd68f --- /dev/null +++ b/contrib/qemu/block/qcow.c @@ -0,0 +1,914 @@ +/* + * Block driver for the QCOW format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include <zlib.h> +#include "qemu/aes.h" +#include "migration/migration.h" + +/**************************************************************/ +/* QEMU COW block driver with compression and encryption support */ + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) +#define QCOW_VERSION 1 + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES 1 + +#define QCOW_OFLAG_COMPRESSED (1LL << 63) + +typedef struct QCowHeader { + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t mtime; + uint64_t size; /* in bytes */ + uint8_t cluster_bits; + uint8_t l2_bits; + uint32_t crypt_method; + uint64_t l1_table_offset; +} QCowHeader; + +#define L2_CACHE_SIZE 16 + +typedef struct BDRVQcowState { + int cluster_bits; + int cluster_size; + int cluster_sectors; + int l2_bits; + int l2_size; + int l1_size; + uint64_t cluster_offset_mask; + uint64_t l1_table_offset; + uint64_t *l1_table; + uint64_t *l2_cache; + uint64_t l2_cache_offsets[L2_CACHE_SIZE]; + uint32_t l2_cache_counts[L2_CACHE_SIZE]; + uint8_t *cluster_cache; + uint8_t *cluster_data; + uint64_t cluster_cache_offset; + uint32_t crypt_method; /* current crypt method, 0 if no key yet */ + uint32_t crypt_method_header; + AES_KEY aes_encrypt_key; + AES_KEY aes_decrypt_key; + CoMutex lock; + Error *migration_blocker; +} BDRVQcowState; + +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); + +static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const QCowHeader *cow_header = (const void *)buf; + + if (buf_size >= sizeof(QCowHeader) && + be32_to_cpu(cow_header->magic) == QCOW_MAGIC && + be32_to_cpu(cow_header->version) == QCOW_VERSION) + return 100; + else + return 0; +} + +static int qcow_open(BlockDriverState *bs, QDict *options, int flags) +{ + BDRVQcowState *s = bs->opaque; + int len, i, shift, ret; + QCowHeader header; + + ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); + if (ret < 0) { + goto fail; + } + be32_to_cpus(&header.magic); + be32_to_cpus(&header.version); + be64_to_cpus(&header.backing_file_offset); + be32_to_cpus(&header.backing_file_size); + be32_to_cpus(&header.mtime); + be64_to_cpus(&header.size); + be32_to_cpus(&header.crypt_method); + be64_to_cpus(&header.l1_table_offset); + + if (header.magic != QCOW_MAGIC) { + ret = -EMEDIUMTYPE; + goto fail; + } + if (header.version != QCOW_VERSION) { + char version[64]; + snprintf(version, sizeof(version), "QCOW version %d", header.version); + qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "qcow", version); + ret = -ENOTSUP; + goto fail; + } + + if (header.size <= 1 || header.cluster_bits < 9) { + ret = -EINVAL; + goto fail; + } + if (header.crypt_method > QCOW_CRYPT_AES) { + ret = -EINVAL; + goto fail; + } + s->crypt_method_header = header.crypt_method; + if (s->crypt_method_header) { + bs->encrypted = 1; + } + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = header.l2_bits; + s->l2_size = 1 << s->l2_bits; + bs->total_sectors = header.size / 512; + s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1; + + /* read the level 1 table */ + shift = s->cluster_bits + s->l2_bits; + s->l1_size = (header.size + (1LL << shift) - 1) >> shift; + + s->l1_table_offset = header.l1_table_offset; + s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); + + ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + /* alloc L2 cache */ + s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + s->cluster_cache = g_malloc(s->cluster_size); + s->cluster_data = g_malloc(s->cluster_size); + s->cluster_cache_offset = -1; + + /* read the backing file name */ + if (header.backing_file_offset != 0) { + len = header.backing_file_size; + if (len > 1023) { + len = 1023; + } + ret = bdrv_pread(bs->file, header.backing_file_offset, + bs->backing_file, len); + if (ret < 0) { + goto fail; + } + bs->backing_file[len] = '\0'; + } + + /* Disable migration when qcow images are used */ + error_set(&s->migration_blocker, + QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + "qcow", bs->device_name, "live migration"); + migrate_add_blocker(s->migration_blocker); + + qemu_co_mutex_init(&s->lock); + return 0; + + fail: + g_free(s->l1_table); + g_free(s->l2_cache); + g_free(s->cluster_cache); + g_free(s->cluster_data); + return ret; +} + + +/* We have nothing to do for QCOW reopen, stubs just return + * success */ +static int qcow_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + +static int qcow_set_key(BlockDriverState *bs, const char *key) +{ + BDRVQcowState *s = bs->opaque; + uint8_t keybuf[16]; + int len, i; + + memset(keybuf, 0, 16); + len = strlen(key); + if (len > 16) + len = 16; + /* XXX: we could compress the chars to 7 bits to increase + entropy */ + for(i = 0;i < len;i++) { + keybuf[i] = key[i]; + } + s->crypt_method = s->crypt_method_header; + + if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) + return -1; + if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + return -1; + return 0; +} + +/* The crypt function is compatible with the linux cryptoloop + algorithm for < 4 GB images. NOTE: out_buf == in_buf is + supported */ +static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, int enc, + const AES_KEY *key) +{ + union { + uint64_t ll[2]; + uint8_t b[16]; + } ivec; + int i; + + for(i = 0; i < nb_sectors; i++) { + ivec.ll[0] = cpu_to_le64(sector_num); + ivec.ll[1] = 0; + AES_cbc_encrypt(in_buf, out_buf, 512, key, + ivec.b, enc); + sector_num++; + in_buf += 512; + out_buf += 512; + } +} + +/* 'allocate' is: + * + * 0 to not allocate. + * + * 1 to allocate a normal cluster (for sector indexes 'n_start' to + * 'n_end') + * + * 2 to allocate a compressed cluster of size + * 'compressed_size'. 'compressed_size' must be > 0 and < + * cluster_size + * + * return 0 if not allocated. + */ +static uint64_t get_cluster_offset(BlockDriverState *bs, + uint64_t offset, int allocate, + int compressed_size, + int n_start, int n_end) +{ + BDRVQcowState *s = bs->opaque; + int min_index, i, j, l1_index, l2_index; + uint64_t l2_offset, *l2_table, cluster_offset, tmp; + uint32_t min_count; + int new_l2_table; + + l1_index = offset >> (s->l2_bits + s->cluster_bits); + l2_offset = s->l1_table[l1_index]; + new_l2_table = 0; + if (!l2_offset) { + if (!allocate) + return 0; + /* allocate a new l2 entry */ + l2_offset = bdrv_getlength(bs->file); + /* round to cluster size */ + l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); + /* update the L1 entry */ + s->l1_table[l1_index] = l2_offset; + tmp = cpu_to_be64(l2_offset); + if (bdrv_pwrite_sync(bs->file, + s->l1_table_offset + l1_index * sizeof(tmp), + &tmp, sizeof(tmp)) < 0) + return 0; + new_l2_table = 1; + } + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (l2_offset == s->l2_cache_offsets[i]) { + /* increment the hit count */ + if (++s->l2_cache_counts[i] == 0xffffffff) { + for(j = 0; j < L2_CACHE_SIZE; j++) { + s->l2_cache_counts[j] >>= 1; + } + } + l2_table = s->l2_cache + (i << s->l2_bits); + goto found; + } + } + /* not found: load a new entry in the least used one */ + min_index = 0; + min_count = 0xffffffff; + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (s->l2_cache_counts[i] < min_count) { + min_count = s->l2_cache_counts[i]; + min_index = i; + } + } + l2_table = s->l2_cache + (min_index << s->l2_bits); + if (new_l2_table) { + memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); + if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table, + s->l2_size * sizeof(uint64_t)) < 0) + return 0; + } else { + if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != + s->l2_size * sizeof(uint64_t)) + return 0; + } + s->l2_cache_offsets[min_index] = l2_offset; + s->l2_cache_counts[min_index] = 1; + found: + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + cluster_offset = be64_to_cpu(l2_table[l2_index]); + if (!cluster_offset || + ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) { + if (!allocate) + return 0; + /* allocate a new cluster */ + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) && + (n_end - n_start) < s->cluster_sectors) { + /* if the cluster is already compressed, we must + decompress it in the case it is not completely + overwritten */ + if (decompress_cluster(bs, cluster_offset) < 0) + return 0; + cluster_offset = bdrv_getlength(bs->file); + cluster_offset = (cluster_offset + s->cluster_size - 1) & + ~(s->cluster_size - 1); + /* write the cluster content */ + if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) != + s->cluster_size) + return -1; + } else { + cluster_offset = bdrv_getlength(bs->file); + if (allocate == 1) { + /* round to cluster size */ + cluster_offset = (cluster_offset + s->cluster_size - 1) & + ~(s->cluster_size - 1); + bdrv_truncate(bs->file, cluster_offset + s->cluster_size); + /* if encrypted, we must initialize the cluster + content which won't be written */ + if (s->crypt_method && + (n_end - n_start) < s->cluster_sectors) { + uint64_t start_sect; + start_sect = (offset & ~(s->cluster_size - 1)) >> 9; + memset(s->cluster_data + 512, 0x00, 512); + for(i = 0; i < s->cluster_sectors; i++) { + if (i < n_start || i >= n_end) { + encrypt_sectors(s, start_sect + i, + s->cluster_data, + s->cluster_data + 512, 1, 1, + &s->aes_encrypt_key); + if (bdrv_pwrite(bs->file, cluster_offset + i * 512, + s->cluster_data, 512) != 512) + return -1; + } + } + } + } else if (allocate == 2) { + cluster_offset |= QCOW_OFLAG_COMPRESSED | + (uint64_t)compressed_size << (63 - s->cluster_bits); + } + } + /* update L2 table */ + tmp = cpu_to_be64(cluster_offset); + l2_table[l2_index] = tmp; + if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp), + &tmp, sizeof(tmp)) < 0) + return 0; + } + return cluster_offset; +} + +static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster, n; + uint64_t cluster_offset; + + qemu_co_mutex_lock(&s->lock); + cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); + qemu_co_mutex_unlock(&s->lock); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + *pnum = n; + return (cluster_offset != 0); +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, + const uint8_t *buf, int buf_size) +{ + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (uint8_t *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + out_len != out_buf_size) { + inflateEnd(strm); + return -1; + } + inflateEnd(strm); + return 0; +} + +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +{ + BDRVQcowState *s = bs->opaque; + int ret, csize; + uint64_t coffset; + + coffset = cluster_offset & s->cluster_offset_mask; + if (s->cluster_cache_offset != coffset) { + csize = cluster_offset >> (63 - s->cluster_bits); + csize &= (s->cluster_size - 1); + ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize); + if (ret != csize) + return -1; + if (decompress_buffer(s->cluster_cache, s->cluster_size, + s->cluster_data, csize) < 0) { + return -1; + } + s->cluster_cache_offset = coffset; + } + return 0; +} + +static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + int ret = 0, n; + uint64_t cluster_offset; + struct iovec hd_iov; + QEMUIOVector hd_qiov; + uint8_t *buf; + void *orig_buf; + + if (qiov->niov > 1) { + buf = orig_buf = qemu_blockalign(bs, qiov->size); + } else { + orig_buf = NULL; + buf = (uint8_t *)qiov->iov->iov_base; + } + + qemu_co_mutex_lock(&s->lock); + + while (nb_sectors != 0) { + /* prepare next request */ + cluster_offset = get_cluster_offset(bs, sector_num << 9, + 0, 0, 0, 0); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + + if (!cluster_offset) { + if (bs->backing_hd) { + /* read from the base image */ + hd_iov.iov_base = (void *)buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->backing_hd, sector_num, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + } else { + /* Note: in this case, no need to wait */ + memset(buf, 0, 512 * n); + } + } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { + /* add AIO support for compressed blocks ? */ + if (decompress_cluster(bs, cluster_offset) < 0) { + goto fail; + } + memcpy(buf, + s->cluster_cache + index_in_cluster * 512, 512 * n); + } else { + if ((cluster_offset & 511) != 0) { + goto fail; + } + hd_iov.iov_base = (void *)buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file, + (cluster_offset >> 9) + index_in_cluster, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + break; + } + if (s->crypt_method) { + encrypt_sectors(s, sector_num, buf, buf, + n, 0, + &s->aes_decrypt_key); + } + } + ret = 0; + + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + +done: + qemu_co_mutex_unlock(&s->lock); + + if (qiov->niov > 1) { + qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size); + qemu_vfree(orig_buf); + } + + return ret; + +fail: + ret = -EIO; + goto done; +} + +static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + uint64_t cluster_offset; + const uint8_t *src_buf; + int ret = 0, n; + uint8_t *cluster_data = NULL; + struct iovec hd_iov; + QEMUIOVector hd_qiov; + uint8_t *buf; + void *orig_buf; + + s->cluster_cache_offset = -1; /* disable compressed cache */ + + if (qiov->niov > 1) { + buf = orig_buf = qemu_blockalign(bs, qiov->size); + qemu_iovec_to_buf(qiov, 0, buf, qiov->size); + } else { + orig_buf = NULL; + buf = (uint8_t *)qiov->iov->iov_base; + } + + qemu_co_mutex_lock(&s->lock); + + while (nb_sectors != 0) { + + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0, + index_in_cluster, + index_in_cluster + n); + if (!cluster_offset || (cluster_offset & 511) != 0) { + ret = -EIO; + break; + } + if (s->crypt_method) { + if (!cluster_data) { + cluster_data = g_malloc0(s->cluster_size); + } + encrypt_sectors(s, sector_num, cluster_data, buf, + n, 1, &s->aes_encrypt_key); + src_buf = cluster_data; + } else { + src_buf = buf; + } + + hd_iov.iov_base = (void *)src_buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_writev(bs->file, + (cluster_offset >> 9) + index_in_cluster, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + break; + } + ret = 0; + + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + qemu_co_mutex_unlock(&s->lock); + + if (qiov->niov > 1) { + qemu_vfree(orig_buf); + } + g_free(cluster_data); + + return ret; +} + +static void qcow_close(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + g_free(s->l1_table); + g_free(s->l2_cache); + g_free(s->cluster_cache); + g_free(s->cluster_data); + + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); +} + +static int qcow_create(const char *filename, QEMUOptionParameter *options) +{ + int header_size, backing_filename_len, l1_size, shift, i; + QCowHeader header; + uint8_t *tmp; + int64_t total_size = 0; + const char *backing_file = NULL; + int flags = 0; + int ret; + BlockDriverState *qcow_bs; + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + total_size = options->value.n / 512; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + backing_file = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { + flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; + } + options++; + } + + ret = bdrv_create_file(filename, options); + if (ret < 0) { + return ret; + } + + ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR); + if (ret < 0) { + return ret; + } + + ret = bdrv_truncate(qcow_bs, 0); + if (ret < 0) { + goto exit; + } + + memset(&header, 0, sizeof(header)); + header.magic = cpu_to_be32(QCOW_MAGIC); + header.version = cpu_to_be32(QCOW_VERSION); + header.size = cpu_to_be64(total_size * 512); + header_size = sizeof(header); + backing_filename_len = 0; + if (backing_file) { + if (strcmp(backing_file, "fat:")) { + header.backing_file_offset = cpu_to_be64(header_size); + backing_filename_len = strlen(backing_file); + header.backing_file_size = cpu_to_be32(backing_filename_len); + header_size += backing_filename_len; + } else { + /* special backing file for vvfat */ + backing_file = NULL; + } + header.cluster_bits = 9; /* 512 byte cluster to avoid copying + unmodifyed sectors */ + header.l2_bits = 12; /* 32 KB L2 tables */ + } else { + header.cluster_bits = 12; /* 4 KB clusters */ + header.l2_bits = 9; /* 4 KB L2 tables */ + } + header_size = (header_size + 7) & ~7; + shift = header.cluster_bits + header.l2_bits; + l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift; + + header.l1_table_offset = cpu_to_be64(header_size); + if (flags & BLOCK_FLAG_ENCRYPT) { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + } else { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + } + + /* write all the data */ + ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header)); + if (ret != sizeof(header)) { + goto exit; + } + + if (backing_file) { + ret = bdrv_pwrite(qcow_bs, sizeof(header), + backing_file, backing_filename_len); + if (ret != backing_filename_len) { + goto exit; + } + } + + tmp = g_malloc0(BDRV_SECTOR_SIZE); + for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ + BDRV_SECTOR_SIZE); i++) { + ret = bdrv_pwrite(qcow_bs, header_size + + BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); + if (ret != BDRV_SECTOR_SIZE) { + g_free(tmp); + goto exit; + } + } + + g_free(tmp); + ret = 0; +exit: + bdrv_delete(qcow_bs); + return ret; +} + +static int qcow_make_empty(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + uint32_t l1_length = s->l1_size * sizeof(uint64_t); + int ret; + + memset(s->l1_table, 0, l1_length); + if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table, + l1_length) < 0) + return -1; + ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); + if (ret < 0) + return ret; + + memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t)); + + return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned + tables to avoid losing bytes in alignment */ +static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + z_stream strm; + int ret, out_len; + uint8_t *out_buf; + uint64_t cluster_offset; + + if (nb_sectors != s->cluster_sectors) { + ret = -EINVAL; + + /* Zero-pad last write if image size is not cluster aligned */ + if (sector_num + nb_sectors == bs->total_sectors && + nb_sectors < s->cluster_sectors) { + uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); + memset(pad_buf, 0, s->cluster_size); + memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); + ret = qcow_write_compressed(bs, sector_num, + pad_buf, s->cluster_sectors); + qemu_vfree(pad_buf); + } + return ret; + } + + out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -12, + 9, Z_DEFAULT_STRATEGY); + if (ret != 0) { + ret = -EINVAL; + goto fail; + } + + strm.avail_in = s->cluster_size; + strm.next_in = (uint8_t *)buf; + strm.avail_out = s->cluster_size; + strm.next_out = out_buf; + + ret = deflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END && ret != Z_OK) { + deflateEnd(&strm); + ret = -EINVAL; + goto fail; + } + out_len = strm.next_out - out_buf; + + deflateEnd(&strm); + + if (ret != Z_STREAM_END || out_len >= s->cluster_size) { + /* could not compress: write normal cluster */ + ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); + if (ret < 0) { + goto fail; + } + } else { + cluster_offset = get_cluster_offset(bs, sector_num << 9, 2, + out_len, 0, 0); + if (cluster_offset == 0) { + ret = -EIO; + goto fail; + } + + cluster_offset &= s->cluster_offset_mask; + ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); + if (ret < 0) { + goto fail; + } + } + + ret = 0; +fail: + g_free(out_buf); + return ret; +} + +static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQcowState *s = bs->opaque; + bdi->cluster_size = s->cluster_size; + return 0; +} + + +static QEMUOptionParameter qcow_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_ENCRYPT, + .type = OPT_FLAG, + .help = "Encrypt the image" + }, + { NULL } +}; + +static BlockDriver bdrv_qcow = { + .format_name = "qcow", + .instance_size = sizeof(BDRVQcowState), + .bdrv_probe = qcow_probe, + .bdrv_open = qcow_open, + .bdrv_close = qcow_close, + .bdrv_reopen_prepare = qcow_reopen_prepare, + .bdrv_create = qcow_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + + .bdrv_co_readv = qcow_co_readv, + .bdrv_co_writev = qcow_co_writev, + .bdrv_co_is_allocated = qcow_co_is_allocated, + + .bdrv_set_key = qcow_set_key, + .bdrv_make_empty = qcow_make_empty, + .bdrv_write_compressed = qcow_write_compressed, + .bdrv_get_info = qcow_get_info, + + .create_options = qcow_create_options, +}; + +static void bdrv_qcow_init(void) +{ + bdrv_register(&bdrv_qcow); +} + +block_init(bdrv_qcow_init); diff --git a/contrib/qemu/block/qcow2-cache.c b/contrib/qemu/block/qcow2-cache.c new file mode 100644 index 000000000..2f3114ecc --- /dev/null +++ b/contrib/qemu/block/qcow2-cache.c @@ -0,0 +1,323 @@ +/* + * L2/refcount table cache for the QCOW2 format + * + * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "block/block_int.h" +#include "qemu-common.h" +#include "qcow2.h" +#include "trace.h" + +typedef struct Qcow2CachedTable { + void* table; + int64_t offset; + bool dirty; + int cache_hits; + int ref; +} Qcow2CachedTable; + +struct Qcow2Cache { + Qcow2CachedTable* entries; + struct Qcow2Cache* depends; + int size; + bool depends_on_flush; +}; + +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) +{ + BDRVQcowState *s = bs->opaque; + Qcow2Cache *c; + int i; + + c = g_malloc0(sizeof(*c)); + c->size = num_tables; + c->entries = g_malloc0(sizeof(*c->entries) * num_tables); + + for (i = 0; i < c->size; i++) { + c->entries[i].table = qemu_blockalign(bs, s->cluster_size); + } + + return c; +} + +int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c) +{ + int i; + + for (i = 0; i < c->size; i++) { + assert(c->entries[i].ref == 0); + qemu_vfree(c->entries[i].table); + } + + g_free(c->entries); + g_free(c); + + return 0; +} + +static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c) +{ + int ret; + + ret = qcow2_cache_flush(bs, c->depends); + if (ret < 0) { + return ret; + } + + c->depends = NULL; + c->depends_on_flush = false; + + return 0; +} + +static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) +{ + BDRVQcowState *s = bs->opaque; + int ret = 0; + + if (!c->entries[i].dirty || !c->entries[i].offset) { + return 0; + } + + trace_qcow2_cache_entry_flush(qemu_coroutine_self(), + c == s->l2_table_cache, i); + + if (c->depends) { + ret = qcow2_cache_flush_dependency(bs, c); + } else if (c->depends_on_flush) { + ret = bdrv_flush(bs->file); + if (ret >= 0) { + c->depends_on_flush = false; + } + } + + if (ret < 0) { + return ret; + } + + if (c == s->refcount_block_cache) { + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); + } else if (c == s->l2_table_cache) { + BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); + } + + ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table, + s->cluster_size); + if (ret < 0) { + return ret; + } + + c->entries[i].dirty = false; + + return 0; +} + +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +{ + BDRVQcowState *s = bs->opaque; + int result = 0; + int ret; + int i; + + trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache); + + for (i = 0; i < c->size; i++) { + ret = qcow2_cache_entry_flush(bs, c, i); + if (ret < 0 && result != -ENOSPC) { + result = ret; + } + } + + if (result == 0) { + ret = bdrv_flush(bs->file); + if (ret < 0) { + result = ret; + } + } + + return result; +} + +int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, + Qcow2Cache *dependency) +{ + int ret; + + if (dependency->depends) { + ret = qcow2_cache_flush_dependency(bs, dependency); + if (ret < 0) { + return ret; + } + } + + if (c->depends && (c->depends != dependency)) { + ret = qcow2_cache_flush_dependency(bs, c); + if (ret < 0) { + return ret; + } + } + + c->depends = dependency; + return 0; +} + +void qcow2_cache_depends_on_flush(Qcow2Cache *c) +{ + c->depends_on_flush = true; +} + +static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c) +{ + int i; + int min_count = INT_MAX; + int min_index = -1; + + + for (i = 0; i < c->size; i++) { + if (c->entries[i].ref) { + continue; + } + + if (c->entries[i].cache_hits < min_count) { + min_index = i; + min_count = c->entries[i].cache_hits; + } + + /* Give newer hits priority */ + /* TODO Check how to optimize the replacement strategy */ + c->entries[i].cache_hits /= 2; + } + + if (min_index == -1) { + /* This can't happen in current synchronous code, but leave the check + * here as a reminder for whoever starts using AIO with the cache */ + abort(); + } + return min_index; +} + +static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, + uint64_t offset, void **table, bool read_from_disk) +{ + BDRVQcowState *s = bs->opaque; + int i; + int ret; + + trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache, + offset, read_from_disk); + + /* Check if the table is already cached */ + for (i = 0; i < c->size; i++) { + if (c->entries[i].offset == offset) { + goto found; + } + } + + /* If not, write a table back and replace it */ + i = qcow2_cache_find_entry_to_replace(c); + trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), + c == s->l2_table_cache, i); + if (i < 0) { + return i; + } + + ret = qcow2_cache_entry_flush(bs, c, i); + if (ret < 0) { + return ret; + } + + trace_qcow2_cache_get_read(qemu_coroutine_self(), + c == s->l2_table_cache, i); + c->entries[i].offset = 0; + if (read_from_disk) { + if (c == s->l2_table_cache) { + BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); + } + + ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size); + if (ret < 0) { + return ret; + } + } + + /* Give the table some hits for the start so that it won't be replaced + * immediately. The number 32 is completely arbitrary. */ + c->entries[i].cache_hits = 32; + c->entries[i].offset = offset; + + /* And return the right table */ +found: + c->entries[i].cache_hits++; + c->entries[i].ref++; + *table = c->entries[i].table; + + trace_qcow2_cache_get_done(qemu_coroutine_self(), + c == s->l2_table_cache, i); + + return 0; +} + +int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table) +{ + return qcow2_cache_do_get(bs, c, offset, table, true); +} + +int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table) +{ + return qcow2_cache_do_get(bs, c, offset, table, false); +} + +int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table) +{ + int i; + + for (i = 0; i < c->size; i++) { + if (c->entries[i].table == *table) { + goto found; + } + } + return -ENOENT; + +found: + c->entries[i].ref--; + *table = NULL; + + assert(c->entries[i].ref >= 0); + return 0; +} + +void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table) +{ + int i; + + for (i = 0; i < c->size; i++) { + if (c->entries[i].table == table) { + goto found; + } + } + abort(); + +found: + c->entries[i].dirty = true; +} diff --git a/contrib/qemu/block/qcow2-cluster.c b/contrib/qemu/block/qcow2-cluster.c new file mode 100644 index 000000000..cca76d4fc --- /dev/null +++ b/contrib/qemu/block/qcow2-cluster.c @@ -0,0 +1,1478 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <zlib.h> + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" +#include "trace.h" + +int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, + bool exact_size) +{ + BDRVQcowState *s = bs->opaque; + int new_l1_size2, ret, i; + uint64_t *new_l1_table; + int64_t new_l1_table_offset, new_l1_size; + uint8_t data[12]; + + if (min_size <= s->l1_size) + return 0; + + if (exact_size) { + new_l1_size = min_size; + } else { + /* Bump size up to reduce the number of times we have to grow */ + new_l1_size = s->l1_size; + if (new_l1_size == 0) { + new_l1_size = 1; + } + while (min_size > new_l1_size) { + new_l1_size = (new_l1_size * 3 + 1) / 2; + } + } + + if (new_l1_size > INT_MAX) { + return -EFBIG; + } + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n", + s->l1_size, new_l1_size); +#endif + + new_l1_size2 = sizeof(uint64_t) * new_l1_size; + new_l1_table = g_malloc0(align_offset(new_l1_size2, 512)); + memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); + + /* write new table (align to cluster) */ + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); + new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); + if (new_l1_table_offset < 0) { + g_free(new_l1_table); + return new_l1_table_offset; + } + + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail; + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); + for(i = 0; i < s->l1_size; i++) + new_l1_table[i] = cpu_to_be64(new_l1_table[i]); + ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2); + if (ret < 0) + goto fail; + for(i = 0; i < s->l1_size; i++) + new_l1_table[i] = be64_to_cpu(new_l1_table[i]); + + /* set new table */ + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); + cpu_to_be32w((uint32_t*)data, new_l1_size); + cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data)); + if (ret < 0) { + goto fail; + } + g_free(s->l1_table); + qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); + s->l1_table_offset = new_l1_table_offset; + s->l1_table = new_l1_table; + s->l1_size = new_l1_size; + return 0; + fail: + g_free(new_l1_table); + qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2, + QCOW2_DISCARD_OTHER); + return ret; +} + +/* + * l2_load + * + * Loads a L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns a pointer to the L2 table on success, or NULL if the read from + * the image file failed. + */ + +static int l2_load(BlockDriverState *bs, uint64_t l2_offset, + uint64_t **l2_table) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table); + + return ret; +} + +/* + * Writes one sector of the L1 table to the disk (can't update single entries + * and we really don't want bdrv_pread to perform a read-modify-write) + */ +#define L1_ENTRIES_PER_SECTOR (512 / 8) +static int write_l1_entry(BlockDriverState *bs, int l1_index) +{ + BDRVQcowState *s = bs->opaque; + uint64_t buf[L1_ENTRIES_PER_SECTOR]; + int l1_start_index; + int i, ret; + + l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1); + for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) { + buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index, + buf, sizeof(buf)); + if (ret < 0) { + return ret; + } + + return 0; +} + +/* + * l2_allocate + * + * Allocate a new l2 entry in the file. If l1_index points to an already + * used entry in the L2 table (i.e. we are doing a copy on write for the L2 + * table) copy the contents of the old L2 table into the newly allocated one. + * Otherwise the new table is initialized with zeros. + * + */ + +static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) +{ + BDRVQcowState *s = bs->opaque; + uint64_t old_l2_offset; + uint64_t *l2_table; + int64_t l2_offset; + int ret; + + old_l2_offset = s->l1_table[l1_index]; + + trace_qcow2_l2_allocate(bs, l1_index); + + /* allocate a new l2 entry */ + + l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); + if (l2_offset < 0) { + return l2_offset; + } + + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail; + } + + /* allocate a new entry in the l2 cache */ + + trace_qcow2_l2_allocate_get_empty(bs, l1_index); + ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table); + if (ret < 0) { + return ret; + } + + l2_table = *table; + + if ((old_l2_offset & L1E_OFFSET_MASK) == 0) { + /* if there was no old l2 table, clear the new table */ + memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); + } else { + uint64_t* old_table; + + /* if there was an old l2 table, read it from the disk */ + BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ); + ret = qcow2_cache_get(bs, s->l2_table_cache, + old_l2_offset & L1E_OFFSET_MASK, + (void**) &old_table); + if (ret < 0) { + goto fail; + } + + memcpy(l2_table, old_table, s->cluster_size); + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table); + if (ret < 0) { + goto fail; + } + } + + /* write the l2 table to the file */ + BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); + + trace_qcow2_l2_allocate_write_l2(bs, l1_index); + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret < 0) { + goto fail; + } + + /* update the L1 entry */ + trace_qcow2_l2_allocate_write_l1(bs, l1_index); + s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; + ret = write_l1_entry(bs, l1_index); + if (ret < 0) { + goto fail; + } + + *table = l2_table; + trace_qcow2_l2_allocate_done(bs, l1_index, 0); + return 0; + +fail: + trace_qcow2_l2_allocate_done(bs, l1_index, ret); + qcow2_cache_put(bs, s->l2_table_cache, (void**) table); + s->l1_table[l1_index] = old_l2_offset; + return ret; +} + +/* + * Checks how many clusters in a given L2 table are contiguous in the image + * file. As soon as one of the flags in the bitmask stop_flags changes compared + * to the first cluster, the search is stopped and the cluster is not counted + * as contiguous. (This allows it, for example, to stop at the first compressed + * cluster which may require a different handling) + */ +static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, + uint64_t *l2_table, uint64_t start, uint64_t stop_flags) +{ + int i; + uint64_t mask = stop_flags | L2E_OFFSET_MASK; + uint64_t offset = be64_to_cpu(l2_table[0]) & mask; + + if (!offset) + return 0; + + for (i = start; i < start + nb_clusters; i++) { + uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; + if (offset + (uint64_t) i * cluster_size != l2_entry) { + break; + } + } + + return (i - start); +} + +static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table) +{ + int i; + + for (i = 0; i < nb_clusters; i++) { + int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i])); + + if (type != QCOW2_CLUSTER_UNALLOCATED) { + break; + } + } + + return i; +} + +/* The crypt function is compatible with the linux cryptoloop + algorithm for < 4 GB images. NOTE: out_buf == in_buf is + supported */ +void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, int enc, + const AES_KEY *key) +{ + union { + uint64_t ll[2]; + uint8_t b[16]; + } ivec; + int i; + + for(i = 0; i < nb_sectors; i++) { + ivec.ll[0] = cpu_to_le64(sector_num); + ivec.ll[1] = 0; + AES_cbc_encrypt(in_buf, out_buf, 512, key, + ivec.b, enc); + sector_num++; + in_buf += 512; + out_buf += 512; + } +} + +static int coroutine_fn copy_sectors(BlockDriverState *bs, + uint64_t start_sect, + uint64_t cluster_offset, + int n_start, int n_end) +{ + BDRVQcowState *s = bs->opaque; + QEMUIOVector qiov; + struct iovec iov; + int n, ret; + + /* + * If this is the last cluster and it is only partially used, we must only + * copy until the end of the image, or bdrv_check_request will fail for the + * bdrv_read/write calls below. + */ + if (start_sect + n_end > bs->total_sectors) { + n_end = bs->total_sectors - start_sect; + } + + n = n_end - n_start; + if (n <= 0) { + return 0; + } + + iov.iov_len = n * BDRV_SECTOR_SIZE; + iov.iov_base = qemu_blockalign(bs, iov.iov_len); + + qemu_iovec_init_external(&qiov, &iov, 1); + + BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); + + /* Call .bdrv_co_readv() directly instead of using the public block-layer + * interface. This avoids double I/O throttling and request tracking, + * which can lead to deadlock when block layer copy-on-read is enabled. + */ + ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov); + if (ret < 0) { + goto out; + } + + if (s->crypt_method) { + qcow2_encrypt_sectors(s, start_sect + n_start, + iov.iov_base, iov.iov_base, n, 1, + &s->aes_encrypt_key); + } + + BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); + ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov); + if (ret < 0) { + goto out; + } + + ret = 0; +out: + qemu_vfree(iov.iov_base); + return ret; +} + + +/* + * get_cluster_offset + * + * For a given offset of the disk image, find the cluster offset in + * qcow2 file. The offset is stored in *cluster_offset. + * + * on entry, *num is the number of contiguous sectors we'd like to + * access following offset. + * + * on exit, *num is the number of contiguous sectors we can read. + * + * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error + * cases. + */ +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, + int *num, uint64_t *cluster_offset) +{ + BDRVQcowState *s = bs->opaque; + unsigned int l2_index; + uint64_t l1_index, l2_offset, *l2_table; + int l1_bits, c; + unsigned int index_in_cluster, nb_clusters; + uint64_t nb_available, nb_needed; + int ret; + + index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1); + nb_needed = *num + index_in_cluster; + + l1_bits = s->l2_bits + s->cluster_bits; + + /* compute how many bytes there are between the offset and + * the end of the l1 entry + */ + + nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)); + + /* compute the number of available sectors */ + + nb_available = (nb_available >> 9) + index_in_cluster; + + if (nb_needed > nb_available) { + nb_needed = nb_available; + } + + *cluster_offset = 0; + + /* seek the the l2 offset in the l1 table */ + + l1_index = offset >> l1_bits; + if (l1_index >= s->l1_size) { + ret = QCOW2_CLUSTER_UNALLOCATED; + goto out; + } + + l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; + if (!l2_offset) { + ret = QCOW2_CLUSTER_UNALLOCATED; + goto out; + } + + /* load the l2 table in memory */ + + ret = l2_load(bs, l2_offset, &l2_table); + if (ret < 0) { + return ret; + } + + /* find the cluster offset for the given disk offset */ + + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + *cluster_offset = be64_to_cpu(l2_table[l2_index]); + nb_clusters = size_to_clusters(s, nb_needed << 9); + + ret = qcow2_get_cluster_type(*cluster_offset); + switch (ret) { + case QCOW2_CLUSTER_COMPRESSED: + /* Compressed clusters can only be processed one by one */ + c = 1; + *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK; + break; + case QCOW2_CLUSTER_ZERO: + if (s->qcow_version < 3) { + return -EIO; + } + c = count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], 0, + QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); + *cluster_offset = 0; + break; + case QCOW2_CLUSTER_UNALLOCATED: + /* how many empty clusters ? */ + c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]); + *cluster_offset = 0; + break; + case QCOW2_CLUSTER_NORMAL: + /* how many allocated clusters ? */ + c = count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], 0, + QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); + *cluster_offset &= L2E_OFFSET_MASK; + break; + default: + abort(); + } + + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + + nb_available = (c * s->cluster_sectors); + +out: + if (nb_available > nb_needed) + nb_available = nb_needed; + + *num = nb_available - index_in_cluster; + + return ret; +} + +/* + * get_cluster_table + * + * for a given disk offset, load (and allocate if needed) + * the l2 table. + * + * the l2 table offset in the qcow2 file and the cluster index + * in the l2 table are given to the caller. + * + * Returns 0 on success, -errno in failure case + */ +static int get_cluster_table(BlockDriverState *bs, uint64_t offset, + uint64_t **new_l2_table, + int *new_l2_index) +{ + BDRVQcowState *s = bs->opaque; + unsigned int l2_index; + uint64_t l1_index, l2_offset; + uint64_t *l2_table = NULL; + int ret; + + /* seek the the l2 offset in the l1 table */ + + l1_index = offset >> (s->l2_bits + s->cluster_bits); + if (l1_index >= s->l1_size) { + ret = qcow2_grow_l1_table(bs, l1_index + 1, false); + if (ret < 0) { + return ret; + } + } + + assert(l1_index < s->l1_size); + l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; + + /* seek the l2 table of the given l2 offset */ + + if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) { + /* load the l2 table in memory */ + ret = l2_load(bs, l2_offset, &l2_table); + if (ret < 0) { + return ret; + } + } else { + /* First allocate a new L2 table (and do COW if needed) */ + ret = l2_allocate(bs, l1_index, &l2_table); + if (ret < 0) { + return ret; + } + + /* Then decrease the refcount of the old table */ + if (l2_offset) { + qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); + } + } + + /* find the cluster offset for the given disk offset */ + + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + + *new_l2_table = l2_table; + *new_l2_index = l2_index; + + return 0; +} + +/* + * alloc_compressed_cluster_offset + * + * For a given offset of the disk image, return cluster offset in + * qcow2 file. + * + * If the offset is not found, allocate a new compressed cluster. + * + * Return the cluster offset if successful, + * Return 0, otherwise. + * + */ + +uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, + uint64_t offset, + int compressed_size) +{ + BDRVQcowState *s = bs->opaque; + int l2_index, ret; + uint64_t *l2_table; + int64_t cluster_offset; + int nb_csectors; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_index); + if (ret < 0) { + return 0; + } + + /* Compression can't overwrite anything. Fail if the cluster was already + * allocated. */ + cluster_offset = be64_to_cpu(l2_table[l2_index]); + if (cluster_offset & L2E_OFFSET_MASK) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + return 0; + } + + cluster_offset = qcow2_alloc_bytes(bs, compressed_size); + if (cluster_offset < 0) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + return 0; + } + + nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) - + (cluster_offset >> 9); + + cluster_offset |= QCOW_OFLAG_COMPRESSED | + ((uint64_t)nb_csectors << s->csize_shift); + + /* update L2 table */ + + /* compressed clusters never have the copied flag */ + + BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + l2_table[l2_index] = cpu_to_be64(cluster_offset); + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + return 0; + } + + return cluster_offset; +} + +static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + if (r->nb_sectors == 0) { + return 0; + } + + qemu_co_mutex_unlock(&s->lock); + ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset, + r->offset / BDRV_SECTOR_SIZE, + r->offset / BDRV_SECTOR_SIZE + r->nb_sectors); + qemu_co_mutex_lock(&s->lock); + + if (ret < 0) { + return ret; + } + + /* + * Before we update the L2 table to actually point to the new cluster, we + * need to be sure that the refcounts have been increased and COW was + * handled. + */ + qcow2_cache_depends_on_flush(s->l2_table_cache); + + return 0; +} + +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) +{ + BDRVQcowState *s = bs->opaque; + int i, j = 0, l2_index, ret; + uint64_t *old_cluster, *l2_table; + uint64_t cluster_offset = m->alloc_offset; + + trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); + assert(m->nb_clusters > 0); + + old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t)); + + /* copy content of unmodified sectors */ + ret = perform_cow(bs, m, &m->cow_start); + if (ret < 0) { + goto err; + } + + ret = perform_cow(bs, m, &m->cow_end); + if (ret < 0) { + goto err; + } + + /* Update L2 table. */ + if (s->use_lazy_refcounts) { + qcow2_mark_dirty(bs); + } + if (qcow2_need_accurate_refcounts(s)) { + qcow2_cache_set_dependency(bs, s->l2_table_cache, + s->refcount_block_cache); + } + + ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index); + if (ret < 0) { + goto err; + } + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + + for (i = 0; i < m->nb_clusters; i++) { + /* if two concurrent writes happen to the same unallocated cluster + * each write allocates separate cluster and writes data concurrently. + * The first one to complete updates l2 table with pointer to its + * cluster the second one has to do RMW (which is done above by + * copy_sectors()), update l2 table with its cluster pointer and free + * old cluster. This is what this loop does */ + if(l2_table[l2_index + i] != 0) + old_cluster[j++] = l2_table[l2_index + i]; + + l2_table[l2_index + i] = cpu_to_be64((cluster_offset + + (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); + } + + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + goto err; + } + + /* + * If this was a COW, we need to decrease the refcount of the old cluster. + * Also flush bs->file to get the right order for L2 and refcount update. + * + * Don't discard clusters that reach a refcount of 0 (e.g. compressed + * clusters), the next write will reuse them anyway. + */ + if (j != 0) { + for (i = 0; i < j; i++) { + qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1, + QCOW2_DISCARD_NEVER); + } + } + + ret = 0; +err: + g_free(old_cluster); + return ret; + } + +/* + * Returns the number of contiguous clusters that can be used for an allocating + * write, but require COW to be performed (this includes yet unallocated space, + * which must copy from the backing file) + */ +static int count_cow_clusters(BDRVQcowState *s, int nb_clusters, + uint64_t *l2_table, int l2_index) +{ + int i; + + for (i = 0; i < nb_clusters; i++) { + uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]); + int cluster_type = qcow2_get_cluster_type(l2_entry); + + switch(cluster_type) { + case QCOW2_CLUSTER_NORMAL: + if (l2_entry & QCOW_OFLAG_COPIED) { + goto out; + } + break; + case QCOW2_CLUSTER_UNALLOCATED: + case QCOW2_CLUSTER_COMPRESSED: + case QCOW2_CLUSTER_ZERO: + break; + default: + abort(); + } + } + +out: + assert(i <= nb_clusters); + return i; +} + +/* + * Check if there already is an AIO write request in flight which allocates + * the same cluster. In this case we need to wait until the previous + * request has completed and updated the L2 table accordingly. + * + * Returns: + * 0 if there was no dependency. *cur_bytes indicates the number of + * bytes from guest_offset that can be read before the next + * dependency must be processed (or the request is complete) + * + * -EAGAIN if we had to wait for another request, previously gathered + * information on cluster allocation may be invalid now. The caller + * must start over anyway, so consider *cur_bytes undefined. + */ +static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *cur_bytes, QCowL2Meta **m) +{ + BDRVQcowState *s = bs->opaque; + QCowL2Meta *old_alloc; + uint64_t bytes = *cur_bytes; + + QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { + + uint64_t start = guest_offset; + uint64_t end = start + bytes; + uint64_t old_start = l2meta_cow_start(old_alloc); + uint64_t old_end = l2meta_cow_end(old_alloc); + + if (end <= old_start || start >= old_end) { + /* No intersection */ + } else { + if (start < old_start) { + /* Stop at the start of a running allocation */ + bytes = old_start - start; + } else { + bytes = 0; + } + + /* Stop if already an l2meta exists. After yielding, it wouldn't + * be valid any more, so we'd have to clean up the old L2Metas + * and deal with requests depending on them before starting to + * gather new ones. Not worth the trouble. */ + if (bytes == 0 && *m) { + *cur_bytes = 0; + return 0; + } + + if (bytes == 0) { + /* Wait for the dependency to complete. We need to recheck + * the free/allocated clusters when we continue. */ + qemu_co_mutex_unlock(&s->lock); + qemu_co_queue_wait(&old_alloc->dependent_requests); + qemu_co_mutex_lock(&s->lock); + return -EAGAIN; + } + } + } + + /* Make sure that existing clusters and new allocations are only used up to + * the next dependency if we shortened the request above */ + *cur_bytes = bytes; + + return 0; +} + +/* + * Checks how many already allocated clusters that don't require a copy on + * write there are at the given guest_offset (up to *bytes). If + * *host_offset is not zero, only physically contiguous clusters beginning at + * this host offset are counted. + * + * Note that guest_offset may not be cluster aligned. In this case, the + * returned *host_offset points to exact byte referenced by guest_offset and + * therefore isn't cluster aligned as well. + * + * Returns: + * 0: if no allocated clusters are available at the given offset. + * *bytes is normally unchanged. It is set to 0 if the cluster + * is allocated and doesn't need COW, but doesn't have the right + * physical offset. + * + * 1: if allocated clusters that don't require a COW are available at + * the requested offset. *bytes may have decreased and describes + * the length of the area that can be written to. + * + * -errno: in error cases + */ +static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) +{ + BDRVQcowState *s = bs->opaque; + int l2_index; + uint64_t cluster_offset; + uint64_t *l2_table; + unsigned int nb_clusters; + unsigned int keep_clusters; + int ret, pret; + + trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset, + *bytes); + + assert(*host_offset == 0 || offset_into_cluster(s, guest_offset) + == offset_into_cluster(s, *host_offset)); + + /* + * Calculate the number of clusters to look for. We stop at L2 table + * boundaries to keep things simple. + */ + nb_clusters = + size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); + + l2_index = offset_to_l2_index(s, guest_offset); + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + + /* Find L2 entry for the first involved cluster */ + ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + cluster_offset = be64_to_cpu(l2_table[l2_index]); + + /* Check how many clusters are already allocated and don't need COW */ + if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL + && (cluster_offset & QCOW_OFLAG_COPIED)) + { + /* If a specific host_offset is required, check it */ + bool offset_matches = + (cluster_offset & L2E_OFFSET_MASK) == *host_offset; + + if (*host_offset != 0 && !offset_matches) { + *bytes = 0; + ret = 0; + goto out; + } + + /* We keep all QCOW_OFLAG_COPIED clusters */ + keep_clusters = + count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], 0, + QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); + assert(keep_clusters <= nb_clusters); + + *bytes = MIN(*bytes, + keep_clusters * s->cluster_size + - offset_into_cluster(s, guest_offset)); + + ret = 1; + } else { + ret = 0; + } + + /* Cleanup */ +out: + pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (pret < 0) { + return pret; + } + + /* Only return a host offset if we actually made progress. Otherwise we + * would make requirements for handle_alloc() that it can't fulfill */ + if (ret) { + *host_offset = (cluster_offset & L2E_OFFSET_MASK) + + offset_into_cluster(s, guest_offset); + } + + return ret; +} + +/* + * Allocates new clusters for the given guest_offset. + * + * At most *nb_clusters are allocated, and on return *nb_clusters is updated to + * contain the number of clusters that have been allocated and are contiguous + * in the image file. + * + * If *host_offset is non-zero, it specifies the offset in the image file at + * which the new clusters must start. *nb_clusters can be 0 on return in this + * case if the cluster at host_offset is already in use. If *host_offset is + * zero, the clusters can be allocated anywhere in the image file. + * + * *host_offset is updated to contain the offset into the image file at which + * the first allocated cluster starts. + * + * Return 0 on success and -errno in error cases. -EAGAIN means that the + * function has been waiting for another request and the allocation must be + * restarted, but the whole request should not be failed. + */ +static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *host_offset, unsigned int *nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + + trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, + *host_offset, *nb_clusters); + + /* Allocate new clusters */ + trace_qcow2_cluster_alloc_phys(qemu_coroutine_self()); + if (*host_offset == 0) { + int64_t cluster_offset = + qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size); + if (cluster_offset < 0) { + return cluster_offset; + } + *host_offset = cluster_offset; + return 0; + } else { + int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); + if (ret < 0) { + return ret; + } + *nb_clusters = ret; + return 0; + } +} + +/* + * Allocates new clusters for an area that either is yet unallocated or needs a + * copy on write. If *host_offset is non-zero, clusters are only allocated if + * the new allocation can match the specified host offset. + * + * Note that guest_offset may not be cluster aligned. In this case, the + * returned *host_offset points to exact byte referenced by guest_offset and + * therefore isn't cluster aligned as well. + * + * Returns: + * 0: if no clusters could be allocated. *bytes is set to 0, + * *host_offset is left unchanged. + * + * 1: if new clusters were allocated. *bytes may be decreased if the + * new allocation doesn't cover all of the requested area. + * *host_offset is updated to contain the host offset of the first + * newly allocated cluster. + * + * -errno: in error cases + */ +static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) +{ + BDRVQcowState *s = bs->opaque; + int l2_index; + uint64_t *l2_table; + uint64_t entry; + unsigned int nb_clusters; + int ret; + + uint64_t alloc_cluster_offset; + + trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset, + *bytes); + assert(*bytes > 0); + + /* + * Calculate the number of clusters to look for. We stop at L2 table + * boundaries to keep things simple. + */ + nb_clusters = + size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); + + l2_index = offset_to_l2_index(s, guest_offset); + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + + /* Find L2 entry for the first involved cluster */ + ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + entry = be64_to_cpu(l2_table[l2_index]); + + /* For the moment, overwrite compressed clusters one by one */ + if (entry & QCOW_OFLAG_COMPRESSED) { + nb_clusters = 1; + } else { + nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index); + } + + /* This function is only called when there were no non-COW clusters, so if + * we can't find any unallocated or COW clusters either, something is + * wrong with our code. */ + assert(nb_clusters > 0); + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + return ret; + } + + /* Allocate, if necessary at a given offset in the image file */ + alloc_cluster_offset = start_of_cluster(s, *host_offset); + ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, + &nb_clusters); + if (ret < 0) { + goto fail; + } + + /* Can't extend contiguous allocation */ + if (nb_clusters == 0) { + *bytes = 0; + return 0; + } + + /* + * Save info needed for meta data update. + * + * requested_sectors: Number of sectors from the start of the first + * newly allocated cluster to the end of the (possibly shortened + * before) write request. + * + * avail_sectors: Number of sectors from the start of the first + * newly allocated to the end of the last newly allocated cluster. + * + * nb_sectors: The number of sectors from the start of the first + * newly allocated cluster to the end of the area that the write + * request actually writes to (excluding COW at the end) + */ + int requested_sectors = + (*bytes + offset_into_cluster(s, guest_offset)) + >> BDRV_SECTOR_BITS; + int avail_sectors = nb_clusters + << (s->cluster_bits - BDRV_SECTOR_BITS); + int alloc_n_start = offset_into_cluster(s, guest_offset) + >> BDRV_SECTOR_BITS; + int nb_sectors = MIN(requested_sectors, avail_sectors); + QCowL2Meta *old_m = *m; + + *m = g_malloc0(sizeof(**m)); + + **m = (QCowL2Meta) { + .next = old_m, + + .alloc_offset = alloc_cluster_offset, + .offset = start_of_cluster(s, guest_offset), + .nb_clusters = nb_clusters, + .nb_available = nb_sectors, + + .cow_start = { + .offset = 0, + .nb_sectors = alloc_n_start, + }, + .cow_end = { + .offset = nb_sectors * BDRV_SECTOR_SIZE, + .nb_sectors = avail_sectors - nb_sectors, + }, + }; + qemu_co_queue_init(&(*m)->dependent_requests); + QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); + + *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); + *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE) + - offset_into_cluster(s, guest_offset)); + assert(*bytes != 0); + + return 1; + +fail: + if (*m && (*m)->nb_clusters > 0) { + QLIST_REMOVE(*m, next_in_flight); + } + return ret; +} + +/* + * alloc_cluster_offset + * + * For a given offset on the virtual disk, find the cluster offset in qcow2 + * file. If the offset is not found, allocate a new cluster. + * + * If the cluster was already allocated, m->nb_clusters is set to 0 and + * other fields in m are meaningless. + * + * If the cluster is newly allocated, m->nb_clusters is set to the number of + * contiguous clusters that have been allocated. In this case, the other + * fields of m are valid and contain information about the first allocated + * cluster. + * + * If the request conflicts with another write request in flight, the coroutine + * is queued and will be reentered when the dependency has completed. + * + * Return 0 on success and -errno in error cases + */ +int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, + int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m) +{ + BDRVQcowState *s = bs->opaque; + uint64_t start, remaining; + uint64_t cluster_offset; + uint64_t cur_bytes; + int ret; + + trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, + n_start, n_end); + + assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset)); + offset = start_of_cluster(s, offset); + +again: + start = offset + (n_start << BDRV_SECTOR_BITS); + remaining = (n_end - n_start) << BDRV_SECTOR_BITS; + cluster_offset = 0; + *host_offset = 0; + cur_bytes = 0; + *m = NULL; + + while (true) { + + if (!*host_offset) { + *host_offset = start_of_cluster(s, cluster_offset); + } + + assert(remaining >= cur_bytes); + + start += cur_bytes; + remaining -= cur_bytes; + cluster_offset += cur_bytes; + + if (remaining == 0) { + break; + } + + cur_bytes = remaining; + + /* + * Now start gathering as many contiguous clusters as possible: + * + * 1. Check for overlaps with in-flight allocations + * + * a) Overlap not in the first cluster -> shorten this request and + * let the caller handle the rest in its next loop iteration. + * + * b) Real overlaps of two requests. Yield and restart the search + * for contiguous clusters (the situation could have changed + * while we were sleeping) + * + * c) TODO: Request starts in the same cluster as the in-flight + * allocation ends. Shorten the COW of the in-fight allocation, + * set cluster_offset to write to the same cluster and set up + * the right synchronisation between the in-flight request and + * the new one. + */ + ret = handle_dependencies(bs, start, &cur_bytes, m); + if (ret == -EAGAIN) { + /* Currently handle_dependencies() doesn't yield if we already had + * an allocation. If it did, we would have to clean up the L2Meta + * structs before starting over. */ + assert(*m == NULL); + goto again; + } else if (ret < 0) { + return ret; + } else if (cur_bytes == 0) { + break; + } else { + /* handle_dependencies() may have decreased cur_bytes (shortened + * the allocations below) so that the next dependency is processed + * correctly during the next loop iteration. */ + } + + /* + * 2. Count contiguous COPIED clusters. + */ + ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m); + if (ret < 0) { + return ret; + } else if (ret) { + continue; + } else if (cur_bytes == 0) { + break; + } + + /* + * 3. If the request still hasn't completed, allocate new clusters, + * considering any cluster_offset of steps 1c or 2. + */ + ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m); + if (ret < 0) { + return ret; + } else if (ret) { + continue; + } else { + assert(cur_bytes == 0); + break; + } + } + + *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS); + assert(*num > 0); + assert(*host_offset != 0); + + return 0; +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, + const uint8_t *buf, int buf_size) +{ + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (uint8_t *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + out_len != out_buf_size) { + inflateEnd(strm); + return -1; + } + inflateEnd(strm); + return 0; +} + +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +{ + BDRVQcowState *s = bs->opaque; + int ret, csize, nb_csectors, sector_offset; + uint64_t coffset; + + coffset = cluster_offset & s->cluster_offset_mask; + if (s->cluster_cache_offset != coffset) { + nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; + sector_offset = coffset & 511; + csize = nb_csectors * 512 - sector_offset; + BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); + ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors); + if (ret < 0) { + return ret; + } + if (decompress_buffer(s->cluster_cache, s->cluster_size, + s->cluster_data + sector_offset, csize) < 0) { + return -EIO; + } + s->cluster_cache_offset = coffset; + } + return 0; +} + +/* + * This discards as many clusters of nb_clusters as possible at once (i.e. + * all clusters in the same L2 table) and returns the number of discarded + * clusters. + */ +static int discard_single_l2(BlockDriverState *bs, uint64_t offset, + unsigned int nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l2_table; + int l2_index; + int ret; + int i; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + /* Limit nb_clusters to one L2 table */ + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + + for (i = 0; i < nb_clusters; i++) { + uint64_t old_offset; + + old_offset = be64_to_cpu(l2_table[l2_index + i]); + if ((old_offset & L2E_OFFSET_MASK) == 0) { + continue; + } + + /* First remove L2 entries */ + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + l2_table[l2_index + i] = cpu_to_be64(0); + + /* Then decrease the refcount */ + qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); + } + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + return ret; + } + + return nb_clusters; +} + +int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, + int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + uint64_t end_offset; + unsigned int nb_clusters; + int ret; + + end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS); + + /* Round start up and end down */ + offset = align_offset(offset, s->cluster_size); + end_offset &= ~(s->cluster_size - 1); + + if (offset > end_offset) { + return 0; + } + + nb_clusters = size_to_clusters(s, end_offset - offset); + + s->cache_discards = true; + + /* Each L2 table is handled by its own loop iteration */ + while (nb_clusters > 0) { + ret = discard_single_l2(bs, offset, nb_clusters); + if (ret < 0) { + goto fail; + } + + nb_clusters -= ret; + offset += (ret * s->cluster_size); + } + + ret = 0; +fail: + s->cache_discards = false; + qcow2_process_discards(bs, ret); + + return ret; +} + +/* + * This zeroes as many clusters of nb_clusters as possible at once (i.e. + * all clusters in the same L2 table) and returns the number of zeroed + * clusters. + */ +static int zero_single_l2(BlockDriverState *bs, uint64_t offset, + unsigned int nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l2_table; + int l2_index; + int ret; + int i; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + /* Limit nb_clusters to one L2 table */ + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + + for (i = 0; i < nb_clusters; i++) { + uint64_t old_offset; + + old_offset = be64_to_cpu(l2_table[l2_index + i]); + + /* Update L2 entries */ + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + if (old_offset & QCOW_OFLAG_COMPRESSED) { + l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); + qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); + } else { + l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO); + } + } + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + return ret; + } + + return nb_clusters; +} + +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + unsigned int nb_clusters; + int ret; + + /* The zero flag is only supported by version 3 and newer */ + if (s->qcow_version < 3) { + return -ENOTSUP; + } + + /* Each L2 table is handled by its own loop iteration */ + nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS); + + s->cache_discards = true; + + while (nb_clusters > 0) { + ret = zero_single_l2(bs, offset, nb_clusters); + if (ret < 0) { + goto fail; + } + + nb_clusters -= ret; + offset += (ret * s->cluster_size); + } + + ret = 0; +fail: + s->cache_discards = false; + qcow2_process_discards(bs, ret); + + return ret; +} diff --git a/contrib/qemu/block/qcow2-refcount.c b/contrib/qemu/block/qcow2-refcount.c new file mode 100644 index 000000000..1244693f3 --- /dev/null +++ b/contrib/qemu/block/qcow2-refcount.c @@ -0,0 +1,1374 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" + +static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size); +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, + int64_t offset, int64_t length, + int addend, enum qcow2_discard_type type); + + +/*********************************************************/ +/* refcount handling */ + +int qcow2_refcount_init(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int ret, refcount_table_size2, i; + + refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); + s->refcount_table = g_malloc(refcount_table_size2); + if (s->refcount_table_size > 0) { + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); + ret = bdrv_pread(bs->file, s->refcount_table_offset, + s->refcount_table, refcount_table_size2); + if (ret != refcount_table_size2) + goto fail; + for(i = 0; i < s->refcount_table_size; i++) + be64_to_cpus(&s->refcount_table[i]); + } + return 0; + fail: + return -ENOMEM; +} + +void qcow2_refcount_close(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + g_free(s->refcount_table); +} + + +static int load_refcount_block(BlockDriverState *bs, + int64_t refcount_block_offset, + void **refcount_block) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD); + ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, + refcount_block); + + return ret; +} + +/* + * Returns the refcount of the cluster given by its index. Any non-negative + * return value is the refcount of the cluster, negative values are -errno + * and indicate an error. + */ +static int get_refcount(BlockDriverState *bs, int64_t cluster_index) +{ + BDRVQcowState *s = bs->opaque; + int refcount_table_index, block_index; + int64_t refcount_block_offset; + int ret; + uint16_t *refcount_block; + uint16_t refcount; + + refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); + if (refcount_table_index >= s->refcount_table_size) + return 0; + refcount_block_offset = s->refcount_table[refcount_table_index]; + if (!refcount_block_offset) + return 0; + + ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, + (void**) &refcount_block); + if (ret < 0) { + return ret; + } + + block_index = cluster_index & + ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); + refcount = be16_to_cpu(refcount_block[block_index]); + + ret = qcow2_cache_put(bs, s->refcount_block_cache, + (void**) &refcount_block); + if (ret < 0) { + return ret; + } + + return refcount; +} + +/* + * Rounds the refcount table size up to avoid growing the table for each single + * refcount block that is allocated. + */ +static unsigned int next_refcount_table_size(BDRVQcowState *s, + unsigned int min_size) +{ + unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1; + unsigned int refcount_table_clusters = + MAX(1, s->refcount_table_size >> (s->cluster_bits - 3)); + + while (min_clusters > refcount_table_clusters) { + refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2; + } + + return refcount_table_clusters << (s->cluster_bits - 3); +} + + +/* Checks if two offsets are described by the same refcount block */ +static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a, + uint64_t offset_b) +{ + uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT); + uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT); + + return (block_a == block_b); +} + +/* + * Loads a refcount block. If it doesn't exist yet, it is allocated first + * (including growing the refcount table if needed). + * + * Returns 0 on success or -errno in error case + */ +static int alloc_refcount_block(BlockDriverState *bs, + int64_t cluster_index, uint16_t **refcount_block) +{ + BDRVQcowState *s = bs->opaque; + unsigned int refcount_table_index; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); + + /* Find the refcount block for the given cluster */ + refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); + + if (refcount_table_index < s->refcount_table_size) { + + uint64_t refcount_block_offset = + s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; + + /* If it's already there, we're done */ + if (refcount_block_offset) { + return load_refcount_block(bs, refcount_block_offset, + (void**) refcount_block); + } + } + + /* + * If we came here, we need to allocate something. Something is at least + * a cluster for the new refcount block. It may also include a new refcount + * table if the old refcount table is too small. + * + * Note that allocating clusters here needs some special care: + * + * - We can't use the normal qcow2_alloc_clusters(), it would try to + * increase the refcount and very likely we would end up with an endless + * recursion. Instead we must place the refcount blocks in a way that + * they can describe them themselves. + * + * - We need to consider that at this point we are inside update_refcounts + * and doing the initial refcount increase. This means that some clusters + * have already been allocated by the caller, but their refcount isn't + * accurate yet. free_cluster_index tells us where this allocation ends + * as long as we don't overwrite it by freeing clusters. + * + * - alloc_clusters_noref and qcow2_free_clusters may load a different + * refcount block into the cache + */ + + *refcount_block = NULL; + + /* We write to the refcount table, so we might depend on L2 tables */ + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret < 0) { + return ret; + } + + /* Allocate the refcount block itself and mark it as used */ + int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); + if (new_block < 0) { + return new_block; + } + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64 + " at %" PRIx64 "\n", + refcount_table_index, cluster_index << s->cluster_bits, new_block); +#endif + + if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) { + /* Zero the new refcount block before updating it */ + ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, + (void**) refcount_block); + if (ret < 0) { + goto fail_block; + } + + memset(*refcount_block, 0, s->cluster_size); + + /* The block describes itself, need to update the cache */ + int block_index = (new_block >> s->cluster_bits) & + ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); + (*refcount_block)[block_index] = cpu_to_be16(1); + } else { + /* Described somewhere else. This can recurse at most twice before we + * arrive at a block that describes itself. */ + ret = update_refcount(bs, new_block, s->cluster_size, 1, + QCOW2_DISCARD_NEVER); + if (ret < 0) { + goto fail_block; + } + + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail_block; + } + + /* Initialize the new refcount block only after updating its refcount, + * update_refcount uses the refcount cache itself */ + ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, + (void**) refcount_block); + if (ret < 0) { + goto fail_block; + } + + memset(*refcount_block, 0, s->cluster_size); + } + + /* Now the new refcount block needs to be written to disk */ + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE); + qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block); + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail_block; + } + + /* If the refcount table is big enough, just hook the block up there */ + if (refcount_table_index < s->refcount_table_size) { + uint64_t data64 = cpu_to_be64(new_block); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); + ret = bdrv_pwrite_sync(bs->file, + s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), + &data64, sizeof(data64)); + if (ret < 0) { + goto fail_block; + } + + s->refcount_table[refcount_table_index] = new_block; + return 0; + } + + ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); + if (ret < 0) { + goto fail_block; + } + + /* + * If we come here, we need to grow the refcount table. Again, a new + * refcount table needs some space and we can't simply allocate to avoid + * endless recursion. + * + * Therefore let's grab new refcount blocks at the end of the image, which + * will describe themselves and the new refcount table. This way we can + * reference them only in the new table and do the switch to the new + * refcount table at once without producing an inconsistent state in + * between. + */ + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW); + + /* Calculate the number of refcount blocks needed so far */ + uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT); + uint64_t blocks_used = (s->free_cluster_index + + refcount_block_clusters - 1) / refcount_block_clusters; + + /* And now we need at least one block more for the new metadata */ + uint64_t table_size = next_refcount_table_size(s, blocks_used + 1); + uint64_t last_table_size; + uint64_t blocks_clusters; + do { + uint64_t table_clusters = + size_to_clusters(s, table_size * sizeof(uint64_t)); + blocks_clusters = 1 + + ((table_clusters + refcount_block_clusters - 1) + / refcount_block_clusters); + uint64_t meta_clusters = table_clusters + blocks_clusters; + + last_table_size = table_size; + table_size = next_refcount_table_size(s, blocks_used + + ((meta_clusters + refcount_block_clusters - 1) + / refcount_block_clusters)); + + } while (last_table_size != table_size); + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n", + s->refcount_table_size, table_size); +#endif + + /* Create the new refcount table and blocks */ + uint64_t meta_offset = (blocks_used * refcount_block_clusters) * + s->cluster_size; + uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size; + uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size); + uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t)); + + assert(meta_offset >= (s->free_cluster_index * s->cluster_size)); + + /* Fill the new refcount table */ + memcpy(new_table, s->refcount_table, + s->refcount_table_size * sizeof(uint64_t)); + new_table[refcount_table_index] = new_block; + + int i; + for (i = 0; i < blocks_clusters; i++) { + new_table[blocks_used + i] = meta_offset + (i * s->cluster_size); + } + + /* Fill the refcount blocks */ + uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t)); + int block = 0; + for (i = 0; i < table_clusters + blocks_clusters; i++) { + new_blocks[block++] = cpu_to_be16(1); + } + + /* Write refcount blocks to disk */ + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); + ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks, + blocks_clusters * s->cluster_size); + g_free(new_blocks); + if (ret < 0) { + goto fail_table; + } + + /* Write refcount table to disk */ + for(i = 0; i < table_size; i++) { + cpu_to_be64s(&new_table[i]); + } + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); + ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, + table_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail_table; + } + + for(i = 0; i < table_size; i++) { + be64_to_cpus(&new_table[i]); + } + + /* Hook up the new refcount table in the qcow2 header */ + uint8_t data[12]; + cpu_to_be64w((uint64_t*)data, table_offset); + cpu_to_be32w((uint32_t*)(data + 8), table_clusters); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset), + data, sizeof(data)); + if (ret < 0) { + goto fail_table; + } + + /* And switch it in memory */ + uint64_t old_table_offset = s->refcount_table_offset; + uint64_t old_table_size = s->refcount_table_size; + + g_free(s->refcount_table); + s->refcount_table = new_table; + s->refcount_table_size = table_size; + s->refcount_table_offset = table_offset; + + /* Free old table. Remember, we must not change free_cluster_index */ + uint64_t old_free_cluster_index = s->free_cluster_index; + qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); + s->free_cluster_index = old_free_cluster_index; + + ret = load_refcount_block(bs, new_block, (void**) refcount_block); + if (ret < 0) { + return ret; + } + + return 0; + +fail_table: + g_free(new_table); +fail_block: + if (*refcount_block != NULL) { + qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); + } + return ret; +} + +void qcow2_process_discards(BlockDriverState *bs, int ret) +{ + BDRVQcowState *s = bs->opaque; + Qcow2DiscardRegion *d, *next; + + QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) { + QTAILQ_REMOVE(&s->discards, d, next); + + /* Discard is optional, ignore the return value */ + if (ret >= 0) { + bdrv_discard(bs->file, + d->offset >> BDRV_SECTOR_BITS, + d->bytes >> BDRV_SECTOR_BITS); + } + + g_free(d); + } +} + +static void update_refcount_discard(BlockDriverState *bs, + uint64_t offset, uint64_t length) +{ + BDRVQcowState *s = bs->opaque; + Qcow2DiscardRegion *d, *p, *next; + + QTAILQ_FOREACH(d, &s->discards, next) { + uint64_t new_start = MIN(offset, d->offset); + uint64_t new_end = MAX(offset + length, d->offset + d->bytes); + + if (new_end - new_start <= length + d->bytes) { + /* There can't be any overlap, areas ending up here have no + * references any more and therefore shouldn't get freed another + * time. */ + assert(d->bytes + length == new_end - new_start); + d->offset = new_start; + d->bytes = new_end - new_start; + goto found; + } + } + + d = g_malloc(sizeof(*d)); + *d = (Qcow2DiscardRegion) { + .bs = bs, + .offset = offset, + .bytes = length, + }; + QTAILQ_INSERT_TAIL(&s->discards, d, next); + +found: + /* Merge discard requests if they are adjacent now */ + QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) { + if (p == d + || p->offset > d->offset + d->bytes + || d->offset > p->offset + p->bytes) + { + continue; + } + + /* Still no overlap possible */ + assert(p->offset == d->offset + d->bytes + || d->offset == p->offset + p->bytes); + + QTAILQ_REMOVE(&s->discards, p, next); + d->offset = MIN(d->offset, p->offset); + d->bytes += p->bytes; + } +} + +/* XXX: cache several refcount block clusters ? */ +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, + int64_t offset, int64_t length, int addend, enum qcow2_discard_type type) +{ + BDRVQcowState *s = bs->opaque; + int64_t start, last, cluster_offset; + uint16_t *refcount_block = NULL; + int64_t old_table_index = -1; + int ret; + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n", + offset, length, addend); +#endif + if (length < 0) { + return -EINVAL; + } else if (length == 0) { + return 0; + } + + if (addend < 0) { + qcow2_cache_set_dependency(bs, s->refcount_block_cache, + s->l2_table_cache); + } + + start = offset & ~(s->cluster_size - 1); + last = (offset + length - 1) & ~(s->cluster_size - 1); + for(cluster_offset = start; cluster_offset <= last; + cluster_offset += s->cluster_size) + { + int block_index, refcount; + int64_t cluster_index = cluster_offset >> s->cluster_bits; + int64_t table_index = + cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); + + /* Load the refcount block and allocate it if needed */ + if (table_index != old_table_index) { + if (refcount_block) { + ret = qcow2_cache_put(bs, s->refcount_block_cache, + (void**) &refcount_block); + if (ret < 0) { + goto fail; + } + } + + ret = alloc_refcount_block(bs, cluster_index, &refcount_block); + if (ret < 0) { + goto fail; + } + } + old_table_index = table_index; + + qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block); + + /* we can update the count and save it */ + block_index = cluster_index & + ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); + + refcount = be16_to_cpu(refcount_block[block_index]); + refcount += addend; + if (refcount < 0 || refcount > 0xffff) { + ret = -EINVAL; + goto fail; + } + if (refcount == 0 && cluster_index < s->free_cluster_index) { + s->free_cluster_index = cluster_index; + } + refcount_block[block_index] = cpu_to_be16(refcount); + + if (refcount == 0 && s->discard_passthrough[type]) { + update_refcount_discard(bs, cluster_offset, s->cluster_size); + } + } + + ret = 0; +fail: + if (!s->cache_discards) { + qcow2_process_discards(bs, ret); + } + + /* Write last changed block to disk */ + if (refcount_block) { + int wret; + wret = qcow2_cache_put(bs, s->refcount_block_cache, + (void**) &refcount_block); + if (wret < 0) { + return ret < 0 ? ret : wret; + } + } + + /* + * Try do undo any updates if an error is returned (This may succeed in + * some cases like ENOSPC for allocating a new refcount block) + */ + if (ret < 0) { + int dummy; + dummy = update_refcount(bs, offset, cluster_offset - offset, -addend, + QCOW2_DISCARD_NEVER); + (void)dummy; + } + + return ret; +} + +/* + * Increases or decreases the refcount of a given cluster by one. + * addend must be 1 or -1. + * + * If the return value is non-negative, it is the new refcount of the cluster. + * If it is negative, it is -errno and indicates an error. + */ +static int update_cluster_refcount(BlockDriverState *bs, + int64_t cluster_index, + int addend, + enum qcow2_discard_type type) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend, + type); + if (ret < 0) { + return ret; + } + + return get_refcount(bs, cluster_index); +} + + + +/*********************************************************/ +/* cluster allocation functions */ + + + +/* return < 0 if error */ +static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size) +{ + BDRVQcowState *s = bs->opaque; + int i, nb_clusters, refcount; + + nb_clusters = size_to_clusters(s, size); +retry: + for(i = 0; i < nb_clusters; i++) { + int64_t next_cluster_index = s->free_cluster_index++; + refcount = get_refcount(bs, next_cluster_index); + + if (refcount < 0) { + return refcount; + } else if (refcount != 0) { + goto retry; + } + } +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", + size, + (s->free_cluster_index - nb_clusters) << s->cluster_bits); +#endif + return (s->free_cluster_index - nb_clusters) << s->cluster_bits; +} + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size) +{ + int64_t offset; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); + offset = alloc_clusters_noref(bs, size); + if (offset < 0) { + return offset; + } + + ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); + if (ret < 0) { + return ret; + } + + return offset; +} + +int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + uint64_t cluster_index; + uint64_t old_free_cluster_index; + int i, refcount, ret; + + /* Check how many clusters there are free */ + cluster_index = offset >> s->cluster_bits; + for(i = 0; i < nb_clusters; i++) { + refcount = get_refcount(bs, cluster_index++); + + if (refcount < 0) { + return refcount; + } else if (refcount != 0) { + break; + } + } + + /* And then allocate them */ + old_free_cluster_index = s->free_cluster_index; + s->free_cluster_index = cluster_index + i; + + ret = update_refcount(bs, offset, i << s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); + if (ret < 0) { + return ret; + } + + s->free_cluster_index = old_free_cluster_index; + + return i; +} + +/* only used to allocate compressed sectors. We try to allocate + contiguous sectors. size must be <= cluster_size */ +int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) +{ + BDRVQcowState *s = bs->opaque; + int64_t offset, cluster_offset; + int free_in_cluster; + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); + assert(size > 0 && size <= s->cluster_size); + if (s->free_byte_offset == 0) { + offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (offset < 0) { + return offset; + } + s->free_byte_offset = offset; + } + redo: + free_in_cluster = s->cluster_size - + (s->free_byte_offset & (s->cluster_size - 1)); + if (size <= free_in_cluster) { + /* enough space in current cluster */ + offset = s->free_byte_offset; + s->free_byte_offset += size; + free_in_cluster -= size; + if (free_in_cluster == 0) + s->free_byte_offset = 0; + if ((offset & (s->cluster_size - 1)) != 0) + update_cluster_refcount(bs, offset >> s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); + } else { + offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (offset < 0) { + return offset; + } + cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1); + if ((cluster_offset + s->cluster_size) == offset) { + /* we are lucky: contiguous data */ + offset = s->free_byte_offset; + update_cluster_refcount(bs, offset >> s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); + s->free_byte_offset += size; + } else { + s->free_byte_offset = offset; + goto redo; + } + } + + /* The cluster refcount was incremented, either by qcow2_alloc_clusters() + * or explicitly by update_cluster_refcount(). Refcount blocks must be + * flushed before the caller's L2 table updates. + */ + qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); + return offset; +} + +void qcow2_free_clusters(BlockDriverState *bs, + int64_t offset, int64_t size, + enum qcow2_discard_type type) +{ + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE); + ret = update_refcount(bs, offset, size, -1, type); + if (ret < 0) { + fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret)); + /* TODO Remember the clusters to free them later and avoid leaking */ + } +} + +/* + * Free a cluster using its L2 entry (handles clusters of all types, e.g. + * normal cluster, compressed cluster, etc.) + */ +void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, + int nb_clusters, enum qcow2_discard_type type) +{ + BDRVQcowState *s = bs->opaque; + + switch (qcow2_get_cluster_type(l2_entry)) { + case QCOW2_CLUSTER_COMPRESSED: + { + int nb_csectors; + nb_csectors = ((l2_entry >> s->csize_shift) & + s->csize_mask) + 1; + qcow2_free_clusters(bs, + (l2_entry & s->cluster_offset_mask) & ~511, + nb_csectors * 512, type); + } + break; + case QCOW2_CLUSTER_NORMAL: + qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, + nb_clusters << s->cluster_bits, type); + break; + case QCOW2_CLUSTER_UNALLOCATED: + case QCOW2_CLUSTER_ZERO: + break; + default: + abort(); + } +} + + + +/*********************************************************/ +/* snapshots and image creation */ + + + +/* update the refcounts of snapshots and the copied flag */ +int qcow2_update_snapshot_refcount(BlockDriverState *bs, + int64_t l1_table_offset, int l1_size, int addend) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated; + int64_t old_offset, old_l2_offset; + int i, j, l1_modified = 0, nb_csectors, refcount; + int ret; + + l2_table = NULL; + l1_table = NULL; + l1_size2 = l1_size * sizeof(uint64_t); + + s->cache_discards = true; + + /* WARNING: qcow2_snapshot_goto relies on this function not using the + * l1_table_offset when it is the current s->l1_table_offset! Be careful + * when changing this! */ + if (l1_table_offset != s->l1_table_offset) { + l1_table = g_malloc0(align_offset(l1_size2, 512)); + l1_allocated = 1; + + ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); + if (ret < 0) { + goto fail; + } + + for(i = 0;i < l1_size; i++) + be64_to_cpus(&l1_table[i]); + } else { + assert(l1_size == s->l1_size); + l1_table = s->l1_table; + l1_allocated = 0; + } + + for(i = 0; i < l1_size; i++) { + l2_offset = l1_table[i]; + if (l2_offset) { + old_l2_offset = l2_offset; + l2_offset &= L1E_OFFSET_MASK; + + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, + (void**) &l2_table); + if (ret < 0) { + goto fail; + } + + for(j = 0; j < s->l2_size; j++) { + offset = be64_to_cpu(l2_table[j]); + if (offset != 0) { + old_offset = offset; + offset &= ~QCOW_OFLAG_COPIED; + if (offset & QCOW_OFLAG_COMPRESSED) { + nb_csectors = ((offset >> s->csize_shift) & + s->csize_mask) + 1; + if (addend != 0) { + int ret; + ret = update_refcount(bs, + (offset & s->cluster_offset_mask) & ~511, + nb_csectors * 512, addend, + QCOW2_DISCARD_SNAPSHOT); + if (ret < 0) { + goto fail; + } + } + /* compressed clusters are never modified */ + refcount = 2; + } else { + uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; + if (addend != 0) { + refcount = update_cluster_refcount(bs, cluster_index, addend, + QCOW2_DISCARD_SNAPSHOT); + } else { + refcount = get_refcount(bs, cluster_index); + } + + if (refcount < 0) { + ret = refcount; + goto fail; + } + } + + if (refcount == 1) { + offset |= QCOW_OFLAG_COPIED; + } + if (offset != old_offset) { + if (addend > 0) { + qcow2_cache_set_dependency(bs, s->l2_table_cache, + s->refcount_block_cache); + } + l2_table[j] = cpu_to_be64(offset); + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + } + } + } + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + goto fail; + } + + + if (addend != 0) { + refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend, + QCOW2_DISCARD_SNAPSHOT); + } else { + refcount = get_refcount(bs, l2_offset >> s->cluster_bits); + } + if (refcount < 0) { + ret = refcount; + goto fail; + } else if (refcount == 1) { + l2_offset |= QCOW_OFLAG_COPIED; + } + if (l2_offset != old_l2_offset) { + l1_table[i] = l2_offset; + l1_modified = 1; + } + } + } + + ret = bdrv_flush(bs); +fail: + if (l2_table) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + } + + s->cache_discards = false; + qcow2_process_discards(bs, ret); + + /* Update L1 only if it isn't deleted anyway (addend = -1) */ + if (ret == 0 && addend >= 0 && l1_modified) { + for (i = 0; i < l1_size; i++) { + cpu_to_be64s(&l1_table[i]); + } + + ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2); + + for (i = 0; i < l1_size; i++) { + be64_to_cpus(&l1_table[i]); + } + } + if (l1_allocated) + g_free(l1_table); + return ret; +} + + + + +/*********************************************************/ +/* refcount checking functions */ + + + +/* + * Increases the refcount for a range of clusters in a given refcount table. + * This is used to construct a temporary refcount table out of L1 and L2 tables + * which can be compared the the refcount table saved in the image. + * + * Modifies the number of errors in res. + */ +static void inc_refcounts(BlockDriverState *bs, + BdrvCheckResult *res, + uint16_t *refcount_table, + int refcount_table_size, + int64_t offset, int64_t size) +{ + BDRVQcowState *s = bs->opaque; + int64_t start, last, cluster_offset; + int k; + + if (size <= 0) + return; + + start = offset & ~(s->cluster_size - 1); + last = (offset + size - 1) & ~(s->cluster_size - 1); + for(cluster_offset = start; cluster_offset <= last; + cluster_offset += s->cluster_size) { + k = cluster_offset >> s->cluster_bits; + if (k < 0) { + fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n", + cluster_offset); + res->corruptions++; + } else if (k >= refcount_table_size) { + fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after " + "the end of the image file, can't properly check refcounts.\n", + cluster_offset); + res->check_errors++; + } else { + if (++refcount_table[k] == 0) { + fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 + "\n", cluster_offset); + res->corruptions++; + } + } + } +} + +/* Flags for check_refcounts_l1() and check_refcounts_l2() */ +enum { + CHECK_OFLAG_COPIED = 0x1, /* check QCOW_OFLAG_COPIED matches refcount */ + CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ +}; + +/* + * Increases the refcount in the given refcount table for the all clusters + * referenced in the L2 table. While doing so, performs some checks on L2 + * entries. + * + * Returns the number of errors found by the checks or -errno if an internal + * error occurred. + */ +static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, + uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset, + int flags) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l2_table, l2_entry; + uint64_t next_contiguous_offset = 0; + int i, l2_size, nb_csectors, refcount; + + /* Read L2 table from disk */ + l2_size = s->l2_size * sizeof(uint64_t); + l2_table = g_malloc(l2_size); + + if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size) + goto fail; + + /* Do the actual checks */ + for(i = 0; i < s->l2_size; i++) { + l2_entry = be64_to_cpu(l2_table[i]); + + switch (qcow2_get_cluster_type(l2_entry)) { + case QCOW2_CLUSTER_COMPRESSED: + /* Compressed clusters don't have QCOW_OFLAG_COPIED */ + if (l2_entry & QCOW_OFLAG_COPIED) { + fprintf(stderr, "ERROR: cluster %" PRId64 ": " + "copied flag must never be set for compressed " + "clusters\n", l2_entry >> s->cluster_bits); + l2_entry &= ~QCOW_OFLAG_COPIED; + res->corruptions++; + } + + /* Mark cluster as used */ + nb_csectors = ((l2_entry >> s->csize_shift) & + s->csize_mask) + 1; + l2_entry &= s->cluster_offset_mask; + inc_refcounts(bs, res, refcount_table, refcount_table_size, + l2_entry & ~511, nb_csectors * 512); + + if (flags & CHECK_FRAG_INFO) { + res->bfi.allocated_clusters++; + res->bfi.compressed_clusters++; + + /* Compressed clusters are fragmented by nature. Since they + * take up sub-sector space but we only have sector granularity + * I/O we need to re-read the same sectors even for adjacent + * compressed clusters. + */ + res->bfi.fragmented_clusters++; + } + break; + + case QCOW2_CLUSTER_ZERO: + if ((l2_entry & L2E_OFFSET_MASK) == 0) { + break; + } + /* fall through */ + + case QCOW2_CLUSTER_NORMAL: + { + /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ + uint64_t offset = l2_entry & L2E_OFFSET_MASK; + + if (flags & CHECK_OFLAG_COPIED) { + refcount = get_refcount(bs, offset >> s->cluster_bits); + if (refcount < 0) { + fprintf(stderr, "Can't get refcount for offset %" + PRIx64 ": %s\n", l2_entry, strerror(-refcount)); + goto fail; + } + if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "ERROR OFLAG_COPIED: offset=%" + PRIx64 " refcount=%d\n", l2_entry, refcount); + res->corruptions++; + } + } + + if (flags & CHECK_FRAG_INFO) { + res->bfi.allocated_clusters++; + if (next_contiguous_offset && + offset != next_contiguous_offset) { + res->bfi.fragmented_clusters++; + } + next_contiguous_offset = offset + s->cluster_size; + } + + /* Mark cluster as used */ + inc_refcounts(bs, res, refcount_table,refcount_table_size, + offset, s->cluster_size); + + /* Correct offsets are cluster aligned */ + if (offset & (s->cluster_size - 1)) { + fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " + "properly aligned; L2 entry corrupted.\n", offset); + res->corruptions++; + } + break; + } + + case QCOW2_CLUSTER_UNALLOCATED: + break; + + default: + abort(); + } + } + + g_free(l2_table); + return 0; + +fail: + fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); + g_free(l2_table); + return -EIO; +} + +/* + * Increases the refcount for the L1 table, its L2 tables and all referenced + * clusters in the given refcount table. While doing so, performs some checks + * on L1 and L2 entries. + * + * Returns the number of errors found by the checks or -errno if an internal + * error occurred. + */ +static int check_refcounts_l1(BlockDriverState *bs, + BdrvCheckResult *res, + uint16_t *refcount_table, + int refcount_table_size, + int64_t l1_table_offset, int l1_size, + int flags) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l1_table, l2_offset, l1_size2; + int i, refcount, ret; + + l1_size2 = l1_size * sizeof(uint64_t); + + /* Mark L1 table as used */ + inc_refcounts(bs, res, refcount_table, refcount_table_size, + l1_table_offset, l1_size2); + + /* Read L1 table entries from disk */ + if (l1_size2 == 0) { + l1_table = NULL; + } else { + l1_table = g_malloc(l1_size2); + if (bdrv_pread(bs->file, l1_table_offset, + l1_table, l1_size2) != l1_size2) + goto fail; + for(i = 0;i < l1_size; i++) + be64_to_cpus(&l1_table[i]); + } + + /* Do the actual checks */ + for(i = 0; i < l1_size; i++) { + l2_offset = l1_table[i]; + if (l2_offset) { + /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ + if (flags & CHECK_OFLAG_COPIED) { + refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) + >> s->cluster_bits); + if (refcount < 0) { + fprintf(stderr, "Can't get refcount for l2_offset %" + PRIx64 ": %s\n", l2_offset, strerror(-refcount)); + goto fail; + } + if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64 + " refcount=%d\n", l2_offset, refcount); + res->corruptions++; + } + } + + /* Mark L2 table as used */ + l2_offset &= L1E_OFFSET_MASK; + inc_refcounts(bs, res, refcount_table, refcount_table_size, + l2_offset, s->cluster_size); + + /* L2 tables are cluster aligned */ + if (l2_offset & (s->cluster_size - 1)) { + fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " + "cluster aligned; L1 entry corrupted\n", l2_offset); + res->corruptions++; + } + + /* Process and check L2 entries */ + ret = check_refcounts_l2(bs, res, refcount_table, + refcount_table_size, l2_offset, flags); + if (ret < 0) { + goto fail; + } + } + } + g_free(l1_table); + return 0; + +fail: + fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); + res->check_errors++; + g_free(l1_table); + return -EIO; +} + +/* + * Checks an image for refcount consistency. + * + * Returns 0 if no errors are found, the number of errors in case the image is + * detected as corrupted, and -errno when an internal error occurred. + */ +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVQcowState *s = bs->opaque; + int64_t size, i, highest_cluster; + int nb_clusters, refcount1, refcount2; + QCowSnapshot *sn; + uint16_t *refcount_table; + int ret; + + size = bdrv_getlength(bs->file); + nb_clusters = size_to_clusters(s, size); + refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t)); + + res->bfi.total_clusters = + size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE); + + /* header */ + inc_refcounts(bs, res, refcount_table, nb_clusters, + 0, s->cluster_size); + + /* current L1 table */ + ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, + s->l1_table_offset, s->l1_size, + CHECK_OFLAG_COPIED | CHECK_FRAG_INFO); + if (ret < 0) { + goto fail; + } + + /* snapshots */ + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, + sn->l1_table_offset, sn->l1_size, 0); + if (ret < 0) { + goto fail; + } + } + inc_refcounts(bs, res, refcount_table, nb_clusters, + s->snapshots_offset, s->snapshots_size); + + /* refcount data */ + inc_refcounts(bs, res, refcount_table, nb_clusters, + s->refcount_table_offset, + s->refcount_table_size * sizeof(uint64_t)); + + for(i = 0; i < s->refcount_table_size; i++) { + uint64_t offset, cluster; + offset = s->refcount_table[i]; + cluster = offset >> s->cluster_bits; + + /* Refcount blocks are cluster aligned */ + if (offset & (s->cluster_size - 1)) { + fprintf(stderr, "ERROR refcount block %" PRId64 " is not " + "cluster aligned; refcount table entry corrupted\n", i); + res->corruptions++; + continue; + } + + if (cluster >= nb_clusters) { + fprintf(stderr, "ERROR refcount block %" PRId64 + " is outside image\n", i); + res->corruptions++; + continue; + } + + if (offset != 0) { + inc_refcounts(bs, res, refcount_table, nb_clusters, + offset, s->cluster_size); + if (refcount_table[cluster] != 1) { + fprintf(stderr, "ERROR refcount block %" PRId64 + " refcount=%d\n", + i, refcount_table[cluster]); + res->corruptions++; + } + } + } + + /* compare ref counts */ + for (i = 0, highest_cluster = 0; i < nb_clusters; i++) { + refcount1 = get_refcount(bs, i); + if (refcount1 < 0) { + fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", + i, strerror(-refcount1)); + res->check_errors++; + continue; + } + + refcount2 = refcount_table[i]; + + if (refcount1 > 0 || refcount2 > 0) { + highest_cluster = i; + } + + if (refcount1 != refcount2) { + + /* Check if we're allowed to fix the mismatch */ + int *num_fixed = NULL; + if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) { + num_fixed = &res->leaks_fixed; + } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) { + num_fixed = &res->corruptions_fixed; + } + + fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n", + num_fixed != NULL ? "Repairing" : + refcount1 < refcount2 ? "ERROR" : + "Leaked", + i, refcount1, refcount2); + + if (num_fixed) { + ret = update_refcount(bs, i << s->cluster_bits, 1, + refcount2 - refcount1, + QCOW2_DISCARD_ALWAYS); + if (ret >= 0) { + (*num_fixed)++; + continue; + } + } + + /* And if we couldn't, print an error */ + if (refcount1 < refcount2) { + res->corruptions++; + } else { + res->leaks++; + } + } + } + + res->image_end_offset = (highest_cluster + 1) * s->cluster_size; + ret = 0; + +fail: + g_free(refcount_table); + + return ret; +} + diff --git a/contrib/qemu/block/qcow2-snapshot.c b/contrib/qemu/block/qcow2-snapshot.c new file mode 100644 index 000000000..0caac9055 --- /dev/null +++ b/contrib/qemu/block/qcow2-snapshot.c @@ -0,0 +1,660 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" + +typedef struct QEMU_PACKED QCowSnapshotHeader { + /* header is 8 byte aligned */ + uint64_t l1_table_offset; + + uint32_t l1_size; + uint16_t id_str_size; + uint16_t name_size; + + uint32_t date_sec; + uint32_t date_nsec; + + uint64_t vm_clock_nsec; + + uint32_t vm_state_size; + uint32_t extra_data_size; /* for extension */ + /* extra data follows */ + /* id_str follows */ + /* name follows */ +} QCowSnapshotHeader; + +typedef struct QEMU_PACKED QCowSnapshotExtraData { + uint64_t vm_state_size_large; + uint64_t disk_size; +} QCowSnapshotExtraData; + +void qcow2_free_snapshots(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int i; + + for(i = 0; i < s->nb_snapshots; i++) { + g_free(s->snapshots[i].name); + g_free(s->snapshots[i].id_str); + } + g_free(s->snapshots); + s->snapshots = NULL; + s->nb_snapshots = 0; +} + +int qcow2_read_snapshots(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshotHeader h; + QCowSnapshotExtraData extra; + QCowSnapshot *sn; + int i, id_str_size, name_size; + int64_t offset; + uint32_t extra_data_size; + int ret; + + if (!s->nb_snapshots) { + s->snapshots = NULL; + s->snapshots_size = 0; + return 0; + } + + offset = s->snapshots_offset; + s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot)); + + for(i = 0; i < s->nb_snapshots; i++) { + /* Read statically sized part of the snapshot header */ + offset = align_offset(offset, 8); + ret = bdrv_pread(bs->file, offset, &h, sizeof(h)); + if (ret < 0) { + goto fail; + } + + offset += sizeof(h); + sn = s->snapshots + i; + sn->l1_table_offset = be64_to_cpu(h.l1_table_offset); + sn->l1_size = be32_to_cpu(h.l1_size); + sn->vm_state_size = be32_to_cpu(h.vm_state_size); + sn->date_sec = be32_to_cpu(h.date_sec); + sn->date_nsec = be32_to_cpu(h.date_nsec); + sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec); + extra_data_size = be32_to_cpu(h.extra_data_size); + + id_str_size = be16_to_cpu(h.id_str_size); + name_size = be16_to_cpu(h.name_size); + + /* Read extra data */ + ret = bdrv_pread(bs->file, offset, &extra, + MIN(sizeof(extra), extra_data_size)); + if (ret < 0) { + goto fail; + } + offset += extra_data_size; + + if (extra_data_size >= 8) { + sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large); + } + + if (extra_data_size >= 16) { + sn->disk_size = be64_to_cpu(extra.disk_size); + } else { + sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; + } + + /* Read snapshot ID */ + sn->id_str = g_malloc(id_str_size + 1); + ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size); + if (ret < 0) { + goto fail; + } + offset += id_str_size; + sn->id_str[id_str_size] = '\0'; + + /* Read snapshot name */ + sn->name = g_malloc(name_size + 1); + ret = bdrv_pread(bs->file, offset, sn->name, name_size); + if (ret < 0) { + goto fail; + } + offset += name_size; + sn->name[name_size] = '\0'; + } + + s->snapshots_size = offset - s->snapshots_offset; + return 0; + +fail: + qcow2_free_snapshots(bs); + return ret; +} + +/* add at the end of the file a new list of snapshots */ +static int qcow2_write_snapshots(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + QCowSnapshotHeader h; + QCowSnapshotExtraData extra; + int i, name_size, id_str_size, snapshots_size; + struct { + uint32_t nb_snapshots; + uint64_t snapshots_offset; + } QEMU_PACKED header_data; + int64_t offset, snapshots_offset; + int ret; + + /* compute the size of the snapshots */ + offset = 0; + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + offset = align_offset(offset, 8); + offset += sizeof(h); + offset += sizeof(extra); + offset += strlen(sn->id_str); + offset += strlen(sn->name); + } + snapshots_size = offset; + + /* Allocate space for the new snapshot list */ + snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); + offset = snapshots_offset; + if (offset < 0) { + return offset; + } + ret = bdrv_flush(bs); + if (ret < 0) { + return ret; + } + + /* Write all snapshots to the new list */ + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + memset(&h, 0, sizeof(h)); + h.l1_table_offset = cpu_to_be64(sn->l1_table_offset); + h.l1_size = cpu_to_be32(sn->l1_size); + /* If it doesn't fit in 32 bit, older implementations should treat it + * as a disk-only snapshot rather than truncate the VM state */ + if (sn->vm_state_size <= 0xffffffff) { + h.vm_state_size = cpu_to_be32(sn->vm_state_size); + } + h.date_sec = cpu_to_be32(sn->date_sec); + h.date_nsec = cpu_to_be32(sn->date_nsec); + h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec); + h.extra_data_size = cpu_to_be32(sizeof(extra)); + + memset(&extra, 0, sizeof(extra)); + extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size); + extra.disk_size = cpu_to_be64(sn->disk_size); + + id_str_size = strlen(sn->id_str); + name_size = strlen(sn->name); + h.id_str_size = cpu_to_be16(id_str_size); + h.name_size = cpu_to_be16(name_size); + offset = align_offset(offset, 8); + + ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h)); + if (ret < 0) { + goto fail; + } + offset += sizeof(h); + + ret = bdrv_pwrite(bs->file, offset, &extra, sizeof(extra)); + if (ret < 0) { + goto fail; + } + offset += sizeof(extra); + + ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size); + if (ret < 0) { + goto fail; + } + offset += id_str_size; + + ret = bdrv_pwrite(bs->file, offset, sn->name, name_size); + if (ret < 0) { + goto fail; + } + offset += name_size; + } + + /* + * Update the header to point to the new snapshot table. This requires the + * new table and its refcounts to be stable on disk. + */ + ret = bdrv_flush(bs); + if (ret < 0) { + goto fail; + } + + QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) != + offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots)); + + header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots); + header_data.snapshots_offset = cpu_to_be64(snapshots_offset); + + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), + &header_data, sizeof(header_data)); + if (ret < 0) { + goto fail; + } + + /* free the old snapshot table */ + qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size, + QCOW2_DISCARD_SNAPSHOT); + s->snapshots_offset = snapshots_offset; + s->snapshots_size = snapshots_size; + return 0; + +fail: + return ret; +} + +static void find_new_snapshot_id(BlockDriverState *bs, + char *id_str, int id_str_size) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + int i, id, id_max = 0; + + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + id = strtoul(sn->id_str, NULL, 10); + if (id > id_max) + id_max = id; + } + snprintf(id_str, id_str_size, "%d", id_max + 1); +} + +static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str) +{ + BDRVQcowState *s = bs->opaque; + int i; + + for(i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].id_str, id_str)) + return i; + } + return -1; +} + +static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name) +{ + BDRVQcowState *s = bs->opaque; + int i, ret; + + ret = find_snapshot_by_id(bs, name); + if (ret >= 0) + return ret; + for(i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].name, name)) + return i; + } + return -1; +} + +/* if no id is provided, a new one is constructed */ +int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *new_snapshot_list = NULL; + QCowSnapshot *old_snapshot_list = NULL; + QCowSnapshot sn1, *sn = &sn1; + int i, ret; + uint64_t *l1_table = NULL; + int64_t l1_table_offset; + + memset(sn, 0, sizeof(*sn)); + + /* Generate an ID if it wasn't passed */ + if (sn_info->id_str[0] == '\0') { + find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); + } + + /* Check that the ID is unique */ + if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) { + return -EEXIST; + } + + /* Populate sn with passed data */ + sn->id_str = g_strdup(sn_info->id_str); + sn->name = g_strdup(sn_info->name); + + sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; + sn->vm_state_size = sn_info->vm_state_size; + sn->date_sec = sn_info->date_sec; + sn->date_nsec = sn_info->date_nsec; + sn->vm_clock_nsec = sn_info->vm_clock_nsec; + + /* Allocate the L1 table of the snapshot and copy the current one there. */ + l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); + if (l1_table_offset < 0) { + ret = l1_table_offset; + goto fail; + } + + sn->l1_table_offset = l1_table_offset; + sn->l1_size = s->l1_size; + + l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); + for(i = 0; i < s->l1_size; i++) { + l1_table[i] = cpu_to_be64(s->l1_table[i]); + } + + ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + + g_free(l1_table); + l1_table = NULL; + + /* + * Increase the refcounts of all clusters and make sure everything is + * stable on disk before updating the snapshot table to contain a pointer + * to the new L1 table. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); + if (ret < 0) { + goto fail; + } + + /* Append the new snapshot to the snapshot list */ + new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot)); + if (s->snapshots) { + memcpy(new_snapshot_list, s->snapshots, + s->nb_snapshots * sizeof(QCowSnapshot)); + old_snapshot_list = s->snapshots; + } + s->snapshots = new_snapshot_list; + s->snapshots[s->nb_snapshots++] = *sn; + + ret = qcow2_write_snapshots(bs); + if (ret < 0) { + g_free(s->snapshots); + s->snapshots = old_snapshot_list; + goto fail; + } + + g_free(old_snapshot_list); + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return 0; + +fail: + g_free(sn->id_str); + g_free(sn->name); + g_free(l1_table); + + return ret; +} + +/* copy the snapshot 'snapshot_name' into the current disk image */ +int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + int i, snapshot_index; + int cur_l1_bytes, sn_l1_bytes; + int ret; + uint64_t *sn_l1_table = NULL; + + /* Search the snapshot */ + snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); + if (snapshot_index < 0) { + return -ENOENT; + } + sn = &s->snapshots[snapshot_index]; + + if (sn->disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) { + error_report("qcow2: Loading snapshots with different disk " + "size is not implemented"); + ret = -ENOTSUP; + goto fail; + } + + /* + * Make sure that the current L1 table is big enough to contain the whole + * L1 table of the snapshot. If the snapshot L1 table is smaller, the + * current one must be padded with zeros. + */ + ret = qcow2_grow_l1_table(bs, sn->l1_size, true); + if (ret < 0) { + goto fail; + } + + cur_l1_bytes = s->l1_size * sizeof(uint64_t); + sn_l1_bytes = sn->l1_size * sizeof(uint64_t); + + /* + * Copy the snapshot L1 table to the current L1 table. + * + * Before overwriting the old current L1 table on disk, make sure to + * increase all refcounts for the clusters referenced by the new one. + * Decrease the refcount referenced by the old one only when the L1 + * table is overwritten. + */ + sn_l1_table = g_malloc0(cur_l1_bytes); + + ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes); + if (ret < 0) { + goto fail; + } + + ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, + sn->l1_size, 1); + if (ret < 0) { + goto fail; + } + + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, + cur_l1_bytes); + if (ret < 0) { + goto fail; + } + + /* + * Decrease refcount of clusters of current L1 table. + * + * At this point, the in-memory s->l1_table points to the old L1 table, + * whereas on disk we already have the new one. + * + * qcow2_update_snapshot_refcount special cases the current L1 table to use + * the in-memory data instead of really using the offset to load a new one, + * which is why this works. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, + s->l1_size, -1); + + /* + * Now update the in-memory L1 table to be in sync with the on-disk one. We + * need to do this even if updating refcounts failed. + */ + for(i = 0;i < s->l1_size; i++) { + s->l1_table[i] = be64_to_cpu(sn_l1_table[i]); + } + + if (ret < 0) { + goto fail; + } + + g_free(sn_l1_table); + sn_l1_table = NULL; + + /* + * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed + * when we decreased the refcount of the old snapshot. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); + if (ret < 0) { + goto fail; + } + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return 0; + +fail: + g_free(sn_l1_table); + return ret; +} + +int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot sn; + int snapshot_index, ret; + + /* Search the snapshot */ + snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); + if (snapshot_index < 0) { + return -ENOENT; + } + sn = s->snapshots[snapshot_index]; + + /* Remove it from the snapshot list */ + memmove(s->snapshots + snapshot_index, + s->snapshots + snapshot_index + 1, + (s->nb_snapshots - snapshot_index - 1) * sizeof(sn)); + s->nb_snapshots--; + ret = qcow2_write_snapshots(bs); + if (ret < 0) { + return ret; + } + + /* + * The snapshot is now unused, clean up. If we fail after this point, we + * won't recover but just leak clusters. + */ + g_free(sn.id_str); + g_free(sn.name); + + /* + * Now decrease the refcounts of clusters referenced by the snapshot and + * free the L1 table. + */ + ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset, + sn.l1_size, -1); + if (ret < 0) { + return ret; + } + qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t), + QCOW2_DISCARD_SNAPSHOT); + + /* must update the copied flag on the current cluster offsets */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); + if (ret < 0) { + return ret; + } + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return 0; +} + +int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) +{ + BDRVQcowState *s = bs->opaque; + QEMUSnapshotInfo *sn_tab, *sn_info; + QCowSnapshot *sn; + int i; + + if (!s->nb_snapshots) { + *psn_tab = NULL; + return s->nb_snapshots; + } + + sn_tab = g_malloc0(s->nb_snapshots * sizeof(QEMUSnapshotInfo)); + for(i = 0; i < s->nb_snapshots; i++) { + sn_info = sn_tab + i; + sn = s->snapshots + i; + pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), + sn->id_str); + pstrcpy(sn_info->name, sizeof(sn_info->name), + sn->name); + sn_info->vm_state_size = sn->vm_state_size; + sn_info->date_sec = sn->date_sec; + sn_info->date_nsec = sn->date_nsec; + sn_info->vm_clock_nsec = sn->vm_clock_nsec; + } + *psn_tab = sn_tab; + return s->nb_snapshots; +} + +int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name) +{ + int i, snapshot_index; + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + uint64_t *new_l1_table; + int new_l1_bytes; + int ret; + + assert(bs->read_only); + + /* Search the snapshot */ + snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name); + if (snapshot_index < 0) { + return -ENOENT; + } + sn = &s->snapshots[snapshot_index]; + + /* Allocate and read in the snapshot's L1 table */ + new_l1_bytes = s->l1_size * sizeof(uint64_t); + new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512)); + + ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); + if (ret < 0) { + g_free(new_l1_table); + return ret; + } + + /* Switch the L1 table */ + g_free(s->l1_table); + + s->l1_size = sn->l1_size; + s->l1_table_offset = sn->l1_table_offset; + s->l1_table = new_l1_table; + + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + + return 0; +} diff --git a/contrib/qemu/block/qcow2.c b/contrib/qemu/block/qcow2.c new file mode 100644 index 000000000..0eceefe2c --- /dev/null +++ b/contrib/qemu/block/qcow2.c @@ -0,0 +1,1825 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include <zlib.h> +#include "qemu/aes.h" +#include "block/qcow2.h" +#include "qemu/error-report.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qmp/qbool.h" +#include "trace.h" + +/* + Differences with QCOW: + + - Support for multiple incremental snapshots. + - Memory management by reference counts. + - Clusters which have a reference count of one have the bit + QCOW_OFLAG_COPIED to optimize write performance. + - Size of compressed clusters is stored in sectors to reduce bit usage + in the cluster offsets. + - Support for storing additional data (such as the VM state) in the + snapshots. + - If a backing store is used, the cluster size is not constrained + (could be backported to QCOW). + - L2 tables have always a size of one cluster. +*/ + + +typedef struct { + uint32_t magic; + uint32_t len; +} QCowExtension; + +#define QCOW2_EXT_MAGIC_END 0 +#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA +#define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857 + +static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const QCowHeader *cow_header = (const void *)buf; + + if (buf_size >= sizeof(QCowHeader) && + be32_to_cpu(cow_header->magic) == QCOW_MAGIC && + be32_to_cpu(cow_header->version) >= 2) + return 100; + else + return 0; +} + + +/* + * read qcow2 extension and fill bs + * start reading from start_offset + * finish reading upon magic of value 0 or when end_offset reached + * unknown magic is skipped (future extension this version knows nothing about) + * return 0 upon success, non-0 otherwise + */ +static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, + uint64_t end_offset, void **p_feature_table) +{ + BDRVQcowState *s = bs->opaque; + QCowExtension ext; + uint64_t offset; + int ret; + +#ifdef DEBUG_EXT + printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); +#endif + offset = start_offset; + while (offset < end_offset) { + +#ifdef DEBUG_EXT + /* Sanity check */ + if (offset > s->cluster_size) + printf("qcow2_read_extension: suspicious offset %lu\n", offset); + + printf("attempting to read extended header in offset %lu\n", offset); +#endif + + if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) { + fprintf(stderr, "qcow2_read_extension: ERROR: " + "pread fail from offset %" PRIu64 "\n", + offset); + return 1; + } + be32_to_cpus(&ext.magic); + be32_to_cpus(&ext.len); + offset += sizeof(ext); +#ifdef DEBUG_EXT + printf("ext.magic = 0x%x\n", ext.magic); +#endif + if (ext.len > end_offset - offset) { + error_report("Header extension too large"); + return -EINVAL; + } + + switch (ext.magic) { + case QCOW2_EXT_MAGIC_END: + return 0; + + case QCOW2_EXT_MAGIC_BACKING_FORMAT: + if (ext.len >= sizeof(bs->backing_format)) { + fprintf(stderr, "ERROR: ext_backing_format: len=%u too large" + " (>=%zu)\n", + ext.len, sizeof(bs->backing_format)); + return 2; + } + if (bdrv_pread(bs->file, offset , bs->backing_format, + ext.len) != ext.len) + return 3; + bs->backing_format[ext.len] = '\0'; +#ifdef DEBUG_EXT + printf("Qcow2: Got format extension %s\n", bs->backing_format); +#endif + break; + + case QCOW2_EXT_MAGIC_FEATURE_TABLE: + if (p_feature_table != NULL) { + void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); + ret = bdrv_pread(bs->file, offset , feature_table, ext.len); + if (ret < 0) { + return ret; + } + + *p_feature_table = feature_table; + } + break; + + default: + /* unknown magic - save it in case we need to rewrite the header */ + { + Qcow2UnknownHeaderExtension *uext; + + uext = g_malloc0(sizeof(*uext) + ext.len); + uext->magic = ext.magic; + uext->len = ext.len; + QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); + + ret = bdrv_pread(bs->file, offset , uext->data, uext->len); + if (ret < 0) { + return ret; + } + } + break; + } + + offset += ((ext.len + 7) & ~7); + } + + return 0; +} + +static void cleanup_unknown_header_ext(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + Qcow2UnknownHeaderExtension *uext, *next; + + QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) { + QLIST_REMOVE(uext, next); + g_free(uext); + } +} + +static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs, + const char *fmt, ...) +{ + char msg[64]; + va_list ap; + + va_start(ap, fmt); + vsnprintf(msg, sizeof(msg), fmt, ap); + va_end(ap); + + qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "qcow2", msg); +} + +static void report_unsupported_feature(BlockDriverState *bs, + Qcow2Feature *table, uint64_t mask) +{ + while (table && table->name[0] != '\0') { + if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) { + if (mask & (1 << table->bit)) { + report_unsupported(bs, "%.46s",table->name); + mask &= ~(1 << table->bit); + } + } + table++; + } + + if (mask) { + report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask); + } +} + +/* + * Sets the dirty bit and flushes afterwards if necessary. + * + * The incompatible_features bit is only set if the image file header was + * updated successfully. Therefore it is not required to check the return + * value of this function. + */ +int qcow2_mark_dirty(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + uint64_t val; + int ret; + + assert(s->qcow_version >= 3); + + if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + return 0; /* already dirty */ + } + + val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); + ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features), + &val, sizeof(val)); + if (ret < 0) { + return ret; + } + ret = bdrv_flush(bs->file); + if (ret < 0) { + return ret; + } + + /* Only treat image as dirty if the header was updated successfully */ + s->incompatible_features |= QCOW2_INCOMPAT_DIRTY; + return 0; +} + +/* + * Clears the dirty bit and flushes before if necessary. Only call this + * function when there are no pending requests, it does not guard against + * concurrent requests dirtying the image. + */ +static int qcow2_mark_clean(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + int ret = bdrv_flush(bs); + if (ret < 0) { + return ret; + } + + s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; + return qcow2_update_header(bs); + } + return 0; +} + +static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + int ret = qcow2_check_refcounts(bs, result, fix); + if (ret < 0) { + return ret; + } + + if (fix && result->check_errors == 0 && result->corruptions == 0) { + return qcow2_mark_clean(bs); + } + return ret; +} + +static QemuOptsList qcow2_runtime_opts = { + .name = "qcow2", + .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), + .desc = { + { + .name = "lazy_refcounts", + .type = QEMU_OPT_BOOL, + .help = "Postpone refcount updates", + }, + { + .name = QCOW2_OPT_DISCARD_REQUEST, + .type = QEMU_OPT_BOOL, + .help = "Pass guest discard requests to the layer below", + }, + { + .name = QCOW2_OPT_DISCARD_SNAPSHOT, + .type = QEMU_OPT_BOOL, + .help = "Generate discard requests when snapshot related space " + "is freed", + }, + { + .name = QCOW2_OPT_DISCARD_OTHER, + .type = QEMU_OPT_BOOL, + .help = "Generate discard requests when other clusters are freed", + }, + { /* end of list */ } + }, +}; + +static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) +{ + BDRVQcowState *s = bs->opaque; + int len, i, ret = 0; + QCowHeader header; + QemuOpts *opts; + Error *local_err = NULL; + uint64_t ext_end; + uint64_t l1_vm_state_index; + + ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); + if (ret < 0) { + goto fail; + } + be32_to_cpus(&header.magic); + be32_to_cpus(&header.version); + be64_to_cpus(&header.backing_file_offset); + be32_to_cpus(&header.backing_file_size); + be64_to_cpus(&header.size); + be32_to_cpus(&header.cluster_bits); + be32_to_cpus(&header.crypt_method); + be64_to_cpus(&header.l1_table_offset); + be32_to_cpus(&header.l1_size); + be64_to_cpus(&header.refcount_table_offset); + be32_to_cpus(&header.refcount_table_clusters); + be64_to_cpus(&header.snapshots_offset); + be32_to_cpus(&header.nb_snapshots); + + if (header.magic != QCOW_MAGIC) { + ret = -EMEDIUMTYPE; + goto fail; + } + if (header.version < 2 || header.version > 3) { + report_unsupported(bs, "QCOW version %d", header.version); + ret = -ENOTSUP; + goto fail; + } + + s->qcow_version = header.version; + + /* Initialise version 3 header fields */ + if (header.version == 2) { + header.incompatible_features = 0; + header.compatible_features = 0; + header.autoclear_features = 0; + header.refcount_order = 4; + header.header_length = 72; + } else { + be64_to_cpus(&header.incompatible_features); + be64_to_cpus(&header.compatible_features); + be64_to_cpus(&header.autoclear_features); + be32_to_cpus(&header.refcount_order); + be32_to_cpus(&header.header_length); + } + + if (header.header_length > sizeof(header)) { + s->unknown_header_fields_size = header.header_length - sizeof(header); + s->unknown_header_fields = g_malloc(s->unknown_header_fields_size); + ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, + s->unknown_header_fields_size); + if (ret < 0) { + goto fail; + } + } + + if (header.backing_file_offset) { + ext_end = header.backing_file_offset; + } else { + ext_end = 1 << header.cluster_bits; + } + + /* Handle feature bits */ + s->incompatible_features = header.incompatible_features; + s->compatible_features = header.compatible_features; + s->autoclear_features = header.autoclear_features; + + if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { + void *feature_table = NULL; + qcow2_read_extensions(bs, header.header_length, ext_end, + &feature_table); + report_unsupported_feature(bs, feature_table, + s->incompatible_features & + ~QCOW2_INCOMPAT_MASK); + ret = -ENOTSUP; + goto fail; + } + + /* Check support for various header values */ + if (header.refcount_order != 4) { + report_unsupported(bs, "%d bit reference counts", + 1 << header.refcount_order); + ret = -ENOTSUP; + goto fail; + } + + if (header.cluster_bits < MIN_CLUSTER_BITS || + header.cluster_bits > MAX_CLUSTER_BITS) { + ret = -EINVAL; + goto fail; + } + if (header.crypt_method > QCOW_CRYPT_AES) { + ret = -EINVAL; + goto fail; + } + s->crypt_method_header = header.crypt_method; + if (s->crypt_method_header) { + bs->encrypted = 1; + } + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ + s->l2_size = 1 << s->l2_bits; + bs->total_sectors = header.size / 512; + s->csize_shift = (62 - (s->cluster_bits - 8)); + s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; + s->cluster_offset_mask = (1LL << s->csize_shift) - 1; + s->refcount_table_offset = header.refcount_table_offset; + s->refcount_table_size = + header.refcount_table_clusters << (s->cluster_bits - 3); + + s->snapshots_offset = header.snapshots_offset; + s->nb_snapshots = header.nb_snapshots; + + /* read the level 1 table */ + s->l1_size = header.l1_size; + + l1_vm_state_index = size_to_l1(s, header.size); + if (l1_vm_state_index > INT_MAX) { + ret = -EFBIG; + goto fail; + } + s->l1_vm_state_index = l1_vm_state_index; + + /* the L1 table must contain at least enough entries to put + header.size bytes */ + if (s->l1_size < s->l1_vm_state_index) { + ret = -EINVAL; + goto fail; + } + s->l1_table_offset = header.l1_table_offset; + if (s->l1_size > 0) { + s->l1_table = g_malloc0( + align_offset(s->l1_size * sizeof(uint64_t), 512)); + ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + } + + /* alloc L2 table/refcount block cache */ + s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE); + s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE); + + s->cluster_cache = g_malloc(s->cluster_size); + /* one more sector for decompressed data alignment */ + s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + + 512); + s->cluster_cache_offset = -1; + s->flags = flags; + + ret = qcow2_refcount_init(bs); + if (ret != 0) { + goto fail; + } + + QLIST_INIT(&s->cluster_allocs); + QTAILQ_INIT(&s->discards); + + /* read qcow2 extensions */ + if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) { + ret = -EINVAL; + goto fail; + } + + /* read the backing file name */ + if (header.backing_file_offset != 0) { + len = header.backing_file_size; + if (len > 1023) { + len = 1023; + } + ret = bdrv_pread(bs->file, header.backing_file_offset, + bs->backing_file, len); + if (ret < 0) { + goto fail; + } + bs->backing_file[len] = '\0'; + } + + ret = qcow2_read_snapshots(bs); + if (ret < 0) { + goto fail; + } + + /* Clear unknown autoclear feature bits */ + if (!bs->read_only && s->autoclear_features != 0) { + s->autoclear_features = 0; + ret = qcow2_update_header(bs); + if (ret < 0) { + goto fail; + } + } + + /* Initialise locks */ + qemu_co_mutex_init(&s->lock); + + /* Repair image if dirty */ + if (!(flags & BDRV_O_CHECK) && !bs->read_only && + (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { + BdrvCheckResult result = {0}; + + ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS); + if (ret < 0) { + goto fail; + } + } + + /* Enable lazy_refcounts according to image and command line options */ + opts = qemu_opts_create_nofail(&qcow2_runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + ret = -EINVAL; + goto fail; + } + + s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS, + (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)); + + s->discard_passthrough[QCOW2_DISCARD_NEVER] = false; + s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true; + s->discard_passthrough[QCOW2_DISCARD_REQUEST] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST, + flags & BDRV_O_UNMAP); + s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true); + s->discard_passthrough[QCOW2_DISCARD_OTHER] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); + + qemu_opts_del(opts); + + if (s->use_lazy_refcounts && s->qcow_version < 3) { + qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require " + "a qcow2 image with at least qemu 1.1 compatibility level"); + ret = -EINVAL; + goto fail; + } + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return ret; + + fail: + g_free(s->unknown_header_fields); + cleanup_unknown_header_ext(bs); + qcow2_free_snapshots(bs); + qcow2_refcount_close(bs); + g_free(s->l1_table); + if (s->l2_table_cache) { + qcow2_cache_destroy(bs, s->l2_table_cache); + } + g_free(s->cluster_cache); + qemu_vfree(s->cluster_data); + return ret; +} + +static int qcow2_set_key(BlockDriverState *bs, const char *key) +{ + BDRVQcowState *s = bs->opaque; + uint8_t keybuf[16]; + int len, i; + + memset(keybuf, 0, 16); + len = strlen(key); + if (len > 16) + len = 16; + /* XXX: we could compress the chars to 7 bits to increase + entropy */ + for(i = 0;i < len;i++) { + keybuf[i] = key[i]; + } + s->crypt_method = s->crypt_method_header; + + if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) + return -1; + if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + return -1; +#if 0 + /* test */ + { + uint8_t in[16]; + uint8_t out[16]; + uint8_t tmp[16]; + for(i=0;i<16;i++) + in[i] = i; + AES_encrypt(in, tmp, &s->aes_encrypt_key); + AES_decrypt(tmp, out, &s->aes_decrypt_key); + for(i = 0; i < 16; i++) + printf(" %02x", tmp[i]); + printf("\n"); + for(i = 0; i < 16; i++) + printf(" %02x", out[i]); + printf("\n"); + } +#endif + return 0; +} + +/* We have nothing to do for QCOW2 reopen, stubs just return + * success */ +static int qcow2_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + +static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVQcowState *s = bs->opaque; + uint64_t cluster_offset; + int ret; + + *pnum = nb_sectors; + /* FIXME We can get errors here, but the bdrv_co_is_allocated interface + * can't pass them on today */ + qemu_co_mutex_lock(&s->lock); + ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); + qemu_co_mutex_unlock(&s->lock); + if (ret < 0) { + *pnum = 0; + } + + return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO); +} + +/* handle reading after the end of the backing file */ +int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t sector_num, int nb_sectors) +{ + int n1; + if ((sector_num + nb_sectors) <= bs->total_sectors) + return nb_sectors; + if (sector_num >= bs->total_sectors) + n1 = 0; + else + n1 = bs->total_sectors - sector_num; + + qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1)); + + return n1; +} + +static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, + int remaining_sectors, QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster, n1; + int ret; + int cur_nr_sectors; /* number of sectors in current iteration */ + uint64_t cluster_offset = 0; + uint64_t bytes_done = 0; + QEMUIOVector hd_qiov; + uint8_t *cluster_data = NULL; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + qemu_co_mutex_lock(&s->lock); + + while (remaining_sectors != 0) { + + /* prepare next request */ + cur_nr_sectors = remaining_sectors; + if (s->crypt_method) { + cur_nr_sectors = MIN(cur_nr_sectors, + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + } + + ret = qcow2_get_cluster_offset(bs, sector_num << 9, + &cur_nr_sectors, &cluster_offset); + if (ret < 0) { + goto fail; + } + + index_in_cluster = sector_num & (s->cluster_sectors - 1); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + cur_nr_sectors * 512); + + switch (ret) { + case QCOW2_CLUSTER_UNALLOCATED: + + if (bs->backing_hd) { + /* read from the base image */ + n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov, + sector_num, cur_nr_sectors); + if (n1 > 0) { + BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->backing_hd, sector_num, + n1, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + } + } else { + /* Note: in this case, no need to wait */ + qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); + } + break; + + case QCOW2_CLUSTER_ZERO: + qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); + break; + + case QCOW2_CLUSTER_COMPRESSED: + /* add AIO support for compressed blocks ? */ + ret = qcow2_decompress_cluster(bs, cluster_offset); + if (ret < 0) { + goto fail; + } + + qemu_iovec_from_buf(&hd_qiov, 0, + s->cluster_cache + index_in_cluster * 512, + 512 * cur_nr_sectors); + break; + + case QCOW2_CLUSTER_NORMAL: + if ((cluster_offset & 511) != 0) { + ret = -EIO; + goto fail; + } + + if (s->crypt_method) { + /* + * For encrypted images, read everything into a temporary + * contiguous buffer on which the AES functions can work. + */ + if (!cluster_data) { + cluster_data = + qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); + } + + assert(cur_nr_sectors <= + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + qemu_iovec_reset(&hd_qiov); + qemu_iovec_add(&hd_qiov, cluster_data, + 512 * cur_nr_sectors); + } + + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file, + (cluster_offset >> 9) + index_in_cluster, + cur_nr_sectors, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + if (s->crypt_method) { + qcow2_encrypt_sectors(s, sector_num, cluster_data, + cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key); + qemu_iovec_from_buf(qiov, bytes_done, + cluster_data, 512 * cur_nr_sectors); + } + break; + + default: + g_assert_not_reached(); + ret = -EIO; + goto fail; + } + + remaining_sectors -= cur_nr_sectors; + sector_num += cur_nr_sectors; + bytes_done += cur_nr_sectors * 512; + } + ret = 0; + +fail: + qemu_co_mutex_unlock(&s->lock); + + qemu_iovec_destroy(&hd_qiov); + qemu_vfree(cluster_data); + + return ret; +} + +static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, + int64_t sector_num, + int remaining_sectors, + QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + int n_end; + int ret; + int cur_nr_sectors; /* number of sectors in current iteration */ + uint64_t cluster_offset; + QEMUIOVector hd_qiov; + uint64_t bytes_done = 0; + uint8_t *cluster_data = NULL; + QCowL2Meta *l2meta = NULL; + + trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num, + remaining_sectors); + + qemu_iovec_init(&hd_qiov, qiov->niov); + + s->cluster_cache_offset = -1; /* disable compressed cache */ + + qemu_co_mutex_lock(&s->lock); + + while (remaining_sectors != 0) { + + l2meta = NULL; + + trace_qcow2_writev_start_part(qemu_coroutine_self()); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n_end = index_in_cluster + remaining_sectors; + if (s->crypt_method && + n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) { + n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; + } + + ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, + index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta); + if (ret < 0) { + goto fail; + } + + assert((cluster_offset & 511) == 0); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + cur_nr_sectors * 512); + + if (s->crypt_method) { + if (!cluster_data) { + cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * + s->cluster_size); + } + + assert(hd_qiov.size <= + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); + qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); + + qcow2_encrypt_sectors(s, sector_num, cluster_data, + cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_add(&hd_qiov, cluster_data, + cur_nr_sectors * 512); + } + + qemu_co_mutex_unlock(&s->lock); + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); + trace_qcow2_writev_data(qemu_coroutine_self(), + (cluster_offset >> 9) + index_in_cluster); + ret = bdrv_co_writev(bs->file, + (cluster_offset >> 9) + index_in_cluster, + cur_nr_sectors, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + + while (l2meta != NULL) { + QCowL2Meta *next; + + ret = qcow2_alloc_cluster_link_l2(bs, l2meta); + if (ret < 0) { + goto fail; + } + + /* Take the request off the list of running requests */ + if (l2meta->nb_clusters != 0) { + QLIST_REMOVE(l2meta, next_in_flight); + } + + qemu_co_queue_restart_all(&l2meta->dependent_requests); + + next = l2meta->next; + g_free(l2meta); + l2meta = next; + } + + remaining_sectors -= cur_nr_sectors; + sector_num += cur_nr_sectors; + bytes_done += cur_nr_sectors * 512; + trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors); + } + ret = 0; + +fail: + qemu_co_mutex_unlock(&s->lock); + + while (l2meta != NULL) { + QCowL2Meta *next; + + if (l2meta->nb_clusters != 0) { + QLIST_REMOVE(l2meta, next_in_flight); + } + qemu_co_queue_restart_all(&l2meta->dependent_requests); + + next = l2meta->next; + g_free(l2meta); + l2meta = next; + } + + qemu_iovec_destroy(&hd_qiov); + qemu_vfree(cluster_data); + trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); + + return ret; +} + +static void qcow2_close(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + g_free(s->l1_table); + + qcow2_cache_flush(bs, s->l2_table_cache); + qcow2_cache_flush(bs, s->refcount_block_cache); + + qcow2_mark_clean(bs); + + qcow2_cache_destroy(bs, s->l2_table_cache); + qcow2_cache_destroy(bs, s->refcount_block_cache); + + g_free(s->unknown_header_fields); + cleanup_unknown_header_ext(bs); + + g_free(s->cluster_cache); + qemu_vfree(s->cluster_data); + qcow2_refcount_close(bs); + qcow2_free_snapshots(bs); +} + +static void qcow2_invalidate_cache(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int flags = s->flags; + AES_KEY aes_encrypt_key; + AES_KEY aes_decrypt_key; + uint32_t crypt_method = 0; + QDict *options; + + /* + * Backing files are read-only which makes all of their metadata immutable, + * that means we don't have to worry about reopening them here. + */ + + if (s->crypt_method) { + crypt_method = s->crypt_method; + memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key)); + memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key)); + } + + qcow2_close(bs); + + options = qdict_new(); + qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS, + qbool_from_int(s->use_lazy_refcounts)); + + memset(s, 0, sizeof(BDRVQcowState)); + qcow2_open(bs, options, flags); + + QDECREF(options); + + if (crypt_method) { + s->crypt_method = crypt_method; + memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key)); + memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key)); + } +} + +static size_t header_ext_add(char *buf, uint32_t magic, const void *s, + size_t len, size_t buflen) +{ + QCowExtension *ext_backing_fmt = (QCowExtension*) buf; + size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7); + + if (buflen < ext_len) { + return -ENOSPC; + } + + *ext_backing_fmt = (QCowExtension) { + .magic = cpu_to_be32(magic), + .len = cpu_to_be32(len), + }; + memcpy(buf + sizeof(QCowExtension), s, len); + + return ext_len; +} + +/* + * Updates the qcow2 header, including the variable length parts of it, i.e. + * the backing file name and all extensions. qcow2 was not designed to allow + * such changes, so if we run out of space (we can only use the first cluster) + * this function may fail. + * + * Returns 0 on success, -errno in error cases. + */ +int qcow2_update_header(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + QCowHeader *header; + char *buf; + size_t buflen = s->cluster_size; + int ret; + uint64_t total_size; + uint32_t refcount_table_clusters; + size_t header_length; + Qcow2UnknownHeaderExtension *uext; + + buf = qemu_blockalign(bs, buflen); + + /* Header structure */ + header = (QCowHeader*) buf; + + if (buflen < sizeof(*header)) { + ret = -ENOSPC; + goto fail; + } + + header_length = sizeof(*header) + s->unknown_header_fields_size; + total_size = bs->total_sectors * BDRV_SECTOR_SIZE; + refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3); + + *header = (QCowHeader) { + /* Version 2 fields */ + .magic = cpu_to_be32(QCOW_MAGIC), + .version = cpu_to_be32(s->qcow_version), + .backing_file_offset = 0, + .backing_file_size = 0, + .cluster_bits = cpu_to_be32(s->cluster_bits), + .size = cpu_to_be64(total_size), + .crypt_method = cpu_to_be32(s->crypt_method_header), + .l1_size = cpu_to_be32(s->l1_size), + .l1_table_offset = cpu_to_be64(s->l1_table_offset), + .refcount_table_offset = cpu_to_be64(s->refcount_table_offset), + .refcount_table_clusters = cpu_to_be32(refcount_table_clusters), + .nb_snapshots = cpu_to_be32(s->nb_snapshots), + .snapshots_offset = cpu_to_be64(s->snapshots_offset), + + /* Version 3 fields */ + .incompatible_features = cpu_to_be64(s->incompatible_features), + .compatible_features = cpu_to_be64(s->compatible_features), + .autoclear_features = cpu_to_be64(s->autoclear_features), + .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT), + .header_length = cpu_to_be32(header_length), + }; + + /* For older versions, write a shorter header */ + switch (s->qcow_version) { + case 2: + ret = offsetof(QCowHeader, incompatible_features); + break; + case 3: + ret = sizeof(*header); + break; + default: + ret = -EINVAL; + goto fail; + } + + buf += ret; + buflen -= ret; + memset(buf, 0, buflen); + + /* Preserve any unknown field in the header */ + if (s->unknown_header_fields_size) { + if (buflen < s->unknown_header_fields_size) { + ret = -ENOSPC; + goto fail; + } + + memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size); + buf += s->unknown_header_fields_size; + buflen -= s->unknown_header_fields_size; + } + + /* Backing file format header extension */ + if (*bs->backing_format) { + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT, + bs->backing_format, strlen(bs->backing_format), + buflen); + if (ret < 0) { + goto fail; + } + + buf += ret; + buflen -= ret; + } + + /* Feature table */ + Qcow2Feature features[] = { + { + .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, + .bit = QCOW2_INCOMPAT_DIRTY_BITNR, + .name = "dirty bit", + }, + { + .type = QCOW2_FEAT_TYPE_COMPATIBLE, + .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, + .name = "lazy refcounts", + }, + }; + + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE, + features, sizeof(features), buflen); + if (ret < 0) { + goto fail; + } + buf += ret; + buflen -= ret; + + /* Keep unknown header extensions */ + QLIST_FOREACH(uext, &s->unknown_header_ext, next) { + ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen); + if (ret < 0) { + goto fail; + } + + buf += ret; + buflen -= ret; + } + + /* End of header extensions */ + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen); + if (ret < 0) { + goto fail; + } + + buf += ret; + buflen -= ret; + + /* Backing file name */ + if (*bs->backing_file) { + size_t backing_file_len = strlen(bs->backing_file); + + if (buflen < backing_file_len) { + ret = -ENOSPC; + goto fail; + } + + /* Using strncpy is ok here, since buf is not NUL-terminated. */ + strncpy(buf, bs->backing_file, buflen); + + header->backing_file_offset = cpu_to_be64(buf - ((char*) header)); + header->backing_file_size = cpu_to_be32(backing_file_len); + } + + /* Write the new header */ + ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size); + if (ret < 0) { + goto fail; + } + + ret = 0; +fail: + qemu_vfree(header); + return ret; +} + +static int qcow2_change_backing_file(BlockDriverState *bs, + const char *backing_file, const char *backing_fmt) +{ + pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); + pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); + + return qcow2_update_header(bs); +} + +static int preallocate(BlockDriverState *bs) +{ + uint64_t nb_sectors; + uint64_t offset; + uint64_t host_offset = 0; + int num; + int ret; + QCowL2Meta *meta; + + nb_sectors = bdrv_getlength(bs) >> 9; + offset = 0; + + while (nb_sectors) { + num = MIN(nb_sectors, INT_MAX >> 9); + ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, + &host_offset, &meta); + if (ret < 0) { + return ret; + } + + ret = qcow2_alloc_cluster_link_l2(bs, meta); + if (ret < 0) { + qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters, + QCOW2_DISCARD_NEVER); + return ret; + } + + /* There are no dependent requests, but we need to remove our request + * from the list of in-flight requests */ + if (meta != NULL) { + QLIST_REMOVE(meta, next_in_flight); + } + + /* TODO Preallocate data if requested */ + + nb_sectors -= num; + offset += num << 9; + } + + /* + * It is expected that the image file is large enough to actually contain + * all of the allocated clusters (otherwise we get failing reads after + * EOF). Extend the image to the last allocated sector. + */ + if (host_offset != 0) { + uint8_t buf[512]; + memset(buf, 0, 512); + ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1); + if (ret < 0) { + return ret; + } + } + + return 0; +} + +static int qcow2_create2(const char *filename, int64_t total_size, + const char *backing_file, const char *backing_format, + int flags, size_t cluster_size, int prealloc, + QEMUOptionParameter *options, int version) +{ + /* Calculate cluster_bits */ + int cluster_bits; + cluster_bits = ffs(cluster_size) - 1; + if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || + (1 << cluster_bits) != cluster_size) + { + error_report( + "Cluster size must be a power of two between %d and %dk", + 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); + return -EINVAL; + } + + /* + * Open the image file and write a minimal qcow2 header. + * + * We keep things simple and start with a zero-sized image. We also + * do without refcount blocks or a L1 table for now. We'll fix the + * inconsistency later. + * + * We do need a refcount table because growing the refcount table means + * allocating two new refcount blocks - the seconds of which would be at + * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file + * size for any qcow2 image. + */ + BlockDriverState* bs; + QCowHeader header; + uint8_t* refcount_table; + int ret; + + ret = bdrv_create_file(filename, options); + if (ret < 0) { + return ret; + } + + ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); + if (ret < 0) { + return ret; + } + + /* Write the header */ + memset(&header, 0, sizeof(header)); + header.magic = cpu_to_be32(QCOW_MAGIC); + header.version = cpu_to_be32(version); + header.cluster_bits = cpu_to_be32(cluster_bits); + header.size = cpu_to_be64(0); + header.l1_table_offset = cpu_to_be64(0); + header.l1_size = cpu_to_be32(0); + header.refcount_table_offset = cpu_to_be64(cluster_size); + header.refcount_table_clusters = cpu_to_be32(1); + header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT); + header.header_length = cpu_to_be32(sizeof(header)); + + if (flags & BLOCK_FLAG_ENCRYPT) { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + } else { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + } + + if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { + header.compatible_features |= + cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); + } + + ret = bdrv_pwrite(bs, 0, &header, sizeof(header)); + if (ret < 0) { + goto out; + } + + /* Write an empty refcount table */ + refcount_table = g_malloc0(cluster_size); + ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size); + g_free(refcount_table); + + if (ret < 0) { + goto out; + } + + bdrv_close(bs); + + /* + * And now open the image and make it consistent first (i.e. increase the + * refcount of the cluster that is occupied by the header and the refcount + * table) + */ + BlockDriver* drv = bdrv_find_format("qcow2"); + assert(drv != NULL); + ret = bdrv_open(bs, filename, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv); + if (ret < 0) { + goto out; + } + + ret = qcow2_alloc_clusters(bs, 2 * cluster_size); + if (ret < 0) { + goto out; + + } else if (ret != 0) { + error_report("Huh, first cluster in empty image is already in use?"); + abort(); + } + + /* Okay, now that we have a valid image, let's give it the right size */ + ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE); + if (ret < 0) { + goto out; + } + + /* Want a backing file? There you go.*/ + if (backing_file) { + ret = bdrv_change_backing_file(bs, backing_file, backing_format); + if (ret < 0) { + goto out; + } + } + + /* And if we're supposed to preallocate metadata, do that now */ + if (prealloc) { + BDRVQcowState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = preallocate(bs); + qemu_co_mutex_unlock(&s->lock); + if (ret < 0) { + goto out; + } + } + + ret = 0; +out: + bdrv_delete(bs); + return ret; +} + +static int qcow2_create(const char *filename, QEMUOptionParameter *options) +{ + const char *backing_file = NULL; + const char *backing_fmt = NULL; + uint64_t sectors = 0; + int flags = 0; + size_t cluster_size = DEFAULT_CLUSTER_SIZE; + int prealloc = 0; + int version = 2; + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + sectors = options->value.n / 512; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + backing_file = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { + backing_fmt = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { + flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; + } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { + if (options->value.n) { + cluster_size = options->value.n; + } + } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { + if (!options->value.s || !strcmp(options->value.s, "off")) { + prealloc = 0; + } else if (!strcmp(options->value.s, "metadata")) { + prealloc = 1; + } else { + fprintf(stderr, "Invalid preallocation mode: '%s'\n", + options->value.s); + return -EINVAL; + } + } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) { + if (!options->value.s || !strcmp(options->value.s, "0.10")) { + version = 2; + } else if (!strcmp(options->value.s, "1.1")) { + version = 3; + } else { + fprintf(stderr, "Invalid compatibility level: '%s'\n", + options->value.s); + return -EINVAL; + } + } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) { + flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0; + } + options++; + } + + if (backing_file && prealloc) { + fprintf(stderr, "Backing file and preallocation cannot be used at " + "the same time\n"); + return -EINVAL; + } + + if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { + fprintf(stderr, "Lazy refcounts only supported with compatibility " + "level 1.1 and above (use compat=1.1 or greater)\n"); + return -EINVAL; + } + + return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags, + cluster_size, prealloc, options, version); +} + +static int qcow2_make_empty(BlockDriverState *bs) +{ +#if 0 + /* XXX: not correct */ + BDRVQcowState *s = bs->opaque; + uint32_t l1_length = s->l1_size * sizeof(uint64_t); + int ret; + + memset(s->l1_table, 0, l1_length); + if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0) + return -1; + ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); + if (ret < 0) + return ret; + + l2_cache_reset(bs); +#endif + return 0; +} + +static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + int ret; + BDRVQcowState *s = bs->opaque; + + /* Emulate misaligned zero writes */ + if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) { + return -ENOTSUP; + } + + /* Whatever is left can use real zero clusters */ + qemu_co_mutex_lock(&s->lock); + ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors); + qemu_co_mutex_unlock(&s->lock); + + return ret; +} + +static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + int ret; + BDRVQcowState *s = bs->opaque; + + qemu_co_mutex_lock(&s->lock); + ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int qcow2_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVQcowState *s = bs->opaque; + int64_t new_l1_size; + int ret; + + if (offset & 511) { + error_report("The new size must be a multiple of 512"); + return -EINVAL; + } + + /* cannot proceed if image has snapshots */ + if (s->nb_snapshots) { + error_report("Can't resize an image which has snapshots"); + return -ENOTSUP; + } + + /* shrinking is currently not supported */ + if (offset < bs->total_sectors * 512) { + error_report("qcow2 doesn't support shrinking images yet"); + return -ENOTSUP; + } + + new_l1_size = size_to_l1(s, offset); + ret = qcow2_grow_l1_table(bs, new_l1_size, true); + if (ret < 0) { + return ret; + } + + /* write updated header.size */ + offset = cpu_to_be64(offset); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size), + &offset, sizeof(uint64_t)); + if (ret < 0) { + return ret; + } + + s->l1_vm_state_index = new_l1_size; + return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned + tables to avoid losing bytes in alignment */ +static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + z_stream strm; + int ret, out_len; + uint8_t *out_buf; + uint64_t cluster_offset; + + if (nb_sectors == 0) { + /* align end of file to a sector boundary to ease reading with + sector based I/Os */ + cluster_offset = bdrv_getlength(bs->file); + cluster_offset = (cluster_offset + 511) & ~511; + bdrv_truncate(bs->file, cluster_offset); + return 0; + } + + if (nb_sectors != s->cluster_sectors) { + ret = -EINVAL; + + /* Zero-pad last write if image size is not cluster aligned */ + if (sector_num + nb_sectors == bs->total_sectors && + nb_sectors < s->cluster_sectors) { + uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); + memset(pad_buf, 0, s->cluster_size); + memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); + ret = qcow2_write_compressed(bs, sector_num, + pad_buf, s->cluster_sectors); + qemu_vfree(pad_buf); + } + return ret; + } + + out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -12, + 9, Z_DEFAULT_STRATEGY); + if (ret != 0) { + ret = -EINVAL; + goto fail; + } + + strm.avail_in = s->cluster_size; + strm.next_in = (uint8_t *)buf; + strm.avail_out = s->cluster_size; + strm.next_out = out_buf; + + ret = deflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END && ret != Z_OK) { + deflateEnd(&strm); + ret = -EINVAL; + goto fail; + } + out_len = strm.next_out - out_buf; + + deflateEnd(&strm); + + if (ret != Z_STREAM_END || out_len >= s->cluster_size) { + /* could not compress: write normal cluster */ + ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); + if (ret < 0) { + goto fail; + } + } else { + cluster_offset = qcow2_alloc_compressed_cluster_offset(bs, + sector_num << 9, out_len); + if (!cluster_offset) { + ret = -EIO; + goto fail; + } + cluster_offset &= s->cluster_offset_mask; + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); + ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); + if (ret < 0) { + goto fail; + } + } + + ret = 0; +fail: + g_free(out_buf); + return ret; +} + +static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + qemu_co_mutex_lock(&s->lock); + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret < 0) { + qemu_co_mutex_unlock(&s->lock); + return ret; + } + + if (qcow2_need_accurate_refcounts(s)) { + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + qemu_co_mutex_unlock(&s->lock); + return ret; + } + } + qemu_co_mutex_unlock(&s->lock); + + return 0; +} + +static int64_t qcow2_vm_state_offset(BDRVQcowState *s) +{ + return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); +} + +static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQcowState *s = bs->opaque; + bdi->cluster_size = s->cluster_size; + bdi->vm_state_offset = qcow2_vm_state_offset(s); + return 0; +} + +#if 0 +static void dump_refcounts(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int64_t nb_clusters, k, k1, size; + int refcount; + + size = bdrv_getlength(bs->file); + nb_clusters = size_to_clusters(s, size); + for(k = 0; k < nb_clusters;) { + k1 = k; + refcount = get_refcount(bs, k); + k++; + while (k < nb_clusters && get_refcount(bs, k) == refcount) + k++; + printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount, + k - k1); + } +} +#endif + +static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t pos) +{ + BDRVQcowState *s = bs->opaque; + int growable = bs->growable; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); + bs->growable = 1; + ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); + bs->growable = growable; + + return ret; +} + +static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) +{ + BDRVQcowState *s = bs->opaque; + int growable = bs->growable; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); + bs->growable = 1; + ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size); + bs->growable = growable; + + return ret; +} + +static QEMUOptionParameter qcow2_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_COMPAT_LEVEL, + .type = OPT_STRING, + .help = "Compatibility level (0.10 or 1.1)" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_BACKING_FMT, + .type = OPT_STRING, + .help = "Image format of the base image" + }, + { + .name = BLOCK_OPT_ENCRYPT, + .type = OPT_FLAG, + .help = "Encrypt the image" + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = OPT_SIZE, + .help = "qcow2 cluster size", + .value = { .n = DEFAULT_CLUSTER_SIZE }, + }, + { + .name = BLOCK_OPT_PREALLOC, + .type = OPT_STRING, + .help = "Preallocation mode (allowed values: off, metadata)" + }, + { + .name = BLOCK_OPT_LAZY_REFCOUNTS, + .type = OPT_FLAG, + .help = "Postpone refcount updates", + }, + { NULL } +}; + +static BlockDriver bdrv_qcow2 = { + .format_name = "qcow2", + .instance_size = sizeof(BDRVQcowState), + .bdrv_probe = qcow2_probe, + .bdrv_open = qcow2_open, + .bdrv_close = qcow2_close, + .bdrv_reopen_prepare = qcow2_reopen_prepare, + .bdrv_create = qcow2_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_co_is_allocated = qcow2_co_is_allocated, + .bdrv_set_key = qcow2_set_key, + .bdrv_make_empty = qcow2_make_empty, + + .bdrv_co_readv = qcow2_co_readv, + .bdrv_co_writev = qcow2_co_writev, + .bdrv_co_flush_to_os = qcow2_co_flush_to_os, + + .bdrv_co_write_zeroes = qcow2_co_write_zeroes, + .bdrv_co_discard = qcow2_co_discard, + .bdrv_truncate = qcow2_truncate, + .bdrv_write_compressed = qcow2_write_compressed, + + .bdrv_snapshot_create = qcow2_snapshot_create, + .bdrv_snapshot_goto = qcow2_snapshot_goto, + .bdrv_snapshot_delete = qcow2_snapshot_delete, + .bdrv_snapshot_list = qcow2_snapshot_list, + .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, + .bdrv_get_info = qcow2_get_info, + + .bdrv_save_vmstate = qcow2_save_vmstate, + .bdrv_load_vmstate = qcow2_load_vmstate, + + .bdrv_change_backing_file = qcow2_change_backing_file, + + .bdrv_invalidate_cache = qcow2_invalidate_cache, + + .create_options = qcow2_create_options, + .bdrv_check = qcow2_check, +}; + +static void bdrv_qcow2_init(void) +{ + bdrv_register(&bdrv_qcow2); +} + +block_init(bdrv_qcow2_init); diff --git a/contrib/qemu/block/qcow2.h b/contrib/qemu/block/qcow2.h new file mode 100644 index 000000000..3b2d5cda7 --- /dev/null +++ b/contrib/qemu/block/qcow2.h @@ -0,0 +1,437 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef BLOCK_QCOW2_H +#define BLOCK_QCOW2_H + +#include "qemu/aes.h" +#include "block/coroutine.h" + +//#define DEBUG_ALLOC +//#define DEBUG_ALLOC2 +//#define DEBUG_EXT + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES 1 + +#define QCOW_MAX_CRYPT_CLUSTERS 32 + +/* indicate that the refcount of the referenced cluster is exactly one. */ +#define QCOW_OFLAG_COPIED (1LL << 63) +/* indicate that the cluster is compressed (they never have the copied flag) */ +#define QCOW_OFLAG_COMPRESSED (1LL << 62) +/* The cluster reads as all zeros */ +#define QCOW_OFLAG_ZERO (1LL << 0) + +#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */ + +#define MIN_CLUSTER_BITS 9 +#define MAX_CLUSTER_BITS 21 + +#define L2_CACHE_SIZE 16 + +/* Must be at least 4 to cover all cases of refcount table growth */ +#define REFCOUNT_CACHE_SIZE 4 + +#define DEFAULT_CLUSTER_SIZE 65536 + + +#define QCOW2_OPT_LAZY_REFCOUNTS "lazy_refcounts" +#define QCOW2_OPT_DISCARD_REQUEST "pass_discard_request" +#define QCOW2_OPT_DISCARD_SNAPSHOT "pass_discard_snapshot" +#define QCOW2_OPT_DISCARD_OTHER "pass_discard_other" + +typedef struct QCowHeader { + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t cluster_bits; + uint64_t size; /* in bytes */ + uint32_t crypt_method; + uint32_t l1_size; /* XXX: save number of clusters instead ? */ + uint64_t l1_table_offset; + uint64_t refcount_table_offset; + uint32_t refcount_table_clusters; + uint32_t nb_snapshots; + uint64_t snapshots_offset; + + /* The following fields are only valid for version >= 3 */ + uint64_t incompatible_features; + uint64_t compatible_features; + uint64_t autoclear_features; + + uint32_t refcount_order; + uint32_t header_length; +} QCowHeader; + +typedef struct QCowSnapshot { + uint64_t l1_table_offset; + uint32_t l1_size; + char *id_str; + char *name; + uint64_t disk_size; + uint64_t vm_state_size; + uint32_t date_sec; + uint32_t date_nsec; + uint64_t vm_clock_nsec; +} QCowSnapshot; + +struct Qcow2Cache; +typedef struct Qcow2Cache Qcow2Cache; + +typedef struct Qcow2UnknownHeaderExtension { + uint32_t magic; + uint32_t len; + QLIST_ENTRY(Qcow2UnknownHeaderExtension) next; + uint8_t data[]; +} Qcow2UnknownHeaderExtension; + +enum { + QCOW2_FEAT_TYPE_INCOMPATIBLE = 0, + QCOW2_FEAT_TYPE_COMPATIBLE = 1, + QCOW2_FEAT_TYPE_AUTOCLEAR = 2, +}; + +/* Incompatible feature bits */ +enum { + QCOW2_INCOMPAT_DIRTY_BITNR = 0, + QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, + + QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY, +}; + +/* Compatible feature bits */ +enum { + QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0, + QCOW2_COMPAT_LAZY_REFCOUNTS = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, + + QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS, +}; + +enum qcow2_discard_type { + QCOW2_DISCARD_NEVER = 0, + QCOW2_DISCARD_ALWAYS, + QCOW2_DISCARD_REQUEST, + QCOW2_DISCARD_SNAPSHOT, + QCOW2_DISCARD_OTHER, + QCOW2_DISCARD_MAX +}; + +typedef struct Qcow2Feature { + uint8_t type; + uint8_t bit; + char name[46]; +} QEMU_PACKED Qcow2Feature; + +typedef struct Qcow2DiscardRegion { + BlockDriverState *bs; + uint64_t offset; + uint64_t bytes; + QTAILQ_ENTRY(Qcow2DiscardRegion) next; +} Qcow2DiscardRegion; + +typedef struct BDRVQcowState { + int cluster_bits; + int cluster_size; + int cluster_sectors; + int l2_bits; + int l2_size; + int l1_size; + int l1_vm_state_index; + int csize_shift; + int csize_mask; + uint64_t cluster_offset_mask; + uint64_t l1_table_offset; + uint64_t *l1_table; + + Qcow2Cache* l2_table_cache; + Qcow2Cache* refcount_block_cache; + + uint8_t *cluster_cache; + uint8_t *cluster_data; + uint64_t cluster_cache_offset; + QLIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs; + + uint64_t *refcount_table; + uint64_t refcount_table_offset; + uint32_t refcount_table_size; + int64_t free_cluster_index; + int64_t free_byte_offset; + + CoMutex lock; + + uint32_t crypt_method; /* current crypt method, 0 if no key yet */ + uint32_t crypt_method_header; + AES_KEY aes_encrypt_key; + AES_KEY aes_decrypt_key; + uint64_t snapshots_offset; + int snapshots_size; + int nb_snapshots; + QCowSnapshot *snapshots; + + int flags; + int qcow_version; + bool use_lazy_refcounts; + + bool discard_passthrough[QCOW2_DISCARD_MAX]; + + uint64_t incompatible_features; + uint64_t compatible_features; + uint64_t autoclear_features; + + size_t unknown_header_fields_size; + void* unknown_header_fields; + QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext; + QTAILQ_HEAD (, Qcow2DiscardRegion) discards; + bool cache_discards; +} BDRVQcowState; + +/* XXX: use std qcow open function ? */ +typedef struct QCowCreateState { + int cluster_size; + int cluster_bits; + uint16_t *refcount_block; + uint64_t *refcount_table; + int64_t l1_table_offset; + int64_t refcount_table_offset; + int64_t refcount_block_offset; +} QCowCreateState; + +struct QCowAIOCB; + +typedef struct Qcow2COWRegion { + /** + * Offset of the COW region in bytes from the start of the first cluster + * touched by the request. + */ + uint64_t offset; + + /** Number of sectors to copy */ + int nb_sectors; +} Qcow2COWRegion; + +/** + * Describes an in-flight (part of a) write request that writes to clusters + * that are not referenced in their L2 table yet. + */ +typedef struct QCowL2Meta +{ + /** Guest offset of the first newly allocated cluster */ + uint64_t offset; + + /** Host offset of the first newly allocated cluster */ + uint64_t alloc_offset; + + /** + * Number of sectors from the start of the first allocated cluster to + * the end of the (possibly shortened) request + */ + int nb_available; + + /** Number of newly allocated clusters */ + int nb_clusters; + + /** + * Requests that overlap with this allocation and wait to be restarted + * when the allocating request has completed. + */ + CoQueue dependent_requests; + + /** + * The COW Region between the start of the first allocated cluster and the + * area the guest actually writes to. + */ + Qcow2COWRegion cow_start; + + /** + * The COW Region between the area the guest actually writes to and the + * end of the last allocated cluster. + */ + Qcow2COWRegion cow_end; + + /** Pointer to next L2Meta of the same write request */ + struct QCowL2Meta *next; + + QLIST_ENTRY(QCowL2Meta) next_in_flight; +} QCowL2Meta; + +enum { + QCOW2_CLUSTER_UNALLOCATED, + QCOW2_CLUSTER_NORMAL, + QCOW2_CLUSTER_COMPRESSED, + QCOW2_CLUSTER_ZERO +}; + +#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL +#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL +#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL + +#define REFT_OFFSET_MASK 0xffffffffffffff00ULL + +static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset) +{ + return offset & ~(s->cluster_size - 1); +} + +static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset) +{ + return offset & (s->cluster_size - 1); +} + +static inline int size_to_clusters(BDRVQcowState *s, int64_t size) +{ + return (size + (s->cluster_size - 1)) >> s->cluster_bits; +} + +static inline int64_t size_to_l1(BDRVQcowState *s, int64_t size) +{ + int shift = s->cluster_bits + s->l2_bits; + return (size + (1ULL << shift) - 1) >> shift; +} + +static inline int offset_to_l2_index(BDRVQcowState *s, int64_t offset) +{ + return (offset >> s->cluster_bits) & (s->l2_size - 1); +} + +static inline int64_t align_offset(int64_t offset, int n) +{ + offset = (offset + n - 1) & ~(n - 1); + return offset; +} + +static inline int qcow2_get_cluster_type(uint64_t l2_entry) +{ + if (l2_entry & QCOW_OFLAG_COMPRESSED) { + return QCOW2_CLUSTER_COMPRESSED; + } else if (l2_entry & QCOW_OFLAG_ZERO) { + return QCOW2_CLUSTER_ZERO; + } else if (!(l2_entry & L2E_OFFSET_MASK)) { + return QCOW2_CLUSTER_UNALLOCATED; + } else { + return QCOW2_CLUSTER_NORMAL; + } +} + +/* Check whether refcounts are eager or lazy */ +static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s) +{ + return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY); +} + +static inline uint64_t l2meta_cow_start(QCowL2Meta *m) +{ + return m->offset + m->cow_start.offset; +} + +static inline uint64_t l2meta_cow_end(QCowL2Meta *m) +{ + return m->offset + m->cow_end.offset + + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS); +} + +// FIXME Need qcow2_ prefix to global functions + +/* qcow2.c functions */ +int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t sector_num, int nb_sectors); + +int qcow2_mark_dirty(BlockDriverState *bs); +int qcow2_update_header(BlockDriverState *bs); + +/* qcow2-refcount.c functions */ +int qcow2_refcount_init(BlockDriverState *bs); +void qcow2_refcount_close(BlockDriverState *bs); + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size); +int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int nb_clusters); +int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); +void qcow2_free_clusters(BlockDriverState *bs, + int64_t offset, int64_t size, + enum qcow2_discard_type type); +void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, + int nb_clusters, enum qcow2_discard_type type); + +int qcow2_update_snapshot_refcount(BlockDriverState *bs, + int64_t l1_table_offset, int l1_size, int addend); + +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix); + +void qcow2_process_discards(BlockDriverState *bs, int ret); + +/* qcow2-cluster.c functions */ +int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, + bool exact_size); +void qcow2_l2_cache_reset(BlockDriverState *bs); +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); +void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, int enc, + const AES_KEY *key); + +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, + int *num, uint64_t *cluster_offset); +int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, + int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m); +uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, + uint64_t offset, + int compressed_size); + +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); +int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, + int nb_sectors); +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); + +/* qcow2-snapshot.c functions */ +int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); +int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id); +int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id); +int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab); +int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name); + +void qcow2_free_snapshots(BlockDriverState *bs); +int qcow2_read_snapshots(BlockDriverState *bs); + +/* qcow2-cache.c functions */ +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables); +int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); + +void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table); +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); +int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, + Qcow2Cache *dependency); +void qcow2_cache_depends_on_flush(Qcow2Cache *c); + +int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table); +int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table); +int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); + +#endif diff --git a/contrib/qemu/block/qed-check.c b/contrib/qemu/block/qed-check.c new file mode 100644 index 000000000..b473dcd61 --- /dev/null +++ b/contrib/qemu/block/qed-check.c @@ -0,0 +1,248 @@ +/* + * QEMU Enhanced Disk Format Consistency Check + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +typedef struct { + BDRVQEDState *s; + BdrvCheckResult *result; + bool fix; /* whether to fix invalid offsets */ + + uint64_t nclusters; + uint32_t *used_clusters; /* referenced cluster bitmap */ + + QEDRequest request; +} QEDCheck; + +static bool qed_test_bit(uint32_t *bitmap, uint64_t n) { + return !!(bitmap[n / 32] & (1 << (n % 32))); +} + +static void qed_set_bit(uint32_t *bitmap, uint64_t n) { + bitmap[n / 32] |= 1 << (n % 32); +} + +/** + * Set bitmap bits for clusters + * + * @check: Check structure + * @offset: Starting offset in bytes + * @n: Number of clusters + */ +static bool qed_set_used_clusters(QEDCheck *check, uint64_t offset, + unsigned int n) +{ + uint64_t cluster = qed_bytes_to_clusters(check->s, offset); + unsigned int corruptions = 0; + + while (n-- != 0) { + /* Clusters should only be referenced once */ + if (qed_test_bit(check->used_clusters, cluster)) { + corruptions++; + } + + qed_set_bit(check->used_clusters, cluster); + cluster++; + } + + check->result->corruptions += corruptions; + return corruptions == 0; +} + +/** + * Check an L2 table + * + * @ret: Number of invalid cluster offsets + */ +static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table) +{ + BDRVQEDState *s = check->s; + unsigned int i, num_invalid = 0; + uint64_t last_offset = 0; + + for (i = 0; i < s->table_nelems; i++) { + uint64_t offset = table->offsets[i]; + + if (qed_offset_is_unalloc_cluster(offset) || + qed_offset_is_zero_cluster(offset)) { + continue; + } + check->result->bfi.allocated_clusters++; + if (last_offset && (last_offset + s->header.cluster_size != offset)) { + check->result->bfi.fragmented_clusters++; + } + last_offset = offset; + + /* Detect invalid cluster offset */ + if (!qed_check_cluster_offset(s, offset)) { + if (check->fix) { + table->offsets[i] = 0; + check->result->corruptions_fixed++; + } else { + check->result->corruptions++; + } + + num_invalid++; + continue; + } + + qed_set_used_clusters(check, offset, 1); + } + + return num_invalid; +} + +/** + * Descend tables and check each cluster is referenced once only + */ +static int qed_check_l1_table(QEDCheck *check, QEDTable *table) +{ + BDRVQEDState *s = check->s; + unsigned int i, num_invalid_l1 = 0; + int ret, last_error = 0; + + /* Mark L1 table clusters used */ + qed_set_used_clusters(check, s->header.l1_table_offset, + s->header.table_size); + + for (i = 0; i < s->table_nelems; i++) { + unsigned int num_invalid_l2; + uint64_t offset = table->offsets[i]; + + if (qed_offset_is_unalloc_cluster(offset)) { + continue; + } + + /* Detect invalid L2 offset */ + if (!qed_check_table_offset(s, offset)) { + /* Clear invalid offset */ + if (check->fix) { + table->offsets[i] = 0; + check->result->corruptions_fixed++; + } else { + check->result->corruptions++; + } + + num_invalid_l1++; + continue; + } + + if (!qed_set_used_clusters(check, offset, s->header.table_size)) { + continue; /* skip an invalid table */ + } + + ret = qed_read_l2_table_sync(s, &check->request, offset); + if (ret) { + check->result->check_errors++; + last_error = ret; + continue; + } + + num_invalid_l2 = qed_check_l2_table(check, + check->request.l2_table->table); + + /* Write out fixed L2 table */ + if (num_invalid_l2 > 0 && check->fix) { + ret = qed_write_l2_table_sync(s, &check->request, 0, + s->table_nelems, false); + if (ret) { + check->result->check_errors++; + last_error = ret; + continue; + } + } + } + + /* Drop reference to final table */ + qed_unref_l2_cache_entry(check->request.l2_table); + check->request.l2_table = NULL; + + /* Write out fixed L1 table */ + if (num_invalid_l1 > 0 && check->fix) { + ret = qed_write_l1_table_sync(s, 0, s->table_nelems); + if (ret) { + check->result->check_errors++; + last_error = ret; + } + } + + return last_error; +} + +/** + * Check for unreferenced (leaked) clusters + */ +static void qed_check_for_leaks(QEDCheck *check) +{ + BDRVQEDState *s = check->s; + uint64_t i; + + for (i = s->header.header_size; i < check->nclusters; i++) { + if (!qed_test_bit(check->used_clusters, i)) { + check->result->leaks++; + } + } +} + +/** + * Mark an image clean once it passes check or has been repaired + */ +static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result) +{ + /* Skip if there were unfixable corruptions or I/O errors */ + if (result->corruptions > 0 || result->check_errors > 0) { + return; + } + + /* Skip if image is already marked clean */ + if (!(s->header.features & QED_F_NEED_CHECK)) { + return; + } + + /* Ensure fixes reach storage before clearing check bit */ + bdrv_flush(s->bs); + + s->header.features &= ~QED_F_NEED_CHECK; + qed_write_header_sync(s); +} + +int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix) +{ + QEDCheck check = { + .s = s, + .result = result, + .nclusters = qed_bytes_to_clusters(s, s->file_size), + .request = { .l2_table = NULL }, + .fix = fix, + }; + int ret; + + check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) * + sizeof(check.used_clusters[0])); + + check.result->bfi.total_clusters = + (s->header.image_size + s->header.cluster_size - 1) / + s->header.cluster_size; + ret = qed_check_l1_table(&check, s->l1_table); + if (ret == 0) { + /* Only check for leaks if entire image was scanned successfully */ + qed_check_for_leaks(&check); + + if (fix) { + qed_check_mark_clean(s, result); + } + } + + g_free(check.used_clusters); + return ret; +} diff --git a/contrib/qemu/block/qed-cluster.c b/contrib/qemu/block/qed-cluster.c new file mode 100644 index 000000000..f64b2af8f --- /dev/null +++ b/contrib/qemu/block/qed-cluster.c @@ -0,0 +1,165 @@ +/* + * QEMU Enhanced Disk Format Cluster functions + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +/** + * Count the number of contiguous data clusters + * + * @s: QED state + * @table: L2 table + * @index: First cluster index + * @n: Maximum number of clusters + * @offset: Set to first cluster offset + * + * This function scans tables for contiguous clusters. A contiguous run of + * clusters may be allocated, unallocated, or zero. + */ +static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s, + QEDTable *table, + unsigned int index, + unsigned int n, + uint64_t *offset) +{ + unsigned int end = MIN(index + n, s->table_nelems); + uint64_t last = table->offsets[index]; + unsigned int i; + + *offset = last; + + for (i = index + 1; i < end; i++) { + if (qed_offset_is_unalloc_cluster(last)) { + /* Counting unallocated clusters */ + if (!qed_offset_is_unalloc_cluster(table->offsets[i])) { + break; + } + } else if (qed_offset_is_zero_cluster(last)) { + /* Counting zero clusters */ + if (!qed_offset_is_zero_cluster(table->offsets[i])) { + break; + } + } else { + /* Counting allocated clusters */ + if (table->offsets[i] != last + s->header.cluster_size) { + break; + } + last = table->offsets[i]; + } + } + return i - index; +} + +typedef struct { + BDRVQEDState *s; + uint64_t pos; + size_t len; + + QEDRequest *request; + + /* User callback */ + QEDFindClusterFunc *cb; + void *opaque; +} QEDFindClusterCB; + +static void qed_find_cluster_cb(void *opaque, int ret) +{ + QEDFindClusterCB *find_cluster_cb = opaque; + BDRVQEDState *s = find_cluster_cb->s; + QEDRequest *request = find_cluster_cb->request; + uint64_t offset = 0; + size_t len = 0; + unsigned int index; + unsigned int n; + + if (ret) { + goto out; + } + + index = qed_l2_index(s, find_cluster_cb->pos); + n = qed_bytes_to_clusters(s, + qed_offset_into_cluster(s, find_cluster_cb->pos) + + find_cluster_cb->len); + n = qed_count_contiguous_clusters(s, request->l2_table->table, + index, n, &offset); + + if (qed_offset_is_unalloc_cluster(offset)) { + ret = QED_CLUSTER_L2; + } else if (qed_offset_is_zero_cluster(offset)) { + ret = QED_CLUSTER_ZERO; + } else if (qed_check_cluster_offset(s, offset)) { + ret = QED_CLUSTER_FOUND; + } else { + ret = -EINVAL; + } + + len = MIN(find_cluster_cb->len, n * s->header.cluster_size - + qed_offset_into_cluster(s, find_cluster_cb->pos)); + +out: + find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len); + g_free(find_cluster_cb); +} + +/** + * Find the offset of a data cluster + * + * @s: QED state + * @request: L2 cache entry + * @pos: Byte position in device + * @len: Number of bytes + * @cb: Completion function + * @opaque: User data for completion function + * + * This function translates a position in the block device to an offset in the + * image file. It invokes the cb completion callback to report back the + * translated offset or unallocated range in the image file. + * + * If the L2 table exists, request->l2_table points to the L2 table cache entry + * and the caller must free the reference when they are finished. The cache + * entry is exposed in this way to avoid callers having to read the L2 table + * again later during request processing. If request->l2_table is non-NULL it + * will be unreferenced before taking on the new cache entry. + */ +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, + size_t len, QEDFindClusterFunc *cb, void *opaque) +{ + QEDFindClusterCB *find_cluster_cb; + uint64_t l2_offset; + + /* Limit length to L2 boundary. Requests are broken up at the L2 boundary + * so that a request acts on one L2 table at a time. + */ + len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos); + + l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)]; + if (qed_offset_is_unalloc_cluster(l2_offset)) { + cb(opaque, QED_CLUSTER_L1, 0, len); + return; + } + if (!qed_check_table_offset(s, l2_offset)) { + cb(opaque, -EINVAL, 0, 0); + return; + } + + find_cluster_cb = g_malloc(sizeof(*find_cluster_cb)); + find_cluster_cb->s = s; + find_cluster_cb->pos = pos; + find_cluster_cb->len = len; + find_cluster_cb->cb = cb; + find_cluster_cb->opaque = opaque; + find_cluster_cb->request = request; + + qed_read_l2_table(s, request, l2_offset, + qed_find_cluster_cb, find_cluster_cb); +} diff --git a/contrib/qemu/block/qed-gencb.c b/contrib/qemu/block/qed-gencb.c new file mode 100644 index 000000000..7d7ac1ffc --- /dev/null +++ b/contrib/qemu/block/qed-gencb.c @@ -0,0 +1,32 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque) +{ + GenericCB *gencb = g_malloc(len); + gencb->cb = cb; + gencb->opaque = opaque; + return gencb; +} + +void gencb_complete(void *opaque, int ret) +{ + GenericCB *gencb = opaque; + BlockDriverCompletionFunc *cb = gencb->cb; + void *user_opaque = gencb->opaque; + + g_free(gencb); + cb(user_opaque, ret); +} diff --git a/contrib/qemu/block/qed-l2-cache.c b/contrib/qemu/block/qed-l2-cache.c new file mode 100644 index 000000000..e9b2aae44 --- /dev/null +++ b/contrib/qemu/block/qed-l2-cache.c @@ -0,0 +1,187 @@ +/* + * QEMU Enhanced Disk Format L2 Cache + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +/* + * L2 table cache usage is as follows: + * + * An open image has one L2 table cache that is used to avoid accessing the + * image file for recently referenced L2 tables. + * + * Cluster offset lookup translates the logical offset within the block device + * to a cluster offset within the image file. This is done by indexing into + * the L1 and L2 tables which store cluster offsets. It is here where the L2 + * table cache serves up recently referenced L2 tables. + * + * If there is a cache miss, that L2 table is read from the image file and + * committed to the cache. Subsequent accesses to that L2 table will be served + * from the cache until the table is evicted from the cache. + * + * L2 tables are also committed to the cache when new L2 tables are allocated + * in the image file. Since the L2 table cache is write-through, the new L2 + * table is first written out to the image file and then committed to the + * cache. + * + * Multiple I/O requests may be using an L2 table cache entry at any given + * time. That means an entry may be in use across several requests and + * reference counting is needed to free the entry at the correct time. In + * particular, an entry evicted from the cache will only be freed once all + * references are dropped. + * + * An in-flight I/O request will hold a reference to a L2 table cache entry for + * the period during which it needs to access the L2 table. This includes + * cluster offset lookup, L2 table allocation, and L2 table update when a new + * data cluster has been allocated. + * + * An interesting case occurs when two requests need to access an L2 table that + * is not in the cache. Since the operation to read the table from the image + * file takes some time to complete, both requests may see a cache miss and + * start reading the L2 table from the image file. The first to finish will + * commit its L2 table into the cache. When the second tries to commit its + * table will be deleted in favor of the existing cache entry. + */ + +#include "trace.h" +#include "qed.h" + +/* Each L2 holds 2GB so this let's us fully cache a 100GB disk */ +#define MAX_L2_CACHE_SIZE 50 + +/** + * Initialize the L2 cache + */ +void qed_init_l2_cache(L2TableCache *l2_cache) +{ + QTAILQ_INIT(&l2_cache->entries); + l2_cache->n_entries = 0; +} + +/** + * Free the L2 cache + */ +void qed_free_l2_cache(L2TableCache *l2_cache) +{ + CachedL2Table *entry, *next_entry; + + QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) { + qemu_vfree(entry->table); + g_free(entry); + } +} + +/** + * Allocate an uninitialized entry from the cache + * + * The returned entry has a reference count of 1 and is owned by the caller. + * The caller must allocate the actual table field for this entry and it must + * be freeable using qemu_vfree(). + */ +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache) +{ + CachedL2Table *entry; + + entry = g_malloc0(sizeof(*entry)); + entry->ref++; + + trace_qed_alloc_l2_cache_entry(l2_cache, entry); + + return entry; +} + +/** + * Decrease an entry's reference count and free if necessary when the reference + * count drops to zero. + */ +void qed_unref_l2_cache_entry(CachedL2Table *entry) +{ + if (!entry) { + return; + } + + entry->ref--; + trace_qed_unref_l2_cache_entry(entry, entry->ref); + if (entry->ref == 0) { + qemu_vfree(entry->table); + g_free(entry); + } +} + +/** + * Find an entry in the L2 cache. This may return NULL and it's up to the + * caller to satisfy the cache miss. + * + * For a cached entry, this function increases the reference count and returns + * the entry. + */ +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset) +{ + CachedL2Table *entry; + + QTAILQ_FOREACH(entry, &l2_cache->entries, node) { + if (entry->offset == offset) { + trace_qed_find_l2_cache_entry(l2_cache, entry, offset, entry->ref); + entry->ref++; + return entry; + } + } + return NULL; +} + +/** + * Commit an L2 cache entry into the cache. This is meant to be used as part of + * the process to satisfy a cache miss. A caller would allocate an entry which + * is not actually in the L2 cache and then once the entry was valid and + * present on disk, the entry can be committed into the cache. + * + * Since the cache is write-through, it's important that this function is not + * called until the entry is present on disk and the L1 has been updated to + * point to the entry. + * + * N.B. This function steals a reference to the l2_table from the caller so the + * caller must obtain a new reference by issuing a call to + * qed_find_l2_cache_entry(). + */ +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table) +{ + CachedL2Table *entry; + + entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset); + if (entry) { + qed_unref_l2_cache_entry(entry); + qed_unref_l2_cache_entry(l2_table); + return; + } + + /* Evict an unused cache entry so we have space. If all entries are in use + * we can grow the cache temporarily and we try to shrink back down later. + */ + if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) { + CachedL2Table *next; + QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next) { + if (entry->ref > 1) { + continue; + } + + QTAILQ_REMOVE(&l2_cache->entries, entry, node); + l2_cache->n_entries--; + qed_unref_l2_cache_entry(entry); + + /* Stop evicting when we've shrunk back to max size */ + if (l2_cache->n_entries < MAX_L2_CACHE_SIZE) { + break; + } + } + } + + l2_cache->n_entries++; + QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node); +} diff --git a/contrib/qemu/block/qed-table.c b/contrib/qemu/block/qed-table.c new file mode 100644 index 000000000..76d2dcccf --- /dev/null +++ b/contrib/qemu/block/qed-table.c @@ -0,0 +1,296 @@ +/* + * QEMU Enhanced Disk Format Table I/O + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */ +#include "qed.h" + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEDTable *table; + + struct iovec iov; + QEMUIOVector qiov; +} QEDReadTableCB; + +static void qed_read_table_cb(void *opaque, int ret) +{ + QEDReadTableCB *read_table_cb = opaque; + QEDTable *table = read_table_cb->table; + int noffsets = read_table_cb->qiov.size / sizeof(uint64_t); + int i; + + /* Handle I/O error */ + if (ret) { + goto out; + } + + /* Byteswap offsets */ + for (i = 0; i < noffsets; i++) { + table->offsets[i] = le64_to_cpu(table->offsets[i]); + } + +out: + /* Completion */ + trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret); + gencb_complete(&read_table_cb->gencb, ret); +} + +static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb), + cb, opaque); + QEMUIOVector *qiov = &read_table_cb->qiov; + + trace_qed_read_table(s, offset, table); + + read_table_cb->s = s; + read_table_cb->table = table; + read_table_cb->iov.iov_base = table->offsets, + read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, + + qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); + bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, + qiov->size / BDRV_SECTOR_SIZE, + qed_read_table_cb, read_table_cb); +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEDTable *orig_table; + QEDTable *table; + bool flush; /* flush after write? */ + + struct iovec iov; + QEMUIOVector qiov; +} QEDWriteTableCB; + +static void qed_write_table_cb(void *opaque, int ret) +{ + QEDWriteTableCB *write_table_cb = opaque; + + trace_qed_write_table_cb(write_table_cb->s, + write_table_cb->orig_table, + write_table_cb->flush, + ret); + + if (ret) { + goto out; + } + + if (write_table_cb->flush) { + /* We still need to flush first */ + write_table_cb->flush = false; + bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb, + write_table_cb); + return; + } + +out: + qemu_vfree(write_table_cb->table); + gencb_complete(&write_table_cb->gencb, ret); +} + +/** + * Write out an updated part or all of a table + * + * @s: QED state + * @offset: Offset of table in image file, in bytes + * @table: Table + * @index: Index of first element + * @n: Number of elements + * @flush: Whether or not to sync to disk + * @cb: Completion function + * @opaque: Argument for completion function + */ +static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, + unsigned int index, unsigned int n, bool flush, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QEDWriteTableCB *write_table_cb; + unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1; + unsigned int start, end, i; + size_t len_bytes; + + trace_qed_write_table(s, offset, table, index, n); + + /* Calculate indices of the first and one after last elements */ + start = index & ~sector_mask; + end = (index + n + sector_mask) & ~sector_mask; + + len_bytes = (end - start) * sizeof(uint64_t); + + write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque); + write_table_cb->s = s; + write_table_cb->orig_table = table; + write_table_cb->flush = flush; + write_table_cb->table = qemu_blockalign(s->bs, len_bytes); + write_table_cb->iov.iov_base = write_table_cb->table->offsets; + write_table_cb->iov.iov_len = len_bytes; + qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1); + + /* Byteswap table */ + for (i = start; i < end; i++) { + uint64_t le_offset = cpu_to_le64(table->offsets[i]); + write_table_cb->table->offsets[i - start] = le_offset; + } + + /* Adjust for offset into table */ + offset += start * sizeof(uint64_t); + + bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, + &write_table_cb->qiov, + write_table_cb->qiov.size / BDRV_SECTOR_SIZE, + qed_write_table_cb, write_table_cb); +} + +/** + * Propagate return value from async callback + */ +static void qed_sync_cb(void *opaque, int ret) +{ + *(int *)opaque = ret; +} + +int qed_read_l1_table_sync(BDRVQEDState *s) +{ + int ret = -EINPROGRESS; + + qed_read_table(s, s->header.l1_table_offset, + s->l1_table, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + qemu_aio_wait(); + } + + return ret; +} + +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE); + qed_write_table(s, s->header.l1_table_offset, + s->l1_table, index, n, false, cb, opaque); +} + +int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, + unsigned int n) +{ + int ret = -EINPROGRESS; + + qed_write_l1_table(s, index, n, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + qemu_aio_wait(); + } + + return ret; +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + uint64_t l2_offset; + QEDRequest *request; +} QEDReadL2TableCB; + +static void qed_read_l2_table_cb(void *opaque, int ret) +{ + QEDReadL2TableCB *read_l2_table_cb = opaque; + QEDRequest *request = read_l2_table_cb->request; + BDRVQEDState *s = read_l2_table_cb->s; + CachedL2Table *l2_table = request->l2_table; + uint64_t l2_offset = read_l2_table_cb->l2_offset; + + if (ret) { + /* can't trust loaded L2 table anymore */ + qed_unref_l2_cache_entry(l2_table); + request->l2_table = NULL; + } else { + l2_table->offset = l2_offset; + + qed_commit_l2_cache_entry(&s->l2_cache, l2_table); + + /* This is guaranteed to succeed because we just committed the entry + * to the cache. + */ + request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); + assert(request->l2_table != NULL); + } + + gencb_complete(&read_l2_table_cb->gencb, ret); +} + +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QEDReadL2TableCB *read_l2_table_cb; + + qed_unref_l2_cache_entry(request->l2_table); + + /* Check for cached L2 entry */ + request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset); + if (request->l2_table) { + cb(opaque, 0); + return; + } + + request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); + request->l2_table->table = qed_alloc_table(s); + + read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque); + read_l2_table_cb->s = s; + read_l2_table_cb->l2_offset = offset; + read_l2_table_cb->request = request; + + BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD); + qed_read_table(s, offset, request->l2_table->table, + qed_read_l2_table_cb, read_l2_table_cb); +} + +int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset) +{ + int ret = -EINPROGRESS; + + qed_read_l2_table(s, request, offset, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + qemu_aio_wait(); + } + + return ret; +} + +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE); + qed_write_table(s, request->l2_table->offset, + request->l2_table->table, index, n, flush, cb, opaque); +} + +int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush) +{ + int ret = -EINPROGRESS; + + qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + qemu_aio_wait(); + } + + return ret; +} diff --git a/contrib/qemu/block/qed.c b/contrib/qemu/block/qed.c new file mode 100644 index 000000000..f767b0528 --- /dev/null +++ b/contrib/qemu/block/qed.c @@ -0,0 +1,1596 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu/timer.h" +#include "trace.h" +#include "qed.h" +#include "qapi/qmp/qerror.h" +#include "migration/migration.h" + +static void qed_aio_cancel(BlockDriverAIOCB *blockacb) +{ + QEDAIOCB *acb = (QEDAIOCB *)blockacb; + bool finished = false; + + /* Wait for the request to finish */ + acb->finished = &finished; + while (!finished) { + qemu_aio_wait(); + } +} + +static const AIOCBInfo qed_aiocb_info = { + .aiocb_size = sizeof(QEDAIOCB), + .cancel = qed_aio_cancel, +}; + +static int bdrv_qed_probe(const uint8_t *buf, int buf_size, + const char *filename) +{ + const QEDHeader *header = (const QEDHeader *)buf; + + if (buf_size < sizeof(*header)) { + return 0; + } + if (le32_to_cpu(header->magic) != QED_MAGIC) { + return 0; + } + return 100; +} + +/** + * Check whether an image format is raw + * + * @fmt: Backing file format, may be NULL + */ +static bool qed_fmt_is_raw(const char *fmt) +{ + return fmt && strcmp(fmt, "raw") == 0; +} + +static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu) +{ + cpu->magic = le32_to_cpu(le->magic); + cpu->cluster_size = le32_to_cpu(le->cluster_size); + cpu->table_size = le32_to_cpu(le->table_size); + cpu->header_size = le32_to_cpu(le->header_size); + cpu->features = le64_to_cpu(le->features); + cpu->compat_features = le64_to_cpu(le->compat_features); + cpu->autoclear_features = le64_to_cpu(le->autoclear_features); + cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset); + cpu->image_size = le64_to_cpu(le->image_size); + cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset); + cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size); +} + +static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le) +{ + le->magic = cpu_to_le32(cpu->magic); + le->cluster_size = cpu_to_le32(cpu->cluster_size); + le->table_size = cpu_to_le32(cpu->table_size); + le->header_size = cpu_to_le32(cpu->header_size); + le->features = cpu_to_le64(cpu->features); + le->compat_features = cpu_to_le64(cpu->compat_features); + le->autoclear_features = cpu_to_le64(cpu->autoclear_features); + le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset); + le->image_size = cpu_to_le64(cpu->image_size); + le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset); + le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size); +} + +int qed_write_header_sync(BDRVQEDState *s) +{ + QEDHeader le; + int ret; + + qed_header_cpu_to_le(&s->header, &le); + ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le)); + if (ret != sizeof(le)) { + return ret; + } + return 0; +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + struct iovec iov; + QEMUIOVector qiov; + int nsectors; + uint8_t *buf; +} QEDWriteHeaderCB; + +static void qed_write_header_cb(void *opaque, int ret) +{ + QEDWriteHeaderCB *write_header_cb = opaque; + + qemu_vfree(write_header_cb->buf); + gencb_complete(write_header_cb, ret); +} + +static void qed_write_header_read_cb(void *opaque, int ret) +{ + QEDWriteHeaderCB *write_header_cb = opaque; + BDRVQEDState *s = write_header_cb->s; + + if (ret) { + qed_write_header_cb(write_header_cb, ret); + return; + } + + /* Update header */ + qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf); + + bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov, + write_header_cb->nsectors, qed_write_header_cb, + write_header_cb); +} + +/** + * Update header in-place (does not rewrite backing filename or other strings) + * + * This function only updates known header fields in-place and does not affect + * extra data after the QED header. + */ +static void qed_write_header(BDRVQEDState *s, BlockDriverCompletionFunc cb, + void *opaque) +{ + /* We must write full sectors for O_DIRECT but cannot necessarily generate + * the data following the header if an unrecognized compat feature is + * active. Therefore, first read the sectors containing the header, update + * them, and write back. + */ + + int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) / + BDRV_SECTOR_SIZE; + size_t len = nsectors * BDRV_SECTOR_SIZE; + QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb), + cb, opaque); + + write_header_cb->s = s; + write_header_cb->nsectors = nsectors; + write_header_cb->buf = qemu_blockalign(s->bs, len); + write_header_cb->iov.iov_base = write_header_cb->buf; + write_header_cb->iov.iov_len = len; + qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1); + + bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors, + qed_write_header_read_cb, write_header_cb); +} + +static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size) +{ + uint64_t table_entries; + uint64_t l2_size; + + table_entries = (table_size * cluster_size) / sizeof(uint64_t); + l2_size = table_entries * cluster_size; + + return l2_size * table_entries; +} + +static bool qed_is_cluster_size_valid(uint32_t cluster_size) +{ + if (cluster_size < QED_MIN_CLUSTER_SIZE || + cluster_size > QED_MAX_CLUSTER_SIZE) { + return false; + } + if (cluster_size & (cluster_size - 1)) { + return false; /* not power of 2 */ + } + return true; +} + +static bool qed_is_table_size_valid(uint32_t table_size) +{ + if (table_size < QED_MIN_TABLE_SIZE || + table_size > QED_MAX_TABLE_SIZE) { + return false; + } + if (table_size & (table_size - 1)) { + return false; /* not power of 2 */ + } + return true; +} + +static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size, + uint32_t table_size) +{ + if (image_size % BDRV_SECTOR_SIZE != 0) { + return false; /* not multiple of sector size */ + } + if (image_size > qed_max_image_size(cluster_size, table_size)) { + return false; /* image is too large */ + } + return true; +} + +/** + * Read a string of known length from the image file + * + * @file: Image file + * @offset: File offset to start of string, in bytes + * @n: String length in bytes + * @buf: Destination buffer + * @buflen: Destination buffer length in bytes + * @ret: 0 on success, -errno on failure + * + * The string is NUL-terminated. + */ +static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, + char *buf, size_t buflen) +{ + int ret; + if (n >= buflen) { + return -EINVAL; + } + ret = bdrv_pread(file, offset, buf, n); + if (ret < 0) { + return ret; + } + buf[n] = '\0'; + return 0; +} + +/** + * Allocate new clusters + * + * @s: QED state + * @n: Number of contiguous clusters to allocate + * @ret: Offset of first allocated cluster + * + * This function only produces the offset where the new clusters should be + * written. It updates BDRVQEDState but does not make any changes to the image + * file. + */ +static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n) +{ + uint64_t offset = s->file_size; + s->file_size += n * s->header.cluster_size; + return offset; +} + +QEDTable *qed_alloc_table(BDRVQEDState *s) +{ + /* Honor O_DIRECT memory alignment requirements */ + return qemu_blockalign(s->bs, + s->header.cluster_size * s->header.table_size); +} + +/** + * Allocate a new zeroed L2 table + */ +static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) +{ + CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); + + l2_table->table = qed_alloc_table(s); + l2_table->offset = qed_alloc_clusters(s, s->header.table_size); + + memset(l2_table->table->offsets, 0, + s->header.cluster_size * s->header.table_size); + return l2_table; +} + +static void qed_aio_next_io(void *opaque, int ret); + +static void qed_plug_allocating_write_reqs(BDRVQEDState *s) +{ + assert(!s->allocating_write_reqs_plugged); + + s->allocating_write_reqs_plugged = true; +} + +static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) +{ + QEDAIOCB *acb; + + assert(s->allocating_write_reqs_plugged); + + s->allocating_write_reqs_plugged = false; + + acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); + if (acb) { + qed_aio_next_io(acb, 0); + } +} + +static void qed_finish_clear_need_check(void *opaque, int ret) +{ + /* Do nothing */ +} + +static void qed_flush_after_clear_need_check(void *opaque, int ret) +{ + BDRVQEDState *s = opaque; + + bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s); + + /* No need to wait until flush completes */ + qed_unplug_allocating_write_reqs(s); +} + +static void qed_clear_need_check(void *opaque, int ret) +{ + BDRVQEDState *s = opaque; + + if (ret) { + qed_unplug_allocating_write_reqs(s); + return; + } + + s->header.features &= ~QED_F_NEED_CHECK; + qed_write_header(s, qed_flush_after_clear_need_check, s); +} + +static void qed_need_check_timer_cb(void *opaque) +{ + BDRVQEDState *s = opaque; + + /* The timer should only fire when allocating writes have drained */ + assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs)); + + trace_qed_need_check_timer_cb(s); + + qed_plug_allocating_write_reqs(s); + + /* Ensure writes are on disk before clearing flag */ + bdrv_aio_flush(s->bs, qed_clear_need_check, s); +} + +static void qed_start_need_check_timer(BDRVQEDState *s) +{ + trace_qed_start_need_check_timer(s); + + /* Use vm_clock so we don't alter the image file while suspended for + * migration. + */ + qemu_mod_timer(s->need_check_timer, qemu_get_clock_ns(vm_clock) + + get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT); +} + +/* It's okay to call this multiple times or when no timer is started */ +static void qed_cancel_need_check_timer(BDRVQEDState *s) +{ + trace_qed_cancel_need_check_timer(s); + qemu_del_timer(s->need_check_timer); +} + +static void bdrv_qed_rebind(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + s->bs = bs; +} + +static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) +{ + BDRVQEDState *s = bs->opaque; + QEDHeader le_header; + int64_t file_size; + int ret; + + s->bs = bs; + QSIMPLEQ_INIT(&s->allocating_write_reqs); + + ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); + if (ret < 0) { + return ret; + } + qed_header_le_to_cpu(&le_header, &s->header); + + if (s->header.magic != QED_MAGIC) { + return -EMEDIUMTYPE; + } + if (s->header.features & ~QED_FEATURE_MASK) { + /* image uses unsupported feature bits */ + char buf[64]; + snprintf(buf, sizeof(buf), "%" PRIx64, + s->header.features & ~QED_FEATURE_MASK); + qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "QED", buf); + return -ENOTSUP; + } + if (!qed_is_cluster_size_valid(s->header.cluster_size)) { + return -EINVAL; + } + + /* Round down file size to the last cluster */ + file_size = bdrv_getlength(bs->file); + if (file_size < 0) { + return file_size; + } + s->file_size = qed_start_of_cluster(s, file_size); + + if (!qed_is_table_size_valid(s->header.table_size)) { + return -EINVAL; + } + if (!qed_is_image_size_valid(s->header.image_size, + s->header.cluster_size, + s->header.table_size)) { + return -EINVAL; + } + if (!qed_check_table_offset(s, s->header.l1_table_offset)) { + return -EINVAL; + } + + s->table_nelems = (s->header.cluster_size * s->header.table_size) / + sizeof(uint64_t); + s->l2_shift = ffs(s->header.cluster_size) - 1; + s->l2_mask = s->table_nelems - 1; + s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1; + + if ((s->header.features & QED_F_BACKING_FILE)) { + if ((uint64_t)s->header.backing_filename_offset + + s->header.backing_filename_size > + s->header.cluster_size * s->header.header_size) { + return -EINVAL; + } + + ret = qed_read_string(bs->file, s->header.backing_filename_offset, + s->header.backing_filename_size, bs->backing_file, + sizeof(bs->backing_file)); + if (ret < 0) { + return ret; + } + + if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) { + pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw"); + } + } + + /* Reset unknown autoclear feature bits. This is a backwards + * compatibility mechanism that allows images to be opened by older + * programs, which "knock out" unknown feature bits. When an image is + * opened by a newer program again it can detect that the autoclear + * feature is no longer valid. + */ + if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 && + !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) { + s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK; + + ret = qed_write_header_sync(s); + if (ret) { + return ret; + } + + /* From here on only known autoclear feature bits are valid */ + bdrv_flush(bs->file); + } + + s->l1_table = qed_alloc_table(s); + qed_init_l2_cache(&s->l2_cache); + + ret = qed_read_l1_table_sync(s); + if (ret) { + goto out; + } + + /* If image was not closed cleanly, check consistency */ + if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) { + /* Read-only images cannot be fixed. There is no risk of corruption + * since write operations are not possible. Therefore, allow + * potentially inconsistent images to be opened read-only. This can + * aid data recovery from an otherwise inconsistent image. + */ + if (!bdrv_is_read_only(bs->file) && + !(flags & BDRV_O_INCOMING)) { + BdrvCheckResult result = {0}; + + ret = qed_check(s, &result, true); + if (ret) { + goto out; + } + } + } + + s->need_check_timer = qemu_new_timer_ns(vm_clock, + qed_need_check_timer_cb, s); + +out: + if (ret) { + qed_free_l2_cache(&s->l2_cache); + qemu_vfree(s->l1_table); + } + return ret; +} + +/* We have nothing to do for QED reopen, stubs just return + * success */ +static int bdrv_qed_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + +static void bdrv_qed_close(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + qed_cancel_need_check_timer(s); + qemu_free_timer(s->need_check_timer); + + /* Ensure writes reach stable storage */ + bdrv_flush(bs->file); + + /* Clean shutdown, no check required on next open */ + if (s->header.features & QED_F_NEED_CHECK) { + s->header.features &= ~QED_F_NEED_CHECK; + qed_write_header_sync(s); + } + + qed_free_l2_cache(&s->l2_cache); + qemu_vfree(s->l1_table); +} + +static int qed_create(const char *filename, uint32_t cluster_size, + uint64_t image_size, uint32_t table_size, + const char *backing_file, const char *backing_fmt) +{ + QEDHeader header = { + .magic = QED_MAGIC, + .cluster_size = cluster_size, + .table_size = table_size, + .header_size = 1, + .features = 0, + .compat_features = 0, + .l1_table_offset = cluster_size, + .image_size = image_size, + }; + QEDHeader le_header; + uint8_t *l1_table = NULL; + size_t l1_size = header.cluster_size * header.table_size; + int ret = 0; + BlockDriverState *bs = NULL; + + ret = bdrv_create_file(filename, NULL); + if (ret < 0) { + return ret; + } + + ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB); + if (ret < 0) { + return ret; + } + + /* File must start empty and grow, check truncate is supported */ + ret = bdrv_truncate(bs, 0); + if (ret < 0) { + goto out; + } + + if (backing_file) { + header.features |= QED_F_BACKING_FILE; + header.backing_filename_offset = sizeof(le_header); + header.backing_filename_size = strlen(backing_file); + + if (qed_fmt_is_raw(backing_fmt)) { + header.features |= QED_F_BACKING_FORMAT_NO_PROBE; + } + } + + qed_header_cpu_to_le(&header, &le_header); + ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header)); + if (ret < 0) { + goto out; + } + ret = bdrv_pwrite(bs, sizeof(le_header), backing_file, + header.backing_filename_size); + if (ret < 0) { + goto out; + } + + l1_table = g_malloc0(l1_size); + ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size); + if (ret < 0) { + goto out; + } + + ret = 0; /* success */ +out: + g_free(l1_table); + bdrv_delete(bs); + return ret; +} + +static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) +{ + uint64_t image_size = 0; + uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; + uint32_t table_size = QED_DEFAULT_TABLE_SIZE; + const char *backing_file = NULL; + const char *backing_fmt = NULL; + + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + image_size = options->value.n; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + backing_file = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { + backing_fmt = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { + if (options->value.n) { + cluster_size = options->value.n; + } + } else if (!strcmp(options->name, BLOCK_OPT_TABLE_SIZE)) { + if (options->value.n) { + table_size = options->value.n; + } + } + options++; + } + + if (!qed_is_cluster_size_valid(cluster_size)) { + fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n", + QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE); + return -EINVAL; + } + if (!qed_is_table_size_valid(table_size)) { + fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n", + QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE); + return -EINVAL; + } + if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) { + fprintf(stderr, "QED image size must be a non-zero multiple of " + "cluster size and less than %" PRIu64 " bytes\n", + qed_max_image_size(cluster_size, table_size)); + return -EINVAL; + } + + return qed_create(filename, cluster_size, image_size, table_size, + backing_file, backing_fmt); +} + +typedef struct { + Coroutine *co; + int is_allocated; + int *pnum; +} QEDIsAllocatedCB; + +static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) +{ + QEDIsAllocatedCB *cb = opaque; + *cb->pnum = len / BDRV_SECTOR_SIZE; + cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO); + if (cb->co) { + qemu_coroutine_enter(cb->co, NULL); + } +} + +static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + BDRVQEDState *s = bs->opaque; + uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; + size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; + QEDIsAllocatedCB cb = { + .is_allocated = -1, + .pnum = pnum, + }; + QEDRequest request = { .l2_table = NULL }; + + qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb); + + /* Now sleep if the callback wasn't invoked immediately */ + while (cb.is_allocated == -1) { + cb.co = qemu_coroutine_self(); + qemu_coroutine_yield(); + } + + qed_unref_l2_cache_entry(request.l2_table); + + return cb.is_allocated; +} + +static int bdrv_qed_make_empty(BlockDriverState *bs) +{ + return -ENOTSUP; +} + +static BDRVQEDState *acb_to_s(QEDAIOCB *acb) +{ + return acb->common.bs->opaque; +} + +/** + * Read from the backing file or zero-fill if no backing file + * + * @s: QED state + * @pos: Byte position in device + * @qiov: Destination I/O vector + * @cb: Completion function + * @opaque: User data for completion function + * + * This function reads qiov->size bytes starting at pos from the backing file. + * If there is no backing file then zeroes are read. + */ +static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, + QEMUIOVector *qiov, + BlockDriverCompletionFunc *cb, void *opaque) +{ + uint64_t backing_length = 0; + size_t size; + + /* If there is a backing file, get its length. Treat the absence of a + * backing file like a zero length backing file. + */ + if (s->bs->backing_hd) { + int64_t l = bdrv_getlength(s->bs->backing_hd); + if (l < 0) { + cb(opaque, l); + return; + } + backing_length = l; + } + + /* Zero all sectors if reading beyond the end of the backing file */ + if (pos >= backing_length || + pos + qiov->size > backing_length) { + qemu_iovec_memset(qiov, 0, 0, qiov->size); + } + + /* Complete now if there are no backing file sectors to read */ + if (pos >= backing_length) { + cb(opaque, 0); + return; + } + + /* If the read straddles the end of the backing file, shorten it */ + size = MIN((uint64_t)backing_length - pos, qiov->size); + + BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); + bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE, + qiov, size / BDRV_SECTOR_SIZE, cb, opaque); +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEMUIOVector qiov; + struct iovec iov; + uint64_t offset; +} CopyFromBackingFileCB; + +static void qed_copy_from_backing_file_cb(void *opaque, int ret) +{ + CopyFromBackingFileCB *copy_cb = opaque; + qemu_vfree(copy_cb->iov.iov_base); + gencb_complete(©_cb->gencb, ret); +} + +static void qed_copy_from_backing_file_write(void *opaque, int ret) +{ + CopyFromBackingFileCB *copy_cb = opaque; + BDRVQEDState *s = copy_cb->s; + + if (ret) { + qed_copy_from_backing_file_cb(copy_cb, ret); + return; + } + + BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); + bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE, + ©_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE, + qed_copy_from_backing_file_cb, copy_cb); +} + +/** + * Copy data from backing file into the image + * + * @s: QED state + * @pos: Byte position in device + * @len: Number of bytes + * @offset: Byte offset in image file + * @cb: Completion function + * @opaque: User data for completion function + */ +static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, + uint64_t len, uint64_t offset, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + CopyFromBackingFileCB *copy_cb; + + /* Skip copy entirely if there is no work to do */ + if (len == 0) { + cb(opaque, 0); + return; + } + + copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque); + copy_cb->s = s; + copy_cb->offset = offset; + copy_cb->iov.iov_base = qemu_blockalign(s->bs, len); + copy_cb->iov.iov_len = len; + qemu_iovec_init_external(©_cb->qiov, ©_cb->iov, 1); + + qed_read_backing_file(s, pos, ©_cb->qiov, + qed_copy_from_backing_file_write, copy_cb); +} + +/** + * Link one or more contiguous clusters into a table + * + * @s: QED state + * @table: L2 table + * @index: First cluster index + * @n: Number of contiguous clusters + * @cluster: First cluster offset + * + * The cluster offset may be an allocated byte offset in the image file, the + * zero cluster marker, or the unallocated cluster marker. + */ +static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, + unsigned int n, uint64_t cluster) +{ + int i; + for (i = index; i < index + n; i++) { + table->offsets[i] = cluster; + if (!qed_offset_is_unalloc_cluster(cluster) && + !qed_offset_is_zero_cluster(cluster)) { + cluster += s->header.cluster_size; + } + } +} + +static void qed_aio_complete_bh(void *opaque) +{ + QEDAIOCB *acb = opaque; + BlockDriverCompletionFunc *cb = acb->common.cb; + void *user_opaque = acb->common.opaque; + int ret = acb->bh_ret; + bool *finished = acb->finished; + + qemu_bh_delete(acb->bh); + qemu_aio_release(acb); + + /* Invoke callback */ + cb(user_opaque, ret); + + /* Signal cancel completion */ + if (finished) { + *finished = true; + } +} + +static void qed_aio_complete(QEDAIOCB *acb, int ret) +{ + BDRVQEDState *s = acb_to_s(acb); + + trace_qed_aio_complete(s, acb, ret); + + /* Free resources */ + qemu_iovec_destroy(&acb->cur_qiov); + qed_unref_l2_cache_entry(acb->request.l2_table); + + /* Free the buffer we may have allocated for zero writes */ + if (acb->flags & QED_AIOCB_ZERO) { + qemu_vfree(acb->qiov->iov[0].iov_base); + acb->qiov->iov[0].iov_base = NULL; + } + + /* Arrange for a bh to invoke the completion function */ + acb->bh_ret = ret; + acb->bh = qemu_bh_new(qed_aio_complete_bh, acb); + qemu_bh_schedule(acb->bh); + + /* Start next allocating write request waiting behind this one. Note that + * requests enqueue themselves when they first hit an unallocated cluster + * but they wait until the entire request is finished before waking up the + * next request in the queue. This ensures that we don't cycle through + * requests multiple times but rather finish one at a time completely. + */ + if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next); + acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); + if (acb) { + qed_aio_next_io(acb, 0); + } else if (s->header.features & QED_F_NEED_CHECK) { + qed_start_need_check_timer(s); + } + } +} + +/** + * Commit the current L2 table to the cache + */ +static void qed_commit_l2_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + CachedL2Table *l2_table = acb->request.l2_table; + uint64_t l2_offset = l2_table->offset; + + qed_commit_l2_cache_entry(&s->l2_cache, l2_table); + + /* This is guaranteed to succeed because we just committed the entry to the + * cache. + */ + acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); + assert(acb->request.l2_table != NULL); + + qed_aio_next_io(opaque, ret); +} + +/** + * Update L1 table with new L2 table offset and write it out + */ +static void qed_aio_write_l1_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + int index; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + index = qed_l1_index(s, acb->cur_pos); + s->l1_table->offsets[index] = acb->request.l2_table->offset; + + qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb); +} + +/** + * Update L2 table with new cluster offsets and write them out + */ +static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) +{ + BDRVQEDState *s = acb_to_s(acb); + bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1; + int index; + + if (ret) { + goto err; + } + + if (need_alloc) { + qed_unref_l2_cache_entry(acb->request.l2_table); + acb->request.l2_table = qed_new_l2_table(s); + } + + index = qed_l2_index(s, acb->cur_pos); + qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters, + offset); + + if (need_alloc) { + /* Write out the whole new L2 table */ + qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true, + qed_aio_write_l1_update, acb); + } else { + /* Write out only the updated part of the L2 table */ + qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false, + qed_aio_next_io, acb); + } + return; + +err: + qed_aio_complete(acb, ret); +} + +static void qed_aio_write_l2_update_cb(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + qed_aio_write_l2_update(acb, ret, acb->cur_cluster); +} + +/** + * Flush new data clusters before updating the L2 table + * + * This flush is necessary when a backing file is in use. A crash during an + * allocating write could result in empty clusters in the image. If the write + * only touched a subregion of the cluster, then backing image sectors have + * been lost in the untouched region. The solution is to flush after writing a + * new data cluster and before updating the L2 table. + */ +static void qed_aio_write_flush_before_l2_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + + if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) { + qed_aio_complete(acb, -EIO); + } +} + +/** + * Write data to the image file + */ +static void qed_aio_write_main(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t offset = acb->cur_cluster + + qed_offset_into_cluster(s, acb->cur_pos); + BlockDriverCompletionFunc *next_fn; + + trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size); + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { + next_fn = qed_aio_next_io; + } else { + if (s->bs->backing_hd) { + next_fn = qed_aio_write_flush_before_l2_update; + } else { + next_fn = qed_aio_write_l2_update_cb; + } + } + + BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); + bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, + &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, + next_fn, acb); +} + +/** + * Populate back untouched region of new data cluster + */ +static void qed_aio_write_postfill(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t start = acb->cur_pos + acb->cur_qiov.size; + uint64_t len = + qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start; + uint64_t offset = acb->cur_cluster + + qed_offset_into_cluster(s, acb->cur_pos) + + acb->cur_qiov.size; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + trace_qed_aio_write_postfill(s, acb, start, len, offset); + qed_copy_from_backing_file(s, start, len, offset, + qed_aio_write_main, acb); +} + +/** + * Populate front untouched region of new data cluster + */ +static void qed_aio_write_prefill(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t start = qed_start_of_cluster(s, acb->cur_pos); + uint64_t len = qed_offset_into_cluster(s, acb->cur_pos); + + trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster); + qed_copy_from_backing_file(s, start, len, acb->cur_cluster, + qed_aio_write_postfill, acb); +} + +/** + * Check if the QED_F_NEED_CHECK bit should be set during allocating write + */ +static bool qed_should_set_need_check(BDRVQEDState *s) +{ + /* The flush before L2 update path ensures consistency */ + if (s->bs->backing_hd) { + return false; + } + + return !(s->header.features & QED_F_NEED_CHECK); +} + +static void qed_aio_write_zero_cluster(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + qed_aio_write_l2_update(acb, 0, 1); +} + +/** + * Write new data cluster + * + * @acb: Write request + * @len: Length in bytes + * + * This path is taken when writing to previously unallocated clusters. + */ +static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) +{ + BDRVQEDState *s = acb_to_s(acb); + BlockDriverCompletionFunc *cb; + + /* Cancel timer when the first allocating request comes in */ + if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) { + qed_cancel_need_check_timer(s); + } + + /* Freeze this request if another allocating write is in progress */ + if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next); + } + if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) || + s->allocating_write_reqs_plugged) { + return; /* wait for existing request to finish */ + } + + acb->cur_nclusters = qed_bytes_to_clusters(s, + qed_offset_into_cluster(s, acb->cur_pos) + len); + qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + if (acb->flags & QED_AIOCB_ZERO) { + /* Skip ahead if the clusters are already zero */ + if (acb->find_cluster_ret == QED_CLUSTER_ZERO) { + qed_aio_next_io(acb, 0); + return; + } + + cb = qed_aio_write_zero_cluster; + } else { + cb = qed_aio_write_prefill; + acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); + } + + if (qed_should_set_need_check(s)) { + s->header.features |= QED_F_NEED_CHECK; + qed_write_header(s, cb, acb); + } else { + cb(acb, 0); + } +} + +/** + * Write data cluster in place + * + * @acb: Write request + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * This path is taken when writing to already allocated clusters. + */ +static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) +{ + /* Allocate buffer for zero writes */ + if (acb->flags & QED_AIOCB_ZERO) { + struct iovec *iov = acb->qiov->iov; + + if (!iov->iov_base) { + iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len); + memset(iov->iov_base, 0, iov->iov_len); + } + } + + /* Calculate the I/O vector */ + acb->cur_cluster = offset; + qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + /* Do the actual write */ + qed_aio_write_main(acb, 0); +} + +/** + * Write data cluster + * + * @opaque: Write request + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + * or -errno + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * Callback from qed_find_cluster(). + */ +static void qed_aio_write_data(void *opaque, int ret, + uint64_t offset, size_t len) +{ + QEDAIOCB *acb = opaque; + + trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len); + + acb->find_cluster_ret = ret; + + switch (ret) { + case QED_CLUSTER_FOUND: + qed_aio_write_inplace(acb, offset, len); + break; + + case QED_CLUSTER_L2: + case QED_CLUSTER_L1: + case QED_CLUSTER_ZERO: + qed_aio_write_alloc(acb, len); + break; + + default: + qed_aio_complete(acb, ret); + break; + } +} + +/** + * Read data cluster + * + * @opaque: Read request + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + * or -errno + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * Callback from qed_find_cluster(). + */ +static void qed_aio_read_data(void *opaque, int ret, + uint64_t offset, size_t len) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + BlockDriverState *bs = acb->common.bs; + + /* Adjust offset into cluster */ + offset += qed_offset_into_cluster(s, acb->cur_pos); + + trace_qed_aio_read_data(s, acb, ret, offset, len); + + if (ret < 0) { + goto err; + } + + qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + /* Handle zero cluster and backing file reads */ + if (ret == QED_CLUSTER_ZERO) { + qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size); + qed_aio_next_io(acb, 0); + return; + } else if (ret != QED_CLUSTER_FOUND) { + qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, + qed_aio_next_io, acb); + return; + } + + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE, + &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, + qed_aio_next_io, acb); + return; + +err: + qed_aio_complete(acb, ret); +} + +/** + * Begin next I/O or complete the request + */ +static void qed_aio_next_io(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ? + qed_aio_write_data : qed_aio_read_data; + + trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size); + + /* Handle I/O error */ + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + acb->qiov_offset += acb->cur_qiov.size; + acb->cur_pos += acb->cur_qiov.size; + qemu_iovec_reset(&acb->cur_qiov); + + /* Complete request */ + if (acb->cur_pos >= acb->end_pos) { + qed_aio_complete(acb, 0); + return; + } + + /* Find next cluster and start I/O */ + qed_find_cluster(s, &acb->request, + acb->cur_pos, acb->end_pos - acb->cur_pos, + io_fn, acb); +} + +static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, int flags) +{ + QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque); + + trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, + opaque, flags); + + acb->flags = flags; + acb->finished = NULL; + acb->qiov = qiov; + acb->qiov_offset = 0; + acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; + acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE; + acb->request.l2_table = NULL; + qemu_iovec_init(&acb->cur_qiov, qiov->niov); + + /* Start request */ + qed_aio_next_io(acb, 0); + return &acb->common; +} + +static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, + opaque, QED_AIOCB_WRITE); +} + +typedef struct { + Coroutine *co; + int ret; + bool done; +} QEDWriteZeroesCB; + +static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) +{ + QEDWriteZeroesCB *cb = opaque; + + cb->done = true; + cb->ret = ret; + if (cb->co) { + qemu_coroutine_enter(cb->co, NULL); + } +} + +static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors) +{ + BlockDriverAIOCB *blockacb; + BDRVQEDState *s = bs->opaque; + QEDWriteZeroesCB cb = { .done = false }; + QEMUIOVector qiov; + struct iovec iov; + + /* Refuse if there are untouched backing file sectors */ + if (bs->backing_hd) { + if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) { + return -ENOTSUP; + } + if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) { + return -ENOTSUP; + } + } + + /* Zero writes start without an I/O buffer. If a buffer becomes necessary + * then it will be allocated during request processing. + */ + iov.iov_base = NULL, + iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE, + + qemu_iovec_init_external(&qiov, &iov, 1); + blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors, + qed_co_write_zeroes_cb, &cb, + QED_AIOCB_WRITE | QED_AIOCB_ZERO); + if (!blockacb) { + return -EIO; + } + if (!cb.done) { + cb.co = qemu_coroutine_self(); + qemu_coroutine_yield(); + } + assert(cb.done); + return cb.ret; +} + +static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVQEDState *s = bs->opaque; + uint64_t old_image_size; + int ret; + + if (!qed_is_image_size_valid(offset, s->header.cluster_size, + s->header.table_size)) { + return -EINVAL; + } + + /* Shrinking is currently not supported */ + if ((uint64_t)offset < s->header.image_size) { + return -ENOTSUP; + } + + old_image_size = s->header.image_size; + s->header.image_size = offset; + ret = qed_write_header_sync(s); + if (ret < 0) { + s->header.image_size = old_image_size; + } + return ret; +} + +static int64_t bdrv_qed_getlength(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + return s->header.image_size; +} + +static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQEDState *s = bs->opaque; + + memset(bdi, 0, sizeof(*bdi)); + bdi->cluster_size = s->header.cluster_size; + bdi->is_dirty = s->header.features & QED_F_NEED_CHECK; + return 0; +} + +static int bdrv_qed_change_backing_file(BlockDriverState *bs, + const char *backing_file, + const char *backing_fmt) +{ + BDRVQEDState *s = bs->opaque; + QEDHeader new_header, le_header; + void *buffer; + size_t buffer_len, backing_file_len; + int ret; + + /* Refuse to set backing filename if unknown compat feature bits are + * active. If the image uses an unknown compat feature then we may not + * know the layout of data following the header structure and cannot safely + * add a new string. + */ + if (backing_file && (s->header.compat_features & + ~QED_COMPAT_FEATURE_MASK)) { + return -ENOTSUP; + } + + memcpy(&new_header, &s->header, sizeof(new_header)); + + new_header.features &= ~(QED_F_BACKING_FILE | + QED_F_BACKING_FORMAT_NO_PROBE); + + /* Adjust feature flags */ + if (backing_file) { + new_header.features |= QED_F_BACKING_FILE; + + if (qed_fmt_is_raw(backing_fmt)) { + new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE; + } + } + + /* Calculate new header size */ + backing_file_len = 0; + + if (backing_file) { + backing_file_len = strlen(backing_file); + } + + buffer_len = sizeof(new_header); + new_header.backing_filename_offset = buffer_len; + new_header.backing_filename_size = backing_file_len; + buffer_len += backing_file_len; + + /* Make sure we can rewrite header without failing */ + if (buffer_len > new_header.header_size * new_header.cluster_size) { + return -ENOSPC; + } + + /* Prepare new header */ + buffer = g_malloc(buffer_len); + + qed_header_cpu_to_le(&new_header, &le_header); + memcpy(buffer, &le_header, sizeof(le_header)); + buffer_len = sizeof(le_header); + + if (backing_file) { + memcpy(buffer + buffer_len, backing_file, backing_file_len); + buffer_len += backing_file_len; + } + + /* Write new header */ + ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len); + g_free(buffer); + if (ret == 0) { + memcpy(&s->header, &new_header, sizeof(new_header)); + } + return ret; +} + +static void bdrv_qed_invalidate_cache(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + bdrv_qed_close(bs); + memset(s, 0, sizeof(BDRVQEDState)); + bdrv_qed_open(bs, NULL, bs->open_flags); +} + +static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + BDRVQEDState *s = bs->opaque; + + return qed_check(s, result, !!fix); +} + +static QEMUOptionParameter qed_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size (in bytes)" + }, { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, { + .name = BLOCK_OPT_BACKING_FMT, + .type = OPT_STRING, + .help = "Image format of the base image" + }, { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = OPT_SIZE, + .help = "Cluster size (in bytes)", + .value = { .n = QED_DEFAULT_CLUSTER_SIZE }, + }, { + .name = BLOCK_OPT_TABLE_SIZE, + .type = OPT_SIZE, + .help = "L1/L2 table size (in clusters)" + }, + { /* end of list */ } +}; + +static BlockDriver bdrv_qed = { + .format_name = "qed", + .instance_size = sizeof(BDRVQEDState), + .create_options = qed_create_options, + + .bdrv_probe = bdrv_qed_probe, + .bdrv_rebind = bdrv_qed_rebind, + .bdrv_open = bdrv_qed_open, + .bdrv_close = bdrv_qed_close, + .bdrv_reopen_prepare = bdrv_qed_reopen_prepare, + .bdrv_create = bdrv_qed_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_co_is_allocated = bdrv_qed_co_is_allocated, + .bdrv_make_empty = bdrv_qed_make_empty, + .bdrv_aio_readv = bdrv_qed_aio_readv, + .bdrv_aio_writev = bdrv_qed_aio_writev, + .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, + .bdrv_truncate = bdrv_qed_truncate, + .bdrv_getlength = bdrv_qed_getlength, + .bdrv_get_info = bdrv_qed_get_info, + .bdrv_change_backing_file = bdrv_qed_change_backing_file, + .bdrv_invalidate_cache = bdrv_qed_invalidate_cache, + .bdrv_check = bdrv_qed_check, +}; + +static void bdrv_qed_init(void) +{ + bdrv_register(&bdrv_qed); +} + +block_init(bdrv_qed_init); diff --git a/contrib/qemu/block/qed.h b/contrib/qemu/block/qed.h new file mode 100644 index 000000000..2b4ddedf3 --- /dev/null +++ b/contrib/qemu/block/qed.h @@ -0,0 +1,344 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef BLOCK_QED_H +#define BLOCK_QED_H + +#include "block/block_int.h" + +/* The layout of a QED file is as follows: + * + * +--------+----------+----------+----------+-----+ + * | header | L1 table | cluster0 | cluster1 | ... | + * +--------+----------+----------+----------+-----+ + * + * There is a 2-level pagetable for cluster allocation: + * + * +----------+ + * | L1 table | + * +----------+ + * ,------' | '------. + * +----------+ | +----------+ + * | L2 table | ... | L2 table | + * +----------+ +----------+ + * ,------' | '------. + * +----------+ | +----------+ + * | Data | ... | Data | + * +----------+ +----------+ + * + * The L1 table is fixed size and always present. L2 tables are allocated on + * demand. The L1 table size determines the maximum possible image size; it + * can be influenced using the cluster_size and table_size values. + * + * All fields are little-endian on disk. + */ + +enum { + QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24, + + /* The image supports a backing file */ + QED_F_BACKING_FILE = 0x01, + + /* The image needs a consistency check before use */ + QED_F_NEED_CHECK = 0x02, + + /* The backing file format must not be probed, treat as raw image */ + QED_F_BACKING_FORMAT_NO_PROBE = 0x04, + + /* Feature bits must be used when the on-disk format changes */ + QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */ + QED_F_NEED_CHECK | + QED_F_BACKING_FORMAT_NO_PROBE, + QED_COMPAT_FEATURE_MASK = 0, /* supported compat feature bits */ + QED_AUTOCLEAR_FEATURE_MASK = 0, /* supported autoclear feature bits */ + + /* Data is stored in groups of sectors called clusters. Cluster size must + * be large to avoid keeping too much metadata. I/O requests that have + * sub-cluster size will require read-modify-write. + */ + QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */ + QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024, + QED_DEFAULT_CLUSTER_SIZE = 64 * 1024, + + /* Allocated clusters are tracked using a 2-level pagetable. Table size is + * a multiple of clusters so large maximum image sizes can be supported + * without jacking up the cluster size too much. + */ + QED_MIN_TABLE_SIZE = 1, /* in clusters */ + QED_MAX_TABLE_SIZE = 16, + QED_DEFAULT_TABLE_SIZE = 4, + + /* Delay to flush and clean image after last allocating write completes */ + QED_NEED_CHECK_TIMEOUT = 5, /* in seconds */ +}; + +typedef struct { + uint32_t magic; /* QED\0 */ + + uint32_t cluster_size; /* in bytes */ + uint32_t table_size; /* for L1 and L2 tables, in clusters */ + uint32_t header_size; /* in clusters */ + + uint64_t features; /* format feature bits */ + uint64_t compat_features; /* compatible feature bits */ + uint64_t autoclear_features; /* self-resetting feature bits */ + + uint64_t l1_table_offset; /* in bytes */ + uint64_t image_size; /* total logical image size, in bytes */ + + /* if (features & QED_F_BACKING_FILE) */ + uint32_t backing_filename_offset; /* in bytes from start of header */ + uint32_t backing_filename_size; /* in bytes */ +} QEDHeader; + +typedef struct { + uint64_t offsets[0]; /* in bytes */ +} QEDTable; + +/* The L2 cache is a simple write-through cache for L2 structures */ +typedef struct CachedL2Table { + QEDTable *table; + uint64_t offset; /* offset=0 indicates an invalidate entry */ + QTAILQ_ENTRY(CachedL2Table) node; + int ref; +} CachedL2Table; + +typedef struct { + QTAILQ_HEAD(, CachedL2Table) entries; + unsigned int n_entries; +} L2TableCache; + +typedef struct QEDRequest { + CachedL2Table *l2_table; +} QEDRequest; + +enum { + QED_AIOCB_WRITE = 0x0001, /* read or write? */ + QED_AIOCB_ZERO = 0x0002, /* zero write, used with QED_AIOCB_WRITE */ +}; + +typedef struct QEDAIOCB { + BlockDriverAIOCB common; + QEMUBH *bh; + int bh_ret; /* final return status for completion bh */ + QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */ + int flags; /* QED_AIOCB_* bits ORed together */ + bool *finished; /* signal for cancel completion */ + uint64_t end_pos; /* request end on block device, in bytes */ + + /* User scatter-gather list */ + QEMUIOVector *qiov; + size_t qiov_offset; /* byte count already processed */ + + /* Current cluster scatter-gather list */ + QEMUIOVector cur_qiov; + uint64_t cur_pos; /* position on block device, in bytes */ + uint64_t cur_cluster; /* cluster offset in image file */ + unsigned int cur_nclusters; /* number of clusters being accessed */ + int find_cluster_ret; /* used for L1/L2 update */ + + QEDRequest request; +} QEDAIOCB; + +typedef struct { + BlockDriverState *bs; /* device */ + uint64_t file_size; /* length of image file, in bytes */ + + QEDHeader header; /* always cpu-endian */ + QEDTable *l1_table; + L2TableCache l2_cache; /* l2 table cache */ + uint32_t table_nelems; + uint32_t l1_shift; + uint32_t l2_shift; + uint32_t l2_mask; + + /* Allocating write request queue */ + QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs; + bool allocating_write_reqs_plugged; + + /* Periodic flush and clear need check flag */ + QEMUTimer *need_check_timer; +} BDRVQEDState; + +enum { + QED_CLUSTER_FOUND, /* cluster found */ + QED_CLUSTER_ZERO, /* zero cluster found */ + QED_CLUSTER_L2, /* cluster missing in L2 */ + QED_CLUSTER_L1, /* cluster missing in L1 */ +}; + +/** + * qed_find_cluster() completion callback + * + * @opaque: User data for completion callback + * @ret: QED_CLUSTER_FOUND Success + * QED_CLUSTER_L2 Data cluster unallocated in L2 + * QED_CLUSTER_L1 L2 unallocated in L1 + * -errno POSIX error occurred + * @offset: Data cluster offset + * @len: Contiguous bytes starting from cluster offset + * + * This function is invoked when qed_find_cluster() completes. + * + * On success ret is QED_CLUSTER_FOUND and offset/len are a contiguous range + * in the image file. + * + * On failure ret is QED_CLUSTER_L2 or QED_CLUSTER_L1 for missing L2 or L1 + * table offset, respectively. len is number of contiguous unallocated bytes. + */ +typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len); + +/** + * Generic callback for chaining async callbacks + */ +typedef struct { + BlockDriverCompletionFunc *cb; + void *opaque; +} GenericCB; + +void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque); +void gencb_complete(void *opaque, int ret); + +/** + * Header functions + */ +int qed_write_header_sync(BDRVQEDState *s); + +/** + * L2 cache functions + */ +void qed_init_l2_cache(L2TableCache *l2_cache); +void qed_free_l2_cache(L2TableCache *l2_cache); +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache); +void qed_unref_l2_cache_entry(CachedL2Table *entry); +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset); +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table); + +/** + * Table I/O functions + */ +int qed_read_l1_table_sync(BDRVQEDState *s); +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, + BlockDriverCompletionFunc *cb, void *opaque); +int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, + unsigned int n); +int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, + uint64_t offset); +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, + BlockDriverCompletionFunc *cb, void *opaque); +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush, + BlockDriverCompletionFunc *cb, void *opaque); +int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush); + +/** + * Cluster functions + */ +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, + size_t len, QEDFindClusterFunc *cb, void *opaque); + +/** + * Consistency check + */ +int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix); + +QEDTable *qed_alloc_table(BDRVQEDState *s); + +/** + * Round down to the start of a cluster + */ +static inline uint64_t qed_start_of_cluster(BDRVQEDState *s, uint64_t offset) +{ + return offset & ~(uint64_t)(s->header.cluster_size - 1); +} + +static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset) +{ + return offset & (s->header.cluster_size - 1); +} + +static inline uint64_t qed_bytes_to_clusters(BDRVQEDState *s, uint64_t bytes) +{ + return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) / + (s->header.cluster_size - 1); +} + +static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos) +{ + return pos >> s->l1_shift; +} + +static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos) +{ + return (pos >> s->l2_shift) & s->l2_mask; +} + +/** + * Test if a cluster offset is valid + */ +static inline bool qed_check_cluster_offset(BDRVQEDState *s, uint64_t offset) +{ + uint64_t header_size = (uint64_t)s->header.header_size * + s->header.cluster_size; + + if (offset & (s->header.cluster_size - 1)) { + return false; + } + return offset >= header_size && offset < s->file_size; +} + +/** + * Test if a table offset is valid + */ +static inline bool qed_check_table_offset(BDRVQEDState *s, uint64_t offset) +{ + uint64_t end_offset = offset + (s->header.table_size - 1) * + s->header.cluster_size; + + /* Overflow check */ + if (end_offset <= offset) { + return false; + } + + return qed_check_cluster_offset(s, offset) && + qed_check_cluster_offset(s, end_offset); +} + +static inline bool qed_offset_is_cluster_aligned(BDRVQEDState *s, + uint64_t offset) +{ + if (qed_offset_into_cluster(s, offset)) { + return false; + } + return true; +} + +static inline bool qed_offset_is_unalloc_cluster(uint64_t offset) +{ + if (offset == 0) { + return true; + } + return false; +} + +static inline bool qed_offset_is_zero_cluster(uint64_t offset) +{ + if (offset == 1) { + return true; + } + return false; +} + +#endif /* BLOCK_QED_H */ diff --git a/contrib/qemu/block/snapshot.c b/contrib/qemu/block/snapshot.c new file mode 100644 index 000000000..6c6d9deea --- /dev/null +++ b/contrib/qemu/block/snapshot.c @@ -0,0 +1,157 @@ +/* + * Block layer snapshot related functions + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "block/snapshot.h" +#include "block/block_int.h" + +int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, + const char *name) +{ + QEMUSnapshotInfo *sn_tab, *sn; + int nb_sns, i, ret; + + ret = -ENOENT; + nb_sns = bdrv_snapshot_list(bs, &sn_tab); + if (nb_sns < 0) { + return ret; + } + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->id_str, name) || !strcmp(sn->name, name)) { + *sn_info = *sn; + ret = 0; + break; + } + } + g_free(sn_tab); + return ret; +} + +int bdrv_can_snapshot(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { + return 0; + } + + if (!drv->bdrv_snapshot_create) { + if (bs->file != NULL) { + return bdrv_can_snapshot(bs->file); + } + return 0; + } + + return 1; +} + +int bdrv_snapshot_create(BlockDriverState *bs, + QEMUSnapshotInfo *sn_info) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_create) { + return drv->bdrv_snapshot_create(bs, sn_info); + } + if (bs->file) { + return bdrv_snapshot_create(bs->file, sn_info); + } + return -ENOTSUP; +} + +int bdrv_snapshot_goto(BlockDriverState *bs, + const char *snapshot_id) +{ + BlockDriver *drv = bs->drv; + int ret, open_ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_goto) { + return drv->bdrv_snapshot_goto(bs, snapshot_id); + } + + if (bs->file) { + drv->bdrv_close(bs); + ret = bdrv_snapshot_goto(bs->file, snapshot_id); + open_ret = drv->bdrv_open(bs, NULL, bs->open_flags); + if (open_ret < 0) { + bdrv_delete(bs->file); + bs->drv = NULL; + return open_ret; + } + return ret; + } + + return -ENOTSUP; +} + +int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_delete) { + return drv->bdrv_snapshot_delete(bs, snapshot_id); + } + if (bs->file) { + return bdrv_snapshot_delete(bs->file, snapshot_id); + } + return -ENOTSUP; +} + +int bdrv_snapshot_list(BlockDriverState *bs, + QEMUSnapshotInfo **psn_info) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_list) { + return drv->bdrv_snapshot_list(bs, psn_info); + } + if (bs->file) { + return bdrv_snapshot_list(bs->file, psn_info); + } + return -ENOTSUP; +} + +int bdrv_snapshot_load_tmp(BlockDriverState *bs, + const char *snapshot_name) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (!bs->read_only) { + return -EINVAL; + } + if (drv->bdrv_snapshot_load_tmp) { + return drv->bdrv_snapshot_load_tmp(bs, snapshot_name); + } + return -ENOTSUP; +} diff --git a/contrib/qemu/config-host.h b/contrib/qemu/config-host.h new file mode 100644 index 000000000..874b04053 --- /dev/null +++ b/contrib/qemu/config-host.h @@ -0,0 +1,74 @@ +/* Automatically generated by create_config - do not modify */ +#define CONFIG_QEMU_CONFDIR "/usr/local/etc/qemu" +#define CONFIG_QEMU_DATADIR "/usr/local/share/qemu" +#define CONFIG_QEMU_DOCDIR "/usr/local/share/doc/qemu" +#define CONFIG_QEMU_LOCALSTATEDIR "/usr/local/var" +#define CONFIG_QEMU_HELPERDIR "/usr/local/libexec" +#define CONFIG_QEMU_LOCALEDIR "/usr/local/share/locale" +#define HOST_X86_64 1 +#define CONFIG_QEMU_LDST_OPTIMIZATION 1 +#define CONFIG_POSIX 1 +#define CONFIG_LINUX 1 +#define CONFIG_SLIRP 1 +#define CONFIG_SMBD_COMMAND "/usr/sbin/smbd" +#define CONFIG_AUDIO_DRIVERS \ + &oss_audio_driver,\ + +#define CONFIG_OSS 1 +#define CONFIG_BDRV_RW_WHITELIST\ + NULL +#define CONFIG_BDRV_RO_WHITELIST\ + NULL +#define CONFIG_VNC 1 +#define CONFIG_VNC_TLS 1 +#define CONFIG_VNC_SASL 1 +#define CONFIG_VNC_WS 1 +#define CONFIG_FNMATCH 1 +#define CONFIG_UUID 1 +#define CONFIG_XFS 1 +#define QEMU_VERSION "1.5.50" +#define QEMU_PKGVERSION "" +#define CONFIG_CURSES 1 +#define CONFIG_UTIMENSAT 1 +#define CONFIG_PIPE2 1 +#define CONFIG_ACCEPT4 1 +#define CONFIG_SPLICE 1 +#define CONFIG_EVENTFD 1 +#define CONFIG_FALLOCATE 1 +#define CONFIG_FALLOCATE_PUNCH_HOLE 1 +#define CONFIG_SYNC_FILE_RANGE 1 +#define CONFIG_FIEMAP 1 +#define CONFIG_DUP3 1 +#define CONFIG_EPOLL 1 +#define CONFIG_EPOLL_CREATE1 1 +#define CONFIG_EPOLL_PWAIT 1 +#define CONFIG_SENDFILE 1 +#define CONFIG_INOTIFY 1 +#define CONFIG_INOTIFY1 1 +#define CONFIG_BYTESWAP_H 1 +#define CONFIG_CURL 1 +#define CONFIG_LINUX_AIO 1 +#define CONFIG_ATTR 1 +#define CONFIG_VHOST_SCSI 1 +#define CONFIG_IOVEC 1 +#define CONFIG_PREADV 1 +#define CONFIG_FDT 1 +#define CONFIG_SIGNALFD 1 +#define CONFIG_FDATASYNC 1 +#define CONFIG_MADVISE 1 +#define CONFIG_POSIX_MADVISE 1 +#define CONFIG_SIGEV_THREAD_ID 1 +#define CONFIG_UNAME_RELEASE "" +#define CONFIG_QOM_CAST_DEBUG 1 +#define CONFIG_COROUTINE_BACKEND ucontext +#define CONFIG_OPEN_BY_HANDLE 1 +#define CONFIG_LINUX_MAGIC_H 1 +#define CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE 1 +#define CONFIG_VALGRIND_H 1 +#define CONFIG_HAS_ENVIRON 1 +#define CONFIG_CPUID_H 1 +#define CONFIG_INT128 1 +#define CONFIG_VIRTIO_BLK_DATA_PLANE $(CONFIG_VIRTIO) +#define CONFIG_TRACE_NOP 1 +#define CONFIG_TRACE_FILE trace +#define CONFIG_TRACE_DEFAULT 1 diff --git a/contrib/qemu/include/block/aio.h b/contrib/qemu/include/block/aio.h new file mode 100644 index 000000000..183679374 --- /dev/null +++ b/contrib/qemu/include/block/aio.h @@ -0,0 +1,247 @@ +/* + * QEMU aio implementation + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_AIO_H +#define QEMU_AIO_H + +#include "qemu-common.h" +#include "qemu/queue.h" +#include "qemu/event_notifier.h" + +typedef struct BlockDriverAIOCB BlockDriverAIOCB; +typedef void BlockDriverCompletionFunc(void *opaque, int ret); + +typedef struct AIOCBInfo { + void (*cancel)(BlockDriverAIOCB *acb); + size_t aiocb_size; +} AIOCBInfo; + +struct BlockDriverAIOCB { + const AIOCBInfo *aiocb_info; + BlockDriverState *bs; + BlockDriverCompletionFunc *cb; + void *opaque; +}; + +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque); +void qemu_aio_release(void *p); + +typedef struct AioHandler AioHandler; +typedef void QEMUBHFunc(void *opaque); +typedef void IOHandler(void *opaque); + +typedef struct AioContext { + GSource source; + + /* The list of registered AIO handlers */ + QLIST_HEAD(, AioHandler) aio_handlers; + + /* This is a simple lock used to protect the aio_handlers list. + * Specifically, it's used to ensure that no callbacks are removed while + * we're walking and dispatching callbacks. + */ + int walking_handlers; + + /* Anchor of the list of Bottom Halves belonging to the context */ + struct QEMUBH *first_bh; + + /* A simple lock used to protect the first_bh list, and ensure that + * no callbacks are removed while we're walking and dispatching callbacks. + */ + int walking_bh; + + /* Used for aio_notify. */ + EventNotifier notifier; + + /* GPollFDs for aio_poll() */ + GArray *pollfds; + + /* Thread pool for performing work and receiving completion callbacks */ + struct ThreadPool *thread_pool; +} AioContext; + +/* Returns 1 if there are still outstanding AIO requests; 0 otherwise */ +typedef int (AioFlushEventNotifierHandler)(EventNotifier *e); + +/** + * aio_context_new: Allocate a new AioContext. + * + * AioContext provide a mini event-loop that can be waited on synchronously. + * They also provide bottom halves, a service to execute a piece of code + * as soon as possible. + */ +AioContext *aio_context_new(void); + +/** + * aio_context_ref: + * @ctx: The AioContext to operate on. + * + * Add a reference to an AioContext. + */ +void aio_context_ref(AioContext *ctx); + +/** + * aio_context_unref: + * @ctx: The AioContext to operate on. + * + * Drop a reference to an AioContext. + */ +void aio_context_unref(AioContext *ctx); + +/** + * aio_bh_new: Allocate a new bottom half structure. + * + * Bottom halves are lightweight callbacks whose invocation is guaranteed + * to be wait-free, thread-safe and signal-safe. The #QEMUBH structure + * is opaque and must be allocated prior to its use. + */ +QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque); + +/** + * aio_notify: Force processing of pending events. + * + * Similar to signaling a condition variable, aio_notify forces + * aio_wait to exit, so that the next call will re-examine pending events. + * The caller of aio_notify will usually call aio_wait again very soon, + * or go through another iteration of the GLib main loop. Hence, aio_notify + * also has the side effect of recalculating the sets of file descriptors + * that the main loop waits for. + * + * Calling aio_notify is rarely necessary, because for example scheduling + * a bottom half calls it already. + */ +void aio_notify(AioContext *ctx); + +/** + * aio_bh_poll: Poll bottom halves for an AioContext. + * + * These are internal functions used by the QEMU main loop. + */ +int aio_bh_poll(AioContext *ctx); + +/** + * qemu_bh_schedule: Schedule a bottom half. + * + * Scheduling a bottom half interrupts the main loop and causes the + * execution of the callback that was passed to qemu_bh_new. + * + * Bottom halves that are scheduled from a bottom half handler are instantly + * invoked. This can create an infinite loop if a bottom half handler + * schedules itself. + * + * @bh: The bottom half to be scheduled. + */ +void qemu_bh_schedule(QEMUBH *bh); + +/** + * qemu_bh_cancel: Cancel execution of a bottom half. + * + * Canceling execution of a bottom half undoes the effect of calls to + * qemu_bh_schedule without freeing its resources yet. While cancellation + * itself is also wait-free and thread-safe, it can of course race with the + * loop that executes bottom halves unless you are holding the iothread + * mutex. This makes it mostly useless if you are not holding the mutex. + * + * @bh: The bottom half to be canceled. + */ +void qemu_bh_cancel(QEMUBH *bh); + +/** + *qemu_bh_delete: Cancel execution of a bottom half and free its resources. + * + * Deleting a bottom half frees the memory that was allocated for it by + * qemu_bh_new. It also implies canceling the bottom half if it was + * scheduled. + * + * @bh: The bottom half to be deleted. + */ +void qemu_bh_delete(QEMUBH *bh); + +/* Return whether there are any pending callbacks from the GSource + * attached to the AioContext. + * + * This is used internally in the implementation of the GSource. + */ +bool aio_pending(AioContext *ctx); + +/* Progress in completing AIO work to occur. This can issue new pending + * aio as a result of executing I/O completion or bh callbacks. + * + * If there is no pending AIO operation or completion (bottom half), + * return false. If there are pending AIO operations of bottom halves, + * return true. + * + * If there are no pending bottom halves, but there are pending AIO + * operations, it may not be possible to make any progress without + * blocking. If @blocking is true, this function will wait until one + * or more AIO events have completed, to ensure something has moved + * before returning. + */ +bool aio_poll(AioContext *ctx, bool blocking); + +#ifdef CONFIG_POSIX +/* Returns 1 if there are still outstanding AIO requests; 0 otherwise */ +typedef int (AioFlushHandler)(void *opaque); + +/* Register a file descriptor and associated callbacks. Behaves very similarly + * to qemu_set_fd_handler2. Unlike qemu_set_fd_handler2, these callbacks will + * be invoked when using qemu_aio_wait(). + * + * Code that invokes AIO completion functions should rely on this function + * instead of qemu_set_fd_handler[2]. + */ +void aio_set_fd_handler(AioContext *ctx, + int fd, + IOHandler *io_read, + IOHandler *io_write, + AioFlushHandler *io_flush, + void *opaque); +#endif + +/* Register an event notifier and associated callbacks. Behaves very similarly + * to event_notifier_set_handler. Unlike event_notifier_set_handler, these callbacks + * will be invoked when using qemu_aio_wait(). + * + * Code that invokes AIO completion functions should rely on this function + * instead of event_notifier_set_handler. + */ +void aio_set_event_notifier(AioContext *ctx, + EventNotifier *notifier, + EventNotifierHandler *io_read, + AioFlushEventNotifierHandler *io_flush); + +/* Return a GSource that lets the main loop poll the file descriptors attached + * to this AioContext. + */ +GSource *aio_get_g_source(AioContext *ctx); + +/* Return the ThreadPool bound to this AioContext */ +struct ThreadPool *aio_get_thread_pool(AioContext *ctx); + +/* Functions to operate on the main QEMU AioContext. */ + +bool qemu_aio_wait(void); +void qemu_aio_set_event_notifier(EventNotifier *notifier, + EventNotifierHandler *io_read, + AioFlushEventNotifierHandler *io_flush); + +#ifdef CONFIG_POSIX +void qemu_aio_set_fd_handler(int fd, + IOHandler *io_read, + IOHandler *io_write, + AioFlushHandler *io_flush, + void *opaque); +#endif + +#endif diff --git a/contrib/qemu/include/block/block.h b/contrib/qemu/include/block/block.h new file mode 100644 index 000000000..b6b9014a9 --- /dev/null +++ b/contrib/qemu/include/block/block.h @@ -0,0 +1,443 @@ +#ifndef BLOCK_H +#define BLOCK_H + +#include "block/aio.h" +#include "qemu-common.h" +#include "qemu/option.h" +#include "block/coroutine.h" +#include "qapi/qmp/qobject.h" +#include "qapi-types.h" + +/* block.c */ +typedef struct BlockDriver BlockDriver; +typedef struct BlockJob BlockJob; + +typedef struct BlockDriverInfo { + /* in bytes, 0 if irrelevant */ + int cluster_size; + /* offset at which the VM state can be saved (0 if not possible) */ + int64_t vm_state_offset; + bool is_dirty; +} BlockDriverInfo; + +typedef struct BlockFragInfo { + uint64_t allocated_clusters; + uint64_t total_clusters; + uint64_t fragmented_clusters; + uint64_t compressed_clusters; +} BlockFragInfo; + +/* Callbacks for block device models */ +typedef struct BlockDevOps { + /* + * Runs when virtual media changed (monitor commands eject, change) + * Argument load is true on load and false on eject. + * Beware: doesn't run when a host device's physical media + * changes. Sure would be useful if it did. + * Device models with removable media must implement this callback. + */ + void (*change_media_cb)(void *opaque, bool load); + /* + * Runs when an eject request is issued from the monitor, the tray + * is closed, and the medium is locked. + * Device models that do not implement is_medium_locked will not need + * this callback. Device models that can lock the medium or tray might + * want to implement the callback and unlock the tray when "force" is + * true, even if they do not support eject requests. + */ + void (*eject_request_cb)(void *opaque, bool force); + /* + * Is the virtual tray open? + * Device models implement this only when the device has a tray. + */ + bool (*is_tray_open)(void *opaque); + /* + * Is the virtual medium locked into the device? + * Device models implement this only when device has such a lock. + */ + bool (*is_medium_locked)(void *opaque); + /* + * Runs when the size changed (e.g. monitor command block_resize) + */ + void (*resize_cb)(void *opaque); +} BlockDevOps; + +#define BDRV_O_RDWR 0x0002 +#define BDRV_O_SNAPSHOT 0x0008 /* open the file read only and save writes in a snapshot */ +#define BDRV_O_NOCACHE 0x0020 /* do not use the host page cache */ +#define BDRV_O_CACHE_WB 0x0040 /* use write-back caching */ +#define BDRV_O_NATIVE_AIO 0x0080 /* use native AIO instead of the thread pool */ +#define BDRV_O_NO_BACKING 0x0100 /* don't open the backing file */ +#define BDRV_O_NO_FLUSH 0x0200 /* disable flushing on this disk */ +#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */ +#define BDRV_O_INCOMING 0x0800 /* consistency hint for incoming migration */ +#define BDRV_O_CHECK 0x1000 /* open solely for consistency check */ +#define BDRV_O_ALLOW_RDWR 0x2000 /* allow reopen to change from r/o to r/w */ +#define BDRV_O_UNMAP 0x4000 /* execute guest UNMAP/TRIM operations */ + +#define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH) + +#define BDRV_SECTOR_BITS 9 +#define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS) +#define BDRV_SECTOR_MASK ~(BDRV_SECTOR_SIZE - 1) + +typedef enum { + BDRV_ACTION_REPORT, BDRV_ACTION_IGNORE, BDRV_ACTION_STOP +} BlockErrorAction; + +typedef QSIMPLEQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue; + +typedef struct BDRVReopenState { + BlockDriverState *bs; + int flags; + void *opaque; +} BDRVReopenState; + + +void bdrv_iostatus_enable(BlockDriverState *bs); +void bdrv_iostatus_reset(BlockDriverState *bs); +void bdrv_iostatus_disable(BlockDriverState *bs); +bool bdrv_iostatus_is_enabled(const BlockDriverState *bs); +void bdrv_iostatus_set_err(BlockDriverState *bs, int error); +void bdrv_info_print(Monitor *mon, const QObject *data); +void bdrv_info(Monitor *mon, QObject **ret_data); +void bdrv_stats_print(Monitor *mon, const QObject *data); +void bdrv_info_stats(Monitor *mon, QObject **ret_data); + +/* disk I/O throttling */ +void bdrv_io_limits_enable(BlockDriverState *bs); +void bdrv_io_limits_disable(BlockDriverState *bs); +bool bdrv_io_limits_enabled(BlockDriverState *bs); + +void bdrv_init(void); +void bdrv_init_with_whitelist(void); +BlockDriver *bdrv_find_protocol(const char *filename, + bool allow_protocol_prefix); +BlockDriver *bdrv_find_format(const char *format_name); +BlockDriver *bdrv_find_whitelisted_format(const char *format_name, + bool readonly); +int bdrv_create(BlockDriver *drv, const char* filename, + QEMUOptionParameter *options); +int bdrv_create_file(const char* filename, QEMUOptionParameter *options); +BlockDriverState *bdrv_new(const char *device_name); +void bdrv_make_anon(BlockDriverState *bs); +void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old); +void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top); +void bdrv_delete(BlockDriverState *bs); +int bdrv_parse_cache_flags(const char *mode, int *flags); +int bdrv_parse_discard_flags(const char *mode, int *flags); +int bdrv_file_open(BlockDriverState **pbs, const char *filename, + QDict *options, int flags); +int bdrv_open_backing_file(BlockDriverState *bs, QDict *options); +int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, + int flags, BlockDriver *drv); +BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, + BlockDriverState *bs, int flags); +int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp); +int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp); +int bdrv_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp); +void bdrv_reopen_commit(BDRVReopenState *reopen_state); +void bdrv_reopen_abort(BDRVReopenState *reopen_state); +void bdrv_close(BlockDriverState *bs); +void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify); +int bdrv_attach_dev(BlockDriverState *bs, void *dev); +void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev); +void bdrv_detach_dev(BlockDriverState *bs, void *dev); +void *bdrv_get_attached_dev(BlockDriverState *bs); +void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops, + void *opaque); +void bdrv_dev_eject_request(BlockDriverState *bs, bool force); +bool bdrv_dev_has_removable_media(BlockDriverState *bs); +bool bdrv_dev_is_tray_open(BlockDriverState *bs); +bool bdrv_dev_is_medium_locked(BlockDriverState *bs); +int bdrv_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors); +int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors); +int bdrv_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors); +int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov); +int bdrv_pread(BlockDriverState *bs, int64_t offset, + void *buf, int count); +int bdrv_pwrite(BlockDriverState *bs, int64_t offset, + const void *buf, int count); +int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov); +int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, + const void *buf, int count); +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov); +int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); +int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov); +/* + * Efficiently zero a region of the disk image. Note that this is a regular + * I/O request like read or write and should have a reasonable size. This + * function is not suitable for zeroing the entire image in a single request + * because it may allocate memory for the entire region. + */ +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors); +int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum); +int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum); +BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, + const char *backing_file); +int bdrv_get_backing_file_depth(BlockDriverState *bs); +int bdrv_truncate(BlockDriverState *bs, int64_t offset); +int64_t bdrv_getlength(BlockDriverState *bs); +int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); +void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr); +int bdrv_commit(BlockDriverState *bs); +int bdrv_commit_all(void); +int bdrv_change_backing_file(BlockDriverState *bs, + const char *backing_file, const char *backing_fmt); +void bdrv_register(BlockDriver *bdrv); +int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, + BlockDriverState *base); +BlockDriverState *bdrv_find_overlay(BlockDriverState *active, + BlockDriverState *bs); +BlockDriverState *bdrv_find_base(BlockDriverState *bs); + + +typedef struct BdrvCheckResult { + int corruptions; + int leaks; + int check_errors; + int corruptions_fixed; + int leaks_fixed; + int64_t image_end_offset; + BlockFragInfo bfi; +} BdrvCheckResult; + +typedef enum { + BDRV_FIX_LEAKS = 1, + BDRV_FIX_ERRORS = 2, +} BdrvCheckMode; + +int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix); + +/* async block I/O */ +typedef void BlockDriverDirtyHandler(BlockDriverState *bs, int64_t sector, + int sector_num); +BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *iov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque); +BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *iov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque); +BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque); +BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque); +void bdrv_aio_cancel(BlockDriverAIOCB *acb); + +typedef struct BlockRequest { + /* Fields to be filled by multiwrite caller */ + int64_t sector; + int nb_sectors; + QEMUIOVector *qiov; + BlockDriverCompletionFunc *cb; + void *opaque; + + /* Filled by multiwrite implementation */ + int error; +} BlockRequest; + +int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, + int num_reqs); + +/* sg packet commands */ +int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf); +BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, void *opaque); + +/* Invalidate any cached metadata used by image formats */ +void bdrv_invalidate_cache(BlockDriverState *bs); +void bdrv_invalidate_cache_all(void); + +void bdrv_clear_incoming_migration_all(void); + +/* Ensure contents are flushed to disk. */ +int bdrv_flush(BlockDriverState *bs); +int coroutine_fn bdrv_co_flush(BlockDriverState *bs); +int bdrv_flush_all(void); +void bdrv_close_all(void); +void bdrv_drain_all(void); + +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); +int bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); +int bdrv_has_zero_init_1(BlockDriverState *bs); +int bdrv_has_zero_init(BlockDriverState *bs); +int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + int *pnum); +int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, + int64_t sector_num, int nb_sectors, int *pnum); + +void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, + BlockdevOnError on_write_error); +BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read); +BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error); +void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, + bool is_read, int error); +int bdrv_is_read_only(BlockDriverState *bs); +int bdrv_is_sg(BlockDriverState *bs); +int bdrv_enable_write_cache(BlockDriverState *bs); +void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce); +int bdrv_is_inserted(BlockDriverState *bs); +int bdrv_media_changed(BlockDriverState *bs); +void bdrv_lock_medium(BlockDriverState *bs, bool locked); +void bdrv_eject(BlockDriverState *bs, bool eject_flag); +const char *bdrv_get_format_name(BlockDriverState *bs); +BlockDriverState *bdrv_find(const char *name); +BlockDriverState *bdrv_next(BlockDriverState *bs); +void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), + void *opaque); +int bdrv_is_encrypted(BlockDriverState *bs); +int bdrv_key_required(BlockDriverState *bs); +int bdrv_set_key(BlockDriverState *bs, const char *key); +int bdrv_query_missing_keys(void); +void bdrv_iterate_format(void (*it)(void *opaque, const char *name), + void *opaque); +const char *bdrv_get_device_name(BlockDriverState *bs); +int bdrv_get_flags(BlockDriverState *bs); +int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors); +int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi); +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors); + +const char *bdrv_get_encrypted_filename(BlockDriverState *bs); +void bdrv_get_backing_filename(BlockDriverState *bs, + char *filename, int filename_size); +void bdrv_get_full_backing_filename(BlockDriverState *bs, + char *dest, size_t sz); +int bdrv_is_snapshot(BlockDriverState *bs); + +int path_is_absolute(const char *path); +void path_combine(char *dest, int dest_size, + const char *base_path, + const char *filename); + +int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size); + +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size); + +void bdrv_img_create(const char *filename, const char *fmt, + const char *base_filename, const char *base_fmt, + char *options, uint64_t img_size, int flags, + Error **errp, bool quiet); + +void bdrv_set_buffer_alignment(BlockDriverState *bs, int align); +void *qemu_blockalign(BlockDriverState *bs, size_t size); +bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); + +struct HBitmapIter; +void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity); +int bdrv_get_dirty(BlockDriverState *bs, int64_t sector); +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); +void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); +void bdrv_dirty_iter_init(BlockDriverState *bs, struct HBitmapIter *hbi); +int64_t bdrv_get_dirty_count(BlockDriverState *bs); + +void bdrv_enable_copy_on_read(BlockDriverState *bs); +void bdrv_disable_copy_on_read(BlockDriverState *bs); + +void bdrv_set_in_use(BlockDriverState *bs, int in_use); +int bdrv_in_use(BlockDriverState *bs); + +#ifdef CONFIG_LINUX_AIO +int raw_get_aio_fd(BlockDriverState *bs); +#else +static inline int raw_get_aio_fd(BlockDriverState *bs) +{ + return -ENOTSUP; +} +#endif + +enum BlockAcctType { + BDRV_ACCT_READ, + BDRV_ACCT_WRITE, + BDRV_ACCT_FLUSH, + BDRV_MAX_IOTYPE, +}; + +typedef struct BlockAcctCookie { + int64_t bytes; + int64_t start_time_ns; + enum BlockAcctType type; +} BlockAcctCookie; + +void bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, + int64_t bytes, enum BlockAcctType type); +void bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie); + +typedef enum { + BLKDBG_L1_UPDATE, + + BLKDBG_L1_GROW_ALLOC_TABLE, + BLKDBG_L1_GROW_WRITE_TABLE, + BLKDBG_L1_GROW_ACTIVATE_TABLE, + + BLKDBG_L2_LOAD, + BLKDBG_L2_UPDATE, + BLKDBG_L2_UPDATE_COMPRESSED, + BLKDBG_L2_ALLOC_COW_READ, + BLKDBG_L2_ALLOC_WRITE, + + BLKDBG_READ_AIO, + BLKDBG_READ_BACKING_AIO, + BLKDBG_READ_COMPRESSED, + + BLKDBG_WRITE_AIO, + BLKDBG_WRITE_COMPRESSED, + + BLKDBG_VMSTATE_LOAD, + BLKDBG_VMSTATE_SAVE, + + BLKDBG_COW_READ, + BLKDBG_COW_WRITE, + + BLKDBG_REFTABLE_LOAD, + BLKDBG_REFTABLE_GROW, + + BLKDBG_REFBLOCK_LOAD, + BLKDBG_REFBLOCK_UPDATE, + BLKDBG_REFBLOCK_UPDATE_PART, + BLKDBG_REFBLOCK_ALLOC, + BLKDBG_REFBLOCK_ALLOC_HOOKUP, + BLKDBG_REFBLOCK_ALLOC_WRITE, + BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS, + BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE, + BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE, + + BLKDBG_CLUSTER_ALLOC, + BLKDBG_CLUSTER_ALLOC_BYTES, + BLKDBG_CLUSTER_FREE, + + BLKDBG_FLUSH_TO_OS, + BLKDBG_FLUSH_TO_DISK, + + BLKDBG_EVENT_MAX, +} BlkDebugEvent; + +#define BLKDBG_EVENT(bs, evt) bdrv_debug_event(bs, evt) +void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event); + +int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, + const char *tag); +int bdrv_debug_resume(BlockDriverState *bs, const char *tag); +bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag); + +#endif diff --git a/contrib/qemu/include/block/block_int.h b/contrib/qemu/include/block/block_int.h new file mode 100644 index 000000000..c6ac871e2 --- /dev/null +++ b/contrib/qemu/include/block/block_int.h @@ -0,0 +1,421 @@ +/* + * QEMU System Emulator block driver + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef BLOCK_INT_H +#define BLOCK_INT_H + +#include "block/block.h" +#include "qemu/option.h" +#include "qemu/queue.h" +#include "block/coroutine.h" +#include "qemu/timer.h" +#include "qapi-types.h" +#include "qapi/qmp/qerror.h" +#include "monitor/monitor.h" +#include "qemu/hbitmap.h" +#include "block/snapshot.h" + +#define BLOCK_FLAG_ENCRYPT 1 +#define BLOCK_FLAG_COMPAT6 4 +#define BLOCK_FLAG_LAZY_REFCOUNTS 8 + +#define BLOCK_IO_LIMIT_READ 0 +#define BLOCK_IO_LIMIT_WRITE 1 +#define BLOCK_IO_LIMIT_TOTAL 2 + +#define BLOCK_IO_SLICE_TIME 100000000 +#define NANOSECONDS_PER_SECOND 1000000000.0 + +#define BLOCK_OPT_SIZE "size" +#define BLOCK_OPT_ENCRYPT "encryption" +#define BLOCK_OPT_COMPAT6 "compat6" +#define BLOCK_OPT_BACKING_FILE "backing_file" +#define BLOCK_OPT_BACKING_FMT "backing_fmt" +#define BLOCK_OPT_CLUSTER_SIZE "cluster_size" +#define BLOCK_OPT_TABLE_SIZE "table_size" +#define BLOCK_OPT_PREALLOC "preallocation" +#define BLOCK_OPT_SUBFMT "subformat" +#define BLOCK_OPT_COMPAT_LEVEL "compat" +#define BLOCK_OPT_LAZY_REFCOUNTS "lazy_refcounts" +#define BLOCK_OPT_ADAPTER_TYPE "adapter_type" + +typedef struct BdrvTrackedRequest { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + bool is_write; + QLIST_ENTRY(BdrvTrackedRequest) list; + Coroutine *co; /* owner, used for deadlock detection */ + CoQueue wait_queue; /* coroutines blocked on this request */ +} BdrvTrackedRequest; + + +typedef struct BlockIOLimit { + int64_t bps[3]; + int64_t iops[3]; +} BlockIOLimit; + +typedef struct BlockIOBaseValue { + uint64_t bytes[2]; + uint64_t ios[2]; +} BlockIOBaseValue; + +struct BlockDriver { + const char *format_name; + int instance_size; + int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename); + int (*bdrv_probe_device)(const char *filename); + + /* Any driver implementing this callback is expected to be able to handle + * NULL file names in its .bdrv_open() implementation */ + void (*bdrv_parse_filename)(const char *filename, QDict *options, Error **errp); + + /* For handling image reopen for split or non-split files */ + int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp); + void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state); + void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state); + + int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags); + int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags); + int (*bdrv_read)(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors); + int (*bdrv_write)(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors); + void (*bdrv_close)(BlockDriverState *bs); + void (*bdrv_rebind)(BlockDriverState *bs); + int (*bdrv_create)(const char *filename, QEMUOptionParameter *options); + int (*bdrv_set_key)(BlockDriverState *bs, const char *key); + int (*bdrv_make_empty)(BlockDriverState *bs); + /* aio */ + BlockDriverAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque); + BlockDriverAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque); + BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque); + BlockDriverAIOCB *(*bdrv_aio_discard)(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque); + + int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); + int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); + /* + * Efficiently zero a region of the disk image. Typically an image format + * would use a compact metadata representation to implement this. This + * function pointer may be NULL and .bdrv_co_writev() will be called + * instead. + */ + int coroutine_fn (*bdrv_co_write_zeroes)(BlockDriverState *bs, + int64_t sector_num, int nb_sectors); + int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs, + int64_t sector_num, int nb_sectors); + int coroutine_fn (*bdrv_co_is_allocated)(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum); + + /* + * Invalidate any cached meta-data. + */ + void (*bdrv_invalidate_cache)(BlockDriverState *bs); + + /* + * Flushes all data that was already written to the OS all the way down to + * the disk (for example raw-posix calls fsync()). + */ + int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs); + + /* + * Flushes all internal caches to the OS. The data may still sit in a + * writeback cache of the host OS, but it will survive a crash of the qemu + * process. + */ + int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs); + + const char *protocol_name; + int (*bdrv_truncate)(BlockDriverState *bs, int64_t offset); + int64_t (*bdrv_getlength)(BlockDriverState *bs); + int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs); + int (*bdrv_write_compressed)(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors); + + int (*bdrv_snapshot_create)(BlockDriverState *bs, + QEMUSnapshotInfo *sn_info); + int (*bdrv_snapshot_goto)(BlockDriverState *bs, + const char *snapshot_id); + int (*bdrv_snapshot_delete)(BlockDriverState *bs, const char *snapshot_id); + int (*bdrv_snapshot_list)(BlockDriverState *bs, + QEMUSnapshotInfo **psn_info); + int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs, + const char *snapshot_name); + int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi); + + int (*bdrv_save_vmstate)(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t pos); + int (*bdrv_load_vmstate)(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size); + + int (*bdrv_change_backing_file)(BlockDriverState *bs, + const char *backing_file, const char *backing_fmt); + + /* removable device specific */ + int (*bdrv_is_inserted)(BlockDriverState *bs); + int (*bdrv_media_changed)(BlockDriverState *bs); + void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); + void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked); + + /* to control generic scsi devices */ + int (*bdrv_ioctl)(BlockDriverState *bs, unsigned long int req, void *buf); + BlockDriverAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, void *opaque); + + /* List of options for creating images, terminated by name == NULL */ + QEMUOptionParameter *create_options; + + + /* + * Returns 0 for completed check, -errno for internal errors. + * The check results are stored in result. + */ + int (*bdrv_check)(BlockDriverState* bs, BdrvCheckResult *result, + BdrvCheckMode fix); + + void (*bdrv_debug_event)(BlockDriverState *bs, BlkDebugEvent event); + + /* TODO Better pass a option string/QDict/QemuOpts to add any rule? */ + int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event, + const char *tag); + int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag); + bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag); + + /* + * Returns 1 if newly created images are guaranteed to contain only + * zeros, 0 otherwise. + */ + int (*bdrv_has_zero_init)(BlockDriverState *bs); + + QLIST_ENTRY(BlockDriver) list; +}; + +/* + * Note: the function bdrv_append() copies and swaps contents of + * BlockDriverStates, so if you add new fields to this struct, please + * inspect bdrv_append() to determine if the new fields need to be + * copied as well. + */ +struct BlockDriverState { + int64_t total_sectors; /* if we are reading a disk image, give its + size in sectors */ + int read_only; /* if true, the media is read only */ + int open_flags; /* flags used to open the file, re-used for re-open */ + int encrypted; /* if true, the media is encrypted */ + int valid_key; /* if true, a valid encryption key has been set */ + int sg; /* if true, the device is a /dev/sg* */ + int copy_on_read; /* if true, copy read backing sectors into image + note this is a reference count */ + + BlockDriver *drv; /* NULL means no media */ + void *opaque; + + void *dev; /* attached device model, if any */ + /* TODO change to DeviceState when all users are qdevified */ + const BlockDevOps *dev_ops; + void *dev_opaque; + + char filename[1024]; + char backing_file[1024]; /* if non zero, the image is a diff of + this file image */ + char backing_format[16]; /* if non-zero and backing_file exists */ + int is_temporary; + + BlockDriverState *backing_hd; + BlockDriverState *file; + + NotifierList close_notifiers; + + /* Callback before write request is processed */ + NotifierWithReturnList before_write_notifiers; + + /* number of in-flight copy-on-read requests */ + unsigned int copy_on_read_in_flight; + + /* the time for latest disk I/O */ + int64_t slice_start; + int64_t slice_end; + BlockIOLimit io_limits; + BlockIOBaseValue slice_submitted; + CoQueue throttled_reqs; + QEMUTimer *block_timer; + bool io_limits_enabled; + + /* I/O stats (display with "info blockstats"). */ + uint64_t nr_bytes[BDRV_MAX_IOTYPE]; + uint64_t nr_ops[BDRV_MAX_IOTYPE]; + uint64_t total_time_ns[BDRV_MAX_IOTYPE]; + uint64_t wr_highest_sector; + + /* Whether the disk can expand beyond total_sectors */ + int growable; + + /* the memory alignment required for the buffers handled by this driver */ + int buffer_alignment; + + /* do we need to tell the quest if we have a volatile write cache? */ + int enable_write_cache; + + /* NOTE: the following infos are only hints for real hardware + drivers. They are not used by the block driver */ + BlockdevOnError on_read_error, on_write_error; + bool iostatus_enabled; + BlockDeviceIoStatus iostatus; + char device_name[32]; + HBitmap *dirty_bitmap; + int in_use; /* users other than guest access, eg. block migration */ + QTAILQ_ENTRY(BlockDriverState) list; + + QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; + + /* long-running background operation */ + BlockJob *job; + + QDict *options; +}; + +int get_tmp_filename(char *filename, int size); + +void bdrv_set_io_limits(BlockDriverState *bs, + BlockIOLimit *io_limits); + +/** + * bdrv_add_before_write_notifier: + * + * Register a callback that is invoked before write requests are processed but + * after any throttling or waiting for overlapping requests. + */ +void bdrv_add_before_write_notifier(BlockDriverState *bs, + NotifierWithReturn *notifier); + +/** + * bdrv_get_aio_context: + * + * Returns: the currently bound #AioContext + */ +AioContext *bdrv_get_aio_context(BlockDriverState *bs); + +#ifdef _WIN32 +int is_windows_drive(const char *filename); +#endif +void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv, + enum MonitorEvent ev, + BlockErrorAction action, bool is_read); + +/** + * stream_start: + * @bs: Block device to operate on. + * @base: Block device that will become the new base, or %NULL to + * flatten the whole backing file chain onto @bs. + * @base_id: The file name that will be written to @bs as the new + * backing file if the job completes. Ignored if @base is %NULL. + * @speed: The maximum speed, in bytes per second, or 0 for unlimited. + * @on_error: The action to take upon error. + * @cb: Completion function for the job. + * @opaque: Opaque pointer value passed to @cb. + * @errp: Error object. + * + * Start a streaming operation on @bs. Clusters that are unallocated + * in @bs, but allocated in any image between @base and @bs (both + * exclusive) will be written to @bs. At the end of a successful + * streaming job, the backing file of @bs will be changed to + * @base_id in the written image and to @base in the live BlockDriverState. + */ +void stream_start(BlockDriverState *bs, BlockDriverState *base, + const char *base_id, int64_t speed, BlockdevOnError on_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp); + +/** + * commit_start: + * @bs: Top Block device + * @base: Block device that will be written into, and become the new top + * @speed: The maximum speed, in bytes per second, or 0 for unlimited. + * @on_error: The action to take upon error. + * @cb: Completion function for the job. + * @opaque: Opaque pointer value passed to @cb. + * @errp: Error object. + * + */ +void commit_start(BlockDriverState *bs, BlockDriverState *base, + BlockDriverState *top, int64_t speed, + BlockdevOnError on_error, BlockDriverCompletionFunc *cb, + void *opaque, Error **errp); + +/* + * mirror_start: + * @bs: Block device to operate on. + * @target: Block device to write to. + * @speed: The maximum speed, in bytes per second, or 0 for unlimited. + * @granularity: The chosen granularity for the dirty bitmap. + * @buf_size: The amount of data that can be in flight at one time. + * @mode: Whether to collapse all images in the chain to the target. + * @on_source_error: The action to take upon error reading from the source. + * @on_target_error: The action to take upon error writing to the target. + * @cb: Completion function for the job. + * @opaque: Opaque pointer value passed to @cb. + * @errp: Error object. + * + * Start a mirroring operation on @bs. Clusters that are allocated + * in @bs will be written to @bs until the job is cancelled or + * manually completed. At the end of a successful mirroring job, + * @bs will be switched to read from @target. + */ +void mirror_start(BlockDriverState *bs, BlockDriverState *target, + int64_t speed, int64_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp); + +/* + * backup_start: + * @bs: Block device to operate on. + * @target: Block device to write to. + * @speed: The maximum speed, in bytes per second, or 0 for unlimited. + * @on_source_error: The action to take upon error reading from the source. + * @on_target_error: The action to take upon error writing to the target. + * @cb: Completion function for the job. + * @opaque: Opaque pointer value passed to @cb. + * + * Start a backup operation on @bs. Clusters in @bs are written to @target + * until the job is cancelled or manually completed. + */ +void backup_start(BlockDriverState *bs, BlockDriverState *target, + int64_t speed, BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + BlockDriverCompletionFunc *cb, void *opaque, + Error **errp); + +#endif /* BLOCK_INT_H */ diff --git a/contrib/qemu/include/block/blockjob.h b/contrib/qemu/include/block/blockjob.h new file mode 100644 index 000000000..c290d07bb --- /dev/null +++ b/contrib/qemu/include/block/blockjob.h @@ -0,0 +1,278 @@ +/* + * Declarations for long-running block device operations + * + * Copyright (c) 2011 IBM Corp. + * Copyright (c) 2012 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef BLOCKJOB_H +#define BLOCKJOB_H 1 + +#include "block/block.h" + +/** + * BlockJobType: + * + * A class type for block job objects. + */ +typedef struct BlockJobType { + /** Derived BlockJob struct size */ + size_t instance_size; + + /** String describing the operation, part of query-block-jobs QMP API */ + const char *job_type; + + /** Optional callback for job types that support setting a speed limit */ + void (*set_speed)(BlockJob *job, int64_t speed, Error **errp); + + /** Optional callback for job types that need to forward I/O status reset */ + void (*iostatus_reset)(BlockJob *job); + + /** + * Optional callback for job types whose completion must be triggered + * manually. + */ + void (*complete)(BlockJob *job, Error **errp); +} BlockJobType; + +/** + * BlockJob: + * + * Long-running operation on a BlockDriverState. + */ +struct BlockJob { + /** The job type, including the job vtable. */ + const BlockJobType *job_type; + + /** The block device on which the job is operating. */ + BlockDriverState *bs; + + /** + * The coroutine that executes the job. If not NULL, it is + * reentered when busy is false and the job is cancelled. + */ + Coroutine *co; + + /** + * Set to true if the job should cancel itself. The flag must + * always be tested just before toggling the busy flag from false + * to true. After a job has been cancelled, it should only yield + * if #qemu_aio_wait will ("sooner or later") reenter the coroutine. + */ + bool cancelled; + + /** + * Set to true if the job is either paused, or will pause itself + * as soon as possible (if busy == true). + */ + bool paused; + + /** + * Set to false by the job while it is in a quiescent state, where + * no I/O is pending and the job has yielded on any condition + * that is not detected by #qemu_aio_wait, such as a timer. + */ + bool busy; + + /** Status that is published by the query-block-jobs QMP API */ + BlockDeviceIoStatus iostatus; + + /** Offset that is published by the query-block-jobs QMP API */ + int64_t offset; + + /** Length that is published by the query-block-jobs QMP API */ + int64_t len; + + /** Speed that was set with @block_job_set_speed. */ + int64_t speed; + + /** The completion function that will be called when the job completes. */ + BlockDriverCompletionFunc *cb; + + /** The opaque value that is passed to the completion function. */ + void *opaque; +}; + +/** + * block_job_create: + * @job_type: The class object for the newly-created job. + * @bs: The block + * @speed: The maximum speed, in bytes per second, or 0 for unlimited. + * @cb: Completion function for the job. + * @opaque: Opaque pointer value passed to @cb. + * @errp: Error object. + * + * Create a new long-running block device job and return it. The job + * will call @cb asynchronously when the job completes. Note that + * @bs may have been closed at the time the @cb it is called. If + * this is the case, the job may be reported as either cancelled or + * completed. + * + * This function is not part of the public job interface; it should be + * called from a wrapper that is specific to the job type. + */ +void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs, + int64_t speed, BlockDriverCompletionFunc *cb, + void *opaque, Error **errp); + +/** + * block_job_sleep_ns: + * @job: The job that calls the function. + * @clock: The clock to sleep on. + * @ns: How many nanoseconds to stop for. + * + * Put the job to sleep (assuming that it wasn't canceled) for @ns + * nanoseconds. Canceling the job will interrupt the wait immediately. + */ +void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns); + +/** + * block_job_completed: + * @job: The job being completed. + * @ret: The status code. + * + * Call the completion function that was registered at creation time, and + * free @job. + */ +void block_job_completed(BlockJob *job, int ret); + +/** + * block_job_set_speed: + * @job: The job to set the speed for. + * @speed: The new value + * @errp: Error object. + * + * Set a rate-limiting parameter for the job; the actual meaning may + * vary depending on the job type. + */ +void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp); + +/** + * block_job_cancel: + * @job: The job to be canceled. + * + * Asynchronously cancel the specified job. + */ +void block_job_cancel(BlockJob *job); + +/** + * block_job_complete: + * @job: The job to be completed. + * @errp: Error object. + * + * Asynchronously complete the specified job. + */ +void block_job_complete(BlockJob *job, Error **errp); + +/** + * block_job_is_cancelled: + * @job: The job being queried. + * + * Returns whether the job is scheduled for cancellation. + */ +bool block_job_is_cancelled(BlockJob *job); + +/** + * block_job_query: + * @job: The job to get information about. + * + * Return information about a job. + */ +BlockJobInfo *block_job_query(BlockJob *job); + +/** + * block_job_pause: + * @job: The job to be paused. + * + * Asynchronously pause the specified job. + */ +void block_job_pause(BlockJob *job); + +/** + * block_job_resume: + * @job: The job to be resumed. + * + * Resume the specified job. + */ +void block_job_resume(BlockJob *job); + +/** + * qobject_from_block_job: + * @job: The job whose information is requested. + * + * Return a QDict corresponding to @job's query-block-jobs entry. + */ +QObject *qobject_from_block_job(BlockJob *job); + +/** + * block_job_ready: + * @job: The job which is now ready to complete. + * + * Send a BLOCK_JOB_READY event for the specified job. + */ +void block_job_ready(BlockJob *job); + +/** + * block_job_is_paused: + * @job: The job being queried. + * + * Returns whether the job is currently paused, or will pause + * as soon as it reaches a sleeping point. + */ +bool block_job_is_paused(BlockJob *job); + +/** + * block_job_cancel_sync: + * @job: The job to be canceled. + * + * Synchronously cancel the job. The completion callback is called + * before the function returns. The job may actually complete + * instead of canceling itself; the circumstances under which this + * happens depend on the kind of job that is active. + * + * Returns the return value from the job if the job actually completed + * during the call, or -ECANCELED if it was canceled. + */ +int block_job_cancel_sync(BlockJob *job); + +/** + * block_job_iostatus_reset: + * @job: The job whose I/O status should be reset. + * + * Reset I/O status on @job and on BlockDriverState objects it uses, + * other than job->bs. + */ +void block_job_iostatus_reset(BlockJob *job); + +/** + * block_job_error_action: + * @job: The job to signal an error for. + * @bs: The block device on which to set an I/O error. + * @on_err: The error action setting. + * @is_read: Whether the operation was a read. + * @error: The error that was reported. + * + * Report an I/O error for a block job and possibly stop the VM. Return the + * action that was selected based on @on_err and @error. + */ +BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs, + BlockdevOnError on_err, + int is_read, int error); +#endif diff --git a/contrib/qemu/include/block/coroutine.h b/contrib/qemu/include/block/coroutine.h new file mode 100644 index 000000000..377805a3b --- /dev/null +++ b/contrib/qemu/include/block/coroutine.h @@ -0,0 +1,218 @@ +/* + * QEMU coroutine implementation + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Kevin Wolf <kwolf@redhat.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef QEMU_COROUTINE_H +#define QEMU_COROUTINE_H + +#include <stdbool.h> +#include "qemu/queue.h" +#include "qemu/timer.h" + +/** + * Coroutines are a mechanism for stack switching and can be used for + * cooperative userspace threading. These functions provide a simple but + * useful flavor of coroutines that is suitable for writing sequential code, + * rather than callbacks, for operations that need to give up control while + * waiting for events to complete. + * + * These functions are re-entrant and may be used outside the global mutex. + */ + +/** + * Mark a function that executes in coroutine context + * + * Functions that execute in coroutine context cannot be called directly from + * normal functions. In the future it would be nice to enable compiler or + * static checker support for catching such errors. This annotation might make + * it possible and in the meantime it serves as documentation. + * + * For example: + * + * static void coroutine_fn foo(void) { + * .... + * } + */ +#define coroutine_fn + +typedef struct Coroutine Coroutine; + +/** + * Coroutine entry point + * + * When the coroutine is entered for the first time, opaque is passed in as an + * argument. + * + * When this function returns, the coroutine is destroyed automatically and + * execution continues in the caller who last entered the coroutine. + */ +typedef void coroutine_fn CoroutineEntry(void *opaque); + +/** + * Create a new coroutine + * + * Use qemu_coroutine_enter() to actually transfer control to the coroutine. + */ +Coroutine *qemu_coroutine_create(CoroutineEntry *entry); + +/** + * Transfer control to a coroutine + * + * The opaque argument is passed as the argument to the entry point when + * entering the coroutine for the first time. It is subsequently ignored. + */ +void qemu_coroutine_enter(Coroutine *coroutine, void *opaque); + +/** + * Transfer control back to a coroutine's caller + * + * This function does not return until the coroutine is re-entered using + * qemu_coroutine_enter(). + */ +void coroutine_fn qemu_coroutine_yield(void); + +/** + * Get the currently executing coroutine + */ +Coroutine *coroutine_fn qemu_coroutine_self(void); + +/** + * Return whether or not currently inside a coroutine + * + * This can be used to write functions that work both when in coroutine context + * and when not in coroutine context. Note that such functions cannot use the + * coroutine_fn annotation since they work outside coroutine context. + */ +bool qemu_in_coroutine(void); + + + +/** + * CoQueues are a mechanism to queue coroutines in order to continue executing + * them later. They provide the fundamental primitives on which coroutine locks + * are built. + */ +typedef struct CoQueue { + QTAILQ_HEAD(, Coroutine) entries; + AioContext *ctx; +} CoQueue; + +/** + * Initialise a CoQueue. This must be called before any other operation is used + * on the CoQueue. + */ +void qemu_co_queue_init(CoQueue *queue); + +/** + * Adds the current coroutine to the CoQueue and transfers control to the + * caller of the coroutine. + */ +void coroutine_fn qemu_co_queue_wait(CoQueue *queue); + +/** + * Adds the current coroutine to the head of the CoQueue and transfers control to the + * caller of the coroutine. + */ +void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue); + +/** + * Restarts the next coroutine in the CoQueue and removes it from the queue. + * + * Returns true if a coroutine was restarted, false if the queue is empty. + */ +bool qemu_co_queue_next(CoQueue *queue); + +/** + * Restarts all coroutines in the CoQueue and leaves the queue empty. + */ +void qemu_co_queue_restart_all(CoQueue *queue); + +/** + * Checks if the CoQueue is empty. + */ +bool qemu_co_queue_empty(CoQueue *queue); + + +/** + * Provides a mutex that can be used to synchronise coroutines + */ +typedef struct CoMutex { + bool locked; + CoQueue queue; +} CoMutex; + +/** + * Initialises a CoMutex. This must be called before any other operation is used + * on the CoMutex. + */ +void qemu_co_mutex_init(CoMutex *mutex); + +/** + * Locks the mutex. If the lock cannot be taken immediately, control is + * transferred to the caller of the current coroutine. + */ +void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex); + +/** + * Unlocks the mutex and schedules the next coroutine that was waiting for this + * lock to be run. + */ +void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex); + +typedef struct CoRwlock { + bool writer; + int reader; + CoQueue queue; +} CoRwlock; + +/** + * Initialises a CoRwlock. This must be called before any other operation + * is used on the CoRwlock + */ +void qemu_co_rwlock_init(CoRwlock *lock); + +/** + * Read locks the CoRwlock. If the lock cannot be taken immediately because + * of a parallel writer, control is transferred to the caller of the current + * coroutine. + */ +void qemu_co_rwlock_rdlock(CoRwlock *lock); + +/** + * Write Locks the mutex. If the lock cannot be taken immediately because + * of a parallel reader, control is transferred to the caller of the current + * coroutine. + */ +void qemu_co_rwlock_wrlock(CoRwlock *lock); + +/** + * Unlocks the read/write lock and schedules the next coroutine that was + * waiting for this lock to be run. + */ +void qemu_co_rwlock_unlock(CoRwlock *lock); + +/** + * Yield the coroutine for a given duration + * + * Note this function uses timers and hence only works when a main loop is in + * use. See main-loop.h and do not use from qemu-tool programs. + */ +void coroutine_fn co_sleep_ns(QEMUClock *clock, int64_t ns); + +/** + * Yield until a file descriptor becomes readable + * + * Note that this function clobbers the handlers for the file descriptor. + */ +void coroutine_fn yield_until_fd_readable(int fd); +#endif /* QEMU_COROUTINE_H */ diff --git a/contrib/qemu/include/block/coroutine_int.h b/contrib/qemu/include/block/coroutine_int.h new file mode 100644 index 000000000..f133d65af --- /dev/null +++ b/contrib/qemu/include/block/coroutine_int.h @@ -0,0 +1,53 @@ +/* + * Coroutine internals + * + * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef QEMU_COROUTINE_INT_H +#define QEMU_COROUTINE_INT_H + +#include "qemu/queue.h" +#include "block/coroutine.h" + +typedef enum { + COROUTINE_YIELD = 1, + COROUTINE_TERMINATE = 2, +} CoroutineAction; + +struct Coroutine { + CoroutineEntry *entry; + void *entry_arg; + Coroutine *caller; + QSLIST_ENTRY(Coroutine) pool_next; + + /* Coroutines that should be woken up when we yield or terminate */ + QTAILQ_HEAD(, Coroutine) co_queue_wakeup; + QTAILQ_ENTRY(Coroutine) co_queue_next; +}; + +Coroutine *qemu_coroutine_new(void); +void qemu_coroutine_delete(Coroutine *co); +CoroutineAction qemu_coroutine_switch(Coroutine *from, Coroutine *to, + CoroutineAction action); +void coroutine_fn qemu_co_queue_run_restart(Coroutine *co); + +#endif diff --git a/contrib/qemu/include/block/snapshot.h b/contrib/qemu/include/block/snapshot.h new file mode 100644 index 000000000..eaf61f032 --- /dev/null +++ b/contrib/qemu/include/block/snapshot.h @@ -0,0 +1,53 @@ +/* + * Block layer snapshot related functions + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef SNAPSHOT_H +#define SNAPSHOT_H + +#include "qemu-common.h" + +typedef struct QEMUSnapshotInfo { + char id_str[128]; /* unique snapshot id */ + /* the following fields are informative. They are not needed for + the consistency of the snapshot */ + char name[256]; /* user chosen name */ + uint64_t vm_state_size; /* VM state info size */ + uint32_t date_sec; /* UTC date of the snapshot */ + uint32_t date_nsec; + uint64_t vm_clock_nsec; /* VM clock relative to boot */ +} QEMUSnapshotInfo; + +int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, + const char *name); +int bdrv_can_snapshot(BlockDriverState *bs); +int bdrv_snapshot_create(BlockDriverState *bs, + QEMUSnapshotInfo *sn_info); +int bdrv_snapshot_goto(BlockDriverState *bs, + const char *snapshot_id); +int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id); +int bdrv_snapshot_list(BlockDriverState *bs, + QEMUSnapshotInfo **psn_info); +int bdrv_snapshot_load_tmp(BlockDriverState *bs, + const char *snapshot_name); +#endif diff --git a/contrib/qemu/include/config.h b/contrib/qemu/include/config.h new file mode 100644 index 000000000..e20f78696 --- /dev/null +++ b/contrib/qemu/include/config.h @@ -0,0 +1,2 @@ +#include "config-host.h" +#include "config-target.h" diff --git a/contrib/qemu/include/exec/cpu-common.h b/contrib/qemu/include/exec/cpu-common.h new file mode 100644 index 000000000..e4996e19c --- /dev/null +++ b/contrib/qemu/include/exec/cpu-common.h @@ -0,0 +1,124 @@ +#ifndef CPU_COMMON_H +#define CPU_COMMON_H 1 + +/* CPU interfaces that are target independent. */ + +#ifndef CONFIG_USER_ONLY +#include "exec/hwaddr.h" +#endif + +#ifndef NEED_CPU_H +#include "exec/poison.h" +#endif + +#include "qemu/bswap.h" +#include "qemu/queue.h" + +/** + * CPUListState: + * @cpu_fprintf: Print function. + * @file: File to print to using @cpu_fprint. + * + * State commonly used for iterating over CPU models. + */ +typedef struct CPUListState { + fprintf_function cpu_fprintf; + FILE *file; +} CPUListState; + +#if !defined(CONFIG_USER_ONLY) + +enum device_endian { + DEVICE_NATIVE_ENDIAN, + DEVICE_BIG_ENDIAN, + DEVICE_LITTLE_ENDIAN, +}; + +/* address in the RAM (different from a physical address) */ +#if defined(CONFIG_XEN_BACKEND) +typedef uint64_t ram_addr_t; +# define RAM_ADDR_MAX UINT64_MAX +# define RAM_ADDR_FMT "%" PRIx64 +#else +typedef uintptr_t ram_addr_t; +# define RAM_ADDR_MAX UINTPTR_MAX +# define RAM_ADDR_FMT "%" PRIxPTR +#endif + +/* memory API */ + +typedef void CPUWriteMemoryFunc(void *opaque, hwaddr addr, uint32_t value); +typedef uint32_t CPUReadMemoryFunc(void *opaque, hwaddr addr); + +void qemu_ram_remap(ram_addr_t addr, ram_addr_t length); +/* This should not be used by devices. */ +MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr); +void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev); + +void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf, + int len, int is_write); +static inline void cpu_physical_memory_read(hwaddr addr, + void *buf, int len) +{ + cpu_physical_memory_rw(addr, buf, len, 0); +} +static inline void cpu_physical_memory_write(hwaddr addr, + const void *buf, int len) +{ + cpu_physical_memory_rw(addr, (void *)buf, len, 1); +} +void *cpu_physical_memory_map(hwaddr addr, + hwaddr *plen, + int is_write); +void cpu_physical_memory_unmap(void *buffer, hwaddr len, + int is_write, hwaddr access_len); +void *cpu_register_map_client(void *opaque, void (*callback)(void *opaque)); + +bool cpu_physical_memory_is_io(hwaddr phys_addr); + +/* Coalesced MMIO regions are areas where write operations can be reordered. + * This usually implies that write operations are side-effect free. This allows + * batching which can make a major impact on performance when using + * virtualization. + */ +void qemu_flush_coalesced_mmio_buffer(void); + +uint32_t ldub_phys(hwaddr addr); +uint32_t lduw_le_phys(hwaddr addr); +uint32_t lduw_be_phys(hwaddr addr); +uint32_t ldl_le_phys(hwaddr addr); +uint32_t ldl_be_phys(hwaddr addr); +uint64_t ldq_le_phys(hwaddr addr); +uint64_t ldq_be_phys(hwaddr addr); +void stb_phys(hwaddr addr, uint32_t val); +void stw_le_phys(hwaddr addr, uint32_t val); +void stw_be_phys(hwaddr addr, uint32_t val); +void stl_le_phys(hwaddr addr, uint32_t val); +void stl_be_phys(hwaddr addr, uint32_t val); +void stq_le_phys(hwaddr addr, uint64_t val); +void stq_be_phys(hwaddr addr, uint64_t val); + +#ifdef NEED_CPU_H +uint32_t lduw_phys(hwaddr addr); +uint32_t ldl_phys(hwaddr addr); +uint64_t ldq_phys(hwaddr addr); +void stl_phys_notdirty(hwaddr addr, uint32_t val); +void stw_phys(hwaddr addr, uint32_t val); +void stl_phys(hwaddr addr, uint32_t val); +void stq_phys(hwaddr addr, uint64_t val); +#endif + +void cpu_physical_memory_write_rom(hwaddr addr, + const uint8_t *buf, int len); + +extern struct MemoryRegion io_mem_rom; +extern struct MemoryRegion io_mem_notdirty; + +typedef void (RAMBlockIterFunc)(void *host_addr, + ram_addr_t offset, ram_addr_t length, void *opaque); + +void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque); + +#endif + +#endif /* !CPU_COMMON_H */ diff --git a/contrib/qemu/include/exec/hwaddr.h b/contrib/qemu/include/exec/hwaddr.h new file mode 100644 index 000000000..c9eb78fba --- /dev/null +++ b/contrib/qemu/include/exec/hwaddr.h @@ -0,0 +1,20 @@ +/* Define hwaddr if it exists. */ + +#ifndef HWADDR_H +#define HWADDR_H + +#define HWADDR_BITS 64 +/* hwaddr is the type of a physical address (its size can + be different from 'target_ulong'). */ + +typedef uint64_t hwaddr; +#define HWADDR_MAX UINT64_MAX +#define TARGET_FMT_plx "%016" PRIx64 +#define HWADDR_PRId PRId64 +#define HWADDR_PRIi PRIi64 +#define HWADDR_PRIo PRIo64 +#define HWADDR_PRIu PRIu64 +#define HWADDR_PRIx PRIx64 +#define HWADDR_PRIX PRIX64 + +#endif diff --git a/contrib/qemu/include/exec/poison.h b/contrib/qemu/include/exec/poison.h new file mode 100644 index 000000000..2341a7504 --- /dev/null +++ b/contrib/qemu/include/exec/poison.h @@ -0,0 +1,63 @@ +/* Poison identifiers that should not be used when building + target independent device code. */ + +#ifndef HW_POISON_H +#define HW_POISON_H +#ifdef __GNUC__ + +#pragma GCC poison TARGET_I386 +#pragma GCC poison TARGET_X86_64 +#pragma GCC poison TARGET_ALPHA +#pragma GCC poison TARGET_ARM +#pragma GCC poison TARGET_CRIS +#pragma GCC poison TARGET_LM32 +#pragma GCC poison TARGET_M68K +#pragma GCC poison TARGET_MIPS +#pragma GCC poison TARGET_MIPS64 +#pragma GCC poison TARGET_OPENRISC +#pragma GCC poison TARGET_PPC +#pragma GCC poison TARGET_PPCEMB +#pragma GCC poison TARGET_PPC64 +#pragma GCC poison TARGET_ABI32 +#pragma GCC poison TARGET_SH4 +#pragma GCC poison TARGET_SPARC +#pragma GCC poison TARGET_SPARC64 + +#pragma GCC poison TARGET_WORDS_BIGENDIAN +#pragma GCC poison BSWAP_NEEDED + +#pragma GCC poison TARGET_LONG_BITS +#pragma GCC poison TARGET_FMT_lx +#pragma GCC poison TARGET_FMT_ld + +#pragma GCC poison TARGET_PAGE_SIZE +#pragma GCC poison TARGET_PAGE_MASK +#pragma GCC poison TARGET_PAGE_BITS +#pragma GCC poison TARGET_PAGE_ALIGN + +#pragma GCC poison CPUArchState +#pragma GCC poison env + +#pragma GCC poison lduw_phys +#pragma GCC poison ldl_phys +#pragma GCC poison ldq_phys +#pragma GCC poison stl_phys_notdirty +#pragma GCC poison stw_phys +#pragma GCC poison stl_phys +#pragma GCC poison stq_phys + +#pragma GCC poison CPU_INTERRUPT_HARD +#pragma GCC poison CPU_INTERRUPT_EXITTB +#pragma GCC poison CPU_INTERRUPT_HALT +#pragma GCC poison CPU_INTERRUPT_DEBUG +#pragma GCC poison CPU_INTERRUPT_TGT_EXT_0 +#pragma GCC poison CPU_INTERRUPT_TGT_EXT_1 +#pragma GCC poison CPU_INTERRUPT_TGT_EXT_2 +#pragma GCC poison CPU_INTERRUPT_TGT_EXT_3 +#pragma GCC poison CPU_INTERRUPT_TGT_EXT_4 +#pragma GCC poison CPU_INTERRUPT_TGT_INT_0 +#pragma GCC poison CPU_INTERRUPT_TGT_INT_1 +#pragma GCC poison CPU_INTERRUPT_TGT_INT_2 + +#endif +#endif diff --git a/contrib/qemu/include/fpu/softfloat.h b/contrib/qemu/include/fpu/softfloat.h new file mode 100644 index 000000000..f3927e241 --- /dev/null +++ b/contrib/qemu/include/fpu/softfloat.h @@ -0,0 +1,641 @@ +/* + * QEMU float support + * + * Derived from SoftFloat. + */ + +/*============================================================================ + +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +#ifndef SOFTFLOAT_H +#define SOFTFLOAT_H + +#if defined(CONFIG_SOLARIS) && defined(CONFIG_NEEDS_LIBSUNMATH) +#include <sunmath.h> +#endif + +#include <inttypes.h> +#include "config-host.h" +#include "qemu/osdep.h" + +/*---------------------------------------------------------------------------- +| Each of the following `typedef's defines the most convenient type that holds +| integers of at least as many bits as specified. For example, `uint8' should +| be the most convenient type that can hold unsigned integers of as many as +| 8 bits. The `flag' type must be able to hold either a 0 or 1. For most +| implementations of C, `flag', `uint8', and `int8' should all be `typedef'ed +| to the same as `int'. +*----------------------------------------------------------------------------*/ +typedef uint8_t flag; +typedef uint8_t uint8; +typedef int8_t int8; +typedef unsigned int uint32; +typedef signed int int32; +typedef uint64_t uint64; +typedef int64_t int64; + +#define LIT64( a ) a##LL +#define INLINE static inline + +#define STATUS_PARAM , float_status *status +#define STATUS(field) status->field +#define STATUS_VAR , status + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point ordering relations +*----------------------------------------------------------------------------*/ +enum { + float_relation_less = -1, + float_relation_equal = 0, + float_relation_greater = 1, + float_relation_unordered = 2 +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point types. +*----------------------------------------------------------------------------*/ +/* Use structures for soft-float types. This prevents accidentally mixing + them with native int/float types. A sufficiently clever compiler and + sane ABI should be able to see though these structs. However + x86/gcc 3.x seems to struggle a bit, so leave them disabled by default. */ +//#define USE_SOFTFLOAT_STRUCT_TYPES +#ifdef USE_SOFTFLOAT_STRUCT_TYPES +typedef struct { + uint16_t v; +} float16; +#define float16_val(x) (((float16)(x)).v) +#define make_float16(x) __extension__ ({ float16 f16_val = {x}; f16_val; }) +#define const_float16(x) { x } +typedef struct { + uint32_t v; +} float32; +/* The cast ensures an error if the wrong type is passed. */ +#define float32_val(x) (((float32)(x)).v) +#define make_float32(x) __extension__ ({ float32 f32_val = {x}; f32_val; }) +#define const_float32(x) { x } +typedef struct { + uint64_t v; +} float64; +#define float64_val(x) (((float64)(x)).v) +#define make_float64(x) __extension__ ({ float64 f64_val = {x}; f64_val; }) +#define const_float64(x) { x } +#else +typedef uint16_t float16; +typedef uint32_t float32; +typedef uint64_t float64; +#define float16_val(x) (x) +#define float32_val(x) (x) +#define float64_val(x) (x) +#define make_float16(x) (x) +#define make_float32(x) (x) +#define make_float64(x) (x) +#define const_float16(x) (x) +#define const_float32(x) (x) +#define const_float64(x) (x) +#endif +typedef struct { + uint64_t low; + uint16_t high; +} floatx80; +#define make_floatx80(exp, mant) ((floatx80) { mant, exp }) +#define make_floatx80_init(exp, mant) { .low = mant, .high = exp } +typedef struct { +#ifdef HOST_WORDS_BIGENDIAN + uint64_t high, low; +#else + uint64_t low, high; +#endif +} float128; +#define make_float128(high_, low_) ((float128) { .high = high_, .low = low_ }) +#define make_float128_init(high_, low_) { .high = high_, .low = low_ } + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point underflow tininess-detection mode. +*----------------------------------------------------------------------------*/ +enum { + float_tininess_after_rounding = 0, + float_tininess_before_rounding = 1 +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point rounding mode. +*----------------------------------------------------------------------------*/ +enum { + float_round_nearest_even = 0, + float_round_down = 1, + float_round_up = 2, + float_round_to_zero = 3 +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point exception flags. +*----------------------------------------------------------------------------*/ +enum { + float_flag_invalid = 1, + float_flag_divbyzero = 4, + float_flag_overflow = 8, + float_flag_underflow = 16, + float_flag_inexact = 32, + float_flag_input_denormal = 64, + float_flag_output_denormal = 128 +}; + +typedef struct float_status { + signed char float_detect_tininess; + signed char float_rounding_mode; + signed char float_exception_flags; + signed char floatx80_rounding_precision; + /* should denormalised results go to zero and set the inexact flag? */ + flag flush_to_zero; + /* should denormalised inputs go to zero and set the input_denormal flag? */ + flag flush_inputs_to_zero; + flag default_nan_mode; +} float_status; + +void set_float_rounding_mode(int val STATUS_PARAM); +void set_float_exception_flags(int val STATUS_PARAM); +INLINE void set_float_detect_tininess(int val STATUS_PARAM) +{ + STATUS(float_detect_tininess) = val; +} +INLINE void set_flush_to_zero(flag val STATUS_PARAM) +{ + STATUS(flush_to_zero) = val; +} +INLINE void set_flush_inputs_to_zero(flag val STATUS_PARAM) +{ + STATUS(flush_inputs_to_zero) = val; +} +INLINE void set_default_nan_mode(flag val STATUS_PARAM) +{ + STATUS(default_nan_mode) = val; +} +INLINE int get_float_exception_flags(float_status *status) +{ + return STATUS(float_exception_flags); +} +void set_floatx80_rounding_precision(int val STATUS_PARAM); + +/*---------------------------------------------------------------------------- +| Routine to raise any or all of the software IEC/IEEE floating-point +| exception flags. +*----------------------------------------------------------------------------*/ +void float_raise( int8 flags STATUS_PARAM); + +/*---------------------------------------------------------------------------- +| Options to indicate which negations to perform in float*_muladd() +| Using these differs from negating an input or output before calling +| the muladd function in that this means that a NaN doesn't have its +| sign bit inverted before it is propagated. +*----------------------------------------------------------------------------*/ +enum { + float_muladd_negate_c = 1, + float_muladd_negate_product = 2, + float_muladd_negate_result = 4, +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE integer-to-floating-point conversion routines. +*----------------------------------------------------------------------------*/ +float32 int32_to_float32( int32 STATUS_PARAM ); +float64 int32_to_float64( int32 STATUS_PARAM ); +float32 uint32_to_float32( uint32 STATUS_PARAM ); +float64 uint32_to_float64( uint32 STATUS_PARAM ); +floatx80 int32_to_floatx80( int32 STATUS_PARAM ); +float128 int32_to_float128( int32 STATUS_PARAM ); +float32 int64_to_float32( int64 STATUS_PARAM ); +float32 uint64_to_float32( uint64 STATUS_PARAM ); +float64 int64_to_float64( int64 STATUS_PARAM ); +float64 uint64_to_float64( uint64 STATUS_PARAM ); +floatx80 int64_to_floatx80( int64 STATUS_PARAM ); +float128 int64_to_float128( int64 STATUS_PARAM ); +float128 uint64_to_float128( uint64 STATUS_PARAM ); + +/*---------------------------------------------------------------------------- +| Software half-precision conversion routines. +*----------------------------------------------------------------------------*/ +float16 float32_to_float16( float32, flag STATUS_PARAM ); +float32 float16_to_float32( float16, flag STATUS_PARAM ); + +/*---------------------------------------------------------------------------- +| Software half-precision operations. +*----------------------------------------------------------------------------*/ +int float16_is_quiet_nan( float16 ); +int float16_is_signaling_nan( float16 ); +float16 float16_maybe_silence_nan( float16 ); + +INLINE int float16_is_any_nan(float16 a) +{ + return ((float16_val(a) & ~0x8000) > 0x7c00); +} + +/*---------------------------------------------------------------------------- +| The pattern for a default generated half-precision NaN. +*----------------------------------------------------------------------------*/ +extern const float16 float16_default_nan; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision conversion routines. +*----------------------------------------------------------------------------*/ +int_fast16_t float32_to_int16_round_to_zero(float32 STATUS_PARAM); +uint_fast16_t float32_to_uint16_round_to_zero(float32 STATUS_PARAM); +int32 float32_to_int32( float32 STATUS_PARAM ); +int32 float32_to_int32_round_to_zero( float32 STATUS_PARAM ); +uint32 float32_to_uint32( float32 STATUS_PARAM ); +uint32 float32_to_uint32_round_to_zero( float32 STATUS_PARAM ); +int64 float32_to_int64( float32 STATUS_PARAM ); +int64 float32_to_int64_round_to_zero( float32 STATUS_PARAM ); +float64 float32_to_float64( float32 STATUS_PARAM ); +floatx80 float32_to_floatx80( float32 STATUS_PARAM ); +float128 float32_to_float128( float32 STATUS_PARAM ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision operations. +*----------------------------------------------------------------------------*/ +float32 float32_round_to_int( float32 STATUS_PARAM ); +float32 float32_add( float32, float32 STATUS_PARAM ); +float32 float32_sub( float32, float32 STATUS_PARAM ); +float32 float32_mul( float32, float32 STATUS_PARAM ); +float32 float32_div( float32, float32 STATUS_PARAM ); +float32 float32_rem( float32, float32 STATUS_PARAM ); +float32 float32_muladd(float32, float32, float32, int STATUS_PARAM); +float32 float32_sqrt( float32 STATUS_PARAM ); +float32 float32_exp2( float32 STATUS_PARAM ); +float32 float32_log2( float32 STATUS_PARAM ); +int float32_eq( float32, float32 STATUS_PARAM ); +int float32_le( float32, float32 STATUS_PARAM ); +int float32_lt( float32, float32 STATUS_PARAM ); +int float32_unordered( float32, float32 STATUS_PARAM ); +int float32_eq_quiet( float32, float32 STATUS_PARAM ); +int float32_le_quiet( float32, float32 STATUS_PARAM ); +int float32_lt_quiet( float32, float32 STATUS_PARAM ); +int float32_unordered_quiet( float32, float32 STATUS_PARAM ); +int float32_compare( float32, float32 STATUS_PARAM ); +int float32_compare_quiet( float32, float32 STATUS_PARAM ); +float32 float32_min(float32, float32 STATUS_PARAM); +float32 float32_max(float32, float32 STATUS_PARAM); +int float32_is_quiet_nan( float32 ); +int float32_is_signaling_nan( float32 ); +float32 float32_maybe_silence_nan( float32 ); +float32 float32_scalbn( float32, int STATUS_PARAM ); + +INLINE float32 float32_abs(float32 a) +{ + /* Note that abs does *not* handle NaN specially, nor does + * it flush denormal inputs to zero. + */ + return make_float32(float32_val(a) & 0x7fffffff); +} + +INLINE float32 float32_chs(float32 a) +{ + /* Note that chs does *not* handle NaN specially, nor does + * it flush denormal inputs to zero. + */ + return make_float32(float32_val(a) ^ 0x80000000); +} + +INLINE int float32_is_infinity(float32 a) +{ + return (float32_val(a) & 0x7fffffff) == 0x7f800000; +} + +INLINE int float32_is_neg(float32 a) +{ + return float32_val(a) >> 31; +} + +INLINE int float32_is_zero(float32 a) +{ + return (float32_val(a) & 0x7fffffff) == 0; +} + +INLINE int float32_is_any_nan(float32 a) +{ + return ((float32_val(a) & ~(1 << 31)) > 0x7f800000UL); +} + +INLINE int float32_is_zero_or_denormal(float32 a) +{ + return (float32_val(a) & 0x7f800000) == 0; +} + +INLINE float32 float32_set_sign(float32 a, int sign) +{ + return make_float32((float32_val(a) & 0x7fffffff) | (sign << 31)); +} + +#define float32_zero make_float32(0) +#define float32_one make_float32(0x3f800000) +#define float32_ln2 make_float32(0x3f317218) +#define float32_pi make_float32(0x40490fdb) +#define float32_half make_float32(0x3f000000) +#define float32_infinity make_float32(0x7f800000) + + +/*---------------------------------------------------------------------------- +| The pattern for a default generated single-precision NaN. +*----------------------------------------------------------------------------*/ +extern const float32 float32_default_nan; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision conversion routines. +*----------------------------------------------------------------------------*/ +int_fast16_t float64_to_int16_round_to_zero(float64 STATUS_PARAM); +uint_fast16_t float64_to_uint16_round_to_zero(float64 STATUS_PARAM); +int32 float64_to_int32( float64 STATUS_PARAM ); +int32 float64_to_int32_round_to_zero( float64 STATUS_PARAM ); +uint32 float64_to_uint32( float64 STATUS_PARAM ); +uint32 float64_to_uint32_round_to_zero( float64 STATUS_PARAM ); +int64 float64_to_int64( float64 STATUS_PARAM ); +int64 float64_to_int64_round_to_zero( float64 STATUS_PARAM ); +uint64 float64_to_uint64 (float64 a STATUS_PARAM); +uint64 float64_to_uint64_round_to_zero (float64 a STATUS_PARAM); +float32 float64_to_float32( float64 STATUS_PARAM ); +floatx80 float64_to_floatx80( float64 STATUS_PARAM ); +float128 float64_to_float128( float64 STATUS_PARAM ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision operations. +*----------------------------------------------------------------------------*/ +float64 float64_round_to_int( float64 STATUS_PARAM ); +float64 float64_trunc_to_int( float64 STATUS_PARAM ); +float64 float64_add( float64, float64 STATUS_PARAM ); +float64 float64_sub( float64, float64 STATUS_PARAM ); +float64 float64_mul( float64, float64 STATUS_PARAM ); +float64 float64_div( float64, float64 STATUS_PARAM ); +float64 float64_rem( float64, float64 STATUS_PARAM ); +float64 float64_muladd(float64, float64, float64, int STATUS_PARAM); +float64 float64_sqrt( float64 STATUS_PARAM ); +float64 float64_log2( float64 STATUS_PARAM ); +int float64_eq( float64, float64 STATUS_PARAM ); +int float64_le( float64, float64 STATUS_PARAM ); +int float64_lt( float64, float64 STATUS_PARAM ); +int float64_unordered( float64, float64 STATUS_PARAM ); +int float64_eq_quiet( float64, float64 STATUS_PARAM ); +int float64_le_quiet( float64, float64 STATUS_PARAM ); +int float64_lt_quiet( float64, float64 STATUS_PARAM ); +int float64_unordered_quiet( float64, float64 STATUS_PARAM ); +int float64_compare( float64, float64 STATUS_PARAM ); +int float64_compare_quiet( float64, float64 STATUS_PARAM ); +float64 float64_min(float64, float64 STATUS_PARAM); +float64 float64_max(float64, float64 STATUS_PARAM); +int float64_is_quiet_nan( float64 a ); +int float64_is_signaling_nan( float64 ); +float64 float64_maybe_silence_nan( float64 ); +float64 float64_scalbn( float64, int STATUS_PARAM ); + +INLINE float64 float64_abs(float64 a) +{ + /* Note that abs does *not* handle NaN specially, nor does + * it flush denormal inputs to zero. + */ + return make_float64(float64_val(a) & 0x7fffffffffffffffLL); +} + +INLINE float64 float64_chs(float64 a) +{ + /* Note that chs does *not* handle NaN specially, nor does + * it flush denormal inputs to zero. + */ + return make_float64(float64_val(a) ^ 0x8000000000000000LL); +} + +INLINE int float64_is_infinity(float64 a) +{ + return (float64_val(a) & 0x7fffffffffffffffLL ) == 0x7ff0000000000000LL; +} + +INLINE int float64_is_neg(float64 a) +{ + return float64_val(a) >> 63; +} + +INLINE int float64_is_zero(float64 a) +{ + return (float64_val(a) & 0x7fffffffffffffffLL) == 0; +} + +INLINE int float64_is_any_nan(float64 a) +{ + return ((float64_val(a) & ~(1ULL << 63)) > 0x7ff0000000000000ULL); +} + +INLINE int float64_is_zero_or_denormal(float64 a) +{ + return (float64_val(a) & 0x7ff0000000000000LL) == 0; +} + +INLINE float64 float64_set_sign(float64 a, int sign) +{ + return make_float64((float64_val(a) & 0x7fffffffffffffffULL) + | ((int64_t)sign << 63)); +} + +#define float64_zero make_float64(0) +#define float64_one make_float64(0x3ff0000000000000LL) +#define float64_ln2 make_float64(0x3fe62e42fefa39efLL) +#define float64_pi make_float64(0x400921fb54442d18LL) +#define float64_half make_float64(0x3fe0000000000000LL) +#define float64_infinity make_float64(0x7ff0000000000000LL) + +/*---------------------------------------------------------------------------- +| The pattern for a default generated double-precision NaN. +*----------------------------------------------------------------------------*/ +extern const float64 float64_default_nan; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision conversion routines. +*----------------------------------------------------------------------------*/ +int32 floatx80_to_int32( floatx80 STATUS_PARAM ); +int32 floatx80_to_int32_round_to_zero( floatx80 STATUS_PARAM ); +int64 floatx80_to_int64( floatx80 STATUS_PARAM ); +int64 floatx80_to_int64_round_to_zero( floatx80 STATUS_PARAM ); +float32 floatx80_to_float32( floatx80 STATUS_PARAM ); +float64 floatx80_to_float64( floatx80 STATUS_PARAM ); +float128 floatx80_to_float128( floatx80 STATUS_PARAM ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision operations. +*----------------------------------------------------------------------------*/ +floatx80 floatx80_round_to_int( floatx80 STATUS_PARAM ); +floatx80 floatx80_add( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_sub( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_mul( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_div( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_rem( floatx80, floatx80 STATUS_PARAM ); +floatx80 floatx80_sqrt( floatx80 STATUS_PARAM ); +int floatx80_eq( floatx80, floatx80 STATUS_PARAM ); +int floatx80_le( floatx80, floatx80 STATUS_PARAM ); +int floatx80_lt( floatx80, floatx80 STATUS_PARAM ); +int floatx80_unordered( floatx80, floatx80 STATUS_PARAM ); +int floatx80_eq_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_le_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_lt_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_unordered_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_compare( floatx80, floatx80 STATUS_PARAM ); +int floatx80_compare_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_is_quiet_nan( floatx80 ); +int floatx80_is_signaling_nan( floatx80 ); +floatx80 floatx80_maybe_silence_nan( floatx80 ); +floatx80 floatx80_scalbn( floatx80, int STATUS_PARAM ); + +INLINE floatx80 floatx80_abs(floatx80 a) +{ + a.high &= 0x7fff; + return a; +} + +INLINE floatx80 floatx80_chs(floatx80 a) +{ + a.high ^= 0x8000; + return a; +} + +INLINE int floatx80_is_infinity(floatx80 a) +{ + return (a.high & 0x7fff) == 0x7fff && a.low == 0x8000000000000000LL; +} + +INLINE int floatx80_is_neg(floatx80 a) +{ + return a.high >> 15; +} + +INLINE int floatx80_is_zero(floatx80 a) +{ + return (a.high & 0x7fff) == 0 && a.low == 0; +} + +INLINE int floatx80_is_zero_or_denormal(floatx80 a) +{ + return (a.high & 0x7fff) == 0; +} + +INLINE int floatx80_is_any_nan(floatx80 a) +{ + return ((a.high & 0x7fff) == 0x7fff) && (a.low<<1); +} + +#define floatx80_zero make_floatx80(0x0000, 0x0000000000000000LL) +#define floatx80_one make_floatx80(0x3fff, 0x8000000000000000LL) +#define floatx80_ln2 make_floatx80(0x3ffe, 0xb17217f7d1cf79acLL) +#define floatx80_pi make_floatx80(0x4000, 0xc90fdaa22168c235LL) +#define floatx80_half make_floatx80(0x3ffe, 0x8000000000000000LL) +#define floatx80_infinity make_floatx80(0x7fff, 0x8000000000000000LL) + +/*---------------------------------------------------------------------------- +| The pattern for a default generated extended double-precision NaN. +*----------------------------------------------------------------------------*/ +extern const floatx80 floatx80_default_nan; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE quadruple-precision conversion routines. +*----------------------------------------------------------------------------*/ +int32 float128_to_int32( float128 STATUS_PARAM ); +int32 float128_to_int32_round_to_zero( float128 STATUS_PARAM ); +int64 float128_to_int64( float128 STATUS_PARAM ); +int64 float128_to_int64_round_to_zero( float128 STATUS_PARAM ); +float32 float128_to_float32( float128 STATUS_PARAM ); +float64 float128_to_float64( float128 STATUS_PARAM ); +floatx80 float128_to_floatx80( float128 STATUS_PARAM ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE quadruple-precision operations. +*----------------------------------------------------------------------------*/ +float128 float128_round_to_int( float128 STATUS_PARAM ); +float128 float128_add( float128, float128 STATUS_PARAM ); +float128 float128_sub( float128, float128 STATUS_PARAM ); +float128 float128_mul( float128, float128 STATUS_PARAM ); +float128 float128_div( float128, float128 STATUS_PARAM ); +float128 float128_rem( float128, float128 STATUS_PARAM ); +float128 float128_sqrt( float128 STATUS_PARAM ); +int float128_eq( float128, float128 STATUS_PARAM ); +int float128_le( float128, float128 STATUS_PARAM ); +int float128_lt( float128, float128 STATUS_PARAM ); +int float128_unordered( float128, float128 STATUS_PARAM ); +int float128_eq_quiet( float128, float128 STATUS_PARAM ); +int float128_le_quiet( float128, float128 STATUS_PARAM ); +int float128_lt_quiet( float128, float128 STATUS_PARAM ); +int float128_unordered_quiet( float128, float128 STATUS_PARAM ); +int float128_compare( float128, float128 STATUS_PARAM ); +int float128_compare_quiet( float128, float128 STATUS_PARAM ); +int float128_is_quiet_nan( float128 ); +int float128_is_signaling_nan( float128 ); +float128 float128_maybe_silence_nan( float128 ); +float128 float128_scalbn( float128, int STATUS_PARAM ); + +INLINE float128 float128_abs(float128 a) +{ + a.high &= 0x7fffffffffffffffLL; + return a; +} + +INLINE float128 float128_chs(float128 a) +{ + a.high ^= 0x8000000000000000LL; + return a; +} + +INLINE int float128_is_infinity(float128 a) +{ + return (a.high & 0x7fffffffffffffffLL) == 0x7fff000000000000LL && a.low == 0; +} + +INLINE int float128_is_neg(float128 a) +{ + return a.high >> 63; +} + +INLINE int float128_is_zero(float128 a) +{ + return (a.high & 0x7fffffffffffffffLL) == 0 && a.low == 0; +} + +INLINE int float128_is_zero_or_denormal(float128 a) +{ + return (a.high & 0x7fff000000000000LL) == 0; +} + +INLINE int float128_is_any_nan(float128 a) +{ + return ((a.high >> 48) & 0x7fff) == 0x7fff && + ((a.low != 0) || ((a.high & 0xffffffffffffLL) != 0)); +} + +#define float128_zero make_float128(0, 0) + +/*---------------------------------------------------------------------------- +| The pattern for a default generated quadruple-precision NaN. +*----------------------------------------------------------------------------*/ +extern const float128 float128_default_nan; + +#endif /* !SOFTFLOAT_H */ diff --git a/contrib/qemu/include/glib-compat.h b/contrib/qemu/include/glib-compat.h new file mode 100644 index 000000000..8aa77afd6 --- /dev/null +++ b/contrib/qemu/include/glib-compat.h @@ -0,0 +1,27 @@ +/* + * GLIB Compatibility Functions + * + * Copyright IBM, Corp. 2013 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_GLIB_COMPAT_H +#define QEMU_GLIB_COMPAT_H + +#include <glib.h> + +#if !GLIB_CHECK_VERSION(2, 14, 0) +static inline guint g_timeout_add_seconds(guint interval, GSourceFunc function, + gpointer data) +{ + return g_timeout_add(interval * 1000, function, data); +} +#endif + +#endif diff --git a/contrib/qemu/include/migration/migration.h b/contrib/qemu/include/migration/migration.h new file mode 100644 index 000000000..bc9fde0b2 --- /dev/null +++ b/contrib/qemu/include/migration/migration.h @@ -0,0 +1,157 @@ +/* + * QEMU live migration + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_MIGRATION_H +#define QEMU_MIGRATION_H + +#include "qapi/qmp/qdict.h" +#include "qemu-common.h" +#include "qemu/thread.h" +#include "qemu/notify.h" +#include "qapi/error.h" +#include "migration/vmstate.h" +#include "qapi-types.h" +#include "exec/cpu-common.h" + +struct MigrationParams { + bool blk; + bool shared; +}; + +typedef struct MigrationState MigrationState; + +struct MigrationState +{ + int64_t bandwidth_limit; + size_t bytes_xfer; + size_t xfer_limit; + QemuThread thread; + QEMUBH *cleanup_bh; + QEMUFile *file; + + int state; + MigrationParams params; + double mbps; + int64_t total_time; + int64_t downtime; + int64_t expected_downtime; + int64_t dirty_pages_rate; + int64_t dirty_bytes_rate; + bool enabled_capabilities[MIGRATION_CAPABILITY_MAX]; + int64_t xbzrle_cache_size; +}; + +void process_incoming_migration(QEMUFile *f); + +void qemu_start_incoming_migration(const char *uri, Error **errp); + +uint64_t migrate_max_downtime(void); + +void do_info_migrate_print(Monitor *mon, const QObject *data); + +void do_info_migrate(Monitor *mon, QObject **ret_data); + +void exec_start_incoming_migration(const char *host_port, Error **errp); + +void exec_start_outgoing_migration(MigrationState *s, const char *host_port, Error **errp); + +void tcp_start_incoming_migration(const char *host_port, Error **errp); + +void tcp_start_outgoing_migration(MigrationState *s, const char *host_port, Error **errp); + +void unix_start_incoming_migration(const char *path, Error **errp); + +void unix_start_outgoing_migration(MigrationState *s, const char *path, Error **errp); + +void fd_start_incoming_migration(const char *path, Error **errp); + +void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp); + +void migrate_fd_error(MigrationState *s); + +void migrate_fd_connect(MigrationState *s); + +int migrate_fd_close(MigrationState *s); + +void add_migration_state_change_notifier(Notifier *notify); +void remove_migration_state_change_notifier(Notifier *notify); +bool migration_is_active(MigrationState *); +bool migration_has_finished(MigrationState *); +bool migration_has_failed(MigrationState *); +MigrationState *migrate_get_current(void); + +uint64_t ram_bytes_remaining(void); +uint64_t ram_bytes_transferred(void); +uint64_t ram_bytes_total(void); + +void acct_update_position(QEMUFile *f, size_t size, bool zero); + +extern SaveVMHandlers savevm_ram_handlers; + +uint64_t dup_mig_bytes_transferred(void); +uint64_t dup_mig_pages_transferred(void); +uint64_t skipped_mig_bytes_transferred(void); +uint64_t skipped_mig_pages_transferred(void); +uint64_t norm_mig_bytes_transferred(void); +uint64_t norm_mig_pages_transferred(void); +uint64_t xbzrle_mig_bytes_transferred(void); +uint64_t xbzrle_mig_pages_transferred(void); +uint64_t xbzrle_mig_pages_overflow(void); +uint64_t xbzrle_mig_pages_cache_miss(void); + +/** + * @migrate_add_blocker - prevent migration from proceeding + * + * @reason - an error to be returned whenever migration is attempted + */ +void migrate_add_blocker(Error *reason); + +/** + * @migrate_del_blocker - remove a blocking error from migration + * + * @reason - the error blocking migration + */ +void migrate_del_blocker(Error *reason); + +bool migrate_rdma_pin_all(void); + +bool migrate_auto_converge(void); + +int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen, + uint8_t *dst, int dlen); +int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen); + +int migrate_use_xbzrle(void); +int64_t migrate_xbzrle_cache_size(void); + +int64_t xbzrle_cache_resize(int64_t new_size); + +void ram_control_before_iterate(QEMUFile *f, uint64_t flags); +void ram_control_after_iterate(QEMUFile *f, uint64_t flags); +void ram_control_load_hook(QEMUFile *f, uint64_t flags); + +/* Whenever this is found in the data stream, the flags + * will be passed to ram_control_load_hook in the incoming-migration + * side. This lets before_ram_iterate/after_ram_iterate add + * transport-specific sections to the RAM migration data. + */ +#define RAM_SAVE_FLAG_HOOK 0x80 + +#define RAM_SAVE_CONTROL_NOT_SUPP -1000 +#define RAM_SAVE_CONTROL_DELAYED -2000 + +size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, + ram_addr_t offset, size_t size, + int *bytes_sent); + +#endif diff --git a/contrib/qemu/include/migration/qemu-file.h b/contrib/qemu/include/migration/qemu-file.h new file mode 100644 index 000000000..0f757fbeb --- /dev/null +++ b/contrib/qemu/include/migration/qemu-file.h @@ -0,0 +1,266 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef QEMU_FILE_H +#define QEMU_FILE_H 1 +#include "exec/cpu-common.h" + +/* This function writes a chunk of data to a file at the given position. + * The pos argument can be ignored if the file is only being used for + * streaming. The handler should try to write all of the data it can. + */ +typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf, + int64_t pos, int size); + +/* Read a chunk of data from a file at the given position. The pos argument + * can be ignored if the file is only be used for streaming. The number of + * bytes actually read should be returned. + */ +typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf, + int64_t pos, int size); + +/* Close a file + * + * Return negative error number on error, 0 or positive value on success. + * + * The meaning of return value on success depends on the specific back-end being + * used. + */ +typedef int (QEMUFileCloseFunc)(void *opaque); + +/* Called to return the OS file descriptor associated to the QEMUFile. + */ +typedef int (QEMUFileGetFD)(void *opaque); + +/* + * This function writes an iovec to file. + */ +typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov, + int iovcnt, int64_t pos); + +/* + * This function provides hooks around different + * stages of RAM migration. + */ +typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags); + +/* + * Constants used by ram_control_* hooks + */ +#define RAM_CONTROL_SETUP 0 +#define RAM_CONTROL_ROUND 1 +#define RAM_CONTROL_HOOK 2 +#define RAM_CONTROL_FINISH 3 + +/* + * This function allows override of where the RAM page + * is saved (such as RDMA, for example.) + */ +typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque, + ram_addr_t block_offset, + ram_addr_t offset, + size_t size, + int *bytes_sent); + +typedef struct QEMUFileOps { + QEMUFilePutBufferFunc *put_buffer; + QEMUFileGetBufferFunc *get_buffer; + QEMUFileCloseFunc *close; + QEMUFileGetFD *get_fd; + QEMUFileWritevBufferFunc *writev_buffer; + QEMURamHookFunc *before_ram_iterate; + QEMURamHookFunc *after_ram_iterate; + QEMURamHookFunc *hook_ram_load; + QEMURamSaveFunc *save_page; +} QEMUFileOps; + +QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops); +QEMUFile *qemu_fopen(const char *filename, const char *mode); +QEMUFile *qemu_fdopen(int fd, const char *mode); +QEMUFile *qemu_fopen_socket(int fd, const char *mode); +QEMUFile *qemu_popen_cmd(const char *command, const char *mode); +int qemu_get_fd(QEMUFile *f); +int qemu_fclose(QEMUFile *f); +int64_t qemu_ftell(QEMUFile *f); +void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size); +void qemu_put_byte(QEMUFile *f, int v); +/* + * put_buffer without copying the buffer. + * The buffer should be available till it is sent asynchronously. + */ +void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size); +bool qemu_file_mode_is_not_valid(const char *mode); + +static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v) +{ + qemu_put_byte(f, (int)v); +} + +#define qemu_put_sbyte qemu_put_byte + +void qemu_put_be16(QEMUFile *f, unsigned int v); +void qemu_put_be32(QEMUFile *f, unsigned int v); +void qemu_put_be64(QEMUFile *f, uint64_t v); +int qemu_get_buffer(QEMUFile *f, uint8_t *buf, int size); +int qemu_get_byte(QEMUFile *f); +void qemu_update_position(QEMUFile *f, size_t size); + +static inline unsigned int qemu_get_ubyte(QEMUFile *f) +{ + return (unsigned int)qemu_get_byte(f); +} + +#define qemu_get_sbyte qemu_get_byte + +unsigned int qemu_get_be16(QEMUFile *f); +unsigned int qemu_get_be32(QEMUFile *f); +uint64_t qemu_get_be64(QEMUFile *f); + +int qemu_file_rate_limit(QEMUFile *f); +void qemu_file_reset_rate_limit(QEMUFile *f); +void qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate); +int64_t qemu_file_get_rate_limit(QEMUFile *f); +int qemu_file_get_error(QEMUFile *f); +void qemu_fflush(QEMUFile *f); + +static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv) +{ + qemu_put_be64(f, *pv); +} + +static inline void qemu_put_be32s(QEMUFile *f, const uint32_t *pv) +{ + qemu_put_be32(f, *pv); +} + +static inline void qemu_put_be16s(QEMUFile *f, const uint16_t *pv) +{ + qemu_put_be16(f, *pv); +} + +static inline void qemu_put_8s(QEMUFile *f, const uint8_t *pv) +{ + qemu_put_byte(f, *pv); +} + +static inline void qemu_get_be64s(QEMUFile *f, uint64_t *pv) +{ + *pv = qemu_get_be64(f); +} + +static inline void qemu_get_be32s(QEMUFile *f, uint32_t *pv) +{ + *pv = qemu_get_be32(f); +} + +static inline void qemu_get_be16s(QEMUFile *f, uint16_t *pv) +{ + *pv = qemu_get_be16(f); +} + +static inline void qemu_get_8s(QEMUFile *f, uint8_t *pv) +{ + *pv = qemu_get_byte(f); +} + +// Signed versions for type safety +static inline void qemu_put_sbuffer(QEMUFile *f, const int8_t *buf, int size) +{ + qemu_put_buffer(f, (const uint8_t *)buf, size); +} + +static inline void qemu_put_sbe16(QEMUFile *f, int v) +{ + qemu_put_be16(f, (unsigned int)v); +} + +static inline void qemu_put_sbe32(QEMUFile *f, int v) +{ + qemu_put_be32(f, (unsigned int)v); +} + +static inline void qemu_put_sbe64(QEMUFile *f, int64_t v) +{ + qemu_put_be64(f, (uint64_t)v); +} + +static inline size_t qemu_get_sbuffer(QEMUFile *f, int8_t *buf, int size) +{ + return qemu_get_buffer(f, (uint8_t *)buf, size); +} + +static inline int qemu_get_sbe16(QEMUFile *f) +{ + return (int)qemu_get_be16(f); +} + +static inline int qemu_get_sbe32(QEMUFile *f) +{ + return (int)qemu_get_be32(f); +} + +static inline int64_t qemu_get_sbe64(QEMUFile *f) +{ + return (int64_t)qemu_get_be64(f); +} + +static inline void qemu_put_s8s(QEMUFile *f, const int8_t *pv) +{ + qemu_put_8s(f, (const uint8_t *)pv); +} + +static inline void qemu_put_sbe16s(QEMUFile *f, const int16_t *pv) +{ + qemu_put_be16s(f, (const uint16_t *)pv); +} + +static inline void qemu_put_sbe32s(QEMUFile *f, const int32_t *pv) +{ + qemu_put_be32s(f, (const uint32_t *)pv); +} + +static inline void qemu_put_sbe64s(QEMUFile *f, const int64_t *pv) +{ + qemu_put_be64s(f, (const uint64_t *)pv); +} + +static inline void qemu_get_s8s(QEMUFile *f, int8_t *pv) +{ + qemu_get_8s(f, (uint8_t *)pv); +} + +static inline void qemu_get_sbe16s(QEMUFile *f, int16_t *pv) +{ + qemu_get_be16s(f, (uint16_t *)pv); +} + +static inline void qemu_get_sbe32s(QEMUFile *f, int32_t *pv) +{ + qemu_get_be32s(f, (uint32_t *)pv); +} + +static inline void qemu_get_sbe64s(QEMUFile *f, int64_t *pv) +{ + qemu_get_be64s(f, (uint64_t *)pv); +} +#endif diff --git a/contrib/qemu/include/migration/vmstate.h b/contrib/qemu/include/migration/vmstate.h new file mode 100644 index 000000000..1c31b5d6f --- /dev/null +++ b/contrib/qemu/include/migration/vmstate.h @@ -0,0 +1,740 @@ +/* + * QEMU migration/snapshot declarations + * + * Copyright (c) 2009-2011 Red Hat, Inc. + * + * Original author: Juan Quintela <quintela@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef QEMU_VMSTATE_H +#define QEMU_VMSTATE_H 1 + +#ifndef CONFIG_USER_ONLY +#include <migration/qemu-file.h> +#endif + +typedef void SaveStateHandler(QEMUFile *f, void *opaque); +typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id); + +typedef struct SaveVMHandlers { + /* This runs inside the iothread lock. */ + void (*set_params)(const MigrationParams *params, void * opaque); + SaveStateHandler *save_state; + + void (*cancel)(void *opaque); + int (*save_live_complete)(QEMUFile *f, void *opaque); + + /* This runs both outside and inside the iothread lock. */ + bool (*is_active)(void *opaque); + + /* This runs outside the iothread lock in the migration case, and + * within the lock in the savevm case. The callback had better only + * use data that is local to the migration thread or protected + * by other locks. + */ + int (*save_live_iterate)(QEMUFile *f, void *opaque); + + /* This runs outside the iothread lock! */ + int (*save_live_setup)(QEMUFile *f, void *opaque); + uint64_t (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size); + + LoadStateHandler *load_state; +} SaveVMHandlers; + +int register_savevm(DeviceState *dev, + const char *idstr, + int instance_id, + int version_id, + SaveStateHandler *save_state, + LoadStateHandler *load_state, + void *opaque); + +int register_savevm_live(DeviceState *dev, + const char *idstr, + int instance_id, + int version_id, + SaveVMHandlers *ops, + void *opaque); + +void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque); +void register_device_unmigratable(DeviceState *dev, const char *idstr, + void *opaque); + + +typedef struct VMStateInfo VMStateInfo; +typedef struct VMStateDescription VMStateDescription; + +struct VMStateInfo { + const char *name; + int (*get)(QEMUFile *f, void *pv, size_t size); + void (*put)(QEMUFile *f, void *pv, size_t size); +}; + +enum VMStateFlags { + VMS_SINGLE = 0x001, + VMS_POINTER = 0x002, + VMS_ARRAY = 0x004, + VMS_STRUCT = 0x008, + VMS_VARRAY_INT32 = 0x010, /* Array with size in int32_t field*/ + VMS_BUFFER = 0x020, /* static sized buffer */ + VMS_ARRAY_OF_POINTER = 0x040, + VMS_VARRAY_UINT16 = 0x080, /* Array with size in uint16_t field */ + VMS_VBUFFER = 0x100, /* Buffer with size in int32_t field */ + VMS_MULTIPLY = 0x200, /* multiply "size" field by field_size */ + VMS_VARRAY_UINT8 = 0x400, /* Array with size in uint8_t field*/ + VMS_VARRAY_UINT32 = 0x800, /* Array with size in uint32_t field*/ +}; + +typedef struct { + const char *name; + size_t offset; + size_t size; + size_t start; + int num; + size_t num_offset; + size_t size_offset; + const VMStateInfo *info; + enum VMStateFlags flags; + const VMStateDescription *vmsd; + int version_id; + bool (*field_exists)(void *opaque, int version_id); +} VMStateField; + +typedef struct VMStateSubsection { + const VMStateDescription *vmsd; + bool (*needed)(void *opaque); +} VMStateSubsection; + +struct VMStateDescription { + const char *name; + int unmigratable; + int version_id; + int minimum_version_id; + int minimum_version_id_old; + LoadStateHandler *load_state_old; + int (*pre_load)(void *opaque); + int (*post_load)(void *opaque, int version_id); + void (*pre_save)(void *opaque); + VMStateField *fields; + const VMStateSubsection *subsections; +}; + +#ifdef CONFIG_USER_ONLY +extern const VMStateDescription vmstate_dummy; +#endif + +extern const VMStateInfo vmstate_info_bool; + +extern const VMStateInfo vmstate_info_int8; +extern const VMStateInfo vmstate_info_int16; +extern const VMStateInfo vmstate_info_int32; +extern const VMStateInfo vmstate_info_int64; + +extern const VMStateInfo vmstate_info_uint8_equal; +extern const VMStateInfo vmstate_info_uint16_equal; +extern const VMStateInfo vmstate_info_int32_equal; +extern const VMStateInfo vmstate_info_uint32_equal; +extern const VMStateInfo vmstate_info_uint64_equal; +extern const VMStateInfo vmstate_info_int32_le; + +extern const VMStateInfo vmstate_info_uint8; +extern const VMStateInfo vmstate_info_uint16; +extern const VMStateInfo vmstate_info_uint32; +extern const VMStateInfo vmstate_info_uint64; + +extern const VMStateInfo vmstate_info_float64; + +extern const VMStateInfo vmstate_info_timer; +extern const VMStateInfo vmstate_info_buffer; +extern const VMStateInfo vmstate_info_unused_buffer; +extern const VMStateInfo vmstate_info_bitmap; + +#define type_check_2darray(t1,t2,n,m) ((t1(*)[n][m])0 - (t2*)0) +#define type_check_array(t1,t2,n) ((t1(*)[n])0 - (t2*)0) +#define type_check_pointer(t1,t2) ((t1**)0 - (t2*)0) + +#define vmstate_offset_value(_state, _field, _type) \ + (offsetof(_state, _field) + \ + type_check(_type, typeof_field(_state, _field))) + +#define vmstate_offset_pointer(_state, _field, _type) \ + (offsetof(_state, _field) + \ + type_check_pointer(_type, typeof_field(_state, _field))) + +#define vmstate_offset_array(_state, _field, _type, _num) \ + (offsetof(_state, _field) + \ + type_check_array(_type, typeof_field(_state, _field), _num)) + +#define vmstate_offset_2darray(_state, _field, _type, _n1, _n2) \ + (offsetof(_state, _field) + \ + type_check_2darray(_type, typeof_field(_state, _field), _n1, _n2)) + +#define vmstate_offset_sub_array(_state, _field, _type, _start) \ + (offsetof(_state, _field[_start])) + +#define vmstate_offset_buffer(_state, _field) \ + vmstate_offset_array(_state, _field, uint8_t, \ + sizeof(typeof_field(_state, _field))) + +#define VMSTATE_SINGLE_TEST(_field, _state, _test, _version, _info, _type) { \ + .name = (stringify(_field)), \ + .version_id = (_version), \ + .field_exists = (_test), \ + .size = sizeof(_type), \ + .info = &(_info), \ + .flags = VMS_SINGLE, \ + .offset = vmstate_offset_value(_state, _field, _type), \ +} + +#define VMSTATE_POINTER(_field, _state, _version, _info, _type) { \ + .name = (stringify(_field)), \ + .version_id = (_version), \ + .info = &(_info), \ + .size = sizeof(_type), \ + .flags = VMS_SINGLE|VMS_POINTER, \ + .offset = vmstate_offset_value(_state, _field, _type), \ +} + +#define VMSTATE_POINTER_TEST(_field, _state, _test, _info, _type) { \ + .name = (stringify(_field)), \ + .info = &(_info), \ + .field_exists = (_test), \ + .size = sizeof(_type), \ + .flags = VMS_SINGLE|VMS_POINTER, \ + .offset = vmstate_offset_value(_state, _field, _type), \ +} + +#define VMSTATE_ARRAY(_field, _state, _num, _version, _info, _type) {\ + .name = (stringify(_field)), \ + .version_id = (_version), \ + .num = (_num), \ + .info = &(_info), \ + .size = sizeof(_type), \ + .flags = VMS_ARRAY, \ + .offset = vmstate_offset_array(_state, _field, _type, _num), \ +} + +#define VMSTATE_2DARRAY(_field, _state, _n1, _n2, _version, _info, _type) { \ + .name = (stringify(_field)), \ + .version_id = (_version), \ + .num = (_n1) * (_n2), \ + .info = &(_info), \ + .size = sizeof(_type), \ + .flags = VMS_ARRAY, \ + .offset = vmstate_offset_2darray(_state, _field, _type, _n1, _n2), \ +} + +#define VMSTATE_ARRAY_TEST(_field, _state, _num, _test, _info, _type) {\ + .name = (stringify(_field)), \ + .field_exists = (_test), \ + .num = (_num), \ + .info = &(_info), \ + .size = sizeof(_type), \ + .flags = VMS_ARRAY, \ + .offset = vmstate_offset_array(_state, _field, _type, _num),\ +} + +#define VMSTATE_SUB_ARRAY(_field, _state, _start, _num, _version, _info, _type) { \ + .name = (stringify(_field)), \ + .version_id = (_version), \ + .num = (_num), \ + .info = &(_info), \ + .size = sizeof(_type), \ + .flags = VMS_ARRAY, \ + .offset = vmstate_offset_sub_array(_state, _field, _type, _start), \ +} + +#define VMSTATE_ARRAY_INT32_UNSAFE(_field, _state, _field_num, _info, _type) {\ + .name = (stringify(_field)), \ + .num_offset = vmstate_offset_value(_state, _field_num, int32_t), \ + .info = &(_info), \ + .size = sizeof(_type), \ + .flags = VMS_VARRAY_INT32, \ + .offset = offsetof(_state, _field), \ +} + +#define VMSTATE_VARRAY_INT32(_field, _state, _field_num, _version, _info, _type) {\ + .name = (stringify(_field)), \ + .version_id = (_version), \ + .num_offset = vmstate_offset_value(_state, _field_num, int32_t), \ + .info = &(_info), \ + .size = sizeof(_type), \ + .flags = VMS_VARR |