diff options
Diffstat (limited to 'contrib/qemu/block')
-rw-r--r-- | contrib/qemu/block/qcow.c | 914 | ||||
-rw-r--r-- | contrib/qemu/block/qcow2-cache.c | 323 | ||||
-rw-r--r-- | contrib/qemu/block/qcow2-cluster.c | 1478 | ||||
-rw-r--r-- | contrib/qemu/block/qcow2-refcount.c | 1374 | ||||
-rw-r--r-- | contrib/qemu/block/qcow2-snapshot.c | 660 | ||||
-rw-r--r-- | contrib/qemu/block/qcow2.c | 1825 | ||||
-rw-r--r-- | contrib/qemu/block/qcow2.h | 437 | ||||
-rw-r--r-- | contrib/qemu/block/qed-check.c | 248 | ||||
-rw-r--r-- | contrib/qemu/block/qed-cluster.c | 165 | ||||
-rw-r--r-- | contrib/qemu/block/qed-gencb.c | 32 | ||||
-rw-r--r-- | contrib/qemu/block/qed-l2-cache.c | 187 | ||||
-rw-r--r-- | contrib/qemu/block/qed-table.c | 296 | ||||
-rw-r--r-- | contrib/qemu/block/qed.c | 1596 | ||||
-rw-r--r-- | contrib/qemu/block/qed.h | 344 | ||||
-rw-r--r-- | contrib/qemu/block/snapshot.c | 157 |
15 files changed, 0 insertions, 10036 deletions
diff --git a/contrib/qemu/block/qcow.c b/contrib/qemu/block/qcow.c deleted file mode 100644 index 5239bd68f1c..00000000000 --- a/contrib/qemu/block/qcow.c +++ /dev/null @@ -1,914 +0,0 @@ -/* - * Block driver for the QCOW format - * - * Copyright (c) 2004-2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu-common.h" -#include "block/block_int.h" -#include "qemu/module.h" -#include <zlib.h> -#include "qemu/aes.h" -#include "migration/migration.h" - -/**************************************************************/ -/* QEMU COW block driver with compression and encryption support */ - -#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) -#define QCOW_VERSION 1 - -#define QCOW_CRYPT_NONE 0 -#define QCOW_CRYPT_AES 1 - -#define QCOW_OFLAG_COMPRESSED (1LL << 63) - -typedef struct QCowHeader { - uint32_t magic; - uint32_t version; - uint64_t backing_file_offset; - uint32_t backing_file_size; - uint32_t mtime; - uint64_t size; /* in bytes */ - uint8_t cluster_bits; - uint8_t l2_bits; - uint32_t crypt_method; - uint64_t l1_table_offset; -} QCowHeader; - -#define L2_CACHE_SIZE 16 - -typedef struct BDRVQcowState { - int cluster_bits; - int cluster_size; - int cluster_sectors; - int l2_bits; - int l2_size; - int l1_size; - uint64_t cluster_offset_mask; - uint64_t l1_table_offset; - uint64_t *l1_table; - uint64_t *l2_cache; - uint64_t l2_cache_offsets[L2_CACHE_SIZE]; - uint32_t l2_cache_counts[L2_CACHE_SIZE]; - uint8_t *cluster_cache; - uint8_t *cluster_data; - uint64_t cluster_cache_offset; - uint32_t crypt_method; /* current crypt method, 0 if no key yet */ - uint32_t crypt_method_header; - AES_KEY aes_encrypt_key; - AES_KEY aes_decrypt_key; - CoMutex lock; - Error *migration_blocker; -} BDRVQcowState; - -static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); - -static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) -{ - const QCowHeader *cow_header = (const void *)buf; - - if (buf_size >= sizeof(QCowHeader) && - be32_to_cpu(cow_header->magic) == QCOW_MAGIC && - be32_to_cpu(cow_header->version) == QCOW_VERSION) - return 100; - else - return 0; -} - -static int qcow_open(BlockDriverState *bs, QDict *options, int flags) -{ - BDRVQcowState *s = bs->opaque; - int len, i, shift, ret; - QCowHeader header; - - ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); - if (ret < 0) { - goto fail; - } - be32_to_cpus(&header.magic); - be32_to_cpus(&header.version); - be64_to_cpus(&header.backing_file_offset); - be32_to_cpus(&header.backing_file_size); - be32_to_cpus(&header.mtime); - be64_to_cpus(&header.size); - be32_to_cpus(&header.crypt_method); - be64_to_cpus(&header.l1_table_offset); - - if (header.magic != QCOW_MAGIC) { - ret = -EMEDIUMTYPE; - goto fail; - } - if (header.version != QCOW_VERSION) { - char version[64]; - snprintf(version, sizeof(version), "QCOW version %d", header.version); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bs->device_name, "qcow", version); - ret = -ENOTSUP; - goto fail; - } - - if (header.size <= 1 || header.cluster_bits < 9) { - ret = -EINVAL; - goto fail; - } - if (header.crypt_method > QCOW_CRYPT_AES) { - ret = -EINVAL; - goto fail; - } - s->crypt_method_header = header.crypt_method; - if (s->crypt_method_header) { - bs->encrypted = 1; - } - s->cluster_bits = header.cluster_bits; - s->cluster_size = 1 << s->cluster_bits; - s->cluster_sectors = 1 << (s->cluster_bits - 9); - s->l2_bits = header.l2_bits; - s->l2_size = 1 << s->l2_bits; - bs->total_sectors = header.size / 512; - s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1; - - /* read the level 1 table */ - shift = s->cluster_bits + s->l2_bits; - s->l1_size = (header.size + (1LL << shift) - 1) >> shift; - - s->l1_table_offset = header.l1_table_offset; - s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); - - ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, - s->l1_size * sizeof(uint64_t)); - if (ret < 0) { - goto fail; - } - - for(i = 0;i < s->l1_size; i++) { - be64_to_cpus(&s->l1_table[i]); - } - /* alloc L2 cache */ - s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); - s->cluster_cache = g_malloc(s->cluster_size); - s->cluster_data = g_malloc(s->cluster_size); - s->cluster_cache_offset = -1; - - /* read the backing file name */ - if (header.backing_file_offset != 0) { - len = header.backing_file_size; - if (len > 1023) { - len = 1023; - } - ret = bdrv_pread(bs->file, header.backing_file_offset, - bs->backing_file, len); - if (ret < 0) { - goto fail; - } - bs->backing_file[len] = '\0'; - } - - /* Disable migration when qcow images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "qcow", bs->device_name, "live migration"); - migrate_add_blocker(s->migration_blocker); - - qemu_co_mutex_init(&s->lock); - return 0; - - fail: - g_free(s->l1_table); - g_free(s->l2_cache); - g_free(s->cluster_cache); - g_free(s->cluster_data); - return ret; -} - - -/* We have nothing to do for QCOW reopen, stubs just return - * success */ -static int qcow_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - return 0; -} - -static int qcow_set_key(BlockDriverState *bs, const char *key) -{ - BDRVQcowState *s = bs->opaque; - uint8_t keybuf[16]; - int len, i; - - memset(keybuf, 0, 16); - len = strlen(key); - if (len > 16) - len = 16; - /* XXX: we could compress the chars to 7 bits to increase - entropy */ - for(i = 0;i < len;i++) { - keybuf[i] = key[i]; - } - s->crypt_method = s->crypt_method_header; - - if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) - return -1; - if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) - return -1; - return 0; -} - -/* The crypt function is compatible with the linux cryptoloop - algorithm for < 4 GB images. NOTE: out_buf == in_buf is - supported */ -static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, int enc, - const AES_KEY *key) -{ - union { - uint64_t ll[2]; - uint8_t b[16]; - } ivec; - int i; - - for(i = 0; i < nb_sectors; i++) { - ivec.ll[0] = cpu_to_le64(sector_num); - ivec.ll[1] = 0; - AES_cbc_encrypt(in_buf, out_buf, 512, key, - ivec.b, enc); - sector_num++; - in_buf += 512; - out_buf += 512; - } -} - -/* 'allocate' is: - * - * 0 to not allocate. - * - * 1 to allocate a normal cluster (for sector indexes 'n_start' to - * 'n_end') - * - * 2 to allocate a compressed cluster of size - * 'compressed_size'. 'compressed_size' must be > 0 and < - * cluster_size - * - * return 0 if not allocated. - */ -static uint64_t get_cluster_offset(BlockDriverState *bs, - uint64_t offset, int allocate, - int compressed_size, - int n_start, int n_end) -{ - BDRVQcowState *s = bs->opaque; - int min_index, i, j, l1_index, l2_index; - uint64_t l2_offset, *l2_table, cluster_offset, tmp; - uint32_t min_count; - int new_l2_table; - - l1_index = offset >> (s->l2_bits + s->cluster_bits); - l2_offset = s->l1_table[l1_index]; - new_l2_table = 0; - if (!l2_offset) { - if (!allocate) - return 0; - /* allocate a new l2 entry */ - l2_offset = bdrv_getlength(bs->file); - /* round to cluster size */ - l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); - /* update the L1 entry */ - s->l1_table[l1_index] = l2_offset; - tmp = cpu_to_be64(l2_offset); - if (bdrv_pwrite_sync(bs->file, - s->l1_table_offset + l1_index * sizeof(tmp), - &tmp, sizeof(tmp)) < 0) - return 0; - new_l2_table = 1; - } - for(i = 0; i < L2_CACHE_SIZE; i++) { - if (l2_offset == s->l2_cache_offsets[i]) { - /* increment the hit count */ - if (++s->l2_cache_counts[i] == 0xffffffff) { - for(j = 0; j < L2_CACHE_SIZE; j++) { - s->l2_cache_counts[j] >>= 1; - } - } - l2_table = s->l2_cache + (i << s->l2_bits); - goto found; - } - } - /* not found: load a new entry in the least used one */ - min_index = 0; - min_count = 0xffffffff; - for(i = 0; i < L2_CACHE_SIZE; i++) { - if (s->l2_cache_counts[i] < min_count) { - min_count = s->l2_cache_counts[i]; - min_index = i; - } - } - l2_table = s->l2_cache + (min_index << s->l2_bits); - if (new_l2_table) { - memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); - if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table, - s->l2_size * sizeof(uint64_t)) < 0) - return 0; - } else { - if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != - s->l2_size * sizeof(uint64_t)) - return 0; - } - s->l2_cache_offsets[min_index] = l2_offset; - s->l2_cache_counts[min_index] = 1; - found: - l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); - cluster_offset = be64_to_cpu(l2_table[l2_index]); - if (!cluster_offset || - ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) { - if (!allocate) - return 0; - /* allocate a new cluster */ - if ((cluster_offset & QCOW_OFLAG_COMPRESSED) && - (n_end - n_start) < s->cluster_sectors) { - /* if the cluster is already compressed, we must - decompress it in the case it is not completely - overwritten */ - if (decompress_cluster(bs, cluster_offset) < 0) - return 0; - cluster_offset = bdrv_getlength(bs->file); - cluster_offset = (cluster_offset + s->cluster_size - 1) & - ~(s->cluster_size - 1); - /* write the cluster content */ - if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) != - s->cluster_size) - return -1; - } else { - cluster_offset = bdrv_getlength(bs->file); - if (allocate == 1) { - /* round to cluster size */ - cluster_offset = (cluster_offset + s->cluster_size - 1) & - ~(s->cluster_size - 1); - bdrv_truncate(bs->file, cluster_offset + s->cluster_size); - /* if encrypted, we must initialize the cluster - content which won't be written */ - if (s->crypt_method && - (n_end - n_start) < s->cluster_sectors) { - uint64_t start_sect; - start_sect = (offset & ~(s->cluster_size - 1)) >> 9; - memset(s->cluster_data + 512, 0x00, 512); - for(i = 0; i < s->cluster_sectors; i++) { - if (i < n_start || i >= n_end) { - encrypt_sectors(s, start_sect + i, - s->cluster_data, - s->cluster_data + 512, 1, 1, - &s->aes_encrypt_key); - if (bdrv_pwrite(bs->file, cluster_offset + i * 512, - s->cluster_data, 512) != 512) - return -1; - } - } - } - } else if (allocate == 2) { - cluster_offset |= QCOW_OFLAG_COMPRESSED | - (uint64_t)compressed_size << (63 - s->cluster_bits); - } - } - /* update L2 table */ - tmp = cpu_to_be64(cluster_offset); - l2_table[l2_index] = tmp; - if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp), - &tmp, sizeof(tmp)) < 0) - return 0; - } - return cluster_offset; -} - -static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum) -{ - BDRVQcowState *s = bs->opaque; - int index_in_cluster, n; - uint64_t cluster_offset; - - qemu_co_mutex_lock(&s->lock); - cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); - qemu_co_mutex_unlock(&s->lock); - index_in_cluster = sector_num & (s->cluster_sectors - 1); - n = s->cluster_sectors - index_in_cluster; - if (n > nb_sectors) - n = nb_sectors; - *pnum = n; - return (cluster_offset != 0); -} - -static int decompress_buffer(uint8_t *out_buf, int out_buf_size, - const uint8_t *buf, int buf_size) -{ - z_stream strm1, *strm = &strm1; - int ret, out_len; - - memset(strm, 0, sizeof(*strm)); - - strm->next_in = (uint8_t *)buf; - strm->avail_in = buf_size; - strm->next_out = out_buf; - strm->avail_out = out_buf_size; - - ret = inflateInit2(strm, -12); - if (ret != Z_OK) - return -1; - ret = inflate(strm, Z_FINISH); - out_len = strm->next_out - out_buf; - if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || - out_len != out_buf_size) { - inflateEnd(strm); - return -1; - } - inflateEnd(strm); - return 0; -} - -static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) -{ - BDRVQcowState *s = bs->opaque; - int ret, csize; - uint64_t coffset; - - coffset = cluster_offset & s->cluster_offset_mask; - if (s->cluster_cache_offset != coffset) { - csize = cluster_offset >> (63 - s->cluster_bits); - csize &= (s->cluster_size - 1); - ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize); - if (ret != csize) - return -1; - if (decompress_buffer(s->cluster_cache, s->cluster_size, - s->cluster_data, csize) < 0) { - return -1; - } - s->cluster_cache_offset = coffset; - } - return 0; -} - -static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - BDRVQcowState *s = bs->opaque; - int index_in_cluster; - int ret = 0, n; - uint64_t cluster_offset; - struct iovec hd_iov; - QEMUIOVector hd_qiov; - uint8_t *buf; - void *orig_buf; - - if (qiov->niov > 1) { - buf = orig_buf = qemu_blockalign(bs, qiov->size); - } else { - orig_buf = NULL; - buf = (uint8_t *)qiov->iov->iov_base; - } - - qemu_co_mutex_lock(&s->lock); - - while (nb_sectors != 0) { - /* prepare next request */ - cluster_offset = get_cluster_offset(bs, sector_num << 9, - 0, 0, 0, 0); - index_in_cluster = sector_num & (s->cluster_sectors - 1); - n = s->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } - - if (!cluster_offset) { - if (bs->backing_hd) { - /* read from the base image */ - hd_iov.iov_base = (void *)buf; - hd_iov.iov_len = n * 512; - qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->backing_hd, sector_num, - n, &hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - goto fail; - } - } else { - /* Note: in this case, no need to wait */ - memset(buf, 0, 512 * n); - } - } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { - /* add AIO support for compressed blocks ? */ - if (decompress_cluster(bs, cluster_offset) < 0) { - goto fail; - } - memcpy(buf, - s->cluster_cache + index_in_cluster * 512, 512 * n); - } else { - if ((cluster_offset & 511) != 0) { - goto fail; - } - hd_iov.iov_base = (void *)buf; - hd_iov.iov_len = n * 512; - qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file, - (cluster_offset >> 9) + index_in_cluster, - n, &hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - break; - } - if (s->crypt_method) { - encrypt_sectors(s, sector_num, buf, buf, - n, 0, - &s->aes_decrypt_key); - } - } - ret = 0; - - nb_sectors -= n; - sector_num += n; - buf += n * 512; - } - -done: - qemu_co_mutex_unlock(&s->lock); - - if (qiov->niov > 1) { - qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size); - qemu_vfree(orig_buf); - } - - return ret; - -fail: - ret = -EIO; - goto done; -} - -static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - BDRVQcowState *s = bs->opaque; - int index_in_cluster; - uint64_t cluster_offset; - const uint8_t *src_buf; - int ret = 0, n; - uint8_t *cluster_data = NULL; - struct iovec hd_iov; - QEMUIOVector hd_qiov; - uint8_t *buf; - void *orig_buf; - - s->cluster_cache_offset = -1; /* disable compressed cache */ - - if (qiov->niov > 1) { - buf = orig_buf = qemu_blockalign(bs, qiov->size); - qemu_iovec_to_buf(qiov, 0, buf, qiov->size); - } else { - orig_buf = NULL; - buf = (uint8_t *)qiov->iov->iov_base; - } - - qemu_co_mutex_lock(&s->lock); - - while (nb_sectors != 0) { - - index_in_cluster = sector_num & (s->cluster_sectors - 1); - n = s->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } - cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0, - index_in_cluster, - index_in_cluster + n); - if (!cluster_offset || (cluster_offset & 511) != 0) { - ret = -EIO; - break; - } - if (s->crypt_method) { - if (!cluster_data) { - cluster_data = g_malloc0(s->cluster_size); - } - encrypt_sectors(s, sector_num, cluster_data, buf, - n, 1, &s->aes_encrypt_key); - src_buf = cluster_data; - } else { - src_buf = buf; - } - - hd_iov.iov_base = (void *)src_buf; - hd_iov.iov_len = n * 512; - qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_writev(bs->file, - (cluster_offset >> 9) + index_in_cluster, - n, &hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - break; - } - ret = 0; - - nb_sectors -= n; - sector_num += n; - buf += n * 512; - } - qemu_co_mutex_unlock(&s->lock); - - if (qiov->niov > 1) { - qemu_vfree(orig_buf); - } - g_free(cluster_data); - - return ret; -} - -static void qcow_close(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - - g_free(s->l1_table); - g_free(s->l2_cache); - g_free(s->cluster_cache); - g_free(s->cluster_data); - - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); -} - -static int qcow_create(const char *filename, QEMUOptionParameter *options) -{ - int header_size, backing_filename_len, l1_size, shift, i; - QCowHeader header; - uint8_t *tmp; - int64_t total_size = 0; - const char *backing_file = NULL; - int flags = 0; - int ret; - BlockDriverState *qcow_bs; - - /* Read out options */ - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - total_size = options->value.n / 512; - } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { - backing_file = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { - flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; - } - options++; - } - - ret = bdrv_create_file(filename, options); - if (ret < 0) { - return ret; - } - - ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR); - if (ret < 0) { - return ret; - } - - ret = bdrv_truncate(qcow_bs, 0); - if (ret < 0) { - goto exit; - } - - memset(&header, 0, sizeof(header)); - header.magic = cpu_to_be32(QCOW_MAGIC); - header.version = cpu_to_be32(QCOW_VERSION); - header.size = cpu_to_be64(total_size * 512); - header_size = sizeof(header); - backing_filename_len = 0; - if (backing_file) { - if (strcmp(backing_file, "fat:")) { - header.backing_file_offset = cpu_to_be64(header_size); - backing_filename_len = strlen(backing_file); - header.backing_file_size = cpu_to_be32(backing_filename_len); - header_size += backing_filename_len; - } else { - /* special backing file for vvfat */ - backing_file = NULL; - } - header.cluster_bits = 9; /* 512 byte cluster to avoid copying - unmodifyed sectors */ - header.l2_bits = 12; /* 32 KB L2 tables */ - } else { - header.cluster_bits = 12; /* 4 KB clusters */ - header.l2_bits = 9; /* 4 KB L2 tables */ - } - header_size = (header_size + 7) & ~7; - shift = header.cluster_bits + header.l2_bits; - l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift; - - header.l1_table_offset = cpu_to_be64(header_size); - if (flags & BLOCK_FLAG_ENCRYPT) { - header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); - } else { - header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); - } - - /* write all the data */ - ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header)); - if (ret != sizeof(header)) { - goto exit; - } - - if (backing_file) { - ret = bdrv_pwrite(qcow_bs, sizeof(header), - backing_file, backing_filename_len); - if (ret != backing_filename_len) { - goto exit; - } - } - - tmp = g_malloc0(BDRV_SECTOR_SIZE); - for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ - BDRV_SECTOR_SIZE); i++) { - ret = bdrv_pwrite(qcow_bs, header_size + - BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); - if (ret != BDRV_SECTOR_SIZE) { - g_free(tmp); - goto exit; - } - } - - g_free(tmp); - ret = 0; -exit: - bdrv_delete(qcow_bs); - return ret; -} - -static int qcow_make_empty(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - uint32_t l1_length = s->l1_size * sizeof(uint64_t); - int ret; - - memset(s->l1_table, 0, l1_length); - if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table, - l1_length) < 0) - return -1; - ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); - if (ret < 0) - return ret; - - memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); - memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t)); - memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t)); - - return 0; -} - -/* XXX: put compressed sectors first, then all the cluster aligned - tables to avoid losing bytes in alignment */ -static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - BDRVQcowState *s = bs->opaque; - z_stream strm; - int ret, out_len; - uint8_t *out_buf; - uint64_t cluster_offset; - - if (nb_sectors != s->cluster_sectors) { - ret = -EINVAL; - - /* Zero-pad last write if image size is not cluster aligned */ - if (sector_num + nb_sectors == bs->total_sectors && - nb_sectors < s->cluster_sectors) { - uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); - memset(pad_buf, 0, s->cluster_size); - memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); - ret = qcow_write_compressed(bs, sector_num, - pad_buf, s->cluster_sectors); - qemu_vfree(pad_buf); - } - return ret; - } - - out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); - - /* best compression, small window, no zlib header */ - memset(&strm, 0, sizeof(strm)); - ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, - Z_DEFLATED, -12, - 9, Z_DEFAULT_STRATEGY); - if (ret != 0) { - ret = -EINVAL; - goto fail; - } - - strm.avail_in = s->cluster_size; - strm.next_in = (uint8_t *)buf; - strm.avail_out = s->cluster_size; - strm.next_out = out_buf; - - ret = deflate(&strm, Z_FINISH); - if (ret != Z_STREAM_END && ret != Z_OK) { - deflateEnd(&strm); - ret = -EINVAL; - goto fail; - } - out_len = strm.next_out - out_buf; - - deflateEnd(&strm); - - if (ret != Z_STREAM_END || out_len >= s->cluster_size) { - /* could not compress: write normal cluster */ - ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); - if (ret < 0) { - goto fail; - } - } else { - cluster_offset = get_cluster_offset(bs, sector_num << 9, 2, - out_len, 0, 0); - if (cluster_offset == 0) { - ret = -EIO; - goto fail; - } - - cluster_offset &= s->cluster_offset_mask; - ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); - if (ret < 0) { - goto fail; - } - } - - ret = 0; -fail: - g_free(out_buf); - return ret; -} - -static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - BDRVQcowState *s = bs->opaque; - bdi->cluster_size = s->cluster_size; - return 0; -} - - -static QEMUOptionParameter qcow_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_BACKING_FILE, - .type = OPT_STRING, - .help = "File name of a base image" - }, - { - .name = BLOCK_OPT_ENCRYPT, - .type = OPT_FLAG, - .help = "Encrypt the image" - }, - { NULL } -}; - -static BlockDriver bdrv_qcow = { - .format_name = "qcow", - .instance_size = sizeof(BDRVQcowState), - .bdrv_probe = qcow_probe, - .bdrv_open = qcow_open, - .bdrv_close = qcow_close, - .bdrv_reopen_prepare = qcow_reopen_prepare, - .bdrv_create = qcow_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - - .bdrv_co_readv = qcow_co_readv, - .bdrv_co_writev = qcow_co_writev, - .bdrv_co_is_allocated = qcow_co_is_allocated, - - .bdrv_set_key = qcow_set_key, - .bdrv_make_empty = qcow_make_empty, - .bdrv_write_compressed = qcow_write_compressed, - .bdrv_get_info = qcow_get_info, - - .create_options = qcow_create_options, -}; - -static void bdrv_qcow_init(void) -{ - bdrv_register(&bdrv_qcow); -} - -block_init(bdrv_qcow_init); diff --git a/contrib/qemu/block/qcow2-cache.c b/contrib/qemu/block/qcow2-cache.c deleted file mode 100644 index 2f3114ecc24..00000000000 --- a/contrib/qemu/block/qcow2-cache.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * L2/refcount table cache for the QCOW2 format - * - * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com> - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "block/block_int.h" -#include "qemu-common.h" -#include "qcow2.h" -#include "trace.h" - -typedef struct Qcow2CachedTable { - void* table; - int64_t offset; - bool dirty; - int cache_hits; - int ref; -} Qcow2CachedTable; - -struct Qcow2Cache { - Qcow2CachedTable* entries; - struct Qcow2Cache* depends; - int size; - bool depends_on_flush; -}; - -Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) -{ - BDRVQcowState *s = bs->opaque; - Qcow2Cache *c; - int i; - - c = g_malloc0(sizeof(*c)); - c->size = num_tables; - c->entries = g_malloc0(sizeof(*c->entries) * num_tables); - - for (i = 0; i < c->size; i++) { - c->entries[i].table = qemu_blockalign(bs, s->cluster_size); - } - - return c; -} - -int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c) -{ - int i; - - for (i = 0; i < c->size; i++) { - assert(c->entries[i].ref == 0); - qemu_vfree(c->entries[i].table); - } - - g_free(c->entries); - g_free(c); - - return 0; -} - -static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c) -{ - int ret; - - ret = qcow2_cache_flush(bs, c->depends); - if (ret < 0) { - return ret; - } - - c->depends = NULL; - c->depends_on_flush = false; - - return 0; -} - -static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) -{ - BDRVQcowState *s = bs->opaque; - int ret = 0; - - if (!c->entries[i].dirty || !c->entries[i].offset) { - return 0; - } - - trace_qcow2_cache_entry_flush(qemu_coroutine_self(), - c == s->l2_table_cache, i); - - if (c->depends) { - ret = qcow2_cache_flush_dependency(bs, c); - } else if (c->depends_on_flush) { - ret = bdrv_flush(bs->file); - if (ret >= 0) { - c->depends_on_flush = false; - } - } - - if (ret < 0) { - return ret; - } - - if (c == s->refcount_block_cache) { - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); - } else if (c == s->l2_table_cache) { - BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); - } - - ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table, - s->cluster_size); - if (ret < 0) { - return ret; - } - - c->entries[i].dirty = false; - - return 0; -} - -int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) -{ - BDRVQcowState *s = bs->opaque; - int result = 0; - int ret; - int i; - - trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache); - - for (i = 0; i < c->size; i++) { - ret = qcow2_cache_entry_flush(bs, c, i); - if (ret < 0 && result != -ENOSPC) { - result = ret; - } - } - - if (result == 0) { - ret = bdrv_flush(bs->file); - if (ret < 0) { - result = ret; - } - } - - return result; -} - -int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, - Qcow2Cache *dependency) -{ - int ret; - - if (dependency->depends) { - ret = qcow2_cache_flush_dependency(bs, dependency); - if (ret < 0) { - return ret; - } - } - - if (c->depends && (c->depends != dependency)) { - ret = qcow2_cache_flush_dependency(bs, c); - if (ret < 0) { - return ret; - } - } - - c->depends = dependency; - return 0; -} - -void qcow2_cache_depends_on_flush(Qcow2Cache *c) -{ - c->depends_on_flush = true; -} - -static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c) -{ - int i; - int min_count = INT_MAX; - int min_index = -1; - - - for (i = 0; i < c->size; i++) { - if (c->entries[i].ref) { - continue; - } - - if (c->entries[i].cache_hits < min_count) { - min_index = i; - min_count = c->entries[i].cache_hits; - } - - /* Give newer hits priority */ - /* TODO Check how to optimize the replacement strategy */ - c->entries[i].cache_hits /= 2; - } - - if (min_index == -1) { - /* This can't happen in current synchronous code, but leave the check - * here as a reminder for whoever starts using AIO with the cache */ - abort(); - } - return min_index; -} - -static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, - uint64_t offset, void **table, bool read_from_disk) -{ - BDRVQcowState *s = bs->opaque; - int i; - int ret; - - trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache, - offset, read_from_disk); - - /* Check if the table is already cached */ - for (i = 0; i < c->size; i++) { - if (c->entries[i].offset == offset) { - goto found; - } - } - - /* If not, write a table back and replace it */ - i = qcow2_cache_find_entry_to_replace(c); - trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), - c == s->l2_table_cache, i); - if (i < 0) { - return i; - } - - ret = qcow2_cache_entry_flush(bs, c, i); - if (ret < 0) { - return ret; - } - - trace_qcow2_cache_get_read(qemu_coroutine_self(), - c == s->l2_table_cache, i); - c->entries[i].offset = 0; - if (read_from_disk) { - if (c == s->l2_table_cache) { - BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); - } - - ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size); - if (ret < 0) { - return ret; - } - } - - /* Give the table some hits for the start so that it won't be replaced - * immediately. The number 32 is completely arbitrary. */ - c->entries[i].cache_hits = 32; - c->entries[i].offset = offset; - - /* And return the right table */ -found: - c->entries[i].cache_hits++; - c->entries[i].ref++; - *table = c->entries[i].table; - - trace_qcow2_cache_get_done(qemu_coroutine_self(), - c == s->l2_table_cache, i); - - return 0; -} - -int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, - void **table) -{ - return qcow2_cache_do_get(bs, c, offset, table, true); -} - -int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, - void **table) -{ - return qcow2_cache_do_get(bs, c, offset, table, false); -} - -int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table) -{ - int i; - - for (i = 0; i < c->size; i++) { - if (c->entries[i].table == *table) { - goto found; - } - } - return -ENOENT; - -found: - c->entries[i].ref--; - *table = NULL; - - assert(c->entries[i].ref >= 0); - return 0; -} - -void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table) -{ - int i; - - for (i = 0; i < c->size; i++) { - if (c->entries[i].table == table) { - goto found; - } - } - abort(); - -found: - c->entries[i].dirty = true; -} diff --git a/contrib/qemu/block/qcow2-cluster.c b/contrib/qemu/block/qcow2-cluster.c deleted file mode 100644 index cca76d4fcdd..00000000000 --- a/contrib/qemu/block/qcow2-cluster.c +++ /dev/null @@ -1,1478 +0,0 @@ -/* - * Block driver for the QCOW version 2 format - * - * Copyright (c) 2004-2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include <zlib.h> - -#include "qemu-common.h" -#include "block/block_int.h" -#include "block/qcow2.h" -#include "trace.h" - -int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, - bool exact_size) -{ - BDRVQcowState *s = bs->opaque; - int new_l1_size2, ret, i; - uint64_t *new_l1_table; - int64_t new_l1_table_offset, new_l1_size; - uint8_t data[12]; - - if (min_size <= s->l1_size) - return 0; - - if (exact_size) { - new_l1_size = min_size; - } else { - /* Bump size up to reduce the number of times we have to grow */ - new_l1_size = s->l1_size; - if (new_l1_size == 0) { - new_l1_size = 1; - } - while (min_size > new_l1_size) { - new_l1_size = (new_l1_size * 3 + 1) / 2; - } - } - - if (new_l1_size > INT_MAX) { - return -EFBIG; - } - -#ifdef DEBUG_ALLOC2 - fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n", - s->l1_size, new_l1_size); -#endif - - new_l1_size2 = sizeof(uint64_t) * new_l1_size; - new_l1_table = g_malloc0(align_offset(new_l1_size2, 512)); - memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); - - /* write new table (align to cluster) */ - BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); - new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); - if (new_l1_table_offset < 0) { - g_free(new_l1_table); - return new_l1_table_offset; - } - - ret = qcow2_cache_flush(bs, s->refcount_block_cache); - if (ret < 0) { - goto fail; - } - - BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); - for(i = 0; i < s->l1_size; i++) - new_l1_table[i] = cpu_to_be64(new_l1_table[i]); - ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2); - if (ret < 0) - goto fail; - for(i = 0; i < s->l1_size; i++) - new_l1_table[i] = be64_to_cpu(new_l1_table[i]); - - /* set new table */ - BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); - cpu_to_be32w((uint32_t*)data, new_l1_size); - cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset); - ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data)); - if (ret < 0) { - goto fail; - } - g_free(s->l1_table); - qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t), - QCOW2_DISCARD_OTHER); - s->l1_table_offset = new_l1_table_offset; - s->l1_table = new_l1_table; - s->l1_size = new_l1_size; - return 0; - fail: - g_free(new_l1_table); - qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2, - QCOW2_DISCARD_OTHER); - return ret; -} - -/* - * l2_load - * - * Loads a L2 table into memory. If the table is in the cache, the cache - * is used; otherwise the L2 table is loaded from the image file. - * - * Returns a pointer to the L2 table on success, or NULL if the read from - * the image file failed. - */ - -static int l2_load(BlockDriverState *bs, uint64_t l2_offset, - uint64_t **l2_table) -{ - BDRVQcowState *s = bs->opaque; - int ret; - - ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table); - - return ret; -} - -/* - * Writes one sector of the L1 table to the disk (can't update single entries - * and we really don't want bdrv_pread to perform a read-modify-write) - */ -#define L1_ENTRIES_PER_SECTOR (512 / 8) -static int write_l1_entry(BlockDriverState *bs, int l1_index) -{ - BDRVQcowState *s = bs->opaque; - uint64_t buf[L1_ENTRIES_PER_SECTOR]; - int l1_start_index; - int i, ret; - - l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1); - for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) { - buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); - } - - BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); - ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index, - buf, sizeof(buf)); - if (ret < 0) { - return ret; - } - - return 0; -} - -/* - * l2_allocate - * - * Allocate a new l2 entry in the file. If l1_index points to an already - * used entry in the L2 table (i.e. we are doing a copy on write for the L2 - * table) copy the contents of the old L2 table into the newly allocated one. - * Otherwise the new table is initialized with zeros. - * - */ - -static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) -{ - BDRVQcowState *s = bs->opaque; - uint64_t old_l2_offset; - uint64_t *l2_table; - int64_t l2_offset; - int ret; - - old_l2_offset = s->l1_table[l1_index]; - - trace_qcow2_l2_allocate(bs, l1_index); - - /* allocate a new l2 entry */ - - l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); - if (l2_offset < 0) { - return l2_offset; - } - - ret = qcow2_cache_flush(bs, s->refcount_block_cache); - if (ret < 0) { - goto fail; - } - - /* allocate a new entry in the l2 cache */ - - trace_qcow2_l2_allocate_get_empty(bs, l1_index); - ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table); - if (ret < 0) { - return ret; - } - - l2_table = *table; - - if ((old_l2_offset & L1E_OFFSET_MASK) == 0) { - /* if there was no old l2 table, clear the new table */ - memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); - } else { - uint64_t* old_table; - - /* if there was an old l2 table, read it from the disk */ - BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ); - ret = qcow2_cache_get(bs, s->l2_table_cache, - old_l2_offset & L1E_OFFSET_MASK, - (void**) &old_table); - if (ret < 0) { - goto fail; - } - - memcpy(l2_table, old_table, s->cluster_size); - - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table); - if (ret < 0) { - goto fail; - } - } - - /* write the l2 table to the file */ - BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); - - trace_qcow2_l2_allocate_write_l2(bs, l1_index); - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); - ret = qcow2_cache_flush(bs, s->l2_table_cache); - if (ret < 0) { - goto fail; - } - - /* update the L1 entry */ - trace_qcow2_l2_allocate_write_l1(bs, l1_index); - s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; - ret = write_l1_entry(bs, l1_index); - if (ret < 0) { - goto fail; - } - - *table = l2_table; - trace_qcow2_l2_allocate_done(bs, l1_index, 0); - return 0; - -fail: - trace_qcow2_l2_allocate_done(bs, l1_index, ret); - qcow2_cache_put(bs, s->l2_table_cache, (void**) table); - s->l1_table[l1_index] = old_l2_offset; - return ret; -} - -/* - * Checks how many clusters in a given L2 table are contiguous in the image - * file. As soon as one of the flags in the bitmask stop_flags changes compared - * to the first cluster, the search is stopped and the cluster is not counted - * as contiguous. (This allows it, for example, to stop at the first compressed - * cluster which may require a different handling) - */ -static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, - uint64_t *l2_table, uint64_t start, uint64_t stop_flags) -{ - int i; - uint64_t mask = stop_flags | L2E_OFFSET_MASK; - uint64_t offset = be64_to_cpu(l2_table[0]) & mask; - - if (!offset) - return 0; - - for (i = start; i < start + nb_clusters; i++) { - uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; - if (offset + (uint64_t) i * cluster_size != l2_entry) { - break; - } - } - - return (i - start); -} - -static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table) -{ - int i; - - for (i = 0; i < nb_clusters; i++) { - int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i])); - - if (type != QCOW2_CLUSTER_UNALLOCATED) { - break; - } - } - - return i; -} - -/* The crypt function is compatible with the linux cryptoloop - algorithm for < 4 GB images. NOTE: out_buf == in_buf is - supported */ -void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, int enc, - const AES_KEY *key) -{ - union { - uint64_t ll[2]; - uint8_t b[16]; - } ivec; - int i; - - for(i = 0; i < nb_sectors; i++) { - ivec.ll[0] = cpu_to_le64(sector_num); - ivec.ll[1] = 0; - AES_cbc_encrypt(in_buf, out_buf, 512, key, - ivec.b, enc); - sector_num++; - in_buf += 512; - out_buf += 512; - } -} - -static int coroutine_fn copy_sectors(BlockDriverState *bs, - uint64_t start_sect, - uint64_t cluster_offset, - int n_start, int n_end) -{ - BDRVQcowState *s = bs->opaque; - QEMUIOVector qiov; - struct iovec iov; - int n, ret; - - /* - * If this is the last cluster and it is only partially used, we must only - * copy until the end of the image, or bdrv_check_request will fail for the - * bdrv_read/write calls below. - */ - if (start_sect + n_end > bs->total_sectors) { - n_end = bs->total_sectors - start_sect; - } - - n = n_end - n_start; - if (n <= 0) { - return 0; - } - - iov.iov_len = n * BDRV_SECTOR_SIZE; - iov.iov_base = qemu_blockalign(bs, iov.iov_len); - - qemu_iovec_init_external(&qiov, &iov, 1); - - BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); - - /* Call .bdrv_co_readv() directly instead of using the public block-layer - * interface. This avoids double I/O throttling and request tracking, - * which can lead to deadlock when block layer copy-on-read is enabled. - */ - ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov); - if (ret < 0) { - goto out; - } - - if (s->crypt_method) { - qcow2_encrypt_sectors(s, start_sect + n_start, - iov.iov_base, iov.iov_base, n, 1, - &s->aes_encrypt_key); - } - - BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); - ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov); - if (ret < 0) { - goto out; - } - - ret = 0; -out: - qemu_vfree(iov.iov_base); - return ret; -} - - -/* - * get_cluster_offset - * - * For a given offset of the disk image, find the cluster offset in - * qcow2 file. The offset is stored in *cluster_offset. - * - * on entry, *num is the number of contiguous sectors we'd like to - * access following offset. - * - * on exit, *num is the number of contiguous sectors we can read. - * - * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error - * cases. - */ -int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num, uint64_t *cluster_offset) -{ - BDRVQcowState *s = bs->opaque; - unsigned int l2_index; - uint64_t l1_index, l2_offset, *l2_table; - int l1_bits, c; - unsigned int index_in_cluster, nb_clusters; - uint64_t nb_available, nb_needed; - int ret; - - index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1); - nb_needed = *num + index_in_cluster; - - l1_bits = s->l2_bits + s->cluster_bits; - - /* compute how many bytes there are between the offset and - * the end of the l1 entry - */ - - nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)); - - /* compute the number of available sectors */ - - nb_available = (nb_available >> 9) + index_in_cluster; - - if (nb_needed > nb_available) { - nb_needed = nb_available; - } - - *cluster_offset = 0; - - /* seek the the l2 offset in the l1 table */ - - l1_index = offset >> l1_bits; - if (l1_index >= s->l1_size) { - ret = QCOW2_CLUSTER_UNALLOCATED; - goto out; - } - - l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; - if (!l2_offset) { - ret = QCOW2_CLUSTER_UNALLOCATED; - goto out; - } - - /* load the l2 table in memory */ - - ret = l2_load(bs, l2_offset, &l2_table); - if (ret < 0) { - return ret; - } - - /* find the cluster offset for the given disk offset */ - - l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); - *cluster_offset = be64_to_cpu(l2_table[l2_index]); - nb_clusters = size_to_clusters(s, nb_needed << 9); - - ret = qcow2_get_cluster_type(*cluster_offset); - switch (ret) { - case QCOW2_CLUSTER_COMPRESSED: - /* Compressed clusters can only be processed one by one */ - c = 1; - *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK; - break; - case QCOW2_CLUSTER_ZERO: - if (s->qcow_version < 3) { - return -EIO; - } - c = count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], 0, - QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); - *cluster_offset = 0; - break; - case QCOW2_CLUSTER_UNALLOCATED: - /* how many empty clusters ? */ - c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]); - *cluster_offset = 0; - break; - case QCOW2_CLUSTER_NORMAL: - /* how many allocated clusters ? */ - c = count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], 0, - QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); - *cluster_offset &= L2E_OFFSET_MASK; - break; - default: - abort(); - } - - qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - - nb_available = (c * s->cluster_sectors); - -out: - if (nb_available > nb_needed) - nb_available = nb_needed; - - *num = nb_available - index_in_cluster; - - return ret; -} - -/* - * get_cluster_table - * - * for a given disk offset, load (and allocate if needed) - * the l2 table. - * - * the l2 table offset in the qcow2 file and the cluster index - * in the l2 table are given to the caller. - * - * Returns 0 on success, -errno in failure case - */ -static int get_cluster_table(BlockDriverState *bs, uint64_t offset, - uint64_t **new_l2_table, - int *new_l2_index) -{ - BDRVQcowState *s = bs->opaque; - unsigned int l2_index; - uint64_t l1_index, l2_offset; - uint64_t *l2_table = NULL; - int ret; - - /* seek the the l2 offset in the l1 table */ - - l1_index = offset >> (s->l2_bits + s->cluster_bits); - if (l1_index >= s->l1_size) { - ret = qcow2_grow_l1_table(bs, l1_index + 1, false); - if (ret < 0) { - return ret; - } - } - - assert(l1_index < s->l1_size); - l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; - - /* seek the l2 table of the given l2 offset */ - - if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) { - /* load the l2 table in memory */ - ret = l2_load(bs, l2_offset, &l2_table); - if (ret < 0) { - return ret; - } - } else { - /* First allocate a new L2 table (and do COW if needed) */ - ret = l2_allocate(bs, l1_index, &l2_table); - if (ret < 0) { - return ret; - } - - /* Then decrease the refcount of the old table */ - if (l2_offset) { - qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), - QCOW2_DISCARD_OTHER); - } - } - - /* find the cluster offset for the given disk offset */ - - l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); - - *new_l2_table = l2_table; - *new_l2_index = l2_index; - - return 0; -} - -/* - * alloc_compressed_cluster_offset - * - * For a given offset of the disk image, return cluster offset in - * qcow2 file. - * - * If the offset is not found, allocate a new compressed cluster. - * - * Return the cluster offset if successful, - * Return 0, otherwise. - * - */ - -uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, - uint64_t offset, - int compressed_size) -{ - BDRVQcowState *s = bs->opaque; - int l2_index, ret; - uint64_t *l2_table; - int64_t cluster_offset; - int nb_csectors; - - ret = get_cluster_table(bs, offset, &l2_table, &l2_index); - if (ret < 0) { - return 0; - } - - /* Compression can't overwrite anything. Fail if the cluster was already - * allocated. */ - cluster_offset = be64_to_cpu(l2_table[l2_index]); - if (cluster_offset & L2E_OFFSET_MASK) { - qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - return 0; - } - - cluster_offset = qcow2_alloc_bytes(bs, compressed_size); - if (cluster_offset < 0) { - qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - return 0; - } - - nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) - - (cluster_offset >> 9); - - cluster_offset |= QCOW_OFLAG_COMPRESSED | - ((uint64_t)nb_csectors << s->csize_shift); - - /* update L2 table */ - - /* compressed clusters never have the copied flag */ - - BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); - l2_table[l2_index] = cpu_to_be64(cluster_offset); - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - return 0; - } - - return cluster_offset; -} - -static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) -{ - BDRVQcowState *s = bs->opaque; - int ret; - - if (r->nb_sectors == 0) { - return 0; - } - - qemu_co_mutex_unlock(&s->lock); - ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset, - r->offset / BDRV_SECTOR_SIZE, - r->offset / BDRV_SECTOR_SIZE + r->nb_sectors); - qemu_co_mutex_lock(&s->lock); - - if (ret < 0) { - return ret; - } - - /* - * Before we update the L2 table to actually point to the new cluster, we - * need to be sure that the refcounts have been increased and COW was - * handled. - */ - qcow2_cache_depends_on_flush(s->l2_table_cache); - - return 0; -} - -int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) -{ - BDRVQcowState *s = bs->opaque; - int i, j = 0, l2_index, ret; - uint64_t *old_cluster, *l2_table; - uint64_t cluster_offset = m->alloc_offset; - - trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); - assert(m->nb_clusters > 0); - - old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t)); - - /* copy content of unmodified sectors */ - ret = perform_cow(bs, m, &m->cow_start); - if (ret < 0) { - goto err; - } - - ret = perform_cow(bs, m, &m->cow_end); - if (ret < 0) { - goto err; - } - - /* Update L2 table. */ - if (s->use_lazy_refcounts) { - qcow2_mark_dirty(bs); - } - if (qcow2_need_accurate_refcounts(s)) { - qcow2_cache_set_dependency(bs, s->l2_table_cache, - s->refcount_block_cache); - } - - ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index); - if (ret < 0) { - goto err; - } - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); - - for (i = 0; i < m->nb_clusters; i++) { - /* if two concurrent writes happen to the same unallocated cluster - * each write allocates separate cluster and writes data concurrently. - * The first one to complete updates l2 table with pointer to its - * cluster the second one has to do RMW (which is done above by - * copy_sectors()), update l2 table with its cluster pointer and free - * old cluster. This is what this loop does */ - if(l2_table[l2_index + i] != 0) - old_cluster[j++] = l2_table[l2_index + i]; - - l2_table[l2_index + i] = cpu_to_be64((cluster_offset + - (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); - } - - - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - goto err; - } - - /* - * If this was a COW, we need to decrease the refcount of the old cluster. - * Also flush bs->file to get the right order for L2 and refcount update. - * - * Don't discard clusters that reach a refcount of 0 (e.g. compressed - * clusters), the next write will reuse them anyway. - */ - if (j != 0) { - for (i = 0; i < j; i++) { - qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1, - QCOW2_DISCARD_NEVER); - } - } - - ret = 0; -err: - g_free(old_cluster); - return ret; - } - -/* - * Returns the number of contiguous clusters that can be used for an allocating - * write, but require COW to be performed (this includes yet unallocated space, - * which must copy from the backing file) - */ -static int count_cow_clusters(BDRVQcowState *s, int nb_clusters, - uint64_t *l2_table, int l2_index) -{ - int i; - - for (i = 0; i < nb_clusters; i++) { - uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]); - int cluster_type = qcow2_get_cluster_type(l2_entry); - - switch(cluster_type) { - case QCOW2_CLUSTER_NORMAL: - if (l2_entry & QCOW_OFLAG_COPIED) { - goto out; - } - break; - case QCOW2_CLUSTER_UNALLOCATED: - case QCOW2_CLUSTER_COMPRESSED: - case QCOW2_CLUSTER_ZERO: - break; - default: - abort(); - } - } - -out: - assert(i <= nb_clusters); - return i; -} - -/* - * Check if there already is an AIO write request in flight which allocates - * the same cluster. In this case we need to wait until the previous - * request has completed and updated the L2 table accordingly. - * - * Returns: - * 0 if there was no dependency. *cur_bytes indicates the number of - * bytes from guest_offset that can be read before the next - * dependency must be processed (or the request is complete) - * - * -EAGAIN if we had to wait for another request, previously gathered - * information on cluster allocation may be invalid now. The caller - * must start over anyway, so consider *cur_bytes undefined. - */ -static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *cur_bytes, QCowL2Meta **m) -{ - BDRVQcowState *s = bs->opaque; - QCowL2Meta *old_alloc; - uint64_t bytes = *cur_bytes; - - QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { - - uint64_t start = guest_offset; - uint64_t end = start + bytes; - uint64_t old_start = l2meta_cow_start(old_alloc); - uint64_t old_end = l2meta_cow_end(old_alloc); - - if (end <= old_start || start >= old_end) { - /* No intersection */ - } else { - if (start < old_start) { - /* Stop at the start of a running allocation */ - bytes = old_start - start; - } else { - bytes = 0; - } - - /* Stop if already an l2meta exists. After yielding, it wouldn't - * be valid any more, so we'd have to clean up the old L2Metas - * and deal with requests depending on them before starting to - * gather new ones. Not worth the trouble. */ - if (bytes == 0 && *m) { - *cur_bytes = 0; - return 0; - } - - if (bytes == 0) { - /* Wait for the dependency to complete. We need to recheck - * the free/allocated clusters when we continue. */ - qemu_co_mutex_unlock(&s->lock); - qemu_co_queue_wait(&old_alloc->dependent_requests); - qemu_co_mutex_lock(&s->lock); - return -EAGAIN; - } - } - } - - /* Make sure that existing clusters and new allocations are only used up to - * the next dependency if we shortened the request above */ - *cur_bytes = bytes; - - return 0; -} - -/* - * Checks how many already allocated clusters that don't require a copy on - * write there are at the given guest_offset (up to *bytes). If - * *host_offset is not zero, only physically contiguous clusters beginning at - * this host offset are counted. - * - * Note that guest_offset may not be cluster aligned. In this case, the - * returned *host_offset points to exact byte referenced by guest_offset and - * therefore isn't cluster aligned as well. - * - * Returns: - * 0: if no allocated clusters are available at the given offset. - * *bytes is normally unchanged. It is set to 0 if the cluster - * is allocated and doesn't need COW, but doesn't have the right - * physical offset. - * - * 1: if allocated clusters that don't require a COW are available at - * the requested offset. *bytes may have decreased and describes - * the length of the area that can be written to. - * - * -errno: in error cases - */ -static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) -{ - BDRVQcowState *s = bs->opaque; - int l2_index; - uint64_t cluster_offset; - uint64_t *l2_table; - unsigned int nb_clusters; - unsigned int keep_clusters; - int ret, pret; - - trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset, - *bytes); - - assert(*host_offset == 0 || offset_into_cluster(s, guest_offset) - == offset_into_cluster(s, *host_offset)); - - /* - * Calculate the number of clusters to look for. We stop at L2 table - * boundaries to keep things simple. - */ - nb_clusters = - size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); - - l2_index = offset_to_l2_index(s, guest_offset); - nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); - - /* Find L2 entry for the first involved cluster */ - ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); - if (ret < 0) { - return ret; - } - - cluster_offset = be64_to_cpu(l2_table[l2_index]); - - /* Check how many clusters are already allocated and don't need COW */ - if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL - && (cluster_offset & QCOW_OFLAG_COPIED)) - { - /* If a specific host_offset is required, check it */ - bool offset_matches = - (cluster_offset & L2E_OFFSET_MASK) == *host_offset; - - if (*host_offset != 0 && !offset_matches) { - *bytes = 0; - ret = 0; - goto out; - } - - /* We keep all QCOW_OFLAG_COPIED clusters */ - keep_clusters = - count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], 0, - QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); - assert(keep_clusters <= nb_clusters); - - *bytes = MIN(*bytes, - keep_clusters * s->cluster_size - - offset_into_cluster(s, guest_offset)); - - ret = 1; - } else { - ret = 0; - } - - /* Cleanup */ -out: - pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (pret < 0) { - return pret; - } - - /* Only return a host offset if we actually made progress. Otherwise we - * would make requirements for handle_alloc() that it can't fulfill */ - if (ret) { - *host_offset = (cluster_offset & L2E_OFFSET_MASK) - + offset_into_cluster(s, guest_offset); - } - - return ret; -} - -/* - * Allocates new clusters for the given guest_offset. - * - * At most *nb_clusters are allocated, and on return *nb_clusters is updated to - * contain the number of clusters that have been allocated and are contiguous - * in the image file. - * - * If *host_offset is non-zero, it specifies the offset in the image file at - * which the new clusters must start. *nb_clusters can be 0 on return in this - * case if the cluster at host_offset is already in use. If *host_offset is - * zero, the clusters can be allocated anywhere in the image file. - * - * *host_offset is updated to contain the offset into the image file at which - * the first allocated cluster starts. - * - * Return 0 on success and -errno in error cases. -EAGAIN means that the - * function has been waiting for another request and the allocation must be - * restarted, but the whole request should not be failed. - */ -static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *host_offset, unsigned int *nb_clusters) -{ - BDRVQcowState *s = bs->opaque; - - trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, - *host_offset, *nb_clusters); - - /* Allocate new clusters */ - trace_qcow2_cluster_alloc_phys(qemu_coroutine_self()); - if (*host_offset == 0) { - int64_t cluster_offset = - qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size); - if (cluster_offset < 0) { - return cluster_offset; - } - *host_offset = cluster_offset; - return 0; - } else { - int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); - if (ret < 0) { - return ret; - } - *nb_clusters = ret; - return 0; - } -} - -/* - * Allocates new clusters for an area that either is yet unallocated or needs a - * copy on write. If *host_offset is non-zero, clusters are only allocated if - * the new allocation can match the specified host offset. - * - * Note that guest_offset may not be cluster aligned. In this case, the - * returned *host_offset points to exact byte referenced by guest_offset and - * therefore isn't cluster aligned as well. - * - * Returns: - * 0: if no clusters could be allocated. *bytes is set to 0, - * *host_offset is left unchanged. - * - * 1: if new clusters were allocated. *bytes may be decreased if the - * new allocation doesn't cover all of the requested area. - * *host_offset is updated to contain the host offset of the first - * newly allocated cluster. - * - * -errno: in error cases - */ -static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) -{ - BDRVQcowState *s = bs->opaque; - int l2_index; - uint64_t *l2_table; - uint64_t entry; - unsigned int nb_clusters; - int ret; - - uint64_t alloc_cluster_offset; - - trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset, - *bytes); - assert(*bytes > 0); - - /* - * Calculate the number of clusters to look for. We stop at L2 table - * boundaries to keep things simple. - */ - nb_clusters = - size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); - - l2_index = offset_to_l2_index(s, guest_offset); - nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); - - /* Find L2 entry for the first involved cluster */ - ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); - if (ret < 0) { - return ret; - } - - entry = be64_to_cpu(l2_table[l2_index]); - - /* For the moment, overwrite compressed clusters one by one */ - if (entry & QCOW_OFLAG_COMPRESSED) { - nb_clusters = 1; - } else { - nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index); - } - - /* This function is only called when there were no non-COW clusters, so if - * we can't find any unallocated or COW clusters either, something is - * wrong with our code. */ - assert(nb_clusters > 0); - - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - return ret; - } - - /* Allocate, if necessary at a given offset in the image file */ - alloc_cluster_offset = start_of_cluster(s, *host_offset); - ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, - &nb_clusters); - if (ret < 0) { - goto fail; - } - - /* Can't extend contiguous allocation */ - if (nb_clusters == 0) { - *bytes = 0; - return 0; - } - - /* - * Save info needed for meta data update. - * - * requested_sectors: Number of sectors from the start of the first - * newly allocated cluster to the end of the (possibly shortened - * before) write request. - * - * avail_sectors: Number of sectors from the start of the first - * newly allocated to the end of the last newly allocated cluster. - * - * nb_sectors: The number of sectors from the start of the first - * newly allocated cluster to the end of the area that the write - * request actually writes to (excluding COW at the end) - */ - int requested_sectors = - (*bytes + offset_into_cluster(s, guest_offset)) - >> BDRV_SECTOR_BITS; - int avail_sectors = nb_clusters - << (s->cluster_bits - BDRV_SECTOR_BITS); - int alloc_n_start = offset_into_cluster(s, guest_offset) - >> BDRV_SECTOR_BITS; - int nb_sectors = MIN(requested_sectors, avail_sectors); - QCowL2Meta *old_m = *m; - - *m = g_malloc0(sizeof(**m)); - - **m = (QCowL2Meta) { - .next = old_m, - - .alloc_offset = alloc_cluster_offset, - .offset = start_of_cluster(s, guest_offset), - .nb_clusters = nb_clusters, - .nb_available = nb_sectors, - - .cow_start = { - .offset = 0, - .nb_sectors = alloc_n_start, - }, - .cow_end = { - .offset = nb_sectors * BDRV_SECTOR_SIZE, - .nb_sectors = avail_sectors - nb_sectors, - }, - }; - qemu_co_queue_init(&(*m)->dependent_requests); - QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); - - *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); - *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE) - - offset_into_cluster(s, guest_offset)); - assert(*bytes != 0); - - return 1; - -fail: - if (*m && (*m)->nb_clusters > 0) { - QLIST_REMOVE(*m, next_in_flight); - } - return ret; -} - -/* - * alloc_cluster_offset - * - * For a given offset on the virtual disk, find the cluster offset in qcow2 - * file. If the offset is not found, allocate a new cluster. - * - * If the cluster was already allocated, m->nb_clusters is set to 0 and - * other fields in m are meaningless. - * - * If the cluster is newly allocated, m->nb_clusters is set to the number of - * contiguous clusters that have been allocated. In this case, the other - * fields of m are valid and contain information about the first allocated - * cluster. - * - * If the request conflicts with another write request in flight, the coroutine - * is queued and will be reentered when the dependency has completed. - * - * Return 0 on success and -errno in error cases - */ -int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m) -{ - BDRVQcowState *s = bs->opaque; - uint64_t start, remaining; - uint64_t cluster_offset; - uint64_t cur_bytes; - int ret; - - trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, - n_start, n_end); - - assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset)); - offset = start_of_cluster(s, offset); - -again: - start = offset + (n_start << BDRV_SECTOR_BITS); - remaining = (n_end - n_start) << BDRV_SECTOR_BITS; - cluster_offset = 0; - *host_offset = 0; - cur_bytes = 0; - *m = NULL; - - while (true) { - - if (!*host_offset) { - *host_offset = start_of_cluster(s, cluster_offset); - } - - assert(remaining >= cur_bytes); - - start += cur_bytes; - remaining -= cur_bytes; - cluster_offset += cur_bytes; - - if (remaining == 0) { - break; - } - - cur_bytes = remaining; - - /* - * Now start gathering as many contiguous clusters as possible: - * - * 1. Check for overlaps with in-flight allocations - * - * a) Overlap not in the first cluster -> shorten this request and - * let the caller handle the rest in its next loop iteration. - * - * b) Real overlaps of two requests. Yield and restart the search - * for contiguous clusters (the situation could have changed - * while we were sleeping) - * - * c) TODO: Request starts in the same cluster as the in-flight - * allocation ends. Shorten the COW of the in-fight allocation, - * set cluster_offset to write to the same cluster and set up - * the right synchronisation between the in-flight request and - * the new one. - */ - ret = handle_dependencies(bs, start, &cur_bytes, m); - if (ret == -EAGAIN) { - /* Currently handle_dependencies() doesn't yield if we already had - * an allocation. If it did, we would have to clean up the L2Meta - * structs before starting over. */ - assert(*m == NULL); - goto again; - } else if (ret < 0) { - return ret; - } else if (cur_bytes == 0) { - break; - } else { - /* handle_dependencies() may have decreased cur_bytes (shortened - * the allocations below) so that the next dependency is processed - * correctly during the next loop iteration. */ - } - - /* - * 2. Count contiguous COPIED clusters. - */ - ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m); - if (ret < 0) { - return ret; - } else if (ret) { - continue; - } else if (cur_bytes == 0) { - break; - } - - /* - * 3. If the request still hasn't completed, allocate new clusters, - * considering any cluster_offset of steps 1c or 2. - */ - ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m); - if (ret < 0) { - return ret; - } else if (ret) { - continue; - } else { - assert(cur_bytes == 0); - break; - } - } - - *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS); - assert(*num > 0); - assert(*host_offset != 0); - - return 0; -} - -static int decompress_buffer(uint8_t *out_buf, int out_buf_size, - const uint8_t *buf, int buf_size) -{ - z_stream strm1, *strm = &strm1; - int ret, out_len; - - memset(strm, 0, sizeof(*strm)); - - strm->next_in = (uint8_t *)buf; - strm->avail_in = buf_size; - strm->next_out = out_buf; - strm->avail_out = out_buf_size; - - ret = inflateInit2(strm, -12); - if (ret != Z_OK) - return -1; - ret = inflate(strm, Z_FINISH); - out_len = strm->next_out - out_buf; - if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || - out_len != out_buf_size) { - inflateEnd(strm); - return -1; - } - inflateEnd(strm); - return 0; -} - -int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) -{ - BDRVQcowState *s = bs->opaque; - int ret, csize, nb_csectors, sector_offset; - uint64_t coffset; - - coffset = cluster_offset & s->cluster_offset_mask; - if (s->cluster_cache_offset != coffset) { - nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; - sector_offset = coffset & 511; - csize = nb_csectors * 512 - sector_offset; - BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); - ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors); - if (ret < 0) { - return ret; - } - if (decompress_buffer(s->cluster_cache, s->cluster_size, - s->cluster_data + sector_offset, csize) < 0) { - return -EIO; - } - s->cluster_cache_offset = coffset; - } - return 0; -} - -/* - * This discards as many clusters of nb_clusters as possible at once (i.e. - * all clusters in the same L2 table) and returns the number of discarded - * clusters. - */ -static int discard_single_l2(BlockDriverState *bs, uint64_t offset, - unsigned int nb_clusters) -{ - BDRVQcowState *s = bs->opaque; - uint64_t *l2_table; - int l2_index; - int ret; - int i; - - ret = get_cluster_table(bs, offset, &l2_table, &l2_index); - if (ret < 0) { - return ret; - } - - /* Limit nb_clusters to one L2 table */ - nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); - - for (i = 0; i < nb_clusters; i++) { - uint64_t old_offset; - - old_offset = be64_to_cpu(l2_table[l2_index + i]); - if ((old_offset & L2E_OFFSET_MASK) == 0) { - continue; - } - - /* First remove L2 entries */ - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); - l2_table[l2_index + i] = cpu_to_be64(0); - - /* Then decrease the refcount */ - qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); - } - - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - return ret; - } - - return nb_clusters; -} - -int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors) -{ - BDRVQcowState *s = bs->opaque; - uint64_t end_offset; - unsigned int nb_clusters; - int ret; - - end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS); - - /* Round start up and end down */ - offset = align_offset(offset, s->cluster_size); - end_offset &= ~(s->cluster_size - 1); - - if (offset > end_offset) { - return 0; - } - - nb_clusters = size_to_clusters(s, end_offset - offset); - - s->cache_discards = true; - - /* Each L2 table is handled by its own loop iteration */ - while (nb_clusters > 0) { - ret = discard_single_l2(bs, offset, nb_clusters); - if (ret < 0) { - goto fail; - } - - nb_clusters -= ret; - offset += (ret * s->cluster_size); - } - - ret = 0; -fail: - s->cache_discards = false; - qcow2_process_discards(bs, ret); - - return ret; -} - -/* - * This zeroes as many clusters of nb_clusters as possible at once (i.e. - * all clusters in the same L2 table) and returns the number of zeroed - * clusters. - */ -static int zero_single_l2(BlockDriverState *bs, uint64_t offset, - unsigned int nb_clusters) -{ - BDRVQcowState *s = bs->opaque; - uint64_t *l2_table; - int l2_index; - int ret; - int i; - - ret = get_cluster_table(bs, offset, &l2_table, &l2_index); - if (ret < 0) { - return ret; - } - - /* Limit nb_clusters to one L2 table */ - nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); - - for (i = 0; i < nb_clusters; i++) { - uint64_t old_offset; - - old_offset = be64_to_cpu(l2_table[l2_index + i]); - - /* Update L2 entries */ - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); - if (old_offset & QCOW_OFLAG_COMPRESSED) { - l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); - qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); - } else { - l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO); - } - } - - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - return ret; - } - - return nb_clusters; -} - -int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors) -{ - BDRVQcowState *s = bs->opaque; - unsigned int nb_clusters; - int ret; - - /* The zero flag is only supported by version 3 and newer */ - if (s->qcow_version < 3) { - return -ENOTSUP; - } - - /* Each L2 table is handled by its own loop iteration */ - nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS); - - s->cache_discards = true; - - while (nb_clusters > 0) { - ret = zero_single_l2(bs, offset, nb_clusters); - if (ret < 0) { - goto fail; - } - - nb_clusters -= ret; - offset += (ret * s->cluster_size); - } - - ret = 0; -fail: - s->cache_discards = false; - qcow2_process_discards(bs, ret); - - return ret; -} diff --git a/contrib/qemu/block/qcow2-refcount.c b/contrib/qemu/block/qcow2-refcount.c deleted file mode 100644 index 1244693f39e..00000000000 --- a/contrib/qemu/block/qcow2-refcount.c +++ /dev/null @@ -1,1374 +0,0 @@ -/* - * Block driver for the QCOW version 2 format - * - * Copyright (c) 2004-2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "qemu-common.h" -#include "block/block_int.h" -#include "block/qcow2.h" - -static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size); -static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, - int64_t offset, int64_t length, - int addend, enum qcow2_discard_type type); - - -/*********************************************************/ -/* refcount handling */ - -int qcow2_refcount_init(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - int ret, refcount_table_size2, i; - - refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); - s->refcount_table = g_malloc(refcount_table_size2); - if (s->refcount_table_size > 0) { - BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); - ret = bdrv_pread(bs->file, s->refcount_table_offset, - s->refcount_table, refcount_table_size2); - if (ret != refcount_table_size2) - goto fail; - for(i = 0; i < s->refcount_table_size; i++) - be64_to_cpus(&s->refcount_table[i]); - } - return 0; - fail: - return -ENOMEM; -} - -void qcow2_refcount_close(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - g_free(s->refcount_table); -} - - -static int load_refcount_block(BlockDriverState *bs, - int64_t refcount_block_offset, - void **refcount_block) -{ - BDRVQcowState *s = bs->opaque; - int ret; - - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD); - ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, - refcount_block); - - return ret; -} - -/* - * Returns the refcount of the cluster given by its index. Any non-negative - * return value is the refcount of the cluster, negative values are -errno - * and indicate an error. - */ -static int get_refcount(BlockDriverState *bs, int64_t cluster_index) -{ - BDRVQcowState *s = bs->opaque; - int refcount_table_index, block_index; - int64_t refcount_block_offset; - int ret; - uint16_t *refcount_block; - uint16_t refcount; - - refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); - if (refcount_table_index >= s->refcount_table_size) - return 0; - refcount_block_offset = s->refcount_table[refcount_table_index]; - if (!refcount_block_offset) - return 0; - - ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, - (void**) &refcount_block); - if (ret < 0) { - return ret; - } - - block_index = cluster_index & - ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); - refcount = be16_to_cpu(refcount_block[block_index]); - - ret = qcow2_cache_put(bs, s->refcount_block_cache, - (void**) &refcount_block); - if (ret < 0) { - return ret; - } - - return refcount; -} - -/* - * Rounds the refcount table size up to avoid growing the table for each single - * refcount block that is allocated. - */ -static unsigned int next_refcount_table_size(BDRVQcowState *s, - unsigned int min_size) -{ - unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1; - unsigned int refcount_table_clusters = - MAX(1, s->refcount_table_size >> (s->cluster_bits - 3)); - - while (min_clusters > refcount_table_clusters) { - refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2; - } - - return refcount_table_clusters << (s->cluster_bits - 3); -} - - -/* Checks if two offsets are described by the same refcount block */ -static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a, - uint64_t offset_b) -{ - uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT); - uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT); - - return (block_a == block_b); -} - -/* - * Loads a refcount block. If it doesn't exist yet, it is allocated first - * (including growing the refcount table if needed). - * - * Returns 0 on success or -errno in error case - */ -static int alloc_refcount_block(BlockDriverState *bs, - int64_t cluster_index, uint16_t **refcount_block) -{ - BDRVQcowState *s = bs->opaque; - unsigned int refcount_table_index; - int ret; - - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); - - /* Find the refcount block for the given cluster */ - refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); - - if (refcount_table_index < s->refcount_table_size) { - - uint64_t refcount_block_offset = - s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; - - /* If it's already there, we're done */ - if (refcount_block_offset) { - return load_refcount_block(bs, refcount_block_offset, - (void**) refcount_block); - } - } - - /* - * If we came here, we need to allocate something. Something is at least - * a cluster for the new refcount block. It may also include a new refcount - * table if the old refcount table is too small. - * - * Note that allocating clusters here needs some special care: - * - * - We can't use the normal qcow2_alloc_clusters(), it would try to - * increase the refcount and very likely we would end up with an endless - * recursion. Instead we must place the refcount blocks in a way that - * they can describe them themselves. - * - * - We need to consider that at this point we are inside update_refcounts - * and doing the initial refcount increase. This means that some clusters - * have already been allocated by the caller, but their refcount isn't - * accurate yet. free_cluster_index tells us where this allocation ends - * as long as we don't overwrite it by freeing clusters. - * - * - alloc_clusters_noref and qcow2_free_clusters may load a different - * refcount block into the cache - */ - - *refcount_block = NULL; - - /* We write to the refcount table, so we might depend on L2 tables */ - ret = qcow2_cache_flush(bs, s->l2_table_cache); - if (ret < 0) { - return ret; - } - - /* Allocate the refcount block itself and mark it as used */ - int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); - if (new_block < 0) { - return new_block; - } - -#ifdef DEBUG_ALLOC2 - fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64 - " at %" PRIx64 "\n", - refcount_table_index, cluster_index << s->cluster_bits, new_block); -#endif - - if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) { - /* Zero the new refcount block before updating it */ - ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, - (void**) refcount_block); - if (ret < 0) { - goto fail_block; - } - - memset(*refcount_block, 0, s->cluster_size); - - /* The block describes itself, need to update the cache */ - int block_index = (new_block >> s->cluster_bits) & - ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); - (*refcount_block)[block_index] = cpu_to_be16(1); - } else { - /* Described somewhere else. This can recurse at most twice before we - * arrive at a block that describes itself. */ - ret = update_refcount(bs, new_block, s->cluster_size, 1, - QCOW2_DISCARD_NEVER); - if (ret < 0) { - goto fail_block; - } - - ret = qcow2_cache_flush(bs, s->refcount_block_cache); - if (ret < 0) { - goto fail_block; - } - - /* Initialize the new refcount block only after updating its refcount, - * update_refcount uses the refcount cache itself */ - ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, - (void**) refcount_block); - if (ret < 0) { - goto fail_block; - } - - memset(*refcount_block, 0, s->cluster_size); - } - - /* Now the new refcount block needs to be written to disk */ - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE); - qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block); - ret = qcow2_cache_flush(bs, s->refcount_block_cache); - if (ret < 0) { - goto fail_block; - } - - /* If the refcount table is big enough, just hook the block up there */ - if (refcount_table_index < s->refcount_table_size) { - uint64_t data64 = cpu_to_be64(new_block); - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); - ret = bdrv_pwrite_sync(bs->file, - s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), - &data64, sizeof(data64)); - if (ret < 0) { - goto fail_block; - } - - s->refcount_table[refcount_table_index] = new_block; - return 0; - } - - ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); - if (ret < 0) { - goto fail_block; - } - - /* - * If we come here, we need to grow the refcount table. Again, a new - * refcount table needs some space and we can't simply allocate to avoid - * endless recursion. - * - * Therefore let's grab new refcount blocks at the end of the image, which - * will describe themselves and the new refcount table. This way we can - * reference them only in the new table and do the switch to the new - * refcount table at once without producing an inconsistent state in - * between. - */ - BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW); - - /* Calculate the number of refcount blocks needed so far */ - uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT); - uint64_t blocks_used = (s->free_cluster_index + - refcount_block_clusters - 1) / refcount_block_clusters; - - /* And now we need at least one block more for the new metadata */ - uint64_t table_size = next_refcount_table_size(s, blocks_used + 1); - uint64_t last_table_size; - uint64_t blocks_clusters; - do { - uint64_t table_clusters = - size_to_clusters(s, table_size * sizeof(uint64_t)); - blocks_clusters = 1 + - ((table_clusters + refcount_block_clusters - 1) - / refcount_block_clusters); - uint64_t meta_clusters = table_clusters + blocks_clusters; - - last_table_size = table_size; - table_size = next_refcount_table_size(s, blocks_used + - ((meta_clusters + refcount_block_clusters - 1) - / refcount_block_clusters)); - - } while (last_table_size != table_size); - -#ifdef DEBUG_ALLOC2 - fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n", - s->refcount_table_size, table_size); -#endif - - /* Create the new refcount table and blocks */ - uint64_t meta_offset = (blocks_used * refcount_block_clusters) * - s->cluster_size; - uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size; - uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size); - uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t)); - - assert(meta_offset >= (s->free_cluster_index * s->cluster_size)); - - /* Fill the new refcount table */ - memcpy(new_table, s->refcount_table, - s->refcount_table_size * sizeof(uint64_t)); - new_table[refcount_table_index] = new_block; - - int i; - for (i = 0; i < blocks_clusters; i++) { - new_table[blocks_used + i] = meta_offset + (i * s->cluster_size); - } - - /* Fill the refcount blocks */ - uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t)); - int block = 0; - for (i = 0; i < table_clusters + blocks_clusters; i++) { - new_blocks[block++] = cpu_to_be16(1); - } - - /* Write refcount blocks to disk */ - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); - ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks, - blocks_clusters * s->cluster_size); - g_free(new_blocks); - if (ret < 0) { - goto fail_table; - } - - /* Write refcount table to disk */ - for(i = 0; i < table_size; i++) { - cpu_to_be64s(&new_table[i]); - } - - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); - ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, - table_size * sizeof(uint64_t)); - if (ret < 0) { - goto fail_table; - } - - for(i = 0; i < table_size; i++) { - be64_to_cpus(&new_table[i]); - } - - /* Hook up the new refcount table in the qcow2 header */ - uint8_t data[12]; - cpu_to_be64w((uint64_t*)data, table_offset); - cpu_to_be32w((uint32_t*)(data + 8), table_clusters); - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE); - ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset), - data, sizeof(data)); - if (ret < 0) { - goto fail_table; - } - - /* And switch it in memory */ - uint64_t old_table_offset = s->refcount_table_offset; - uint64_t old_table_size = s->refcount_table_size; - - g_free(s->refcount_table); - s->refcount_table = new_table; - s->refcount_table_size = table_size; - s->refcount_table_offset = table_offset; - - /* Free old table. Remember, we must not change free_cluster_index */ - uint64_t old_free_cluster_index = s->free_cluster_index; - qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), - QCOW2_DISCARD_OTHER); - s->free_cluster_index = old_free_cluster_index; - - ret = load_refcount_block(bs, new_block, (void**) refcount_block); - if (ret < 0) { - return ret; - } - - return 0; - -fail_table: - g_free(new_table); -fail_block: - if (*refcount_block != NULL) { - qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); - } - return ret; -} - -void qcow2_process_discards(BlockDriverState *bs, int ret) -{ - BDRVQcowState *s = bs->opaque; - Qcow2DiscardRegion *d, *next; - - QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) { - QTAILQ_REMOVE(&s->discards, d, next); - - /* Discard is optional, ignore the return value */ - if (ret >= 0) { - bdrv_discard(bs->file, - d->offset >> BDRV_SECTOR_BITS, - d->bytes >> BDRV_SECTOR_BITS); - } - - g_free(d); - } -} - -static void update_refcount_discard(BlockDriverState *bs, - uint64_t offset, uint64_t length) -{ - BDRVQcowState *s = bs->opaque; - Qcow2DiscardRegion *d, *p, *next; - - QTAILQ_FOREACH(d, &s->discards, next) { - uint64_t new_start = MIN(offset, d->offset); - uint64_t new_end = MAX(offset + length, d->offset + d->bytes); - - if (new_end - new_start <= length + d->bytes) { - /* There can't be any overlap, areas ending up here have no - * references any more and therefore shouldn't get freed another - * time. */ - assert(d->bytes + length == new_end - new_start); - d->offset = new_start; - d->bytes = new_end - new_start; - goto found; - } - } - - d = g_malloc(sizeof(*d)); - *d = (Qcow2DiscardRegion) { - .bs = bs, - .offset = offset, - .bytes = length, - }; - QTAILQ_INSERT_TAIL(&s->discards, d, next); - -found: - /* Merge discard requests if they are adjacent now */ - QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) { - if (p == d - || p->offset > d->offset + d->bytes - || d->offset > p->offset + p->bytes) - { - continue; - } - - /* Still no overlap possible */ - assert(p->offset == d->offset + d->bytes - || d->offset == p->offset + p->bytes); - - QTAILQ_REMOVE(&s->discards, p, next); - d->offset = MIN(d->offset, p->offset); - d->bytes += p->bytes; - } -} - -/* XXX: cache several refcount block clusters ? */ -static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, - int64_t offset, int64_t length, int addend, enum qcow2_discard_type type) -{ - BDRVQcowState *s = bs->opaque; - int64_t start, last, cluster_offset; - uint16_t *refcount_block = NULL; - int64_t old_table_index = -1; - int ret; - -#ifdef DEBUG_ALLOC2 - fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n", - offset, length, addend); -#endif - if (length < 0) { - return -EINVAL; - } else if (length == 0) { - return 0; - } - - if (addend < 0) { - qcow2_cache_set_dependency(bs, s->refcount_block_cache, - s->l2_table_cache); - } - - start = offset & ~(s->cluster_size - 1); - last = (offset + length - 1) & ~(s->cluster_size - 1); - for(cluster_offset = start; cluster_offset <= last; - cluster_offset += s->cluster_size) - { - int block_index, refcount; - int64_t cluster_index = cluster_offset >> s->cluster_bits; - int64_t table_index = - cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); - - /* Load the refcount block and allocate it if needed */ - if (table_index != old_table_index) { - if (refcount_block) { - ret = qcow2_cache_put(bs, s->refcount_block_cache, - (void**) &refcount_block); - if (ret < 0) { - goto fail; - } - } - - ret = alloc_refcount_block(bs, cluster_index, &refcount_block); - if (ret < 0) { - goto fail; - } - } - old_table_index = table_index; - - qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block); - - /* we can update the count and save it */ - block_index = cluster_index & - ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); - - refcount = be16_to_cpu(refcount_block[block_index]); - refcount += addend; - if (refcount < 0 || refcount > 0xffff) { - ret = -EINVAL; - goto fail; - } - if (refcount == 0 && cluster_index < s->free_cluster_index) { - s->free_cluster_index = cluster_index; - } - refcount_block[block_index] = cpu_to_be16(refcount); - - if (refcount == 0 && s->discard_passthrough[type]) { - update_refcount_discard(bs, cluster_offset, s->cluster_size); - } - } - - ret = 0; -fail: - if (!s->cache_discards) { - qcow2_process_discards(bs, ret); - } - - /* Write last changed block to disk */ - if (refcount_block) { - int wret; - wret = qcow2_cache_put(bs, s->refcount_block_cache, - (void**) &refcount_block); - if (wret < 0) { - return ret < 0 ? ret : wret; - } - } - - /* - * Try do undo any updates if an error is returned (This may succeed in - * some cases like ENOSPC for allocating a new refcount block) - */ - if (ret < 0) { - int dummy; - dummy = update_refcount(bs, offset, cluster_offset - offset, -addend, - QCOW2_DISCARD_NEVER); - (void)dummy; - } - - return ret; -} - -/* - * Increases or decreases the refcount of a given cluster by one. - * addend must be 1 or -1. - * - * If the return value is non-negative, it is the new refcount of the cluster. - * If it is negative, it is -errno and indicates an error. - */ -static int update_cluster_refcount(BlockDriverState *bs, - int64_t cluster_index, - int addend, - enum qcow2_discard_type type) -{ - BDRVQcowState *s = bs->opaque; - int ret; - - ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend, - type); - if (ret < 0) { - return ret; - } - - return get_refcount(bs, cluster_index); -} - - - -/*********************************************************/ -/* cluster allocation functions */ - - - -/* return < 0 if error */ -static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size) -{ - BDRVQcowState *s = bs->opaque; - int i, nb_clusters, refcount; - - nb_clusters = size_to_clusters(s, size); -retry: - for(i = 0; i < nb_clusters; i++) { - int64_t next_cluster_index = s->free_cluster_index++; - refcount = get_refcount(bs, next_cluster_index); - - if (refcount < 0) { - return refcount; - } else if (refcount != 0) { - goto retry; - } - } -#ifdef DEBUG_ALLOC2 - fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", - size, - (s->free_cluster_index - nb_clusters) << s->cluster_bits); -#endif - return (s->free_cluster_index - nb_clusters) << s->cluster_bits; -} - -int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size) -{ - int64_t offset; - int ret; - - BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); - offset = alloc_clusters_noref(bs, size); - if (offset < 0) { - return offset; - } - - ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); - if (ret < 0) { - return ret; - } - - return offset; -} - -int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, - int nb_clusters) -{ - BDRVQcowState *s = bs->opaque; - uint64_t cluster_index; - uint64_t old_free_cluster_index; - int i, refcount, ret; - - /* Check how many clusters there are free */ - cluster_index = offset >> s->cluster_bits; - for(i = 0; i < nb_clusters; i++) { - refcount = get_refcount(bs, cluster_index++); - - if (refcount < 0) { - return refcount; - } else if (refcount != 0) { - break; - } - } - - /* And then allocate them */ - old_free_cluster_index = s->free_cluster_index; - s->free_cluster_index = cluster_index + i; - - ret = update_refcount(bs, offset, i << s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); - if (ret < 0) { - return ret; - } - - s->free_cluster_index = old_free_cluster_index; - - return i; -} - -/* only used to allocate compressed sectors. We try to allocate - contiguous sectors. size must be <= cluster_size */ -int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) -{ - BDRVQcowState *s = bs->opaque; - int64_t offset, cluster_offset; - int free_in_cluster; - - BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); - assert(size > 0 && size <= s->cluster_size); - if (s->free_byte_offset == 0) { - offset = qcow2_alloc_clusters(bs, s->cluster_size); - if (offset < 0) { - return offset; - } - s->free_byte_offset = offset; - } - redo: - free_in_cluster = s->cluster_size - - (s->free_byte_offset & (s->cluster_size - 1)); - if (size <= free_in_cluster) { - /* enough space in current cluster */ - offset = s->free_byte_offset; - s->free_byte_offset += size; - free_in_cluster -= size; - if (free_in_cluster == 0) - s->free_byte_offset = 0; - if ((offset & (s->cluster_size - 1)) != 0) - update_cluster_refcount(bs, offset >> s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); - } else { - offset = qcow2_alloc_clusters(bs, s->cluster_size); - if (offset < 0) { - return offset; - } - cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1); - if ((cluster_offset + s->cluster_size) == offset) { - /* we are lucky: contiguous data */ - offset = s->free_byte_offset; - update_cluster_refcount(bs, offset >> s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); - s->free_byte_offset += size; - } else { - s->free_byte_offset = offset; - goto redo; - } - } - - /* The cluster refcount was incremented, either by qcow2_alloc_clusters() - * or explicitly by update_cluster_refcount(). Refcount blocks must be - * flushed before the caller's L2 table updates. - */ - qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); - return offset; -} - -void qcow2_free_clusters(BlockDriverState *bs, - int64_t offset, int64_t size, - enum qcow2_discard_type type) -{ - int ret; - - BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE); - ret = update_refcount(bs, offset, size, -1, type); - if (ret < 0) { - fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret)); - /* TODO Remember the clusters to free them later and avoid leaking */ - } -} - -/* - * Free a cluster using its L2 entry (handles clusters of all types, e.g. - * normal cluster, compressed cluster, etc.) - */ -void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, - int nb_clusters, enum qcow2_discard_type type) -{ - BDRVQcowState *s = bs->opaque; - - switch (qcow2_get_cluster_type(l2_entry)) { - case QCOW2_CLUSTER_COMPRESSED: - { - int nb_csectors; - nb_csectors = ((l2_entry >> s->csize_shift) & - s->csize_mask) + 1; - qcow2_free_clusters(bs, - (l2_entry & s->cluster_offset_mask) & ~511, - nb_csectors * 512, type); - } - break; - case QCOW2_CLUSTER_NORMAL: - qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, - nb_clusters << s->cluster_bits, type); - break; - case QCOW2_CLUSTER_UNALLOCATED: - case QCOW2_CLUSTER_ZERO: - break; - default: - abort(); - } -} - - - -/*********************************************************/ -/* snapshots and image creation */ - - - -/* update the refcounts of snapshots and the copied flag */ -int qcow2_update_snapshot_refcount(BlockDriverState *bs, - int64_t l1_table_offset, int l1_size, int addend) -{ - BDRVQcowState *s = bs->opaque; - uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated; - int64_t old_offset, old_l2_offset; - int i, j, l1_modified = 0, nb_csectors, refcount; - int ret; - - l2_table = NULL; - l1_table = NULL; - l1_size2 = l1_size * sizeof(uint64_t); - - s->cache_discards = true; - - /* WARNING: qcow2_snapshot_goto relies on this function not using the - * l1_table_offset when it is the current s->l1_table_offset! Be careful - * when changing this! */ - if (l1_table_offset != s->l1_table_offset) { - l1_table = g_malloc0(align_offset(l1_size2, 512)); - l1_allocated = 1; - - ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); - if (ret < 0) { - goto fail; - } - - for(i = 0;i < l1_size; i++) - be64_to_cpus(&l1_table[i]); - } else { - assert(l1_size == s->l1_size); - l1_table = s->l1_table; - l1_allocated = 0; - } - - for(i = 0; i < l1_size; i++) { - l2_offset = l1_table[i]; - if (l2_offset) { - old_l2_offset = l2_offset; - l2_offset &= L1E_OFFSET_MASK; - - ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, - (void**) &l2_table); - if (ret < 0) { - goto fail; - } - - for(j = 0; j < s->l2_size; j++) { - offset = be64_to_cpu(l2_table[j]); - if (offset != 0) { - old_offset = offset; - offset &= ~QCOW_OFLAG_COPIED; - if (offset & QCOW_OFLAG_COMPRESSED) { - nb_csectors = ((offset >> s->csize_shift) & - s->csize_mask) + 1; - if (addend != 0) { - int ret; - ret = update_refcount(bs, - (offset & s->cluster_offset_mask) & ~511, - nb_csectors * 512, addend, - QCOW2_DISCARD_SNAPSHOT); - if (ret < 0) { - goto fail; - } - } - /* compressed clusters are never modified */ - refcount = 2; - } else { - uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; - if (addend != 0) { - refcount = update_cluster_refcount(bs, cluster_index, addend, - QCOW2_DISCARD_SNAPSHOT); - } else { - refcount = get_refcount(bs, cluster_index); - } - - if (refcount < 0) { - ret = refcount; - goto fail; - } - } - - if (refcount == 1) { - offset |= QCOW_OFLAG_COPIED; - } - if (offset != old_offset) { - if (addend > 0) { - qcow2_cache_set_dependency(bs, s->l2_table_cache, - s->refcount_block_cache); - } - l2_table[j] = cpu_to_be64(offset); - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); - } - } - } - - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - goto fail; - } - - - if (addend != 0) { - refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend, - QCOW2_DISCARD_SNAPSHOT); - } else { - refcount = get_refcount(bs, l2_offset >> s->cluster_bits); - } - if (refcount < 0) { - ret = refcount; - goto fail; - } else if (refcount == 1) { - l2_offset |= QCOW_OFLAG_COPIED; - } - if (l2_offset != old_l2_offset) { - l1_table[i] = l2_offset; - l1_modified = 1; - } - } - } - - ret = bdrv_flush(bs); -fail: - if (l2_table) { - qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - } - - s->cache_discards = false; - qcow2_process_discards(bs, ret); - - /* Update L1 only if it isn't deleted anyway (addend = -1) */ - if (ret == 0 && addend >= 0 && l1_modified) { - for (i = 0; i < l1_size; i++) { - cpu_to_be64s(&l1_table[i]); - } - - ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2); - - for (i = 0; i < l1_size; i++) { - be64_to_cpus(&l1_table[i]); - } - } - if (l1_allocated) - g_free(l1_table); - return ret; -} - - - - -/*********************************************************/ -/* refcount checking functions */ - - - -/* - * Increases the refcount for a range of clusters in a given refcount table. - * This is used to construct a temporary refcount table out of L1 and L2 tables - * which can be compared the the refcount table saved in the image. - * - * Modifies the number of errors in res. - */ -static void inc_refcounts(BlockDriverState *bs, - BdrvCheckResult *res, - uint16_t *refcount_table, - int refcount_table_size, - int64_t offset, int64_t size) -{ - BDRVQcowState *s = bs->opaque; - int64_t start, last, cluster_offset; - int k; - - if (size <= 0) - return; - - start = offset & ~(s->cluster_size - 1); - last = (offset + size - 1) & ~(s->cluster_size - 1); - for(cluster_offset = start; cluster_offset <= last; - cluster_offset += s->cluster_size) { - k = cluster_offset >> s->cluster_bits; - if (k < 0) { - fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n", - cluster_offset); - res->corruptions++; - } else if (k >= refcount_table_size) { - fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after " - "the end of the image file, can't properly check refcounts.\n", - cluster_offset); - res->check_errors++; - } else { - if (++refcount_table[k] == 0) { - fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 - "\n", cluster_offset); - res->corruptions++; - } - } - } -} - -/* Flags for check_refcounts_l1() and check_refcounts_l2() */ -enum { - CHECK_OFLAG_COPIED = 0x1, /* check QCOW_OFLAG_COPIED matches refcount */ - CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ -}; - -/* - * Increases the refcount in the given refcount table for the all clusters - * referenced in the L2 table. While doing so, performs some checks on L2 - * entries. - * - * Returns the number of errors found by the checks or -errno if an internal - * error occurred. - */ -static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, - uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset, - int flags) -{ - BDRVQcowState *s = bs->opaque; - uint64_t *l2_table, l2_entry; - uint64_t next_contiguous_offset = 0; - int i, l2_size, nb_csectors, refcount; - - /* Read L2 table from disk */ - l2_size = s->l2_size * sizeof(uint64_t); - l2_table = g_malloc(l2_size); - - if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size) - goto fail; - - /* Do the actual checks */ - for(i = 0; i < s->l2_size; i++) { - l2_entry = be64_to_cpu(l2_table[i]); - - switch (qcow2_get_cluster_type(l2_entry)) { - case QCOW2_CLUSTER_COMPRESSED: - /* Compressed clusters don't have QCOW_OFLAG_COPIED */ - if (l2_entry & QCOW_OFLAG_COPIED) { - fprintf(stderr, "ERROR: cluster %" PRId64 ": " - "copied flag must never be set for compressed " - "clusters\n", l2_entry >> s->cluster_bits); - l2_entry &= ~QCOW_OFLAG_COPIED; - res->corruptions++; - } - - /* Mark cluster as used */ - nb_csectors = ((l2_entry >> s->csize_shift) & - s->csize_mask) + 1; - l2_entry &= s->cluster_offset_mask; - inc_refcounts(bs, res, refcount_table, refcount_table_size, - l2_entry & ~511, nb_csectors * 512); - - if (flags & CHECK_FRAG_INFO) { - res->bfi.allocated_clusters++; - res->bfi.compressed_clusters++; - - /* Compressed clusters are fragmented by nature. Since they - * take up sub-sector space but we only have sector granularity - * I/O we need to re-read the same sectors even for adjacent - * compressed clusters. - */ - res->bfi.fragmented_clusters++; - } - break; - - case QCOW2_CLUSTER_ZERO: - if ((l2_entry & L2E_OFFSET_MASK) == 0) { - break; - } - /* fall through */ - - case QCOW2_CLUSTER_NORMAL: - { - /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ - uint64_t offset = l2_entry & L2E_OFFSET_MASK; - - if (flags & CHECK_OFLAG_COPIED) { - refcount = get_refcount(bs, offset >> s->cluster_bits); - if (refcount < 0) { - fprintf(stderr, "Can't get refcount for offset %" - PRIx64 ": %s\n", l2_entry, strerror(-refcount)); - goto fail; - } - if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { - fprintf(stderr, "ERROR OFLAG_COPIED: offset=%" - PRIx64 " refcount=%d\n", l2_entry, refcount); - res->corruptions++; - } - } - - if (flags & CHECK_FRAG_INFO) { - res->bfi.allocated_clusters++; - if (next_contiguous_offset && - offset != next_contiguous_offset) { - res->bfi.fragmented_clusters++; - } - next_contiguous_offset = offset + s->cluster_size; - } - - /* Mark cluster as used */ - inc_refcounts(bs, res, refcount_table,refcount_table_size, - offset, s->cluster_size); - - /* Correct offsets are cluster aligned */ - if (offset & (s->cluster_size - 1)) { - fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " - "properly aligned; L2 entry corrupted.\n", offset); - res->corruptions++; - } - break; - } - - case QCOW2_CLUSTER_UNALLOCATED: - break; - - default: - abort(); - } - } - - g_free(l2_table); - return 0; - -fail: - fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); - g_free(l2_table); - return -EIO; -} - -/* - * Increases the refcount for the L1 table, its L2 tables and all referenced - * clusters in the given refcount table. While doing so, performs some checks - * on L1 and L2 entries. - * - * Returns the number of errors found by the checks or -errno if an internal - * error occurred. - */ -static int check_refcounts_l1(BlockDriverState *bs, - BdrvCheckResult *res, - uint16_t *refcount_table, - int refcount_table_size, - int64_t l1_table_offset, int l1_size, - int flags) -{ - BDRVQcowState *s = bs->opaque; - uint64_t *l1_table, l2_offset, l1_size2; - int i, refcount, ret; - - l1_size2 = l1_size * sizeof(uint64_t); - - /* Mark L1 table as used */ - inc_refcounts(bs, res, refcount_table, refcount_table_size, - l1_table_offset, l1_size2); - - /* Read L1 table entries from disk */ - if (l1_size2 == 0) { - l1_table = NULL; - } else { - l1_table = g_malloc(l1_size2); - if (bdrv_pread(bs->file, l1_table_offset, - l1_table, l1_size2) != l1_size2) - goto fail; - for(i = 0;i < l1_size; i++) - be64_to_cpus(&l1_table[i]); - } - - /* Do the actual checks */ - for(i = 0; i < l1_size; i++) { - l2_offset = l1_table[i]; - if (l2_offset) { - /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ - if (flags & CHECK_OFLAG_COPIED) { - refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) - >> s->cluster_bits); - if (refcount < 0) { - fprintf(stderr, "Can't get refcount for l2_offset %" - PRIx64 ": %s\n", l2_offset, strerror(-refcount)); - goto fail; - } - if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) { - fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64 - " refcount=%d\n", l2_offset, refcount); - res->corruptions++; - } - } - - /* Mark L2 table as used */ - l2_offset &= L1E_OFFSET_MASK; - inc_refcounts(bs, res, refcount_table, refcount_table_size, - l2_offset, s->cluster_size); - - /* L2 tables are cluster aligned */ - if (l2_offset & (s->cluster_size - 1)) { - fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " - "cluster aligned; L1 entry corrupted\n", l2_offset); - res->corruptions++; - } - - /* Process and check L2 entries */ - ret = check_refcounts_l2(bs, res, refcount_table, - refcount_table_size, l2_offset, flags); - if (ret < 0) { - goto fail; - } - } - } - g_free(l1_table); - return 0; - -fail: - fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); - res->check_errors++; - g_free(l1_table); - return -EIO; -} - -/* - * Checks an image for refcount consistency. - * - * Returns 0 if no errors are found, the number of errors in case the image is - * detected as corrupted, and -errno when an internal error occurred. - */ -int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, - BdrvCheckMode fix) -{ - BDRVQcowState *s = bs->opaque; - int64_t size, i, highest_cluster; - int nb_clusters, refcount1, refcount2; - QCowSnapshot *sn; - uint16_t *refcount_table; - int ret; - - size = bdrv_getlength(bs->file); - nb_clusters = size_to_clusters(s, size); - refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t)); - - res->bfi.total_clusters = - size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE); - - /* header */ - inc_refcounts(bs, res, refcount_table, nb_clusters, - 0, s->cluster_size); - - /* current L1 table */ - ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, - s->l1_table_offset, s->l1_size, - CHECK_OFLAG_COPIED | CHECK_FRAG_INFO); - if (ret < 0) { - goto fail; - } - - /* snapshots */ - for(i = 0; i < s->nb_snapshots; i++) { - sn = s->snapshots + i; - ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, - sn->l1_table_offset, sn->l1_size, 0); - if (ret < 0) { - goto fail; - } - } - inc_refcounts(bs, res, refcount_table, nb_clusters, - s->snapshots_offset, s->snapshots_size); - - /* refcount data */ - inc_refcounts(bs, res, refcount_table, nb_clusters, - s->refcount_table_offset, - s->refcount_table_size * sizeof(uint64_t)); - - for(i = 0; i < s->refcount_table_size; i++) { - uint64_t offset, cluster; - offset = s->refcount_table[i]; - cluster = offset >> s->cluster_bits; - - /* Refcount blocks are cluster aligned */ - if (offset & (s->cluster_size - 1)) { - fprintf(stderr, "ERROR refcount block %" PRId64 " is not " - "cluster aligned; refcount table entry corrupted\n", i); - res->corruptions++; - continue; - } - - if (cluster >= nb_clusters) { - fprintf(stderr, "ERROR refcount block %" PRId64 - " is outside image\n", i); - res->corruptions++; - continue; - } - - if (offset != 0) { - inc_refcounts(bs, res, refcount_table, nb_clusters, - offset, s->cluster_size); - if (refcount_table[cluster] != 1) { - fprintf(stderr, "ERROR refcount block %" PRId64 - " refcount=%d\n", - i, refcount_table[cluster]); - res->corruptions++; - } - } - } - - /* compare ref counts */ - for (i = 0, highest_cluster = 0; i < nb_clusters; i++) { - refcount1 = get_refcount(bs, i); - if (refcount1 < 0) { - fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", - i, strerror(-refcount1)); - res->check_errors++; - continue; - } - - refcount2 = refcount_table[i]; - - if (refcount1 > 0 || refcount2 > 0) { - highest_cluster = i; - } - - if (refcount1 != refcount2) { - - /* Check if we're allowed to fix the mismatch */ - int *num_fixed = NULL; - if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) { - num_fixed = &res->leaks_fixed; - } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) { - num_fixed = &res->corruptions_fixed; - } - - fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n", - num_fixed != NULL ? "Repairing" : - refcount1 < refcount2 ? "ERROR" : - "Leaked", - i, refcount1, refcount2); - - if (num_fixed) { - ret = update_refcount(bs, i << s->cluster_bits, 1, - refcount2 - refcount1, - QCOW2_DISCARD_ALWAYS); - if (ret >= 0) { - (*num_fixed)++; - continue; - } - } - - /* And if we couldn't, print an error */ - if (refcount1 < refcount2) { - res->corruptions++; - } else { - res->leaks++; - } - } - } - - res->image_end_offset = (highest_cluster + 1) * s->cluster_size; - ret = 0; - -fail: - g_free(refcount_table); - - return ret; -} - diff --git a/contrib/qemu/block/qcow2-snapshot.c b/contrib/qemu/block/qcow2-snapshot.c deleted file mode 100644 index 0caac9055f8..00000000000 --- a/contrib/qemu/block/qcow2-snapshot.c +++ /dev/null @@ -1,660 +0,0 @@ -/* - * Block driver for the QCOW version 2 format - * - * Copyright (c) 2004-2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "qemu-common.h" -#include "block/block_int.h" -#include "block/qcow2.h" - -typedef struct QEMU_PACKED QCowSnapshotHeader { - /* header is 8 byte aligned */ - uint64_t l1_table_offset; - - uint32_t l1_size; - uint16_t id_str_size; - uint16_t name_size; - - uint32_t date_sec; - uint32_t date_nsec; - - uint64_t vm_clock_nsec; - - uint32_t vm_state_size; - uint32_t extra_data_size; /* for extension */ - /* extra data follows */ - /* id_str follows */ - /* name follows */ -} QCowSnapshotHeader; - -typedef struct QEMU_PACKED QCowSnapshotExtraData { - uint64_t vm_state_size_large; - uint64_t disk_size; -} QCowSnapshotExtraData; - -void qcow2_free_snapshots(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - int i; - - for(i = 0; i < s->nb_snapshots; i++) { - g_free(s->snapshots[i].name); - g_free(s->snapshots[i].id_str); - } - g_free(s->snapshots); - s->snapshots = NULL; - s->nb_snapshots = 0; -} - -int qcow2_read_snapshots(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - QCowSnapshotHeader h; - QCowSnapshotExtraData extra; - QCowSnapshot *sn; - int i, id_str_size, name_size; - int64_t offset; - uint32_t extra_data_size; - int ret; - - if (!s->nb_snapshots) { - s->snapshots = NULL; - s->snapshots_size = 0; - return 0; - } - - offset = s->snapshots_offset; - s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot)); - - for(i = 0; i < s->nb_snapshots; i++) { - /* Read statically sized part of the snapshot header */ - offset = align_offset(offset, 8); - ret = bdrv_pread(bs->file, offset, &h, sizeof(h)); - if (ret < 0) { - goto fail; - } - - offset += sizeof(h); - sn = s->snapshots + i; - sn->l1_table_offset = be64_to_cpu(h.l1_table_offset); - sn->l1_size = be32_to_cpu(h.l1_size); - sn->vm_state_size = be32_to_cpu(h.vm_state_size); - sn->date_sec = be32_to_cpu(h.date_sec); - sn->date_nsec = be32_to_cpu(h.date_nsec); - sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec); - extra_data_size = be32_to_cpu(h.extra_data_size); - - id_str_size = be16_to_cpu(h.id_str_size); - name_size = be16_to_cpu(h.name_size); - - /* Read extra data */ - ret = bdrv_pread(bs->file, offset, &extra, - MIN(sizeof(extra), extra_data_size)); - if (ret < 0) { - goto fail; - } - offset += extra_data_size; - - if (extra_data_size >= 8) { - sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large); - } - - if (extra_data_size >= 16) { - sn->disk_size = be64_to_cpu(extra.disk_size); - } else { - sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; - } - - /* Read snapshot ID */ - sn->id_str = g_malloc(id_str_size + 1); - ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size); - if (ret < 0) { - goto fail; - } - offset += id_str_size; - sn->id_str[id_str_size] = '\0'; - - /* Read snapshot name */ - sn->name = g_malloc(name_size + 1); - ret = bdrv_pread(bs->file, offset, sn->name, name_size); - if (ret < 0) { - goto fail; - } - offset += name_size; - sn->name[name_size] = '\0'; - } - - s->snapshots_size = offset - s->snapshots_offset; - return 0; - -fail: - qcow2_free_snapshots(bs); - return ret; -} - -/* add at the end of the file a new list of snapshots */ -static int qcow2_write_snapshots(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - QCowSnapshot *sn; - QCowSnapshotHeader h; - QCowSnapshotExtraData extra; - int i, name_size, id_str_size, snapshots_size; - struct { - uint32_t nb_snapshots; - uint64_t snapshots_offset; - } QEMU_PACKED header_data; - int64_t offset, snapshots_offset; - int ret; - - /* compute the size of the snapshots */ - offset = 0; - for(i = 0; i < s->nb_snapshots; i++) { - sn = s->snapshots + i; - offset = align_offset(offset, 8); - offset += sizeof(h); - offset += sizeof(extra); - offset += strlen(sn->id_str); - offset += strlen(sn->name); - } - snapshots_size = offset; - - /* Allocate space for the new snapshot list */ - snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); - offset = snapshots_offset; - if (offset < 0) { - return offset; - } - ret = bdrv_flush(bs); - if (ret < 0) { - return ret; - } - - /* Write all snapshots to the new list */ - for(i = 0; i < s->nb_snapshots; i++) { - sn = s->snapshots + i; - memset(&h, 0, sizeof(h)); - h.l1_table_offset = cpu_to_be64(sn->l1_table_offset); - h.l1_size = cpu_to_be32(sn->l1_size); - /* If it doesn't fit in 32 bit, older implementations should treat it - * as a disk-only snapshot rather than truncate the VM state */ - if (sn->vm_state_size <= 0xffffffff) { - h.vm_state_size = cpu_to_be32(sn->vm_state_size); - } - h.date_sec = cpu_to_be32(sn->date_sec); - h.date_nsec = cpu_to_be32(sn->date_nsec); - h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec); - h.extra_data_size = cpu_to_be32(sizeof(extra)); - - memset(&extra, 0, sizeof(extra)); - extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size); - extra.disk_size = cpu_to_be64(sn->disk_size); - - id_str_size = strlen(sn->id_str); - name_size = strlen(sn->name); - h.id_str_size = cpu_to_be16(id_str_size); - h.name_size = cpu_to_be16(name_size); - offset = align_offset(offset, 8); - - ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h)); - if (ret < 0) { - goto fail; - } - offset += sizeof(h); - - ret = bdrv_pwrite(bs->file, offset, &extra, sizeof(extra)); - if (ret < 0) { - goto fail; - } - offset += sizeof(extra); - - ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size); - if (ret < 0) { - goto fail; - } - offset += id_str_size; - - ret = bdrv_pwrite(bs->file, offset, sn->name, name_size); - if (ret < 0) { - goto fail; - } - offset += name_size; - } - - /* - * Update the header to point to the new snapshot table. This requires the - * new table and its refcounts to be stable on disk. - */ - ret = bdrv_flush(bs); - if (ret < 0) { - goto fail; - } - - QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) != - offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots)); - - header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots); - header_data.snapshots_offset = cpu_to_be64(snapshots_offset); - - ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), - &header_data, sizeof(header_data)); - if (ret < 0) { - goto fail; - } - - /* free the old snapshot table */ - qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size, - QCOW2_DISCARD_SNAPSHOT); - s->snapshots_offset = snapshots_offset; - s->snapshots_size = snapshots_size; - return 0; - -fail: - return ret; -} - -static void find_new_snapshot_id(BlockDriverState *bs, - char *id_str, int id_str_size) -{ - BDRVQcowState *s = bs->opaque; - QCowSnapshot *sn; - int i, id, id_max = 0; - - for(i = 0; i < s->nb_snapshots; i++) { - sn = s->snapshots + i; - id = strtoul(sn->id_str, NULL, 10); - if (id > id_max) - id_max = id; - } - snprintf(id_str, id_str_size, "%d", id_max + 1); -} - -static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str) -{ - BDRVQcowState *s = bs->opaque; - int i; - - for(i = 0; i < s->nb_snapshots; i++) { - if (!strcmp(s->snapshots[i].id_str, id_str)) - return i; - } - return -1; -} - -static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name) -{ - BDRVQcowState *s = bs->opaque; - int i, ret; - - ret = find_snapshot_by_id(bs, name); - if (ret >= 0) - return ret; - for(i = 0; i < s->nb_snapshots; i++) { - if (!strcmp(s->snapshots[i].name, name)) - return i; - } - return -1; -} - -/* if no id is provided, a new one is constructed */ -int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) -{ - BDRVQcowState *s = bs->opaque; - QCowSnapshot *new_snapshot_list = NULL; - QCowSnapshot *old_snapshot_list = NULL; - QCowSnapshot sn1, *sn = &sn1; - int i, ret; - uint64_t *l1_table = NULL; - int64_t l1_table_offset; - - memset(sn, 0, sizeof(*sn)); - - /* Generate an ID if it wasn't passed */ - if (sn_info->id_str[0] == '\0') { - find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); - } - - /* Check that the ID is unique */ - if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) { - return -EEXIST; - } - - /* Populate sn with passed data */ - sn->id_str = g_strdup(sn_info->id_str); - sn->name = g_strdup(sn_info->name); - - sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; - sn->vm_state_size = sn_info->vm_state_size; - sn->date_sec = sn_info->date_sec; - sn->date_nsec = sn_info->date_nsec; - sn->vm_clock_nsec = sn_info->vm_clock_nsec; - - /* Allocate the L1 table of the snapshot and copy the current one there. */ - l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); - if (l1_table_offset < 0) { - ret = l1_table_offset; - goto fail; - } - - sn->l1_table_offset = l1_table_offset; - sn->l1_size = s->l1_size; - - l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); - for(i = 0; i < s->l1_size; i++) { - l1_table[i] = cpu_to_be64(s->l1_table[i]); - } - - ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, - s->l1_size * sizeof(uint64_t)); - if (ret < 0) { - goto fail; - } - - g_free(l1_table); - l1_table = NULL; - - /* - * Increase the refcounts of all clusters and make sure everything is - * stable on disk before updating the snapshot table to contain a pointer - * to the new L1 table. - */ - ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); - if (ret < 0) { - goto fail; - } - - /* Append the new snapshot to the snapshot list */ - new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot)); - if (s->snapshots) { - memcpy(new_snapshot_list, s->snapshots, - s->nb_snapshots * sizeof(QCowSnapshot)); - old_snapshot_list = s->snapshots; - } - s->snapshots = new_snapshot_list; - s->snapshots[s->nb_snapshots++] = *sn; - - ret = qcow2_write_snapshots(bs); - if (ret < 0) { - g_free(s->snapshots); - s->snapshots = old_snapshot_list; - goto fail; - } - - g_free(old_snapshot_list); - -#ifdef DEBUG_ALLOC - { - BdrvCheckResult result = {0}; - qcow2_check_refcounts(bs, &result, 0); - } -#endif - return 0; - -fail: - g_free(sn->id_str); - g_free(sn->name); - g_free(l1_table); - - return ret; -} - -/* copy the snapshot 'snapshot_name' into the current disk image */ -int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) -{ - BDRVQcowState *s = bs->opaque; - QCowSnapshot *sn; - int i, snapshot_index; - int cur_l1_bytes, sn_l1_bytes; - int ret; - uint64_t *sn_l1_table = NULL; - - /* Search the snapshot */ - snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); - if (snapshot_index < 0) { - return -ENOENT; - } - sn = &s->snapshots[snapshot_index]; - - if (sn->disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) { - error_report("qcow2: Loading snapshots with different disk " - "size is not implemented"); - ret = -ENOTSUP; - goto fail; - } - - /* - * Make sure that the current L1 table is big enough to contain the whole - * L1 table of the snapshot. If the snapshot L1 table is smaller, the - * current one must be padded with zeros. - */ - ret = qcow2_grow_l1_table(bs, sn->l1_size, true); - if (ret < 0) { - goto fail; - } - - cur_l1_bytes = s->l1_size * sizeof(uint64_t); - sn_l1_bytes = sn->l1_size * sizeof(uint64_t); - - /* - * Copy the snapshot L1 table to the current L1 table. - * - * Before overwriting the old current L1 table on disk, make sure to - * increase all refcounts for the clusters referenced by the new one. - * Decrease the refcount referenced by the old one only when the L1 - * table is overwritten. - */ - sn_l1_table = g_malloc0(cur_l1_bytes); - - ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes); - if (ret < 0) { - goto fail; - } - - ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, - sn->l1_size, 1); - if (ret < 0) { - goto fail; - } - - ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, - cur_l1_bytes); - if (ret < 0) { - goto fail; - } - - /* - * Decrease refcount of clusters of current L1 table. - * - * At this point, the in-memory s->l1_table points to the old L1 table, - * whereas on disk we already have the new one. - * - * qcow2_update_snapshot_refcount special cases the current L1 table to use - * the in-memory data instead of really using the offset to load a new one, - * which is why this works. - */ - ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, - s->l1_size, -1); - - /* - * Now update the in-memory L1 table to be in sync with the on-disk one. We - * need to do this even if updating refcounts failed. - */ - for(i = 0;i < s->l1_size; i++) { - s->l1_table[i] = be64_to_cpu(sn_l1_table[i]); - } - - if (ret < 0) { - goto fail; - } - - g_free(sn_l1_table); - sn_l1_table = NULL; - - /* - * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed - * when we decreased the refcount of the old snapshot. - */ - ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); - if (ret < 0) { - goto fail; - } - -#ifdef DEBUG_ALLOC - { - BdrvCheckResult result = {0}; - qcow2_check_refcounts(bs, &result, 0); - } -#endif - return 0; - -fail: - g_free(sn_l1_table); - return ret; -} - -int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) -{ - BDRVQcowState *s = bs->opaque; - QCowSnapshot sn; - int snapshot_index, ret; - - /* Search the snapshot */ - snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); - if (snapshot_index < 0) { - return -ENOENT; - } - sn = s->snapshots[snapshot_index]; - - /* Remove it from the snapshot list */ - memmove(s->snapshots + snapshot_index, - s->snapshots + snapshot_index + 1, - (s->nb_snapshots - snapshot_index - 1) * sizeof(sn)); - s->nb_snapshots--; - ret = qcow2_write_snapshots(bs); - if (ret < 0) { - return ret; - } - - /* - * The snapshot is now unused, clean up. If we fail after this point, we - * won't recover but just leak clusters. - */ - g_free(sn.id_str); - g_free(sn.name); - - /* - * Now decrease the refcounts of clusters referenced by the snapshot and - * free the L1 table. - */ - ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset, - sn.l1_size, -1); - if (ret < 0) { - return ret; - } - qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t), - QCOW2_DISCARD_SNAPSHOT); - - /* must update the copied flag on the current cluster offsets */ - ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); - if (ret < 0) { - return ret; - } - -#ifdef DEBUG_ALLOC - { - BdrvCheckResult result = {0}; - qcow2_check_refcounts(bs, &result, 0); - } -#endif - return 0; -} - -int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) -{ - BDRVQcowState *s = bs->opaque; - QEMUSnapshotInfo *sn_tab, *sn_info; - QCowSnapshot *sn; - int i; - - if (!s->nb_snapshots) { - *psn_tab = NULL; - return s->nb_snapshots; - } - - sn_tab = g_malloc0(s->nb_snapshots * sizeof(QEMUSnapshotInfo)); - for(i = 0; i < s->nb_snapshots; i++) { - sn_info = sn_tab + i; - sn = s->snapshots + i; - pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), - sn->id_str); - pstrcpy(sn_info->name, sizeof(sn_info->name), - sn->name); - sn_info->vm_state_size = sn->vm_state_size; - sn_info->date_sec = sn->date_sec; - sn_info->date_nsec = sn->date_nsec; - sn_info->vm_clock_nsec = sn->vm_clock_nsec; - } - *psn_tab = sn_tab; - return s->nb_snapshots; -} - -int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name) -{ - int i, snapshot_index; - BDRVQcowState *s = bs->opaque; - QCowSnapshot *sn; - uint64_t *new_l1_table; - int new_l1_bytes; - int ret; - - assert(bs->read_only); - - /* Search the snapshot */ - snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name); - if (snapshot_index < 0) { - return -ENOENT; - } - sn = &s->snapshots[snapshot_index]; - - /* Allocate and read in the snapshot's L1 table */ - new_l1_bytes = s->l1_size * sizeof(uint64_t); - new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512)); - - ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); - if (ret < 0) { - g_free(new_l1_table); - return ret; - } - - /* Switch the L1 table */ - g_free(s->l1_table); - - s->l1_size = sn->l1_size; - s->l1_table_offset = sn->l1_table_offset; - s->l1_table = new_l1_table; - - for(i = 0;i < s->l1_size; i++) { - be64_to_cpus(&s->l1_table[i]); - } - - return 0; -} diff --git a/contrib/qemu/block/qcow2.c b/contrib/qemu/block/qcow2.c deleted file mode 100644 index 0eceefe2cd9..00000000000 --- a/contrib/qemu/block/qcow2.c +++ /dev/null @@ -1,1825 +0,0 @@ -/* - * Block driver for the QCOW version 2 format - * - * Copyright (c) 2004-2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu-common.h" -#include "block/block_int.h" -#include "qemu/module.h" -#include <zlib.h> -#include "qemu/aes.h" -#include "block/qcow2.h" -#include "qemu/error-report.h" -#include "qapi/qmp/qerror.h" -#include "qapi/qmp/qbool.h" -#include "trace.h" - -/* - Differences with QCOW: - - - Support for multiple incremental snapshots. - - Memory management by reference counts. - - Clusters which have a reference count of one have the bit - QCOW_OFLAG_COPIED to optimize write performance. - - Size of compressed clusters is stored in sectors to reduce bit usage - in the cluster offsets. - - Support for storing additional data (such as the VM state) in the - snapshots. - - If a backing store is used, the cluster size is not constrained - (could be backported to QCOW). - - L2 tables have always a size of one cluster. -*/ - - -typedef struct { - uint32_t magic; - uint32_t len; -} QCowExtension; - -#define QCOW2_EXT_MAGIC_END 0 -#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA -#define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857 - -static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) -{ - const QCowHeader *cow_header = (const void *)buf; - - if (buf_size >= sizeof(QCowHeader) && - be32_to_cpu(cow_header->magic) == QCOW_MAGIC && - be32_to_cpu(cow_header->version) >= 2) - return 100; - else - return 0; -} - - -/* - * read qcow2 extension and fill bs - * start reading from start_offset - * finish reading upon magic of value 0 or when end_offset reached - * unknown magic is skipped (future extension this version knows nothing about) - * return 0 upon success, non-0 otherwise - */ -static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, - uint64_t end_offset, void **p_feature_table) -{ - BDRVQcowState *s = bs->opaque; - QCowExtension ext; - uint64_t offset; - int ret; - -#ifdef DEBUG_EXT - printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); -#endif - offset = start_offset; - while (offset < end_offset) { - -#ifdef DEBUG_EXT - /* Sanity check */ - if (offset > s->cluster_size) - printf("qcow2_read_extension: suspicious offset %lu\n", offset); - - printf("attempting to read extended header in offset %lu\n", offset); -#endif - - if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) { - fprintf(stderr, "qcow2_read_extension: ERROR: " - "pread fail from offset %" PRIu64 "\n", - offset); - return 1; - } - be32_to_cpus(&ext.magic); - be32_to_cpus(&ext.len); - offset += sizeof(ext); -#ifdef DEBUG_EXT - printf("ext.magic = 0x%x\n", ext.magic); -#endif - if (ext.len > end_offset - offset) { - error_report("Header extension too large"); - return -EINVAL; - } - - switch (ext.magic) { - case QCOW2_EXT_MAGIC_END: - return 0; - - case QCOW2_EXT_MAGIC_BACKING_FORMAT: - if (ext.len >= sizeof(bs->backing_format)) { - fprintf(stderr, "ERROR: ext_backing_format: len=%u too large" - " (>=%zu)\n", - ext.len, sizeof(bs->backing_format)); - return 2; - } - if (bdrv_pread(bs->file, offset , bs->backing_format, - ext.len) != ext.len) - return 3; - bs->backing_format[ext.len] = '\0'; -#ifdef DEBUG_EXT - printf("Qcow2: Got format extension %s\n", bs->backing_format); -#endif - break; - - case QCOW2_EXT_MAGIC_FEATURE_TABLE: - if (p_feature_table != NULL) { - void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); - ret = bdrv_pread(bs->file, offset , feature_table, ext.len); - if (ret < 0) { - return ret; - } - - *p_feature_table = feature_table; - } - break; - - default: - /* unknown magic - save it in case we need to rewrite the header */ - { - Qcow2UnknownHeaderExtension *uext; - - uext = g_malloc0(sizeof(*uext) + ext.len); - uext->magic = ext.magic; - uext->len = ext.len; - QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); - - ret = bdrv_pread(bs->file, offset , uext->data, uext->len); - if (ret < 0) { - return ret; - } - } - break; - } - - offset += ((ext.len + 7) & ~7); - } - - return 0; -} - -static void cleanup_unknown_header_ext(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - Qcow2UnknownHeaderExtension *uext, *next; - - QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) { - QLIST_REMOVE(uext, next); - g_free(uext); - } -} - -static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs, - const char *fmt, ...) -{ - char msg[64]; - va_list ap; - - va_start(ap, fmt); - vsnprintf(msg, sizeof(msg), fmt, ap); - va_end(ap); - - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bs->device_name, "qcow2", msg); -} - -static void report_unsupported_feature(BlockDriverState *bs, - Qcow2Feature *table, uint64_t mask) -{ - while (table && table->name[0] != '\0') { - if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) { - if (mask & (1 << table->bit)) { - report_unsupported(bs, "%.46s",table->name); - mask &= ~(1 << table->bit); - } - } - table++; - } - - if (mask) { - report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask); - } -} - -/* - * Sets the dirty bit and flushes afterwards if necessary. - * - * The incompatible_features bit is only set if the image file header was - * updated successfully. Therefore it is not required to check the return - * value of this function. - */ -int qcow2_mark_dirty(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - uint64_t val; - int ret; - - assert(s->qcow_version >= 3); - - if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { - return 0; /* already dirty */ - } - - val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); - ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features), - &val, sizeof(val)); - if (ret < 0) { - return ret; - } - ret = bdrv_flush(bs->file); - if (ret < 0) { - return ret; - } - - /* Only treat image as dirty if the header was updated successfully */ - s->incompatible_features |= QCOW2_INCOMPAT_DIRTY; - return 0; -} - -/* - * Clears the dirty bit and flushes before if necessary. Only call this - * function when there are no pending requests, it does not guard against - * concurrent requests dirtying the image. - */ -static int qcow2_mark_clean(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - - if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { - int ret = bdrv_flush(bs); - if (ret < 0) { - return ret; - } - - s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; - return qcow2_update_header(bs); - } - return 0; -} - -static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, - BdrvCheckMode fix) -{ - int ret = qcow2_check_refcounts(bs, result, fix); - if (ret < 0) { - return ret; - } - - if (fix && result->check_errors == 0 && result->corruptions == 0) { - return qcow2_mark_clean(bs); - } - return ret; -} - -static QemuOptsList qcow2_runtime_opts = { - .name = "qcow2", - .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), - .desc = { - { - .name = "lazy_refcounts", - .type = QEMU_OPT_BOOL, - .help = "Postpone refcount updates", - }, - { - .name = QCOW2_OPT_DISCARD_REQUEST, - .type = QEMU_OPT_BOOL, - .help = "Pass guest discard requests to the layer below", - }, - { - .name = QCOW2_OPT_DISCARD_SNAPSHOT, - .type = QEMU_OPT_BOOL, - .help = "Generate discard requests when snapshot related space " - "is freed", - }, - { - .name = QCOW2_OPT_DISCARD_OTHER, - .type = QEMU_OPT_BOOL, - .help = "Generate discard requests when other clusters are freed", - }, - { /* end of list */ } - }, -}; - -static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) -{ - BDRVQcowState *s = bs->opaque; - int len, i, ret = 0; - QCowHeader header; - QemuOpts *opts; - Error *local_err = NULL; - uint64_t ext_end; - uint64_t l1_vm_state_index; - - ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); - if (ret < 0) { - goto fail; - } - be32_to_cpus(&header.magic); - be32_to_cpus(&header.version); - be64_to_cpus(&header.backing_file_offset); - be32_to_cpus(&header.backing_file_size); - be64_to_cpus(&header.size); - be32_to_cpus(&header.cluster_bits); - be32_to_cpus(&header.crypt_method); - be64_to_cpus(&header.l1_table_offset); - be32_to_cpus(&header.l1_size); - be64_to_cpus(&header.refcount_table_offset); - be32_to_cpus(&header.refcount_table_clusters); - be64_to_cpus(&header.snapshots_offset); - be32_to_cpus(&header.nb_snapshots); - - if (header.magic != QCOW_MAGIC) { - ret = -EMEDIUMTYPE; - goto fail; - } - if (header.version < 2 || header.version > 3) { - report_unsupported(bs, "QCOW version %d", header.version); - ret = -ENOTSUP; - goto fail; - } - - s->qcow_version = header.version; - - /* Initialise version 3 header fields */ - if (header.version == 2) { - header.incompatible_features = 0; - header.compatible_features = 0; - header.autoclear_features = 0; - header.refcount_order = 4; - header.header_length = 72; - } else { - be64_to_cpus(&header.incompatible_features); - be64_to_cpus(&header.compatible_features); - be64_to_cpus(&header.autoclear_features); - be32_to_cpus(&header.refcount_order); - be32_to_cpus(&header.header_length); - } - - if (header.header_length > sizeof(header)) { - s->unknown_header_fields_size = header.header_length - sizeof(header); - s->unknown_header_fields = g_malloc(s->unknown_header_fields_size); - ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, - s->unknown_header_fields_size); - if (ret < 0) { - goto fail; - } - } - - if (header.backing_file_offset) { - ext_end = header.backing_file_offset; - } else { - ext_end = 1 << header.cluster_bits; - } - - /* Handle feature bits */ - s->incompatible_features = header.incompatible_features; - s->compatible_features = header.compatible_features; - s->autoclear_features = header.autoclear_features; - - if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { - void *feature_table = NULL; - qcow2_read_extensions(bs, header.header_length, ext_end, - &feature_table); - report_unsupported_feature(bs, feature_table, - s->incompatible_features & - ~QCOW2_INCOMPAT_MASK); - ret = -ENOTSUP; - goto fail; - } - - /* Check support for various header values */ - if (header.refcount_order != 4) { - report_unsupported(bs, "%d bit reference counts", - 1 << header.refcount_order); - ret = -ENOTSUP; - goto fail; - } - - if (header.cluster_bits < MIN_CLUSTER_BITS || - header.cluster_bits > MAX_CLUSTER_BITS) { - ret = -EINVAL; - goto fail; - } - if (header.crypt_method > QCOW_CRYPT_AES) { - ret = -EINVAL; - goto fail; - } - s->crypt_method_header = header.crypt_method; - if (s->crypt_method_header) { - bs->encrypted = 1; - } - s->cluster_bits = header.cluster_bits; - s->cluster_size = 1 << s->cluster_bits; - s->cluster_sectors = 1 << (s->cluster_bits - 9); - s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ - s->l2_size = 1 << s->l2_bits; - bs->total_sectors = header.size / 512; - s->csize_shift = (62 - (s->cluster_bits - 8)); - s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; - s->cluster_offset_mask = (1LL << s->csize_shift) - 1; - s->refcount_table_offset = header.refcount_table_offset; - s->refcount_table_size = - header.refcount_table_clusters << (s->cluster_bits - 3); - - s->snapshots_offset = header.snapshots_offset; - s->nb_snapshots = header.nb_snapshots; - - /* read the level 1 table */ - s->l1_size = header.l1_size; - - l1_vm_state_index = size_to_l1(s, header.size); - if (l1_vm_state_index > INT_MAX) { - ret = -EFBIG; - goto fail; - } - s->l1_vm_state_index = l1_vm_state_index; - - /* the L1 table must contain at least enough entries to put - header.size bytes */ - if (s->l1_size < s->l1_vm_state_index) { - ret = -EINVAL; - goto fail; - } - s->l1_table_offset = header.l1_table_offset; - if (s->l1_size > 0) { - s->l1_table = g_malloc0( - align_offset(s->l1_size * sizeof(uint64_t), 512)); - ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, - s->l1_size * sizeof(uint64_t)); - if (ret < 0) { - goto fail; - } - for(i = 0;i < s->l1_size; i++) { - be64_to_cpus(&s->l1_table[i]); - } - } - - /* alloc L2 table/refcount block cache */ - s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE); - s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE); - - s->cluster_cache = g_malloc(s->cluster_size); - /* one more sector for decompressed data alignment */ - s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size - + 512); - s->cluster_cache_offset = -1; - s->flags = flags; - - ret = qcow2_refcount_init(bs); - if (ret != 0) { - goto fail; - } - - QLIST_INIT(&s->cluster_allocs); - QTAILQ_INIT(&s->discards); - - /* read qcow2 extensions */ - if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) { - ret = -EINVAL; - goto fail; - } - - /* read the backing file name */ - if (header.backing_file_offset != 0) { - len = header.backing_file_size; - if (len > 1023) { - len = 1023; - } - ret = bdrv_pread(bs->file, header.backing_file_offset, - bs->backing_file, len); - if (ret < 0) { - goto fail; - } - bs->backing_file[len] = '\0'; - } - - ret = qcow2_read_snapshots(bs); - if (ret < 0) { - goto fail; - } - - /* Clear unknown autoclear feature bits */ - if (!bs->read_only && s->autoclear_features != 0) { - s->autoclear_features = 0; - ret = qcow2_update_header(bs); - if (ret < 0) { - goto fail; - } - } - - /* Initialise locks */ - qemu_co_mutex_init(&s->lock); - - /* Repair image if dirty */ - if (!(flags & BDRV_O_CHECK) && !bs->read_only && - (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { - BdrvCheckResult result = {0}; - - ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS); - if (ret < 0) { - goto fail; - } - } - - /* Enable lazy_refcounts according to image and command line options */ - opts = qemu_opts_create_nofail(&qcow2_runtime_opts); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); - ret = -EINVAL; - goto fail; - } - - s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS, - (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)); - - s->discard_passthrough[QCOW2_DISCARD_NEVER] = false; - s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true; - s->discard_passthrough[QCOW2_DISCARD_REQUEST] = - qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST, - flags & BDRV_O_UNMAP); - s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] = - qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true); - s->discard_passthrough[QCOW2_DISCARD_OTHER] = - qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); - - qemu_opts_del(opts); - - if (s->use_lazy_refcounts && s->qcow_version < 3) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require " - "a qcow2 image with at least qemu 1.1 compatibility level"); - ret = -EINVAL; - goto fail; - } - -#ifdef DEBUG_ALLOC - { - BdrvCheckResult result = {0}; - qcow2_check_refcounts(bs, &result, 0); - } -#endif - return ret; - - fail: - g_free(s->unknown_header_fields); - cleanup_unknown_header_ext(bs); - qcow2_free_snapshots(bs); - qcow2_refcount_close(bs); - g_free(s->l1_table); - if (s->l2_table_cache) { - qcow2_cache_destroy(bs, s->l2_table_cache); - } - g_free(s->cluster_cache); - qemu_vfree(s->cluster_data); - return ret; -} - -static int qcow2_set_key(BlockDriverState *bs, const char *key) -{ - BDRVQcowState *s = bs->opaque; - uint8_t keybuf[16]; - int len, i; - - memset(keybuf, 0, 16); - len = strlen(key); - if (len > 16) - len = 16; - /* XXX: we could compress the chars to 7 bits to increase - entropy */ - for(i = 0;i < len;i++) { - keybuf[i] = key[i]; - } - s->crypt_method = s->crypt_method_header; - - if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) - return -1; - if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) - return -1; -#if 0 - /* test */ - { - uint8_t in[16]; - uint8_t out[16]; - uint8_t tmp[16]; - for(i=0;i<16;i++) - in[i] = i; - AES_encrypt(in, tmp, &s->aes_encrypt_key); - AES_decrypt(tmp, out, &s->aes_decrypt_key); - for(i = 0; i < 16; i++) - printf(" %02x", tmp[i]); - printf("\n"); - for(i = 0; i < 16; i++) - printf(" %02x", out[i]); - printf("\n"); - } -#endif - return 0; -} - -/* We have nothing to do for QCOW2 reopen, stubs just return - * success */ -static int qcow2_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - return 0; -} - -static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum) -{ - BDRVQcowState *s = bs->opaque; - uint64_t cluster_offset; - int ret; - - *pnum = nb_sectors; - /* FIXME We can get errors here, but the bdrv_co_is_allocated interface - * can't pass them on today */ - qemu_co_mutex_lock(&s->lock); - ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); - qemu_co_mutex_unlock(&s->lock); - if (ret < 0) { - *pnum = 0; - } - - return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO); -} - -/* handle reading after the end of the backing file */ -int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, - int64_t sector_num, int nb_sectors) -{ - int n1; - if ((sector_num + nb_sectors) <= bs->total_sectors) - return nb_sectors; - if (sector_num >= bs->total_sectors) - n1 = 0; - else - n1 = bs->total_sectors - sector_num; - - qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1)); - - return n1; -} - -static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, - int remaining_sectors, QEMUIOVector *qiov) -{ - BDRVQcowState *s = bs->opaque; - int index_in_cluster, n1; - int ret; - int cur_nr_sectors; /* number of sectors in current iteration */ - uint64_t cluster_offset = 0; - uint64_t bytes_done = 0; - QEMUIOVector hd_qiov; - uint8_t *cluster_data = NULL; - - qemu_iovec_init(&hd_qiov, qiov->niov); - - qemu_co_mutex_lock(&s->lock); - - while (remaining_sectors != 0) { - - /* prepare next request */ - cur_nr_sectors = remaining_sectors; - if (s->crypt_method) { - cur_nr_sectors = MIN(cur_nr_sectors, - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); - } - - ret = qcow2_get_cluster_offset(bs, sector_num << 9, - &cur_nr_sectors, &cluster_offset); - if (ret < 0) { - goto fail; - } - - index_in_cluster = sector_num & (s->cluster_sectors - 1); - - qemu_iovec_reset(&hd_qiov); - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, - cur_nr_sectors * 512); - - switch (ret) { - case QCOW2_CLUSTER_UNALLOCATED: - - if (bs->backing_hd) { - /* read from the base image */ - n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov, - sector_num, cur_nr_sectors); - if (n1 > 0) { - BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->backing_hd, sector_num, - n1, &hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - goto fail; - } - } - } else { - /* Note: in this case, no need to wait */ - qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); - } - break; - - case QCOW2_CLUSTER_ZERO: - qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); - break; - - case QCOW2_CLUSTER_COMPRESSED: - /* add AIO support for compressed blocks ? */ - ret = qcow2_decompress_cluster(bs, cluster_offset); - if (ret < 0) { - goto fail; - } - - qemu_iovec_from_buf(&hd_qiov, 0, - s->cluster_cache + index_in_cluster * 512, - 512 * cur_nr_sectors); - break; - - case QCOW2_CLUSTER_NORMAL: - if ((cluster_offset & 511) != 0) { - ret = -EIO; - goto fail; - } - - if (s->crypt_method) { - /* - * For encrypted images, read everything into a temporary - * contiguous buffer on which the AES functions can work. - */ - if (!cluster_data) { - cluster_data = - qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); - } - - assert(cur_nr_sectors <= - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); - qemu_iovec_reset(&hd_qiov); - qemu_iovec_add(&hd_qiov, cluster_data, - 512 * cur_nr_sectors); - } - - BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file, - (cluster_offset >> 9) + index_in_cluster, - cur_nr_sectors, &hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - goto fail; - } - if (s->crypt_method) { - qcow2_encrypt_sectors(s, sector_num, cluster_data, - cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key); - qemu_iovec_from_buf(qiov, bytes_done, - cluster_data, 512 * cur_nr_sectors); - } - break; - - default: - g_assert_not_reached(); - ret = -EIO; - goto fail; - } - - remaining_sectors -= cur_nr_sectors; - sector_num += cur_nr_sectors; - bytes_done += cur_nr_sectors * 512; - } - ret = 0; - -fail: - qemu_co_mutex_unlock(&s->lock); - - qemu_iovec_destroy(&hd_qiov); - qemu_vfree(cluster_data); - - return ret; -} - -static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, - int64_t sector_num, - int remaining_sectors, - QEMUIOVector *qiov) -{ - BDRVQcowState *s = bs->opaque; - int index_in_cluster; - int n_end; - int ret; - int cur_nr_sectors; /* number of sectors in current iteration */ - uint64_t cluster_offset; - QEMUIOVector hd_qiov; - uint64_t bytes_done = 0; - uint8_t *cluster_data = NULL; - QCowL2Meta *l2meta = NULL; - - trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num, - remaining_sectors); - - qemu_iovec_init(&hd_qiov, qiov->niov); - - s->cluster_cache_offset = -1; /* disable compressed cache */ - - qemu_co_mutex_lock(&s->lock); - - while (remaining_sectors != 0) { - - l2meta = NULL; - - trace_qcow2_writev_start_part(qemu_coroutine_self()); - index_in_cluster = sector_num & (s->cluster_sectors - 1); - n_end = index_in_cluster + remaining_sectors; - if (s->crypt_method && - n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) { - n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; - } - - ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, - index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta); - if (ret < 0) { - goto fail; - } - - assert((cluster_offset & 511) == 0); - - qemu_iovec_reset(&hd_qiov); - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, - cur_nr_sectors * 512); - - if (s->crypt_method) { - if (!cluster_data) { - cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * - s->cluster_size); - } - - assert(hd_qiov.size <= - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); - qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); - - qcow2_encrypt_sectors(s, sector_num, cluster_data, - cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key); - - qemu_iovec_reset(&hd_qiov); - qemu_iovec_add(&hd_qiov, cluster_data, - cur_nr_sectors * 512); - } - - qemu_co_mutex_unlock(&s->lock); - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); - trace_qcow2_writev_data(qemu_coroutine_self(), - (cluster_offset >> 9) + index_in_cluster); - ret = bdrv_co_writev(bs->file, - (cluster_offset >> 9) + index_in_cluster, - cur_nr_sectors, &hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - goto fail; - } - - while (l2meta != NULL) { - QCowL2Meta *next; - - ret = qcow2_alloc_cluster_link_l2(bs, l2meta); - if (ret < 0) { - goto fail; - } - - /* Take the request off the list of running requests */ - if (l2meta->nb_clusters != 0) { - QLIST_REMOVE(l2meta, next_in_flight); - } - - qemu_co_queue_restart_all(&l2meta->dependent_requests); - - next = l2meta->next; - g_free(l2meta); - l2meta = next; - } - - remaining_sectors -= cur_nr_sectors; - sector_num += cur_nr_sectors; - bytes_done += cur_nr_sectors * 512; - trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors); - } - ret = 0; - -fail: - qemu_co_mutex_unlock(&s->lock); - - while (l2meta != NULL) { - QCowL2Meta *next; - - if (l2meta->nb_clusters != 0) { - QLIST_REMOVE(l2meta, next_in_flight); - } - qemu_co_queue_restart_all(&l2meta->dependent_requests); - - next = l2meta->next; - g_free(l2meta); - l2meta = next; - } - - qemu_iovec_destroy(&hd_qiov); - qemu_vfree(cluster_data); - trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); - - return ret; -} - -static void qcow2_close(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - g_free(s->l1_table); - - qcow2_cache_flush(bs, s->l2_table_cache); - qcow2_cache_flush(bs, s->refcount_block_cache); - - qcow2_mark_clean(bs); - - qcow2_cache_destroy(bs, s->l2_table_cache); - qcow2_cache_destroy(bs, s->refcount_block_cache); - - g_free(s->unknown_header_fields); - cleanup_unknown_header_ext(bs); - - g_free(s->cluster_cache); - qemu_vfree(s->cluster_data); - qcow2_refcount_close(bs); - qcow2_free_snapshots(bs); -} - -static void qcow2_invalidate_cache(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - int flags = s->flags; - AES_KEY aes_encrypt_key; - AES_KEY aes_decrypt_key; - uint32_t crypt_method = 0; - QDict *options; - - /* - * Backing files are read-only which makes all of their metadata immutable, - * that means we don't have to worry about reopening them here. - */ - - if (s->crypt_method) { - crypt_method = s->crypt_method; - memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key)); - memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key)); - } - - qcow2_close(bs); - - options = qdict_new(); - qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS, - qbool_from_int(s->use_lazy_refcounts)); - - memset(s, 0, sizeof(BDRVQcowState)); - qcow2_open(bs, options, flags); - - QDECREF(options); - - if (crypt_method) { - s->crypt_method = crypt_method; - memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key)); - memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key)); - } -} - -static size_t header_ext_add(char *buf, uint32_t magic, const void *s, - size_t len, size_t buflen) -{ - QCowExtension *ext_backing_fmt = (QCowExtension*) buf; - size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7); - - if (buflen < ext_len) { - return -ENOSPC; - } - - *ext_backing_fmt = (QCowExtension) { - .magic = cpu_to_be32(magic), - .len = cpu_to_be32(len), - }; - memcpy(buf + sizeof(QCowExtension), s, len); - - return ext_len; -} - -/* - * Updates the qcow2 header, including the variable length parts of it, i.e. - * the backing file name and all extensions. qcow2 was not designed to allow - * such changes, so if we run out of space (we can only use the first cluster) - * this function may fail. - * - * Returns 0 on success, -errno in error cases. - */ -int qcow2_update_header(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - QCowHeader *header; - char *buf; - size_t buflen = s->cluster_size; - int ret; - uint64_t total_size; - uint32_t refcount_table_clusters; - size_t header_length; - Qcow2UnknownHeaderExtension *uext; - - buf = qemu_blockalign(bs, buflen); - - /* Header structure */ - header = (QCowHeader*) buf; - - if (buflen < sizeof(*header)) { - ret = -ENOSPC; - goto fail; - } - - header_length = sizeof(*header) + s->unknown_header_fields_size; - total_size = bs->total_sectors * BDRV_SECTOR_SIZE; - refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3); - - *header = (QCowHeader) { - /* Version 2 fields */ - .magic = cpu_to_be32(QCOW_MAGIC), - .version = cpu_to_be32(s->qcow_version), - .backing_file_offset = 0, - .backing_file_size = 0, - .cluster_bits = cpu_to_be32(s->cluster_bits), - .size = cpu_to_be64(total_size), - .crypt_method = cpu_to_be32(s->crypt_method_header), - .l1_size = cpu_to_be32(s->l1_size), - .l1_table_offset = cpu_to_be64(s->l1_table_offset), - .refcount_table_offset = cpu_to_be64(s->refcount_table_offset), - .refcount_table_clusters = cpu_to_be32(refcount_table_clusters), - .nb_snapshots = cpu_to_be32(s->nb_snapshots), - .snapshots_offset = cpu_to_be64(s->snapshots_offset), - - /* Version 3 fields */ - .incompatible_features = cpu_to_be64(s->incompatible_features), - .compatible_features = cpu_to_be64(s->compatible_features), - .autoclear_features = cpu_to_be64(s->autoclear_features), - .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT), - .header_length = cpu_to_be32(header_length), - }; - - /* For older versions, write a shorter header */ - switch (s->qcow_version) { - case 2: - ret = offsetof(QCowHeader, incompatible_features); - break; - case 3: - ret = sizeof(*header); - break; - default: - ret = -EINVAL; - goto fail; - } - - buf += ret; - buflen -= ret; - memset(buf, 0, buflen); - - /* Preserve any unknown field in the header */ - if (s->unknown_header_fields_size) { - if (buflen < s->unknown_header_fields_size) { - ret = -ENOSPC; - goto fail; - } - - memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size); - buf += s->unknown_header_fields_size; - buflen -= s->unknown_header_fields_size; - } - - /* Backing file format header extension */ - if (*bs->backing_format) { - ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT, - bs->backing_format, strlen(bs->backing_format), - buflen); - if (ret < 0) { - goto fail; - } - - buf += ret; - buflen -= ret; - } - - /* Feature table */ - Qcow2Feature features[] = { - { - .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, - .bit = QCOW2_INCOMPAT_DIRTY_BITNR, - .name = "dirty bit", - }, - { - .type = QCOW2_FEAT_TYPE_COMPATIBLE, - .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, - .name = "lazy refcounts", - }, - }; - - ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE, - features, sizeof(features), buflen); - if (ret < 0) { - goto fail; - } - buf += ret; - buflen -= ret; - - /* Keep unknown header extensions */ - QLIST_FOREACH(uext, &s->unknown_header_ext, next) { - ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen); - if (ret < 0) { - goto fail; - } - - buf += ret; - buflen -= ret; - } - - /* End of header extensions */ - ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen); - if (ret < 0) { - goto fail; - } - - buf += ret; - buflen -= ret; - - /* Backing file name */ - if (*bs->backing_file) { - size_t backing_file_len = strlen(bs->backing_file); - - if (buflen < backing_file_len) { - ret = -ENOSPC; - goto fail; - } - - /* Using strncpy is ok here, since buf is not NUL-terminated. */ - strncpy(buf, bs->backing_file, buflen); - - header->backing_file_offset = cpu_to_be64(buf - ((char*) header)); - header->backing_file_size = cpu_to_be32(backing_file_len); - } - - /* Write the new header */ - ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size); - if (ret < 0) { - goto fail; - } - - ret = 0; -fail: - qemu_vfree(header); - return ret; -} - -static int qcow2_change_backing_file(BlockDriverState *bs, - const char *backing_file, const char *backing_fmt) -{ - pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); - pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); - - return qcow2_update_header(bs); -} - -static int preallocate(BlockDriverState *bs) -{ - uint64_t nb_sectors; - uint64_t offset; - uint64_t host_offset = 0; - int num; - int ret; - QCowL2Meta *meta; - - nb_sectors = bdrv_getlength(bs) >> 9; - offset = 0; - - while (nb_sectors) { - num = MIN(nb_sectors, INT_MAX >> 9); - ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, - &host_offset, &meta); - if (ret < 0) { - return ret; - } - - ret = qcow2_alloc_cluster_link_l2(bs, meta); - if (ret < 0) { - qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters, - QCOW2_DISCARD_NEVER); - return ret; - } - - /* There are no dependent requests, but we need to remove our request - * from the list of in-flight requests */ - if (meta != NULL) { - QLIST_REMOVE(meta, next_in_flight); - } - - /* TODO Preallocate data if requested */ - - nb_sectors -= num; - offset += num << 9; - } - - /* - * It is expected that the image file is large enough to actually contain - * all of the allocated clusters (otherwise we get failing reads after - * EOF). Extend the image to the last allocated sector. - */ - if (host_offset != 0) { - uint8_t buf[512]; - memset(buf, 0, 512); - ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1); - if (ret < 0) { - return ret; - } - } - - return 0; -} - -static int qcow2_create2(const char *filename, int64_t total_size, - const char *backing_file, const char *backing_format, - int flags, size_t cluster_size, int prealloc, - QEMUOptionParameter *options, int version) -{ - /* Calculate cluster_bits */ - int cluster_bits; - cluster_bits = ffs(cluster_size) - 1; - if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || - (1 << cluster_bits) != cluster_size) - { - error_report( - "Cluster size must be a power of two between %d and %dk", - 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); - return -EINVAL; - } - - /* - * Open the image file and write a minimal qcow2 header. - * - * We keep things simple and start with a zero-sized image. We also - * do without refcount blocks or a L1 table for now. We'll fix the - * inconsistency later. - * - * We do need a refcount table because growing the refcount table means - * allocating two new refcount blocks - the seconds of which would be at - * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file - * size for any qcow2 image. - */ - BlockDriverState* bs; - QCowHeader header; - uint8_t* refcount_table; - int ret; - - ret = bdrv_create_file(filename, options); - if (ret < 0) { - return ret; - } - - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); - if (ret < 0) { - return ret; - } - - /* Write the header */ - memset(&header, 0, sizeof(header)); - header.magic = cpu_to_be32(QCOW_MAGIC); - header.version = cpu_to_be32(version); - header.cluster_bits = cpu_to_be32(cluster_bits); - header.size = cpu_to_be64(0); - header.l1_table_offset = cpu_to_be64(0); - header.l1_size = cpu_to_be32(0); - header.refcount_table_offset = cpu_to_be64(cluster_size); - header.refcount_table_clusters = cpu_to_be32(1); - header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT); - header.header_length = cpu_to_be32(sizeof(header)); - - if (flags & BLOCK_FLAG_ENCRYPT) { - header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); - } else { - header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); - } - - if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { - header.compatible_features |= - cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); - } - - ret = bdrv_pwrite(bs, 0, &header, sizeof(header)); - if (ret < 0) { - goto out; - } - - /* Write an empty refcount table */ - refcount_table = g_malloc0(cluster_size); - ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size); - g_free(refcount_table); - - if (ret < 0) { - goto out; - } - - bdrv_close(bs); - - /* - * And now open the image and make it consistent first (i.e. increase the - * refcount of the cluster that is occupied by the header and the refcount - * table) - */ - BlockDriver* drv = bdrv_find_format("qcow2"); - assert(drv != NULL); - ret = bdrv_open(bs, filename, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv); - if (ret < 0) { - goto out; - } - - ret = qcow2_alloc_clusters(bs, 2 * cluster_size); - if (ret < 0) { - goto out; - - } else if (ret != 0) { - error_report("Huh, first cluster in empty image is already in use?"); - abort(); - } - - /* Okay, now that we have a valid image, let's give it the right size */ - ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE); - if (ret < 0) { - goto out; - } - - /* Want a backing file? There you go.*/ - if (backing_file) { - ret = bdrv_change_backing_file(bs, backing_file, backing_format); - if (ret < 0) { - goto out; - } - } - - /* And if we're supposed to preallocate metadata, do that now */ - if (prealloc) { - BDRVQcowState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = preallocate(bs); - qemu_co_mutex_unlock(&s->lock); - if (ret < 0) { - goto out; - } - } - - ret = 0; -out: - bdrv_delete(bs); - return ret; -} - -static int qcow2_create(const char *filename, QEMUOptionParameter *options) -{ - const char *backing_file = NULL; - const char *backing_fmt = NULL; - uint64_t sectors = 0; - int flags = 0; - size_t cluster_size = DEFAULT_CLUSTER_SIZE; - int prealloc = 0; - int version = 2; - - /* Read out options */ - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - sectors = options->value.n / 512; - } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { - backing_file = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { - backing_fmt = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { - flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; - } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { - if (options->value.n) { - cluster_size = options->value.n; - } - } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { - if (!options->value.s || !strcmp(options->value.s, "off")) { - prealloc = 0; - } else if (!strcmp(options->value.s, "metadata")) { - prealloc = 1; - } else { - fprintf(stderr, "Invalid preallocation mode: '%s'\n", - options->value.s); - return -EINVAL; - } - } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) { - if (!options->value.s || !strcmp(options->value.s, "0.10")) { - version = 2; - } else if (!strcmp(options->value.s, "1.1")) { - version = 3; - } else { - fprintf(stderr, "Invalid compatibility level: '%s'\n", - options->value.s); - return -EINVAL; - } - } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) { - flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0; - } - options++; - } - - if (backing_file && prealloc) { - fprintf(stderr, "Backing file and preallocation cannot be used at " - "the same time\n"); - return -EINVAL; - } - - if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { - fprintf(stderr, "Lazy refcounts only supported with compatibility " - "level 1.1 and above (use compat=1.1 or greater)\n"); - return -EINVAL; - } - - return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags, - cluster_size, prealloc, options, version); -} - -static int qcow2_make_empty(BlockDriverState *bs) -{ -#if 0 - /* XXX: not correct */ - BDRVQcowState *s = bs->opaque; - uint32_t l1_length = s->l1_size * sizeof(uint64_t); - int ret; - - memset(s->l1_table, 0, l1_length); - if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0) - return -1; - ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); - if (ret < 0) - return ret; - - l2_cache_reset(bs); -#endif - return 0; -} - -static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) -{ - int ret; - BDRVQcowState *s = bs->opaque; - - /* Emulate misaligned zero writes */ - if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) { - return -ENOTSUP; - } - - /* Whatever is left can use real zero clusters */ - qemu_co_mutex_lock(&s->lock); - ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors); - qemu_co_mutex_unlock(&s->lock); - - return ret; -} - -static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) -{ - int ret; - BDRVQcowState *s = bs->opaque; - - qemu_co_mutex_lock(&s->lock); - ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors); - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -static int qcow2_truncate(BlockDriverState *bs, int64_t offset) -{ - BDRVQcowState *s = bs->opaque; - int64_t new_l1_size; - int ret; - - if (offset & 511) { - error_report("The new size must be a multiple of 512"); - return -EINVAL; - } - - /* cannot proceed if image has snapshots */ - if (s->nb_snapshots) { - error_report("Can't resize an image which has snapshots"); - return -ENOTSUP; - } - - /* shrinking is currently not supported */ - if (offset < bs->total_sectors * 512) { - error_report("qcow2 doesn't support shrinking images yet"); - return -ENOTSUP; - } - - new_l1_size = size_to_l1(s, offset); - ret = qcow2_grow_l1_table(bs, new_l1_size, true); - if (ret < 0) { - return ret; - } - - /* write updated header.size */ - offset = cpu_to_be64(offset); - ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size), - &offset, sizeof(uint64_t)); - if (ret < 0) { - return ret; - } - - s->l1_vm_state_index = new_l1_size; - return 0; -} - -/* XXX: put compressed sectors first, then all the cluster aligned - tables to avoid losing bytes in alignment */ -static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - BDRVQcowState *s = bs->opaque; - z_stream strm; - int ret, out_len; - uint8_t *out_buf; - uint64_t cluster_offset; - - if (nb_sectors == 0) { - /* align end of file to a sector boundary to ease reading with - sector based I/Os */ - cluster_offset = bdrv_getlength(bs->file); - cluster_offset = (cluster_offset + 511) & ~511; - bdrv_truncate(bs->file, cluster_offset); - return 0; - } - - if (nb_sectors != s->cluster_sectors) { - ret = -EINVAL; - - /* Zero-pad last write if image size is not cluster aligned */ - if (sector_num + nb_sectors == bs->total_sectors && - nb_sectors < s->cluster_sectors) { - uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); - memset(pad_buf, 0, s->cluster_size); - memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); - ret = qcow2_write_compressed(bs, sector_num, - pad_buf, s->cluster_sectors); - qemu_vfree(pad_buf); - } - return ret; - } - - out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); - - /* best compression, small window, no zlib header */ - memset(&strm, 0, sizeof(strm)); - ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, - Z_DEFLATED, -12, - 9, Z_DEFAULT_STRATEGY); - if (ret != 0) { - ret = -EINVAL; - goto fail; - } - - strm.avail_in = s->cluster_size; - strm.next_in = (uint8_t *)buf; - strm.avail_out = s->cluster_size; - strm.next_out = out_buf; - - ret = deflate(&strm, Z_FINISH); - if (ret != Z_STREAM_END && ret != Z_OK) { - deflateEnd(&strm); - ret = -EINVAL; - goto fail; - } - out_len = strm.next_out - out_buf; - - deflateEnd(&strm); - - if (ret != Z_STREAM_END || out_len >= s->cluster_size) { - /* could not compress: write normal cluster */ - ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); - if (ret < 0) { - goto fail; - } - } else { - cluster_offset = qcow2_alloc_compressed_cluster_offset(bs, - sector_num << 9, out_len); - if (!cluster_offset) { - ret = -EIO; - goto fail; - } - cluster_offset &= s->cluster_offset_mask; - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); - ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); - if (ret < 0) { - goto fail; - } - } - - ret = 0; -fail: - g_free(out_buf); - return ret; -} - -static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - int ret; - - qemu_co_mutex_lock(&s->lock); - ret = qcow2_cache_flush(bs, s->l2_table_cache); - if (ret < 0) { - qemu_co_mutex_unlock(&s->lock); - return ret; - } - - if (qcow2_need_accurate_refcounts(s)) { - ret = qcow2_cache_flush(bs, s->refcount_block_cache); - if (ret < 0) { - qemu_co_mutex_unlock(&s->lock); - return ret; - } - } - qemu_co_mutex_unlock(&s->lock); - - return 0; -} - -static int64_t qcow2_vm_state_offset(BDRVQcowState *s) -{ - return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); -} - -static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - BDRVQcowState *s = bs->opaque; - bdi->cluster_size = s->cluster_size; - bdi->vm_state_offset = qcow2_vm_state_offset(s); - return 0; -} - -#if 0 -static void dump_refcounts(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - int64_t nb_clusters, k, k1, size; - int refcount; - - size = bdrv_getlength(bs->file); - nb_clusters = size_to_clusters(s, size); - for(k = 0; k < nb_clusters;) { - k1 = k; - refcount = get_refcount(bs, k); - k++; - while (k < nb_clusters && get_refcount(bs, k) == refcount) - k++; - printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount, - k - k1); - } -} -#endif - -static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, - int64_t pos) -{ - BDRVQcowState *s = bs->opaque; - int growable = bs->growable; - int ret; - - BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); - bs->growable = 1; - ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); - bs->growable = growable; - - return ret; -} - -static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size) -{ - BDRVQcowState *s = bs->opaque; - int growable = bs->growable; - int ret; - - BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); - bs->growable = 1; - ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size); - bs->growable = growable; - - return ret; -} - -static QEMUOptionParameter qcow2_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_COMPAT_LEVEL, - .type = OPT_STRING, - .help = "Compatibility level (0.10 or 1.1)" - }, - { - .name = BLOCK_OPT_BACKING_FILE, - .type = OPT_STRING, - .help = "File name of a base image" - }, - { - .name = BLOCK_OPT_BACKING_FMT, - .type = OPT_STRING, - .help = "Image format of the base image" - }, - { - .name = BLOCK_OPT_ENCRYPT, - .type = OPT_FLAG, - .help = "Encrypt the image" - }, - { - .name = BLOCK_OPT_CLUSTER_SIZE, - .type = OPT_SIZE, - .help = "qcow2 cluster size", - .value = { .n = DEFAULT_CLUSTER_SIZE }, - }, - { - .name = BLOCK_OPT_PREALLOC, - .type = OPT_STRING, - .help = "Preallocation mode (allowed values: off, metadata)" - }, - { - .name = BLOCK_OPT_LAZY_REFCOUNTS, - .type = OPT_FLAG, - .help = "Postpone refcount updates", - }, - { NULL } -}; - -static BlockDriver bdrv_qcow2 = { - .format_name = "qcow2", - .instance_size = sizeof(BDRVQcowState), - .bdrv_probe = qcow2_probe, - .bdrv_open = qcow2_open, - .bdrv_close = qcow2_close, - .bdrv_reopen_prepare = qcow2_reopen_prepare, - .bdrv_create = qcow2_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = qcow2_co_is_allocated, - .bdrv_set_key = qcow2_set_key, - .bdrv_make_empty = qcow2_make_empty, - - .bdrv_co_readv = qcow2_co_readv, - .bdrv_co_writev = qcow2_co_writev, - .bdrv_co_flush_to_os = qcow2_co_flush_to_os, - - .bdrv_co_write_zeroes = qcow2_co_write_zeroes, - .bdrv_co_discard = qcow2_co_discard, - .bdrv_truncate = qcow2_truncate, - .bdrv_write_compressed = qcow2_write_compressed, - - .bdrv_snapshot_create = qcow2_snapshot_create, - .bdrv_snapshot_goto = qcow2_snapshot_goto, - .bdrv_snapshot_delete = qcow2_snapshot_delete, - .bdrv_snapshot_list = qcow2_snapshot_list, - .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, - .bdrv_get_info = qcow2_get_info, - - .bdrv_save_vmstate = qcow2_save_vmstate, - .bdrv_load_vmstate = qcow2_load_vmstate, - - .bdrv_change_backing_file = qcow2_change_backing_file, - - .bdrv_invalidate_cache = qcow2_invalidate_cache, - - .create_options = qcow2_create_options, - .bdrv_check = qcow2_check, -}; - -static void bdrv_qcow2_init(void) -{ - bdrv_register(&bdrv_qcow2); -} - -block_init(bdrv_qcow2_init); diff --git a/contrib/qemu/block/qcow2.h b/contrib/qemu/block/qcow2.h deleted file mode 100644 index 3b2d5cda71f..00000000000 --- a/contrib/qemu/block/qcow2.h +++ /dev/null @@ -1,437 +0,0 @@ -/* - * Block driver for the QCOW version 2 format - * - * Copyright (c) 2004-2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#ifndef BLOCK_QCOW2_H -#define BLOCK_QCOW2_H - -#include "qemu/aes.h" -#include "block/coroutine.h" - -//#define DEBUG_ALLOC -//#define DEBUG_ALLOC2 -//#define DEBUG_EXT - -#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) - -#define QCOW_CRYPT_NONE 0 -#define QCOW_CRYPT_AES 1 - -#define QCOW_MAX_CRYPT_CLUSTERS 32 - -/* indicate that the refcount of the referenced cluster is exactly one. */ -#define QCOW_OFLAG_COPIED (1LL << 63) -/* indicate that the cluster is compressed (they never have the copied flag) */ -#define QCOW_OFLAG_COMPRESSED (1LL << 62) -/* The cluster reads as all zeros */ -#define QCOW_OFLAG_ZERO (1LL << 0) - -#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */ - -#define MIN_CLUSTER_BITS 9 -#define MAX_CLUSTER_BITS 21 - -#define L2_CACHE_SIZE 16 - -/* Must be at least 4 to cover all cases of refcount table growth */ -#define REFCOUNT_CACHE_SIZE 4 - -#define DEFAULT_CLUSTER_SIZE 65536 - - -#define QCOW2_OPT_LAZY_REFCOUNTS "lazy_refcounts" -#define QCOW2_OPT_DISCARD_REQUEST "pass_discard_request" -#define QCOW2_OPT_DISCARD_SNAPSHOT "pass_discard_snapshot" -#define QCOW2_OPT_DISCARD_OTHER "pass_discard_other" - -typedef struct QCowHeader { - uint32_t magic; - uint32_t version; - uint64_t backing_file_offset; - uint32_t backing_file_size; - uint32_t cluster_bits; - uint64_t size; /* in bytes */ - uint32_t crypt_method; - uint32_t l1_size; /* XXX: save number of clusters instead ? */ - uint64_t l1_table_offset; - uint64_t refcount_table_offset; - uint32_t refcount_table_clusters; - uint32_t nb_snapshots; - uint64_t snapshots_offset; - - /* The following fields are only valid for version >= 3 */ - uint64_t incompatible_features; - uint64_t compatible_features; - uint64_t autoclear_features; - - uint32_t refcount_order; - uint32_t header_length; -} QCowHeader; - -typedef struct QCowSnapshot { - uint64_t l1_table_offset; - uint32_t l1_size; - char *id_str; - char *name; - uint64_t disk_size; - uint64_t vm_state_size; - uint32_t date_sec; - uint32_t date_nsec; - uint64_t vm_clock_nsec; -} QCowSnapshot; - -struct Qcow2Cache; -typedef struct Qcow2Cache Qcow2Cache; - -typedef struct Qcow2UnknownHeaderExtension { - uint32_t magic; - uint32_t len; - QLIST_ENTRY(Qcow2UnknownHeaderExtension) next; - uint8_t data[]; -} Qcow2UnknownHeaderExtension; - -enum { - QCOW2_FEAT_TYPE_INCOMPATIBLE = 0, - QCOW2_FEAT_TYPE_COMPATIBLE = 1, - QCOW2_FEAT_TYPE_AUTOCLEAR = 2, -}; - -/* Incompatible feature bits */ -enum { - QCOW2_INCOMPAT_DIRTY_BITNR = 0, - QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, - - QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY, -}; - -/* Compatible feature bits */ -enum { - QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0, - QCOW2_COMPAT_LAZY_REFCOUNTS = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, - - QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS, -}; - -enum qcow2_discard_type { - QCOW2_DISCARD_NEVER = 0, - QCOW2_DISCARD_ALWAYS, - QCOW2_DISCARD_REQUEST, - QCOW2_DISCARD_SNAPSHOT, - QCOW2_DISCARD_OTHER, - QCOW2_DISCARD_MAX -}; - -typedef struct Qcow2Feature { - uint8_t type; - uint8_t bit; - char name[46]; -} QEMU_PACKED Qcow2Feature; - -typedef struct Qcow2DiscardRegion { - BlockDriverState *bs; - uint64_t offset; - uint64_t bytes; - QTAILQ_ENTRY(Qcow2DiscardRegion) next; -} Qcow2DiscardRegion; - -typedef struct BDRVQcowState { - int cluster_bits; - int cluster_size; - int cluster_sectors; - int l2_bits; - int l2_size; - int l1_size; - int l1_vm_state_index; - int csize_shift; - int csize_mask; - uint64_t cluster_offset_mask; - uint64_t l1_table_offset; - uint64_t *l1_table; - - Qcow2Cache* l2_table_cache; - Qcow2Cache* refcount_block_cache; - - uint8_t *cluster_cache; - uint8_t *cluster_data; - uint64_t cluster_cache_offset; - QLIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs; - - uint64_t *refcount_table; - uint64_t refcount_table_offset; - uint32_t refcount_table_size; - int64_t free_cluster_index; - int64_t free_byte_offset; - - CoMutex lock; - - uint32_t crypt_method; /* current crypt method, 0 if no key yet */ - uint32_t crypt_method_header; - AES_KEY aes_encrypt_key; - AES_KEY aes_decrypt_key; - uint64_t snapshots_offset; - int snapshots_size; - int nb_snapshots; - QCowSnapshot *snapshots; - - int flags; - int qcow_version; - bool use_lazy_refcounts; - - bool discard_passthrough[QCOW2_DISCARD_MAX]; - - uint64_t incompatible_features; - uint64_t compatible_features; - uint64_t autoclear_features; - - size_t unknown_header_fields_size; - void* unknown_header_fields; - QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext; - QTAILQ_HEAD (, Qcow2DiscardRegion) discards; - bool cache_discards; -} BDRVQcowState; - -/* XXX: use std qcow open function ? */ -typedef struct QCowCreateState { - int cluster_size; - int cluster_bits; - uint16_t *refcount_block; - uint64_t *refcount_table; - int64_t l1_table_offset; - int64_t refcount_table_offset; - int64_t refcount_block_offset; -} QCowCreateState; - -struct QCowAIOCB; - -typedef struct Qcow2COWRegion { - /** - * Offset of the COW region in bytes from the start of the first cluster - * touched by the request. - */ - uint64_t offset; - - /** Number of sectors to copy */ - int nb_sectors; -} Qcow2COWRegion; - -/** - * Describes an in-flight (part of a) write request that writes to clusters - * that are not referenced in their L2 table yet. - */ -typedef struct QCowL2Meta -{ - /** Guest offset of the first newly allocated cluster */ - uint64_t offset; - - /** Host offset of the first newly allocated cluster */ - uint64_t alloc_offset; - - /** - * Number of sectors from the start of the first allocated cluster to - * the end of the (possibly shortened) request - */ - int nb_available; - - /** Number of newly allocated clusters */ - int nb_clusters; - - /** - * Requests that overlap with this allocation and wait to be restarted - * when the allocating request has completed. - */ - CoQueue dependent_requests; - - /** - * The COW Region between the start of the first allocated cluster and the - * area the guest actually writes to. - */ - Qcow2COWRegion cow_start; - - /** - * The COW Region between the area the guest actually writes to and the - * end of the last allocated cluster. - */ - Qcow2COWRegion cow_end; - - /** Pointer to next L2Meta of the same write request */ - struct QCowL2Meta *next; - - QLIST_ENTRY(QCowL2Meta) next_in_flight; -} QCowL2Meta; - -enum { - QCOW2_CLUSTER_UNALLOCATED, - QCOW2_CLUSTER_NORMAL, - QCOW2_CLUSTER_COMPRESSED, - QCOW2_CLUSTER_ZERO -}; - -#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL -#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL -#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL - -#define REFT_OFFSET_MASK 0xffffffffffffff00ULL - -static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset) -{ - return offset & ~(s->cluster_size - 1); -} - -static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset) -{ - return offset & (s->cluster_size - 1); -} - -static inline int size_to_clusters(BDRVQcowState *s, int64_t size) -{ - return (size + (s->cluster_size - 1)) >> s->cluster_bits; -} - -static inline int64_t size_to_l1(BDRVQcowState *s, int64_t size) -{ - int shift = s->cluster_bits + s->l2_bits; - return (size + (1ULL << shift) - 1) >> shift; -} - -static inline int offset_to_l2_index(BDRVQcowState *s, int64_t offset) -{ - return (offset >> s->cluster_bits) & (s->l2_size - 1); -} - -static inline int64_t align_offset(int64_t offset, int n) -{ - offset = (offset + n - 1) & ~(n - 1); - return offset; -} - -static inline int qcow2_get_cluster_type(uint64_t l2_entry) -{ - if (l2_entry & QCOW_OFLAG_COMPRESSED) { - return QCOW2_CLUSTER_COMPRESSED; - } else if (l2_entry & QCOW_OFLAG_ZERO) { - return QCOW2_CLUSTER_ZERO; - } else if (!(l2_entry & L2E_OFFSET_MASK)) { - return QCOW2_CLUSTER_UNALLOCATED; - } else { - return QCOW2_CLUSTER_NORMAL; - } -} - -/* Check whether refcounts are eager or lazy */ -static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s) -{ - return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY); -} - -static inline uint64_t l2meta_cow_start(QCowL2Meta *m) -{ - return m->offset + m->cow_start.offset; -} - -static inline uint64_t l2meta_cow_end(QCowL2Meta *m) -{ - return m->offset + m->cow_end.offset - + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS); -} - -// FIXME Need qcow2_ prefix to global functions - -/* qcow2.c functions */ -int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, - int64_t sector_num, int nb_sectors); - -int qcow2_mark_dirty(BlockDriverState *bs); -int qcow2_update_header(BlockDriverState *bs); - -/* qcow2-refcount.c functions */ -int qcow2_refcount_init(BlockDriverState *bs); -void qcow2_refcount_close(BlockDriverState *bs); - -int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size); -int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, - int nb_clusters); -int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); -void qcow2_free_clusters(BlockDriverState *bs, - int64_t offset, int64_t size, - enum qcow2_discard_type type); -void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, - int nb_clusters, enum qcow2_discard_type type); - -int qcow2_update_snapshot_refcount(BlockDriverState *bs, - int64_t l1_table_offset, int l1_size, int addend); - -int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, - BdrvCheckMode fix); - -void qcow2_process_discards(BlockDriverState *bs, int ret); - -/* qcow2-cluster.c functions */ -int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, - bool exact_size); -void qcow2_l2_cache_reset(BlockDriverState *bs); -int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); -void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, int enc, - const AES_KEY *key); - -int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num, uint64_t *cluster_offset); -int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m); -uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, - uint64_t offset, - int compressed_size); - -int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); -int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors); -int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); - -/* qcow2-snapshot.c functions */ -int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); -int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id); -int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id); -int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab); -int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name); - -void qcow2_free_snapshots(BlockDriverState *bs); -int qcow2_read_snapshots(BlockDriverState *bs); - -/* qcow2-cache.c functions */ -Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables); -int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); - -void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table); -int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); -int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, - Qcow2Cache *dependency); -void qcow2_cache_depends_on_flush(Qcow2Cache *c); - -int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, - void **table); -int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, - void **table); -int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); - -#endif diff --git a/contrib/qemu/block/qed-check.c b/contrib/qemu/block/qed-check.c deleted file mode 100644 index b473dcd61f6..00000000000 --- a/contrib/qemu/block/qed-check.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * QEMU Enhanced Disk Format Consistency Check - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#include "qed.h" - -typedef struct { - BDRVQEDState *s; - BdrvCheckResult *result; - bool fix; /* whether to fix invalid offsets */ - - uint64_t nclusters; - uint32_t *used_clusters; /* referenced cluster bitmap */ - - QEDRequest request; -} QEDCheck; - -static bool qed_test_bit(uint32_t *bitmap, uint64_t n) { - return !!(bitmap[n / 32] & (1 << (n % 32))); -} - -static void qed_set_bit(uint32_t *bitmap, uint64_t n) { - bitmap[n / 32] |= 1 << (n % 32); -} - -/** - * Set bitmap bits for clusters - * - * @check: Check structure - * @offset: Starting offset in bytes - * @n: Number of clusters - */ -static bool qed_set_used_clusters(QEDCheck *check, uint64_t offset, - unsigned int n) -{ - uint64_t cluster = qed_bytes_to_clusters(check->s, offset); - unsigned int corruptions = 0; - - while (n-- != 0) { - /* Clusters should only be referenced once */ - if (qed_test_bit(check->used_clusters, cluster)) { - corruptions++; - } - - qed_set_bit(check->used_clusters, cluster); - cluster++; - } - - check->result->corruptions += corruptions; - return corruptions == 0; -} - -/** - * Check an L2 table - * - * @ret: Number of invalid cluster offsets - */ -static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table) -{ - BDRVQEDState *s = check->s; - unsigned int i, num_invalid = 0; - uint64_t last_offset = 0; - - for (i = 0; i < s->table_nelems; i++) { - uint64_t offset = table->offsets[i]; - - if (qed_offset_is_unalloc_cluster(offset) || - qed_offset_is_zero_cluster(offset)) { - continue; - } - check->result->bfi.allocated_clusters++; - if (last_offset && (last_offset + s->header.cluster_size != offset)) { - check->result->bfi.fragmented_clusters++; - } - last_offset = offset; - - /* Detect invalid cluster offset */ - if (!qed_check_cluster_offset(s, offset)) { - if (check->fix) { - table->offsets[i] = 0; - check->result->corruptions_fixed++; - } else { - check->result->corruptions++; - } - - num_invalid++; - continue; - } - - qed_set_used_clusters(check, offset, 1); - } - - return num_invalid; -} - -/** - * Descend tables and check each cluster is referenced once only - */ -static int qed_check_l1_table(QEDCheck *check, QEDTable *table) -{ - BDRVQEDState *s = check->s; - unsigned int i, num_invalid_l1 = 0; - int ret, last_error = 0; - - /* Mark L1 table clusters used */ - qed_set_used_clusters(check, s->header.l1_table_offset, - s->header.table_size); - - for (i = 0; i < s->table_nelems; i++) { - unsigned int num_invalid_l2; - uint64_t offset = table->offsets[i]; - - if (qed_offset_is_unalloc_cluster(offset)) { - continue; - } - - /* Detect invalid L2 offset */ - if (!qed_check_table_offset(s, offset)) { - /* Clear invalid offset */ - if (check->fix) { - table->offsets[i] = 0; - check->result->corruptions_fixed++; - } else { - check->result->corruptions++; - } - - num_invalid_l1++; - continue; - } - - if (!qed_set_used_clusters(check, offset, s->header.table_size)) { - continue; /* skip an invalid table */ - } - - ret = qed_read_l2_table_sync(s, &check->request, offset); - if (ret) { - check->result->check_errors++; - last_error = ret; - continue; - } - - num_invalid_l2 = qed_check_l2_table(check, - check->request.l2_table->table); - - /* Write out fixed L2 table */ - if (num_invalid_l2 > 0 && check->fix) { - ret = qed_write_l2_table_sync(s, &check->request, 0, - s->table_nelems, false); - if (ret) { - check->result->check_errors++; - last_error = ret; - continue; - } - } - } - - /* Drop reference to final table */ - qed_unref_l2_cache_entry(check->request.l2_table); - check->request.l2_table = NULL; - - /* Write out fixed L1 table */ - if (num_invalid_l1 > 0 && check->fix) { - ret = qed_write_l1_table_sync(s, 0, s->table_nelems); - if (ret) { - check->result->check_errors++; - last_error = ret; - } - } - - return last_error; -} - -/** - * Check for unreferenced (leaked) clusters - */ -static void qed_check_for_leaks(QEDCheck *check) -{ - BDRVQEDState *s = check->s; - uint64_t i; - - for (i = s->header.header_size; i < check->nclusters; i++) { - if (!qed_test_bit(check->used_clusters, i)) { - check->result->leaks++; - } - } -} - -/** - * Mark an image clean once it passes check or has been repaired - */ -static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result) -{ - /* Skip if there were unfixable corruptions or I/O errors */ - if (result->corruptions > 0 || result->check_errors > 0) { - return; - } - - /* Skip if image is already marked clean */ - if (!(s->header.features & QED_F_NEED_CHECK)) { - return; - } - - /* Ensure fixes reach storage before clearing check bit */ - bdrv_flush(s->bs); - - s->header.features &= ~QED_F_NEED_CHECK; - qed_write_header_sync(s); -} - -int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix) -{ - QEDCheck check = { - .s = s, - .result = result, - .nclusters = qed_bytes_to_clusters(s, s->file_size), - .request = { .l2_table = NULL }, - .fix = fix, - }; - int ret; - - check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) * - sizeof(check.used_clusters[0])); - - check.result->bfi.total_clusters = - (s->header.image_size + s->header.cluster_size - 1) / - s->header.cluster_size; - ret = qed_check_l1_table(&check, s->l1_table); - if (ret == 0) { - /* Only check for leaks if entire image was scanned successfully */ - qed_check_for_leaks(&check); - - if (fix) { - qed_check_mark_clean(s, result); - } - } - - g_free(check.used_clusters); - return ret; -} diff --git a/contrib/qemu/block/qed-cluster.c b/contrib/qemu/block/qed-cluster.c deleted file mode 100644 index f64b2af8f7e..00000000000 --- a/contrib/qemu/block/qed-cluster.c +++ /dev/null @@ -1,165 +0,0 @@ -/* - * QEMU Enhanced Disk Format Cluster functions - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> - * Anthony Liguori <aliguori@us.ibm.com> - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#include "qed.h" - -/** - * Count the number of contiguous data clusters - * - * @s: QED state - * @table: L2 table - * @index: First cluster index - * @n: Maximum number of clusters - * @offset: Set to first cluster offset - * - * This function scans tables for contiguous clusters. A contiguous run of - * clusters may be allocated, unallocated, or zero. - */ -static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s, - QEDTable *table, - unsigned int index, - unsigned int n, - uint64_t *offset) -{ - unsigned int end = MIN(index + n, s->table_nelems); - uint64_t last = table->offsets[index]; - unsigned int i; - - *offset = last; - - for (i = index + 1; i < end; i++) { - if (qed_offset_is_unalloc_cluster(last)) { - /* Counting unallocated clusters */ - if (!qed_offset_is_unalloc_cluster(table->offsets[i])) { - break; - } - } else if (qed_offset_is_zero_cluster(last)) { - /* Counting zero clusters */ - if (!qed_offset_is_zero_cluster(table->offsets[i])) { - break; - } - } else { - /* Counting allocated clusters */ - if (table->offsets[i] != last + s->header.cluster_size) { - break; - } - last = table->offsets[i]; - } - } - return i - index; -} - -typedef struct { - BDRVQEDState *s; - uint64_t pos; - size_t len; - - QEDRequest *request; - - /* User callback */ - QEDFindClusterFunc *cb; - void *opaque; -} QEDFindClusterCB; - -static void qed_find_cluster_cb(void *opaque, int ret) -{ - QEDFindClusterCB *find_cluster_cb = opaque; - BDRVQEDState *s = find_cluster_cb->s; - QEDRequest *request = find_cluster_cb->request; - uint64_t offset = 0; - size_t len = 0; - unsigned int index; - unsigned int n; - - if (ret) { - goto out; - } - - index = qed_l2_index(s, find_cluster_cb->pos); - n = qed_bytes_to_clusters(s, - qed_offset_into_cluster(s, find_cluster_cb->pos) + - find_cluster_cb->len); - n = qed_count_contiguous_clusters(s, request->l2_table->table, - index, n, &offset); - - if (qed_offset_is_unalloc_cluster(offset)) { - ret = QED_CLUSTER_L2; - } else if (qed_offset_is_zero_cluster(offset)) { - ret = QED_CLUSTER_ZERO; - } else if (qed_check_cluster_offset(s, offset)) { - ret = QED_CLUSTER_FOUND; - } else { - ret = -EINVAL; - } - - len = MIN(find_cluster_cb->len, n * s->header.cluster_size - - qed_offset_into_cluster(s, find_cluster_cb->pos)); - -out: - find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len); - g_free(find_cluster_cb); -} - -/** - * Find the offset of a data cluster - * - * @s: QED state - * @request: L2 cache entry - * @pos: Byte position in device - * @len: Number of bytes - * @cb: Completion function - * @opaque: User data for completion function - * - * This function translates a position in the block device to an offset in the - * image file. It invokes the cb completion callback to report back the - * translated offset or unallocated range in the image file. - * - * If the L2 table exists, request->l2_table points to the L2 table cache entry - * and the caller must free the reference when they are finished. The cache - * entry is exposed in this way to avoid callers having to read the L2 table - * again later during request processing. If request->l2_table is non-NULL it - * will be unreferenced before taking on the new cache entry. - */ -void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, - size_t len, QEDFindClusterFunc *cb, void *opaque) -{ - QEDFindClusterCB *find_cluster_cb; - uint64_t l2_offset; - - /* Limit length to L2 boundary. Requests are broken up at the L2 boundary - * so that a request acts on one L2 table at a time. - */ - len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos); - - l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)]; - if (qed_offset_is_unalloc_cluster(l2_offset)) { - cb(opaque, QED_CLUSTER_L1, 0, len); - return; - } - if (!qed_check_table_offset(s, l2_offset)) { - cb(opaque, -EINVAL, 0, 0); - return; - } - - find_cluster_cb = g_malloc(sizeof(*find_cluster_cb)); - find_cluster_cb->s = s; - find_cluster_cb->pos = pos; - find_cluster_cb->len = len; - find_cluster_cb->cb = cb; - find_cluster_cb->opaque = opaque; - find_cluster_cb->request = request; - - qed_read_l2_table(s, request, l2_offset, - qed_find_cluster_cb, find_cluster_cb); -} diff --git a/contrib/qemu/block/qed-gencb.c b/contrib/qemu/block/qed-gencb.c deleted file mode 100644 index 7d7ac1ffc8e..00000000000 --- a/contrib/qemu/block/qed-gencb.c +++ /dev/null @@ -1,32 +0,0 @@ -/* - * QEMU Enhanced Disk Format - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#include "qed.h" - -void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque) -{ - GenericCB *gencb = g_malloc(len); - gencb->cb = cb; - gencb->opaque = opaque; - return gencb; -} - -void gencb_complete(void *opaque, int ret) -{ - GenericCB *gencb = opaque; - BlockDriverCompletionFunc *cb = gencb->cb; - void *user_opaque = gencb->opaque; - - g_free(gencb); - cb(user_opaque, ret); -} diff --git a/contrib/qemu/block/qed-l2-cache.c b/contrib/qemu/block/qed-l2-cache.c deleted file mode 100644 index e9b2aae44d9..00000000000 --- a/contrib/qemu/block/qed-l2-cache.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * QEMU Enhanced Disk Format L2 Cache - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Anthony Liguori <aliguori@us.ibm.com> - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -/* - * L2 table cache usage is as follows: - * - * An open image has one L2 table cache that is used to avoid accessing the - * image file for recently referenced L2 tables. - * - * Cluster offset lookup translates the logical offset within the block device - * to a cluster offset within the image file. This is done by indexing into - * the L1 and L2 tables which store cluster offsets. It is here where the L2 - * table cache serves up recently referenced L2 tables. - * - * If there is a cache miss, that L2 table is read from the image file and - * committed to the cache. Subsequent accesses to that L2 table will be served - * from the cache until the table is evicted from the cache. - * - * L2 tables are also committed to the cache when new L2 tables are allocated - * in the image file. Since the L2 table cache is write-through, the new L2 - * table is first written out to the image file and then committed to the - * cache. - * - * Multiple I/O requests may be using an L2 table cache entry at any given - * time. That means an entry may be in use across several requests and - * reference counting is needed to free the entry at the correct time. In - * particular, an entry evicted from the cache will only be freed once all - * references are dropped. - * - * An in-flight I/O request will hold a reference to a L2 table cache entry for - * the period during which it needs to access the L2 table. This includes - * cluster offset lookup, L2 table allocation, and L2 table update when a new - * data cluster has been allocated. - * - * An interesting case occurs when two requests need to access an L2 table that - * is not in the cache. Since the operation to read the table from the image - * file takes some time to complete, both requests may see a cache miss and - * start reading the L2 table from the image file. The first to finish will - * commit its L2 table into the cache. When the second tries to commit its - * table will be deleted in favor of the existing cache entry. - */ - -#include "trace.h" -#include "qed.h" - -/* Each L2 holds 2GB so this let's us fully cache a 100GB disk */ -#define MAX_L2_CACHE_SIZE 50 - -/** - * Initialize the L2 cache - */ -void qed_init_l2_cache(L2TableCache *l2_cache) -{ - QTAILQ_INIT(&l2_cache->entries); - l2_cache->n_entries = 0; -} - -/** - * Free the L2 cache - */ -void qed_free_l2_cache(L2TableCache *l2_cache) -{ - CachedL2Table *entry, *next_entry; - - QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) { - qemu_vfree(entry->table); - g_free(entry); - } -} - -/** - * Allocate an uninitialized entry from the cache - * - * The returned entry has a reference count of 1 and is owned by the caller. - * The caller must allocate the actual table field for this entry and it must - * be freeable using qemu_vfree(). - */ -CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache) -{ - CachedL2Table *entry; - - entry = g_malloc0(sizeof(*entry)); - entry->ref++; - - trace_qed_alloc_l2_cache_entry(l2_cache, entry); - - return entry; -} - -/** - * Decrease an entry's reference count and free if necessary when the reference - * count drops to zero. - */ -void qed_unref_l2_cache_entry(CachedL2Table *entry) -{ - if (!entry) { - return; - } - - entry->ref--; - trace_qed_unref_l2_cache_entry(entry, entry->ref); - if (entry->ref == 0) { - qemu_vfree(entry->table); - g_free(entry); - } -} - -/** - * Find an entry in the L2 cache. This may return NULL and it's up to the - * caller to satisfy the cache miss. - * - * For a cached entry, this function increases the reference count and returns - * the entry. - */ -CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset) -{ - CachedL2Table *entry; - - QTAILQ_FOREACH(entry, &l2_cache->entries, node) { - if (entry->offset == offset) { - trace_qed_find_l2_cache_entry(l2_cache, entry, offset, entry->ref); - entry->ref++; - return entry; - } - } - return NULL; -} - -/** - * Commit an L2 cache entry into the cache. This is meant to be used as part of - * the process to satisfy a cache miss. A caller would allocate an entry which - * is not actually in the L2 cache and then once the entry was valid and - * present on disk, the entry can be committed into the cache. - * - * Since the cache is write-through, it's important that this function is not - * called until the entry is present on disk and the L1 has been updated to - * point to the entry. - * - * N.B. This function steals a reference to the l2_table from the caller so the - * caller must obtain a new reference by issuing a call to - * qed_find_l2_cache_entry(). - */ -void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table) -{ - CachedL2Table *entry; - - entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset); - if (entry) { - qed_unref_l2_cache_entry(entry); - qed_unref_l2_cache_entry(l2_table); - return; - } - - /* Evict an unused cache entry so we have space. If all entries are in use - * we can grow the cache temporarily and we try to shrink back down later. - */ - if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) { - CachedL2Table *next; - QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next) { - if (entry->ref > 1) { - continue; - } - - QTAILQ_REMOVE(&l2_cache->entries, entry, node); - l2_cache->n_entries--; - qed_unref_l2_cache_entry(entry); - - /* Stop evicting when we've shrunk back to max size */ - if (l2_cache->n_entries < MAX_L2_CACHE_SIZE) { - break; - } - } - } - - l2_cache->n_entries++; - QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node); -} diff --git a/contrib/qemu/block/qed-table.c b/contrib/qemu/block/qed-table.c deleted file mode 100644 index 76d2dcccf81..00000000000 --- a/contrib/qemu/block/qed-table.c +++ /dev/null @@ -1,296 +0,0 @@ -/* - * QEMU Enhanced Disk Format Table I/O - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> - * Anthony Liguori <aliguori@us.ibm.com> - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#include "trace.h" -#include "qemu/sockets.h" /* for EINPROGRESS on Windows */ -#include "qed.h" - -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - QEDTable *table; - - struct iovec iov; - QEMUIOVector qiov; -} QEDReadTableCB; - -static void qed_read_table_cb(void *opaque, int ret) -{ - QEDReadTableCB *read_table_cb = opaque; - QEDTable *table = read_table_cb->table; - int noffsets = read_table_cb->qiov.size / sizeof(uint64_t); - int i; - - /* Handle I/O error */ - if (ret) { - goto out; - } - - /* Byteswap offsets */ - for (i = 0; i < noffsets; i++) { - table->offsets[i] = le64_to_cpu(table->offsets[i]); - } - -out: - /* Completion */ - trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret); - gencb_complete(&read_table_cb->gencb, ret); -} - -static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, - BlockDriverCompletionFunc *cb, void *opaque) -{ - QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb), - cb, opaque); - QEMUIOVector *qiov = &read_table_cb->qiov; - - trace_qed_read_table(s, offset, table); - - read_table_cb->s = s; - read_table_cb->table = table; - read_table_cb->iov.iov_base = table->offsets, - read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, - - qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); - bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, - qiov->size / BDRV_SECTOR_SIZE, - qed_read_table_cb, read_table_cb); -} - -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - QEDTable *orig_table; - QEDTable *table; - bool flush; /* flush after write? */ - - struct iovec iov; - QEMUIOVector qiov; -} QEDWriteTableCB; - -static void qed_write_table_cb(void *opaque, int ret) -{ - QEDWriteTableCB *write_table_cb = opaque; - - trace_qed_write_table_cb(write_table_cb->s, - write_table_cb->orig_table, - write_table_cb->flush, - ret); - - if (ret) { - goto out; - } - - if (write_table_cb->flush) { - /* We still need to flush first */ - write_table_cb->flush = false; - bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb, - write_table_cb); - return; - } - -out: - qemu_vfree(write_table_cb->table); - gencb_complete(&write_table_cb->gencb, ret); -} - -/** - * Write out an updated part or all of a table - * - * @s: QED state - * @offset: Offset of table in image file, in bytes - * @table: Table - * @index: Index of first element - * @n: Number of elements - * @flush: Whether or not to sync to disk - * @cb: Completion function - * @opaque: Argument for completion function - */ -static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, - unsigned int index, unsigned int n, bool flush, - BlockDriverCompletionFunc *cb, void *opaque) -{ - QEDWriteTableCB *write_table_cb; - unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1; - unsigned int start, end, i; - size_t len_bytes; - - trace_qed_write_table(s, offset, table, index, n); - - /* Calculate indices of the first and one after last elements */ - start = index & ~sector_mask; - end = (index + n + sector_mask) & ~sector_mask; - - len_bytes = (end - start) * sizeof(uint64_t); - - write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque); - write_table_cb->s = s; - write_table_cb->orig_table = table; - write_table_cb->flush = flush; - write_table_cb->table = qemu_blockalign(s->bs, len_bytes); - write_table_cb->iov.iov_base = write_table_cb->table->offsets; - write_table_cb->iov.iov_len = len_bytes; - qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1); - - /* Byteswap table */ - for (i = start; i < end; i++) { - uint64_t le_offset = cpu_to_le64(table->offsets[i]); - write_table_cb->table->offsets[i - start] = le_offset; - } - - /* Adjust for offset into table */ - offset += start * sizeof(uint64_t); - - bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, - &write_table_cb->qiov, - write_table_cb->qiov.size / BDRV_SECTOR_SIZE, - qed_write_table_cb, write_table_cb); -} - -/** - * Propagate return value from async callback - */ -static void qed_sync_cb(void *opaque, int ret) -{ - *(int *)opaque = ret; -} - -int qed_read_l1_table_sync(BDRVQEDState *s) -{ - int ret = -EINPROGRESS; - - qed_read_table(s, s->header.l1_table_offset, - s->l1_table, qed_sync_cb, &ret); - while (ret == -EINPROGRESS) { - qemu_aio_wait(); - } - - return ret; -} - -void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, - BlockDriverCompletionFunc *cb, void *opaque) -{ - BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE); - qed_write_table(s, s->header.l1_table_offset, - s->l1_table, index, n, false, cb, opaque); -} - -int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, - unsigned int n) -{ - int ret = -EINPROGRESS; - - qed_write_l1_table(s, index, n, qed_sync_cb, &ret); - while (ret == -EINPROGRESS) { - qemu_aio_wait(); - } - - return ret; -} - -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - uint64_t l2_offset; - QEDRequest *request; -} QEDReadL2TableCB; - -static void qed_read_l2_table_cb(void *opaque, int ret) -{ - QEDReadL2TableCB *read_l2_table_cb = opaque; - QEDRequest *request = read_l2_table_cb->request; - BDRVQEDState *s = read_l2_table_cb->s; - CachedL2Table *l2_table = request->l2_table; - uint64_t l2_offset = read_l2_table_cb->l2_offset; - - if (ret) { - /* can't trust loaded L2 table anymore */ - qed_unref_l2_cache_entry(l2_table); - request->l2_table = NULL; - } else { - l2_table->offset = l2_offset; - - qed_commit_l2_cache_entry(&s->l2_cache, l2_table); - - /* This is guaranteed to succeed because we just committed the entry - * to the cache. - */ - request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); - assert(request->l2_table != NULL); - } - - gencb_complete(&read_l2_table_cb->gencb, ret); -} - -void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, - BlockDriverCompletionFunc *cb, void *opaque) -{ - QEDReadL2TableCB *read_l2_table_cb; - - qed_unref_l2_cache_entry(request->l2_table); - - /* Check for cached L2 entry */ - request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset); - if (request->l2_table) { - cb(opaque, 0); - return; - } - - request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); - request->l2_table->table = qed_alloc_table(s); - - read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque); - read_l2_table_cb->s = s; - read_l2_table_cb->l2_offset = offset; - read_l2_table_cb->request = request; - - BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD); - qed_read_table(s, offset, request->l2_table->table, - qed_read_l2_table_cb, read_l2_table_cb); -} - -int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset) -{ - int ret = -EINPROGRESS; - - qed_read_l2_table(s, request, offset, qed_sync_cb, &ret); - while (ret == -EINPROGRESS) { - qemu_aio_wait(); - } - - return ret; -} - -void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, - unsigned int index, unsigned int n, bool flush, - BlockDriverCompletionFunc *cb, void *opaque) -{ - BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE); - qed_write_table(s, request->l2_table->offset, - request->l2_table->table, index, n, flush, cb, opaque); -} - -int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, - unsigned int index, unsigned int n, bool flush) -{ - int ret = -EINPROGRESS; - - qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret); - while (ret == -EINPROGRESS) { - qemu_aio_wait(); - } - - return ret; -} diff --git a/contrib/qemu/block/qed.c b/contrib/qemu/block/qed.c deleted file mode 100644 index f767b0528ce..00000000000 --- a/contrib/qemu/block/qed.c +++ /dev/null @@ -1,1596 +0,0 @@ -/* - * QEMU Enhanced Disk Format - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> - * Anthony Liguori <aliguori@us.ibm.com> - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#include "qemu/timer.h" -#include "trace.h" -#include "qed.h" -#include "qapi/qmp/qerror.h" -#include "migration/migration.h" - -static void qed_aio_cancel(BlockDriverAIOCB *blockacb) -{ - QEDAIOCB *acb = (QEDAIOCB *)blockacb; - bool finished = false; - - /* Wait for the request to finish */ - acb->finished = &finished; - while (!finished) { - qemu_aio_wait(); - } -} - -static const AIOCBInfo qed_aiocb_info = { - .aiocb_size = sizeof(QEDAIOCB), - .cancel = qed_aio_cancel, -}; - -static int bdrv_qed_probe(const uint8_t *buf, int buf_size, - const char *filename) -{ - const QEDHeader *header = (const QEDHeader *)buf; - - if (buf_size < sizeof(*header)) { - return 0; - } - if (le32_to_cpu(header->magic) != QED_MAGIC) { - return 0; - } - return 100; -} - -/** - * Check whether an image format is raw - * - * @fmt: Backing file format, may be NULL - */ -static bool qed_fmt_is_raw(const char *fmt) -{ - return fmt && strcmp(fmt, "raw") == 0; -} - -static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu) -{ - cpu->magic = le32_to_cpu(le->magic); - cpu->cluster_size = le32_to_cpu(le->cluster_size); - cpu->table_size = le32_to_cpu(le->table_size); - cpu->header_size = le32_to_cpu(le->header_size); - cpu->features = le64_to_cpu(le->features); - cpu->compat_features = le64_to_cpu(le->compat_features); - cpu->autoclear_features = le64_to_cpu(le->autoclear_features); - cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset); - cpu->image_size = le64_to_cpu(le->image_size); - cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset); - cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size); -} - -static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le) -{ - le->magic = cpu_to_le32(cpu->magic); - le->cluster_size = cpu_to_le32(cpu->cluster_size); - le->table_size = cpu_to_le32(cpu->table_size); - le->header_size = cpu_to_le32(cpu->header_size); - le->features = cpu_to_le64(cpu->features); - le->compat_features = cpu_to_le64(cpu->compat_features); - le->autoclear_features = cpu_to_le64(cpu->autoclear_features); - le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset); - le->image_size = cpu_to_le64(cpu->image_size); - le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset); - le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size); -} - -int qed_write_header_sync(BDRVQEDState *s) -{ - QEDHeader le; - int ret; - - qed_header_cpu_to_le(&s->header, &le); - ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le)); - if (ret != sizeof(le)) { - return ret; - } - return 0; -} - -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - struct iovec iov; - QEMUIOVector qiov; - int nsectors; - uint8_t *buf; -} QEDWriteHeaderCB; - -static void qed_write_header_cb(void *opaque, int ret) -{ - QEDWriteHeaderCB *write_header_cb = opaque; - - qemu_vfree(write_header_cb->buf); - gencb_complete(write_header_cb, ret); -} - -static void qed_write_header_read_cb(void *opaque, int ret) -{ - QEDWriteHeaderCB *write_header_cb = opaque; - BDRVQEDState *s = write_header_cb->s; - - if (ret) { - qed_write_header_cb(write_header_cb, ret); - return; - } - - /* Update header */ - qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf); - - bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov, - write_header_cb->nsectors, qed_write_header_cb, - write_header_cb); -} - -/** - * Update header in-place (does not rewrite backing filename or other strings) - * - * This function only updates known header fields in-place and does not affect - * extra data after the QED header. - */ -static void qed_write_header(BDRVQEDState *s, BlockDriverCompletionFunc cb, - void *opaque) -{ - /* We must write full sectors for O_DIRECT but cannot necessarily generate - * the data following the header if an unrecognized compat feature is - * active. Therefore, first read the sectors containing the header, update - * them, and write back. - */ - - int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) / - BDRV_SECTOR_SIZE; - size_t len = nsectors * BDRV_SECTOR_SIZE; - QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb), - cb, opaque); - - write_header_cb->s = s; - write_header_cb->nsectors = nsectors; - write_header_cb->buf = qemu_blockalign(s->bs, len); - write_header_cb->iov.iov_base = write_header_cb->buf; - write_header_cb->iov.iov_len = len; - qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1); - - bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors, - qed_write_header_read_cb, write_header_cb); -} - -static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size) -{ - uint64_t table_entries; - uint64_t l2_size; - - table_entries = (table_size * cluster_size) / sizeof(uint64_t); - l2_size = table_entries * cluster_size; - - return l2_size * table_entries; -} - -static bool qed_is_cluster_size_valid(uint32_t cluster_size) -{ - if (cluster_size < QED_MIN_CLUSTER_SIZE || - cluster_size > QED_MAX_CLUSTER_SIZE) { - return false; - } - if (cluster_size & (cluster_size - 1)) { - return false; /* not power of 2 */ - } - return true; -} - -static bool qed_is_table_size_valid(uint32_t table_size) -{ - if (table_size < QED_MIN_TABLE_SIZE || - table_size > QED_MAX_TABLE_SIZE) { - return false; - } - if (table_size & (table_size - 1)) { - return false; /* not power of 2 */ - } - return true; -} - -static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size, - uint32_t table_size) -{ - if (image_size % BDRV_SECTOR_SIZE != 0) { - return false; /* not multiple of sector size */ - } - if (image_size > qed_max_image_size(cluster_size, table_size)) { - return false; /* image is too large */ - } - return true; -} - -/** - * Read a string of known length from the image file - * - * @file: Image file - * @offset: File offset to start of string, in bytes - * @n: String length in bytes - * @buf: Destination buffer - * @buflen: Destination buffer length in bytes - * @ret: 0 on success, -errno on failure - * - * The string is NUL-terminated. - */ -static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, - char *buf, size_t buflen) -{ - int ret; - if (n >= buflen) { - return -EINVAL; - } - ret = bdrv_pread(file, offset, buf, n); - if (ret < 0) { - return ret; - } - buf[n] = '\0'; - return 0; -} - -/** - * Allocate new clusters - * - * @s: QED state - * @n: Number of contiguous clusters to allocate - * @ret: Offset of first allocated cluster - * - * This function only produces the offset where the new clusters should be - * written. It updates BDRVQEDState but does not make any changes to the image - * file. - */ -static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n) -{ - uint64_t offset = s->file_size; - s->file_size += n * s->header.cluster_size; - return offset; -} - -QEDTable *qed_alloc_table(BDRVQEDState *s) -{ - /* Honor O_DIRECT memory alignment requirements */ - return qemu_blockalign(s->bs, - s->header.cluster_size * s->header.table_size); -} - -/** - * Allocate a new zeroed L2 table - */ -static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) -{ - CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); - - l2_table->table = qed_alloc_table(s); - l2_table->offset = qed_alloc_clusters(s, s->header.table_size); - - memset(l2_table->table->offsets, 0, - s->header.cluster_size * s->header.table_size); - return l2_table; -} - -static void qed_aio_next_io(void *opaque, int ret); - -static void qed_plug_allocating_write_reqs(BDRVQEDState *s) -{ - assert(!s->allocating_write_reqs_plugged); - - s->allocating_write_reqs_plugged = true; -} - -static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) -{ - QEDAIOCB *acb; - - assert(s->allocating_write_reqs_plugged); - - s->allocating_write_reqs_plugged = false; - - acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); - if (acb) { - qed_aio_next_io(acb, 0); - } -} - -static void qed_finish_clear_need_check(void *opaque, int ret) -{ - /* Do nothing */ -} - -static void qed_flush_after_clear_need_check(void *opaque, int ret) -{ - BDRVQEDState *s = opaque; - - bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s); - - /* No need to wait until flush completes */ - qed_unplug_allocating_write_reqs(s); -} - -static void qed_clear_need_check(void *opaque, int ret) -{ - BDRVQEDState *s = opaque; - - if (ret) { - qed_unplug_allocating_write_reqs(s); - return; - } - - s->header.features &= ~QED_F_NEED_CHECK; - qed_write_header(s, qed_flush_after_clear_need_check, s); -} - -static void qed_need_check_timer_cb(void *opaque) -{ - BDRVQEDState *s = opaque; - - /* The timer should only fire when allocating writes have drained */ - assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs)); - - trace_qed_need_check_timer_cb(s); - - qed_plug_allocating_write_reqs(s); - - /* Ensure writes are on disk before clearing flag */ - bdrv_aio_flush(s->bs, qed_clear_need_check, s); -} - -static void qed_start_need_check_timer(BDRVQEDState *s) -{ - trace_qed_start_need_check_timer(s); - - /* Use vm_clock so we don't alter the image file while suspended for - * migration. - */ - qemu_mod_timer(s->need_check_timer, qemu_get_clock_ns(vm_clock) + - get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT); -} - -/* It's okay to call this multiple times or when no timer is started */ -static void qed_cancel_need_check_timer(BDRVQEDState *s) -{ - trace_qed_cancel_need_check_timer(s); - qemu_del_timer(s->need_check_timer); -} - -static void bdrv_qed_rebind(BlockDriverState *bs) -{ - BDRVQEDState *s = bs->opaque; - s->bs = bs; -} - -static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) -{ - BDRVQEDState *s = bs->opaque; - QEDHeader le_header; - int64_t file_size; - int ret; - - s->bs = bs; - QSIMPLEQ_INIT(&s->allocating_write_reqs); - - ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); - if (ret < 0) { - return ret; - } - qed_header_le_to_cpu(&le_header, &s->header); - - if (s->header.magic != QED_MAGIC) { - return -EMEDIUMTYPE; - } - if (s->header.features & ~QED_FEATURE_MASK) { - /* image uses unsupported feature bits */ - char buf[64]; - snprintf(buf, sizeof(buf), "%" PRIx64, - s->header.features & ~QED_FEATURE_MASK); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bs->device_name, "QED", buf); - return -ENOTSUP; - } - if (!qed_is_cluster_size_valid(s->header.cluster_size)) { - return -EINVAL; - } - - /* Round down file size to the last cluster */ - file_size = bdrv_getlength(bs->file); - if (file_size < 0) { - return file_size; - } - s->file_size = qed_start_of_cluster(s, file_size); - - if (!qed_is_table_size_valid(s->header.table_size)) { - return -EINVAL; - } - if (!qed_is_image_size_valid(s->header.image_size, - s->header.cluster_size, - s->header.table_size)) { - return -EINVAL; - } - if (!qed_check_table_offset(s, s->header.l1_table_offset)) { - return -EINVAL; - } - - s->table_nelems = (s->header.cluster_size * s->header.table_size) / - sizeof(uint64_t); - s->l2_shift = ffs(s->header.cluster_size) - 1; - s->l2_mask = s->table_nelems - 1; - s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1; - - if ((s->header.features & QED_F_BACKING_FILE)) { - if ((uint64_t)s->header.backing_filename_offset + - s->header.backing_filename_size > - s->header.cluster_size * s->header.header_size) { - return -EINVAL; - } - - ret = qed_read_string(bs->file, s->header.backing_filename_offset, - s->header.backing_filename_size, bs->backing_file, - sizeof(bs->backing_file)); - if (ret < 0) { - return ret; - } - - if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) { - pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw"); - } - } - - /* Reset unknown autoclear feature bits. This is a backwards - * compatibility mechanism that allows images to be opened by older - * programs, which "knock out" unknown feature bits. When an image is - * opened by a newer program again it can detect that the autoclear - * feature is no longer valid. - */ - if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 && - !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) { - s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK; - - ret = qed_write_header_sync(s); - if (ret) { - return ret; - } - - /* From here on only known autoclear feature bits are valid */ - bdrv_flush(bs->file); - } - - s->l1_table = qed_alloc_table(s); - qed_init_l2_cache(&s->l2_cache); - - ret = qed_read_l1_table_sync(s); - if (ret) { - goto out; - } - - /* If image was not closed cleanly, check consistency */ - if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) { - /* Read-only images cannot be fixed. There is no risk of corruption - * since write operations are not possible. Therefore, allow - * potentially inconsistent images to be opened read-only. This can - * aid data recovery from an otherwise inconsistent image. - */ - if (!bdrv_is_read_only(bs->file) && - !(flags & BDRV_O_INCOMING)) { - BdrvCheckResult result = {0}; - - ret = qed_check(s, &result, true); - if (ret) { - goto out; - } - } - } - - s->need_check_timer = qemu_new_timer_ns(vm_clock, - qed_need_check_timer_cb, s); - -out: - if (ret) { - qed_free_l2_cache(&s->l2_cache); - qemu_vfree(s->l1_table); - } - return ret; -} - -/* We have nothing to do for QED reopen, stubs just return - * success */ -static int bdrv_qed_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - return 0; -} - -static void bdrv_qed_close(BlockDriverState *bs) -{ - BDRVQEDState *s = bs->opaque; - - qed_cancel_need_check_timer(s); - qemu_free_timer(s->need_check_timer); - - /* Ensure writes reach stable storage */ - bdrv_flush(bs->file); - - /* Clean shutdown, no check required on next open */ - if (s->header.features & QED_F_NEED_CHECK) { - s->header.features &= ~QED_F_NEED_CHECK; - qed_write_header_sync(s); - } - - qed_free_l2_cache(&s->l2_cache); - qemu_vfree(s->l1_table); -} - -static int qed_create(const char *filename, uint32_t cluster_size, - uint64_t image_size, uint32_t table_size, - const char *backing_file, const char *backing_fmt) -{ - QEDHeader header = { - .magic = QED_MAGIC, - .cluster_size = cluster_size, - .table_size = table_size, - .header_size = 1, - .features = 0, - .compat_features = 0, - .l1_table_offset = cluster_size, - .image_size = image_size, - }; - QEDHeader le_header; - uint8_t *l1_table = NULL; - size_t l1_size = header.cluster_size * header.table_size; - int ret = 0; - BlockDriverState *bs = NULL; - - ret = bdrv_create_file(filename, NULL); - if (ret < 0) { - return ret; - } - - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB); - if (ret < 0) { - return ret; - } - - /* File must start empty and grow, check truncate is supported */ - ret = bdrv_truncate(bs, 0); - if (ret < 0) { - goto out; - } - - if (backing_file) { - header.features |= QED_F_BACKING_FILE; - header.backing_filename_offset = sizeof(le_header); - header.backing_filename_size = strlen(backing_file); - - if (qed_fmt_is_raw(backing_fmt)) { - header.features |= QED_F_BACKING_FORMAT_NO_PROBE; - } - } - - qed_header_cpu_to_le(&header, &le_header); - ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header)); - if (ret < 0) { - goto out; - } - ret = bdrv_pwrite(bs, sizeof(le_header), backing_file, - header.backing_filename_size); - if (ret < 0) { - goto out; - } - - l1_table = g_malloc0(l1_size); - ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size); - if (ret < 0) { - goto out; - } - - ret = 0; /* success */ -out: - g_free(l1_table); - bdrv_delete(bs); - return ret; -} - -static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) -{ - uint64_t image_size = 0; - uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; - uint32_t table_size = QED_DEFAULT_TABLE_SIZE; - const char *backing_file = NULL; - const char *backing_fmt = NULL; - - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - image_size = options->value.n; - } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { - backing_file = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { - backing_fmt = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { - if (options->value.n) { - cluster_size = options->value.n; - } - } else if (!strcmp(options->name, BLOCK_OPT_TABLE_SIZE)) { - if (options->value.n) { - table_size = options->value.n; - } - } - options++; - } - - if (!qed_is_cluster_size_valid(cluster_size)) { - fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n", - QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE); - return -EINVAL; - } - if (!qed_is_table_size_valid(table_size)) { - fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n", - QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE); - return -EINVAL; - } - if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) { - fprintf(stderr, "QED image size must be a non-zero multiple of " - "cluster size and less than %" PRIu64 " bytes\n", - qed_max_image_size(cluster_size, table_size)); - return -EINVAL; - } - - return qed_create(filename, cluster_size, image_size, table_size, - backing_file, backing_fmt); -} - -typedef struct { - Coroutine *co; - int is_allocated; - int *pnum; -} QEDIsAllocatedCB; - -static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) -{ - QEDIsAllocatedCB *cb = opaque; - *cb->pnum = len / BDRV_SECTOR_SIZE; - cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO); - if (cb->co) { - qemu_coroutine_enter(cb->co, NULL); - } -} - -static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - BDRVQEDState *s = bs->opaque; - uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; - size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; - QEDIsAllocatedCB cb = { - .is_allocated = -1, - .pnum = pnum, - }; - QEDRequest request = { .l2_table = NULL }; - - qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb); - - /* Now sleep if the callback wasn't invoked immediately */ - while (cb.is_allocated == -1) { - cb.co = qemu_coroutine_self(); - qemu_coroutine_yield(); - } - - qed_unref_l2_cache_entry(request.l2_table); - - return cb.is_allocated; -} - -static int bdrv_qed_make_empty(BlockDriverState *bs) -{ - return -ENOTSUP; -} - -static BDRVQEDState *acb_to_s(QEDAIOCB *acb) -{ - return acb->common.bs->opaque; -} - -/** - * Read from the backing file or zero-fill if no backing file - * - * @s: QED state - * @pos: Byte position in device - * @qiov: Destination I/O vector - * @cb: Completion function - * @opaque: User data for completion function - * - * This function reads qiov->size bytes starting at pos from the backing file. - * If there is no backing file then zeroes are read. - */ -static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, - QEMUIOVector *qiov, - BlockDriverCompletionFunc *cb, void *opaque) -{ - uint64_t backing_length = 0; - size_t size; - - /* If there is a backing file, get its length. Treat the absence of a - * backing file like a zero length backing file. - */ - if (s->bs->backing_hd) { - int64_t l = bdrv_getlength(s->bs->backing_hd); - if (l < 0) { - cb(opaque, l); - return; - } - backing_length = l; - } - - /* Zero all sectors if reading beyond the end of the backing file */ - if (pos >= backing_length || - pos + qiov->size > backing_length) { - qemu_iovec_memset(qiov, 0, 0, qiov->size); - } - - /* Complete now if there are no backing file sectors to read */ - if (pos >= backing_length) { - cb(opaque, 0); - return; - } - - /* If the read straddles the end of the backing file, shorten it */ - size = MIN((uint64_t)backing_length - pos, qiov->size); - - BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); - bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE, - qiov, size / BDRV_SECTOR_SIZE, cb, opaque); -} - -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - QEMUIOVector qiov; - struct iovec iov; - uint64_t offset; -} CopyFromBackingFileCB; - -static void qed_copy_from_backing_file_cb(void *opaque, int ret) -{ - CopyFromBackingFileCB *copy_cb = opaque; - qemu_vfree(copy_cb->iov.iov_base); - gencb_complete(©_cb->gencb, ret); -} - -static void qed_copy_from_backing_file_write(void *opaque, int ret) -{ - CopyFromBackingFileCB *copy_cb = opaque; - BDRVQEDState *s = copy_cb->s; - - if (ret) { - qed_copy_from_backing_file_cb(copy_cb, ret); - return; - } - - BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); - bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE, - ©_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE, - qed_copy_from_backing_file_cb, copy_cb); -} - -/** - * Copy data from backing file into the image - * - * @s: QED state - * @pos: Byte position in device - * @len: Number of bytes - * @offset: Byte offset in image file - * @cb: Completion function - * @opaque: User data for completion function - */ -static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, - uint64_t len, uint64_t offset, - BlockDriverCompletionFunc *cb, - void *opaque) -{ - CopyFromBackingFileCB *copy_cb; - - /* Skip copy entirely if there is no work to do */ - if (len == 0) { - cb(opaque, 0); - return; - } - - copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque); - copy_cb->s = s; - copy_cb->offset = offset; - copy_cb->iov.iov_base = qemu_blockalign(s->bs, len); - copy_cb->iov.iov_len = len; - qemu_iovec_init_external(©_cb->qiov, ©_cb->iov, 1); - - qed_read_backing_file(s, pos, ©_cb->qiov, - qed_copy_from_backing_file_write, copy_cb); -} - -/** - * Link one or more contiguous clusters into a table - * - * @s: QED state - * @table: L2 table - * @index: First cluster index - * @n: Number of contiguous clusters - * @cluster: First cluster offset - * - * The cluster offset may be an allocated byte offset in the image file, the - * zero cluster marker, or the unallocated cluster marker. - */ -static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, - unsigned int n, uint64_t cluster) -{ - int i; - for (i = index; i < index + n; i++) { - table->offsets[i] = cluster; - if (!qed_offset_is_unalloc_cluster(cluster) && - !qed_offset_is_zero_cluster(cluster)) { - cluster += s->header.cluster_size; - } - } -} - -static void qed_aio_complete_bh(void *opaque) -{ - QEDAIOCB *acb = opaque; - BlockDriverCompletionFunc *cb = acb->common.cb; - void *user_opaque = acb->common.opaque; - int ret = acb->bh_ret; - bool *finished = acb->finished; - - qemu_bh_delete(acb->bh); - qemu_aio_release(acb); - - /* Invoke callback */ - cb(user_opaque, ret); - - /* Signal cancel completion */ - if (finished) { - *finished = true; - } -} - -static void qed_aio_complete(QEDAIOCB *acb, int ret) -{ - BDRVQEDState *s = acb_to_s(acb); - - trace_qed_aio_complete(s, acb, ret); - - /* Free resources */ - qemu_iovec_destroy(&acb->cur_qiov); - qed_unref_l2_cache_entry(acb->request.l2_table); - - /* Free the buffer we may have allocated for zero writes */ - if (acb->flags & QED_AIOCB_ZERO) { - qemu_vfree(acb->qiov->iov[0].iov_base); - acb->qiov->iov[0].iov_base = NULL; - } - - /* Arrange for a bh to invoke the completion function */ - acb->bh_ret = ret; - acb->bh = qemu_bh_new(qed_aio_complete_bh, acb); - qemu_bh_schedule(acb->bh); - - /* Start next allocating write request waiting behind this one. Note that - * requests enqueue themselves when they first hit an unallocated cluster - * but they wait until the entire request is finished before waking up the - * next request in the queue. This ensures that we don't cycle through - * requests multiple times but rather finish one at a time completely. - */ - if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { - QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next); - acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); - if (acb) { - qed_aio_next_io(acb, 0); - } else if (s->header.features & QED_F_NEED_CHECK) { - qed_start_need_check_timer(s); - } - } -} - -/** - * Commit the current L2 table to the cache - */ -static void qed_commit_l2_update(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - CachedL2Table *l2_table = acb->request.l2_table; - uint64_t l2_offset = l2_table->offset; - - qed_commit_l2_cache_entry(&s->l2_cache, l2_table); - - /* This is guaranteed to succeed because we just committed the entry to the - * cache. - */ - acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); - assert(acb->request.l2_table != NULL); - - qed_aio_next_io(opaque, ret); -} - -/** - * Update L1 table with new L2 table offset and write it out - */ -static void qed_aio_write_l1_update(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - int index; - - if (ret) { - qed_aio_complete(acb, ret); - return; - } - - index = qed_l1_index(s, acb->cur_pos); - s->l1_table->offsets[index] = acb->request.l2_table->offset; - - qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb); -} - -/** - * Update L2 table with new cluster offsets and write them out - */ -static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) -{ - BDRVQEDState *s = acb_to_s(acb); - bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1; - int index; - - if (ret) { - goto err; - } - - if (need_alloc) { - qed_unref_l2_cache_entry(acb->request.l2_table); - acb->request.l2_table = qed_new_l2_table(s); - } - - index = qed_l2_index(s, acb->cur_pos); - qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters, - offset); - - if (need_alloc) { - /* Write out the whole new L2 table */ - qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true, - qed_aio_write_l1_update, acb); - } else { - /* Write out only the updated part of the L2 table */ - qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false, - qed_aio_next_io, acb); - } - return; - -err: - qed_aio_complete(acb, ret); -} - -static void qed_aio_write_l2_update_cb(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - qed_aio_write_l2_update(acb, ret, acb->cur_cluster); -} - -/** - * Flush new data clusters before updating the L2 table - * - * This flush is necessary when a backing file is in use. A crash during an - * allocating write could result in empty clusters in the image. If the write - * only touched a subregion of the cluster, then backing image sectors have - * been lost in the untouched region. The solution is to flush after writing a - * new data cluster and before updating the L2 table. - */ -static void qed_aio_write_flush_before_l2_update(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - - if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) { - qed_aio_complete(acb, -EIO); - } -} - -/** - * Write data to the image file - */ -static void qed_aio_write_main(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - uint64_t offset = acb->cur_cluster + - qed_offset_into_cluster(s, acb->cur_pos); - BlockDriverCompletionFunc *next_fn; - - trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size); - - if (ret) { - qed_aio_complete(acb, ret); - return; - } - - if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { - next_fn = qed_aio_next_io; - } else { - if (s->bs->backing_hd) { - next_fn = qed_aio_write_flush_before_l2_update; - } else { - next_fn = qed_aio_write_l2_update_cb; - } - } - - BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); - bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, - &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, - next_fn, acb); -} - -/** - * Populate back untouched region of new data cluster - */ -static void qed_aio_write_postfill(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - uint64_t start = acb->cur_pos + acb->cur_qiov.size; - uint64_t len = - qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start; - uint64_t offset = acb->cur_cluster + - qed_offset_into_cluster(s, acb->cur_pos) + - acb->cur_qiov.size; - - if (ret) { - qed_aio_complete(acb, ret); - return; - } - - trace_qed_aio_write_postfill(s, acb, start, len, offset); - qed_copy_from_backing_file(s, start, len, offset, - qed_aio_write_main, acb); -} - -/** - * Populate front untouched region of new data cluster - */ -static void qed_aio_write_prefill(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - uint64_t start = qed_start_of_cluster(s, acb->cur_pos); - uint64_t len = qed_offset_into_cluster(s, acb->cur_pos); - - trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster); - qed_copy_from_backing_file(s, start, len, acb->cur_cluster, - qed_aio_write_postfill, acb); -} - -/** - * Check if the QED_F_NEED_CHECK bit should be set during allocating write - */ -static bool qed_should_set_need_check(BDRVQEDState *s) -{ - /* The flush before L2 update path ensures consistency */ - if (s->bs->backing_hd) { - return false; - } - - return !(s->header.features & QED_F_NEED_CHECK); -} - -static void qed_aio_write_zero_cluster(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - - if (ret) { - qed_aio_complete(acb, ret); - return; - } - - qed_aio_write_l2_update(acb, 0, 1); -} - -/** - * Write new data cluster - * - * @acb: Write request - * @len: Length in bytes - * - * This path is taken when writing to previously unallocated clusters. - */ -static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) -{ - BDRVQEDState *s = acb_to_s(acb); - BlockDriverCompletionFunc *cb; - - /* Cancel timer when the first allocating request comes in */ - if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) { - qed_cancel_need_check_timer(s); - } - - /* Freeze this request if another allocating write is in progress */ - if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { - QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next); - } - if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) || - s->allocating_write_reqs_plugged) { - return; /* wait for existing request to finish */ - } - - acb->cur_nclusters = qed_bytes_to_clusters(s, - qed_offset_into_cluster(s, acb->cur_pos) + len); - qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); - - if (acb->flags & QED_AIOCB_ZERO) { - /* Skip ahead if the clusters are already zero */ - if (acb->find_cluster_ret == QED_CLUSTER_ZERO) { - qed_aio_next_io(acb, 0); - return; - } - - cb = qed_aio_write_zero_cluster; - } else { - cb = qed_aio_write_prefill; - acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); - } - - if (qed_should_set_need_check(s)) { - s->header.features |= QED_F_NEED_CHECK; - qed_write_header(s, cb, acb); - } else { - cb(acb, 0); - } -} - -/** - * Write data cluster in place - * - * @acb: Write request - * @offset: Cluster offset in bytes - * @len: Length in bytes - * - * This path is taken when writing to already allocated clusters. - */ -static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) -{ - /* Allocate buffer for zero writes */ - if (acb->flags & QED_AIOCB_ZERO) { - struct iovec *iov = acb->qiov->iov; - - if (!iov->iov_base) { - iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len); - memset(iov->iov_base, 0, iov->iov_len); - } - } - - /* Calculate the I/O vector */ - acb->cur_cluster = offset; - qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); - - /* Do the actual write */ - qed_aio_write_main(acb, 0); -} - -/** - * Write data cluster - * - * @opaque: Write request - * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, - * or -errno - * @offset: Cluster offset in bytes - * @len: Length in bytes - * - * Callback from qed_find_cluster(). - */ -static void qed_aio_write_data(void *opaque, int ret, - uint64_t offset, size_t len) -{ - QEDAIOCB *acb = opaque; - - trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len); - - acb->find_cluster_ret = ret; - - switch (ret) { - case QED_CLUSTER_FOUND: - qed_aio_write_inplace(acb, offset, len); - break; - - case QED_CLUSTER_L2: - case QED_CLUSTER_L1: - case QED_CLUSTER_ZERO: - qed_aio_write_alloc(acb, len); - break; - - default: - qed_aio_complete(acb, ret); - break; - } -} - -/** - * Read data cluster - * - * @opaque: Read request - * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, - * or -errno - * @offset: Cluster offset in bytes - * @len: Length in bytes - * - * Callback from qed_find_cluster(). - */ -static void qed_aio_read_data(void *opaque, int ret, - uint64_t offset, size_t len) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - BlockDriverState *bs = acb->common.bs; - - /* Adjust offset into cluster */ - offset += qed_offset_into_cluster(s, acb->cur_pos); - - trace_qed_aio_read_data(s, acb, ret, offset, len); - - if (ret < 0) { - goto err; - } - - qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); - - /* Handle zero cluster and backing file reads */ - if (ret == QED_CLUSTER_ZERO) { - qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size); - qed_aio_next_io(acb, 0); - return; - } else if (ret != QED_CLUSTER_FOUND) { - qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, - qed_aio_next_io, acb); - return; - } - - BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE, - &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, - qed_aio_next_io, acb); - return; - -err: - qed_aio_complete(acb, ret); -} - -/** - * Begin next I/O or complete the request - */ -static void qed_aio_next_io(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ? - qed_aio_write_data : qed_aio_read_data; - - trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size); - - /* Handle I/O error */ - if (ret) { - qed_aio_complete(acb, ret); - return; - } - - acb->qiov_offset += acb->cur_qiov.size; - acb->cur_pos += acb->cur_qiov.size; - qemu_iovec_reset(&acb->cur_qiov); - - /* Complete request */ - if (acb->cur_pos >= acb->end_pos) { - qed_aio_complete(acb, 0); - return; - } - - /* Find next cluster and start I/O */ - qed_find_cluster(s, &acb->request, - acb->cur_pos, acb->end_pos - acb->cur_pos, - io_fn, acb); -} - -static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque, int flags) -{ - QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque); - - trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, - opaque, flags); - - acb->flags = flags; - acb->finished = NULL; - acb->qiov = qiov; - acb->qiov_offset = 0; - acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; - acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE; - acb->request.l2_table = NULL; - qemu_iovec_init(&acb->cur_qiov, qiov->niov); - - /* Start request */ - qed_aio_next_io(acb, 0); - return &acb->common; -} - -static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) -{ - return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); -} - -static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) -{ - return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, - opaque, QED_AIOCB_WRITE); -} - -typedef struct { - Coroutine *co; - int ret; - bool done; -} QEDWriteZeroesCB; - -static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) -{ - QEDWriteZeroesCB *cb = opaque; - - cb->done = true; - cb->ret = ret; - if (cb->co) { - qemu_coroutine_enter(cb->co, NULL); - } -} - -static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors) -{ - BlockDriverAIOCB *blockacb; - BDRVQEDState *s = bs->opaque; - QEDWriteZeroesCB cb = { .done = false }; - QEMUIOVector qiov; - struct iovec iov; - - /* Refuse if there are untouched backing file sectors */ - if (bs->backing_hd) { - if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) { - return -ENOTSUP; - } - if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) { - return -ENOTSUP; - } - } - - /* Zero writes start without an I/O buffer. If a buffer becomes necessary - * then it will be allocated during request processing. - */ - iov.iov_base = NULL, - iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE, - - qemu_iovec_init_external(&qiov, &iov, 1); - blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors, - qed_co_write_zeroes_cb, &cb, - QED_AIOCB_WRITE | QED_AIOCB_ZERO); - if (!blockacb) { - return -EIO; - } - if (!cb.done) { - cb.co = qemu_coroutine_self(); - qemu_coroutine_yield(); - } - assert(cb.done); - return cb.ret; -} - -static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset) -{ - BDRVQEDState *s = bs->opaque; - uint64_t old_image_size; - int ret; - - if (!qed_is_image_size_valid(offset, s->header.cluster_size, - s->header.table_size)) { - return -EINVAL; - } - - /* Shrinking is currently not supported */ - if ((uint64_t)offset < s->header.image_size) { - return -ENOTSUP; - } - - old_image_size = s->header.image_size; - s->header.image_size = offset; - ret = qed_write_header_sync(s); - if (ret < 0) { - s->header.image_size = old_image_size; - } - return ret; -} - -static int64_t bdrv_qed_getlength(BlockDriverState *bs) -{ - BDRVQEDState *s = bs->opaque; - return s->header.image_size; -} - -static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - BDRVQEDState *s = bs->opaque; - - memset(bdi, 0, sizeof(*bdi)); - bdi->cluster_size = s->header.cluster_size; - bdi->is_dirty = s->header.features & QED_F_NEED_CHECK; - return 0; -} - -static int bdrv_qed_change_backing_file(BlockDriverState *bs, - const char *backing_file, - const char *backing_fmt) -{ - BDRVQEDState *s = bs->opaque; - QEDHeader new_header, le_header; - void *buffer; - size_t buffer_len, backing_file_len; - int ret; - - /* Refuse to set backing filename if unknown compat feature bits are - * active. If the image uses an unknown compat feature then we may not - * know the layout of data following the header structure and cannot safely - * add a new string. - */ - if (backing_file && (s->header.compat_features & - ~QED_COMPAT_FEATURE_MASK)) { - return -ENOTSUP; - } - - memcpy(&new_header, &s->header, sizeof(new_header)); - - new_header.features &= ~(QED_F_BACKING_FILE | - QED_F_BACKING_FORMAT_NO_PROBE); - - /* Adjust feature flags */ - if (backing_file) { - new_header.features |= QED_F_BACKING_FILE; - - if (qed_fmt_is_raw(backing_fmt)) { - new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE; - } - } - - /* Calculate new header size */ - backing_file_len = 0; - - if (backing_file) { - backing_file_len = strlen(backing_file); - } - - buffer_len = sizeof(new_header); - new_header.backing_filename_offset = buffer_len; - new_header.backing_filename_size = backing_file_len; - buffer_len += backing_file_len; - - /* Make sure we can rewrite header without failing */ - if (buffer_len > new_header.header_size * new_header.cluster_size) { - return -ENOSPC; - } - - /* Prepare new header */ - buffer = g_malloc(buffer_len); - - qed_header_cpu_to_le(&new_header, &le_header); - memcpy(buffer, &le_header, sizeof(le_header)); - buffer_len = sizeof(le_header); - - if (backing_file) { - memcpy(buffer + buffer_len, backing_file, backing_file_len); - buffer_len += backing_file_len; - } - - /* Write new header */ - ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len); - g_free(buffer); - if (ret == 0) { - memcpy(&s->header, &new_header, sizeof(new_header)); - } - return ret; -} - -static void bdrv_qed_invalidate_cache(BlockDriverState *bs) -{ - BDRVQEDState *s = bs->opaque; - - bdrv_qed_close(bs); - memset(s, 0, sizeof(BDRVQEDState)); - bdrv_qed_open(bs, NULL, bs->open_flags); -} - -static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, - BdrvCheckMode fix) -{ - BDRVQEDState *s = bs->opaque; - - return qed_check(s, result, !!fix); -} - -static QEMUOptionParameter qed_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size (in bytes)" - }, { - .name = BLOCK_OPT_BACKING_FILE, - .type = OPT_STRING, - .help = "File name of a base image" - }, { - .name = BLOCK_OPT_BACKING_FMT, - .type = OPT_STRING, - .help = "Image format of the base image" - }, { - .name = BLOCK_OPT_CLUSTER_SIZE, - .type = OPT_SIZE, - .help = "Cluster size (in bytes)", - .value = { .n = QED_DEFAULT_CLUSTER_SIZE }, - }, { - .name = BLOCK_OPT_TABLE_SIZE, - .type = OPT_SIZE, - .help = "L1/L2 table size (in clusters)" - }, - { /* end of list */ } -}; - -static BlockDriver bdrv_qed = { - .format_name = "qed", - .instance_size = sizeof(BDRVQEDState), - .create_options = qed_create_options, - - .bdrv_probe = bdrv_qed_probe, - .bdrv_rebind = bdrv_qed_rebind, - .bdrv_open = bdrv_qed_open, - .bdrv_close = bdrv_qed_close, - .bdrv_reopen_prepare = bdrv_qed_reopen_prepare, - .bdrv_create = bdrv_qed_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = bdrv_qed_co_is_allocated, - .bdrv_make_empty = bdrv_qed_make_empty, - .bdrv_aio_readv = bdrv_qed_aio_readv, - .bdrv_aio_writev = bdrv_qed_aio_writev, - .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, - .bdrv_truncate = bdrv_qed_truncate, - .bdrv_getlength = bdrv_qed_getlength, - .bdrv_get_info = bdrv_qed_get_info, - .bdrv_change_backing_file = bdrv_qed_change_backing_file, - .bdrv_invalidate_cache = bdrv_qed_invalidate_cache, - .bdrv_check = bdrv_qed_check, -}; - -static void bdrv_qed_init(void) -{ - bdrv_register(&bdrv_qed); -} - -block_init(bdrv_qed_init); diff --git a/contrib/qemu/block/qed.h b/contrib/qemu/block/qed.h deleted file mode 100644 index 2b4ddedf313..00000000000 --- a/contrib/qemu/block/qed.h +++ /dev/null @@ -1,344 +0,0 @@ -/* - * QEMU Enhanced Disk Format - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> - * Anthony Liguori <aliguori@us.ibm.com> - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#ifndef BLOCK_QED_H -#define BLOCK_QED_H - -#include "block/block_int.h" - -/* The layout of a QED file is as follows: - * - * +--------+----------+----------+----------+-----+ - * | header | L1 table | cluster0 | cluster1 | ... | - * +--------+----------+----------+----------+-----+ - * - * There is a 2-level pagetable for cluster allocation: - * - * +----------+ - * | L1 table | - * +----------+ - * ,------' | '------. - * +----------+ | +----------+ - * | L2 table | ... | L2 table | - * +----------+ +----------+ - * ,------' | '------. - * +----------+ | +----------+ - * | Data | ... | Data | - * +----------+ +----------+ - * - * The L1 table is fixed size and always present. L2 tables are allocated on - * demand. The L1 table size determines the maximum possible image size; it - * can be influenced using the cluster_size and table_size values. - * - * All fields are little-endian on disk. - */ - -enum { - QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24, - - /* The image supports a backing file */ - QED_F_BACKING_FILE = 0x01, - - /* The image needs a consistency check before use */ - QED_F_NEED_CHECK = 0x02, - - /* The backing file format must not be probed, treat as raw image */ - QED_F_BACKING_FORMAT_NO_PROBE = 0x04, - - /* Feature bits must be used when the on-disk format changes */ - QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */ - QED_F_NEED_CHECK | - QED_F_BACKING_FORMAT_NO_PROBE, - QED_COMPAT_FEATURE_MASK = 0, /* supported compat feature bits */ - QED_AUTOCLEAR_FEATURE_MASK = 0, /* supported autoclear feature bits */ - - /* Data is stored in groups of sectors called clusters. Cluster size must - * be large to avoid keeping too much metadata. I/O requests that have - * sub-cluster size will require read-modify-write. - */ - QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */ - QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024, - QED_DEFAULT_CLUSTER_SIZE = 64 * 1024, - - /* Allocated clusters are tracked using a 2-level pagetable. Table size is - * a multiple of clusters so large maximum image sizes can be supported - * without jacking up the cluster size too much. - */ - QED_MIN_TABLE_SIZE = 1, /* in clusters */ - QED_MAX_TABLE_SIZE = 16, - QED_DEFAULT_TABLE_SIZE = 4, - - /* Delay to flush and clean image after last allocating write completes */ - QED_NEED_CHECK_TIMEOUT = 5, /* in seconds */ -}; - -typedef struct { - uint32_t magic; /* QED\0 */ - - uint32_t cluster_size; /* in bytes */ - uint32_t table_size; /* for L1 and L2 tables, in clusters */ - uint32_t header_size; /* in clusters */ - - uint64_t features; /* format feature bits */ - uint64_t compat_features; /* compatible feature bits */ - uint64_t autoclear_features; /* self-resetting feature bits */ - - uint64_t l1_table_offset; /* in bytes */ - uint64_t image_size; /* total logical image size, in bytes */ - - /* if (features & QED_F_BACKING_FILE) */ - uint32_t backing_filename_offset; /* in bytes from start of header */ - uint32_t backing_filename_size; /* in bytes */ -} QEDHeader; - -typedef struct { - uint64_t offsets[0]; /* in bytes */ -} QEDTable; - -/* The L2 cache is a simple write-through cache for L2 structures */ -typedef struct CachedL2Table { - QEDTable *table; - uint64_t offset; /* offset=0 indicates an invalidate entry */ - QTAILQ_ENTRY(CachedL2Table) node; - int ref; -} CachedL2Table; - -typedef struct { - QTAILQ_HEAD(, CachedL2Table) entries; - unsigned int n_entries; -} L2TableCache; - -typedef struct QEDRequest { - CachedL2Table *l2_table; -} QEDRequest; - -enum { - QED_AIOCB_WRITE = 0x0001, /* read or write? */ - QED_AIOCB_ZERO = 0x0002, /* zero write, used with QED_AIOCB_WRITE */ -}; - -typedef struct QEDAIOCB { - BlockDriverAIOCB common; - QEMUBH *bh; - int bh_ret; /* final return status for completion bh */ - QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */ - int flags; /* QED_AIOCB_* bits ORed together */ - bool *finished; /* signal for cancel completion */ - uint64_t end_pos; /* request end on block device, in bytes */ - - /* User scatter-gather list */ - QEMUIOVector *qiov; - size_t qiov_offset; /* byte count already processed */ - - /* Current cluster scatter-gather list */ - QEMUIOVector cur_qiov; - uint64_t cur_pos; /* position on block device, in bytes */ - uint64_t cur_cluster; /* cluster offset in image file */ - unsigned int cur_nclusters; /* number of clusters being accessed */ - int find_cluster_ret; /* used for L1/L2 update */ - - QEDRequest request; -} QEDAIOCB; - -typedef struct { - BlockDriverState *bs; /* device */ - uint64_t file_size; /* length of image file, in bytes */ - - QEDHeader header; /* always cpu-endian */ - QEDTable *l1_table; - L2TableCache l2_cache; /* l2 table cache */ - uint32_t table_nelems; - uint32_t l1_shift; - uint32_t l2_shift; - uint32_t l2_mask; - - /* Allocating write request queue */ - QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs; - bool allocating_write_reqs_plugged; - - /* Periodic flush and clear need check flag */ - QEMUTimer *need_check_timer; -} BDRVQEDState; - -enum { - QED_CLUSTER_FOUND, /* cluster found */ - QED_CLUSTER_ZERO, /* zero cluster found */ - QED_CLUSTER_L2, /* cluster missing in L2 */ - QED_CLUSTER_L1, /* cluster missing in L1 */ -}; - -/** - * qed_find_cluster() completion callback - * - * @opaque: User data for completion callback - * @ret: QED_CLUSTER_FOUND Success - * QED_CLUSTER_L2 Data cluster unallocated in L2 - * QED_CLUSTER_L1 L2 unallocated in L1 - * -errno POSIX error occurred - * @offset: Data cluster offset - * @len: Contiguous bytes starting from cluster offset - * - * This function is invoked when qed_find_cluster() completes. - * - * On success ret is QED_CLUSTER_FOUND and offset/len are a contiguous range - * in the image file. - * - * On failure ret is QED_CLUSTER_L2 or QED_CLUSTER_L1 for missing L2 or L1 - * table offset, respectively. len is number of contiguous unallocated bytes. - */ -typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len); - -/** - * Generic callback for chaining async callbacks - */ -typedef struct { - BlockDriverCompletionFunc *cb; - void *opaque; -} GenericCB; - -void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque); -void gencb_complete(void *opaque, int ret); - -/** - * Header functions - */ -int qed_write_header_sync(BDRVQEDState *s); - -/** - * L2 cache functions - */ -void qed_init_l2_cache(L2TableCache *l2_cache); -void qed_free_l2_cache(L2TableCache *l2_cache); -CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache); -void qed_unref_l2_cache_entry(CachedL2Table *entry); -CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset); -void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table); - -/** - * Table I/O functions - */ -int qed_read_l1_table_sync(BDRVQEDState *s); -void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, - BlockDriverCompletionFunc *cb, void *opaque); -int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, - unsigned int n); -int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, - uint64_t offset); -void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, - BlockDriverCompletionFunc *cb, void *opaque); -void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, - unsigned int index, unsigned int n, bool flush, - BlockDriverCompletionFunc *cb, void *opaque); -int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, - unsigned int index, unsigned int n, bool flush); - -/** - * Cluster functions - */ -void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, - size_t len, QEDFindClusterFunc *cb, void *opaque); - -/** - * Consistency check - */ -int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix); - -QEDTable *qed_alloc_table(BDRVQEDState *s); - -/** - * Round down to the start of a cluster - */ -static inline uint64_t qed_start_of_cluster(BDRVQEDState *s, uint64_t offset) -{ - return offset & ~(uint64_t)(s->header.cluster_size - 1); -} - -static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset) -{ - return offset & (s->header.cluster_size - 1); -} - -static inline uint64_t qed_bytes_to_clusters(BDRVQEDState *s, uint64_t bytes) -{ - return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) / - (s->header.cluster_size - 1); -} - -static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos) -{ - return pos >> s->l1_shift; -} - -static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos) -{ - return (pos >> s->l2_shift) & s->l2_mask; -} - -/** - * Test if a cluster offset is valid - */ -static inline bool qed_check_cluster_offset(BDRVQEDState *s, uint64_t offset) -{ - uint64_t header_size = (uint64_t)s->header.header_size * - s->header.cluster_size; - - if (offset & (s->header.cluster_size - 1)) { - return false; - } - return offset >= header_size && offset < s->file_size; -} - -/** - * Test if a table offset is valid - */ -static inline bool qed_check_table_offset(BDRVQEDState *s, uint64_t offset) -{ - uint64_t end_offset = offset + (s->header.table_size - 1) * - s->header.cluster_size; - - /* Overflow check */ - if (end_offset <= offset) { - return false; - } - - return qed_check_cluster_offset(s, offset) && - qed_check_cluster_offset(s, end_offset); -} - -static inline bool qed_offset_is_cluster_aligned(BDRVQEDState *s, - uint64_t offset) -{ - if (qed_offset_into_cluster(s, offset)) { - return false; - } - return true; -} - -static inline bool qed_offset_is_unalloc_cluster(uint64_t offset) -{ - if (offset == 0) { - return true; - } - return false; -} - -static inline bool qed_offset_is_zero_cluster(uint64_t offset) -{ - if (offset == 1) { - return true; - } - return false; -} - -#endif /* BLOCK_QED_H */ diff --git a/contrib/qemu/block/snapshot.c b/contrib/qemu/block/snapshot.c deleted file mode 100644 index 6c6d9deea1f..00000000000 --- a/contrib/qemu/block/snapshot.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Block layer snapshot related functions - * - * Copyright (c) 2003-2008 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "block/snapshot.h" -#include "block/block_int.h" - -int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, - const char *name) -{ - QEMUSnapshotInfo *sn_tab, *sn; - int nb_sns, i, ret; - - ret = -ENOENT; - nb_sns = bdrv_snapshot_list(bs, &sn_tab); - if (nb_sns < 0) { - return ret; - } - for (i = 0; i < nb_sns; i++) { - sn = &sn_tab[i]; - if (!strcmp(sn->id_str, name) || !strcmp(sn->name, name)) { - *sn_info = *sn; - ret = 0; - break; - } - } - g_free(sn_tab); - return ret; -} - -int bdrv_can_snapshot(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { - return 0; - } - - if (!drv->bdrv_snapshot_create) { - if (bs->file != NULL) { - return bdrv_can_snapshot(bs->file); - } - return 0; - } - - return 1; -} - -int bdrv_snapshot_create(BlockDriverState *bs, - QEMUSnapshotInfo *sn_info) -{ - BlockDriver *drv = bs->drv; - if (!drv) { - return -ENOMEDIUM; - } - if (drv->bdrv_snapshot_create) { - return drv->bdrv_snapshot_create(bs, sn_info); - } - if (bs->file) { - return bdrv_snapshot_create(bs->file, sn_info); - } - return -ENOTSUP; -} - -int bdrv_snapshot_goto(BlockDriverState *bs, - const char *snapshot_id) -{ - BlockDriver *drv = bs->drv; - int ret, open_ret; - - if (!drv) { - return -ENOMEDIUM; - } - if (drv->bdrv_snapshot_goto) { - return drv->bdrv_snapshot_goto(bs, snapshot_id); - } - - if (bs->file) { - drv->bdrv_close(bs); - ret = bdrv_snapshot_goto(bs->file, snapshot_id); - open_ret = drv->bdrv_open(bs, NULL, bs->open_flags); - if (open_ret < 0) { - bdrv_delete(bs->file); - bs->drv = NULL; - return open_ret; - } - return ret; - } - - return -ENOTSUP; -} - -int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) -{ - BlockDriver *drv = bs->drv; - if (!drv) { - return -ENOMEDIUM; - } - if (drv->bdrv_snapshot_delete) { - return drv->bdrv_snapshot_delete(bs, snapshot_id); - } - if (bs->file) { - return bdrv_snapshot_delete(bs->file, snapshot_id); - } - return -ENOTSUP; -} - -int bdrv_snapshot_list(BlockDriverState *bs, - QEMUSnapshotInfo **psn_info) -{ - BlockDriver *drv = bs->drv; - if (!drv) { - return -ENOMEDIUM; - } - if (drv->bdrv_snapshot_list) { - return drv->bdrv_snapshot_list(bs, psn_info); - } - if (bs->file) { - return bdrv_snapshot_list(bs->file, psn_info); - } - return -ENOTSUP; -} - -int bdrv_snapshot_load_tmp(BlockDriverState *bs, - const char *snapshot_name) -{ - BlockDriver *drv = bs->drv; - if (!drv) { - return -ENOMEDIUM; - } - if (!bs->read_only) { - return -EINVAL; - } - if (drv->bdrv_snapshot_load_tmp) { - return drv->bdrv_snapshot_load_tmp(bs, snapshot_name); - } - return -ENOTSUP; -} |