#include "kvm/qcow.h" #include "kvm/disk-image.h" #include "kvm/read-write.h" #include "kvm/mutex.h" #include "kvm/util.h" #include #include #include #include #include #include #include #include #ifdef CONFIG_HAS_ZLIB #include #endif #include #include #include #include static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append); static int qcow_write_refcount_table(struct qcow *q); static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref); static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size); static inline int qcow_pwrite_sync(int fd, void *buf, size_t count, off_t offset) { if (pwrite_in_full(fd, buf, count, offset) < 0) return -1; return fdatasync(fd); } static int l2_table_insert(struct rb_root *root, struct qcow_l2_table *new) { struct rb_node **link = &(root->rb_node), *parent = NULL; u64 offset = new->offset; /* search the tree */ while (*link) { struct qcow_l2_table *t; t = rb_entry(*link, struct qcow_l2_table, node); if (!t) goto error; parent = *link; if (t->offset > offset) link = &(*link)->rb_left; else if (t->offset < offset) link = &(*link)->rb_right; else goto out; } /* add new node */ rb_link_node(&new->node, parent, link); rb_insert_color(&new->node, root); out: return 0; error: return -1; } static struct qcow_l2_table *l2_table_lookup(struct rb_root *root, u64 offset) { struct rb_node *link = root->rb_node; while (link) { struct qcow_l2_table *t; t = rb_entry(link, struct qcow_l2_table, node); if (!t) goto out; if (t->offset > offset) link = link->rb_left; else if (t->offset < offset) link = link->rb_right; else return t; } out: return NULL; } static void l1_table_free_cache(struct qcow_l1_table *l1t) { struct rb_root *r = &l1t->root; struct list_head *pos, *n; struct qcow_l2_table *t; list_for_each_safe(pos, n, &l1t->lru_list) { /* Remove cache table from the list and RB tree */ list_del(pos); t = list_entry(pos, struct qcow_l2_table, list); rb_erase(&t->node, r); /* Free the cached node */ free(t); } } static int qcow_l2_cache_write(struct qcow *q, struct qcow_l2_table *c) { struct qcow_header *header = q->header; u64 size; if (!c->dirty) return 0; size = 1 << header->l2_bits; if (qcow_pwrite_sync(q->fd, c->table, size * sizeof(u64), c->offset) < 0) return -1; c->dirty = 0; return 0; } static int cache_table(struct qcow *q, struct qcow_l2_table *c) { struct qcow_l1_table *l1t = &q->table; struct rb_root *r = &l1t->root; struct qcow_l2_table *lru; if (l1t->nr_cached == MAX_CACHE_NODES) { /* * The node at the head of the list is least recently used * node. Remove it from the list and replaced with a new node. 
*/ lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list); /* Remove the node from the cache */ rb_erase(&lru->node, r); list_del_init(&lru->list); l1t->nr_cached--; /* Free the LRUed node */ free(lru); } /* Add new node in RB Tree: Helps in searching faster */ if (l2_table_insert(r, c) < 0) goto error; /* Add in LRU replacement list */ list_add_tail(&c->list, &l1t->lru_list); l1t->nr_cached++; return 0; error: return -1; } static struct qcow_l2_table *l2_table_search(struct qcow *q, u64 offset) { struct qcow_l1_table *l1t = &q->table; struct qcow_l2_table *l2t; l2t = l2_table_lookup(&l1t->root, offset); if (!l2t) return NULL; /* Update the LRU state, by moving the searched node to list tail */ list_move_tail(&l2t->list, &l1t->lru_list); return l2t; } /* Allocates a new node for caching L2 table */ static struct qcow_l2_table *new_cache_table(struct qcow *q, u64 offset) { struct qcow_header *header = q->header; struct qcow_l2_table *c; u64 l2t_sz; u64 size; l2t_sz = 1 << header->l2_bits; size = sizeof(*c) + l2t_sz * sizeof(u64); c = calloc(1, size); if (!c) goto out; c->offset = offset; RB_CLEAR_NODE(&c->node); INIT_LIST_HEAD(&c->list); out: return c; } static inline u64 get_l1_index(struct qcow *q, u64 offset) { struct qcow_header *header = q->header; return offset >> (header->l2_bits + header->cluster_bits); } static inline u64 get_l2_index(struct qcow *q, u64 offset) { struct qcow_header *header = q->header; return (offset >> (header->cluster_bits)) & ((1 << header->l2_bits)-1); } static inline u64 get_cluster_offset(struct qcow *q, u64 offset) { struct qcow_header *header = q->header; return offset & ((1 << header->cluster_bits)-1); } static struct qcow_l2_table *qcow_read_l2_table(struct qcow *q, u64 offset) { struct qcow_header *header = q->header; struct qcow_l2_table *l2t; u64 size; size = 1 << header->l2_bits; /* search an entry for offset in cache */ l2t = l2_table_search(q, offset); if (l2t) return l2t; /* allocate new node for caching l2 table */ l2t = new_cache_table(q, offset); if (!l2t) goto error; /* table not cached: read from the disk */ if (pread_in_full(q->fd, l2t->table, size * sizeof(u64), offset) < 0) goto error; /* cache the table */ if (cache_table(q, l2t) < 0) goto error; return l2t; error: free(l2t); return NULL; } static int qcow_decompress_buffer(u8 *out_buf, int out_buf_size, const u8 *buf, int buf_size) { #ifdef CONFIG_HAS_ZLIB z_stream strm1, *strm = &strm1; int ret, out_len; memset(strm, 0, sizeof(*strm)); strm->next_in = (u8 *)buf; strm->avail_in = buf_size; strm->next_out = out_buf; strm->avail_out = out_buf_size; ret = inflateInit2(strm, -12); if (ret != Z_OK) return -1; ret = inflate(strm, Z_FINISH); out_len = strm->next_out - out_buf; if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || out_len != out_buf_size) { inflateEnd(strm); return -1; } inflateEnd(strm); return 0; #else return -1; #endif } static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_len) { struct qcow_header *header = q->header; struct qcow_l1_table *l1t = &q->table; struct qcow_l2_table *l2t; u64 clust_offset; u64 clust_start; u64 l2t_offset; size_t length; u64 l2t_size; u64 l1_idx; u64 l2_idx; int coffset; int csize; l1_idx = get_l1_index(q, offset); if (l1_idx >= l1t->table_size) return -1; clust_offset = get_cluster_offset(q, offset); if (clust_offset >= q->cluster_size) return -1; length = q->cluster_size - clust_offset; if (length > dst_len) length = dst_len; mutex_lock(&q->mutex); l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]); if 
(!l2t_offset) goto zero_cluster; l2t_size = 1 << header->l2_bits; /* read and cache level 2 table */ l2t = qcow_read_l2_table(q, l2t_offset); if (!l2t) goto out_error; l2_idx = get_l2_index(q, offset); if (l2_idx >= l2t_size) goto out_error; clust_start = be64_to_cpu(l2t->table[l2_idx]); if (clust_start & QCOW1_OFLAG_COMPRESSED) { coffset = clust_start & q->cluster_offset_mask; csize = clust_start >> (63 - q->header->cluster_bits); csize &= (q->cluster_size - 1); if (pread_in_full(q->fd, q->cluster_data, csize, coffset) < 0) goto out_error; if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size, q->cluster_data, csize) < 0) goto out_error; memcpy(dst, q->cluster_cache + clust_offset, length); mutex_unlock(&q->mutex); } else { if (!clust_start) goto zero_cluster; mutex_unlock(&q->mutex); if (pread_in_full(q->fd, dst, length, clust_start + clust_offset) < 0) return -1; } return length; zero_cluster: mutex_unlock(&q->mutex); memset(dst, 0, length); return length; out_error: mutex_unlock(&q->mutex); length = -1; return -1; } static ssize_t qcow2_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_len) { struct qcow_header *header = q->header; struct qcow_l1_table *l1t = &q->table; struct qcow_l2_table *l2t; u64 clust_offset; u64 clust_start; u64 l2t_offset; size_t length; u64 l2t_size; u64 l1_idx; u64 l2_idx; int coffset; int sector_offset; int nb_csectors; int csize; l1_idx = get_l1_index(q, offset); if (l1_idx >= l1t->table_size) return -1; clust_offset = get_cluster_offset(q, offset); if (clust_offset >= q->cluster_size) return -1; length = q->cluster_size - clust_offset; if (length > dst_len) length = dst_len; mutex_lock(&q->mutex); l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]); l2t_offset &= ~QCOW2_OFLAG_COPIED; if (!l2t_offset) goto zero_cluster; l2t_size = 1 << header->l2_bits; /* read and cache level 2 table */ l2t = qcow_read_l2_table(q, l2t_offset); if (!l2t) goto out_error; l2_idx = get_l2_index(q, offset); if (l2_idx >= l2t_size) goto out_error; clust_start = be64_to_cpu(l2t->table[l2_idx]); if (clust_start & QCOW2_OFLAG_COMPRESSED) { coffset = clust_start & q->cluster_offset_mask; nb_csectors = ((clust_start >> q->csize_shift) & q->csize_mask) + 1; sector_offset = coffset & (SECTOR_SIZE - 1); csize = nb_csectors * SECTOR_SIZE - sector_offset; if (pread_in_full(q->fd, q->cluster_data, nb_csectors * SECTOR_SIZE, coffset & ~(SECTOR_SIZE - 1)) < 0) { goto out_error; } if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size, q->cluster_data + sector_offset, csize) < 0) { goto out_error; } memcpy(dst, q->cluster_cache + clust_offset, length); mutex_unlock(&q->mutex); } else { clust_start &= QCOW2_OFFSET_MASK; if (!clust_start) goto zero_cluster; mutex_unlock(&q->mutex); if (pread_in_full(q->fd, dst, length, clust_start + clust_offset) < 0) return -1; } return length; zero_cluster: mutex_unlock(&q->mutex); memset(dst, 0, length); return length; out_error: mutex_unlock(&q->mutex); length = -1; return -1; } static ssize_t qcow_read_sector_single(struct disk_image *disk, u64 sector, void *dst, u32 dst_len) { struct qcow *q = disk->priv; struct qcow_header *header = q->header; u32 nr_read; u64 offset; char *buf; u32 nr; buf = dst; nr_read = 0; while (nr_read < dst_len) { offset = sector << SECTOR_SHIFT; if (offset >= header->size) return -1; if (q->version == QCOW1_VERSION) nr = qcow1_read_cluster(q, offset, buf, dst_len - nr_read); else nr = qcow2_read_cluster(q, offset, buf, dst_len - nr_read); if (nr <= 0) return -1; nr_read += nr; buf += nr; sector += (nr >> 
SECTOR_SHIFT); } return dst_len; } static ssize_t qcow_read_sector(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount, void *param) { ssize_t nr, total = 0; while (iovcount--) { nr = qcow_read_sector_single(disk, sector, iov->iov_base, iov->iov_len); if (nr != (ssize_t)iov->iov_len) { pr_info("qcow_read_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len); return -1; } sector += iov->iov_len >> SECTOR_SHIFT; total += nr; iov++; } return total; } static void refcount_table_free_cache(struct qcow_refcount_table *rft) { struct rb_root *r = &rft->root; struct list_head *pos, *n; struct qcow_refcount_block *t; list_for_each_safe(pos, n, &rft->lru_list) { list_del(pos); t = list_entry(pos, struct qcow_refcount_block, list); rb_erase(&t->node, r); free(t); } } static int refcount_block_insert(struct rb_root *root, struct qcow_refcount_block *new) { struct rb_node **link = &(root->rb_node), *parent = NULL; u64 offset = new->offset; /* search the tree */ while (*link) { struct qcow_refcount_block *t; t = rb_entry(*link, struct qcow_refcount_block, node); if (!t) goto error; parent = *link; if (t->offset > offset) link = &(*link)->rb_left; else if (t->offset < offset) link = &(*link)->rb_right; else goto out; } /* add new node */ rb_link_node(&new->node, parent, link); rb_insert_color(&new->node, root); out: return 0; error: return -1; } static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb) { if (!rfb->dirty) return 0; if (qcow_pwrite_sync(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb->offset) < 0) return -1; rfb->dirty = 0; return 0; } static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c) { struct qcow_refcount_table *rft = &q->refcount_table; struct rb_root *r = &rft->root; struct qcow_refcount_block *lru; if (rft->nr_cached == MAX_CACHE_NODES) { lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list); rb_erase(&lru->node, r); list_del_init(&lru->list); rft->nr_cached--; free(lru); } if (refcount_block_insert(r, c) < 0) goto error; list_add_tail(&c->list, &rft->lru_list); rft->nr_cached++; return 0; error: return -1; } static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 rfb_offset) { struct qcow_refcount_block *rfb; rfb = malloc(sizeof *rfb + q->cluster_size); if (!rfb) return NULL; rfb->offset = rfb_offset; rfb->size = q->cluster_size / sizeof(u16); RB_CLEAR_NODE(&rfb->node); INIT_LIST_HEAD(&rfb->list); return rfb; } static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, u64 offset) { struct rb_node *link = root->rb_node; while (link) { struct qcow_refcount_block *t; t = rb_entry(link, struct qcow_refcount_block, node); if (!t) goto out; if (t->offset > offset) link = link->rb_left; else if (t->offset < offset) link = link->rb_right; else return t; } out: return NULL; } static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 offset) { struct qcow_refcount_table *rft = &q->refcount_table; struct qcow_refcount_block *rfb; rfb = refcount_block_lookup(&rft->root, offset); if (!rfb) return NULL; /* Update the LRU state, by moving the searched node to list tail */ list_move_tail(&rfb->list, &rft->lru_list); return rfb; } static struct qcow_refcount_block *qcow_grow_refcount_block(struct qcow *q, u64 clust_idx) { struct qcow_header *header = q->header; struct qcow_refcount_table *rft = &q->refcount_table; struct qcow_refcount_block *rfb; u64 new_block_offset; u64 rft_idx; rft_idx = clust_idx >> (header->cluster_bits - 
		QCOW_REFCOUNT_BLOCK_SHIFT);
	if (rft_idx >= rft->rf_size) {
		pr_warning("Growing the refcount table is not supported");
		return NULL;
	}

	new_block_offset = qcow_alloc_clusters(q, q->cluster_size, 0);
	if (new_block_offset == (u64)-1)
		return NULL;

	rfb = new_refcount_block(q, new_block_offset);
	if (!rfb)
		return NULL;

	memset(rfb->entries, 0x00, q->cluster_size);
	rfb->dirty = 1;

	/* write refcount block */
	if (write_refcount_block(q, rfb) < 0)
		goto free_rfb;

	if (cache_refcount_block(q, rfb) < 0)
		goto free_rfb;

	rft->rf_table[rft_idx] = cpu_to_be64(new_block_offset);
	if (update_cluster_refcount(q, new_block_offset >> header->cluster_bits, 1) < 0)
		goto recover_rft;

	if (qcow_write_refcount_table(q) < 0)
		goto recover_rft;

	return rfb;

recover_rft:
	rft->rf_table[rft_idx] = 0;
free_rfb:
	free(rfb);
	return NULL;
}

static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64 clust_idx)
{
	struct qcow_header *header = q->header;
	struct qcow_refcount_table *rft = &q->refcount_table;
	struct qcow_refcount_block *rfb;
	u64 rfb_offset;
	u64 rft_idx;

	rft_idx = clust_idx >> (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT);
	if (rft_idx >= rft->rf_size)
		return ERR_PTR(-ENOSPC);

	rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
	if (!rfb_offset)
		return ERR_PTR(-ENOSPC);

	rfb = refcount_block_search(q, rfb_offset);
	if (rfb)
		return rfb;

	rfb = new_refcount_block(q, rfb_offset);
	if (!rfb)
		return NULL;

	if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb_offset) < 0)
		goto error_free_rfb;

	if (cache_refcount_block(q, rfb) < 0)
		goto error_free_rfb;

	return rfb;

error_free_rfb:
	free(rfb);
	return NULL;
}

static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx)
{
	struct qcow_refcount_block *rfb = NULL;
	struct qcow_header *header = q->header;
	u64 rfb_idx;

	rfb = qcow_read_refcount_block(q, clust_idx);
	if (PTR_ERR(rfb) == -ENOSPC)
		return 0;
	else if (IS_ERR_OR_NULL(rfb)) {
		pr_warning("Error while reading refcount table");
		return -1;
	}

	rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
	if (rfb_idx >= rfb->size) {
		pr_warning("L1: refcount block index out of bounds");
		return -1;
	}

	return be16_to_cpu(rfb->entries[rfb_idx]);
}

static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append)
{
	struct qcow_refcount_block *rfb = NULL;
	struct qcow_header *header = q->header;
	u16 refcount;
	u64 rfb_idx;

	rfb = qcow_read_refcount_block(q, clust_idx);
	if (PTR_ERR(rfb) == -ENOSPC) {
		rfb = qcow_grow_refcount_block(q, clust_idx);
		if (!rfb) {
			pr_warning("error while growing refcount table");
			return -1;
		}
	} else if (IS_ERR_OR_NULL(rfb)) {
		pr_warning("error while reading refcount table");
		return -1;
	}

	rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
	if (rfb_idx >= rfb->size) {
		pr_warning("refcount block index out of bounds");
		return -1;
	}

	refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append;
	rfb->entries[rfb_idx] = cpu_to_be16(refcount);
	rfb->dirty = 1;

	/* write refcount block */
	if (write_refcount_block(q, rfb) < 0) {
		pr_warning("error while writing refcount block");
		return -1;
	}

	/* update free_clust_idx since refcount becomes zero */
	if (!refcount && clust_idx < q->free_clust_idx)
		q->free_clust_idx = clust_idx;

	return 0;
}

static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size)
{
	struct qcow_header *header = q->header;
	u64 start, end, offset;

	start = clust_start & ~(q->cluster_size - 1);
	end = (clust_start + size - 1) & ~(q->cluster_size - 1);
	for (offset = start; offset <= end; offset += q->cluster_size)
		update_cluster_refcount(q, offset >> header->cluster_bits, -1);
}
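/*
 * Worked example of the refcount indexing used above (illustrative only;
 * the concrete numbers assume the common cluster_bits = 16, i.e. 64 KiB
 * clusters, and QCOW_REFCOUNT_BLOCK_SHIFT == 1, matching the u16 entries
 * allocated in new_refcount_block()). A refcount block is one cluster of
 * u16 entries, so it holds cluster_size / 2 = 32768 refcounts and covers
 * 32768 * 64 KiB = 2 GiB of image data:
 *
 *   rft_idx = clust_idx >> (cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)
 *           = clust_idx / 32768   -> index into the refcount table
 *   rfb_idx = clust_idx & 32767   -> index into that refcount block
 *
 * e.g. clust_idx = 40000 gives rft_idx = 1, rfb_idx = 7232.
 */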
/*
 * Allocate clusters according to the size. Find a position that
 * can satisfy the size. free_clust_idx is initialized to zero and
 * records the last scanned position.
 */
static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref)
{
	struct qcow_header *header = q->header;
	u16 clust_refcount;
	u32 clust_idx = 0, i;
	u64 clust_num;

	clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;

again:
	for (i = 0; i < clust_num; i++) {
		clust_idx = q->free_clust_idx++;
		clust_refcount = qcow_get_refcount(q, clust_idx);
		if (clust_refcount == (u16)-1)
			return -1;
		else if (clust_refcount > 0)
			goto again;
	}

	clust_idx++;

	if (update_ref)
		for (i = 0; i < clust_num; i++)
			if (update_cluster_refcount(q, clust_idx - clust_num + i, 1))
				return -1;

	return (clust_idx - clust_num) << header->cluster_bits;
}

static int qcow_write_l1_table(struct qcow *q)
{
	struct qcow_l1_table *l1t = &q->table;
	struct qcow_header *header = q->header;

	if (qcow_pwrite_sync(q->fd, l1t->l1_table, l1t->table_size * sizeof(u64), header->l1_table_offset) < 0)
		return -1;

	return 0;
}

/*
 * Get the L2 table. If the table has been copied (QCOW2_OFLAG_COPIED),
 * read it directly. Otherwise allocate a new cluster and copy the
 * existing table, if any, into it.
 */
static int get_cluster_table(struct qcow *q, u64 offset, struct qcow_l2_table **result_l2t, u64 *result_l2_index)
{
	struct qcow_header *header = q->header;
	struct qcow_l1_table *l1t = &q->table;
	struct qcow_l2_table *l2t;
	u64 l1t_idx;
	u64 l2t_offset;
	u64 l2t_idx;
	u64 l2t_size;
	u64 l2t_new_offset;

	l2t_size = 1 << header->l2_bits;

	l1t_idx = get_l1_index(q, offset);
	if (l1t_idx >= l1t->table_size)
		return -1;

	l2t_idx = get_l2_index(q, offset);
	if (l2t_idx >= l2t_size)
		return -1;

	l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]);
	if (l2t_offset & QCOW2_OFLAG_COPIED) {
		l2t_offset &= ~QCOW2_OFLAG_COPIED;
		l2t = qcow_read_l2_table(q, l2t_offset);
		if (!l2t)
			goto error;
	} else {
		l2t_new_offset = qcow_alloc_clusters(q, l2t_size * sizeof(u64), 1);
		if (l2t_new_offset == (u64)-1)
			goto error;

		l2t = new_cache_table(q, l2t_new_offset);
		if (!l2t)
			goto free_cluster;

		if (l2t_offset) {
			l2t = qcow_read_l2_table(q, l2t_offset);
			if (!l2t)
				goto free_cache;
		} else
			memset(l2t->table, 0x00, l2t_size * sizeof(u64));

		/* write l2 table */
		l2t->dirty = 1;
		if (qcow_l2_cache_write(q, l2t) < 0)
			goto free_cache;

		/* cache l2 table */
		if (cache_table(q, l2t))
			goto free_cache;

		/* update the L1 table */
		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset | QCOW2_OFLAG_COPIED);
		if (qcow_write_l1_table(q)) {
			pr_warning("Update l1 table error");
			goto free_cache;
		}

		/* free old cluster */
		qcow_free_clusters(q, l2t_offset, q->cluster_size);
	}

	*result_l2t = l2t;
	*result_l2_index = l2t_idx;

	return 0;

free_cache:
	free(l2t);
free_cluster:
	qcow_free_clusters(q, l2t_new_offset, q->cluster_size);
error:
	return -1;
}
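/*
 * Sketch of how a guest offset is split by get_l1_index(), get_l2_index()
 * and get_cluster_offset() above (illustrative numbers only, assuming the
 * common cluster_bits = 16 so that l2_bits = cluster_bits - 3 = 13). An L2
 * table then has 8192 eight-byte entries, fills exactly one 64 KiB cluster,
 * and each L1 entry maps 8192 * 64 KiB = 512 MiB:
 *
 *   l1_idx       = offset >> (l2_bits + cluster_bits) = offset >> 29
 *   l2_idx       = (offset >> cluster_bits) & (8192 - 1)
 *   clust_offset = offset & (65536 - 1)
 *
 * e.g. offset = 0x23456789 -> l1_idx = 1, l2_idx = 837, clust_offset = 26505.
 */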
/*
 * If the cluster has already been copied (COPIED flag set), write the data
 * in place. Otherwise read the original cluster, apply the modification,
 * and write the result to a newly allocated cluster.
 */
static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
{
	struct qcow_l2_table *l2t;
	u64 clust_new_start;
	u64 clust_start;
	u64 clust_flags;
	u64 clust_off;
	u64 l2t_idx;
	u64 len;

	l2t = NULL;

	clust_off = get_cluster_offset(q, offset);
	if (clust_off >= q->cluster_size)
		return -1;

	len = q->cluster_size - clust_off;
	if (len > src_len)
		len = src_len;

	mutex_lock(&q->mutex);

	if (get_cluster_table(q, offset, &l2t, &l2t_idx)) {
		pr_warning("Get l2 table error");
		goto error;
	}

	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
	clust_flags = clust_start & QCOW2_OFLAGS_MASK;

	clust_start &= QCOW2_OFFSET_MASK;
	if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
		clust_new_start = qcow_alloc_clusters(q, q->cluster_size, 1);
		if (clust_new_start == (u64)-1) {
			pr_warning("Cluster alloc error");
			goto error;
		}

		offset &= ~(q->cluster_size - 1);

		/* if clust_start is not zero, read the original data */
		if (clust_start) {
			mutex_unlock(&q->mutex);
			if (qcow2_read_cluster(q, offset, q->copy_buff, q->cluster_size) < 0) {
				pr_warning("Read copy cluster error");
				qcow_free_clusters(q, clust_new_start, q->cluster_size);
				return -1;
			}
			mutex_lock(&q->mutex);
		} else
			memset(q->copy_buff, 0x00, q->cluster_size);

		memcpy(q->copy_buff + clust_off, buf, len);

		/* Write actual data */
		if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size, clust_new_start) < 0)
			goto free_cluster;

		/* update l2 table */
		l2t->table[l2t_idx] = cpu_to_be64(clust_new_start | QCOW2_OFLAG_COPIED);
		l2t->dirty = 1;

		if (qcow_l2_cache_write(q, l2t))
			goto free_cluster;

		/* free old cluster */
		if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
			int size;

			size = ((clust_start >> q->csize_shift) & q->csize_mask) + 1;
			size *= 512;
			clust_start &= q->cluster_offset_mask;
			clust_start &= ~511;

			qcow_free_clusters(q, clust_start, size);
		} else if (clust_start)
			qcow_free_clusters(q, clust_start, q->cluster_size);

	} else {
		/* Write actual data */
		if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
			goto error;
	}
	mutex_unlock(&q->mutex);
	return len;

free_cluster:
	qcow_free_clusters(q, clust_new_start, q->cluster_size);
error:
	mutex_unlock(&q->mutex);
	return -1;
}

static ssize_t qcow_write_sector_single(struct disk_image *disk, u64 sector, void *src, u32 src_len)
{
	struct qcow *q = disk->priv;
	struct qcow_header *header = q->header;
	u32 nr_written;
	char *buf;
	u64 offset;
	ssize_t nr;

	buf = src;
	nr_written = 0;
	offset = sector << SECTOR_SHIFT;

	while (nr_written < src_len) {
		if (offset >= header->size)
			return -1;

		nr = qcow_write_cluster(q, offset, buf, src_len - nr_written);
		if (nr < 0)
			return -1;

		nr_written += nr;
		buf += nr;
		offset += nr;
	}

	return nr_written;
}

static ssize_t qcow_write_sector(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount, void *param)
{
	ssize_t nr, total = 0;

	while (iovcount--) {
		nr = qcow_write_sector_single(disk, sector, iov->iov_base, iov->iov_len);
		if (nr != (ssize_t)iov->iov_len) {
			pr_info("qcow_write_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len);
			return -1;
		}

		sector += iov->iov_len >> SECTOR_SHIFT;
		iov++;
		total += nr;
	}

	return total;
}
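/*
 * Note on the flush path below: dirty refcount blocks and cached L2 tables
 * are written back first (each via qcow_pwrite_sync(), i.e. pwrite followed
 * by fdatasync), then the L1 table is rewritten, and only then is fsync()
 * issued on the image fd. This ordering ensures the on-disk metadata is
 * consistent before the guest's flush request is acknowledged.
 */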
&l1t->lru_list) { struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list); if (qcow_l2_cache_write(q, c) < 0) goto error_unlock; } if (qcow_write_l1_table < 0) goto error_unlock; mutex_unlock(&q->mutex); return fsync(disk->fd); error_unlock: mutex_unlock(&q->mutex); return -1; } static int qcow_disk_close(struct disk_image *disk) { struct qcow *q; if (!disk) return 0; q = disk->priv; refcount_table_free_cache(&q->refcount_table); l1_table_free_cache(&q->table); free(q->copy_buff); free(q->cluster_data); free(q->cluster_cache); free(q->refcount_table.rf_table); free(q->table.l1_table); free(q->header); free(q); return 0; } static struct disk_image_operations qcow_disk_readonly_ops = { .read = qcow_read_sector, .close = qcow_disk_close, }; static struct disk_image_operations qcow_disk_ops = { .read = qcow_read_sector, .write = qcow_write_sector, .flush = qcow_disk_flush, .close = qcow_disk_close, }; static int qcow_read_refcount_table(struct qcow *q) { struct qcow_header *header = q->header; struct qcow_refcount_table *rft = &q->refcount_table; rft->rf_size = (header->refcount_table_size * q->cluster_size) / sizeof(u64); rft->rf_table = calloc(rft->rf_size, sizeof(u64)); if (!rft->rf_table) return -1; rft->root = (struct rb_root) RB_ROOT; INIT_LIST_HEAD(&rft->lru_list); return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, header->refcount_table_offset); } static int qcow_write_refcount_table(struct qcow *q) { struct qcow_header *header = q->header; struct qcow_refcount_table *rft = &q->refcount_table; return qcow_pwrite_sync(q->fd, rft->rf_table, rft->rf_size * sizeof(u64), header->refcount_table_offset); } static int qcow_read_l1_table(struct qcow *q) { struct qcow_header *header = q->header; struct qcow_l1_table *table = &q->table; table->table_size = header->l1_size; table->l1_table = calloc(table->table_size, sizeof(u64)); if (!table->l1_table) return -1; return pread_in_full(q->fd, table->l1_table, sizeof(u64) * table->table_size, header->l1_table_offset); } static void *qcow2_read_header(int fd) { struct qcow2_header_disk f_header; struct qcow_header *header; header = malloc(sizeof(struct qcow_header)); if (!header) return NULL; if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0) { free(header); return NULL; } be32_to_cpus(&f_header.magic); be32_to_cpus(&f_header.version); be64_to_cpus(&f_header.backing_file_offset); be32_to_cpus(&f_header.backing_file_size); be32_to_cpus(&f_header.cluster_bits); be64_to_cpus(&f_header.size); be32_to_cpus(&f_header.crypt_method); be32_to_cpus(&f_header.l1_size); be64_to_cpus(&f_header.l1_table_offset); be64_to_cpus(&f_header.refcount_table_offset); be32_to_cpus(&f_header.refcount_table_clusters); be32_to_cpus(&f_header.nb_snapshots); be64_to_cpus(&f_header.snapshots_offset); *header = (struct qcow_header) { .size = f_header.size, .l1_table_offset = f_header.l1_table_offset, .l1_size = f_header.l1_size, .cluster_bits = f_header.cluster_bits, .l2_bits = f_header.cluster_bits - 3, .refcount_table_offset = f_header.refcount_table_offset, .refcount_table_size = f_header.refcount_table_clusters, }; return header; } static struct disk_image *qcow2_probe(int fd, bool readonly) { struct disk_image *disk_image; struct qcow_l1_table *l1t; struct qcow_header *h; struct qcow *q; q = calloc(1, sizeof(struct qcow)); if (!q) return NULL; mutex_init(&q->mutex); q->fd = fd; l1t = &q->table; l1t->root = (struct rb_root) RB_ROOT; INIT_LIST_HEAD(&l1t->lru_list); h = q->header = qcow2_read_header(fd); if (!h) goto 
free_qcow; q->version = QCOW2_VERSION; q->csize_shift = (62 - (q->header->cluster_bits - 8)); q->csize_mask = (1 << (q->header->cluster_bits - 8)) - 1; q->cluster_offset_mask = (1LL << q->csize_shift) - 1; q->cluster_size = 1 << q->header->cluster_bits; q->copy_buff = malloc(q->cluster_size); if (!q->copy_buff) { pr_warning("copy buff malloc error"); goto free_header; } q->cluster_data = malloc(q->cluster_size); if (!q->cluster_data) { pr_warning("cluster data malloc error"); goto free_copy_buff; } q->cluster_cache = malloc(q->cluster_size); if (!q->cluster_cache) { pr_warning("cluster cache malloc error"); goto free_cluster_data; } if (qcow_read_l1_table(q) < 0) goto free_cluster_cache; if (qcow_read_refcount_table(q) < 0) goto free_l1_table; /* * Do not use mmap use read/write instead */ if (readonly) disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR); else disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR); if (IS_ERR_OR_NULL(disk_image)) goto free_refcount_table; disk_image->priv = q; return disk_image; free_refcount_table: if (q->refcount_table.rf_table) free(q->refcount_table.rf_table); free_l1_table: if (q->table.l1_table) free(q->table.l1_table); free_cluster_cache: if (q->cluster_cache) free(q->cluster_cache); free_cluster_data: if (q->cluster_data) free(q->cluster_data); free_copy_buff: if (q->copy_buff) free(q->copy_buff); free_header: if (q->header) free(q->header); free_qcow: free(q); return NULL; } static bool qcow2_check_image(int fd) { struct qcow2_header_disk f_header; if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0) return false; be32_to_cpus(&f_header.magic); be32_to_cpus(&f_header.version); if (f_header.magic != QCOW_MAGIC) return false; if (f_header.version != QCOW2_VERSION) return false; return true; } static void *qcow1_read_header(int fd) { struct qcow1_header_disk f_header; struct qcow_header *header; header = malloc(sizeof(struct qcow_header)); if (!header) return NULL; if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) { free(header); return NULL; } be32_to_cpus(&f_header.magic); be32_to_cpus(&f_header.version); be64_to_cpus(&f_header.backing_file_offset); be32_to_cpus(&f_header.backing_file_size); be32_to_cpus(&f_header.mtime); be64_to_cpus(&f_header.size); be32_to_cpus(&f_header.crypt_method); be64_to_cpus(&f_header.l1_table_offset); *header = (struct qcow_header) { .size = f_header.size, .l1_table_offset = f_header.l1_table_offset, .l1_size = f_header.size / ((1 << f_header.l2_bits) * (1 << f_header.cluster_bits)), .cluster_bits = f_header.cluster_bits, .l2_bits = f_header.l2_bits, }; return header; } static struct disk_image *qcow1_probe(int fd, bool readonly) { struct disk_image *disk_image; struct qcow_l1_table *l1t; struct qcow_header *h; struct qcow *q; q = calloc(1, sizeof(struct qcow)); if (!q) return NULL; mutex_init(&q->mutex); q->fd = fd; l1t = &q->table; l1t->root = (struct rb_root)RB_ROOT; INIT_LIST_HEAD(&l1t->lru_list); INIT_LIST_HEAD(&q->refcount_table.lru_list); h = q->header = qcow1_read_header(fd); if (!h) goto free_qcow; q->version = QCOW1_VERSION; q->cluster_size = 1 << q->header->cluster_bits; q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1; q->free_clust_idx = 0; q->cluster_data = malloc(q->cluster_size); if (!q->cluster_data) { pr_warning("cluster data malloc error"); goto free_header; } q->cluster_cache = malloc(q->cluster_size); if (!q->cluster_cache) { pr_warning("cluster cache malloc error"); goto 
free_cluster_data; } if (qcow_read_l1_table(q) < 0) goto free_cluster_cache; /* * Do not use mmap use read/write instead */ if (readonly) disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR); else disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR); if (!disk_image) goto free_l1_table; disk_image->priv = q; return disk_image; free_l1_table: if (q->table.l1_table) free(q->table.l1_table); free_cluster_cache: if (q->cluster_cache) free(q->cluster_cache); free_cluster_data: if (q->cluster_data) free(q->cluster_data); free_header: if (q->header) free(q->header); free_qcow: free(q); return NULL; } static bool qcow1_check_image(int fd) { struct qcow1_header_disk f_header; if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) return false; be32_to_cpus(&f_header.magic); be32_to_cpus(&f_header.version); if (f_header.magic != QCOW_MAGIC) return false; if (f_header.version != QCOW1_VERSION) return false; return true; } struct disk_image *qcow_probe(int fd, bool readonly) { if (qcow1_check_image(fd)) return qcow1_probe(fd, readonly); if (qcow2_check_image(fd)) return qcow2_probe(fd, readonly); return NULL; }