diff options
author | Krutika Dhananjay <kdhananj@redhat.com> | 2015-05-18 18:06:32 +0530 |
---|---|---|
committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2015-05-31 20:31:57 -0700 |
commit | 43d464326b1158d5b6caa60b2eac5b78f20b5026 (patch) | |
tree | 155d0ea632bbc127a7bdcb94a3a39b14a4f7bac4 /xlators | |
parent | 7023870b28b1eb87fb6eca2904c72e91fdcaf625 (diff) |
features/shard: Fix issue with readdir(p) fop
Problem:
When readdir(p) is performed on '/' and ".shard" happens to be
the last of the entries read in a given iteration of dht_readdir(p)
(in other words the entry with the highest offset in the dirent list
sorted in ascending order of d_offs), shard xlator would delete this
entry as part of handling the call so as to avoid exposing its presence
to the application. This would cause xlators above (like fuse,
readdir-ahead etc) to wind the next readdirp as part of the same req
at an offset which is (now) the highest d_off (post deletion of .shard)
from the previously unwound list of entries. This offset would be less
than that of ".shard" and therefore cause /.shard to be read once again.
If by any chance this happens to be the only entry until end-of-directory,
shard xlator would delete this entry and unwind with 0 entries, causing the
xlator(s) above to think there is nothing more to readdir and the fop is
complete. This would prevent DHT from gathering entries from the rest of
its subvolumes, causing some entries to disappear.
Fix:
At the level of shard xlator, if ".shard" happens to be the last entry,
make shard xlator wind another readdirp at offset equal to d_off of
".shard". That way, if ".shard" happens to be the only other entry under '/'
until end-of-directory, DHT would receive an op_ret=0. This would enable it
to wind readdir(p) on the rest of its subvols and gather the complete picture.
Also, fixed a bug in shard_lookup_cbk() wherein file_size should be fetched
unconditionally in cbk since it is set unconditionally in the wind path, failing
which, lookup would be unwound with ia_size and ia_blocks only equal to that of
the base file.
Change-Id: I6c2bc770f1bcdad51c273c777ae0b42c88c53f61
BUG: 1222379
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: http://review.gluster.org/10809
Tested-by: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Diffstat (limited to 'xlators')
-rw-r--r-- | xlators/features/shard/src/shard.c | 156 | ||||
-rw-r--r-- | xlators/features/shard/src/shard.h | 3 |
2 files changed, 116 insertions, 43 deletions
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c index c3613711840..670fc60ddc0 100644 --- a/xlators/features/shard/src/shard.c +++ b/xlators/features/shard/src/shard.c @@ -214,6 +214,8 @@ shard_local_wipe (shard_local_t *local) GF_FREE (local->vector); if (local->iobref) iobref_unref (local->iobref); + if (local->list_inited) + gf_dirent_free (&local->entries_head); } int @@ -563,43 +565,39 @@ shard_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (IA_ISDIR (buf->ia_type)) goto unwind; - if (!shard_inode_ctx_get_block_size (inode, this, &size)) - goto unwind; - - ret = dict_get_uint64 (xdata, GF_XATTR_SHARD_BLOCK_SIZE, &size); - if (!ret) { - ctx_tmp.block_size = ntoh64 (size); - ctx_tmp.mode = st_mode_from_ia (buf->ia_prot, buf->ia_type); - ctx_tmp.rdev = buf->ia_rdev; - /* Sharding xlator would fetch size and block count only if - * @size is present. The absence of GF_XATTR_SHARD_BLOCK_SIZE on - * the file looked up could be because it was created before - * sharding was enabled on the volume. - */ - ret = dict_get_ptr (xdata, GF_XATTR_SHARD_FILE_SIZE, - &size_attr); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, "Failed to " - "get xattr "GF_XATTR_SHARD_FILE_SIZE" from disk" - " for %s", uuid_utoa (inode->gfid)); - op_ret = -1; - op_errno = ENOMEM; - goto unwind; + /* If this was a fresh lookup, there are two possibilities: + * 1) If the file is sharded (indicated by the presence of block size + * xattr), store this block size, along with rdev and mode in its + * inode ctx. + * 2) If the file is not sharded, store size along with rdev and mode + * (which are anyway don't cares) in inode ctx. Since @ctx_tmp is + * already initialised to all zeroes, nothing more needs to be done. + */ + if (shard_inode_ctx_get_block_size (inode, this, &size)) { + ret = dict_get_uint64 (xdata, GF_XATTR_SHARD_BLOCK_SIZE, &size); + if (!ret) { + ctx_tmp.block_size = ntoh64 (size); + ctx_tmp.mode = st_mode_from_ia (buf->ia_prot, + buf->ia_type); + ctx_tmp.rdev = buf->ia_rdev; } + ret = shard_inode_ctx_set_all (inode, this, &ctx_tmp); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "Failed to set " + "inode ctx for %s", uuid_utoa (buf->ia_gfid)); + } + /* Also, if the file is sharded, get the file size and block cnt xattr, + * and store them in the stbuf appropriately. + */ + + ret = dict_get_ptr (xdata, GF_XATTR_SHARD_FILE_SIZE, &size_attr); + if (!ret) { memcpy (size_array, size_attr, sizeof (size_array)); buf->ia_size = ntoh64 (size_array[0]); buf->ia_blocks = ntoh64 (size_array[2]); } - /* else it is assumed that the file was created prior to enabling - * sharding on the volume. - */ - - ret = shard_inode_ctx_set_all (inode, this, &ctx_tmp); - if (ret) - gf_log (this->name, GF_LOG_WARNING, "Failed to set inode ctx " - "for %s", uuid_utoa (buf->ia_gfid)); unwind: SHARD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, @@ -3159,18 +3157,57 @@ shard_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, return 0; } +int +shard_readdir_past_dot_shard_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, gf_dirent_t *orig_entries, + dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) + goto unwind; + + list_for_each_entry_safe (entry, tmp, (&orig_entries->list), list) { + + list_del_init (&entry->list); + list_add_tail (&entry->list, &local->entries_head.list); + + if (!entry->dict) + continue; + + if (IA_ISDIR (entry->d_stat.ia_type)) + continue; + + shard_modify_size_and_block_count (&entry->d_stat, entry->dict); + } + local->op_ret += op_ret; + +unwind: + if (local->fop == GF_FOP_READDIR) + SHARD_STACK_UNWIND (readdir, frame, local->op_ret, + local->op_errno, + &local->entries_head, xdata); + else + SHARD_STACK_UNWIND (readdirp, frame, op_ret, op_errno, + &local->entries_head, xdata); + return 0; +} + int32_t shard_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + int32_t op_ret, int32_t op_errno, gf_dirent_t *orig_entries, dict_t *xdata) { - fd_t *fd = NULL; - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - shard_local_t *local = NULL; - gf_dirent_t skipped; - - INIT_LIST_HEAD (&skipped.list); + fd_t *fd = NULL; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + shard_local_t *local = NULL; + gf_boolean_t last_entry = _gf_false; local = frame->local; fd = local->fd; @@ -3178,26 +3215,55 @@ shard_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret < 0) goto unwind; - list_for_each_entry_safe (entry, tmp, &entries->list, list) { + list_for_each_entry_safe (entry, tmp, (&orig_entries->list), list) { + if (last_entry) + last_entry = _gf_false; + if (__is_root_gfid (fd->inode->gfid) && !(strcmp (entry->d_name, GF_SHARD_DIR))) { - list_del_init (&entry->list); - list_add_tail (&entry->list, &skipped.list); + local->offset = entry->d_off; + op_ret--; + last_entry = _gf_true; continue; } + list_del_init (&entry->list); + list_add_tail (&entry->list, &local->entries_head.list); + if (!entry->dict) continue; + if (IA_ISDIR (entry->d_stat.ia_type)) continue; shard_modify_size_and_block_count (&entry->d_stat, entry->dict); + } + + local->op_ret = op_ret; + if (last_entry) { + if (local->fop == GF_FOP_READDIR) + STACK_WIND (frame, shard_readdir_past_dot_shard_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, local->fd, + local->readdir_size, local->offset, + local->xattr_req); + else + STACK_WIND (frame, shard_readdir_past_dot_shard_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, + local->fd, local->readdir_size, + local->offset, local->xattr_req); + return 0; } unwind: - SHARD_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, xdata); - gf_dirent_free (&skipped); + if (local->fop == GF_FOP_READDIR) + SHARD_STACK_UNWIND (readdir, frame, op_ret, op_errno, + &local->entries_head, xdata); + else + SHARD_STACK_UNWIND (readdirp, frame, op_ret, op_errno, + &local->entries_head, xdata); return 0; } @@ -3218,6 +3284,10 @@ shard_readdir_do (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, frame->local = local; local->fd = fd_ref (fd); + local->fop = whichop; + local->readdir_size = size; + INIT_LIST_HEAD (&local->entries_head.list); + local->list_inited = _gf_true; if (whichop == GF_FOP_READDIR) { STACK_WIND (frame, shard_readdir_cbk, FIRST_CHILD(this), diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h index 5423c03b1fa..6bdbcbb9b6e 100644 --- a/xlators/features/shard/src/shard.h +++ b/xlators/features/shard/src/shard.h @@ -153,6 +153,7 @@ typedef struct shard_local { size_t written_size; size_t hole_size; size_t req_size; + size_t readdir_size; loc_t loc; loc_t dot_shard_loc; loc_t loc2; @@ -171,6 +172,8 @@ typedef struct shard_local { struct iovec *vector; struct iobref *iobref; struct iobuf *iobuf; + gf_dirent_t entries_head; + gf_boolean_t list_inited; shard_post_fop_handler_t handler; shard_post_lookup_shards_fop_handler_t pls_fop_handler; shard_post_resolve_fop_handler_t post_res_handler; |