diff options
-rw-r--r-- | xlators/cluster/afr/src/afr-dir-read.c | 302 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-dir-read.h | 3 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.c | 33 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 7 |
4 files changed, 299 insertions, 46 deletions
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index b48488526e5..98cda1e809f 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -24,6 +24,7 @@ #include <sys/time.h> #include <stdlib.h> #include <signal.h> +#include <string.h> #ifndef _CONFIG_H #define _CONFIG_H @@ -219,6 +220,7 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie, afr_local_t * local = NULL; int call_count = -1; + int ret = 0; LOCK (&frame->lock); { @@ -234,30 +236,33 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie, call_count = afr_frame_return (frame); if (call_count == 0) { - if ((local->op_ret == 0) && - !afr_is_opendir_done (this, fd->inode)) { - - /* - * This is the first opendir on this inode. We need - * to check if the directory's entries are the same - * on all subvolumes. This is needed in addition - * to regular entry self-heal because the readdir - * call is sent only to the first subvolume, and - * thus files that exist only there will never be healed - * otherwise (assuming changelog shows no anamolies). - */ - - gf_log (this->name, GF_LOG_TRACE, - "reading contents of directory %s looking for mismatch", - local->loc.path); - - afr_examine_dir (frame, this); - - } else { - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); + if (local->op_ret == 0) { + ret = afr_fd_ctx_set (this, local->fd); + + if (!afr_is_opendir_done (this, fd->inode)) { + + /* + * This is the first opendir on this inode. We need + * to check if the directory's entries are the same + * on all subvolumes. This is needed in addition + * to regular entry self-heal because the readdir + * call is sent only to the first subvolume, and + * thus files that exist only there will never be healed + * otherwise (assuming changelog shows no anamolies). + */ + + gf_log (this->name, GF_LOG_TRACE, + "reading contents of directory %s looking for mismatch", + local->loc.path); + + afr_examine_dir (frame, this); + + } else { + AFR_STACK_UNWIND (opendir, frame, local->op_ret, + local->op_errno, local->fd); + } } - } + } return 0; } @@ -333,6 +338,120 @@ out: * Applicable to: readdir */ + +struct entry_name { + char *name; + struct list_head list; +}; + + +static gf_boolean_t +remembered_name (const char *name, struct list_head *entries) +{ + struct entry_name *e; + gf_boolean_t ret = _gf_false; + + list_for_each_entry (e, entries, list) { + if (!strcmp (name, e->name)) { + ret = _gf_true; + goto out; + } + } + +out: + return ret; +} + + +static void +afr_remember_entries (gf_dirent_t *entries, fd_t *fd) +{ + struct entry_name *n = NULL; + gf_dirent_t * entry = NULL; + + int ret = 0; + + uint64_t ctx; + afr_fd_ctx_t *fd_ctx; + + ret = fd_ctx_get (fd, THIS, &ctx); + if (ret < 0) { + gf_log (THIS->name, GF_LOG_DEBUG, + "could not get fd ctx for fd=%p", fd); + return; + } + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + + list_for_each_entry (entry, &entries->list, list) { + n = CALLOC (1, sizeof (*n)); + n->name = strdup (entry->d_name); + INIT_LIST_HEAD (&n->list); + + list_add (&n->list, &fd_ctx->entries); + } +} + + +static off_t +afr_filter_entries (gf_dirent_t *entries, fd_t *fd) +{ + gf_dirent_t *entry, *tmp; + int ret = 0; + + uint64_t ctx; + afr_fd_ctx_t *fd_ctx; + + off_t offset; + + ret = fd_ctx_get (fd, THIS, &ctx); + if (ret < 0) { + gf_log (THIS->name, GF_LOG_DEBUG, + "could not get fd ctx for fd=%p", fd); + return -1; + } + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + + list_for_each_entry_safe (entry, tmp, &entries->list, list) { + offset = entry->d_off; + + if (remembered_name (entry->d_name, &fd_ctx->entries)) { + list_del (&entry->list); + FREE (entry); + } + } + + return offset; +} + + +static void +afr_forget_entries (fd_t *fd) +{ + struct entry_name *entry, *tmp; + int ret = 0; + + uint64_t ctx; + afr_fd_ctx_t *fd_ctx; + + ret = fd_ctx_get (fd, THIS, &ctx); + if (ret < 0) { + gf_log (THIS->name, GF_LOG_DEBUG, + "could not get fd ctx for fd=%p", fd); + return; + } + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + + list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) { + FREE (entry->name); + list_del (&entry->list); + FREE (entry); + } +} + + int32_t afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, @@ -383,11 +502,19 @@ afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, xlator_t ** children = NULL; ino_t inum = 0; + int call_child = 0; + int ret = 0; + gf_dirent_t * entry = NULL; gf_dirent_t * tmp = NULL; int child_index = -1; + uint64_t ctx; + afr_fd_ctx_t *fd_ctx; + + off_t offset = 0; + priv = this->private; children = priv->children; @@ -395,23 +522,88 @@ afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, child_index = (long) cookie; - if (op_ret != -1) { - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - inum = afr_itransform (entry->d_ino, priv->child_count, - child_index); - entry->d_ino = inum; - inum = afr_itransform (entry->d_stat.st_ino, - priv->child_count, child_index); - entry->d_stat.st_ino = inum; + if (priv->strict_readdir) { + ret = fd_ctx_get (local->fd, this, &ctx); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "could not get fd ctx for fd=%p", local->fd); + op_ret = -1; + op_errno = -ret; + goto out; + } - if ((local->fd->inode == local->fd->inode->table->root) - && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - FREE (entry); - } + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + + if (child_went_down (op_ret, op_errno)) { + if (all_tried (child_index, priv->child_count)) { + goto out; + } + + call_child = ++child_index; + + gf_log (this->name, GF_LOG_TRACE, + "starting readdir afresh on child %d, offset %"PRId64, + call_child, (uint64_t) 0); + + fd_ctx->failed_over = _gf_true; + + STACK_WIND_COOKIE (frame, afr_readdirp_cbk, + (void *) (long) call_child, + children[call_child], + children[call_child]->fops->readdirp, local->fd, + local->cont.readdir.size, 0); + return 0; + } + } + + list_for_each_entry_safe (entry, tmp, &entries->list, list) { + inum = afr_itransform (entry->d_ino, priv->child_count, + child_index); + entry->d_ino = inum; + inum = afr_itransform (entry->d_stat.st_ino, + priv->child_count, child_index); + entry->d_stat.st_ino = inum; + + if ((local->fd->inode == local->fd->inode->table->root) + && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { + list_del_init (&entry->list); + FREE (entry); } } + if (priv->strict_readdir) { + if (fd_ctx->failed_over) { + if (list_empty (&entries->list)) { + goto out; + } + + offset = afr_filter_entries (entries, local->fd); + + afr_remember_entries (entries, local->fd); + + if (list_empty (&entries->list)) { + /* All the entries we got were duplicate. We + shouldn't send an empty list now, because + that'll make the application stop reading. So + try to get more entries */ + + gf_log (this->name, GF_LOG_TRACE, + "trying to fetch non-duplicate entries from offset %"PRId64", child %s", + offset, children[child_index]->name); + + STACK_WIND_COOKIE (frame, afr_readdirp_cbk, + (void *) (long) child_index, + children[child_index], + children[child_index]->fops->readdirp, + local->fd, local->cont.readdir.size, offset); + return 0; + } + } else { + afr_remember_entries (entries, local->fd); + } + } + +out: AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries); return 0; @@ -427,6 +619,9 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this, int call_child = 0; afr_local_t *local = NULL; + uint64_t ctx; + afr_fd_ctx_t *fd_ctx; + int ret = -1; int32_t op_ret = -1; @@ -445,7 +640,7 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this, op_errno = -ret; goto out; } - + frame->local = local; call_child = afr_first_up_child (priv); @@ -458,7 +653,29 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this, local->fd = fd_ref (fd); local->cont.readdir.size = size; - local->cont.readdir.offset = offset; + + if (priv->strict_readdir) { + ret = fd_ctx_get (fd, this, &ctx); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "could not get fd ctx for fd=%p", fd); + op_errno = -ret; + goto out; + } + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + + if (fd_ctx->last_tried != call_child) { + gf_log (this->name, GF_LOG_TRACE, + "first up child has changed from %d to %d, restarting readdir from offset 0", + fd_ctx->last_tried, call_child); + + fd_ctx->failed_over = _gf_true; + offset = 0; + } + + fd_ctx->last_tried = call_child; + } if (whichop == GF_FOP_READDIR) STACK_WIND_COOKIE (frame, afr_readdir_cbk, @@ -545,6 +762,15 @@ out: int32_t +afr_releasedir (xlator_t *this, fd_t *fd) +{ + afr_forget_entries (fd); + + return 0; +} + + +int32_t afr_getdents (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, int32_t flag) { diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h index 98ce1ca1680..abde2534de9 100644 --- a/xlators/cluster/afr/src/afr-dir-read.h +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -26,8 +26,7 @@ afr_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd); int32_t -afr_closedir (call_frame_t *frame, xlator_t *this, - fd_t *fd); +afr_releasedir (xlator_t *this, fd_t *fd); int32_t afr_readdir (call_frame_t *frame, xlator_t *this, diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 91189267fe7..c041adc99e0 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -1128,6 +1128,8 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd) if (ret < 0) { op_ret = ret; } + + INIT_LIST_HEAD (&fd_ctx->entries); } unlock: UNLOCK (&fd->lock); @@ -2693,11 +2695,12 @@ init (xlator_t *this) int ret = -1; int op_errno = 0; - char * read_subvol = NULL; - char * fav_child = NULL; - char * self_heal = NULL; - char * algo = NULL; - char * change_log = NULL; + char * read_subvol = NULL; + char * fav_child = NULL; + char * self_heal = NULL; + char * algo = NULL; + char * change_log = NULL; + char * strict_readdir = NULL; int32_t background_count = 0; int32_t lock_server_count = 1; @@ -2893,6 +2896,20 @@ init (xlator_t *this) priv->entry_lock_server_count = lock_server_count; } + priv->strict_readdir = _gf_false; + + dict_ret = dict_get_str (this->options, "strict-readdir", + &strict_readdir); + if (dict_ret == 0) { + ret = gf_string2boolean (strict_readdir, &priv->strict_readdir); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "Invalid 'option strict-readdir %s'. " + "Defaulting to strict-readdir as 'off'.", + strict_readdir); + } + } + trav = this->children; while (trav) { if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) { @@ -3037,7 +3054,8 @@ struct xlator_dumpops dumpops = { struct xlator_cbks cbks = { - .release = afr_release, + .release = afr_release, + .releasedir = afr_releasedir, }; @@ -3090,5 +3108,8 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_INT, .min = 0 }, + { .key = {"strict-readdir"}, + .type = GF_OPTION_TYPE_BOOL, + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 71f4b7e561e..a5c75add7ba 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -67,6 +67,8 @@ typedef struct _afr_private { unsigned int metadata_lock_server_count; unsigned int entry_lock_server_count; + gf_boolean_t strict_readdir; + unsigned int wait_count; /* # of servers to wait for success */ uint64_t up_count; /* number of CHILD_UPs we have seen */ @@ -327,6 +329,7 @@ typedef struct _afr_local { size_t size; off_t offset; + gf_boolean_t failed; int last_tried; } readdir; @@ -547,6 +550,10 @@ typedef struct { int32_t wbflags; uint64_t up_count; /* number of CHILD_UPs this fd has seen */ uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ + + int32_t last_tried; + gf_boolean_t failed_over; + struct list_head entries; /* needed for readdir failover */ } afr_fd_ctx_t; |