diff options
Diffstat (limited to 'xlators/cluster/dht/src')
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 99 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 53 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-diskusage.c | 75 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-helper.c | 183 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-inode-read.c | 69 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-inode-write.c | 197 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-layout.c | 35 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-linkfile.c | 4 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 155 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-rename.c | 48 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-selfheal.c | 33 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 26 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht.c | 1 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/nufa.c | 190 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/switch.c | 6 |
15 files changed, 807 insertions, 367 deletions
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index ec54e438e..8f61339e6 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -22,6 +22,7 @@ #include "dht-common.h" #include "defaults.h" #include "byte-order.h" +#include "glusterfs-acl.h" #include <sys/time.h> #include <libgen.h> @@ -62,6 +63,11 @@ dht_aggregate (dict_t *this, char *key, data_t *value, void *data) } *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr)); + + } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) { + ret = gf_get_min_stime (THIS, dst, key, value); + if (ret < 0) + return ret; } else { /* compare user xattrs only */ if (!strncmp (key, "user.", strlen ("user."))) { @@ -149,7 +155,6 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) int ret = -1; dht_layout_t *layout = NULL; dht_conf_t *conf = NULL; - uint32_t missing = 0; local = discover_frame->local; layout = local->layout; @@ -186,26 +191,20 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) goto out; } } else { - ret = dht_layout_normalize (this, &local->loc, layout, - &missing); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "normalizing failed on %s (internal error)", - local->loc.path); - op_errno = EIO; - goto out; - } - if (missing == conf->subvolume_cnt) { - gf_log (this->name, GF_LOG_DEBUG, - "normalizing failed on %s, ENOENT errors: %u)", - local->loc.path, missing); - op_errno = ENOENT; - goto out; - } - if (ret != 0) { + ret = dht_layout_normalize (this, &local->loc, layout); + if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) { + /* either the layout is incorrect or the directory is + * not found even in one subvolume. + */ gf_log (this->name, GF_LOG_DEBUG, "normalizing failed on %s " - "(overlaps/holes present)", local->loc.path); + "(overlaps/holes present: %s, " + "ENOENT errors: %d)", local->loc.path, + (ret < 0) ? "yes" : "no", (ret > 0) ? ret : 0); + if ((ret > 0) && (ret == conf->subvolume_cnt)) { + op_errno = ESTALE; + goto out; + } } if (local->inode) @@ -409,7 +408,6 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_layout_t *layout = NULL; int ret = -1; int is_dir = 0; - uint32_t missing = 0; GF_VALIDATE_OR_GOTO ("dht", frame, out); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -443,7 +441,7 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, op_ret, op_errno, xattr); if (op_ret == -1) { - local->op_errno = ENOENT; + local->op_errno = op_errno; gf_log (this->name, GF_LOG_DEBUG, "lookup of %s on %s returned error (%s)", local->loc.path, prev->this->name, @@ -490,16 +488,9 @@ unlock: } if (local->op_ret == 0) { - ret = dht_layout_normalize (this, &local->loc, layout, - &missing); - - /* - * Arguably, we shouldn't do self-heal just because - * bricks are missing as long as there are no other - * anomalies. For now, though, just preserve the - * existing behavior. - */ - if ((ret != 0) || (missing != 0)) { + ret = dht_layout_normalize (this, &local->loc, layout); + + if (ret != 0) { gf_log (this->name, GF_LOG_DEBUG, "fixing assignment on %s", local->loc.path); @@ -1424,7 +1415,6 @@ dht_lookup (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); conf = this->private; if (!conf) @@ -1645,7 +1635,8 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&frame->lock); { - if ((op_ret == -1) && (op_errno != ENOENT)) { + if ((op_ret == -1) && !((op_errno == ENOENT) || + (op_errno == ENOTCONN))) { local->op_errno = op_errno; gf_log (this->name, GF_LOG_DEBUG, "subvolume %s returned -1 (%s)", @@ -1797,6 +1788,7 @@ dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this, } (void) strcat (local->xattr_val, value); + (void) strcat (local->xattr_val, " "); local->op_ret = 0; } @@ -1821,6 +1813,8 @@ dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this, if (!*dict) goto out; + local->xattr_val[strlen (local->xattr_val) - 1] = '\0'; + /* we would need max this many bytes to create xattr string * extra 40 bytes is just an estimated amount of additional * space required as we include translator name and some @@ -2128,7 +2122,6 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); VALIDATE_OR_GOTO (this->private, err); conf = this->private; @@ -2173,8 +2166,9 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, * NOTE: Don't trust inode here, as that may not be valid * (until inode_link() happens) */ - if (key && (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0) - && DHT_IS_DIR(layout)) { + if (key && DHT_IS_DIR(layout) && + ((strcmp (key, GF_XATTR_PATHINFO_KEY) == 0) + || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) { (void) strncpy (local->xsel, key, 256); cnt = local->call_cnt = layout->cnt; for (i = 0; i < cnt; i++) { @@ -2245,7 +2239,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (cluster_getmarkerattr (frame, this, loc, key, local, dht_getxattr_unwind, sub_volumes, cnt, - MARKER_UUID_TYPE, conf->vol_uuid)) { + MARKER_UUID_TYPE, marker_uuid_default_gauge, + conf->vol_uuid)) { op_errno = EINVAL; goto err; } @@ -2269,6 +2264,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, local, dht_getxattr_unwind, sub_volumes, cnt, MARKER_XTIME_TYPE, + marker_xtime_default_gauge, conf->vol_uuid)) { op_errno = EINVAL; goto err; @@ -2484,7 +2480,6 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); conf = this->private; @@ -2698,7 +2693,6 @@ dht_removexattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); local = dht_local_init (frame, loc, NULL, GF_FOP_REMOVEXATTR); if (!local) { @@ -2930,7 +2924,6 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); VALIDATE_OR_GOTO (this->private, err); conf = this->private; @@ -3051,7 +3044,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, list_for_each_entry (orig_entry, (&orig_entries->list), list) { next_offset = orig_entry->d_off; if (check_is_dir (NULL, (&orig_entry->d_stat), NULL) && - (prev->this != dht_first_up_subvol (this))) { + (prev->this != local->first_up_subvol)) { continue; } if (check_is_linkfile (NULL, (&orig_entry->d_stat), @@ -3133,13 +3126,16 @@ done: } if (conf->readdir_optimize == _gf_true) { - if (next_subvol != dht_first_up_subvol (this)) { + if (next_subvol != local->first_up_subvol) { ret = dict_set_int32 (local->xattr, GF_READDIR_SKIP_DIRS, 1); if (ret) gf_log (this->name, GF_LOG_ERROR, "dict set failed"); - } + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); + } } STACK_WIND (frame, dht_readdirp_cbk, @@ -3285,6 +3281,7 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, local->fd = fd_ref (fd); local->size = size; local->xattr_req = (dict)? dict_ref (dict) : NULL; + local->first_up_subvol = dht_first_up_subvol (this); dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); @@ -3303,13 +3300,16 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, "failed to set '%s' key", conf->link_xattr_name); if (conf->readdir_optimize == _gf_true) { - if (xvol != dht_first_up_subvol (this)) { + if (xvol != local->first_up_subvol) { ret = dict_set_int32 (local->xattr, GF_READDIR_SKIP_DIRS, 1); if (ret) gf_log (this->name, GF_LOG_ERROR, "Dict set failed"); + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); } } } @@ -3567,7 +3567,9 @@ dht_mknod (call_frame_t *frame, xlator_t *this, subvol, subvol->fops->mknod, loc, mode, rdev, umask, params); } else { - avail_subvol = dht_free_disk_available_subvol (this, subvol); + + avail_subvol = dht_free_disk_available_subvol (this, subvol, + local); if (avail_subvol != subvol) { /* Choose the minimum filled volume, and create the files there */ @@ -3988,7 +3990,7 @@ dht_create (call_frame_t *frame, xlator_t *this, } /* Choose the minimum filled volume, and create the files there */ - avail_subvol = dht_free_disk_available_subvol (this, subvol); + avail_subvol = dht_free_disk_available_subvol (this, subvol, local); if (avail_subvol != subvol) { local->params = dict_ref (params); local->flags = flags; @@ -4864,7 +4866,6 @@ dht_entrylk (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK); if (!local) { @@ -5172,8 +5173,8 @@ unlock: * not need to handle CHILD_DOWN event here. */ if (conf->defrag) { - ret = pthread_create (&conf->defrag->th, NULL, - gf_defrag_start, this); + ret = gf_thread_create (&conf->defrag->th, NULL, + gf_defrag_start, this); if (ret) { conf->defrag = NULL; GF_FREE (conf->defrag); diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index fb90e48cc..5ccd66799 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -183,6 +183,7 @@ struct dht_local { xlator_t *link_subvol; struct dht_rebalance_ rebalance; + xlator_t *first_up_subvol; }; typedef struct dht_local dht_local_t; @@ -211,6 +212,10 @@ enum gf_defrag_status_t { GF_DEFRAG_STATUS_STOPPED, GF_DEFRAG_STATUS_COMPLETE, GF_DEFRAG_STATUS_FAILED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED, + GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, + GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED, }; typedef enum gf_defrag_status_t gf_defrag_status_t; @@ -227,6 +232,7 @@ struct gf_defrag_info_ { uint64_t total_data; uint64_t num_files_lookedup; uint64_t total_failures; + uint64_t skipped; gf_lock_t lock; int cmd; pthread_t th; @@ -394,26 +400,18 @@ typedef enum { } while (0) #define is_greater_time(a, an, b, bn) (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) - -dht_layout_t *dht_layout_new (xlator_t *this, int cnt); -dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); -dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); -xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, - const char *name); -int dht_layout_normalize (xlator_t *this, loc_t *loc, - dht_layout_t *layout, - uint32_t *missing_p); -int dht_layout_anomalies (xlator_t *this, loc_t *loc, - dht_layout_t *layout, - uint32_t *holes_p, - uint32_t *overlaps_p, - uint32_t *missing_p, - uint32_t *down_p, - uint32_t *misc_p, - uint32_t *no_space_p); -int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, - xlator_t *subvol, loc_t *loc, - dict_t *xattr); +dht_layout_t *dht_layout_new (xlator_t *this, int cnt); +dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); +dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); +xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, + const char *name); +int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout); +int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, + uint32_t *misc_p, uint32_t *no_space_p); +int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, + xlator_t *subvol, loc_t *loc, dict_t *xattr); xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *buf, dict_t *xattr); @@ -445,6 +443,7 @@ int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc); xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode); xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev); +xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev); int dht_subvol_cnt (xlator_t *this, xlator_t *subvol); int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p); @@ -469,7 +468,8 @@ dht_layout_sort_volname (dht_layout_t *layout); int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc); gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); -xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol); +xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + dht_local_t *layout); int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode); @@ -695,6 +695,8 @@ int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, off_t offset, size_t len, dict_t *xdata); int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, size_t len, dict_t *xdata); +int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata); int32_t dht_init (xlator_t *this); void dht_fini (xlator_t *this); @@ -764,9 +766,11 @@ dht_dir_has_layout (dict_t *xattr, char *name); gf_boolean_t dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator); xlator_t * -dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol); +dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); xlator_t * -dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol); +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); int dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this); @@ -777,4 +781,7 @@ dht_priv_dump (xlator_t *this); int32_t dht_inodectx_dump (xlator_t *this, inode_t *inode); +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol); + #endif/* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 0c87f4a64..fe3955ecb 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -251,25 +251,45 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) /*Get the best subvolume to create the file in*/ xlator_t * -dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol) +dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + dht_local_t *local) { xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + loc_t *loc = NULL; conf = this->private; + if (!local) + goto out; + loc = &local->loc; + if (!local->layout) { + layout = dht_layout_get (this, loc->parent); + + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "layout missing path=%s parent=%s", + loc->path, uuid_utoa (loc->parent->gfid)); + goto out; + } + } else { + layout = dht_layout_ref (this, local->layout); + } - LOCK (&conf->subvolume_lock); + LOCK (&conf->subvolume_lock); { - avail_subvol = dht_subvol_with_free_space_inodes(this, subvol); + avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, + layout); if(!avail_subvol) { avail_subvol = dht_subvol_maxspace_nonzeroinode(this, - subvol); + subvol, + layout); } } UNLOCK (&conf->subvolume_lock); - +out: if (!avail_subvol) { gf_log (this->name, GF_LOG_DEBUG, @@ -278,17 +298,42 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol) avail_subvol = subvol; } - + if (layout) + dht_layout_unref (this, layout); return avail_subvol; } +static inline +int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout) +{ + int ret = -1; + int i = 0; + + if (!this || !layout) + goto out; + + /* check if subvol has layout errors, before selecting it */ + for (i = 0; i < layout->cnt; i++) { + if (!strcmp (layout->list[i].xlator->name, this->name) && + (layout->list[i].err != 0)) { + ret = -1; + goto out; + } + } + ret = 0; +out: + return ret; +} + /*Get subvolume which has both space and inodes more than the min criteria*/ xlator_t * -dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol) +dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) { int i = 0; double max = 0; double max_inodes = 0; + int ignore_subvol = 0; xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; @@ -296,6 +341,12 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol) conf = this->private; for(i=0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + if ((conf->disk_unit == 'p') && (conf->du_stats[i].avail_percent > conf->min_free_disk) && (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { @@ -325,10 +376,12 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol) /* Get subvol which has atleast one inode and maximum space */ xlator_t * -dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol) +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) { int i = 0; double max = 0; + int ignore_subvol = 0; xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; @@ -336,6 +389,12 @@ dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol) conf = this->private; for (i = 0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + if (conf->disk_unit == 'p') { if ((conf->du_stats[i].avail_percent > max) && (conf->du_stats[i].avail_inodes > 0 )) { diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index 23fc1ff96..311a48112 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -18,6 +18,28 @@ #include "xlator.h" #include "dht-common.h" +static inline int +dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol) +{ + uint64_t tmp_subvol = 0; + + tmp_subvol = (long)subvol; + return inode_ctx_set1 (inode, this, &tmp_subvol); +} + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol) +{ + int ret = -1; + uint64_t tmp_subvol = 0; + + ret = inode_ctx_get1 (inode, this, &tmp_subvol); + if (tmp_subvol && subvol) + *subvol = (xlator_t *)tmp_subvol; + + return ret; +} + int dht_frame_return (call_frame_t *frame) @@ -340,20 +362,6 @@ out: return local; } - -char * -basestr (const char *str) -{ - char *basestr = NULL; - - basestr = strrchr (str, '/'); - if (basestr) - basestr ++; - - return basestr; -} - - xlator_t * dht_first_up_subvol (xlator_t *this) { @@ -505,7 +513,36 @@ out: return next; } +/* This func wraps around, if prev is actually the last subvol. + */ +xlator_t * +dht_subvol_next_available (xlator_t *this, xlator_t *prev) +{ + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + + conf = this->private; + if (!conf) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + /* if prev is last in conf->subvolumes, then wrap + * around. + */ + if ((i + 1) < conf->subvolume_cnt) { + next = conf->subvolumes[i + 1]; + } else { + next = conf->subvolumes[0]; + } + break; + } + } + +out: + return next; +} int dht_subvol_cnt (xlator_t *this, xlator_t *subvol) { @@ -698,6 +735,10 @@ dht_migration_complete_check_task (void *data) loc_t tmp_loc = {0,}; char *path = NULL; dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + uint64_t tmp_subvol = 0; + int open_failed = 0; this = THIS; frame = data; @@ -709,6 +750,8 @@ dht_migration_complete_check_task (void *data) if (!local->loc.inode && !local->fd) goto out; + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr * as root:root. If a fd is already open, access check wont be done*/ @@ -771,10 +814,7 @@ dht_migration_complete_check_task (void *data) /* update inode ctx (the layout) */ dht_layout_unref (this, local->layout); - if (!local->loc.inode) - ret = dht_layout_preset (this, dst_node, local->fd->inode); - else - ret = dht_layout_preset (this, dst_node, local->loc.inode); + ret = dht_layout_preset (this, dst_node, inode); if (ret != 0) { gf_log (this->name, GF_LOG_DEBUG, "%s: could not set preset layout for subvol %s", @@ -792,10 +832,7 @@ dht_migration_complete_check_task (void *data) goto out; } - if (!local->loc.inode) - ret = dht_layout_set (this, local->fd->inode, layout); - else - ret = dht_layout_set (this, local->loc.inode, layout); + ret = dht_layout_set (this, inode, layout); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set the new layout", @@ -806,41 +843,46 @@ dht_migration_complete_check_task (void *data) local->cached_subvol = dst_node; ret = 0; - if (!local->fd) + /* once we detect the migration complete, the inode-ctx2 is no more + required.. delete the ctx and also, it means, open() already + done on all the fd of inode */ + ret = inode_ctx_reset1 (inode, this, &tmp_subvol); + if (tmp_subvol) goto out; - /* once we detect the migration complete, the fd-ctx is no more - required.. delete the ctx */ - ret = fd_ctx_del (local->fd, this, NULL); - if (!ret) + + if (list_empty (&inode->fd_list)) goto out; /* perform open as root:root. There is window between linkfile * creation(root:root) and setattr with the correct uid/gid */ SYNCTASK_SETID(0, 0); - /* if 'local->fd' (ie, fd based operation), send a 'open()' on - destination if not already done */ - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; - ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - GF_FREE (path); + /* perform 'open()' on all the fd's present on the inode */ + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + + ret = syncop_open (dst_node, &tmp_loc, + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } + GF_FREE (path); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + if (open_failed) { + ret = -1; goto out; } - ret = 0; out: @@ -885,6 +927,9 @@ dht_rebalance_inprogress_task (void *data) struct iatt stbuf = {0,}; loc_t tmp_loc = {0,}; dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + int open_failed = 0; this = THIS; frame = data; @@ -896,6 +941,8 @@ dht_rebalance_inprogress_task (void *data) if (!local->loc.inode && !local->fd) goto out; + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr * as root:root. If a fd is already open, access check wont be done*/ if (local->loc.inode) { @@ -947,35 +994,47 @@ dht_rebalance_inprogress_task (void *data) } ret = 0; + + if (list_empty (&inode->fd_list)) + goto done; + /* perform open as root:root. There is window between linkfile * creation(root:root) and setattr with the correct uid/gid */ SYNCTASK_SETID (0, 0); - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; + + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - GF_FREE (path); + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to send open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } + GF_FREE (path); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + + if (open_failed) { + ret = -1; goto out; } - SYNCTASK_SETID (frame->root->uid, frame->root->gid); - ret = fd_ctx_set (local->fd, this, (uint64_t)(long)dst_node); +done: + ret = dht_inode_ctx_set1 (this, inode, dst_node); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set fd-ctx target file at %s", + "%s: failed to set inode-ctx target file at %s", local->loc.path, dst_node->name); goto out; } diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c index d9b311fe2..ece84151a 100644 --- a/xlators/cluster/dht/src/dht-inode-read.c +++ b/xlators/cluster/dht/src/dht-inode-read.c @@ -130,10 +130,11 @@ int dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) { - uint64_t tmp_subvol = 0; + xlator_t *subvol = 0; dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + inode_t *inode = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -157,19 +158,20 @@ dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; /* Check if the rebalance phase2 is true */ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (ret) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { /* Phase 2 of migration */ local->rebalance.target_op_fn = dht_attr2; ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; } else { /* value is already set in fd_ctx, that means no need to check for whether its complete or not. */ dht_attr2 (this, frame, 0); - } - if (!ret) return 0; + } } out: @@ -382,6 +384,8 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { dht_local_t *local = NULL; int ret = 0; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; if (!local) { @@ -400,17 +404,18 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { /* File would be migrated to other node */ - ret = fd_ctx_get (local->fd, this, NULL); - if (ret) { + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { local->rebalance.target_op_fn = dht_readv2; ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; } else { /* value is already set in fd_ctx, that means no need to check for whether its complete or not. */ dht_readv2 (this, frame, 0); - } - if (!ret) return 0; + } } out: @@ -501,18 +506,27 @@ dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int ret = -1; dht_local_t *local = NULL; xlator_t *subvol = NULL; + call_frame_t *prev = NULL; local = frame->local; + prev = cookie; + if (!prev || !prev->this) + goto out; if (local->call_cnt != 1) goto out; if ((op_ret == -1) && (op_errno == ENOTCONN) && IA_ISDIR(local->loc.inode->ia_type)) { - subvol = dht_first_up_subvol (this); + subvol = dht_subvol_next_available (this, prev->this); if (!subvol) goto out; + /* check if we are done with visiting every node */ + if (subvol == local->cached_subvol) { + goto out; + } + STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, &local->loc, local->rebalance.flags, NULL); return 0; @@ -607,8 +621,9 @@ int dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - int ret = -1; + dht_local_t *local = NULL; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; @@ -618,8 +633,8 @@ dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; /* If context is set, then send flush() it to the destination */ - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { dht_flush2 (this, frame, 0); return 0; } @@ -635,14 +650,10 @@ dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -704,6 +715,8 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; prev = cookie; @@ -725,8 +738,8 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, } local->op_errno = op_errno; - ret = fd_ctx_get (local->fd, this, NULL); - if (ret) { + dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { local->rebalance.target_op_fn = dht_fsync2; /* Check if the rebalance phase1 is true */ @@ -741,11 +754,12 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { ret = dht_rebalance_complete_check (this, frame); } + if (!ret) + return 0; } else { dht_fsync2 (this, frame, 0); - } - if (!ret) return 0; + } out: DHT_STRIP_PHASE1_FLAGS (postbuf); @@ -761,15 +775,10 @@ dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; - + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index 9bcd84ae1..4b3f3a049 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -21,6 +21,7 @@ int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret); int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret); int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret); int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret); int dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -29,6 +30,7 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { dht_local_t *local = NULL; int ret = -1; + xlator_t *subvol = NULL; if (op_ret == -1 && (op_errno != ENOENT)) { goto out; @@ -65,8 +67,8 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { dht_writev2 (this, frame, 0); return 0; } @@ -90,14 +92,10 @@ dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -172,6 +170,8 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + xlator_t *subvol = NULL; + inode_t *inode = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -213,8 +213,9 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret || !local->fd) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { dht_truncate2 (this, frame, 0); return 0; } @@ -238,16 +239,13 @@ dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; + inode_t *inode = NULL; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + inode = local->fd ? local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -359,6 +357,7 @@ dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + xlator_t *subvol = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -398,8 +397,8 @@ dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { dht_fallocate2 (this, frame, 0); return 0; } @@ -422,15 +421,10 @@ dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -497,6 +491,7 @@ dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + xlator_t *subvol = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -536,8 +531,8 @@ dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { dht_discard2 (this, frame, 0); return 0; } @@ -560,15 +555,10 @@ dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -624,6 +614,141 @@ err: return 0; } +int +dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_zerofill2; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + ret = fd_ctx_get (local->fd, this, NULL); + if (!ret) { + dht_zerofill2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + uint64_t tmp_subvol = 0; + int ret = -1; + + local = frame->local; + + if (local->fd) + ret = fd_ctx_get (local->fd, this, &tmp_subvol); + if (!ret) + subvol = (xlator_t *)(long)tmp_subvol; + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + + /* handle cases of migration here for 'setattr()' calls */ int dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -675,15 +800,13 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; + inode_t *inode = NULL; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + inode = (local->fd) ? local->fd->inode : local->loc.inode; + + dht_inode_ctx_get1 (this, inode, &subvol); if (!subvol) subvol = local->cached_subvol; diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 07e8cbae4..38e9970a7 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -454,12 +454,19 @@ dht_layout_entry_cmp (dht_layout_t *layout, int i, int j) { int64_t diff = 0; + /* swap zero'ed out layouts to front, if needed */ + if (!layout->list[j].start && !layout->list[j].stop) { + diff = (int64_t) layout->list[i].stop + - (int64_t) layout->list[j].stop; + goto out; + } if (layout->list[i].err || layout->list[j].err) diff = layout->list[i].err - layout->list[j].err; else diff = (int64_t) layout->list[i].start - (int64_t) layout->list[j].start; +out: return diff; } @@ -534,13 +541,13 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, case -1: case ENOENT: missing++; - break; + continue; case ENOTCONN: down++; - break; + continue; case ENOSPC: no_space++; - break; + continue; case 0: /* if err == 0 and start == stop, then it is a non misc++; * participating subvolume(spread-cnt). Then, do not @@ -552,6 +559,7 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, break; default: misc++; + continue; } is_virgin = 0; @@ -593,8 +601,7 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, int -dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout, - uint32_t *missing_p) +dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) { int ret = 0; int i = 0; @@ -606,7 +613,6 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout, ret = dht_layout_sort (layout); if (ret == -1) { - /* defensive coding; this can't happen currently */ gf_log (this->name, GF_LOG_WARNING, "sort failed?! how the ...."); goto out; @@ -616,26 +622,23 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout, &holes, &overlaps, &missing, &down, &misc, NULL); if (ret == -1) { - /* defensive coding; this can't happen currently */ gf_log (this->name, GF_LOG_WARNING, "error while finding anomalies in %s -- not good news", loc->path); goto out; } - ret = holes + overlaps; - if (ret) { + if (holes || overlaps) { if (missing == layout->cnt) { gf_log (this->name, GF_LOG_DEBUG, "directory %s looked up first time", loc->path); } else { gf_log (this->name, GF_LOG_INFO, - "found anomalies in %s. holes=%d overlaps=%d" - " missing=%d down=%d misc=%d", - loc->path, holes, overlaps, missing, down, - misc); + "found anomalies in %s. holes=%d overlaps=%d", + loc->path, holes, overlaps); } + ret = -1; } for (i = 0; i < layout->cnt; i++) { @@ -650,14 +653,14 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout, (layout->list[i].xlator ? layout->list[i].xlator->name : "<>")); + if ((layout->list[i].err == ENOENT) && (ret >= 0)) { + ret++; + } } } out: - if (missing_p) { - *missing_p = missing; - } return ret; } diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c index ae5bd49d9..dbc9d0b3c 100644 --- a/xlators/cluster/dht/src/dht-linkfile.c +++ b/xlators/cluster/dht/src/dht-linkfile.c @@ -297,9 +297,7 @@ dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this) GF_VALIDATE_OR_GOTO ("dht", local, out); GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out); - if ((local->stbuf.ia_type == IA_INVAL) || - (is_equal (frame->root->uid, local->stbuf.ia_uid) && - is_equal (frame->root->gid, local->stbuf.ia_gid))) + if (local->stbuf.ia_type == IA_INVAL) return 0; uuid_copy (local->loc.gfid, local->stbuf.ia_gfid); diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index a9f913c2d..bcb19f23e 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -243,7 +243,7 @@ out: static inline int __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf, - dict_t *dict, fd_t **dst_fd) + dict_t *dict, fd_t **dst_fd, dict_t *xattr) { xlator_t *this = NULL; int ret = -1; @@ -307,6 +307,12 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc goto out; } + ret = syncop_fsetxattr (to, fd, xattr, 0); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set xattr on %s (%s)", + loc->path, to->name, strerror (errno)); + ret = syncop_ftruncate (to, fd, stbuf->ia_size); if (ret < 0) gf_log (this->name, GF_LOG_ERROR, @@ -340,6 +346,9 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, int ret = -1; xlator_t *this = NULL; + uint64_t src_statfs_blocks = 1; + uint64_t dst_statfs_blocks = 1; + this = THIS; ret = syncop_statfs (from, loc, &src_statfs); @@ -363,22 +372,34 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, if (flag != GF_DHT_MIGRATE_DATA) goto check_avail_space; - if (((dst_statfs.f_bavail * - dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) < - (((src_statfs.f_bavail * src_statfs.f_bsize) / - GF_DISK_SECTOR_SIZE) - stbuf->ia_blocks)) { - gf_log (this->name, GF_LOG_WARNING, - "data movement attempted from node (%s) with" - " higher disk space to a node (%s) with " - "lesser disk space (%s)", from->name, - to->name, loc->path); - - /* this is not a 'failure', but we don't want to - consider this as 'success' too :-/ */ - ret = 1; - goto out; + /* Check: + During rebalance `migrate-data` - Destination subvol experiences + a `reduction` in 'blocks' of free space, at the same time source + subvol gains certain 'blocks' of free space. A valid check is + necessary here to avoid errorneous move to destination where + the space could be scantily available. + */ + if (stbuf) { + dst_statfs_blocks = ((dst_statfs.f_bavail * + dst_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + src_statfs_blocks = ((src_statfs.f_bavail * + src_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + if ((dst_statfs_blocks - stbuf->ia_blocks) < + (src_statfs_blocks + stbuf->ia_blocks)) { + gf_log (this->name, GF_LOG_WARNING, + "data movement attempted from node (%s) with" + " higher disk space to a node (%s) with " + "lesser disk space (%s)", from->name, + to->name, loc->path); + + /* this is not a 'failure', but we don't want to + consider this as 'success' too :-/ */ + ret = 1; + goto out; + } } - check_avail_space: if (((dst_statfs.f_bavail * dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) { @@ -620,6 +641,15 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, } done: + ret = syncop_setattr (to, loc, buf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | + GF_SET_ATTR_MODE), NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform setattr on %s (%s)", + loc->path, to->name, strerror (errno)); + } + ret = syncop_unlink (from, loc); if (ret) gf_log (this->name, GF_LOG_WARNING, "%s: unlink failed (%s)", @@ -699,9 +729,16 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + /* TODO: move all xattr related operations to fd based operations */ + ret = syncop_listxattr (from, loc, &xattr); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to get xattr from %s (%s)", + loc->path, from->name, strerror (errno)); + /* create the destination, with required modes/xattr */ ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf, - dict, &dst_fd); + dict, &dst_fd, xattr); if (ret) goto out; @@ -718,6 +755,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + ret = syncop_fstat (from, src_fd, &stbuf); if (ret) { gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s (%s)", @@ -747,19 +785,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } - /* TODO: move all xattr related operations to fd based operations */ - ret = syncop_listxattr (from, loc, &xattr); - if (ret == -1) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to get xattr from %s (%s)", - loc->path, from->name, strerror (errno)); - - ret = syncop_setxattr (to, loc, xattr, 0); - if (ret == -1) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set xattr on %s (%s)", - loc->path, to->name, strerror (errno)); - /* TODO: Sync the locks */ ret = syncop_fsync (to, dst_fd, 0); @@ -823,6 +848,23 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + /* Free up the data blocks on the source node, as the whole + file is migrated */ + ret = syncop_ftruncate (from, src_fd, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform truncate on %s (%s)", + loc->path, from->name, strerror (errno)); + } + + /* remove the 'linkto' xattr from the destination */ + ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform removexattr on %s (%s)", + loc->path, to->name, strerror (errno)); + } + /* Do a stat and check the gfid before unlink */ ret = syncop_stat (from, loc, &empty_iatt); if (ret) { @@ -843,23 +885,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, } } - /* Free up the data blocks on the source node, as the whole - file is migrated */ - ret = syncop_ftruncate (from, src_fd, 0); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform truncate on %s (%s)", - loc->path, from->name, strerror (errno)); - } - - /* remove the 'linkto' xattr from the destination */ - ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform removexattr on %s (%s)", - loc->path, to->name, strerror (errno)); - } - ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL); if (ret) { gf_log (this->name, GF_LOG_DEBUG, @@ -1101,6 +1126,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, struct timeval end = {0,}; double elapsed = {0,}; struct timeval start = {0,}; + int32_t err = 0; gf_log (this->name, GF_LOG_INFO, "migrate data called on %s", loc->path); @@ -1258,9 +1284,21 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, ret = syncop_setxattr (this, &entry_loc, migrate_data, 0); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "migrate-data" - " failed for %s", entry_loc.path); - defrag->total_failures +=1; + err = op_errno; + /* errno is overloaded. See + * rebalance_task_completion () */ + if (err != ENOSPC) { + gf_log (this->name, GF_LOG_DEBUG, + "migrate-data skipped for %s" + " due to space constraints", + entry_loc.path); + defrag->skipped +=1; + } else{ + gf_log (this->name, GF_LOG_ERROR, + "migrate-data failed for %s", + entry_loc.path); + defrag->total_failures +=1; + } } if (ret == -1) { @@ -1659,6 +1697,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) uint64_t size = 0; uint64_t lookup = 0; uint64_t failures = 0; + uint64_t skipped = 0; char *status = ""; double elapsed = 0; struct timeval end = {0,}; @@ -1675,6 +1714,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) size = defrag->total_data; lookup = defrag->num_files_lookedup; failures = defrag->total_failures; + skipped = defrag->skipped; gettimeofday (&end, NULL); @@ -1698,6 +1738,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) gf_log (THIS->name, GF_LOG_WARNING, "failed to set lookedup file count"); + ret = dict_set_int32 (dict, "status", defrag->defrag_status); if (ret) gf_log (THIS->name, GF_LOG_WARNING, @@ -1710,6 +1751,14 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) } ret = dict_set_uint64 (dict, "failures", failures); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set failure count"); + + ret = dict_set_uint64 (dict, "skipped", skipped); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set skipped file count"); log: switch (defrag->defrag_status) { case GF_DEFRAG_STATUS_NOT_STARTED: @@ -1727,13 +1776,15 @@ log: case GF_DEFRAG_STATUS_FAILED: status = "failed"; break; + default: + break; } gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f " "secs", status, elapsed); gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %" - PRIu64", lookups: %"PRIu64", failures: %"PRIu64, files, size, - lookup, failures); + PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: " + "%"PRIu64, files, size, lookup, failures, skipped); out: diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c index 08965976b..5d6f4f232 100644 --- a/xlators/cluster/dht/src/dht-rename.c +++ b/xlators/cluster/dht/src/dht-rename.c @@ -306,7 +306,19 @@ err: NULL, NULL); return 0; } - +#define DHT_MARK_FOP_INTERNAL(xattr) do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new (); \ + if (!xattr) \ + break; \ + } \ + tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \ + if (tmp) { \ + gf_log (this->name, GF_LOG_ERROR, "Failed to set" \ + " internal dict key for %s", local->loc.path); \ + } \ + }while (0) int dht_rename_done (call_frame_t *frame, xlator_t *this) { @@ -377,7 +389,7 @@ dht_rename_cleanup (call_frame_t *frame) xlator_t *dst_hashed = NULL; xlator_t *dst_cached = NULL; int call_cnt = 0; - + dict_t *xattr = NULL; local = frame->local; this = frame->this; @@ -401,13 +413,15 @@ dht_rename_cleanup (call_frame_t *frame) if (!call_cnt) goto nolinks; + DHT_MARK_FOP_INTERNAL (xattr); + if (dst_hashed != src_hashed && dst_hashed != src_cached) { gf_log (this->name, GF_LOG_TRACE, "unlinking linkfile %s @ %s => %s", local->loc.path, dst_hashed->name, src_cached->name); STACK_WIND (frame, dht_rename_unlink_cbk, dst_hashed, dst_hashed->fops->unlink, - &local->loc, 0, NULL); + &local->loc, 0, xattr); } if (src_cached != dst_hashed) { @@ -416,9 +430,12 @@ dht_rename_cleanup (call_frame_t *frame) local->loc2.path, src_cached->name); STACK_WIND (frame, dht_rename_unlink_cbk, src_cached, src_cached->fops->unlink, - &local->loc2, 0, NULL); + &local->loc2, 0, xattr); } + if (xattr) + dict_unref (xattr); + return 0; nolinks: @@ -482,6 +499,7 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, xlator_t *rename_subvol = NULL; call_frame_t *link_frame = NULL; dht_local_t *link_local = NULL; + dict_t *xattr = NULL; local = frame->local; prev = cookie; @@ -565,6 +583,8 @@ err: if (local->call_cnt == 0) goto unwind; + DHT_MARK_FOP_INTERNAL (xattr); + if (src_cached != dst_hashed && src_cached != dst_cached) { gf_log (this->name, GF_LOG_TRACE, "deleting old src datafile %s @ %s", @@ -572,7 +592,7 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, src_cached, src_cached->fops->unlink, - &local->loc, 0, NULL); + &local->loc, 0, xattr); } if (src_hashed != rename_subvol && src_hashed != src_cached) { @@ -582,7 +602,7 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, src_hashed, src_hashed->fops->unlink, - &local->loc, 0, NULL); + &local->loc, 0, xattr); } if (dst_cached @@ -594,8 +614,10 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, dst_cached, dst_cached->fops->unlink, - &local->loc2, 0, NULL); + &local->loc2, 0, xattr); } + if (xattr) + dict_unref (xattr); return 0; unwind: @@ -603,12 +625,16 @@ unwind: WIPE (&local->postoldparent); WIPE (&local->preparent); WIPE (&local->postparent); + if (xattr) + dict_unref (xattr); dht_rename_done (frame, this); return 0; cleanup: + if (xattr) + dict_unref (xattr); dht_rename_cleanup (frame); return 0; @@ -742,6 +768,7 @@ dht_rename_create_links (call_frame_t *frame) xlator_t *dst_hashed = NULL; xlator_t *dst_cached = NULL; int call_cnt = 0; + dict_t *xattr = NULL; local = frame->local; @@ -752,6 +779,7 @@ dht_rename_create_links (call_frame_t *frame) dst_hashed = local->dst_hashed; dst_cached = local->dst_cached; + DHT_MARK_FOP_INTERNAL (xattr); if (src_cached == dst_cached) { if (dst_hashed == dst_cached) @@ -763,7 +791,7 @@ dht_rename_create_links (call_frame_t *frame) STACK_WIND (frame, dht_rename_unlink_links_cbk, dst_hashed, dst_hashed->fops->unlink, - &local->loc2, 0, NULL); + &local->loc2, 0, xattr); return 0; } @@ -790,7 +818,7 @@ dht_rename_create_links (call_frame_t *frame) local->loc2.path, src_cached->name); STACK_WIND (frame, dht_rename_links_cbk, src_cached, src_cached->fops->link, - &local->loc, &local->loc2, NULL); + &local->loc, &local->loc2, xattr); } nolinks: @@ -798,6 +826,8 @@ nolinks: /* skip to next step */ dht_do_rename (frame); } + if (xattr) + dict_unref (xattr); return 0; } diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index b220a0e25..3fe96b1c7 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -17,6 +17,7 @@ #include "glusterfs.h" #include "xlator.h" #include "dht-common.h" +#include "glusterfs-acl.h" #define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \ layout->list[i].start = srt; \ @@ -564,9 +565,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1 || err == 0) { - layout->list[i].err = -1; + if (err == -1 || err == 0 || err == ENOENT) { + /* Setting list[i].err = -1 is an indication for + dht_selfheal_layout_new_directory() to assign + a range. We set it to -1 based on any one of + the three criteria: + + - err == -1 already, which means directory + existed but layout was not set on it. + + - err == 0, which means directory exists and + has an old layout piece which will be + overwritten now. + + - err == ENOENT, which means directory does + not exist (possibly racing with mkdir or + finishing half done mkdir). The missing + directory will be attempted to be recreated. + + It is important to note that it is safe + to race with mkdir() as self-heal and + mkdir are idempotent operations. Both will + strive to set the directory and layouts to + the same final state. + */ count++; + if (!err) + layout->list[i].err = -1; } } @@ -776,7 +801,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, DHT_RESET_LAYOUT_RANGE (layout); for (i = start_subvol; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1) { + if (err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { @@ -789,7 +814,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, for (i = 0; i < start_subvol; i++) { err = layout->list[i].err; - if (err == -1) { + if (err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 6b91c56a1..70aac7710 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -262,6 +262,28 @@ out: return ret; } + +int +dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf) +{ + int i = 0; + int ret = -1; + + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i]) { + conf->decommissioned_bricks[i] = NULL; + conf->decommission_subvols_cnt--; + } + } + + ret = 0; +out: + + return ret; +} void dht_init_regex (xlator_t *this, dict_t *odict, char *name, regex_t *re, gf_boolean_t *re_valid) @@ -358,6 +380,10 @@ dht_reconfigure (xlator_t *this, dict_t *options) ret = dht_parse_decommissioned_bricks (this, conf, temp_str); if (ret == -1) goto out; + } else { + ret = dht_decommissioned_remove (this, conf); + if (ret == -1) + goto out; } dht_init_regex (this, options, "rsync-hash-regex", diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index 0349b63a9..fc0ca2f77 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -72,6 +72,7 @@ struct xlator_fops fops = { .fsetattr = dht_fsetattr, .fallocate = dht_fallocate, .discard = dht_discard, + .zerofill = dht_zerofill, }; struct xlator_dumpops dumpops = { diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index 5e5c68058..e934acdf0 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -323,7 +323,8 @@ nufa_create (call_frame_t *frame, xlator_t *this, if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { avail_subvol = dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); + (xlator_t *)conf->private, + local); } if (subvol != avail_subvol) { @@ -427,7 +428,8 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { avail_subvol = dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); + (xlator_t *)conf->private, + local); } if (avail_subvol != subvol) { @@ -482,97 +484,141 @@ same_first_part (char *str1, char term1, char *str2, char term2) } } +typedef struct nufa_args { + xlator_t *this; + char *volname; + gf_boolean_t addr_match; +} nufa_args_t; + +static void +nufa_find_local_brick (xlator_t *xl, void *data) +{ + nufa_args_t *args = data; + xlator_t *this = args->this; + char *local_volname = args->volname; + gf_boolean_t addr_match = args->addr_match; + char *brick_host = NULL; + dht_conf_t *conf = this->private; + int ret = -1; + + /*This means a local subvol was already found. We pick the first brick + * that is local*/ + if (conf->private) + return; + + if (strcmp (xl->name, local_volname) == 0) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using specified subvol %s", + local_volname); + return; + } + + if (!addr_match) + return; + + ret = dict_get_str (xl->options, "remote-host", &brick_host); + if ((ret == 0) && + (gf_is_same_address (local_volname, brick_host) || + gf_is_local_addr (brick_host))) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using the first local " + "subvol %s", xl->name); + return; + } + +} + +static void +nufa_to_dht (xlator_t *this) +{ + GF_ASSERT (this); + GF_ASSERT (this->fops); + + this->fops->lookup = dht_lookup; + this->fops->create = dht_create; + this->fops->mknod = dht_mknod; +} + +int +nufa_find_local_subvol (xlator_t *this, + void (*fn) (xlator_t *each, void* data), void *data) +{ + int ret = -1; + dht_conf_t *conf = this->private; + xlator_list_t *trav = NULL; + xlator_t *parent = NULL; + xlator_t *candidate = NULL; + + xlator_foreach_depth_first (this, fn, data); + if (!conf->private) { + gf_log (this->name, GF_LOG_ERROR, "Couldn't find a local " + "brick"); + return -1; + } + + candidate = conf->private; + trav = candidate->parents; + while (trav) { + + parent = trav->xlator; + if (strcmp (parent->type, "cluster/nufa") == 0) { + gf_log (this->name, GF_LOG_INFO, "Found local subvol, " + "%s", candidate->name); + ret = 0; + conf->private = candidate; + break; + } + + candidate = parent; + trav = parent->parents; + } + + return ret; +} + int nufa_init (xlator_t *this) { - dht_conf_t *conf = NULL; - xlator_list_t *trav = NULL; data_t *data = NULL; char *local_volname = NULL; int ret = -1; char my_hostname[256]; - xlator_t *local_subvol = NULL; - char *brick_host = NULL; - xlator_t *kid = NULL; + gf_boolean_t addr_match = _gf_false; + nufa_args_t args = {0, }; ret = dht_init(this); if (ret) { return ret; } - conf = this->private; - local_volname = "localhost"; - ret = gethostname (my_hostname, 256); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "could not find hostname (%s)", - strerror (errno)); - } + if ((data = dict_get (this->options, "local-volume-name"))) { + local_volname = data->data; - if (ret == 0) - local_volname = my_hostname; + } else { + addr_match = _gf_true; + local_volname = "localhost"; + ret = gethostname (my_hostname, 256); + if (ret == 0) + local_volname = my_hostname; - data = dict_get (this->options, "local-volume-name"); - if (data) { - local_volname = data->data; - } + else + gf_log (this->name, GF_LOG_WARNING, + "could not find hostname (%s)", + strerror (errno)); - for (trav = this->children; trav; trav = trav->next) { - if (strcmp (trav->xlator->name, local_volname) == 0) - break; - if (local_subvol) { - continue; - } - kid = trav->xlator; - for (;;) { - if (dict_get_str(trav->xlator->options,"remote-host", - &brick_host) == 0) { - /* Found it. */ - break; - } - if (!kid->children) { - /* Nowhere further to look. */ - gf_log (this->name, GF_LOG_ERROR, - "could not get remote-host"); - goto err; - } - if (kid->children->next) { - /* Multiple choices, can't/shouldn't decide. */ - gf_log (this->name, GF_LOG_ERROR, - "NUFA found fan-out (type %s) volume", - kid->type); - goto err; - } - /* One-to-one xlators are OK, try the next one. */ - kid = kid->children->xlator; - } - if (same_first_part(my_hostname,'.',brick_host,'.')) { - local_subvol = trav->xlator; - } } - if (trav) { - gf_log (this->name, GF_LOG_INFO, - "Using specified subvol %s", local_volname); - conf->private = trav->xlator; - } - else if (local_subvol) { + args.this = this; + args.volname = local_volname; + args.addr_match = addr_match; + ret = nufa_find_local_subvol (this, nufa_find_local_brick, &args); + if (ret) { gf_log (this->name, GF_LOG_INFO, - "Using first local subvol %s", local_subvol->name); - conf->private = local_subvol; + "Unable to find local subvolume, switching " + "to dht mode"); + nufa_to_dht (this); } - else { - gf_log (this->name, GF_LOG_ERROR, - "Could not find specified or local subvol"); - goto err; - - } - return 0; - -err: - dht_fini(this); - return -1; } diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c index 861012247..d3ea90ba8 100644 --- a/xlators/cluster/dht/src/switch.c +++ b/xlators/cluster/dht/src/switch.c @@ -437,7 +437,8 @@ switch_create (call_frame_t *frame, xlator_t *this, avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); if (dht_is_subvol_filled (this, avail_subvol)) { avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); + dht_free_disk_available_subvol (this, avail_subvol, + local); } if (subvol != avail_subvol) { @@ -536,7 +537,8 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); if (dht_is_subvol_filled (this, avail_subvol)) { avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); + dht_free_disk_available_subvol (this, avail_subvol, + local); } if (avail_subvol != subvol) { |
