summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--libglusterfs/src/glusterfs.h9
-rw-r--r--xlators/cluster/dht/src/dht-common.c559
-rw-r--r--xlators/cluster/dht/src/dht-common.h20
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c97
-rw-r--r--xlators/storage/posix/src/posix.c123
-rw-r--r--xlators/storage/posix/src/posix.h10
6 files changed, 742 insertions, 76 deletions
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index c6fc469ee4c..3e7aedde13d 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -149,6 +149,15 @@
#define GF_REBALANCE_TID_KEY "rebalance-id"
#define GF_REMOVE_BRICK_TID_KEY "remove-brick-id"
#define GF_REPLACE_BRICK_TID_KEY "replace-brick-id"
+#define DHT_SKIP_NON_LINKTO_UNLINK "unlink-only-if-dht-linkto-file" /* xdata key: posix_unlink fails with EBUSY unless the file has the linkto xattr and linkfile mode */
+#define DHT_SKIP_OPEN_FD_UNLINK "dont-unlink-for-open-fd" /* xdata key: posix_unlink fails with EBUSY when the inode has open fds */
+
+#define DHT_LINKFILE_MODE (S_ISVTX) /* permission bits of a linkto file: sticky bit only */
+ /* IS_DHT_LINKFILE_MODE(iabuf): true when the iatt's mode, with the file-type
+  * bits (S_IFMT) masked off, is exactly DHT_LINKFILE_MODE. */
+#define IS_DHT_LINKFILE_MODE(iabuf) ((st_mode_from_ia ((iabuf)->ia_prot, \
+ (iabuf)->ia_type) \
+ & ~S_IFMT) \
+ == DHT_LINKFILE_MODE)
/* NOTE: add members ONLY at the end (just before _MAXVALUE) */
typedef enum {
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 5f7996a9ad6..c5105d27b91 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -442,7 +442,8 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
is_dir = check_is_dir (inode, stbuf, xattr);
if (!is_dir) {
gf_log (this->name, GF_LOG_DEBUG,
- "lookup of %s on %s returned non dir 0%o",
+ "lookup of %s on %s returned non dir 0%o "
+ "calling lookup_everywhere",
local->loc.path, prev->this->name,
stbuf->ia_type);
local->need_selfheal = 1;
@@ -541,6 +542,12 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
LOCK (&frame->lock);
{
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "revalidate lookup of %s "
+ "returned with op_ret %d and op_errno %d",
+ local->loc.path, op_ret, op_errno);
+
if (op_ret == -1) {
local->op_errno = op_errno;
@@ -564,6 +571,14 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
* the file is not migrated */
if (op_errno == ENOENT) {
if (IA_ISREG (local->loc.inode->ia_type)) {
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "found ENOENT for %s. "
+ "Setting "
+ "need_lookup_everywhere"
+ " flag to 1",
+ local->loc.path);
+
local->need_lookup_everywhere = 1;
}
}
@@ -760,9 +775,16 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
}
unwind:
+ gf_log (this->name, GF_LOG_DEBUG,
+ "creation of linkto on hashed subvol:%s, "
+ "returned with op_ret %d and op_errno %d: %s",
+ local->hashed_subvol->name,
+ op_ret, op_errno, uuid_utoa (local->loc.gfid));
+
if (local->linked == _gf_true)
dht_linkfile_attr_heal (frame, this);
+
DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
local->inode, &local->stbuf, local->xattr,
@@ -771,6 +793,176 @@ out:
return ret;
}
+/*
+ * Unlink callback used when dht_lookup_everywhere_cbk removes a linkto file
+ * it found on a subvolume.
+ *
+ * The unlink result is only logged; it does not change the outcome. Once the
+ * last outstanding reply for this frame has arrived, the lookup is completed
+ * through dht_lookup_everywhere_done(). Always returns 0.
+ */
+int
+dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno,
+                       struct iatt *preparent, struct iatt *postparent,
+                       dict_t *xdata)
+{
+        dht_local_t *local     = (dht_local_t *) frame->local;
+        const char  *log_path  = local->loc.path;
+        int          remaining = 0;
+
+        if (log_path == NULL)
+                log_path = "null";
+
+        gf_log (this->name, GF_LOG_INFO, "lookup_unlink returned with "
+                "op_ret -> %d and op-errno -> %d for %s", op_ret, op_errno,
+                log_path);
+
+        /* Other lookup/unlink replies may still be pending on this frame. */
+        remaining = dht_frame_return (frame);
+        if (!(is_last_call (remaining)))
+                return 0;
+
+        dht_lookup_everywhere_done (frame, this);
+        return 0;
+}
+
+/*
+ * Unlink callback for a linkto file on the hashed subvolume that does not
+ * point at the cached subvolume found by lookup-everywhere.
+ *
+ * On success the lookup is finished via dht_lookup_everywhere_done(). On
+ * failure the lookup is unwound with EIO (see the comment below for why).
+ * Always returns 0.
+ */
+int
+dht_lookup_unlink_of_false_linkto_cbk (call_frame_t *frame, void *cookie,
+                                       xlator_t *this, int op_ret, int op_errno,
+                                       struct iatt *preparent,
+                                       struct iatt *postparent, dict_t *xdata)
+{
+        dht_local_t *local    = (dht_local_t *) frame->local;
+        const char  *log_path = NULL;
+        int          pending  = 0;
+
+        log_path = (local->loc.path != NULL) ? local->loc.path : "null";
+
+        gf_log (this->name, GF_LOG_INFO, "lookup_unlink returned with "
+                "op_ret -> %d and op-errno -> %d for %s", op_ret, op_errno,
+                log_path);
+
+        pending = dht_frame_return (frame);
+        if (!(is_last_call (pending)))
+                return 0;
+
+        if (op_ret == 0) {
+                /* Stale linkto is gone; carry on with the normal decision. */
+                dht_lookup_everywhere_done (frame, this);
+                return 0;
+        }
+
+        /* lookup-everywhere found one cached file and one file on the hashed
+         * subvolume whose linkto did not point at that cached subvolume, so
+         * the hashed one was treated as stale and an unlink was attempted.
+         * The unlink failed, so a rebalance may be in progress. There are
+         * now effectively two candidate data files: the one found during
+         * lookup-everywhere and the one that could not be unlinked. We
+         * cannot tell which is authoritative -- the first cached file may be
+         * truncated once rebalance finishes, and picking it would break the
+         * application -- so fail the lookup with EIO.
+         */
+        if (op_errno == EBUSY) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not unlink the linkto file as "
+                        "either fd is open and/or linkto xattr "
+                        "is set for %s",
+                        log_path);
+        }
+
+        DHT_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Unlink callback for a linkto file on the hashed subvolume when no cached
+ * (data) file was found anywhere.
+ *
+ * Whatever the unlink returned, the lookup is unwound with ENOENT. In
+ * particular, an EBUSY from posix_unlink (open fd present, or the file is
+ * not a dht linkto file) is deliberately replaced by ENOENT.
+ * Always returns 0.
+ */
+int
+dht_lookup_unlink_stale_linkto_cbk (call_frame_t *frame, void *cookie,
+                                    xlator_t *this, int op_ret, int op_errno,
+                                    struct iatt *preparent,
+                                    struct iatt *postparent, dict_t *xdata)
+{
+        dht_local_t *local    = frame->local;
+        const char  *log_path = "null";
+
+        if (local != NULL && local->loc.path != NULL)
+                log_path = local->loc.path;
+
+        gf_log (this->name, GF_LOG_INFO, "Returned with op_ret %d and "
+                "op_errno %d for %s", op_ret, op_errno,
+                log_path);
+
+        DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL,
+                          NULL);
+
+        return 0;
+}
+
+/*
+ * Set both "guarded unlink" keys in @dict: DHT_SKIP_NON_LINKTO_UNLINK and
+ * DHT_SKIP_OPEN_FD_UNLINK, each with value 1.
+ *
+ * Returns 0 when both keys were set, -1 as soon as either dict_set_int32()
+ * call fails (the second key is not attempted if the first fails).
+ */
+int
+dht_fill_dict_to_avoid_unlink_of_migrating_file (dict_t *dict)
+{
+        if (dict_set_int32 (dict, DHT_SKIP_NON_LINKTO_UNLINK, 1) != 0)
+                return -1;
+
+        if (dict_set_int32 (dict, DHT_SKIP_OPEN_FD_UNLINK, 1) != 0)
+                return -1;
+
+        return 0;
+}
+/* Rebalance is performed from cached_node to hashed_node. Initial cached_node
+ * contains a non-linkto file. After migration it is converted to linkto and
+ * then unlinked. And at hashed_subvolume, first a linkto file is present,
+ * then after migration it is converted to a non-linkto file.
+ *
+ * Let's assume a file is present on the cached subvolume, a new brick is
+ * added, and that brick is the new hashed subvolume. A fresh lookup on the
+ * newly added hashed subvolume will fail and dht_lookup_everywhere gets
+ * called. If rebalance is in progress just before that request is sent,
+ *
+ * from cached subvolume it may see: Nonlinkto or linkto or No file
+ * from hashed subvolume it may see: No file or linkto file or non-linkto file
+ *
+ * So this boils down to 9 cases:
+ * at cached_subvol at hashed_subvol
+ * ---------------- -----------------
+ *
+ *a) No file No file
+ * [request reached after [Request reached before
+ * migration] Migration]
+ *
+ *b) No file Linkto File
+ *
+ *c) No file Non-Linkto File
+ *
+ *d) Linkto No-File
+ *
+ *e) Linkto Linkto
+ *
+ *f) Linkto Non-Linkto
+ *
+ *g) NonLinkto No-File
+ *
+ *h) NonLinkto Linkto
+ *
+ *i) NonLinkto NonLinkto
+ *
+ * dht_lookup_everywhere_done takes decision based on any of the above case
+ */
int
dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
@@ -780,6 +972,7 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
xlator_t *hashed_subvol = NULL;
xlator_t *cached_subvol = NULL;
dht_layout_t *layout = NULL;
+ gf_boolean_t found_non_linkto_on_hashed = _gf_false;
local = frame->local;
hashed_subvol = local->hashed_subvol;
@@ -801,19 +994,210 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
return 0;
}
+ gf_log (this->name, GF_LOG_INFO, "STATUS: hashed_subvol %s "
+ "cached_subvol %s",
+ (hashed_subvol == NULL)?"null":hashed_subvol->name,
+ (cached_subvol == NULL)?"null":cached_subvol->name);
+
if (!cached_subvol) {
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL,
- NULL);
+
+ if (local->skip_unlink.handle_valid_link && hashed_subvol) {
+
+ /* Purpose of "DHT_SKIP_NON_LINKTO_UNLINK":
+ * Suppose this lookup is performed by rebalance and this
+ * rebalance process detected the hashed (linkto) file, but
+ * by the time it sent the lookup request to the cached node
+ * the file had been migrated, so the initial hashed node
+ * now holds the final migrated file. With the old logic,
+ * because this process fails to find the cached node,
+ * it would unlink the file at the initial hashed node.
+ *
+ * We avoid this by setting the key and having posix_unlink
+ * remove the file only if it is a linkto file and not a
+ * migrated data file.
+ */
+
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file
+ (local->xattr_req);
+
+ if (ret) {
+ /* If for some reason, setting key in the dict
+ * fails, return with ENOENT, as with respect to
+ * this process, it detected only a stale link
+ * file.
+ *
+ * Next lookup will delete it.
+ *
+ * Deleting the stale link file when setting the
+ * key in the dict has failed may cause data
+ * loss because of the race described above.
+ */
+
+
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT,
+ NULL, NULL, NULL, NULL);
+ } else {
+ local->skip_unlink.handle_valid_link = _gf_false;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "No Cached was found and "
+ "unlink on hashed was skipped"
+ " so performing now: %s",
+ local->loc.path);
+
+ STACK_WIND (frame,
+ dht_lookup_unlink_stale_linkto_cbk,
+ hashed_subvol,
+ hashed_subvol->fops->unlink,
+ &local->loc, 0, local->xattr_req);
+ }
+
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "There was no cached file and "
+ "unlink on hashed is not skipped %s",
+ local->loc.path);
+
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL,
+ NULL, NULL);
+ }
return 0;
}
- if (local->need_lookup_everywhere) {
- if (uuid_compare (local->gfid, local->inode->gfid)) {
- /* GFID different, return error */
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL,
- NULL, NULL, NULL);
- return 0;
+ /* At the time of dht_lookup, no file was found on hashed and that is
+ * why dht_lookup_everywhere is called, but by the time
+ * dht_lookup_everywhere
+ * reached to server, file might have already migrated. In that case we
+ * will find a migrated file at the hashed_node. In this case store the
+ * layout in context and return successfully.
+ */
+
+ if (hashed_subvol || local->need_lookup_everywhere) {
+
+ if (local->need_lookup_everywhere) {
+
+ found_non_linkto_on_hashed = _gf_true;
+
+ } else if ((local->file_count == 1) &&
+ (hashed_subvol == cached_subvol)) {
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "found cached file on hashed subvolume "
+ "so store in context and return for %s",
+ local->loc.path);
+
+ found_non_linkto_on_hashed = _gf_true;
}
+
+ if (found_non_linkto_on_hashed)
+ goto preset_layout;
+
+ }
+
+
+ if (hashed_subvol) {
+ if (local->skip_unlink.handle_valid_link == _gf_true) {
+ if (cached_subvol == local->skip_unlink.hash_links_to) {
+
+                                if (uuid_compare (local->skip_unlink.cached_gfid,
+                                                  local->skip_unlink.hashed_gfid)){
+
+                                        /* GFID different, return error */
+                                        DHT_STACK_UNWIND (lookup, frame, -1,
+                                                          ESTALE, NULL, NULL, NULL,
+                                                          NULL);
+                                        /* The frame has been unwound: stop here so we
+                                         * neither touch local again nor unwind twice. */
+                                        return 0;
+                                }
+
+ ret = dht_layout_preset (this, cached_subvol,
+ local->loc.inode);
+ if (ret) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Could not set pre-set layout "
+ "for subvolume %s",
+ cached_subvol->name);
+ }
+
+ local->op_ret = (ret == 0) ? ret : -1;
+ local->op_errno = (ret == 0) ? ret : EINVAL;
+
+ /* Presence of local->cached_subvol validates
+ * that lookup from cached node is successful
+ */
+
+ if (!local->op_ret && local->loc.parent) {
+ dht_inode_ctx_time_update
+ (local->loc.parent, this,
+ &local->postparent, 1);
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Skipped unlinking linkto file "
+ "on the hashed subvolume. "
+ "Returning success as it is a "
+ "valid linkto file. Path:%s"
+ ,local->loc.path);
+
+ goto unwind_hashed_and_cached;
+ } else {
+
+ local->skip_unlink.handle_valid_link = _gf_false;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Linkto file found on hashed "
+ "subvol "
+ "and data file found on cached "
+ "subvolume. But linkto points to "
+ "different cached subvolume (%s) "
+ "path %s",
+ local->skip_unlink.hash_links_to->name,
+ local->loc.path);
+
+ if (local->skip_unlink.opend_fd_count == 0) {
+
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file
+ (local->xattr_req);
+
+
+ if (ret) {
+ DHT_STACK_UNWIND (lookup, frame, -1,
+ EIO, NULL, NULL,
+ NULL, NULL);
+ } else {
+ local->call_cnt = 1;
+ STACK_WIND (frame,
+ dht_lookup_unlink_of_false_linkto_cbk,
+ hashed_subvol,
+ hashed_subvol->fops->unlink,
+ &local->loc, 0,
+ local->xattr_req);
+ }
+
+ return 0;
+
+ }
+ }
+
+ }
+ }
+
+
+preset_layout:
+
+ if (found_non_linkto_on_hashed) {
+
+ if (local->need_lookup_everywhere) {
+ if (uuid_compare (local->gfid, local->inode->gfid)) {
+ /* GFID different, return error */
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT,
+ NULL, NULL, NULL, NULL);
+ return 0;
+ }
+ }
+
local->op_ret = 0;
local->op_errno = 0;
layout = dht_layout_for_subvol (this, cached_subvol);
@@ -890,26 +1274,15 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
cached_subvol, hashed_subvol, &local->loc);
return ret;
-}
-
-
-int
-dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
-{
- int this_call_cnt = 0;
-
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- dht_lookup_everywhere_done (frame, this);
- }
+unwind_hashed_and_cached:
+ DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
+ DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
+ local->loc.inode, &local->stbuf, local->xattr,
+ &local->postparent);
return 0;
}
-
int
dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -924,8 +1297,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
xlator_t *subvol = NULL;
loc_t *loc = NULL;
xlator_t *link_subvol = NULL;
- int ret = -1;
- int32_t fd_count = 0;
+ int ret = -1;
+ int32_t fd_count = 0;
+ dict_t *dict_req = {0};
GF_VALIDATE_OR_GOTO ("dht", frame, out);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -939,6 +1313,11 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
prev = cookie;
subvol = prev->this;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "returned with op_ret %d and op_errno %d (%s) "
+ "from subvol %s", op_ret, op_errno, loc->path,
+ subvol->name);
+
LOCK (&frame->lock);
{
if (op_ret == -1) {
@@ -957,6 +1336,13 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
is_linkfile = check_is_linkfile (inode, buf, xattr);
+ if (is_linkfile) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Found linktofile on %s for %s",
+ subvol->name, loc->path);
+
+ }
+
is_dir = check_is_dir (inode, buf, xattr);
if (is_linkfile) {
@@ -981,18 +1367,26 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
} else {
local->file_count++;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "found cached file on %s for %s",
+ subvol->name, loc->path);
+
if (!local->cached_subvol) {
/* found one file */
dht_iatt_merge (this, &local->stbuf, buf,
subvol);
local->xattr = dict_ref (xattr);
local->cached_subvol = subvol;
+
gf_log (this->name, GF_LOG_DEBUG,
- "found on %s file %s",
+ "datafile found on %s file %s",
subvol->name, loc->path);
dht_iatt_merge (this, &local->postparent,
postparent, subvol);
+
+ uuid_copy (local->skip_unlink.cached_gfid,
+ buf->ia_gfid);
} else {
/* This is where we need 'rename' both entries logic */
gf_log (this->name, GF_LOG_WARNING,
@@ -1009,15 +1403,68 @@ unlock:
if (is_linkfile) {
ret = dict_get_int32 (xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count);
- /* Delete the linkfile only if there are no open fds on it.
- if there is a open-fd, it may be in migration */
- if (!ret && (fd_count == 0)) {
- gf_log (this->name, GF_LOG_INFO,
- "deleting stale linkfile %s on %s",
- loc->path, subvol->name);
- STACK_WIND (frame, dht_lookup_unlink_cbk,
- subvol, subvol->fops->unlink, loc, 0, NULL);
- return 0;
+
+ /* Any linkto file found on a non-hashed subvolume should
+ * be unlinked (performed in the "else if" block below).
+ *
+ * But if a linkto file is found on the hashed subvolume, it may
+ * be pointing to a valid cached node. So unlinking the linkto
+ * file on the hashed subvolume is skipped here and the checks are
+ * performed inside dht_lookup_everywhere_done. If the linkto
+ * file turns out to be stale, it is deleted there;
+ * otherwise the unlink is skipped.
+ */
+
+ if (local->hashed_subvol && local->hashed_subvol == subvol) {
+
+ local->skip_unlink.handle_valid_link = _gf_true;
+ local->skip_unlink.opend_fd_count = fd_count;
+ local->skip_unlink.hash_links_to = link_subvol;
+ uuid_copy (local->skip_unlink.hashed_gfid,
+ buf->ia_gfid);
+
+ gf_log (this->name, GF_LOG_DEBUG, "Found"
+ " one linkto file on hashed subvol %s "
+ "for %s: Skipping unlinking till "
+ "everywhere_done", subvol->name,
+ loc->path);
+
+ } else if (!ret && (fd_count == 0)) {
+
+ dict_req = dict_new ();
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file
+ (dict_req);
+
+ if (ret) {
+
+ /* Skip unlinking on dict failure.
+ * The file was found as a linkto file on a non-hashed
+ * subvolume. In the current implementation,
+ * finding a linkto file on a non-hashed subvolume does
+ * not always imply that it is stale. So the file
+ * should be deleted only when both the fd is
+ * closed and the linkto xattr is set. If dict_set
+ * fails we cannot request that check, so do not unlink.
+ * NOTE: dht_frame_return should get called for
+ * this block.
+ */
+
+ dict_unref (dict_req);
+
+ } else {
+ gf_log (this->name, GF_LOG_INFO,
+ "attempting deletion of stale linkfile "
+ "%s on %s", loc->path, subvol->name);
+
+ STACK_WIND (frame, dht_lookup_unlink_cbk,
+ subvol, subvol->fops->unlink, loc,
+ 0, dict_req);
+
+ dict_unref (dict_req);
+
+ return 0;
+ }
}
}
@@ -1054,6 +1501,9 @@ dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc)
if (!local->inode)
local->inode = inode_ref (loc->inode);
+ gf_log (this->name, GF_LOG_DEBUG,
+ "winding lookup call to %d subvols", call_cnt);
+
for (i = 0; i < call_cnt; i++) {
STACK_WIND (frame, dht_lookup_everywhere_cbk,
conf->subvolumes[i],
@@ -1252,9 +1702,14 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (!op_ret && uuid_is_null (local->gfid))
memcpy (local->gfid, stbuf->ia_gfid, 16);
+ gf_log (this->name, GF_LOG_DEBUG,
+ "fresh_lookup returned for %s with op_ret %d and "
+ "op_errno %d", loc->path, op_ret, op_errno);
+
if (ENTRY_MISSING (op_ret, op_errno)) {
gf_log (this->name, GF_LOG_TRACE, "Entry %s missing on subvol"
" %s", loc->path, prev->this->name);
+
if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_ON) {
local->op_errno = ENOENT;
dht_lookup_everywhere (frame, this, loc);
@@ -1313,13 +1768,17 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
subvol = dht_linkfile_subvol (this, inode, stbuf, xattr);
if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "linkfile not having link subvolume. path=%s",
- loc->path);
+ gf_log (this->name, GF_LOG_INFO, "linkfile not having link "
+ "subvol for %s", loc->path);
+
dht_lookup_everywhere (frame, this, loc);
return 0;
}
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Calling lookup on linkto target %s for path %s",
+ subvol->name, loc->path);
+
STACK_WIND (frame, dht_lookup_linkfile_cbk,
subvol, subvol->fops->lookup,
&local->loc, local->xattr_req);
@@ -1465,6 +1924,13 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
dht_layout_unref (this, local->layout);
local->layout = NULL;
local->cached_subvol = NULL;
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "Called revalidate lookup for %s, "
+ "but layout->gen (%d) is less than "
+ "conf->gen (%d), calling fresh_lookup",
+ loc->path, layout->gen, conf->gen);
+
goto do_fresh_lookup;
}
@@ -1521,6 +1987,10 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
for (i = 0; i < call_cnt; i++) {
subvol = layout->list[i].xlator;
+ gf_log (this->name, GF_LOG_DEBUG, "calling "
+ "revalidate lookup for %s at %s",
+ loc->path, subvol->name);
+
STACK_WIND (frame, dht_revalidate_cbk,
subvol, subvol->fops->lookup,
&local->loc, local->xattr_req);
@@ -1565,6 +2035,7 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
"no subvolume in layout for path=%s, "
"checking on all the subvols to see if "
"it is a directory", loc->path);
+
call_cnt = conf->subvolume_cnt;
local->call_cnt = call_cnt;
@@ -1575,6 +2046,10 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
goto err;
}
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Found null hashed subvol. Calling lookup"
+ " on all nodes.");
+
for (i = 0; i < call_cnt; i++) {
STACK_WIND (frame, dht_lookup_dir_cbk,
conf->subvolumes[i],
@@ -1584,6 +2059,10 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
return 0;
}
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Calling fresh lookup for %s on"
+ " %s", loc->path, hashed_subvol->name);
+
STACK_WIND (frame, dht_lookup_cbk,
hashed_subvol, hashed_subvol->fops->lookup,
loc, local->xattr_req);
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 83725f09712..c7f20a28383 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -96,6 +96,15 @@ struct dht_rebalance_ {
dict_t *xdata;
};
+struct dht_skip_linkto_unlink {
+ /* Filled in by dht_lookup_everywhere_cbk and consumed by
+  * dht_lookup_everywhere_done to decide what to do with a linkto file
+  * seen on the hashed subvolume. */
+ gf_boolean_t handle_valid_link; /* set when a linkto file was found on the hashed subvolume and its unlink was deferred */
+ int opend_fd_count; /* GLUSTERFS_OPEN_FD_COUNT value read from that lookup reply's xattr */
+ xlator_t *hash_links_to; /* link_subvol recorded for the hashed linkto file (presumably the subvolume it points to) */
+ uuid_t cached_gfid; /* ia_gfid of the first data file found by lookup-everywhere */
+ uuid_t hashed_gfid; /* ia_gfid of the linkto file found on the hashed subvolume */
+};
+
struct dht_local {
int call_cnt;
loc_t loc;
@@ -184,6 +193,9 @@ struct dht_local {
xlator_t *first_up_subvol;
gf_boolean_t added_link;
+
+ struct dht_skip_linkto_unlink skip_unlink;
+
};
typedef struct dht_local dht_local_t;
@@ -752,4 +764,12 @@ dht_inodectx_dump (xlator_t *this, inode_t *inode);
int
dht_subvol_status (dht_conf_t *conf, xlator_t *subvol);
+void
+dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,
+ dht_layout_t *layout);
+int
+dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this);
+
+int
+dht_fill_dict_to_avoid_unlink_of_migrating_file (dict_t *dict);
#endif/* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index d6e34f92036..725e0c8c7b0 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -237,13 +237,15 @@ out:
}
static inline int
-__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf,
- dict_t *dict, fd_t **dst_fd)
+__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc,
+ struct iatt *stbuf, dict_t *dict,
+ fd_t **dst_fd)
{
- xlator_t *this = NULL;
- int ret = -1;
- fd_t *fd = NULL;
- struct iatt new_stbuf = {0,};
+ xlator_t *this = NULL;
+ int ret = -1;
+ fd_t *fd = NULL;
+ struct iatt new_stbuf = {0,};
+ struct iatt check_stbuf = {0,};
this = THIS;
@@ -300,6 +302,46 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
goto out;
}
+ /* Reason for doing a lookup again after create:
+ * In create there is a time gap between opening the fd at the
+ * server (posix layer) and binding it in the server (incrementing the
+ * fd count). If, in that gap, another process sends an unlink because
+ * it considers this a linkto file, inode->fd count will still be 0 and
+ * the file will be unlinked at the backend. Because further operations
+ * are performed on the fd, the migration would appear to succeed but
+ * would end with no file at the backend.
+ */
+
+
+ ret = syncop_lookup (to, loc, NULL, &check_stbuf, NULL, NULL);
+ if (!ret) {
+ if (uuid_compare (stbuf->ia_gfid, check_stbuf.ia_gfid) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "file %s exists in %s with different gfid,"
+ "found in lookup after create",
+ loc->path, to->name);
+ ret = -1;
+ fd_unref (fd);
+ goto out;
+ }
+
+ }
+
+ if (-ret == ENOENT) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: file does not exists"
+ "on %s (%s)", loc->path, to->name, strerror (-ret));
+ ret = -1;
+ fd_unref (fd);
+ goto out;
+ }
+
+ ret = syncop_fsetxattr (to, fd, dict, 0);
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to set xattr on %s (%s)",
+ loc->path, to->name, strerror (-ret));
+
ret = syncop_ftruncate (to, fd, stbuf->ia_size);
if (ret < 0)
gf_log (this->name, GF_LOG_ERROR,
@@ -650,17 +692,18 @@ int
dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
int flag)
{
- int ret = -1;
- struct iatt new_stbuf = {0,};
- struct iatt stbuf = {0,};
- struct iatt empty_iatt = {0,};
- ia_prot_t src_ia_prot = {0,};
- fd_t *src_fd = NULL;
- fd_t *dst_fd = NULL;
- dict_t *dict = NULL;
- dict_t *xattr = NULL;
- dict_t *xattr_rsp = NULL;
- int file_has_holes = 0;
+ int ret = -1;
+ struct iatt new_stbuf = {0,};
+ struct iatt stbuf = {0,};
+ struct iatt empty_iatt = {0,};
+ ia_prot_t src_ia_prot = {0,};
+ fd_t *src_fd = NULL;
+ fd_t *dst_fd = NULL;
+ dict_t *dict = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xattr_rsp = NULL;
+ int file_has_holes = 0;
+ int rcvd_enoent_from_src = 0;
gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to %s",
loc->path, from->name, to->name);
@@ -827,15 +870,31 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
/* Do a stat and check the gfid before unlink */
+
+ /*
+ * Cached file changes its state from non-linkto to linkto file after
+ * migrating data. If lookup from any other mount-point is performed,
+ * converted-linkto-cached file will be treated as a stale and will be
+ * unlinked. But by this time, file is already migrated. So further
+ * failure because of ENOENT should not be treated as error
+ */
+
ret = syncop_stat (from, loc, &empty_iatt);
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to do a stat on %s (%s)",
loc->path, from->name, strerror (errno));
- goto out;
+
+ if (-ret != ENOENT) {
+ ret = -1;
+ goto out;
+ }
+
+ rcvd_enoent_from_src = 1;
}
- if (uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0) {
+ if ((uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0 ) &&
+ (!rcvd_enoent_from_src)) {
/* take out the source from namespace */
ret = syncop_unlink (from, loc);
if (ret) {
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index dc3a709cd26..bf5c188e5ca 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -1019,20 +1019,60 @@ out:
return 0;
}
+/*
+ * Remove the gfid handle (when this is the last hard link) and then the
+ * directory entry at @real_path.
+ *
+ * @this      posix translator instance (used for logging and handle removal)
+ * @real_path backend path of the entry to unlink
+ * @stbuf     iatt of the entry; may be NULL, in which case no handle is removed
+ * @op_errno  optional out-parameter; receives errno when sys_unlink() fails
+ *
+ * Returns 0 on success, -1 if sys_unlink() fails. A failure to remove the
+ * gfid handle is logged but does not fail the call.
+ *
+ * NOTE(review): the handle is removed before the entry; if sys_unlink() then
+ * fails the entry is left without its gfid handle -- confirm this is intended.
+ */
+int32_t
+posix_unlink_gfid_handle_and_entry (xlator_t *this, const char *real_path,
+                                    struct iatt *stbuf, int32_t *op_errno)
+{
+        int32_t ret = 0;
+
+        /* Unlink the gfid handle first */
+        if (stbuf && stbuf->ia_nlink == 1) {
+                ret = posix_handle_unset (this, stbuf->ia_gfid, NULL);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "unlink of gfid handle failed for path:%s with "
+                                "gfid %s with errno:%s", real_path,
+                                uuid_utoa (stbuf->ia_gfid), strerror (errno));
+                }
+        }
+
+        /* Unlink the actual file */
+        ret = sys_unlink (real_path);
+        if (ret == -1) {
+                if (op_errno)
+                        *op_errno = errno;
+
+                gf_log (this->name, GF_LOG_ERROR,
+                        "unlink of %s failed: %s", real_path,
+                        strerror (errno));
+                return -1;
+        }
+
+        return 0;
+}
int32_t
posix_unlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, int xflag, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char *real_path = NULL;
- char *par_path = NULL;
- int32_t fd = -1;
- struct iatt stbuf = {0,};
- struct posix_private *priv = NULL;
- struct iatt preparent = {0,};
- struct iatt postparent = {0,};
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_path = NULL;
+ char *par_path = NULL;
+ int32_t fd = -1;
+ struct iatt stbuf = {0,};
+ struct posix_private *priv = NULL;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
+ int32_t unlink_if_linkto = 0;
+ int32_t check_open_fd = 0;
+ int32_t skip_unlink = 0;
+ ssize_t xattr_size = -1;
+ int32_t is_dht_linkto_file = 0;
DECLARE_OLD_FS_ID_VAR;
@@ -1052,10 +1092,62 @@ posix_unlink (call_frame_t *frame, xlator_t *this,
goto out;
}
- if (stbuf.ia_nlink == 1)
- posix_handle_unset (this, stbuf.ia_gfid, NULL);
-
priv = this->private;
+
+ op_ret = dict_get_int32 (xdata, DHT_SKIP_OPEN_FD_UNLINK,
+ &check_open_fd);
+
+ if (!op_ret && check_open_fd) {
+
+ LOCK (&loc->inode->lock);
+
+ if (loc->inode->fd_count) {
+ skip_unlink = 1;
+ }
+
+ UNLOCK (&loc->inode->lock);
+
+ gf_log (this->name, GF_LOG_INFO, "open-fd-key-status: "
+ "%"PRIu32" for %s", skip_unlink, real_path);
+
+ if (skip_unlink) {
+ op_ret = -1;
+ op_errno = EBUSY;
+ goto out;
+ }
+ }
+
+
+ op_ret = dict_get_int32 (xdata, DHT_SKIP_NON_LINKTO_UNLINK,
+ &unlink_if_linkto);
+
+ if (!op_ret && unlink_if_linkto) {
+
+ LOCK (&loc->inode->lock);
+
+ xattr_size = sys_lgetxattr (real_path, LINKTO, NULL, 0);
+
+ if (xattr_size <= 0) {
+ skip_unlink = 1;
+ } else {
+ is_dht_linkto_file = IS_DHT_LINKFILE_MODE (&stbuf);
+ if (!is_dht_linkto_file)
+ skip_unlink = 1;
+ }
+
+ UNLOCK (&loc->inode->lock);
+
+ gf_log (this->name, GF_LOG_INFO, "linkto_xattr status: "
+ "%"PRIu32" for %s", skip_unlink, real_path);
+
+ if (skip_unlink) {
+ op_ret = -1;
+ op_errno = EBUSY;
+ goto out;
+ }
+ }
+
+
if (priv->background_unlink) {
if (IA_ISREG (loc->inode->ia_type)) {
fd = open (real_path, O_RDONLY);
@@ -1070,12 +1162,9 @@ posix_unlink (call_frame_t *frame, xlator_t *this,
}
}
- op_ret = sys_unlink (real_path);
+ op_ret = posix_unlink_gfid_handle_and_entry (this, real_path, &stbuf,
+ &op_errno);
if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "unlink of %s failed: %s", real_path,
- strerror (op_errno));
goto out;
}
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index 58f445c699a..80121c08c8f 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -49,6 +49,16 @@
#include "posix-aio.h"
#endif
+#define VECTOR_SIZE (64 * 1024) /* vector size 64KB; parenthesized so it is safe inside larger expressions */
+#define MAX_NO_VECT 1024
+
+#define LINKTO "trusted.glusterfs.dht.linkto" /* xattr name checked by posix_unlink to recognise a dht linkto file */
+
+#define POSIX_GFID_HANDLE_SIZE(base_path_len) ((base_path_len) + SLEN("/") \
+                                  + SLEN(GF_HIDDEN_PATH) + SLEN("/") \
+                                  + SLEN("00/") \
+                                  + SLEN("00/") + SLEN(UUID0_STR) + 1) /* '\0'; no trailing ';' so the macro can be used as an expression */
+
/**
* posix_fd - internal structure common to file and directory fd's
*/