summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVenkatesh Somyajulu <vsomyaju@redhat.com>2014-09-25 17:24:09 +0530
committerKaleb KEITHLEY <kkeithle@redhat.com>2014-10-20 07:53:56 -0700
commitb3387c83a0d968db7428920f46434f09f4922e53 (patch)
tree54dd3199d302979b86f3f2ab1b46beedc2351e9b
parent3042613687d976c4e8938fb411cedc6e6ef559bf (diff)
cluster/dht: Fix races to avoid deletion of linkto
file Explanation of Race between rebalance processes: https://bugzilla.redhat.com/show_bug.cgi?id=1110694#c4 scenario-1: =========== STATE 1: BRICK-1 only one brick Cached File in the system STATE 2: Add brick-2 BRICK-1 BRICK-2 STATE 3: Lookup of File on brick-2 by this node's rebalance will fail because hashed file is not created yet. So dht_lookup_everywhere is about to get called. STATE 4: As part of lookup link file at brick-2 will be created. STATE 5: getxattr to check that cached file belongs to this node is done STATE 6: dht_lookup_everywhere_cbk detects the link created by rebalance-1. It will unlink it. STATE 7: getxattr at the link file with "pathinfo" key will be called will fail as the link file is deleted by rebalance on node-2 Fix: So in the STATE 6, we should avoid the deletion of link file. Every time dht_lookup_everywhere gets called, lookup will be performed on all the nodes. So to avoid STATE 6, if linkto file is found, it is not deleted until valid case is found in dht_lookup_everywhere_done. Case 1: if linkto file points to cached node, and cached file exists, uwind with success. Case 2: if linkto does not point to current cached node, and cached file exists: a) Unlink stale link file b) Create new link file Case 3: Only linkto file exists: Delete linkto file Case 4: Only cached file Create link file (Handled event without patch) Case 5: Neither cached nor hashed file is present Return with ENOENT (handled even without patch) Reviewed-on: http://review.gluster.org/8231 ****************************************************************************** scenario-2: =========== cluster/dht: Modified logic of linkto file deletion on non-hashed Currently whenever dht_lookup_everywhere gets called, if in dht_lookup_everywhere_cbk, a linkto file is found on non-hashed subvolume, file is unlinked. But there are cases when this file is under migration. Under such condition, we should avoid deletion of file. When some other rebalance process changes the layout of parent such that dst_file (w.r.t. migration) falls on non-hashed node, then may be lookup could have found it as linkto file but just before unlink, file is under migration or already migrated In such cased unlink can be avoided. Race: ------- If we have two bricks (brick-1 and brick-2) with initial file "a" under BaseDir which is hashed as well as cached on (brick-1). Assume "a" hashing gives 44. Brick-1 Brick-2 Initial Setup: BaseDir/a BaseDir [1-50] [51-100] Now add new-brick Brick-3. 1. Rebalance-1 on node Node-1 (Brick-1 node) will reset the BaseDir Layout. 2. After that it will perform a) Create linkto file on new-hashed (brick-2) b) Perform file migration. 1.Rebalance-1 Fixes the base-layout: Brick-1 Brick-2 Brick-3 --------- ---------- ------------ BaseDir/a BaseDir BaseDir [1-33] [34-66] [67-100] 2. Only a) is BaseDir/a BaseDir/a(linkto) BaseDir performed Create linktofile Now rebalance 2 on node-2 jumped in and it will perform step 1 and 2-a. After (rebal-2, step-1), it changes the layout of the BaseDir. BaseDir/a BaseDir/a(link) BaseDir [67-100] [1-33] [34-66] For (rebale-2, step-2), It will perform lookup at Brick-3 as w.r.t new layout 44 falls for brick-3. But lookup will fail. So dht_lookup_everywhere gets called. NOTE: On brick-2 by rebalance-1, a linkto file was created. Currently that linkto files gets deleted by rebalance-2 lookup as it is considered as stale linkto file. But with patch if rebalance is already in progress or rebalance is over, linkto file will not be unlinked. If rebalance is in progress fd will be open and if rebalance is over then linkto file wont be set. Reviewed-on: http://review.gluster.org/8345 ******************************************************************************* scenario-3: =========== cluster/dht: Added keys in dht_lookup_everywhere_done Case where both cached (C1) and hashed file are found, but hash does not point to above cached node (C1), then dont unlink if either fd-is-open on hashed or linkto-xattr is not found. Reviewed-on: http://review.gluster.org/8429 BUG: 1139995 Signed-off-by: Venkatesh Somyajulu <vsomyaju@redhat.com> Change-Id: I86d0a21d4c0501c45d837101ced4f96d6fedc5b9 Signed-off-by: Venkatesh Somyajulu <vsomyaju@redhat.com> Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: susant palai <spalai@redhat.com> Reviewed-by: Raghavendra G <rgowdapp@redhat.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com> Reviewed-on: http://review.gluster.org/8674 Reviewed-by: Kaleb KEITHLEY <kkeithle@redhat.com>
-rw-r--r--libglusterfs/src/glusterfs.h9
-rw-r--r--xlators/cluster/dht/src/dht-common.c559
-rw-r--r--xlators/cluster/dht/src/dht-common.h20
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c97
-rw-r--r--xlators/storage/posix/src/posix.c123
-rw-r--r--xlators/storage/posix/src/posix.h10
6 files changed, 742 insertions, 76 deletions
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index c6fc469ee4c..3e7aedde13d 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -149,6 +149,15 @@
#define GF_REBALANCE_TID_KEY "rebalance-id"
#define GF_REMOVE_BRICK_TID_KEY "remove-brick-id"
#define GF_REPLACE_BRICK_TID_KEY "replace-brick-id"
+#define DHT_SKIP_NON_LINKTO_UNLINK "unlink-only-if-dht-linkto-file"
+#define DHT_SKIP_OPEN_FD_UNLINK "dont-unlink-for-open-fd"
+
+#define DHT_LINKFILE_MODE (S_ISVTX)
+
+#define IS_DHT_LINKFILE_MODE(iabuf) ((st_mode_from_ia ((iabuf)->ia_prot, \
+ (iabuf)->ia_type) \
+ & ~S_IFMT) \
+ == DHT_LINKFILE_MODE)
/* NOTE: add members ONLY at the end (just before _MAXVALUE) */
typedef enum {
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 5f7996a9ad6..c5105d27b91 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -442,7 +442,8 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
is_dir = check_is_dir (inode, stbuf, xattr);
if (!is_dir) {
gf_log (this->name, GF_LOG_DEBUG,
- "lookup of %s on %s returned non dir 0%o",
+ "lookup of %s on %s returned non dir 0%o "
+ "calling lookup_everywhere",
local->loc.path, prev->this->name,
stbuf->ia_type);
local->need_selfheal = 1;
@@ -541,6 +542,12 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
LOCK (&frame->lock);
{
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "revalidate lookup of %s "
+ "returned with op_ret %d and op_errno %d",
+ local->loc.path, op_ret, op_errno);
+
if (op_ret == -1) {
local->op_errno = op_errno;
@@ -564,6 +571,14 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
* the file is not migrated */
if (op_errno == ENOENT) {
if (IA_ISREG (local->loc.inode->ia_type)) {
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "found ENOENT for %s. "
+ "Setting "
+ "need_lookup_everywhere"
+ " flag to 1",
+ local->loc.path);
+
local->need_lookup_everywhere = 1;
}
}
@@ -760,9 +775,16 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
}
unwind:
+ gf_log (this->name, GF_LOG_DEBUG,
+ "creation of linkto on hashed subvol:%s, "
+ "returned with op_ret %d and op_errno %d: %s",
+ local->hashed_subvol->name,
+ op_ret, op_errno, uuid_utoa (local->loc.gfid));
+
if (local->linked == _gf_true)
dht_linkfile_attr_heal (frame, this);
+
DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
local->inode, &local->stbuf, local->xattr,
@@ -771,6 +793,176 @@ out:
return ret;
}
+int
+dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ const char *path = NULL;
+
+ local = (dht_local_t*)frame->local;
+ path = local->loc.path;
+
+ gf_log (this->name, GF_LOG_INFO, "lookup_unlink returned with "
+ "op_ret -> %d and op-errno -> %d for %s", op_ret, op_errno,
+ ((path == NULL)? "null" : path ));
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+ dht_lookup_everywhere_done (frame, this);
+ }
+
+ return 0;
+}
+
+int
+dht_lookup_unlink_of_false_linkto_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ const char *path = NULL;
+
+ local = (dht_local_t*)frame->local;
+ path = local->loc.path;
+
+ gf_log (this->name, GF_LOG_INFO, "lookup_unlink returned with "
+ "op_ret -> %d and op-errno -> %d for %s", op_ret, op_errno,
+ ((path == NULL)? "null" : path ));
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+
+ if (op_ret == 0) {
+ dht_lookup_everywhere_done (frame, this);
+ } else {
+ /*When dht_lookup_everywhere is performed, one cached
+ *and one hashed file was found and hashed file does
+ *not point to the above mentioned cached node. So it
+ *was considered as stale and an unlink was performed.
+ *But unlink fails. So may be rebalance is in progress.
+ *now ideally we have two data-files. One obtained during
+ *lookup_everywhere and one where unlink-failed. So
+ *at this point in time we cannot decide which one to
+ *choose because there are chances of first cached
+ *file is truncated after rebalance and if it is choosen
+ *as cached node, application will fail. So return EIO.*/
+
+ if (op_errno == EBUSY) {
+
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not unlink the linkto file as "
+ "either fd is open and/or linkto xattr "
+ "is set for %s",
+ ((path == NULL)? "null":path));
+
+ }
+ DHT_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL,
+ NULL, NULL);
+
+ }
+ }
+
+ return 0;
+}
+
+int
+dht_lookup_unlink_stale_linkto_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+
+ dht_local_t *local = NULL;
+ const char *path = NULL;
+
+ /* NOTE:
+ * If stale file unlink fails either there is an open-fd or is not an
+ * dht-linkto-file then posix_unlink returns EBUSY, which is overwritten
+ * to ENOENT
+ */
+
+ local = frame->local;
+
+ if (local && local->loc.path)
+ path = local->loc.path;
+
+ gf_log (this->name, GF_LOG_INFO, "Returned with op_ret %d and "
+ "op_errno %d for %s", op_ret, op_errno,
+ ((path==NULL)?"null":path));
+
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL,
+ NULL);
+
+ return 0;
+}
+
+int
+dht_fill_dict_to_avoid_unlink_of_migrating_file (dict_t *dict) {
+
+ int ret = 0;
+
+ ret = dict_set_int32 (dict, DHT_SKIP_NON_LINKTO_UNLINK, 1);
+
+ if (ret)
+ goto err;
+
+ ret = dict_set_int32 (dict, DHT_SKIP_OPEN_FD_UNLINK, 1);
+
+ if (ret)
+ goto err;
+
+
+ return 0;
+
+err:
+ return -1;
+
+}
+/* Rebalance is performed from cached_node to hashed_node. Initial cached_node
+ * contains a non-linkto file. After migration it is converted to linkto and
+ * then unlinked. And at hashed_subvolume, first a linkto file is present,
+ * then after migration it is converted to a non-linkto file.
+ *
+ * Lets assume a file is present on cached subvolume and a new brick is added
+ * and new brick is the new_hashed subvolume. So fresh lookup on newly added
+ * hashed subvolume will fail and dht_lookup_everywhere gets called. If just
+ * before sending the dht_lookup_everywhere request rebalance is in progress,
+ *
+ * from cached subvolume it may see: Nonlinkto or linkto or No file
+ * from hashed subvolume it may see: No file or linkto file or non-linkto file
+ *
+ * So this boils down to 9 cases:
+ * at cached_subvol at hashed_subvol
+ * ---------------- -----------------
+ *
+ *a) No file No file
+ * [request reached after [Request reached before
+ * migration] Migration]
+ *
+ *b) No file Linkto File
+ *
+ *c) No file Non-Linkto File
+ *
+ *d) Linkto No-File
+ *
+ *e) Linkto Linkto
+ *
+ *f) Linkto Non-Linkto
+ *
+ *g) NonLinkto No-File
+ *
+ *h) NonLinkto Linkto
+ *
+ *i) NonLinkto NonLinkto
+ *
+ * dht_lookup_everywhere_done takes decision based on any of the above case
+ */
int
dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
@@ -780,6 +972,7 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
xlator_t *hashed_subvol = NULL;
xlator_t *cached_subvol = NULL;
dht_layout_t *layout = NULL;
+ gf_boolean_t found_non_linkto_on_hashed = _gf_false;
local = frame->local;
hashed_subvol = local->hashed_subvol;
@@ -801,19 +994,210 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
return 0;
}
+ gf_log (this->name, GF_LOG_INFO, "STATUS: hashed_subvol %s "
+ "cached_subvol %s",
+ (hashed_subvol == NULL)?"null":hashed_subvol->name,
+ (cached_subvol == NULL)?"null":cached_subvol->name);
+
if (!cached_subvol) {
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL,
- NULL);
+
+ if (local->skip_unlink.handle_valid_link && hashed_subvol) {
+
+ /*Purpose of "DHT_SKIP_NON_LINKTO_UNLINK":
+ * If this lookup is performed by rebalance and this
+ * rebalance process detected hashed file and by
+ * the time it sends the lookup request to cached node,
+ * file got migrated and now at intial hashed_node,
+ * final migrated file is present. With current logic,
+ * because this process fails to find the cached_node,
+ * it will unlink the file at initial hashed_node.
+ *
+ * So we avoid this by setting key, and checking at the
+ * posix_unlink that unlink the file only if file is a
+ * linkto file and not a migrated_file.
+ */
+
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file
+ (local->xattr_req);
+
+ if (ret) {
+ /* If for some reason, setting key in the dict
+ * fails, return with ENOENT, as with respect to
+ * this process, it detected only a stale link
+ * file.
+ *
+ * Next lookup will delete it.
+ *
+ * Performing deletion of stale link file when
+ * setting key in dict fails, may cause the data
+ * loss becase of the above mentioned race.
+ */
+
+
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT,
+ NULL, NULL, NULL, NULL);
+ } else {
+ local->skip_unlink.handle_valid_link = _gf_false;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "No Cached was found and "
+ "unlink on hashed was skipped"
+ " so performing now: %s",
+ local->loc.path);
+
+ STACK_WIND (frame,
+ dht_lookup_unlink_stale_linkto_cbk,
+ hashed_subvol,
+ hashed_subvol->fops->unlink,
+ &local->loc, 0, local->xattr_req);
+ }
+
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "There was no cached file and "
+ "unlink on hashed is not skipped %s",
+ local->loc.path);
+
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL,
+ NULL, NULL);
+ }
return 0;
}
- if (local->need_lookup_everywhere) {
- if (uuid_compare (local->gfid, local->inode->gfid)) {
- /* GFID different, return error */
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL,
- NULL, NULL, NULL);
- return 0;
+ /* At the time of dht_lookup, no file was found on hashed and that is
+ * why dht_lookup_everywhere is called, but by the time
+ * dht_lookup_everywhere
+ * reached to server, file might have already migrated. In that case we
+ * will find a migrated file at the hashed_node. In this case store the
+ * layout in context and return successfully.
+ */
+
+ if (hashed_subvol || local->need_lookup_everywhere) {
+
+ if (local->need_lookup_everywhere) {
+
+ found_non_linkto_on_hashed = _gf_true;
+
+ } else if ((local->file_count == 1) &&
+ (hashed_subvol == cached_subvol)) {
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "found cached file on hashed subvolume "
+ "so store in context and return for %s",
+ local->loc.path);
+
+ found_non_linkto_on_hashed = _gf_true;
}
+
+ if (found_non_linkto_on_hashed)
+ goto preset_layout;
+
+ }
+
+
+ if (hashed_subvol) {
+ if (local->skip_unlink.handle_valid_link == _gf_true) {
+ if (cached_subvol == local->skip_unlink.hash_links_to) {
+
+ if (uuid_compare (local->skip_unlink.cached_gfid,
+ local->skip_unlink.hashed_gfid)){
+
+ /*GFID different, return error*/
+ DHT_STACK_UNWIND (lookup, frame, -1,
+ ESTALE, NULL, NULL, NULL,
+ NULL);
+
+
+ }
+
+ ret = dht_layout_preset (this, cached_subvol,
+ local->loc.inode);
+ if (ret) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Could not set pre-set layout "
+ "for subvolume %s",
+ cached_subvol->name);
+ }
+
+ local->op_ret = (ret == 0) ? ret : -1;
+ local->op_errno = (ret == 0) ? ret : EINVAL;
+
+ /* Presence of local->cached_subvol validates
+ * that lookup from cached node is successful
+ */
+
+ if (!local->op_ret && local->loc.parent) {
+ dht_inode_ctx_time_update
+ (local->loc.parent, this,
+ &local->postparent, 1);
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Skipped unlinking linkto file "
+ "on the hashed subvolume. "
+ "Returning success as it is a "
+ "valid linkto file. Path:%s"
+ ,local->loc.path);
+
+ goto unwind_hashed_and_cached;
+ } else {
+
+ local->skip_unlink.handle_valid_link = _gf_false;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Linkto file found on hashed "
+ "subvol "
+ "and data file found on cached "
+ "subvolume. But linkto points to "
+ "different cached subvolume (%s) "
+ "path %s",
+ local->skip_unlink.hash_links_to->name,
+ local->loc.path);
+
+ if (local->skip_unlink.opend_fd_count == 0) {
+
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file
+ (local->xattr_req);
+
+
+ if (ret) {
+ DHT_STACK_UNWIND (lookup, frame, -1,
+ EIO, NULL, NULL,
+ NULL, NULL);
+ } else {
+ local->call_cnt = 1;
+ STACK_WIND (frame,
+ dht_lookup_unlink_of_false_linkto_cbk,
+ hashed_subvol,
+ hashed_subvol->fops->unlink,
+ &local->loc, 0,
+ local->xattr_req);
+ }
+
+ return 0;
+
+ }
+ }
+
+ }
+ }
+
+
+preset_layout:
+
+ if (found_non_linkto_on_hashed) {
+
+ if (local->need_lookup_everywhere) {
+ if (uuid_compare (local->gfid, local->inode->gfid)) {
+ /* GFID different, return error */
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT,
+ NULL, NULL, NULL, NULL);
+ return 0;
+ }
+ }
+
local->op_ret = 0;
local->op_errno = 0;
layout = dht_layout_for_subvol (this, cached_subvol);
@@ -890,26 +1274,15 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
cached_subvol, hashed_subvol, &local->loc);
return ret;
-}
-
-
-int
-dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
-{
- int this_call_cnt = 0;
-
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- dht_lookup_everywhere_done (frame, this);
- }
+unwind_hashed_and_cached:
+ DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
+ DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
+ local->loc.inode, &local->stbuf, local->xattr,
+ &local->postparent);
return 0;
}
-
int
dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -924,8 +1297,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
xlator_t *subvol = NULL;
loc_t *loc = NULL;
xlator_t *link_subvol = NULL;
- int ret = -1;
- int32_t fd_count = 0;
+ int ret = -1;
+ int32_t fd_count = 0;
+ dict_t *dict_req = {0};
GF_VALIDATE_OR_GOTO ("dht", frame, out);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -939,6 +1313,11 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
prev = cookie;
subvol = prev->this;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "returned with op_ret %d and op_errno %d (%s) "
+ "from subvol %s", op_ret, op_errno, loc->path,
+ subvol->name);
+
LOCK (&frame->lock);
{
if (op_ret == -1) {
@@ -957,6 +1336,13 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
is_linkfile = check_is_linkfile (inode, buf, xattr);
+ if (is_linkfile) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Found linktofile on %s for %s",
+ subvol->name, loc->path);
+
+ }
+
is_dir = check_is_dir (inode, buf, xattr);
if (is_linkfile) {
@@ -981,18 +1367,26 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
} else {
local->file_count++;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "found cached file on %s for %s",
+ subvol->name, loc->path);
+
if (!local->cached_subvol) {
/* found one file */
dht_iatt_merge (this, &local->stbuf, buf,
subvol);
local->xattr = dict_ref (xattr);
local->cached_subvol = subvol;
+
gf_log (this->name, GF_LOG_DEBUG,
- "found on %s file %s",
+ "datafile found on %s file %s",
subvol->name, loc->path);
dht_iatt_merge (this, &local->postparent,
postparent, subvol);
+
+ uuid_copy (local->skip_unlink.cached_gfid,
+ buf->ia_gfid);
} else {
/* This is where we need 'rename' both entries logic */
gf_log (this->name, GF_LOG_WARNING,
@@ -1009,15 +1403,68 @@ unlock:
if (is_linkfile) {
ret = dict_get_int32 (xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count);
- /* Delete the linkfile only if there are no open fds on it.
- if there is a open-fd, it may be in migration */
- if (!ret && (fd_count == 0)) {
- gf_log (this->name, GF_LOG_INFO,
- "deleting stale linkfile %s on %s",
- loc->path, subvol->name);
- STACK_WIND (frame, dht_lookup_unlink_cbk,
- subvol, subvol->fops->unlink, loc, 0, NULL);
- return 0;
+
+ /* Any linkto file found on the non-hashed subvolume should
+ * be unlinked (performed in the "else if" block below)
+ *
+ * But if a linkto file is found on hashed subvolume, it may be
+ * pointing to vaild cached node. So unlinking of linkto
+ * file on hashed subvolume is skipped and inside
+ * dht_lookup_everywhere_done, checks are performed. If this
+ * linkto file is found as stale linkto file, it is deleted
+ * otherwise unlink is skipped.
+ */
+
+ if (local->hashed_subvol && local->hashed_subvol == subvol) {
+
+ local->skip_unlink.handle_valid_link = _gf_true;
+ local->skip_unlink.opend_fd_count = fd_count;
+ local->skip_unlink.hash_links_to = link_subvol;
+ uuid_copy (local->skip_unlink.hashed_gfid,
+ buf->ia_gfid);
+
+ gf_log (this->name, GF_LOG_DEBUG, "Found"
+ " one linkto file on hashed subvol %s "
+ "for %s: Skipping unlinking till "
+ "everywhere_done", subvol->name,
+ loc->path);
+
+ } else if (!ret && (fd_count == 0)) {
+
+ dict_req = dict_new ();
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file
+ (dict_req);
+
+ if (ret) {
+
+ /* Skip unlinking for dict_failure
+ *File is found as a linkto file on non-hashed,
+ *subvolume. In the current implementation,
+ *finding a linkto-file on non-hashed does not
+ *always implies that it is stale. So deletion
+ *of file should be done only when both fd is
+ *closed and linkto-xattr is set. In case of
+ *dict_set failure, avoid skipping of file.
+ *NOTE: dht_frame_return should get called for
+ * this block.
+ */
+
+ dict_unref (dict_req);
+
+ } else {
+ gf_log (this->name, GF_LOG_INFO,
+ "attempting deletion of stale linkfile "
+ "%s on %s", loc->path, subvol->name);
+
+ STACK_WIND (frame, dht_lookup_unlink_cbk,
+ subvol, subvol->fops->unlink, loc,
+ 0, dict_req);
+
+ dict_unref (dict_req);
+
+ return 0;
+ }
}
}
@@ -1054,6 +1501,9 @@ dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc)
if (!local->inode)
local->inode = inode_ref (loc->inode);
+ gf_log (this->name, GF_LOG_DEBUG,
+ "winding lookup call to %d subvols", call_cnt);
+
for (i = 0; i < call_cnt; i++) {
STACK_WIND (frame, dht_lookup_everywhere_cbk,
conf->subvolumes[i],
@@ -1252,9 +1702,14 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (!op_ret && uuid_is_null (local->gfid))
memcpy (local->gfid, stbuf->ia_gfid, 16);
+ gf_log (this->name, GF_LOG_DEBUG,
+ "fresh_lookup returned for %s with op_ret %d and "
+ "op_errno %d", loc->path, op_ret, op_errno);
+
if (ENTRY_MISSING (op_ret, op_errno)) {
gf_log (this->name, GF_LOG_TRACE, "Entry %s missing on subvol"
" %s", loc->path, prev->this->name);
+
if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_ON) {
local->op_errno = ENOENT;
dht_lookup_everywhere (frame, this, loc);
@@ -1313,13 +1768,17 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
subvol = dht_linkfile_subvol (this, inode, stbuf, xattr);
if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "linkfile not having link subvolume. path=%s",
- loc->path);
+ gf_log (this->name, GF_LOG_INFO, "linkfile not having link "
+ "subvol for %s", loc->path);
+
dht_lookup_everywhere (frame, this, loc);
return 0;
}
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Calling lookup on linkto target %s for path %s",
+ subvol->name, loc->path);
+
STACK_WIND (frame, dht_lookup_linkfile_cbk,
subvol, subvol->fops->lookup,
&local->loc, local->xattr_req);
@@ -1465,6 +1924,13 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
dht_layout_unref (this, local->layout);
local->layout = NULL;
local->cached_subvol = NULL;
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "Called revalidate lookup for %s, "
+ "but layout->gen (%d) is less than "
+ "conf->gen (%d), calling fresh_lookup",
+ loc->path, layout->gen, conf->gen);
+
goto do_fresh_lookup;
}
@@ -1521,6 +1987,10 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
for (i = 0; i < call_cnt; i++) {
subvol = layout->list[i].xlator;
+ gf_log (this->name, GF_LOG_DEBUG, "calling "
+ "revalidate lookup for %s at %s",
+ loc->path, subvol->name);
+
STACK_WIND (frame, dht_revalidate_cbk,
subvol, subvol->fops->lookup,
&local->loc, local->xattr_req);
@@ -1565,6 +2035,7 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
"no subvolume in layout for path=%s, "
"checking on all the subvols to see if "
"it is a directory", loc->path);
+
call_cnt = conf->subvolume_cnt;
local->call_cnt = call_cnt;
@@ -1575,6 +2046,10 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
goto err;
}
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Found null hashed subvol. Calling lookup"
+ " on all nodes.");
+
for (i = 0; i < call_cnt; i++) {
STACK_WIND (frame, dht_lookup_dir_cbk,
conf->subvolumes[i],
@@ -1584,6 +2059,10 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
return 0;
}
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Calling fresh lookup for %s on"
+ " %s", loc->path, hashed_subvol->name);
+
STACK_WIND (frame, dht_lookup_cbk,
hashed_subvol, hashed_subvol->fops->lookup,
loc, local->xattr_req);
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 83725f09712..c7f20a28383 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -96,6 +96,15 @@ struct dht_rebalance_ {
dict_t *xdata;
};
+struct dht_skip_linkto_unlink {
+
+ gf_boolean_t handle_valid_link;
+ int opend_fd_count;
+ xlator_t *hash_links_to;
+ uuid_t cached_gfid;
+ uuid_t hashed_gfid;
+};
+
struct dht_local {
int call_cnt;
loc_t loc;
@@ -184,6 +193,9 @@ struct dht_local {
xlator_t *first_up_subvol;
gf_boolean_t added_link;
+
+ struct dht_skip_linkto_unlink skip_unlink;
+
};
typedef struct dht_local dht_local_t;
@@ -752,4 +764,12 @@ dht_inodectx_dump (xlator_t *this, inode_t *inode);
int
dht_subvol_status (dht_conf_t *conf, xlator_t *subvol);
+void
+dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,
+ dht_layout_t *layout);
+int
+dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this);
+
+int
+dht_fill_dict_to_avoid_unlink_of_migrating_file (dict_t *dict);
#endif/* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index d6e34f92036..725e0c8c7b0 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -237,13 +237,15 @@ out:
}
static inline int
-__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf,
- dict_t *dict, fd_t **dst_fd)
+__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc,
+ struct iatt *stbuf, dict_t *dict,
+ fd_t **dst_fd)
{
- xlator_t *this = NULL;
- int ret = -1;
- fd_t *fd = NULL;
- struct iatt new_stbuf = {0,};
+ xlator_t *this = NULL;
+ int ret = -1;
+ fd_t *fd = NULL;
+ struct iatt new_stbuf = {0,};
+ struct iatt check_stbuf = {0,};
this = THIS;
@@ -300,6 +302,46 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
goto out;
}
+ /*Reason of doing lookup after create again:
+ *In the create, there is some time-gap between opening fd at the
+ *server (posix_layer) and binding it in server (incrementing fd count),
+ *so if in that time-gap, if other process sends unlink considering it
+ *as a linkto file, because inode->fd count will be 0, so file will be
+ *unlinked at the backend. And because furthur operations are performed
+ *on fd, so though migration will be done but will end with no file
+ *at the backend.
+ */
+
+
+ ret = syncop_lookup (to, loc, NULL, &check_stbuf, NULL, NULL);
+ if (!ret) {
+ if (uuid_compare (stbuf->ia_gfid, check_stbuf.ia_gfid) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "file %s exists in %s with different gfid,"
+ "found in lookup after create",
+ loc->path, to->name);
+ ret = -1;
+ fd_unref (fd);
+ goto out;
+ }
+
+ }
+
+ if (-ret == ENOENT) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: file does not exists"
+ "on %s (%s)", loc->path, to->name, strerror (-ret));
+ ret = -1;
+ fd_unref (fd);
+ goto out;
+ }
+
+ ret = syncop_fsetxattr (to, fd, dict, 0);
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to set xattr on %s (%s)",
+ loc->path, to->name, strerror (-ret));
+
ret = syncop_ftruncate (to, fd, stbuf->ia_size);
if (ret < 0)
gf_log (this->name, GF_LOG_ERROR,
@@ -650,17 +692,18 @@ int
dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
int flag)
{
- int ret = -1;
- struct iatt new_stbuf = {0,};
- struct iatt stbuf = {0,};
- struct iatt empty_iatt = {0,};
- ia_prot_t src_ia_prot = {0,};
- fd_t *src_fd = NULL;
- fd_t *dst_fd = NULL;
- dict_t *dict = NULL;
- dict_t *xattr = NULL;
- dict_t *xattr_rsp = NULL;
- int file_has_holes = 0;
+ int ret = -1;
+ struct iatt new_stbuf = {0,};
+ struct iatt stbuf = {0,};
+ struct iatt empty_iatt = {0,};
+ ia_prot_t src_ia_prot = {0,};
+ fd_t *src_fd = NULL;
+ fd_t *dst_fd = NULL;
+ dict_t *dict = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xattr_rsp = NULL;
+ int file_has_holes = 0;
+ int rcvd_enoent_from_src = 0;
gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to %s",
loc->path, from->name, to->name);
@@ -827,15 +870,31 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
/* Do a stat and check the gfid before unlink */
+
+ /*
+ * Cached file changes its state from non-linkto to linkto file after
+ * migrating data. If lookup from any other mount-point is performed,
+ * converted-linkto-cached file will be treated as a stale and will be
+ * unlinked. But by this time, file is already migrated. So further
+ * failure because of ENOENT should not be treated as error
+ */
+
ret = syncop_stat (from, loc, &empty_iatt);
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to do a stat on %s (%s)",
loc->path, from->name, strerror (errno));
- goto out;
+
+ if (-ret != ENOENT) {
+ ret = -1;
+ goto out;
+ }
+
+ rcvd_enoent_from_src = 1;
}
- if (uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0) {
+ if ((uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0 ) &&
+ (!rcvd_enoent_from_src)) {
/* take out the source from namespace */
ret = syncop_unlink (from, loc);
if (ret) {
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index dc3a709cd26..bf5c188e5ca 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -1019,20 +1019,60 @@ out:
return 0;
}
+int32_t
+posix_unlink_gfid_handle_and_entry (xlator_t *this, const char *real_path,
+ struct iatt *stbuf, int32_t *op_errno)
+{
+ int32_t ret = 0;
+
+ /* Unlink the gfid_handle_first */
+
+ if (stbuf && stbuf->ia_nlink == 1) {
+ ret = posix_handle_unset (this, stbuf->ia_gfid, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "unlink of gfid handle failed for path:%s with"
+ "gfid %s with errno:%s", real_path,
+ uuid_utoa (stbuf->ia_gfid), strerror (errno));
+ }
+ }
+
+ /* Unlink the actual file */
+ ret = sys_unlink (real_path);
+ if (ret == -1) {
+ if (op_errno)
+ *op_errno = errno;
+
+ gf_log (this->name, GF_LOG_ERROR,
+ "unlink of %s failed: %s", real_path,
+ strerror (errno));
+ goto err;
+ }
+
+ return 0;
+
+err:
+ return -1;
+}
int32_t
posix_unlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, int xflag, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char *real_path = NULL;
- char *par_path = NULL;
- int32_t fd = -1;
- struct iatt stbuf = {0,};
- struct posix_private *priv = NULL;
- struct iatt preparent = {0,};
- struct iatt postparent = {0,};
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_path = NULL;
+ char *par_path = NULL;
+ int32_t fd = -1;
+ struct iatt stbuf = {0,};
+ struct posix_private *priv = NULL;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
+ int32_t unlink_if_linkto = 0;
+ int32_t check_open_fd = 0;
+ int32_t skip_unlink = 0;
+ ssize_t xattr_size = -1;
+ int32_t is_dht_linkto_file = 0;
DECLARE_OLD_FS_ID_VAR;
@@ -1052,10 +1092,62 @@ posix_unlink (call_frame_t *frame, xlator_t *this,
goto out;
}
- if (stbuf.ia_nlink == 1)
- posix_handle_unset (this, stbuf.ia_gfid, NULL);
-
priv = this->private;
+
+ op_ret = dict_get_int32 (xdata, DHT_SKIP_OPEN_FD_UNLINK,
+ &check_open_fd);
+
+ if (!op_ret && check_open_fd) {
+
+ LOCK (&loc->inode->lock);
+
+ if (loc->inode->fd_count) {
+ skip_unlink = 1;
+ }
+
+ UNLOCK (&loc->inode->lock);
+
+ gf_log (this->name, GF_LOG_INFO, "open-fd-key-status: "
+ "%"PRIu32" for %s", skip_unlink, real_path);
+
+ if (skip_unlink) {
+ op_ret = -1;
+ op_errno = EBUSY;
+ goto out;
+ }
+ }
+
+
+ op_ret = dict_get_int32 (xdata, DHT_SKIP_NON_LINKTO_UNLINK,
+ &unlink_if_linkto);
+
+ if (!op_ret && unlink_if_linkto) {
+
+ LOCK (&loc->inode->lock);
+
+ xattr_size = sys_lgetxattr (real_path, LINKTO, NULL, 0);
+
+ if (xattr_size <= 0) {
+ skip_unlink = 1;
+ } else {
+ is_dht_linkto_file = IS_DHT_LINKFILE_MODE (&stbuf);
+ if (!is_dht_linkto_file)
+ skip_unlink = 1;
+ }
+
+ UNLOCK (&loc->inode->lock);
+
+ gf_log (this->name, GF_LOG_INFO, "linkto_xattr status: "
+ "%"PRIu32" for %s", skip_unlink, real_path);
+
+ if (skip_unlink) {
+ op_ret = -1;
+ op_errno = EBUSY;
+ goto out;
+ }
+ }
+
+
if (priv->background_unlink) {
if (IA_ISREG (loc->inode->ia_type)) {
fd = open (real_path, O_RDONLY);
@@ -1070,12 +1162,9 @@ posix_unlink (call_frame_t *frame, xlator_t *this,
}
}
- op_ret = sys_unlink (real_path);
+ op_ret = posix_unlink_gfid_handle_and_entry (this, real_path, &stbuf,
+ &op_errno);
if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "unlink of %s failed: %s", real_path,
- strerror (op_errno));
goto out;
}
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index 58f445c699a..80121c08c8f 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -49,6 +49,16 @@
#include "posix-aio.h"
#endif
+#define VECTOR_SIZE 64 * 1024 /* vector size 64KB*/
+#define MAX_NO_VECT 1024
+
+#define LINKTO "trusted.glusterfs.dht.linkto"
+
+#define POSIX_GFID_HANDLE_SIZE(base_path_len) (base_path_len + SLEN("/") \
+ + SLEN(GF_HIDDEN_PATH) + SLEN("/") \
+ + SLEN("00/") \
+ + SLEN("00/") + SLEN(UUID0_STR) + 1) /* '\0' */;
+
/**
* posix_fd - internal structure common to file and directory fd's
*/