summaryrefslogtreecommitdiffstats
path: root/xlators/cluster
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster')
-rw-r--r--xlators/cluster/afr/src/afr-common.c317
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.c36
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.c21
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.c93
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c1070
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.h11
-rw-r--r--xlators/cluster/afr/src/afr-lk-common.c162
-rw-r--r--xlators/cluster/afr/src/afr-mem-types.h5
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-algorithm.c62
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c503
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.h28
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c335
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c82
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c151
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.c574
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.h13
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c698
-rw-r--r--xlators/cluster/afr/src/afr-transaction.h8
-rw-r--r--xlators/cluster/afr/src/afr.c36
-rw-r--r--xlators/cluster/afr/src/afr.h186
-rw-r--r--xlators/cluster/afr/src/pump.c11
-rw-r--r--xlators/cluster/dht/src/dht-common.c71
-rw-r--r--xlators/cluster/dht/src/dht-common.h25
-rw-r--r--xlators/cluster/dht/src/dht-diskusage.c75
-rw-r--r--xlators/cluster/dht/src/dht-helper.c183
-rw-r--r--xlators/cluster/dht/src/dht-inode-read.c69
-rw-r--r--xlators/cluster/dht/src/dht-inode-write.c441
-rw-r--r--xlators/cluster/dht/src/dht-layout.c14
-rw-r--r--xlators/cluster/dht/src/dht-linkfile.c72
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c155
-rw-r--r--xlators/cluster/dht/src/dht-rename.c106
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c33
-rw-r--r--xlators/cluster/dht/src/dht-shared.c26
-rw-r--r--xlators/cluster/dht/src/dht.c3
-rw-r--r--xlators/cluster/dht/src/nufa.c190
-rw-r--r--xlators/cluster/dht/src/switch.c6
-rw-r--r--xlators/cluster/stripe/src/stripe.c595
37 files changed, 5228 insertions, 1238 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index a9acb4094..af01f2ef2 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -51,7 +51,7 @@
#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL
#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL
-
+#define AFR_STATISTICS_HISTORY_SIZE 50
int
afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this,
gf_boolean_t fail_conflict);
@@ -779,6 +779,12 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
sh = &local->self_heal;
priv = this->private;
+ if (sh->data_sh_info && strcmp (sh->data_sh_info, ""))
+ GF_FREE (sh->data_sh_info);
+
+ if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, ""))
+ GF_FREE (sh->metadata_sh_info);
+
GF_FREE (sh->buf);
GF_FREE (sh->parentbufs);
@@ -826,7 +832,8 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
void
afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
{
- afr_private_t * priv = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
priv = this->private;
@@ -836,7 +843,9 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
GF_FREE (local->internal_lock.locked_nodes);
- GF_FREE (local->internal_lock.inode_locked_nodes);
+ for (i = 0; local->internal_lock.inodelk[i].domain; i++) {
+ GF_FREE (local->internal_lock.inodelk[i].locked_nodes);
+ }
GF_FREE (local->internal_lock.lower_locked_nodes);
@@ -1071,6 +1080,88 @@ afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent)
uuid_copy (loc->pargfid, postparent->ia_gfid);
}
+/*
+ * Quota size xattrs are not maintained by afr. There is a
+ * possibility that they differ even when both the directory changelog xattrs
+ * suggest everything is fine. So if there is at least one 'source' check among
+ * the sources which has the maximum quota size. Otherwise check among all the
+ * available ones for maximum quota size. This way if there is a source and
+ * stale copies it always votes for the 'source'.
+ * */
+
+static void
+afr_handle_quota_size (afr_local_t *local, xlator_t *this,
+ dict_t *rsp_dict)
+{
+ int32_t *sources = NULL;
+ dict_t *xattr = NULL;
+ data_t *max_data = NULL;
+ int64_t max_quota_size = -1;
+ data_t *data = NULL;
+ int64_t *size = NULL;
+ int64_t quota_size = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int ret = -1;
+ gf_boolean_t source_present = _gf_false;
+
+ priv = this->private;
+ sources = local->cont.lookup.sources;
+
+ if (rsp_dict == NULL) {
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid "
+ "response dictionary", local->loc.path);
+ return;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source_present = _gf_true;
+ break;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ /*
+ * If there is at least one source lets check
+ * for maximum quota sizes among sources, otherwise take the
+ * maximum of the ones present to be on the safer side.
+ */
+ if (source_present && !sources[i])
+ continue;
+
+ xattr = local->cont.lookup.xattrs[i];
+ if (!xattr)
+ continue;
+
+ data = dict_get (xattr, QUOTA_SIZE_KEY);
+ if (!data)
+ continue;
+
+ size = (int64_t*)data->data;
+ quota_size = ntoh64(*size);
+ gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64,
+ local->loc.path, i, quota_size);
+ if (quota_size > max_quota_size) {
+ if (max_data)
+ data_unref (max_data);
+
+ max_quota_size = quota_size;
+ max_data = data_ref (data);
+ }
+ }
+
+ if (max_data) {
+ ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set "
+ "quota size", local->loc.path);
+ }
+
+ data_unref (max_data);
+ }
+}
+
int
afr_lookup_build_response_params (afr_local_t *local, xlator_t *this)
{
@@ -1115,13 +1206,18 @@ afr_lookup_build_response_params (afr_local_t *local, xlator_t *this)
ret = -1;
goto out;
}
+
gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d",
read_child);
if (!*xattr)
*xattr = dict_ref (local->cont.lookup.xattrs[read_child]);
+
*buf = local->cont.lookup.bufs[read_child];
*postparent = local->cont.lookup.postparents[read_child];
+ if (dict_get (local->xattr_req, QUOTA_SIZE_KEY))
+ afr_handle_quota_size (local, this, *xattr);
+
if (IA_INVAL == local->cont.lookup.inode->ia_type) {
/* fix for RT #602 */
local->cont.lookup.inode->ia_type = buf->ia_type;
@@ -1270,7 +1366,7 @@ afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this,
if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) {
/* mismatching gfid */
- gf_log (this->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_DEBUG,
"%s: gfid different on subvolume", local->loc.path);
}
}
@@ -1494,7 +1590,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children,
child2 = &bufs[success_children[i-1]];
if (FILETYPE_DIFFERS (child1, child2)) {
- gf_log (xlator_name, GF_LOG_WARNING, "%s: filetype "
+ gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype "
"differs on subvolumes (%d, %d)", path,
success_children[i-1], success_children[i]);
conflicting = _gf_true;
@@ -1503,7 +1599,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children,
if (!gfid || uuid_is_null (child1->ia_gfid))
continue;
if (uuid_compare (*gfid, child1->ia_gfid)) {
- gf_log (xlator_name, GF_LOG_WARNING, "%s: gfid differs"
+ gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs"
" on subvolume %d", path, success_children[i]);
conflicting = _gf_true;
goto out;
@@ -1632,7 +1728,6 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this,
afr_local_t *local = NULL;
int ret = -1;
dict_t *xattr = NULL;
- int32_t spb = 0;
local = frame->local;
@@ -1663,10 +1758,6 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this,
local->loc.path,
local->self_heal.actual_sh_started);
}
-
- if (local->loc.inode)
- spb = afr_is_split_brain (this, local->loc.inode);
- ret = dict_set_int32 (xattr, "split-brain", spb);
}
out:
AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
@@ -1741,7 +1832,8 @@ afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this,
afr_lookup_set_self_heal_params (local, this);
if (afr_can_self_heal_proceed (&local->self_heal, priv)) {
- if (afr_is_transaction_running (local))
+ if (afr_is_transaction_running (local) &&
+ (!local->allow_sh_for_running_transaction))
goto out;
reason = "lookup detected pending operations";
@@ -2336,7 +2428,7 @@ afr_lookup (call_frame_t *frame, xlator_t *this,
int call_count = 0;
uint64_t ctx = 0;
int32_t op_errno = 0;
-
+ int allow_sh = 0;
priv = this->private;
AFR_LOCAL_ALLOC_OR_GOTO (local, out);
@@ -2353,6 +2445,13 @@ afr_lookup (call_frame_t *frame, xlator_t *this,
goto out;
}
+ if (local->loc.path &&
+ (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) {
+ op_errno = EPERM;
+ ret = -1;
+ goto out;
+ }
+
ret = inode_ctx_get (local->loc.inode, this, &ctx);
if (ret == 0) {
/* lookup is a revalidate */
@@ -2401,6 +2500,11 @@ afr_lookup (call_frame_t *frame, xlator_t *this,
/* By default assume ENOTCONN. On success it will be set to 0. */
local->op_errno = ENOTCONN;
+ ret = dict_get_int32 (xattr_req, "allow-sh-for-running-transaction",
+ &allow_sh);
+ dict_del (xattr_req, "allow-sh-for-running-transaction");
+ local->allow_sh_for_running_transaction = allow_sh;
+
ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc,
&gfid_req);
if (ret) {
@@ -2578,15 +2682,42 @@ afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
+static int
+afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+
+ priv = this->private;
+ local = frame->local;
+ call_count = local->call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_flush_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->flush,
+ local->fd, NULL);
+ if (!--call_count)
+ break;
+
+ }
+ }
+
+ return 0;
+}
+
int
afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
+ call_stub_t *stub = NULL;
int ret = -1;
int op_errno = 0;
- int call_count = -1;
- int i = 0;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -2602,23 +2733,14 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
goto out;
local->fd = fd_ref(fd);
- call_count = local->call_count;
-
- afr_delayed_changelog_wake_up (this, fd);
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_flush_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- local->fd, NULL);
-
- if (!--call_count)
- break;
- }
+ stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata);
+ if (!stub) {
+ ret = -1;
+ op_errno = ENOMEM;
+ goto out;
}
+ afr_delayed_changelog_wake_resume (this, fd, stub);
ret = 0;
out:
@@ -2694,6 +2816,16 @@ afr_release (xlator_t *this, fd_t *fd)
/* {{{ fsync */
int
+afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+int
afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
@@ -2718,13 +2850,13 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->op_ret = 0;
if (local->success_count == 0) {
- local->cont.fsync.prebuf = *prebuf;
- local->cont.fsync.postbuf = *postbuf;
+ local->cont.inode_wfop.prebuf = *prebuf;
+ local->cont.inode_wfop.postbuf = *postbuf;
}
if (child_index == read_child) {
- local->cont.fsync.prebuf = *prebuf;
- local->cont.fsync.postbuf = *postbuf;
+ local->cont.inode_wfop.prebuf = *prebuf;
+ local->cont.inode_wfop.postbuf = *postbuf;
}
local->success_count++;
@@ -2746,10 +2878,11 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
post-op. This guarantee is expected by FUSE graph switching
for example.
*/
- stub = fop_fsync_cbk_stub (frame, default_fsync_cbk,
+ stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk,
local->op_ret, local->op_errno,
- &local->cont.fsync.prebuf,
- &local->cont.fsync.postbuf, xdata);
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ xdata);
if (!stub) {
AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0);
return 0;
@@ -3966,6 +4099,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
goto out;
}
+ local->append_write = _gf_false;
+
ret = 0;
out:
return ret;
@@ -3977,11 +4112,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,
{
int ret = -ENOMEM;
- lk->inode_locked_nodes = GF_CALLOC (sizeof (*lk->inode_locked_nodes),
- child_count, gf_afr_mt_char);
- if (NULL == lk->inode_locked_nodes)
- goto out;
-
lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
child_count, gf_afr_mt_char);
if (NULL == lk->locked_nodes)
@@ -4040,6 +4170,21 @@ out:
}
int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count)
+{
+ int ret = -ENOMEM;
+
+ lk->domain = dom;
+ lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
+ child_count, gf_afr_mt_char);
+ if (NULL == lk->locked_nodes)
+ goto out;
+ ret = 0;
+out:
+ return ret;
+}
+
+int
afr_transaction_local_init (afr_local_t *local, xlator_t *this)
{
int child_up_count = 0;
@@ -4052,6 +4197,14 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
if (ret < 0)
goto out;
+ if ((local->transaction.type == AFR_DATA_TRANSACTION) ||
+ (local->transaction.type == AFR_METADATA_TRANSACTION)) {
+ ret = afr_inodelk_init (&local->internal_lock.inodelk[0],
+ this->name, priv->child_count);
+ if (ret < 0)
+ goto out;
+ }
+
ret = -ENOMEM;
child_up_count = afr_up_children_count (local->child_up,
priv->child_count);
@@ -4278,6 +4431,16 @@ afr_priv_destroy (afr_private_t *priv)
if (priv->shd.split_brain)
eh_destroy (priv->shd.split_brain);
+ for (i = 0; i < priv->child_count; i++)
+ {
+ if (priv->shd.statistics[i])
+ eh_destroy (priv->shd.statistics[i]);
+ }
+
+ GF_FREE (priv->shd.statistics);
+
+ GF_FREE (priv->shd.crawl_events);
+
GF_FREE (priv->last_event);
if (priv->pending_key) {
for (i = 0; i < priv->child_count; i++)
@@ -4356,3 +4519,73 @@ afr_is_fd_fixable (fd_t *fd)
return _gf_true;
}
+
+void
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ inode_t *inode = NULL;
+ afr_inode_ctx_t *ctx = NULL;
+
+ local = frame->local;
+
+ if (local->fd)
+ inode = local->fd->inode;
+ else
+ inode = local->loc.inode;
+
+ if (!inode)
+ return;
+
+ LOCK (&inode->lock);
+ {
+ ctx = __afr_inode_ctx_get (inode, this);
+ ctx->open_fd_count = local->open_fd_count;
+ }
+ UNLOCK (&inode->lock);
+}
+
+int
+afr_initialise_statistics (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ int i = 0;
+ int child_count = 0;
+ eh_t *stats_per_brick = NULL;
+ shd_crawl_event_t ***shd_crawl_events = NULL;
+ priv = this->private;
+
+ priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count,
+ gf_common_mt_eh_t);
+ if (!priv->shd.statistics) {
+ ret = -1;
+ goto out;
+ }
+ child_count = priv->child_count;
+ for (i=0; i < child_count ; i++) {
+ stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE,
+ _gf_false,
+ _destroy_crawl_event_data);
+ if (!stats_per_brick) {
+ ret = -1;
+ goto out;
+ }
+ priv->shd.statistics[i] = stats_per_brick;
+
+ }
+
+ shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events);
+ *shd_crawl_events = GF_CALLOC (sizeof(shd_crawl_event_t*),
+ priv->child_count,
+ gf_afr_mt_shd_crawl_event_t);
+
+ if (!priv->shd.crawl_events) {
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+
+}
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index 0e1718814..689dd84e6 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -253,7 +253,7 @@ unlock:
goto out;
if (!afr_is_opendir_done (this, local->fd->inode) &&
- up_children_count > 1) {
+ up_children_count > 1 && priv->entry_self_heal) {
/*
* This is the first opendir on this inode. We need
@@ -383,14 +383,36 @@ afr_forget_entries (fd_t *fd)
}
}
+static void
+afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd)
+{
+ gf_dirent_t * entry = NULL;
+ gf_dirent_t * tmp = NULL;
+
+ list_for_each_entry_safe (entry, tmp, &entries->list, list) {
+ if (__is_root_gfid (fd->inode->gfid) &&
+ !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) {
+ list_del_init (&entry->list);
+ GF_FREE (entry);
+ }
+ }
+}
int32_t
afr_readdir_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
gf_dirent_t *entries, dict_t *xdata)
{
- AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL);
+ afr_local_t *local = NULL;
+ if (op_ret == -1)
+ goto out;
+
+ local = frame->local;
+ afr_readdir_filter_trash_dir (entries, local->fd);
+
+out:
+ AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL);
return 0;
}
@@ -400,8 +422,16 @@ afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
dict_t *xdata)
{
- AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL);
+ afr_local_t *local = NULL;
+
+ if (op_ret == -1)
+ goto out;
+
+ local = frame->local;
+ afr_readdir_filter_trash_dir (entries, local->fd);
+out:
+ AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL);
return 0;
}
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index 5a22696ce..1943b719b 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -235,26 +235,6 @@ out:
}
void
-afr_dir_fop_handle_all_fop_failures (call_frame_t *frame)
-{
- xlator_t *this = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- this = frame->this;
- local = frame->local;
- priv = this->private;
-
- if (local->op_ret >= 0)
- goto out;
-
- __mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
-out:
- return;
-}
-
-void
afr_dir_fop_done (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
@@ -273,7 +253,6 @@ afr_dir_fop_done (call_frame_t *frame, xlator_t *this)
done:
local->transaction.unwind (frame, this);
afr_dir_fop_mark_entry_pending_changelog (frame, this);
- afr_dir_fop_handle_all_fop_failures (frame);
local->transaction.resume (frame, this);
}
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index 051b7b164..e06e3b2f2 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -117,6 +117,8 @@ afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
children = priv->children;
+ AFR_SBRAIN_CHECK_LOC (loc, out);
+
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -232,6 +234,8 @@ afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
children = priv->children;
+ AFR_SBRAIN_CHECK_LOC (loc, out);
+
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -348,10 +352,7 @@ afr_fstat (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (fd->inode, out);
- if (afr_is_split_brain (this, fd->inode)) {
- op_errno = EIO;
- goto out;
- }
+ AFR_SBRAIN_CHECK_FD (fd, out);
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -472,6 +473,8 @@ afr_readlink (call_frame_t *frame, xlator_t *this,
children = priv->children;
+ AFR_SBRAIN_CHECK_LOC (loc, out);
+
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -1323,6 +1326,62 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
return ret;
}
+static int
+afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data)
+{
+ int ret = 0;
+
+ if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0)
+ ret = gf_get_min_stime (THIS, data, key, value);
+
+ return ret;
+}
+
+int32_t
+afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_log ("", GF_LOG_ERROR, "possible NULL deref");
+ goto out;
+ }
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (!dict || (op_ret < 0)) {
+ local->op_errno = op_errno;
+ goto cleanup;
+ }
+
+ if (!local->dict)
+ local->dict = dict_copy_with_ref (dict, NULL);
+ else
+ dict_foreach (dict, afr_aggregate_stime_xattr,
+ local->dict);
+ local->op_ret = 0;
+ }
+
+cleanup:
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
+ local->op_errno, local->dict, xdata);
+ }
+
+out:
+ return 0;
+}
+
+
static gf_boolean_t
afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk,
gf_boolean_t is_fgetxattr)
@@ -1355,6 +1414,8 @@ afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk,
} else {
*cbk = afr_getxattr_lockinfo_cbk;
}
+ } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) {
+ *cbk = afr_common_getxattr_stime_cbk;
} else {
is_spl = _gf_false;
}
@@ -1403,7 +1464,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
int32_t read_child = -1;
int ret = -1;
fop_getxattr_cbk_t cbk = NULL;
-
+ int afr_xtime_gauge[MCNT_MAX] = {0,};
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -1414,6 +1475,8 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
children = priv->children;
+ AFR_SBRAIN_CHECK_LOC (loc, out);
+
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -1452,6 +1515,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
sub_volumes,
priv->child_count,
MARKER_UUID_TYPE,
+ marker_uuid_default_gauge,
priv->vol_uuid)) {
gf_log (this->name, GF_LOG_INFO,
@@ -1498,12 +1562,20 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
}
+ /* don't err out on getting ENOTCONN (brick down)
+ * from a subset of the bricks
+ */
+ memcpy (afr_xtime_gauge, marker_xtime_default_gauge,
+ sizeof (afr_xtime_gauge));
+ afr_xtime_gauge[MCNT_NOTFOUND] = 0;
+ afr_xtime_gauge[MCNT_ENOTCONN] = 0;
if (cluster_getmarkerattr (frame, this, loc,
name, local,
afr_getxattr_unwind,
sub_volumes,
priv->child_count,
MARKER_XTIME_TYPE,
+ afr_xtime_gauge,
priv->vol_uuid)) {
gf_log (this->name, GF_LOG_INFO,
"%s: failed to get marker attr (%s)",
@@ -1660,10 +1732,8 @@ afr_fgetxattr (call_frame_t *frame, xlator_t *this,
children = priv->children;
- if (afr_is_split_brain (this, fd->inode)) {
- op_errno = EIO;
- goto out;
- }
+ AFR_SBRAIN_CHECK_FD (fd, out);
+
AFR_LOCAL_ALLOC_OR_GOTO (local, out);
frame->local = local;
@@ -1819,10 +1889,7 @@ afr_readv (call_frame_t *frame, xlator_t *this,
priv = this->private;
children = priv->children;
- if (afr_is_split_brain (this, fd->inode)) {
- op_errno = EIO;
- goto out;
- }
+ AFR_SBRAIN_CHECK_FD (fd, out);
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index ce4fbf226..c1ec69a55 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -39,6 +39,46 @@
#include "afr-transaction.h"
#include "afr-self-heal-common.h"
+void
+__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child,
+ xlator_t *this, int32_t *op_ret, int32_t *op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (afr_fop_failed (*op_ret, *op_errno)) {
+ local->child_errno[child_index] = *op_errno;
+
+ switch (local->op) {
+ case GF_FOP_TRUNCATE:
+ case GF_FOP_FTRUNCATE:
+ if (*op_errno != EFBIG)
+ afr_transaction_fop_failed (frame, this,
+ child_index);
+ break;
+ default:
+ afr_transaction_fop_failed (frame, this, child_index);
+ break;
+ }
+ local->op_errno = *op_errno;
+ goto out;
+ }
+
+ if ((local->success_count == 0) || (read_child == child_index)) {
+ local->op_ret = *op_ret;
+ if (prebuf)
+ local->cont.inode_wfop.prebuf = *prebuf;
+ if (postbuf)
+ local->cont.inode_wfop.postbuf = *postbuf;
+ }
+
+ local->success_count++;
+out:
+ return;
+}
+
/* {{{ writev */
void
@@ -52,8 +92,8 @@ afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame)
dst_local->op_ret = src_local->op_ret;
dst_local->op_errno = src_local->op_errno;
- dst_local->cont.writev.prebuf = src_local->cont.writev.prebuf;
- dst_local->cont.writev.postbuf = src_local->cont.writev.postbuf;
+ dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf;
+ dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf;
}
void
@@ -64,8 +104,8 @@ afr_writev_unwind (call_frame_t *frame, xlator_t *this)
AFR_STACK_UNWIND (writev, frame,
local->op_ret, local->op_errno,
- &local->cont.writev.prebuf,
- &local->cont.writev.postbuf,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
NULL);
}
@@ -133,12 +173,17 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
afr_local_t * local = NULL;
+ afr_private_t *priv = NULL;
call_frame_t *fop_frame = NULL;
int child_index = (long) cookie;
int call_count = -1;
int read_child = 0;
+ int ret = 0;
+ uint32_t open_fd_count = 0;
+ uint32_t write_is_append = 0;
local = frame->local;
+ priv = this->private;
read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
@@ -148,12 +193,14 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->read_child_returned = _gf_true;
}
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, prebuf, postbuf,
+ xdata);
+
local->replies[child_index].valid = 1;
local->replies[child_index].op_ret = op_ret;
local->replies[child_index].op_errno = op_errno;
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
/* stage the best case return value for unwind */
if ((local->success_count == 0) || (op_ret > local->op_ret)) {
@@ -162,12 +209,24 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
if (op_ret != -1) {
- if ((local->success_count == 0) ||
- (child_index == read_child)) {
- local->cont.writev.prebuf = *prebuf;
- local->cont.writev.postbuf = *postbuf;
- }
- local->success_count++;
+ if (xdata) {
+ ret = dict_get_uint32 (xdata,
+ GLUSTERFS_OPEN_FD_COUNT,
+ &open_fd_count);
+ if ((ret == 0) &&
+ (open_fd_count > local->open_fd_count)) {
+ local->open_fd_count = open_fd_count;
+ local->update_open_fd_count = _gf_true;
+ }
+
+ write_is_append = 0;
+ ret = dict_get_uint32 (xdata,
+ GLUSTERFS_WRITE_IS_APPEND,
+ &write_is_append);
+ if (ret || !write_is_append)
+ local->append_write = _gf_false;
+ }
+
}
}
UNLOCK (&frame->lock);
@@ -176,10 +235,23 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (call_count == 0) {
- if (!local->stable_write)
- afr_fd_report_unstable_write (this, local->fd);
+ if (local->update_open_fd_count)
+ afr_handle_open_fd_count (frame, this);
+
+ if (!local->stable_write && !local->append_write)
+ /* An appended write removes the necessity to
+ fsync() the file. This is because self-heal
+ has the logic to check for larger file when
+ the xattrs are not reliably pointing at
+ a stale file.
+ */
+ afr_fd_report_unstable_write (this, local->fd);
afr_writev_handle_short_writes (frame, this);
+ if (afr_any_fops_failed (local, priv)) {
+ //Don't unwind until post-op is complete
+ local->transaction.resume (frame, this);
+ } else {
/*
* Generally inode-write fops do transaction.unwind then
* transaction.resume, but writev needs to make sure that
@@ -191,10 +263,11 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
* completed.
*/
- fop_frame = afr_transaction_detach_fop_frame (frame);
- afr_writev_copy_outvars (frame, fop_frame);
- local->transaction.resume (frame, this);
- afr_writev_unwind (fop_frame, this);
+ fop_frame = afr_transaction_detach_fop_frame (frame);
+ afr_writev_copy_outvars (frame, fop_frame);
+ local->transaction.resume (frame, this);
+ afr_writev_unwind (fop_frame, this);
+ }
}
return 0;
}
@@ -206,6 +279,8 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this)
afr_private_t *priv = NULL;
int i = 0;
int call_count = -1;
+ dict_t *xdata = NULL;
+ GF_UNUSED int ret = 0;
local = frame->local;
priv = this->private;
@@ -229,6 +304,19 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this)
return 0;
}
+ xdata = dict_new ();
+ if (xdata) {
+ ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT,
+ sizeof (uint32_t));
+ ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND,
+ 0);
+ /* Set append_write to be true speculatively. If on any
+ server it turns not be true, we unset it in the
+ callback.
+ */
+ local->append_write = _gf_true;
+ }
+
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.pre_op[i]) {
STACK_WIND_COOKIE (frame, afr_writev_wind_cbk,
@@ -241,13 +329,16 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this)
local->cont.writev.offset,
local->cont.writev.flags,
local->cont.writev.iobref,
- NULL);
+ xdata);
if (!--call_count)
break;
}
}
+ if (xdata)
+ dict_unref (xdata);
+
return 0;
}
@@ -515,8 +606,8 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this)
if (main_frame) {
AFR_STACK_UNWIND (truncate, main_frame, local->op_ret,
local->op_errno,
- &local->cont.truncate.prebuf,
- &local->cont.truncate.postbuf,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
NULL);
}
@@ -530,14 +621,11 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
int child_index = (long) cookie;
int read_child = 0;
int call_count = -1;
- int need_unwind = 0;
local = frame->local;
- priv = this->private;
read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL);
@@ -547,38 +635,22 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->read_child_returned = _gf_true;
}
- if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG)
- afr_transaction_fop_failed (frame, this, child_index);
-
if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.truncate.prebuf = *prebuf;
- local->cont.truncate.postbuf = *postbuf;
- }
-
- if (child_index == read_child) {
- local->cont.truncate.prebuf = *prebuf;
- local->cont.truncate.postbuf = *postbuf;
- }
-
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
+ if (prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
}
- local->op_errno = op_errno;
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, prebuf, postbuf,
+ xdata);
}
UNLOCK (&frame->lock);
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
call_count = afr_frame_return (frame);
if (call_count == 0) {
+ if (local->stable_write && afr_txn_nothing_failed (frame, this))
+ local->transaction.unwind (frame, this);
+
local->transaction.resume (frame, this);
}
@@ -606,6 +678,7 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this)
}
local->call_count = call_count;
+ local->stable_write = _gf_true;
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.pre_op[i]) {
@@ -726,8 +799,8 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)
if (main_frame) {
AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret,
local->op_errno,
- &local->cont.ftruncate.prebuf,
- &local->cont.ftruncate.postbuf,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
NULL);
}
return 0;
@@ -740,14 +813,11 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
int child_index = (long) cookie;
int call_count = -1;
- int need_unwind = 0;
int read_child = 0;
local = frame->local;
- priv = this->private;
read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
@@ -757,38 +827,22 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->read_child_returned = _gf_true;
}
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.ftruncate.prebuf = *prebuf;
- local->cont.ftruncate.postbuf = *postbuf;
- }
-
- if (child_index == read_child) {
- local->cont.ftruncate.prebuf = *prebuf;
- local->cont.ftruncate.postbuf = *postbuf;
- }
-
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
+ if (prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
}
- local->op_errno = op_errno;
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, prebuf, postbuf,
+ xdata);
}
UNLOCK (&frame->lock);
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
call_count = afr_frame_return (frame);
if (call_count == 0) {
+ if (local->stable_write && afr_txn_nothing_failed (frame, this))
+ local->transaction.unwind (frame, this);
+
local->transaction.resume (frame, this);
}
@@ -816,6 +870,7 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this)
}
local->call_count = call_count;
+ local->stable_write = _gf_true;
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.pre_op[i]) {
@@ -970,8 +1025,8 @@ afr_setattr_unwind (call_frame_t *frame, xlator_t *this)
if (main_frame) {
AFR_STACK_UNWIND (setattr, main_frame, local->op_ret,
local->op_errno,
- &local->cont.setattr.preop_buf,
- &local->cont.setattr.postop_buf,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
NULL);
}
@@ -1002,29 +1057,14 @@ afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->read_child_returned = _gf_true;
}
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.setattr.preop_buf = *preop;
- local->cont.setattr.postop_buf = *postop;
- }
-
- if (child_index == read_child) {
- local->cont.setattr.preop_buf = *preop;
- local->cont.setattr.postop_buf = *postop;
- }
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, preop, postop,
+ xdata);
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
+ if ((local->success_count >= priv->wait_count)
+ && local->read_child_returned) {
+ need_unwind = 1;
}
- local->op_errno = op_errno;
}
UNLOCK (&frame->lock);
@@ -1179,8 +1219,8 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this)
if (main_frame) {
AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret,
local->op_errno,
- &local->cont.fsetattr.preop_buf,
- &local->cont.fsetattr.postop_buf,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
NULL);
}
@@ -1211,29 +1251,14 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->read_child_returned = _gf_true;
}
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.fsetattr.preop_buf = *preop;
- local->cont.fsetattr.postop_buf = *postop;
- }
-
- if (child_index == read_child) {
- local->cont.fsetattr.preop_buf = *preop;
- local->cont.fsetattr.postop_buf = *postop;
- }
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, preop, postop,
+ xdata);
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
+ if ((local->success_count >= priv->wait_count)
+ && local->read_child_returned) {
+ need_unwind = 1;
}
- local->op_errno = op_errno;
}
UNLOCK (&frame->lock);
@@ -1406,28 +1431,23 @@ int
afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int need_unwind = 0;
+ int child_index = (long) cookie;
local = frame->local;
priv = this->private;
LOCK (&frame->lock);
{
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
+ __inode_write_fop_cbk (frame, child_index, -1, this,
+ &op_ret, &op_errno, NULL, NULL,
+ xdata);
+ if (local->success_count == priv->child_count) {
+ need_unwind = 1;
}
-
- local->op_errno = op_errno;
}
UNLOCK (&frame->lock);
@@ -1596,28 +1616,24 @@ int
afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int need_unwind = 0;
+ int child_index = (long) cookie;
local = frame->local;
priv = this->private;
LOCK (&frame->lock);
{
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
+ __inode_write_fop_cbk (frame, child_index, -1, this,
+ &op_ret, &op_errno, NULL, NULL,
+ xdata);
+ if (local->success_count == priv->child_count) {
+ need_unwind = 1;
}
-
- local->op_errno = op_errno;
}
UNLOCK (&frame->lock);
@@ -1796,28 +1812,23 @@ int
afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int need_unwind = 0;
+ int child_index = (long) cookie;
local = frame->local;
priv = this->private;
LOCK (&frame->lock);
{
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
+ __inode_write_fop_cbk (frame, child_index, -1, this,
+ &op_ret, &op_errno, NULL, NULL,
+ xdata);
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
}
-
- local->op_errno = op_errno;
}
UNLOCK (&frame->lock);
@@ -1985,28 +1996,24 @@ int
afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ int call_count = -1;
+ int need_unwind = 0;
+ int child_index = (long) cookie;
local = frame->local;
priv = this->private;
LOCK (&frame->lock);
{
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
+ __inode_write_fop_cbk (frame, child_index, -1, this,
+ &op_ret, &op_errno, NULL, NULL,
+ xdata);
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
}
-
- local->op_errno = op_errno;
}
UNLOCK (&frame->lock);
@@ -2151,3 +2158,704 @@ out:
return 0;
}
+
+static int
+afr_fallocate_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret,
+ local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ NULL);
+ }
+ return 0;
+}
+
+static int
+afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int need_unwind = 0;
+ int read_child = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
+
+ LOCK (&frame->lock);
+ {
+ if (child_index == read_child) {
+ local->read_child_returned = _gf_true;
+ }
+
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, prebuf, postbuf,
+ xdata);
+
+ if ((local->success_count >= priv->wait_count)
+ && local->read_child_returned) {
+ need_unwind = 1;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ local->transaction.unwind (frame, this);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+static int
+afr_fallocate_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
+ priv->child_count);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i]) {
+ STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fallocate,
+ local->fd,
+ local->cont.fallocate.mode,
+ local->cont.fallocate.offset,
+ local->cont.fallocate.len,
+ NULL);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int
+afr_fallocate_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+static int
+afr_do_fallocate (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t * transaction_frame = NULL;
+ afr_local_t * local = NULL;
+ int op_ret = -1;
+ int op_errno = 0;
+
+ local = frame->local;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ goto out;
+ }
+
+ transaction_frame->local = local;
+ frame->local = NULL;
+
+ local->op = GF_FOP_FALLOCATE;
+
+ local->transaction.fop = afr_fallocate_wind;
+ local->transaction.done = afr_fallocate_done;
+ local->transaction.unwind = afr_fallocate_unwind;
+
+ local->transaction.main_frame = frame;
+
+ local->transaction.start = local->cont.fallocate.offset;
+ local->transaction.len = 0;
+
+ /* fallocate can modify the file size */
+ op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (op_ret < 0) {
+ op_errno = -op_ret;
+ goto out;
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret < 0) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (fallocate, frame, op_ret, op_errno, NULL,
+ NULL, NULL);
+ }
+
+ return 0;
+}
+
+int
+afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ if (afr_is_split_brain (this, fd->inode)) {
+ op_errno = EIO;
+ goto out;
+ }
+ QUORUM_CHECK(fallocate,out);
+
+ AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
+ local = frame->local;
+
+ ret = afr_local_init (local, priv, &op_errno);
+ if (ret < 0)
+ goto out;
+
+ local->cont.fallocate.mode = mode;
+ local->cont.fallocate.offset = offset;
+ local->cont.fallocate.len = len;
+
+ local->fd = fd_ref (fd);
+
+ afr_open_fd_fix (fd, this);
+
+ afr_do_fallocate (frame, this);
+
+ ret = 0;
+out:
+ if (ret < 0) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ discard */
+
+static int
+afr_discard_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ AFR_STACK_UNWIND (discard, main_frame, local->op_ret,
+ local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ NULL);
+ }
+ return 0;
+}
+
+static int
+afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int need_unwind = 0;
+ int read_child = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
+
+ LOCK (&frame->lock);
+ {
+ if (child_index == read_child) {
+ local->read_child_returned = _gf_true;
+ }
+
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, prebuf, postbuf,
+ xdata);
+
+ if ((local->success_count >= priv->wait_count)
+ && local->read_child_returned) {
+ need_unwind = 1;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ local->transaction.unwind (frame, this);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+static int
+afr_discard_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
+ priv->child_count);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i]) {
+ STACK_WIND_COOKIE (frame, afr_discard_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->discard,
+ local->fd,
+ local->cont.discard.offset,
+ local->cont.discard.len,
+ NULL);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int
+afr_discard_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+static int
+afr_do_discard (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t * transaction_frame = NULL;
+ afr_local_t * local = NULL;
+ int op_ret = -1;
+ int op_errno = 0;
+
+ local = frame->local;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ goto out;
+ }
+
+ transaction_frame->local = local;
+ frame->local = NULL;
+
+ local->op = GF_FOP_DISCARD;
+
+ local->transaction.fop = afr_discard_wind;
+ local->transaction.done = afr_discard_done;
+ local->transaction.unwind = afr_discard_unwind;
+
+ local->transaction.main_frame = frame;
+
+ local->transaction.start = local->cont.discard.offset;
+ local->transaction.len = 0;
+
+ op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (op_ret < 0) {
+ op_errno = -op_ret;
+ goto out;
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret < 0) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL,
+ NULL, NULL);
+ }
+
+ return 0;
+}
+
+int
+afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ if (afr_is_split_brain (this, fd->inode)) {
+ op_errno = EIO;
+ goto out;
+ }
+ QUORUM_CHECK(discard, out);
+
+ AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
+ local = frame->local;
+
+ ret = afr_local_init (local, priv, &op_errno);
+ if (ret < 0)
+ goto out;
+
+ local->cont.discard.offset = offset;
+ local->cont.discard.len = len;
+
+ local->fd = fd_ref (fd);
+
+ afr_open_fd_fix (fd, this);
+
+ afr_do_discard(frame, this);
+
+ ret = 0;
+out:
+ if (ret < 0) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+ }
+
+ return 0;
+}
+
+
+/* {{{ zerofill */
+
+static int
+afr_zerofill_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame) {
+ main_frame = local->transaction.main_frame;
+ }
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret,
+ local->op_errno,
+ &local->cont.zerofill.prebuf,
+ &local->cont.zerofill.postbuf,
+ NULL);
+ }
+ return 0;
+}
+
+static int
+afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int need_unwind = 0;
+ int read_child = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
+
+ LOCK (&frame->lock);
+ {
+ if (child_index == read_child) {
+ local->read_child_returned = _gf_true;
+ }
+
+ if (afr_fop_failed (op_ret, op_errno)) {
+ afr_transaction_fop_failed (frame, this, child_index);
+ }
+
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.zerofill.prebuf = *prebuf;
+ local->cont.zerofill.postbuf = *postbuf;
+ }
+
+ if (child_index == read_child) {
+ local->cont.zerofill.prebuf = *prebuf;
+ local->cont.zerofill.postbuf = *postbuf;
+ }
+
+ local->success_count++;
+
+ if ((local->success_count >= priv->wait_count)
+ && local->read_child_returned) {
+ need_unwind = 1;
+ }
+ }
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind) {
+ local->transaction.unwind (frame, this);
+ }
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+static int
+afr_zerofill_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
+ priv->child_count);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i]) {
+ STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->zerofill,
+ local->fd,
+ local->cont.zerofill.offset,
+ local->cont.zerofill.len,
+ NULL);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int
+afr_zerofill_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+static int
+afr_do_zerofill(call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ int op_ret = -1;
+ int op_errno = 0;
+
+ local = frame->local;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ goto out;
+ }
+
+ transaction_frame->local = local;
+ frame->local = NULL;
+
+ local->op = GF_FOP_ZEROFILL;
+
+ local->transaction.fop = afr_zerofill_wind;
+ local->transaction.done = afr_zerofill_done;
+ local->transaction.unwind = afr_zerofill_unwind;
+
+ local->transaction.main_frame = frame;
+
+ local->transaction.start = local->cont.zerofill.offset;
+ local->transaction.len = 0;
+
+ op_ret = afr_transaction (transaction_frame, this,
+ AFR_DATA_TRANSACTION);
+ if (op_ret < 0) {
+ op_errno = -op_ret;
+ goto out;
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret < 0) {
+ if (transaction_frame) {
+ AFR_STACK_DESTROY (transaction_frame);
+ }
+ AFR_STACK_UNWIND (zerofill, frame, op_ret, op_errno, NULL,
+ NULL, NULL);
+ }
+
+ return 0;
+}
+
+int
+afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ if (afr_is_split_brain (this, fd->inode)) {
+ op_errno = EIO;
+ goto out;
+ }
+ QUORUM_CHECK(zerofill, out);
+
+ AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
+ local = frame->local;
+
+ ret = afr_local_init (local, priv, &op_errno);
+ if (ret < 0) {
+ goto out;
+ }
+ local->cont.zerofill.offset = offset;
+ local->cont.zerofill.len = len;
+
+ local->fd = fd_ref (fd);
+
+ afr_open_fd_fix (fd, this);
+
+ afr_do_zerofill(frame, this);
+
+ ret = 0;
+out:
+ if (ret < 0) {
+ if (transaction_frame) {
+ AFR_STACK_DESTROY (transaction_frame);
+ }
+ AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL,
+ NULL, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+
diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h
index ed11079fd..8e93ca44a 100644
--- a/xlators/cluster/afr/src/afr-inode-write.h
+++ b/xlators/cluster/afr/src/afr-inode-write.h
@@ -68,4 +68,15 @@ int32_t
afr_fremovexattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, const char *name, dict_t *xdata);
+int
+afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata);
+
+int
+afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata);
+
+int
+afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata);
#endif /* __INODE_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
index e091a7939..060d78f35 100644
--- a/xlators/cluster/afr/src/afr-lk-common.c
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -57,11 +57,15 @@
int
afr_entry_lockee_cmp (const void *l1, const void *l2)
{
- const afr_entry_lockee_t *r1 = l1;
- const afr_entry_lockee_t *r2 = l2;
- int ret = 0;
-
- ret = uuid_compare (r1->loc.gfid, r2->loc.gfid);
+ const afr_entry_lockee_t *r1 = l1;
+ const afr_entry_lockee_t *r2 = l2;
+ int ret = 0;
+ uuid_t gfid1 = {0};
+ uuid_t gfid2 = {0};
+
+ loc_gfid ((loc_t*)&r1->loc, gfid1);
+ loc_gfid ((loc_t*)&r2->loc, gfid2);
+ ret = uuid_compare (gfid1, gfid2);
/*Entrylks with NULL basename are the 'smallest'*/
if (ret == 0) {
if (!r1->basename)
@@ -75,7 +79,6 @@ afr_entry_lockee_cmp (const void *l1, const void *l2)
return -1;
else
return 1;
-
}
int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
@@ -556,19 +559,23 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
afr_private_t *priv = NULL;
- int i = 0;
+ afr_inodelk_t *inodelk = NULL;
priv = this->private;
local = frame->local;
int_lock = &local->internal_lock;
- int_lock->inodelk_lock_count = 0;
- int_lock->lock_op_ret = -1;
- int_lock->lock_op_errno = 0;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- for (i = 0; i < priv->child_count; i++) {
- int_lock->inode_locked_nodes[i] = 0;
- }
+ inodelk->lock_count = 0;
+ int_lock->lk_attempted_count = 0;
+ int_lock->lock_op_ret = -1;
+ int_lock->lock_op_errno = 0;
+
+ memset (inodelk->locked_nodes, 0,
+ sizeof (*inodelk->locked_nodes) * priv->child_count);
+ memset (int_lock->locked_nodes, 0,
+ sizeof (*int_lock->locked_nodes) * priv->child_count);
return 0;
}
@@ -649,7 +656,9 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
int32_t child_index = (long)cookie;
+ afr_private_t *priv = NULL;
local = frame->local;
int_lock = &local->internal_lock;
@@ -658,14 +667,18 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
AFR_UNLOCK_OP, NULL, op_ret,
op_errno, child_index);
+ priv = this->private;
+
if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
- gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on %d "
- "unlock by %s", local->loc.path, child_index,
+ gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s "
+ "with lock owner %s", local->loc.path,
+ priv->children[child_index]->name,
lkowner_utoa (&frame->root->lk_owner));
}
- int_lock->inode_locked_nodes[child_index] &= LOCKED_NO;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ inodelk->locked_nodes[child_index] &= LOCKED_NO;
if (local->transaction.eager_lock)
local->transaction.eager_lock[child_index] = 0;
@@ -679,6 +692,7 @@ static int
afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
struct gf_flock flock = {0,};
@@ -694,12 +708,14 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
priv = this->private;
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
flock.l_type = F_UNLCK;
full_flock.l_type = F_UNLCK;
- call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes,
+ call_count = afr_locked_nodes_count (inodelk->locked_nodes,
priv->child_count);
int_lock->lk_call_count = call_count;
@@ -715,8 +731,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
fd_ctx = afr_fd_ctx_get (local->fd, this);
for (i = 0; i < priv->child_count; i++) {
- if ((int_lock->inode_locked_nodes[i] & LOCKED_YES)
- != LOCKED_YES)
+ if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
continue;
if (local->fd) {
@@ -757,7 +772,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long)i,
priv->children[i],
priv->children[i]->fops->finodelk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
F_SETLK, flock_use, NULL);
if (!--call_count)
@@ -772,7 +787,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long)i,
priv->children[i],
priv->children[i]->fops->inodelk,
- this->name, &local->loc,
+ int_lock->domain, &local->loc,
F_SETLK, &flock, NULL);
if (!--call_count)
@@ -858,7 +873,7 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this)
(void *) (long) i,
priv->children[index],
priv->children[index]->fops->entrylk,
- this->name,
+ int_lock->domain,
&int_lock->lockee[lockee_no].loc,
int_lock->lockee[lockee_no].basename,
ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
@@ -961,6 +976,7 @@ static int
afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
@@ -971,10 +987,10 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- memcpy (int_lock->inode_locked_nodes,
- int_lock->locked_nodes,
- priv->child_count);
- int_lock->inodelk_lock_count = int_lock->lock_count;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ memcpy (inodelk->locked_nodes, int_lock->locked_nodes,
+ sizeof (*inodelk->locked_nodes) * priv->child_count);
+ inodelk->lock_count = int_lock->lock_count;
break;
case AFR_ENTRY_RENAME_TRANSACTION:
@@ -1025,6 +1041,7 @@ int
afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
struct gf_flock flock = {0,};
@@ -1039,10 +1056,15 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
priv = this->private;
child_index = cookie % priv->child_count;
lockee_no = cookie / priv->child_count;
+ is_entrylk = afr_is_entrylk (int_lock, local->transaction.type);
+
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = int_lock->lk_flock.l_type;
+ if (!is_entrylk) {
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
+ flock.l_type = inodelk->flock.l_type;
+ }
if (local->fd) {
ret = fd_ctx_get (local->fd, this, &ctx);
@@ -1064,8 +1086,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
}
if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
- is_entrylk = afr_is_entrylk (int_lock, local->transaction.type);
-
if ((is_entrylk && int_lock->entrylk_lock_count == 0) ||
(!is_entrylk && int_lock->lock_count == 0)) {
gf_log (this->name, GF_LOG_INFO,
@@ -1114,7 +1134,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
(void *) (long) child_index,
priv->children[child_index],
priv->children[child_index]->fops->finodelk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
F_SETLKW, &flock, NULL);
} else {
@@ -1127,7 +1147,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
(void *) (long) child_index,
priv->children[child_index],
priv->children[child_index]->fops->inodelk,
- this->name, &local->loc,
+ int_lock->domain, &local->loc,
F_SETLKW, &flock, NULL);
}
@@ -1148,7 +1168,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
(void *) (long) cookie,
priv->children[child_index],
priv->children[child_index]->fops->fentrylk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
int_lock->lockee[lockee_no].basename,
ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
} else {
@@ -1161,7 +1181,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
(void *) (long) cookie,
priv->children[child_index],
priv->children[child_index]->fops->entrylk,
- this->name,
+ int_lock->domain,
&int_lock->lockee[lockee_no].loc,
int_lock->lockee[lockee_no].basename,
ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
@@ -1316,6 +1336,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
local->op_errno = EINVAL;
int_lock->lock_op_errno = EINVAL;
+ afr_unlock (frame, this);
return -1;
}
@@ -1390,6 +1411,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
int call_count = 0;
int child_index = (long) cookie;
@@ -1398,6 +1420,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
int_lock = &local->internal_lock;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION,
AFR_LOCK_OP, NULL, op_ret,
@@ -1423,9 +1446,8 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (local->transaction.eager_lock)
local->transaction.eager_lock[child_index] = 0;
} else {
- int_lock->inode_locked_nodes[child_index]
- |= LOCKED_YES;
- int_lock->inodelk_lock_count++;
+ inodelk->locked_nodes[child_index] |= LOCKED_YES;
+ inodelk->lock_count++;
if (local->transaction.eager_lock &&
local->transaction.eager_lock[child_index] &&
@@ -1448,8 +1470,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (this->name, GF_LOG_TRACE,
"Last inode locking reply received");
/* all locks successful. Proceed to call FOP */
- if (int_lock->inodelk_lock_count ==
- int_lock->lk_expected_count) {
+ if (inodelk->lock_count == int_lock->lk_expected_count) {
gf_log (this->name, GF_LOG_TRACE,
"All servers locked. Calling the cbk");
int_lock->lock_op_ret = 0;
@@ -1473,6 +1494,7 @@ int
afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
afr_fd_ctx_t *fd_ctx = NULL;
@@ -1488,11 +1510,13 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
priv = this->private;
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = int_lock->lk_flock.l_type;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
+ flock.l_type = inodelk->flock.l_type;
- full_flock.l_type = int_lock->lk_flock.l_type;
+ full_flock.l_type = inodelk->flock.l_type;
initialize_inodelk_variables (frame, this);
@@ -1508,6 +1532,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
local->op_errno = EINVAL;
int_lock->lock_op_errno = EINVAL;
+ afr_unlock (frame, this);
ret = -1;
goto out;
}
@@ -1566,7 +1591,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long) i,
priv->children[i],
priv->children[i]->fops->finodelk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
F_SETLK, flock_use, NULL);
if (!--call_count)
@@ -1588,7 +1613,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long) i,
priv->children[i],
priv->children[i]->fops->inodelk,
- this->name, &local->loc,
+ int_lock->domain, &local->loc,
F_SETLK, &flock, NULL);
if (!--call_count)
@@ -2112,29 +2137,38 @@ out:
return ret;
}
-void
-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src,
+int
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
unsigned int child_count)
{
- afr_local_t *dst_local = NULL;
- afr_local_t *src_local = NULL;
- afr_internal_lock_t *dst_lock = NULL;
- afr_internal_lock_t *src_lock = NULL;
+ afr_local_t *dst_local = NULL;
+ afr_local_t *src_local = NULL;
+ afr_internal_lock_t *dst_lock = NULL;
+ afr_internal_lock_t *src_lock = NULL;
+ afr_inodelk_t *dst_inodelk = NULL;
+ afr_inodelk_t *src_inodelk = NULL;
+ int ret = -1;
- dst_local = dst->local;
- dst_lock = &dst_local->internal_lock;
src_local = src->local;
src_lock = &src_local->internal_lock;
- if (src_lock->inode_locked_nodes) {
- memcpy (dst_lock->inode_locked_nodes,
- src_lock->inode_locked_nodes,
- sizeof (*dst_lock->inode_locked_nodes) * child_count);
- memset (src_lock->inode_locked_nodes, 0,
- sizeof (*src_lock->inode_locked_nodes) * child_count);
+ src_inodelk = afr_get_inodelk (src_lock, dom);
+ dst_local = dst->local;
+ dst_lock = &dst_local->internal_lock;
+ dst_inodelk = afr_get_inodelk (dst_lock, dom);
+ if (!dst_inodelk || !src_inodelk)
+ goto out;
+ if (src_inodelk->locked_nodes) {
+ memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes,
+ sizeof (*dst_inodelk->locked_nodes) * child_count);
+ memset (src_inodelk->locked_nodes, 0,
+ sizeof (*src_inodelk->locked_nodes) * child_count);
}
dst_lock->transaction_lk_type = src_lock->transaction_lk_type;
dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type;
- dst_lock->inodelk_lock_count = src_lock->inodelk_lock_count;
- src_lock->inodelk_lock_count = 0;
+ dst_inodelk->lock_count = src_inodelk->lock_count;
+ src_inodelk->lock_count = 0;
+ ret = 0;
+out:
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index e01ab366f..73594f265 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -41,7 +41,10 @@ enum gf_afr_mem_types_ {
gf_afr_mt_shd_event_t,
gf_afr_mt_time_t,
gf_afr_mt_pos_data_t,
- gf_afr_mt_reply_t,
+ gf_afr_mt_reply_t,
+ gf_afr_mt_stats_t,
+ gf_afr_mt_shd_crawl_event_t,
+ gf_afr_mt_uint64_t,
gf_afr_mt_end
};
#endif
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
index 1721fd270..83846f152 100644
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c
+++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
@@ -100,7 +100,7 @@ sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this,
}
sh_private_cleanup (sh_frame, this);
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
GF_ASSERT (!last_loop_frame);
//loop_finish should have happened and the old_loop should be NULL
gf_log (this->name, GF_LOG_DEBUG,
@@ -143,7 +143,7 @@ sh_loop_finish (call_frame_t *loop_frame, xlator_t *this)
}
if (loop_sh && loop_sh->data_lock_held) {
- afr_sh_data_unlock (loop_frame, this,
+ afr_sh_data_unlock (loop_frame, this, this->name,
sh_destroy_frame);
} else {
sh_destroy_frame (loop_frame, this);
@@ -214,7 +214,7 @@ sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this,
goto out;
//We want the frame to have same lk_owner as sh_frame
//so that locks translator allows conflicting locks
- new_loop_local = afr_local_copy (local, this);
+ new_loop_local = afr_self_heal_local_init (local, this);
if (!new_loop_local)
goto out;
new_loop_frame->local = new_loop_local;
@@ -273,10 +273,10 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset,
new_loop_sh->offset = offset;
new_loop_sh->block_size = sh->block_size;
afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size,
- _gf_true, sh_loop_lock_success, sh_loop_lock_failure);
+ _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure);
return 0;
out:
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
if (old_loop_frame)
sh_loop_finish (old_loop_frame, this);
sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM);
@@ -307,8 +307,9 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this,
sh_priv->loops_running--;
offset = sh_priv->offset;
block_size = sh->block_size;
- while ((!sh->eof_reached) && (0 == sh->op_failed) &&
- (sh_priv->loops_running < priv->data_self_heal_window_size)
+ while ((!sh->eof_reached) &&
+ (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) &&
+ (sh_priv->loops_running < priv->data_self_heal_window_size)
&& (sh_priv->offset < sh->file_size)) {
loop++;
@@ -327,7 +328,8 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this,
if (0 == loop) {
//loop finish does unlock, but the erasing of the pending
//xattrs needs to happen before that so do not finish the loop
- if (is_driver_done && !sh->op_failed)
+ if (is_driver_done &&
+ !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC))
goto driver_done;
if (old_loop_frame) {
sh_loop_finish (old_loop_frame, this);
@@ -338,7 +340,7 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this,
//If we have more loops to form we should finish previous loop after
//the next loop lock
while (loop--) {
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
// op failed in other loop, stop spawning more loops
if (old_loop_frame) {
sh_loop_finish (old_loop_frame, this);
@@ -384,7 +386,7 @@ sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame
}
if (op_ret == -1) {
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_set_error (sh, op_errno);
if (loop_frame) {
sh_loop_finish (loop_frame, this);
@@ -432,16 +434,16 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
priv->children[child_index]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_set_error (loop_sh, op_errno);
} else if (op_ret < loop_local->cont.writev.vector->iov_len) {
- gf_log(this->name, GF_LOG_ERROR,
- "incomplete write to %s on subvolume %s "
- "(expected %lu, returned %d)", sh_local->loc.path,
- priv->children[child_index]->name,
- loop_local->cont.writev.vector->iov_len, op_ret);
- sh->op_failed = 1;
- }
+ gf_log (this->name, GF_LOG_ERROR,
+ "incomplete write to %s on subvolume %s "
+ "(expected %lu, returned %d)", sh_local->loc.path,
+ priv->children[child_index]->name,
+ loop_local->cont.writev.vector->iov_len, op_ret);
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
+ }
call_count = afr_frame_return (loop_frame);
@@ -514,7 +516,7 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie,
if (op_ret <= 0) {
if (op_ret < 0) {
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
gf_log (this->name, GF_LOG_ERROR, "read failed on %d "
"for %s reason :%s", sh->source,
sh_local->loc.path, strerror (errno));
@@ -624,7 +626,7 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
"checksum on %s failed on subvolume %s (%s)",
sh_local->loc.path, priv->children[child_index]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
} else {
memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH,
strong_checksum, MD5_DIGEST_LENGTH);
@@ -662,7 +664,8 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
}
UNLOCK (&sh_priv->lock);
- if (write_needed && !sh->op_failed) {
+ if (write_needed &&
+ !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
sh_loop_read (loop_frame, this);
} else {
sh_loop_return (sh_frame, this, loop_frame,
@@ -751,14 +754,15 @@ out:
return sh_priv;
}
-void
-afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src,
+int
+afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom,
unsigned int child_count)
{
afr_local_t *dst_local = NULL;
afr_self_heal_t *dst_sh = NULL;
afr_local_t *src_local = NULL;
afr_self_heal_t *src_sh = NULL;
+ int ret = -1;
dst_local = dst->local;
dst_sh = &dst_local->self_heal;
@@ -766,9 +770,12 @@ afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src,
src_sh = &src_local->self_heal;
GF_ASSERT (src_sh->data_lock_held);
GF_ASSERT (!dst_sh->data_lock_held);
- afr_lk_transfer_datalock (dst, src, child_count);
+ ret = afr_lk_transfer_datalock (dst, src, dom, child_count);
+ if (ret)
+ return ret;
src_sh->data_lock_held = _gf_false;
dst_sh->data_lock_held = _gf_true;
+ return 0;
}
int
@@ -790,7 +797,10 @@ afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this,
ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame);
if (ret)
goto out;
- afr_sh_transfer_lock (first_loop_frame, sh_frame, priv->child_count);
+ ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name,
+ priv->child_count);
+ if (ret)
+ goto out;
sh->private = afr_sh_priv_init ();
if (!sh->private) {
ret = -1;
@@ -800,7 +810,7 @@ afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this,
ret = 0;
out:
if (ret) {
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
sh_loop_driver_done (sh_frame, this, NULL);
}
return 0;
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 2538f4c8b..ef92b4205 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -18,6 +18,28 @@
#include "afr-self-heal.h"
#include "pump.h"
+#define ADD_FMT_STRING(msg, off, sh_str, status, print_log) \
+ do { \
+ if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) { \
+ off += snprintf (msg + off, sizeof (msg) - off, \
+ " "sh_str" self heal %s,", \
+ get_sh_completion_status (status));\
+ print_log = 1; \
+ } \
+ } while (0)
+
+#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log) \
+ do { \
+ if (AFR_SELF_HEAL_SYNC_BEGIN == status || \
+ AFR_SELF_HEAL_FAILED == status) { \
+ off += snprintf (msg + off, sizeof (msg) - off, \
+ " "sh_str" self heal %s,", \
+ get_sh_completion_status (status));\
+ print_log = 1; \
+ } \
+ } while (0)
+
+
void
afr_sh_reset (call_frame_t *frame, xlator_t *this)
{
@@ -141,9 +163,8 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
GF_FREE (buf);
}
-void
-afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this,
- const char *loc)
+char*
+afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this)
{
afr_private_t * priv = this->private;
char *buf = NULL;
@@ -173,10 +194,8 @@ afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this,
+ (child_count * child_count * pending_entry_strlen);
buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char);
- if (!buf) {
- buf = "";
+ if (!buf)
goto out;
- }
ptr = buf;
ptr += sprintf (ptr, "%s", msg);
@@ -192,11 +211,27 @@ afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this,
ptr += sprintf (ptr, "%s", matrix_end);
out:
+ return buf;
+}
+
+void
+afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this,
+ const char *loc)
+{
+ char *buf = NULL;
+ char *free_ptr = NULL;
+
+ buf = afr_get_pending_matrix_str (pending_matrix, this);
+ if (buf)
+ free_ptr = buf;
+ else
+ buf = "";
+
+
gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'"
" (possible split-brain). Please delete the file from all but "
"the preferred subvolume.%s", loc, buf);
- if (buf)
- GF_FREE (buf);
+ GF_FREE (free_ptr);
return;
}
@@ -466,6 +501,8 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses,
{
int i = 0;
int biggest_witness = -1;
+ int biggest_witness_idx = -1;
+ int biggest_witness_cnt = -1;
GF_ASSERT (witnesses);
GF_ASSERT (characters);
@@ -475,10 +512,21 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses,
if (characters[i].type != AFR_NODE_FOOL)
continue;
- if (biggest_witness < witnesses[i])
+ if (biggest_witness < witnesses[i]) {
biggest_witness = witnesses[i];
+ biggest_witness_idx = i;
+ biggest_witness_cnt = 1;
+ continue;
+ }
+
+ if (biggest_witness == witnesses[i])
+ biggest_witness_cnt++;
}
- return biggest_witness;
+
+ if (biggest_witness_cnt != 1)
+ return -1;
+
+ return biggest_witness_idx;
}
int
@@ -506,10 +554,84 @@ afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses,
return nsources;
}
+
+int
+afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx)
+{
+ if (idx >= 0 && idx < child_count) {
+ sources[idx] = 1;
+ return 1;
+ }
+ return 0;
+}
+
+
+static int
+afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children,
+ int child_count)
+{
+ int idx = -1;
+ int i = -1;
+ int child = -1;
+ uint64_t max_size = 0;
+ uint64_t min_size = 0;
+ int num_children = 0;
+
+ for (i = 0; i < child_count; i++) {
+ if (success_children[i] == -1)
+ break;
+
+ child = success_children[i];
+ if (bufs[child].ia_size > max_size) {
+ max_size = bufs[child].ia_size;
+ idx = child;
+ }
+
+ if ((num_children == 0) || (bufs[child].ia_size < min_size)) {
+ min_size = bufs[child].ia_size;
+ }
+
+ num_children++;
+ }
+
+ /* If sizes are same for all of them, finding sources will have to
+ * happen with pending changelog. So return -1
+ */
+ if ((num_children > 1) && (min_size == max_size))
+ return -1;
+ return idx;
+}
+
+
+static int
+afr_find_newest_file (struct iatt *bufs, int32_t *success_children,
+ int child_count)
+{
+ int idx = -1;
+ int i = -1;
+ int child = -1;
+ uint64_t max_ctime = 0;
+
+ for (i = 0; i < child_count; i++) {
+ if (success_children[i] == -1)
+ break;
+
+ child = success_children[i];
+ if (bufs[child].ia_ctime > max_ctime) {
+ max_ctime = bufs[child].ia_ctime;
+ idx = child;
+ }
+ }
+
+ return idx;
+}
+
+
static int
afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix,
afr_node_character *characters,
- int child_count)
+ int32_t *success_children,
+ int child_count, struct iatt *bufs)
{
int32_t biggest_witness = 0;
int nsources = 0;
@@ -517,6 +639,11 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix,
GF_ASSERT (child_count > 0);
+ biggest_witness = afr_find_largest_file_size (bufs, success_children,
+ child_count);
+ if (biggest_witness != -1)
+ goto found;
+
witnesses = GF_CALLOC (child_count, sizeof (*witnesses),
gf_afr_mt_int32_t);
if (NULL == witnesses) {
@@ -529,9 +656,15 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix,
biggest_witness = afr_find_biggest_witness_among_fools (witnesses,
characters,
child_count);
- nsources = afr_mark_fool_as_source_by_witness (sources, witnesses,
- characters, child_count,
- biggest_witness);
+ if (biggest_witness != -1)
+ goto found;
+
+ biggest_witness = afr_find_newest_file (bufs, success_children,
+ child_count);
+
+found:
+ nsources = afr_mark_fool_as_source_by_idx (sources, child_count,
+ biggest_witness);
out:
GF_FREE (witnesses);
return nsources;
@@ -875,7 +1008,8 @@ afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix,
nsources = afr_mark_biggest_of_fools_as_source (sources,
pending_matrix,
characters,
- child_count);
+ success_children,
+ child_count, bufs);
}
out:
@@ -1012,14 +1146,13 @@ afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this)
afr_sh_reset (frame, this);
- if (local->govinda_gOvinda) {
+ if (local->unhealable) {
gf_log (this->name, GF_LOG_DEBUG,
"split brain found, aborting selfheal of %s",
local->loc.path);
- sh->op_failed = 1;
}
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
sh->completion_cbk (frame, this);
} else {
gf_log (this->name, GF_LOG_TRACE,
@@ -1251,7 +1384,7 @@ out:
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, "
"reason: %s", local->loc.path, strerror (-ret));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
}
afr_sh_missing_entries_finish (frame, this);
}
@@ -1266,7 +1399,7 @@ afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this,
local = frame->local;
sh = &local->self_heal;
if (op_ret < 0)
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_missing_entries_finish (frame, this);
return 0;
}
@@ -1290,7 +1423,7 @@ sh_missing_entries_create (call_frame_t *frame, xlator_t *this)
if (!afr_valid_ia_type (type)) {
gf_log (this->name, GF_LOG_ERROR,
"%s: unknown file type: 0%o", local->loc.path, type);
- local->govinda_gOvinda = 1;
+ afr_set_local_for_unhealable (local);
afr_sh_missing_entries_finish (frame, this);
goto out;
}
@@ -1323,8 +1456,9 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this,
loc = &local->loc;
if (op_ret < 0) {
- if (op_errno == EIO)
- local->govinda_gOvinda = 1;
+ if (op_errno == EIO) {
+ afr_set_local_for_unhealable (local);
+ }
// EIO can happen if finding the fresh parent dir failed
goto out;
}
@@ -1386,7 +1520,7 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this,
}
return;
out:
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_set_error (sh, op_errno);
afr_sh_missing_entries_finish (frame, this);
return;
@@ -1470,7 +1604,7 @@ afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child,
LOCK (&frame->lock);
{
afr_sh_set_error (sh, EIO);
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
}
UNLOCK (&frame->lock);
}
@@ -1552,7 +1686,7 @@ afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this)
sh = &local->self_heal;
priv = this->private;
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
afr_sh_missing_entries_finish (frame, this);
} else {
if (afr_gfid_missing_count (this->name, sh->fresh_children,
@@ -1766,7 +1900,7 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this,
priv->child_count, ENOENT);
if (fresh_child_enoents == fresh_parent_count) {
afr_sh_set_error (sh, ENOENT);
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_purge_entry (frame, this);
} else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children,
priv->child_count, local->loc.path,
@@ -1780,14 +1914,14 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this,
afr_sh_purge_stale_entry (frame, this);
} else {
op_errno = EIO;
- local->govinda_gOvinda = 1;
+ afr_set_local_for_unhealable (local);
goto fail;
}
return;
fail:
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_set_error (sh, op_errno);
afr_sh_missing_entries_finish (frame, this);
return;
@@ -1858,8 +1992,8 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this,
out:
afr_sh_set_error (sh, op_errno);
- sh->op_failed = 1;
- afr_sh_missing_entries_finish (frame, this);
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
+ afr_sh_missing_entries_finish (frame, this);
return;
}
@@ -1962,7 +2096,7 @@ afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame,
if (int_lock->lock_op_ret < 0) {
gf_log (this->name, GF_LOG_INFO,
"Non blocking entrylks failed.");
- sh->op_failed = -1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_missing_entries_done (frame, this);
} else {
@@ -1997,6 +2131,7 @@ afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
int_lock->lk_basename = base_name;
int_lock->lk_loc = loc;
int_lock->lock_cbk = lock_cbk;
+ int_lock->domain = this->name;
int_lock->lockee_count = 0;
afr_init_entry_lockee (&int_lock->lockee[0], local, loc,
@@ -2041,19 +2176,29 @@ out:
static int
afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this)
{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+
+ sh->sh_type_in_action = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY;
+
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
+
afr_self_heal_parent_entrylk (frame, this,
- afr_sh_post_nb_entrylk_missing_entry_sh_cbk);
+ afr_sh_post_nb_entrylk_missing_entry_sh_cbk);
return 0;
}
afr_local_t*
-afr_local_copy (afr_local_t *l, xlator_t *this)
+afr_self_heal_local_init (afr_local_t *l, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *lc = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *shc = NULL;
- int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *lc = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_self_heal_t *shc = NULL;
+ int ret = 0;
priv = this->private;
@@ -2076,13 +2221,23 @@ afr_local_copy (afr_local_t *l, xlator_t *this)
shc->forced_merge = sh->forced_merge;
shc->background = sh->background;
shc->type = sh->type;
+ shc->data_sh_info = "";
+ shc->metadata_sh_info = "";
uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req);
- if (l->loc.path)
- loc_copy (&lc->loc, &l->loc);
+ if (l->loc.path) {
+ ret = loc_copy (&lc->loc, &l->loc);
+ if (ret < 0)
+ goto out;
+ }
lc->child_up = memdup (l->child_up,
sizeof (*lc->child_up) * priv->child_count);
+ if (!lc->child_up) {
+ ret = -1;
+ goto out;
+ }
+
if (l->xattr_req)
lc->xattr_req = dict_ref (l->xattr_req);
@@ -2090,59 +2245,25 @@ afr_local_copy (afr_local_t *l, xlator_t *this)
lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode);
if (l->cont.lookup.xattr)
lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr);
- if (l->internal_lock.inode_locked_nodes)
- lc->internal_lock.inode_locked_nodes =
- memdup (l->internal_lock.inode_locked_nodes,
- sizeof (*lc->internal_lock.inode_locked_nodes) * priv->child_count);
- else
- lc->internal_lock.inode_locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
-
- if (l->internal_lock.locked_nodes)
- lc->internal_lock.locked_nodes =
- memdup (l->internal_lock.locked_nodes,
- sizeof (*lc->internal_lock.locked_nodes) * priv->child_count);
- else
- lc->internal_lock.locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
-
- for (i = 0; i < l->internal_lock.lockee_count; i++) {
- loc_copy (&lc->internal_lock.lockee[i].loc,
- &l->internal_lock.lockee[i].loc);
-
- lc->internal_lock.lockee[i].locked_count =
- l->internal_lock.lockee[i].locked_count;
-
- if (l->internal_lock.lockee[i].basename)
- lc->internal_lock.lockee[i].basename =
- gf_strdup (l->internal_lock.lockee[i].basename);
-
- if (l->internal_lock.lockee[i].locked_nodes) {
- lc->internal_lock.lockee[i].locked_nodes =
- memdup (l->internal_lock.lockee[i].locked_nodes,
- sizeof (*lc->internal_lock.lockee[i].locked_nodes) *
- priv->child_count);
- } else {
- lc->internal_lock.lockee[i].locked_nodes =
- GF_CALLOC (priv->child_count,
- sizeof (*lc->internal_lock.lockee[i].locked_nodes),
- gf_afr_mt_char);
- }
+ lc->internal_lock.locked_nodes =
+ GF_CALLOC (sizeof (*l->internal_lock.locked_nodes),
+ priv->child_count, gf_afr_mt_char);
+ if (!lc->internal_lock.locked_nodes) {
+ ret = -1;
+ goto out;
}
- lc->internal_lock.lockee_count = l->internal_lock.lockee_count;
-
- lc->internal_lock.inodelk_lock_count =
- l->internal_lock.inodelk_lock_count;
- lc->internal_lock.entrylk_lock_count =
- l->internal_lock.entrylk_lock_count;
+ ret = afr_inodelk_init (&lc->internal_lock.inodelk[0],
+ this->name, priv->child_count);
+ if (ret)
+ goto out;
out:
+ if (ret) {
+ afr_local_cleanup (lc, this);
+ lc = NULL;
+ }
return lc;
}
@@ -2155,32 +2276,28 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
afr_local_t * orig_frame_local = NULL;
afr_self_heal_t * orig_frame_sh = NULL;
char sh_type_str[256] = {0,};
+ gf_loglevel_t loglevel = 0;
priv = this->private;
local = bgsh_frame->local;
sh = &local->self_heal;
- if (local->govinda_gOvinda) {
+ if (local->unhealable) {
afr_set_split_brain (this, sh->inode, SPB, SPB);
- sh->op_failed = 1;
}
afr_self_heal_type_str_get (sh, sh_type_str,
sizeof(sh_type_str));
- if (sh->op_failed) {
- gf_loglevel_t loglevel = GF_LOG_ERROR;
- if (priv->shd.iamshd)
- loglevel = GF_LOG_DEBUG;
-
- gf_log (this->name, loglevel, "background %s self-heal "
- "failed on %s", sh_type_str, local->loc.path);
-
+ if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) {
+ loglevel = GF_LOG_ERROR;
+ } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) {
+ loglevel = GF_LOG_INFO;
} else {
- gf_log (this->name, GF_LOG_DEBUG, "background %s self-heal "
- "completed on %s", sh_type_str, local->loc.path);
-
+ loglevel = GF_LOG_DEBUG;
}
+ afr_log_self_heal_completion_status (local, loglevel);
+
FRAME_SU_UNDO (bgsh_frame, afr_local_t);
if (!sh->unwound && sh->unwind) {
@@ -2188,7 +2305,7 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
orig_frame_sh = &orig_frame_local->self_heal;
orig_frame_sh->actual_sh_started = _gf_true;
sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno,
- sh->op_failed);
+ is_self_heal_failed (sh, AFR_CHECK_ALL));
}
if (sh->background) {
@@ -2237,7 +2354,7 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode)
afr_set_lk_owner (sh_frame, this, sh_frame->root);
afr_set_low_priority (sh_frame);
- sh_local = afr_local_copy (local, this);
+ sh_local = afr_self_heal_local_init (local, this);
if (!sh_local)
goto out;
sh_frame->local = sh_local;
@@ -2302,6 +2419,8 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode)
sh->do_gfid_self_heal = _gf_false;
}
+ sh->sh_type_in_action = AFR_SELF_HEAL_INVALID;
+
FRAME_SU_DO (sh_frame, afr_local_t);
if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) {
afr_self_heal_missing_entries (sh_frame, this);
@@ -2511,9 +2630,183 @@ out:
GF_FREE (erase_xattr);
if (ret < 0) {
- sh->op_failed = _gf_true;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
finish (frame, this);
}
return 0;
}
+
+void
+afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status)
+{
+ xlator_t *this = NULL;
+ afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status);
+ afr_self_heal_type sh_type_in_action = sh->sh_type_in_action;
+ this = THIS;
+
+ if (!sh) {
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal"
+ "Structure");
+ goto out;
+ }
+
+ switch (sh_type_in_action) {
+ case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY:
+ sh_status->gfid_or_missing_entry_self_heal = status;
+ break;
+ case AFR_SELF_HEAL_METADATA:
+ sh_status->metadata_self_heal = status;
+ break;
+ case AFR_SELF_HEAL_DATA:
+ sh_status->data_self_heal = status;
+ break;
+ case AFR_SELF_HEAL_ENTRY:
+ sh_status->entry_self_heal = status;
+ break;
+ case AFR_SELF_HEAL_INVALID:
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid"
+ "self heal type in action");
+ break;
+ }
+out:
+ return;
+}
+
+void
+afr_set_local_for_unhealable (afr_local_t *local)
+{
+ afr_self_heal_t *sh = NULL;
+
+ sh = &local->self_heal;
+
+ local->unhealable = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
+}
+
+int
+is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type)
+{
+ afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status;
+ afr_self_heal_type sh_type_in_action = AFR_SELF_HEAL_INVALID;
+ afr_self_heal_status status = AFR_SELF_HEAL_FAILED;
+ xlator_t *this = NULL;
+ int sh_failed = 0;
+
+ this = THIS;
+
+ if (!sh) {
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal "
+ "structure");
+ sh_failed = 1;
+ goto out;
+ }
+
+ if (type == AFR_CHECK_ALL) {
+ if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED)
+ || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED)
+ || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED)
+ || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED))
+ sh_failed = 1;
+ } else if (type == AFR_CHECK_SPECIFIC) {
+ sh_type_in_action = sh->sh_type_in_action;
+ switch (sh_type_in_action) {
+ case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY:
+ status = sh_status.gfid_or_missing_entry_self_heal;
+ break;
+ case AFR_SELF_HEAL_METADATA:
+ status = sh_status.metadata_self_heal;
+ break;
+ case AFR_SELF_HEAL_ENTRY:
+ status = sh_status.entry_self_heal;
+ break;
+ case AFR_SELF_HEAL_DATA:
+ status = sh_status.data_self_heal;
+ break;
+ case AFR_SELF_HEAL_INVALID:
+ status = AFR_SELF_HEAL_NOT_ATTEMPTED;
+ break;
+ }
+ if (status == AFR_SELF_HEAL_FAILED)
+ sh_failed = 1;
+
+ }
+
+out:
+ return sh_failed;
+}
+
+char *
+get_sh_completion_status (afr_self_heal_status status)
+{
+
+ char *not_attempted = " is not attempted";
+ char *failed = " failed";
+ char *started = " is started";
+ char *sync_begin = " is successfully completed";
+ char *result = " has unknown status";
+
+ switch (status)
+ {
+ case AFR_SELF_HEAL_NOT_ATTEMPTED:
+ result = not_attempted;
+ break;
+ case AFR_SELF_HEAL_FAILED:
+ result = failed;
+ break;
+ case AFR_SELF_HEAL_STARTED:
+ result = started;
+ break;
+ case AFR_SELF_HEAL_SYNC_BEGIN:
+ result = sync_begin;
+ break;
+ }
+
+ return result;
+
+}
+
+void
+afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl)
+{
+
+ char sh_log[4096] = {0};
+ afr_self_heal_t *sh = &local->self_heal;
+ afr_sh_status_for_all_type all_status = sh->afr_all_sh_status;
+ xlator_t *this = NULL;
+ size_t off = 0;
+ int data_sh = 0;
+ int metadata_sh = 0;
+ int print_log = 0;
+
+ this = THIS;
+
+ ADD_FMT_STRING (sh_log, off, "gfid or missing entry",
+ all_status.gfid_or_missing_entry_self_heal, print_log);
+ ADD_FMT_STRING_SYNC (sh_log, off, "metadata",
+ all_status.metadata_self_heal, print_log);
+ if (sh->background) {
+ ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data",
+ all_status.data_self_heal, print_log);
+ } else {
+ ADD_FMT_STRING_SYNC (sh_log, off, "foreground data",
+ all_status.data_self_heal, print_log);
+ }
+ ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal,
+ print_log);
+
+ if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal &&
+ strcmp (sh->data_sh_info, "") && sh->data_sh_info )
+ data_sh = 1;
+ if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.metadata_self_heal &&
+ strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info)
+ metadata_sh = 1;
+
+ if (!print_log)
+ return;
+
+ gf_log (this->name, loglvl, "%s %s %s on %s", sh_log,
+ ((data_sh == 1) ? sh->data_sh_info : ""),
+ ((metadata_sh == 1) ? sh->metadata_sh_info : ""),
+ local->loc.path);
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h
index 035fce543..473264776 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.h
+++ b/xlators/cluster/afr/src/afr-self-heal-common.h
@@ -15,13 +15,6 @@
#define AFR_SH_MIN_PARTICIPANTS 2
typedef enum {
- AFR_SELF_HEAL_ENTRY,
- AFR_SELF_HEAL_METADATA,
- AFR_SELF_HEAL_DATA,
- AFR_SELF_HEAL_INVALID = -1,
-} afr_self_heal_type;
-
-typedef enum {
AFR_LOOKUP_FAIL_CONFLICTS = 1,
AFR_LOOKUP_FAIL_MISSING_GFIDS = 2,
} afr_lookup_flags_t;
@@ -98,13 +91,13 @@ int
afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this,
int child_index);
int
-afr_sh_data_unlock (call_frame_t *frame, xlator_t *this,
+afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom,
afr_lock_cbk_t lock_cbk);
afr_local_t *
-afr_local_copy (afr_local_t *l, xlator_t *this);
+afr_self_heal_local_init (afr_local_t *l, xlator_t *this);
int
afr_sh_data_lock (call_frame_t *frame, xlator_t *this,
- off_t start, off_t len, gf_boolean_t block,
+ off_t start, off_t len, gf_boolean_t block, char *dom,
afr_lock_cbk_t success_handler,
afr_lock_cbk_t failure_handler);
void
@@ -133,4 +126,19 @@ int
afr_sh_erase_pending (call_frame_t *frame, xlator_t *this,
afr_transaction_type type, afr_fxattrop_cbk_t cbk,
int (*finish)(call_frame_t *frame, xlator_t *this));
+
+void
+afr_set_local_for_unhealable (afr_local_t *local);
+
+int
+is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type);
+
+void
+afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status);
+
+void
+afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t logl);
+
+char*
+afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this);
#endif /* __AFR_SELF_HEAL_COMMON_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 2f63ed27d..9de26ee56 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -156,6 +156,25 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this)
}
int
+afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ if (sh->sh_dom_lock_held)
+ afr_sh_data_unlock (frame, this, priv->sh_domain,
+ afr_sh_data_close);
+ else
+ afr_sh_data_close (frame, this);
+ return 0;
+}
+
+int
afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *statpre,
struct iatt *statpost, dict_t *xdata)
@@ -283,31 +302,45 @@ afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this)
//Fun fact, lock_cbk is being used for both lock & unlock
int
-afr_sh_data_unlock (call_frame_t *frame, xlator_t *this,
+afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom,
afr_lock_cbk_t lock_cbk)
{
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int ret = 0;
local = frame->local;
int_lock = &local->internal_lock;
sh = &local->self_heal;
+ priv = this->private;
- GF_ASSERT (sh->data_lock_held);
-
- sh->data_lock_held = _gf_false;
+ if (strcmp (dom, this->name) == 0) {
+ sh->data_lock_held = _gf_false;
+ } else if (strcmp (dom, priv->sh_domain) == 0) {
+ sh->sh_dom_lock_held = _gf_false;
+ } else {
+ ret = -1;
+ goto out;
+ }
int_lock->lock_cbk = lock_cbk;
+ int_lock->domain = dom;
afr_unlock (frame, this);
+out:
+ if (ret) {
+ int_lock->lock_op_ret = -1;
+ int_lock->lock_cbk (frame, this);
+ }
return 0;
}
int
afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
local = frame->local;
sh = &local->self_heal;
@@ -316,9 +349,9 @@ afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
"finishing data selfheal of %s", local->loc.path);
if (sh->data_lock_held)
- afr_sh_data_unlock (frame, this, afr_sh_data_close);
+ afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock);
else
- afr_sh_data_close (frame, this);
+ afr_sh_dom_unlock (frame, this);
return 0;
}
@@ -335,11 +368,8 @@ afr_sh_data_fail (call_frame_t *frame, xlator_t *this)
gf_log (this->name, GF_LOG_DEBUG,
"finishing failed data selfheal of %s", local->loc.path);
- sh->op_failed = 1;
- if (sh->data_lock_held)
- afr_sh_data_unlock (frame, this, afr_sh_data_close);
- else
- afr_sh_data_close (frame, this);
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
+ afr_sh_data_finish (frame, this);
return 0;
}
@@ -362,13 +392,13 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
"log failed on %s for subvol %s, reason: %s",
local->loc.path, priv->children[child_index]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
}
call_count = afr_frame_return (frame);
if (call_count == 0) {
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
if (sh->old_loop_frame)
sh_loop_finish (sh->old_loop_frame, this);
sh->old_loop_frame = NULL;
@@ -380,7 +410,7 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
goto out;
}
GF_ASSERT (sh->old_loop_frame);
- afr_sh_data_lock (frame, this, 0, 0, _gf_true,
+ afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name,
afr_post_sh_big_lock_success,
afr_post_sh_big_lock_failure);
}
@@ -418,7 +448,7 @@ afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
priv->children[child_index]->name, strerror (op_errno));
LOCK (&frame->lock);
{
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
}
UNLOCK (&frame->lock);
if (sh->old_loop_frame)
@@ -428,7 +458,7 @@ afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_count = afr_frame_return (frame);
if (call_count == 0) {
- if (sh->op_failed)
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC))
afr_sh_data_fail (frame, this);
else
afr_sh_data_erase_pending (frame, this);
@@ -604,7 +634,7 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path,
priv->children[child_index]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
} else {
gf_log (this->name, GF_LOG_DEBUG,
"ftruncate of %s on subvolume %s completed",
@@ -617,7 +647,7 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_count = afr_frame_return (frame);
if (call_count == 0) {
- if (sh->op_failed)
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC))
afr_sh_data_fail (frame, this);
else
afr_sh_data_sync_prepare (frame, this);
@@ -697,6 +727,199 @@ out:
return ret;
}
+char*
+afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ char num[1024] = {0};
+ size_t len = 0;
+ char *sizes_str = NULL;
+ size_t off = 0;
+ char *fmt_str = "%llu bytes on %s, ";
+ char *child_down = " %s,";
+ char *child_unknown = " %s,";
+ int down_child_present = 0;
+ int down_count = 0;
+ int unknown_count = 0;
+ int unknown_child_present = 0;
+ char *down_subvol_1 = " down subvolume is ";
+ char *unknown_subvol_1 = " unknown subvolume is ";
+ char *down_subvol_2 = " down subvolumes are ";
+ char *unknown_subvol_2 = " unknown subvolumes are ";
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] == 1) {
+ len += snprintf (num, sizeof (num), fmt_str,
+ (unsigned long long) bufs[i].ia_size,
+ priv->children[i]->name);
+ } else if (local->child_up[i] == 0) {
+ len += snprintf (num, sizeof (num), child_down,
+ priv->children[i]->name);
+ if (!down_child_present)
+ down_child_present = 1;
+ down_count ++;
+ } else if (local->child_up[i] == -1) {
+ len += snprintf (num, sizeof (num), child_unknown,
+ priv->children[i]->name);
+ if (!unknown_child_present)
+ unknown_child_present = 1;
+ unknown_count++;
+ }
+
+ }
+
+ if (down_child_present) {
+ if (down_count > 1)
+ len += snprintf (num, sizeof (num), "%s",
+ down_subvol_2);
+ else
+ len += snprintf (num, sizeof (num), "%s",
+ down_subvol_1);
+ }
+ if (unknown_child_present) {
+ if (unknown_count > 1)
+ len += snprintf (num, sizeof (num), "%s",
+ unknown_subvol_2);
+ else
+ len += snprintf (num, sizeof (num), "%s",
+ unknown_subvol_1);
+ }
+
+ len++;//for '\0'
+
+ sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char);
+
+ if (!sizes_str)
+ return NULL;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] == 1) {
+ off += snprintf (sizes_str + off, len - off, fmt_str,
+ (unsigned long long) bufs[i].ia_size,
+ priv->children[i]->name);
+ }
+ }
+
+ if (down_child_present) {
+ if (down_count > 1) {
+ off += snprintf (sizes_str + off, len - off, "%s",
+ down_subvol_2);
+ } else {
+ off += snprintf (sizes_str + off, len - off, "%s",
+ down_subvol_1);
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] == 0) {
+ off += snprintf (sizes_str + off, len - off, child_down,
+ priv->children[i]->name);
+ }
+ }
+
+ if (unknown_child_present) {
+ if (unknown_count > 1) {
+ off += snprintf (sizes_str + off, len - off, "%s",
+ unknown_subvol_2);
+ } else {
+ off += snprintf (sizes_str + off, len - off, "%s",
+ unknown_subvol_1);
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] == -1) {
+ off += snprintf (sizes_str + off, len - off,
+ child_unknown,
+ priv->children[i]->name);
+
+ }
+ }
+
+ return sizes_str;
+}
+
+char*
+afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ char num[1024] = {0};
+ size_t len = 0;
+ char *sinks_str = NULL;
+ char *temp_str = " to sinks ";
+ char *str_format = " %s,";
+ char off = 0;
+
+ priv = this->private;
+
+ len += snprintf (num, sizeof (num), "%s", temp_str);
+ for (i = 0; i < priv->child_count; i++) {
+ if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) {
+ len += snprintf (num, sizeof (num), str_format,
+ priv->children[i]->name);
+ }
+ }
+
+ len ++;
+
+ sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char);
+
+ if (!sinks_str)
+ return NULL;
+
+ off += snprintf (sinks_str + off, len - off, "%s", temp_str);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) {
+ off += snprintf (sinks_str + off, len - off,
+ str_format,
+ priv->children[i]->name);
+ }
+ }
+
+ return sinks_str;
+
+}
+
+
+void
+afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this)
+{
+ char *pending_matrix_str = NULL;
+ char *sizes_str = NULL;
+ char *sinks_str = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix,
+ this);
+ if (!pending_matrix_str)
+ pending_matrix_str = "";
+
+ sizes_str = afr_get_sizes_str (local, sh->buf, this);
+ if (!sizes_str)
+ sizes_str = "";
+
+ sinks_str = afr_get_sinks_str (this, local, sh);
+ if (!sinks_str)
+ sinks_str = "";
+
+ gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with "
+ "%s data %s", priv->children[sh->source]->name, sinks_str,
+ sizes_str, pending_matrix_str);
+
+ if (pending_matrix_str && strcmp (pending_matrix_str, ""))
+ GF_FREE (pending_matrix_str);
+
+ if (sizes_str && strcmp (sizes_str, ""))
+ GF_FREE (sizes_str);
+}
+
void
afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
{
@@ -718,7 +941,7 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
if (sh->background && sh->unwind && !sh->unwound) {
sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno,
- sh->op_failed);
+ is_self_heal_failed (sh, AFR_CHECK_SPECIFIC));
sh->unwound = _gf_true;
}
@@ -737,6 +960,7 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
sh->active_sinks);
sh->actual_sh_started = _gf_true;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN);
afr_sh_data_trim_sinks (frame, this);
}
@@ -817,6 +1041,7 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this)
}
afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]);
} else {
+ afr_set_data_sh_info_str (local, sh, this);
if (nsources == 0) {
gf_log (this->name, GF_LOG_DEBUG,
"No self-heal needed for %s",
@@ -1174,6 +1399,22 @@ afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this)
}
int
+afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+
+ sh->sh_dom_lock_held = _gf_true;
+ afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name,
+ afr_sh_data_big_lock_success,
+ afr_sh_data_fail);
+ return 0;
+}
+
+int
afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
@@ -1237,9 +1478,11 @@ afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
}
int
-afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t len)
+afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom,
+ off_t start, off_t len)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
local = frame->local;
@@ -1250,11 +1493,14 @@ afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t le
afr_set_lock_number (frame, this);
- int_lock->lk_flock.l_start = start;
- int_lock->lk_flock.l_len = len;
- int_lock->lk_flock.l_type = F_WRLCK;
int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk;
+ int_lock->domain = dom;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ inodelk->flock.l_start = start;
+ inodelk->flock.l_len = len;
+ inodelk->flock.l_type = F_WRLCK;
+
afr_nonblocking_inodelk (frame, this);
return 0;
@@ -1298,7 +1544,7 @@ afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this)
int
afr_sh_data_lock (call_frame_t *frame, xlator_t *this,
off_t start, off_t len, gf_boolean_t block,
- afr_lock_cbk_t success_handler,
+ char *dom, afr_lock_cbk_t success_handler,
afr_lock_cbk_t failure_handler)
{
afr_local_t * local = NULL;
@@ -1310,7 +1556,7 @@ afr_sh_data_lock (call_frame_t *frame, xlator_t *this,
sh->data_lock_success_handler = success_handler;
sh->data_lock_failure_handler = failure_handler;
sh->data_lock_block = block;
- return afr_sh_data_lock_rec (frame, this, start, len);
+ return afr_sh_data_lock_rec (frame, this, dom, start, len);
}
int
@@ -1322,7 +1568,6 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_private_t *priv = NULL;
int call_count = 0;
int child_index = 0;
- gf_boolean_t block = _gf_true;
local = frame->local;
sh = &local->self_heal;
@@ -1342,7 +1587,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path,
priv->children[child_index]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
} else {
gf_log (this->name, GF_LOG_TRACE,
"open of %s succeeded on child %s",
@@ -1355,7 +1600,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_count = afr_frame_return (frame);
if (call_count == 0) {
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
afr_sh_data_fail (frame, this);
return 0;
}
@@ -1364,15 +1609,8 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"fd for %s opened, commencing sync",
local->loc.path);
- /*
- * The read and write self-heal trigger codepaths do not provide
- * an unwind callback. We run a trylock in these codepaths
- * because we are sensitive to locking latency.
- */
- block = sh->unwind ? _gf_true : _gf_false;
- afr_sh_data_lock (frame, this, 0, 0, block,
- afr_sh_data_big_lock_success,
- afr_sh_data_fail);
+ afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain,
+ afr_sh_dom_lock_success, afr_sh_data_fail);
}
return 0;
@@ -1477,18 +1715,31 @@ afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv)
int
afr_self_heal_data (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = this->private;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = this->private;
+ int ret = -1;
local = frame->local;
sh = &local->self_heal;
+ sh->sh_type_in_action = AFR_SELF_HEAL_DATA;
+
if (afr_can_start_data_self_heal (sh, priv)) {
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
+ ret = afr_inodelk_init (&local->internal_lock.inodelk[1],
+ priv->sh_domain, priv->child_count);
+ if (ret < 0) {
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
+ afr_sh_data_done (frame, this);
+ return 0;
+ }
+
if (IA_ISREG (sh->type)) {
afr_sh_data_open (frame, this);
} else {
afr_sh_data_lock (frame, this, 0, 0, _gf_true,
+ this->name,
afr_sh_non_reg_lock_success,
afr_sh_data_fail);
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index c3c9f9fca..53491a1d7 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -162,7 +162,7 @@ afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)
sh = &local->self_heal;
if (sh->entries_skipped) {
- sh->op_failed = _gf_true;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
goto out;
}
afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION,
@@ -613,6 +613,19 @@ out:
return 0;
}
+static gf_boolean_t
+can_skip_entry_self_heal (char *name, loc_t *parent_loc)
+{
+ if (strcmp (name, ".") == 0) {
+ return _gf_true;
+ } else if (strcmp (name, "..") == 0) {
+ return _gf_true;
+ } else if (loc_is_root (parent_loc) &&
+ (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0)) {
+ return _gf_true;
+ }
+ return _gf_false;
+}
int
afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this,
@@ -640,13 +653,7 @@ afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this,
sh->expunge_done = afr_sh_entry_expunge_entry_done;
name = entry->d_name;
-
- if ((strcmp (name, ".") == 0)
- || (strcmp (name, "..") == 0)) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "skipping inspection of %s under %s",
- name, local->loc.path);
+ if (can_skip_entry_self_heal (name, &local->loc)) {
op_ret = 0;
goto out;
}
@@ -799,7 +806,7 @@ afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this)
active_src = next_active_sink (frame, this, sh->active_source);
sh->active_source = active_src;
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
goto out;
}
@@ -1256,6 +1263,35 @@ afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this,
gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed",
impunge_local->loc.path);
+ /*
+ * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY :
+ *
+ * Problem:
+ * While a brick is down in a replica pair, lets say the user creates
+ * one file(file-A) and a hard link to that file(h-file-A). After the
+ * brick comes back up, entry self-heal is attempted on parent dir of
+ * these two files. As part of readdir in self-heal it reads both the
+ * entries file-A and h-file-A for both of them it does name less lookup
+ * to check if there are any hardlinks already present in the
+ * destination brick. It finds that there are no hard links already
+ * present for files file-A, h-file-A. Self-heal does mknods for both
+ * file-A and h-file-A. This leads to file-A and h-file-A not being
+ * hardlinks anymore.
+ *
+ * Fix: (More like shrinking of race-window, the race itself is still
+ * present in posix-mknod).
+ * If mknod comes with the presence of GLUSTERFS_INTERNAL_FOP_KEY then
+ * posix_mknod checks if there are already any gfid-links and does
+ * link() instead of mknod. There still can be a race where two
+ * posix_mknods same gfid see that
+ * gfid-link file is not present and proceeds with mknods and result in
+ * two different files with same gfid.
+ */
+ ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+ if (ret)
+ gf_log (this->name, GF_LOG_INFO, "%s: %s set failed",
+ impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY);
+
STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
(void *) (long) child_index,
priv->children[child_index],
@@ -1872,12 +1908,7 @@ afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this,
active_src = sh->active_source;
sh->impunge_done = afr_sh_entry_impunge_entry_done;
- if ((strcmp (entry->d_name, ".") == 0)
- || (strcmp (entry->d_name, "..") == 0)) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "skipping inspection of %s under %s",
- entry->d_name, local->loc.path);
+ if (can_skip_entry_self_heal (entry->d_name, &local->loc)) {
op_ret = 0;
goto out;
}
@@ -1946,7 +1977,7 @@ afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie,
local->loc.path,
priv->children[active_src]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
} else {
gf_log (this->name, GF_LOG_TRACE,
"readdir of %s on subvolume %s complete",
@@ -2019,7 +2050,7 @@ afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this)
active_src = next_active_source (frame, this, sh->active_source);
sh->active_source = active_src;
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
afr_sh_entry_finish (frame, this);
return 0;
}
@@ -2068,7 +2099,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path,
priv->children[child_index]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
}
}
UNLOCK (&frame->lock);
@@ -2076,7 +2107,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_count = afr_frame_return (frame);
if (call_count == 0) {
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
afr_sh_entry_finish (frame, this);
return 0;
}
@@ -2209,6 +2240,7 @@ afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this)
local->loc.path);
sh->actual_sh_started = _gf_true;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN);
afr_sh_entry_open (frame, this);
return 0;
@@ -2231,7 +2263,7 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this,
priv = this->private;
if (op_ret < 0) {
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_set_error (sh, op_errno);
afr_sh_entry_finish (frame, this);
goto out;
@@ -2294,7 +2326,7 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this)
if (int_lock->lock_op_ret < 0) {
gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks "
"failed for %s.", local->loc.path);
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_entry_done (frame, this);
} else {
@@ -2313,14 +2345,18 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this)
int
afr_self_heal_entry (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
afr_private_t *priv = NULL;
-
+ afr_self_heal_t *sh = NULL;
priv = this->private;
local = frame->local;
+ sh = &local->self_heal;
+
+ sh->sh_type_in_action = AFR_SELF_HEAL_ENTRY;
if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) {
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
afr_sh_entrylk (frame, this, &local->loc, NULL,
afr_sh_post_nonblocking_entry_cbk);
} else {
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index cc85d9b9f..fd5da6cfd 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -97,7 +97,7 @@ afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this)
local = frame->local;
sh = &local->self_heal;
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_metadata_finish (frame, this);
return 0;
}
@@ -406,6 +406,140 @@ afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
+static void
+afr_set_metadata_sh_info_str (afr_local_t *local, afr_self_heal_t *sh,
+ xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ char num[1024] = {0};
+ size_t len = 0;
+ char *string = NULL;
+ size_t off = 0;
+ char *source_child = " from source %s to";
+ char *format = " %s, ";
+ char *string_msg = " metadata self heal";
+ char *pending_matrix_str = NULL;
+ int down_child_present = 0;
+ int unknown_child_present = 0;
+ char *down_subvol_1 = " down subvolume is ";
+ char *unknown_subvol_1 = " unknown subvolume is";
+ char *down_subvol_2 = " down subvolumes are ";
+ char *unknown_subvol_2 = " unknown subvolumes are ";
+ int down_count = 0;
+ int unknown_count = 0;
+
+ priv = this->private;
+
+ pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix,
+ this);
+
+ if (!pending_matrix_str)
+ pending_matrix_str = "";
+
+ len += snprintf (num, sizeof (num), "%s", string_msg);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((sh->source == i) && (local->child_up[i] == 1)) {
+ len += snprintf (num, sizeof (num), source_child,
+ priv->children[i]->name);
+ } else if ((local->child_up[i] == 1) && (sh->sources[i] == 0)) {
+ len += snprintf (num, sizeof (num), format,
+ priv->children[i]->name);
+ } else if (local->child_up[i] == 0) {
+ len += snprintf (num, sizeof (num), format,
+ priv->children[i]->name);
+ if (!down_child_present)
+ down_child_present = 1;
+ down_count++;
+ } else if (local->child_up[i] == -1) {
+ len += snprintf (num, sizeof (num), format,
+ priv->children[i]->name);
+ if (!unknown_child_present)
+ unknown_child_present = 1;
+ unknown_count++;
+ }
+ }
+
+ if (down_child_present) {
+ if (down_count > 1) {
+ len += snprintf (num, sizeof (num), "%s",
+ down_subvol_2);
+ } else {
+ len += snprintf (num, sizeof (num), "%s",
+ down_subvol_1);
+ }
+ }
+ if (unknown_child_present) {
+ if (unknown_count > 1) {
+ len += snprintf (num, sizeof (num), "%s",
+ unknown_subvol_2);
+ } else {
+ len += snprintf (num, sizeof (num), "%s",
+ unknown_subvol_1);
+ }
+ }
+
+ len ++;
+
+ string = GF_CALLOC (len, sizeof (char), gf_common_mt_char);
+ if (!string)
+ return;
+
+ off += snprintf (string + off, len - off, "%s", string_msg);
+ for (i=0; i < priv->child_count; i++) {
+ if ((sh->source == i) && (local->child_up[i] == 1))
+ off += snprintf (string + off, len - off, source_child,
+ priv->children[i]->name);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((local->child_up[i] == 1)&& (sh->sources[i] == 0))
+ off += snprintf (string + off, len - off, format,
+ priv->children[i]->name);
+ }
+
+ if (down_child_present) {
+ if (down_count > 1) {
+ off += snprintf (string + off, len - off, "%s",
+ down_subvol_2);
+ } else {
+ off += snprintf (string + off, len - off, "%s",
+ down_subvol_1);
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] == 0)
+ off += snprintf (string + off, len - off, format,
+ priv->children[i]->name);
+ }
+
+ if (unknown_child_present) {
+ if (unknown_count > 1) {
+ off += snprintf (string + off, len - off, "%s",
+ unknown_subvol_2);
+ } else {
+ off += snprintf (string + off, len - off, "%s",
+ unknown_subvol_1);
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] == -1)
+ off += snprintf (string + off, len - off, format,
+ priv->children[i]->name);
+ }
+
+ gf_asprintf (&sh->metadata_sh_info, "%s metadata %s,", string,
+ pending_matrix_str);
+
+ if (pending_matrix_str && strcmp (pending_matrix_str, ""))
+ GF_FREE (pending_matrix_str);
+
+ if (string && strcmp (string, ""))
+ GF_FREE (string);
+}
int
afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)
@@ -436,6 +570,8 @@ afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)
sh->active_sinks);
sh->actual_sh_started = _gf_true;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN);
+ afr_set_metadata_sh_info_str (local, sh, this);
STACK_WIND (frame, afr_sh_metadata_getxattr_cbk,
priv->children[source],
priv->children[source]->fops->getxattr,
@@ -461,7 +597,7 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this,
priv = this->private;
if (op_ret < 0) {
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_set_error (sh, op_errno);
afr_sh_metadata_finish (frame, this);
goto out;
@@ -579,19 +715,22 @@ int
afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
local = frame->local;
int_lock = &local->internal_lock;
+ int_lock->domain = this->name;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK;
afr_set_lock_number (frame, this);
- int_lock->lk_flock.l_start = LLONG_MAX - 1;
- int_lock->lk_flock.l_len = 0;
- int_lock->lk_flock.l_type = F_WRLCK;
+ inodelk->flock.l_start = LLONG_MAX - 1;
+ inodelk->flock.l_len = 0;
+ inodelk->flock.l_type = F_WRLCK;
int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk;
afr_nonblocking_inodelk (frame, this);
@@ -618,8 +757,10 @@ afr_self_heal_metadata (call_frame_t *frame, xlator_t *this)
local = frame->local;
sh = &local->self_heal;
+ sh->sh_type_in_action = AFR_SELF_HEAL_METADATA;
if (afr_can_start_metadata_self_heal (sh, priv)) {
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
afr_sh_metadata_lock (frame, this);
} else {
afr_sh_metadata_done (frame, this);
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 37bc224f5..1b48a1bca 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -25,7 +25,8 @@ typedef enum {
typedef enum {
HEAL = 1,
- INFO
+ INFO,
+ STATISTICS_TO_BE_HEALED,
} shd_crawl_op;
typedef struct shd_dump {
@@ -84,6 +85,33 @@ _loc_assign_gfid_path (loc_t *loc)
}
void
+_destroy_crawl_event_data (void *data)
+{
+ shd_crawl_event_t *crawl_event = NULL;
+
+ if (!data)
+ goto out;
+
+ crawl_event = (shd_crawl_event_t *)data;
+ GF_FREE (crawl_event->start_time_str);
+ GF_FREE (crawl_event->end_time_str);
+
+out:
+ return;
+}
+
+void
+_destroy_shd_event_data (void *data)
+{
+ shd_event_t *event = NULL;
+ if (!data)
+ goto out;
+ event = (shd_event_t*)data;
+ GF_FREE (event->path);
+out:
+ return;
+}
+void
shd_cleanup_event (void *event)
{
shd_event_t *shd_event = event;
@@ -128,6 +156,123 @@ _build_index_loc (xlator_t *this, loc_t *loc, char *name, loc_t *parent)
}
int
+_add_crawl_stats_to_dict (xlator_t *this, dict_t *output, int child,
+ shd_crawl_event_t *shd_event, struct timeval *tv)
+{
+ int ret = 0;
+ uint64_t count = 0;
+ char key[256] = {0};
+ int xl_id = 0;
+ uint64_t healed_count = 0;
+ uint64_t split_brain_count = 0;
+ uint64_t heal_failed_count = 0;
+ char *start_time_str = NULL;
+ char *end_time_str = NULL;
+ char *crawl_type = NULL;
+ int progress = -1;
+
+ healed_count = shd_event->healed_count;
+ split_brain_count = shd_event->split_brain_count;
+ heal_failed_count = shd_event->heal_failed_count;
+ start_time_str = shd_event->start_time_str;
+ end_time_str = shd_event->end_time_str;
+ crawl_type = shd_event->crawl_type;
+
+ if (!start_time_str) {
+ ret = -1;
+ goto out;
+ }
+
+
+ ret = dict_get_int32 (output, this->name, &xl_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "xl does not have id");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child);
+ ret = dict_get_uint64 (output, key, &count);
+
+ snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_uint64(output, key, healed_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "healed_count to outout");
+ goto out;
+ }
+ snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_uint64 (output, key, split_brain_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "split_brain_count to outout");
+ goto out;
+ }
+ snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_dynstr (output, key, gf_strdup (crawl_type));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "crawl_type to output");
+ goto out;
+ }
+ snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_uint64 (output, key, heal_failed_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "healed_failed_count to outout");
+ goto out;
+ }
+ snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_dynstr (output, key, gf_strdup(start_time_str));
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "crawl_start_time to outout");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64,
+ xl_id, child, count);
+
+ if (!end_time_str)
+ end_time_str = "Could not determine the end time";
+ ret = dict_set_dynstr (output, key, gf_strdup(end_time_str));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "crawl_end_time to outout");
+ goto out;
+ }
+ snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64,
+ xl_id, child, count);
+
+ if (shd_event->crawl_inprogress == _gf_true)
+ progress = 1;
+ else
+ progress = 0;
+
+ ret = dict_set_int32 (output, key, progress);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "inprogress to outout");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "statistics-%d-%d-count",xl_id, child);
+ ret = dict_set_uint64 (output, key, count + 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not increment the "
+ "counter.");
+ goto out;
+ }
+out:
+ return ret;
+}
+
+int
_add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path,
struct timeval *tv, gf_boolean_t dyn)
{
@@ -233,6 +378,20 @@ out:
}
int
+_add_crawl_event_statistics_to_dict (circular_buffer_t *cb, void *data)
+{
+ int ret = 0;
+ shd_dump_t *dump_data = NULL;
+ shd_crawl_event_t *shd_event = NULL;
+
+ dump_data = data;
+ shd_event = cb->data;
+ ret = _add_crawl_stats_to_dict (dump_data->this, dump_data->dict,
+ dump_data->child, shd_event, &cb->tv);
+ return ret;
+}
+
+int
_add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child)
{
shd_dump_t dump_data = {0};
@@ -244,6 +403,26 @@ _add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child)
return 0;
}
+
+int
+_add_statistics_to_dict (xlator_t *this, dict_t *dict, int child)
+{
+ shd_dump_t dump_data = {0};
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ dump_data.this = this;
+ dump_data.dict = dict;
+ dump_data.child = child;
+ eh_dump (shd->statistics[child], &dump_data,
+ _add_crawl_event_statistics_to_dict);
+ return 0;
+
+}
+
void
_remove_stale_index (xlator_t *this, xlator_t *readdir_xl,
loc_t *parent, char *fname)
@@ -269,6 +448,46 @@ out:
}
int
+_count_hard_links_under_base_indices_dir (xlator_t *this,
+ afr_crawl_data_t *crawl_data,
+ gf_dirent_t *entry, loc_t *childloc,
+ loc_t *parentloc, struct iatt *iattr)
+{
+ xlator_t *readdir_xl = crawl_data->readdir_xl;
+ struct iatt parent = {0};
+ int ret = 0;
+ dict_t *output = NULL;
+ int xl_id = 0;
+ char key[256] = {0};
+ int child = -1;
+ uint64_t hardlinks = 0;
+
+ output = crawl_data->op_data;
+ child = crawl_data->child;
+
+ ret = syncop_lookup (readdir_xl, childloc, NULL, iattr, NULL, &parent);
+ if (ret)
+ goto out;
+
+ ret = dict_get_int32 (output, this->name, &xl_id);
+ if (ret)
+ goto out;
+
+ snprintf (key, sizeof (key), "%d-%d-hardlinks", xl_id, child);
+ ret = dict_get_uint64 (output, key, &hardlinks);
+
+ /*Removing the count of base_entry under indices/base_indicies and
+ * entry under indices/xattrop */
+ hardlinks = hardlinks + iattr->ia_nlink - 2;
+ ret = dict_set_uint64 (output, key, hardlinks);
+ if (ret)
+ goto out;
+
+out:
+ return ret;
+}
+
+int
_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data,
gf_dirent_t *entry,
loc_t *childloc, loc_t *parentloc, struct iatt *iattr)
@@ -307,16 +526,18 @@ _crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child,
int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp,
afr_crawl_data_t *crawl_data)
{
- int ret = 0;
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- eh_t *eh = NULL;
- char *path = NULL;
- char gfid_str[64] = {0};
- shd_event_t *event = NULL;
- int32_t sh_failed = 0;
- gf_boolean_t split_brain = 0;
- int32_t actual_sh_done = 0;
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ eh_t *eh = NULL;
+ char *path = NULL;
+ char gfid_str[64] = {0};
+ shd_event_t *event = NULL;
+ int32_t sh_failed = 0;
+ gf_boolean_t split_brain = 0;
+ int32_t actual_sh_done = 0;
+ shd_crawl_event_t **shd_crawl_event = NULL;
+
priv = this->private;
shd = &priv->shd;
if (crawl_data->crawl == INDEX) {
@@ -343,16 +564,19 @@ _crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child,
ret = dict_get_int32 (xattr_rsp, "actual-sh-done", &actual_sh_done);
}
- split_brain = afr_is_split_brain (this, child->inode);
+ shd_crawl_event = (shd_crawl_event_t**)(shd->crawl_events);
+ split_brain = afr_is_split_brain (this, child->inode);
if ((op_ret < 0 && op_errno == EIO) || split_brain) {
eh = shd->split_brain;
+ shd_crawl_event[crawl_data->child]->split_brain_count += 1;
} else if ((op_ret < 0) || sh_failed) {
eh = shd->heal_failed;
+ shd_crawl_event[crawl_data->child]->heal_failed_count += 1;
} else if (actual_sh_done == 1) {
- eh = shd->healed;
+ eh = shd->healed;
+ shd_crawl_event[crawl_data->child]->healed_count += 1;
}
-
ret = -1;
if (eh != NULL) {
@@ -408,10 +632,20 @@ _self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *ent
struct iatt parentbuf = {0};
int ret = 0;
dict_t *xattr_rsp = NULL;
+ dict_t *xattr_req = NULL;
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_int32 (xattr_req, "allow-sh-for-running-transaction", 1);
gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path);
- ret = syncop_lookup (this, child, NULL,
+ ret = syncop_lookup (this, child, xattr_req,
iattr, &xattr_rsp, &parentbuf);
_crawl_post_sh_action (this, parent, child, ret, errno, xattr_rsp,
crawl_data);
@@ -420,6 +654,9 @@ _self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *ent
if (ret == 0)
ret = _link_inode_update_loc (this, child, iattr);
+out:
+ if (xattr_req)
+ dict_unref(xattr_req);
return ret;
}
@@ -528,12 +765,20 @@ _do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl,
status = "Started self-heal";
_do_self_heal_on_subvol (this, i,
crawl);
- } else if (output) {
+ } else if (output && (op == INFO)) {
status = "";
afr_start_crawl (this, i, INDEX,
_add_summary_to_dict,
output, _gf_false, 0,
NULL);
+ } else if (output &&
+ (op == STATISTICS_TO_BE_HEALED)) {
+ status = "";
+ afr_start_crawl (this, i,
+ INDEX_TO_BE_HEALED,
+ _count_hard_links_under_base_indices_dir,
+ output, _gf_false,
+ 0, NULL);
}
}
if (output) {
@@ -567,8 +812,103 @@ _get_index_summary_on_local_subvols (xlator_t *this, dict_t *output)
return _do_crawl_op_on_local_subvols (this, INDEX, INFO, output);
}
+void
+afr_fill_completed_crawl_statistics_to_dict (xlator_t *this, dict_t *dict)
+{
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int i = 0;
+ priv = this->private;
+ shd= &priv->shd;
+ for (i = 0; i < priv->child_count; i++) {
+ if (shd->pos[i] != AFR_POS_LOCAL)
+ continue;
+ _add_statistics_to_dict (this, dict, i);
+ }
+
+ return ;
+}
+
+static void
+reset_crawl_event (shd_crawl_event_t *crawl_event)
+{
+ crawl_event->healed_count = 0;
+ crawl_event->split_brain_count = 0;
+ crawl_event->heal_failed_count = 0;
+ GF_FREE (crawl_event->start_time_str);
+ crawl_event->start_time_str = NULL;
+ crawl_event->end_time_str = NULL;
+ crawl_event->crawl_type = NULL;
+ crawl_event->crawl_inprogress = _gf_false;
+ return;
+}
+
+static void
+afr_copy_crawl_event_struct (shd_crawl_event_t *src, shd_crawl_event_t *dst)
+{
+ dst->healed_count = src->healed_count;
+ dst->split_brain_count = src->split_brain_count;
+ dst->heal_failed_count = src->heal_failed_count;
+ dst->start_time_str = gf_strdup (src->start_time_str);
+ dst->end_time_str = "Crawl is already in progress";
+ dst->crawl_type = src->crawl_type;
+ dst->crawl_inprogress = _gf_true;
+ return;
+}
+
+static int
+afr_fill_crawl_statistics_of_running_crawl(xlator_t *this, dict_t *dict)
+{
+ shd_crawl_event_t *evnt = NULL;
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int i = 0;
+ priv = this->private;
+ shd = &priv->shd;
+
+ evnt = GF_CALLOC (1, sizeof (shd_crawl_event_t),
+ gf_afr_mt_shd_crawl_event_t);
+ if (!evnt) {
+ ret = -1;
+ goto out;
+ }
+ LOCK (&priv->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (shd->pos[i] != AFR_POS_LOCAL)
+ continue;
+
+ reset_crawl_event (evnt);
+
+ if (!shd->crawl_events[i]) {
+ continue;
+ }
+
+ afr_copy_crawl_event_struct (shd->crawl_events[i],
+ evnt);
+ _add_crawl_stats_to_dict (this, dict, i, evnt, NULL);
+
+ }
+ }
+ UNLOCK (&priv->lock);
+ reset_crawl_event (evnt);
+ GF_FREE (evnt);
+
+out:
+ return ret;
+}
+
+static int
+_add_local_subvols_crawl_statistics_to_dict (xlator_t *this, dict_t *dict)
+{
+ int ret = 0;
+ afr_fill_completed_crawl_statistics_to_dict (this, dict);
+ ret = afr_fill_crawl_statistics_of_running_crawl (this, dict);
+ return ret;
+}
int
-_add_all_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict)
+_add_local_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict)
{
afr_private_t *priv = NULL;
afr_self_heald_t *shd = NULL;
@@ -618,16 +958,25 @@ afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
ret = 0;
break;
case GF_AFR_OP_HEALED_FILES:
- ret = _add_all_subvols_eh_to_dict (this, shd->healed, output);
+ ret = _add_local_subvols_eh_to_dict (this, shd->healed, output);
break;
case GF_AFR_OP_HEAL_FAILED_FILES:
- ret = _add_all_subvols_eh_to_dict (this, shd->heal_failed,
+ ret = _add_local_subvols_eh_to_dict (this, shd->heal_failed,
output);
break;
case GF_AFR_OP_SPLIT_BRAIN_FILES:
- ret = _add_all_subvols_eh_to_dict (this, shd->split_brain,
+ ret = _add_local_subvols_eh_to_dict (this, shd->split_brain,
output);
break;
+ case GF_AFR_OP_STATISTICS:
+ ret = _add_local_subvols_crawl_statistics_to_dict (this, output);
+ break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT:
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED,
+ STATISTICS_TO_BE_HEALED,
+ output);
+ break;
default:
gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op);
break;
@@ -642,7 +991,7 @@ afr_poll_self_heal (void *data)
{
afr_private_t *priv = NULL;
afr_self_heald_t *shd = NULL;
- struct timeval timeout = {0};
+ struct timespec timeout = {0};
xlator_t *this = NULL;
long child = (long)data;
gf_timer_t *old_timer = NULL;
@@ -666,7 +1015,7 @@ afr_poll_self_heal (void *data)
if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL))
_do_self_heal_on_subvol (this, child, INDEX);
timeout.tv_sec = shd->timeout;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
//notify and previous timer should be synchronized.
LOCK (&priv->lock);
{
@@ -800,6 +1149,7 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data,
afr_private_t *priv = NULL;
dict_t *xattr = NULL;
void *index_gfid = NULL;
+ void *base_indices_holder_vgfid = NULL;
loc_t rootloc = {0};
struct iatt iattr = {0};
struct iatt parent = {0};
@@ -809,7 +1159,7 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data,
priv = this->private;
if (crawl_data->crawl == FULL) {
afr_build_root_loc (this, dirloc);
- } else {
+ } else if (crawl_data->crawl == INDEX) {
afr_build_root_loc (this, &rootloc);
ret = syncop_getxattr (readdir_xl, &rootloc, &xattr,
GF_XATTROP_INDEX_GFID);
@@ -843,6 +1193,47 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data,
ret = _link_inode_update_loc (this, dirloc, &iattr);
if (ret)
goto out;
+ } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) {
+ afr_build_root_loc (this, &rootloc);
+ ret = syncop_getxattr (readdir_xl, &rootloc, &xattr,
+ GF_BASE_INDICES_HOLDER_GFID);
+ if (ret < 0)
+ goto out;
+ ret = dict_get_ptr (xattr, GF_BASE_INDICES_HOLDER_GFID,
+ &base_indices_holder_vgfid);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "index gfid empty "
+ "on %s", readdir_xl->name);
+ ret = -1;
+ goto out;
+ }
+ if (!base_indices_holder_vgfid) {
+ gf_log (this->name, GF_LOG_ERROR, "Base indices holder"
+ "virtual gfid is null on %s", readdir_xl->name);
+ ret = -1;
+ goto out;
+ }
+ uuid_copy (dirloc->gfid, base_indices_holder_vgfid);
+ dirloc->path = "";
+ dirloc->inode = inode_new (priv->root_inode->table);
+ ret = syncop_lookup (readdir_xl, dirloc, NULL, &iattr, NULL,
+ &parent);
+ if (ret < 0) {
+ if (errno != ENOENT) {
+ gf_log (this->name, GF_LOG_ERROR, "lookup "
+ "failed for base_indices_holder dir"
+ " on %s - (%s)", readdir_xl->name,
+ strerror (errno));
+
+ } else {
+ gf_log (this->name, GF_LOG_ERROR, "base_indices"
+ "_holder is not yet created.");
+ }
+ goto out;
+ }
+ ret = _link_inode_update_loc (this, dirloc, &iattr);
+ if (ret)
+ goto out;
}
ret = 0;
out:
@@ -907,6 +1298,16 @@ afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent,
priv = this->private;
if (crawl_data->crawl == FULL) {
ret = afr_build_child_loc (this, child, parent, entry->d_name);
+ } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) {
+ ret = _build_index_loc (this, child, entry->d_name, parent);
+ if (ret)
+ goto out;
+ child->inode = inode_new (priv->root_inode->table);
+ if (!child->inode) {
+ ret = -1;
+ goto out;
+ }
+ child->path = NULL;
} else {
child->inode = inode_new (priv->root_inode->table);
if (!child->inode)
@@ -956,10 +1357,14 @@ _process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries,
ret = crawl_data->process_entry (this, crawl_data, entry,
&entry_loc, parentloc, &iattr);
- if (ret)
+ if (crawl_data->crawl == INDEX_TO_BE_HEALED && ret) {
+ goto out;
+ } else if (ret) {
continue;
+ }
- if (crawl_data->crawl == INDEX)
+ if ((crawl_data->crawl == INDEX) ||
+ (crawl_data->crawl == INDEX_TO_BE_HEALED))
continue;
if (!IA_ISDIR (iattr.ia_type))
@@ -974,6 +1379,10 @@ _process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries,
}
ret = 0;
out:
+ if ((crawl_data->crawl == INDEX_TO_BE_HEALED) && ret) {
+ gf_log (this->name, GF_LOG_ERROR,"Failed to get the hardlink "
+ "count");
+ }
loc_wipe (&entry_loc);
return ret;
}
@@ -1021,6 +1430,9 @@ _crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data)
ret = _process_entries (this, loc, &entries, &offset,
crawl_data);
+ if ((ret < 0) && (crawl_data->crawl == INDEX_TO_BE_HEALED)) {
+ goto out;
+ }
gf_dirent_free (&entries);
free_entries = _gf_false;
}
@@ -1126,8 +1538,13 @@ afr_dir_crawl (void *data)
goto out;
ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc);
- if (ret)
+ if (ret) {
+ if (crawl_data->crawl == INDEX_TO_BE_HEALED) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to open base_"
+ "indices_holder");
+ }
goto out;
+ }
ret = _crawl_directory (fd, &dirloc, crawl_data);
if (ret)
@@ -1141,12 +1558,102 @@ afr_dir_crawl (void *data)
out:
if (fd)
fd_unref (fd);
- if (crawl_data->crawl == INDEX)
+ if ((crawl_data->crawl == INDEX) ||
+ (crawl_data->crawl == INDEX_TO_BE_HEALED ))
dirloc.path = NULL;
loc_wipe (&dirloc);
return ret;
}
+char *
+get_crawl_type_in_string (afr_crawl_type_t crawl)
+{
+ char *index = "INDEX";
+ char *full = "FULL";
+ char *crawl_type = NULL;
+
+ if (crawl == INDEX){
+ crawl_type = index;
+ } else if (crawl == FULL) {
+ crawl_type = full;
+ }
+
+ return crawl_type;
+}
+
+static int
+afr_allocate_crawl_event (xlator_t *this, int child, afr_crawl_type_t crawl)
+{
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int ret = 0;
+ shd_crawl_event_t *crawl_event = NULL;
+ time_t get_time = 0;
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ crawl_event = GF_CALLOC (sizeof (shd_crawl_event_t), 1,
+ gf_afr_mt_shd_crawl_event_t);
+ if (!crawl_event) {
+ ret = -1;
+ goto out;
+ }
+
+ get_time = time(NULL);
+ if (get_time == ((time_t)-1)) {
+ ret = -1;
+ goto out;
+ }
+
+ crawl_event->start_time_str = gf_strdup (ctime(&get_time));
+
+ crawl_event->crawl_type = get_crawl_type_in_string (crawl);
+ if (!crawl_event->crawl_type) {
+ ret = -1;
+ goto out;
+ }
+ LOCK (&priv->lock);
+ {
+ shd->crawl_events[child] = crawl_event;
+ }
+ UNLOCK (&priv->lock);
+ ret = 0;
+out:
+ return ret;
+
+}
+
+static int
+afr_put_crawl_event_in_eh (xlator_t *this, int child)
+{
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int ret = 0;
+ time_t get_time = 0;
+ shd_crawl_event_t **crawl_event = NULL;
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ get_time = time(NULL);
+ if (get_time == ((time_t)-1)) {
+ ret = -1;
+ goto out;
+ }
+ crawl_event = (shd_crawl_event_t**)shd->crawl_events;
+ LOCK (&priv->lock);
+ {
+ crawl_event[child]->end_time_str = gf_strdup (ctime(&get_time));
+ ret = eh_save_history (shd->statistics[child],
+ crawl_event[child]);
+ crawl_event[child] = NULL;
+ }
+ UNLOCK (&priv->lock);
+out:
+ return ret;
+}
+
static int
afr_dir_exclusive_crawl (void *data)
{
@@ -1182,7 +1689,15 @@ afr_dir_exclusive_crawl (void *data)
}
do {
+ ret = afr_allocate_crawl_event (this, child, crawl_data->crawl);
+ if (ret)
+ goto out;
afr_dir_crawl (data);
+
+ ret = afr_put_crawl_event_in_eh (this, child);
+ if (ret < 0)
+ goto out;
+
LOCK (&priv->lock);
{
if (shd->pending[child] != NONE) {
@@ -1239,8 +1754,8 @@ afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl,
ret = synctask_new (this->ctx->env, crawler,
crawl_done, frame, crawl_data);
if (ret)
- gf_log (this->name, GF_LOG_ERROR, "Could not create the "
- "task for %d ret %d", idx, ret);
+ gf_log (this->name, GF_LOG_ERROR, "afr crawl failed for child"
+ " %d with ret %d", idx, ret);
out:
return;
}
@@ -1270,4 +1785,3 @@ afr_set_root_gfid (dict_t *dict)
return ret;
}
-
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
index 32a8aaca5..e0c083754 100644
--- a/xlators/cluster/afr/src/afr-self-heald.h
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -29,6 +29,19 @@ typedef struct afr_crawl_data_ {
struct iatt *iattr);
} afr_crawl_data_t;
+typedef struct crawl_event_stats_ {
+ uint64_t healed_count;
+ uint64_t split_brain_count;
+ uint64_t heal_failed_count;
+ char *start_time_str;
+ char *end_time_str;
+ char *crawl_type;
+ gf_boolean_t crawl_inprogress;
+} shd_crawl_event_t;
+
+void _destroy_crawl_event_data (void *data);
+void _destroy_shd_event_data (void *data);
+
typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data,
gf_dirent_t *entry, loc_t *child, loc_t *parent,
struct iatt *iattr);
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 217ff8548..20306e469 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -79,7 +79,7 @@ afr_save_lk_owner (call_frame_t *frame)
local = frame->local;
- local->saved_lk_owner = frame->root->lk_owner;
+ local->saved_lk_owner = frame->root->lk_owner;
}
@@ -90,10 +90,9 @@ afr_restore_lk_owner (call_frame_t *frame)
local = frame->local;
- frame->root->lk_owner = local->saved_lk_owner;
+ frame->root->lk_owner = local->saved_lk_owner;
}
-
static void
__mark_all_pending (int32_t *pending[], int child_count,
afr_transaction_type type)
@@ -369,7 +368,7 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (call_count == 0) {
if (local->transaction.resume_stub) {
- AFR_CALL_RESUME (local->transaction.resume_stub);
+ call_resume (local->transaction.resume_stub);
local->transaction.resume_stub = NULL;
}
@@ -443,14 +442,30 @@ out:
return;
}
+afr_inodelk_t*
+afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom)
+{
+ afr_inodelk_t *inodelk = NULL;
+ int i = 0;
+
+ for (i = 0; int_lock->inodelk[i].domain; i++) {
+ inodelk = &int_lock->inodelk[i];
+ if (strcmp (dom, inodelk->domain) == 0)
+ return inodelk;
+ }
+ return NULL;
+}
+
unsigned char*
afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
{
unsigned char *locked_nodes = NULL;
+ afr_inodelk_t *inodelk = NULL;
switch (type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- locked_nodes = int_lock->inode_locked_nodes;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ locked_nodes = inodelk->locked_nodes;
break;
case AFR_ENTRY_TRANSACTION:
@@ -553,24 +568,77 @@ afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr,
gf_boolean_t
afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int index = -1;
- int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int index = -1;
+ int i = 0;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
index = afr_index_for_transaction_type (local->transaction.type);
- for (i = 0; i < priv->child_count; i++) {
+ for (i = 0; i < priv->child_count; i++) {
if (local->pending[i][index] == 0)
- return _gf_false;
+ return _gf_false;
}
- return _gf_true;
+ return _gf_true;
}
+static void
+afr_dir_fop_handle_all_fop_failures (call_frame_t *frame)
+{
+ xlator_t *this = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ this = frame->this;
+ local = frame->local;
+ priv = this->private;
+
+ if ((local->transaction.type != AFR_ENTRY_TRANSACTION) &&
+ (local->transaction.type != AFR_ENTRY_RENAME_TRANSACTION))
+ return;
+
+ if (local->op_ret >= 0)
+ goto out;
+
+ __mark_all_success (local->pending, priv->child_count,
+ local->transaction.type);
+out:
+ return;
+}
+
+static void
+afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ gf_boolean_t all_quota_failures = _gf_false;
+
+ local = frame->local;
+ priv = this->private;
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return;
+ /*
+ * Idea is to not leave the file in FOOL-FOOL scenario in case on
+ * all the bricks data transaction failed with EDQUOT to avoid
+ * increasing un-necessary load of self-heals in the system.
+ */
+ all_quota_failures = _gf_true;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] &&
+ (local->child_errno[i] != EDQUOT)) {
+ all_quota_failures = _gf_false;
+ break;
+ }
+ }
+ if (all_quota_failures)
+ __mark_all_success (local->pending, priv->child_count,
+ local->transaction.type);
+}
int
afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
@@ -593,6 +661,9 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
local->transaction.pre_op,
local->transaction.type);
+ afr_data_handle_quota_errors (frame, this);
+ afr_dir_fop_handle_all_fop_failures (frame);
+
if (local->fd)
afr_transaction_rm_stale_children (frame, this,
local->fd->inode,
@@ -619,7 +690,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
goto out;
}
- nothing_failed = afr_txn_nothing_failed (frame, this);
+ nothing_failed = afr_txn_nothing_failed (frame, this);
afr_compute_txn_changelog (local , priv);
@@ -645,14 +716,14 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
break;
}
- /* local->transaction.postop_piggybacked[] was
- precomputed in is_piggyback_postop() when called from
- afr_changelog_post_op_safe()
- */
+ /* local->transaction.postop_piggybacked[] was
+ precomputed in is_piggyback_postop() when called from
+ afr_changelog_post_op_safe()
+ */
- piggyback = 0;
- if (local->transaction.postop_piggybacked[i])
- piggyback = 1;
+ piggyback = 0;
+ if (local->transaction.postop_piggybacked[i])
+ piggyback = 1;
afr_set_postop_dict (local, this, xattr[i],
piggyback, i);
@@ -906,7 +977,7 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
}
UNLOCK (&local->fd->lock);
- afr_set_delayed_post_op (frame, this);
+ afr_set_delayed_post_op (frame, this);
if (piggyback)
afr_changelog_pre_op_cbk (frame, (void *)(long)i,
@@ -1179,12 +1250,14 @@ int
afr_set_transaction_flock (afr_local_t *local)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
int_lock = &local->internal_lock;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- int_lock->lk_flock.l_len = local->transaction.len;
- int_lock->lk_flock.l_start = local->transaction.start;
- int_lock->lk_flock.l_type = F_WRLCK;
+ inodelk->flock.l_len = local->transaction.len;
+ inodelk->flock.l_start = local->transaction.start;
+ inodelk->flock.l_type = F_WRLCK;
return 0;
}
@@ -1199,6 +1272,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
int_lock->transaction_lk_type = AFR_TRANSACTION_LK;
+ int_lock->domain = this->name;
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
@@ -1259,69 +1333,115 @@ afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
void
afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- /* call this function from any of the related optimizations
- which benefit from delaying post op are enabled, namely:
+ /* call this function from any of the related optimizations
+ which benefit from delaying post op are enabled, namely:
- - changelog piggybacking
- - eager locking
- */
+ - changelog piggybacking
+ - eager locking
+ */
- priv = this->private;
- if (!priv)
- return;
+ priv = this->private;
+ if (!priv)
+ return;
- if (!priv->post_op_delay_secs)
- return;
+ if (!priv->post_op_delay_secs)
+ return;
local = frame->local;
if (!local->transaction.eager_lock_on)
return;
- if (!local)
- return;
+ if (!local)
+ return;
- if (!local->fd)
- return;
+ if (!local->fd)
+ return;
- if (local->op == GF_FOP_WRITE)
- local->delayed_post_op = _gf_true;
+ if (local->op == GF_FOP_WRITE)
+ local->delayed_post_op = _gf_true;
}
+gf_boolean_t
+afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this)
+{
+ afr_inode_ctx_t *ictx = NULL;
+
+ if (!inode) {
+ /* If false is returned, it may keep on taking eager-lock
+ * which may lead to starvation, so return true to avoid that.
+ */
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid inode");
+ return _gf_true;
+ }
+ /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
+ * is taken mount2 opened the same file, it won't be able to
+ * perform any data operations until mount1 releases eager-lock.
+ * To avoid such scenario do not enable eager-lock for this transaction
+ * if open-fd-count is > 1
+ */
+
+ ictx = afr_inode_ctx_get (inode, this);
+ if (!ictx)
+ return _gf_true;
+
+ if (ictx->open_fd_count > 1)
+ return _gf_true;
+
+ return _gf_false;
+}
+
+gf_boolean_t
+afr_any_fops_failed (afr_local_t *local, afr_private_t *priv)
+{
+ if (local->success_count != priv->child_count)
+ return _gf_true;
+ return _gf_false;
+}
gf_boolean_t
is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- gf_boolean_t res = _gf_false;
+ afr_local_t *local = NULL;
+ gf_boolean_t res = _gf_false;
+ afr_private_t *priv = NULL;
- local = frame->local;
- if (!local)
- goto out;
+ priv = this->private;
- if (!local->delayed_post_op)
- goto out;
+ local = frame->local;
+ if (!local)
+ goto out;
- res = _gf_true;
+ if (!local->delayed_post_op)
+ goto out;
+
+ //Mark pending changelog ASAP
+ if (afr_any_fops_failed (local, priv))
+ goto out;
+
+ if (local->fd && afr_are_multiple_fds_opened (local->fd->inode, this))
+ goto out;
+
+ res = _gf_true;
out:
- return res;
+ return res;
}
void
afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
- call_stub_t *stub);
+ call_stub_t *stub);
void
afr_delayed_changelog_wake_up_cbk (void *data)
{
- fd_t *fd = NULL;
+ fd_t *fd = NULL;
- fd = data;
+ fd = data;
- afr_delayed_changelog_wake_up (THIS, fd);
+ afr_delayed_changelog_wake_up (THIS, fd);
}
@@ -1332,39 +1452,39 @@ afr_delayed_changelog_wake_up_cbk (void *data)
static gf_boolean_t
is_piggyback_post_op (call_frame_t *frame, fd_t *fd)
{
- afr_fd_ctx_t *fdctx = NULL;
- afr_local_t *local = NULL;
- gf_boolean_t piggyback = _gf_true;
- afr_private_t *priv = NULL;
- int i = 0;
+ afr_fd_ctx_t *fdctx = NULL;
+ afr_local_t *local = NULL;
+ gf_boolean_t piggyback = _gf_true;
+ afr_private_t *priv = NULL;
+ int i = 0;
- priv = frame->this->private;
- local = frame->local;
- fdctx = afr_fd_ctx_get (fd, frame->this);
+ priv = frame->this->private;
+ local = frame->local;
+ fdctx = afr_fd_ctx_get (fd, frame->this);
- LOCK(&fd->lock);
- {
- piggyback = _gf_true;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.pre_op[i])
- continue;
- if (fdctx->pre_op_piggyback[i]) {
- fdctx->pre_op_piggyback[i]--;
- local->transaction.postop_piggybacked[i] = 1;
- } else {
- /* For at least _one_ subvolume we cannot
- piggyback on the changelog, and have to
- perform a hard POST-OP and therefore fsync
- if necesssary
- */
- piggyback = _gf_false;
+ LOCK(&fd->lock);
+ {
+ piggyback = _gf_true;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
+ if (fdctx->pre_op_piggyback[i]) {
+ fdctx->pre_op_piggyback[i]--;
+ local->transaction.postop_piggybacked[i] = 1;
+ } else {
+ /* For at least _one_ subvolume we cannot
+ piggyback on the changelog, and have to
+ perform a hard POST-OP and therefore fsync
+ if necesssary
+ */
+ piggyback = _gf_false;
GF_ASSERT (fdctx->pre_op_done[i]);
fdctx->pre_op_done[i]--;
- }
- }
- }
- UNLOCK(&fd->lock);
+ }
+ }
+ }
+ UNLOCK(&fd->lock);
if (!afr_txn_nothing_failed (frame, frame->this)) {
/* something failed in this transaction,
@@ -1381,186 +1501,205 @@ is_piggyback_post_op (call_frame_t *frame, fd_t *fd)
int
afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
{
- afr_fd_ctx_t *fdctx = NULL;
+ afr_fd_ctx_t *fdctx = NULL;
- fdctx = afr_fd_ctx_get (fd, this);
+ fdctx = afr_fd_ctx_get (fd, this);
- LOCK(&fd->lock);
- {
- fdctx->witnessed_unstable_write = _gf_true;
- }
- UNLOCK(&fd->lock);
+ LOCK(&fd->lock);
+ {
+ fdctx->witnessed_unstable_write = _gf_true;
+ }
+ UNLOCK(&fd->lock);
- return 0;
+ return 0;
}
/* TEST and CLEAR operation */
gf_boolean_t
afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
{
- afr_fd_ctx_t *fdctx = NULL;
- gf_boolean_t witness = _gf_false;
+ afr_fd_ctx_t *fdctx = NULL;
+ gf_boolean_t witness = _gf_false;
fdctx = afr_fd_ctx_get (fd, this);
+ if (!fdctx)
+ return _gf_true;
- LOCK(&fd->lock);
- {
- if (fdctx->witnessed_unstable_write) {
- witness = _gf_true;
- fdctx->witnessed_unstable_write = _gf_false;
- }
- }
- UNLOCK (&fd->lock);
+ LOCK(&fd->lock);
+ {
+ if (fdctx->witnessed_unstable_write) {
+ witness = _gf_true;
+ fdctx->witnessed_unstable_write = _gf_false;
+ }
+ }
+ UNLOCK (&fd->lock);
- return witness;
+ return witness;
}
int
afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *pre,
- struct iatt *post, dict_t *xdata)
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
{
- afr_private_t *priv = NULL;
+ afr_private_t *priv = NULL;
int child_index = (long) cookie;
- int call_count = -1;
- afr_local_t *local = NULL;
+ int call_count = -1;
+ afr_local_t *local = NULL;
- priv = this->private;
- local = frame->local;
+ priv = this->private;
+ local = frame->local;
- if (afr_fop_failed (op_ret, op_errno)) {
- /* Failure of fsync() is as good as failure of previous
- write(). So treat it like one.
- */
- gf_log (this->name, GF_LOG_WARNING,
- "fsync(%s) failed on subvolume %s. Transaction was %s",
- uuid_utoa (local->fd->inode->gfid),
- priv->children[child_index]->name,
- gf_fop_list[local->op]);
-
- afr_transaction_fop_failed (frame, this, child_index);
- }
+ if (afr_fop_failed (op_ret, op_errno)) {
+ /* Failure of fsync() is as good as failure of previous
+ write(). So treat it like one.
+ */
+ gf_log (this->name, GF_LOG_WARNING,
+ "fsync(%s) failed on subvolume %s. Transaction was %s",
+ uuid_utoa (local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ gf_fop_list[local->op]);
+
+ afr_transaction_fop_failed (frame, this, child_index);
+ }
call_count = afr_frame_return (frame);
if (call_count == 0)
- afr_changelog_post_op_now (frame, this);
+ afr_changelog_post_op_now (frame, this);
- return 0;
+ return 0;
}
int
afr_changelog_fsync (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- int i = 0;
- int call_count = 0;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ GF_UNUSED int ret = -1;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
priv->child_count);
- if (!call_count) {
- /* will go straight to unlock */
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
+ if (!call_count) {
+ /* will go straight to unlock */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
- local->call_count = call_count;
+ local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.pre_op[i])
- continue;
+ xdata = dict_new();
+ if (xdata)
+ ret = dict_set_int32 (xdata, "batch-fsync", 1);
- STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk,
- (void *) (long) i, priv->children[i],
- priv->children[i]->fops->fsync, local->fd,
- 1, NULL);
- if (!--call_count)
- break;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
+
+ STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->fsync, local->fd,
+ 1, xdata);
+ if (!--call_count)
+ break;
+ }
- return 0;
+ if (xdata)
+ dict_unref (xdata);
+
+ return 0;
}
-int
+ int
afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
local = frame->local;
+ priv = this->private;
- if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
-
- if (is_piggyback_post_op (frame, local->fd)) {
- /* just detected that this post-op is about to
- be optimized away as a new write() has
- already piggybacked on this frame's changelog.
- */
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
+ if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
- /* Calling afr_changelog_post_op_now() now will result in
- issuing ->[f]xattrop().
-
- Performing a hard POST-OP (->[f]xattrop() FOP) is a more
- responsible operation that what it might appear on the surface.
-
- The changelog of a file (in the xattr of the file on the server)
- stores information (pending count) about the state of the file
- on the OTHER server. This changelog is blindly trusted, and must
- therefore be updated in such a way it remains trustworthy. This
- implies that decrementing the pending count (essentially "clearing
- the dirty flag") must be done STRICTLY after we are sure that the
- operation on the other server has reached stable storage.
-
- While the backend filesystem on that server will eventually flush
- it to stable storage, we (being in userspace) have no mechanism
- to get notified when the write became "stable".
-
- This means we need take matter into our own hands and issue an
- fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
- and get an acknowledgement for it. And we need to wait for the
- fsync() acknowledgement before initiating the hard POST-OP.
-
- However if the FD itself was opened in O_SYNC or O_DSYNC then
- we are already guaranteed that the writes were made stable as
- part of the FOP itself. The same holds true for NFS stable
- writes which happen on an anonymous FD with O_DSYNC or O_SYNC
- flag set in the writev() @flags param. For all other write types,
- mark a flag in the fdctx whenever an unstable write is witnessed.
- */
-
- if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
+ if (is_piggyback_post_op (frame, local->fd)) {
+ /* just detected that this post-op is about to
+ be optimized away as a new write() has
+ already piggybacked on this frame's changelog.
+ */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
- /* Time to fsync() */
+ /* Calling afr_changelog_post_op_now() now will result in
+ issuing ->[f]xattrop().
+
+ Performing a hard POST-OP (->[f]xattrop() FOP) is a more
+ responsible operation that what it might appear on the surface.
+
+ The changelog of a file (in the xattr of the file on the server)
+ stores information (pending count) about the state of the file
+ on the OTHER server. This changelog is blindly trusted, and must
+ therefore be updated in such a way it remains trustworthy. This
+ implies that decrementing the pending count (essentially "clearing
+ the dirty flag") must be done STRICTLY after we are sure that the
+ operation on the other server has reached stable storage.
+
+ While the backend filesystem on that server will eventually flush
+ it to stable storage, we (being in userspace) have no mechanism
+ to get notified when the write became "stable".
+
+ This means we need take matter into our own hands and issue an
+ fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
+ and get an acknowledgement for it. And we need to wait for the
+ fsync() acknowledgement before initiating the hard POST-OP.
+
+ However if the FD itself was opened in O_SYNC or O_DSYNC then
+ we are already guaranteed that the writes were made stable as
+ part of the FOP itself. The same holds true for NFS stable
+ writes which happen on an anonymous FD with O_DSYNC or O_SYNC
+ flag set in the writev() @flags param. For all other write types,
+ mark a flag in the fdctx whenever an unstable write is witnessed.
+ */
+
+ if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
- afr_changelog_fsync (frame, this);
+ /* Check whether users want durability and perform fsync/post-op
+ * accordingly.
+ */
+ if (priv->ensure_durability) {
+ /* Time to fsync() */
+ afr_changelog_fsync (frame, this);
+ } else {
+ afr_changelog_post_op_now (frame, this);
+ }
- return 0;
+ return 0;
}
void
afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
- call_stub_t *stub)
+ call_stub_t *stub)
{
afr_fd_ctx_t *fd_ctx = NULL;
call_frame_t *prev_frame = NULL;
- struct timeval delta = {0, };
+ struct timespec delta = {0, };
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
@@ -1568,10 +1707,10 @@ afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
fd_ctx = afr_fd_ctx_get (fd, this);
if (!fd_ctx)
- return;
+ goto out;
delta.tv_sec = priv->post_op_delay_secs;
- delta.tv_usec = 0;
+ delta.tv_nsec = 0;
pthread_mutex_lock (&fd_ctx->delay_lock);
{
@@ -1590,6 +1729,7 @@ afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
unlock:
pthread_mutex_unlock (&fd_ctx->delay_lock);
+out:
if (prev_frame) {
local = prev_frame->local;
local->transaction.resume_stub = stub;
@@ -1603,14 +1743,14 @@ unlock:
void
afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (is_afr_delayed_changelog_post_op_needed (frame, this))
- afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
- else
- afr_changelog_post_op_safe (frame, this);
+ if (is_afr_delayed_changelog_post_op_needed (frame, this))
+ afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
+ else
+ afr_changelog_post_op_safe (frame, this);
}
@@ -1621,22 +1761,22 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
The @stub gets saved in @local and gets resumed in
afr_local_cleanup()
-*/
+ */
void
afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
{
- afr_delayed_changelog_post_op (this, NULL, fd, stub);
+ afr_delayed_changelog_post_op (this, NULL, fd, stub);
}
void
afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd)
{
- afr_delayed_changelog_post_op (this, NULL, fd, NULL);
+ afr_delayed_changelog_post_op (this, NULL, fd, NULL);
}
-int
+ int
afr_transaction_resume (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
@@ -1647,18 +1787,18 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
priv = this->private;
- if (local->transaction.eager_lock_on) {
- /* We don't need to retain "local" in the
- fd list anymore, writes to all subvols
- are finished by now */
- LOCK (&local->fd->lock);
- {
- list_del_init (&local->transaction.eager_locked);
- }
- UNLOCK (&local->fd->lock);
- }
+ if (local->transaction.eager_lock_on) {
+ /* We don't need to retain "local" in the
+ fd list anymore, writes to all subvols
+ are finished by now */
+ LOCK (&local->fd->lock);
+ {
+ list_del_init (&local->transaction.eager_locked);
+ }
+ UNLOCK (&local->fd->lock);
+ }
- afr_restore_lk_owner (frame);
+ afr_restore_lk_owner (frame);
if (__fop_changelog_needed (frame, this)) {
afr_changelog_post_op (frame, this);
@@ -1680,7 +1820,8 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)
*/
void
-afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index)
+afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
+ int child_index)
{
afr_local_t * local = NULL;
afr_private_t * priv = NULL;
@@ -1689,55 +1830,56 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index
priv = this->private;
__mark_child_dead (local->pending, priv->child_count,
- child_index, local->transaction.type);
+ child_index, local->transaction.type);
}
-static gf_boolean_t
+ static gf_boolean_t
afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
{
- uint64_t start1 = local1->transaction.start;
- uint64_t start2 = local2->transaction.start;
- uint64_t end1 = 0;
- uint64_t end2 = 0;
-
- if (local1->transaction.len)
- end1 = start1 + local1->transaction.len - 1;
- else
- end1 = ULLONG_MAX;
-
- if (local2->transaction.len)
- end2 = start2 + local2->transaction.len - 1;
- else
- end2 = ULLONG_MAX;
-
- return ((end1 >= start2) && (end2 >= start1));
-}
+ uint64_t start1 = local1->transaction.start;
+ uint64_t start2 = local2->transaction.start;
+ uint64_t end1 = 0;
+ uint64_t end2 = 0;
+ if (local1->transaction.len)
+ end1 = start1 + local1->transaction.len - 1;
+ else
+ end1 = ULLONG_MAX;
+
+ if (local2->transaction.len)
+ end2 = start2 + local2->transaction.len - 1;
+ else
+ end2 = ULLONG_MAX;
+
+ return ((end1 >= start2) && (end2 >= start1));
+}
void
afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- afr_local_t *each = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fdctx = NULL;
+ afr_local_t *each = NULL;
- priv = this->private;
+ priv = this->private;
- if (!local->fd)
- return;
+ if (!local->fd)
+ return;
- if (local->transaction.type != AFR_DATA_TRANSACTION)
- return;
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return;
- if (!priv->eager_lock)
- return;
+ if (!priv->eager_lock)
+ return;
- fdctx = afr_fd_ctx_get (local->fd, this);
- if (!fdctx)
- return;
+ fdctx = afr_fd_ctx_get (local->fd, this);
+ if (!fdctx)
+ return;
+ if (afr_are_multiple_fds_opened (local->fd->inode, this))
+ return;
/*
* Once full file lock is acquired in eager-lock phase, overlapping
* writes do not compete for inode-locks, instead are transferred to the
@@ -1755,22 +1897,22 @@ afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
* This check makes sure the locks are not transferred for
* overlapping writes.
*/
- LOCK (&local->fd->lock);
- {
- list_for_each_entry (each, &fdctx->eager_locked,
- transaction.eager_locked) {
- if (afr_locals_overlap (each, local)) {
- local->transaction.eager_lock_on = _gf_false;
- goto unlock;
- }
- }
-
- local->transaction.eager_lock_on = _gf_true;
- list_add_tail (&local->transaction.eager_locked,
- &fdctx->eager_locked);
- }
+ LOCK (&local->fd->lock);
+ {
+ list_for_each_entry (each, &fdctx->eager_locked,
+ transaction.eager_locked) {
+ if (afr_locals_overlap (each, local)) {
+ local->transaction.eager_lock_on = _gf_false;
+ goto unlock;
+ }
+ }
+
+ local->transaction.eager_lock_on = _gf_true;
+ list_add_tail (&local->transaction.eager_locked,
+ &fdctx->eager_locked);
+ }
unlock:
- UNLOCK (&local->fd->lock);
+ UNLOCK (&local->fd->lock);
}
@@ -1789,11 +1931,10 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
local->transaction.type = type;
ret = afr_transaction_local_init (local, this);
- if (ret < 0) {
+ if (ret < 0)
goto out;
- }
- afr_transaction_eager_lock_init (local, this);
+ afr_transaction_eager_lock_init (local, this);
if (local->fd && local->transaction.eager_lock_on)
afr_set_lk_owner (frame, this, local->fd);
@@ -1802,6 +1943,9 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
if (!local->transaction.eager_lock_on && local->loc.inode) {
fd = fd_lookup (local->loc.inode, frame->root->pid);
+ if (fd == NULL)
+ fd = fd_lookup_anonymous (local->loc.inode);
+
if (fd) {
afr_delayed_changelog_wake_up (this, fd);
fd_unref (fd);
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index 55e8bbcca..fa626fd0d 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -23,6 +23,9 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
int
afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);
+afr_inodelk_t*
+afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom);
+
int32_t
afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
@@ -40,4 +43,9 @@ afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd);
void
__mark_all_success (int32_t *pending[], int child_count,
afr_transaction_type type);
+gf_boolean_t
+afr_any_fops_failed (afr_local_t *local, afr_private_t *priv);
+
+gf_boolean_t
+afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this);
#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index bee10fd01..c724eb2ae 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -191,6 +191,8 @@ reconfigure (xlator_t *this, dict_t *options)
/* Reset this so we re-discover in case the topology changed. */
GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options,
bool, out);
+ GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options,
+ bool, out);
priv->did_discovery = _gf_false;
ret = 0;
@@ -335,6 +337,8 @@ init (xlator_t *this)
GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out);
GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out);
+ GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool,
+ out);
priv->wait_count = 1;
@@ -376,8 +380,6 @@ init (xlator_t *this)
AFR_XATTR_PREFIX,
trav->xlator->name);
if (-1 == ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "asprintf failed to set pending key");
ret = -ENOMEM;
goto out;
}
@@ -386,6 +388,13 @@ init (xlator_t *this)
i++;
}
+ ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT,
+ this->name);
+ if (-1 == ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event),
gf_afr_mt_int32_t);
if (!priv->last_event) {
@@ -430,15 +439,18 @@ init (xlator_t *this)
if (!priv->shd.timer)
goto out;
- priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false);
+ priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false,
+ _destroy_shd_event_data);
if (!priv->shd.healed)
goto out;
- priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false);
+ priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false,
+ _destroy_shd_event_data);
if (!priv->shd.heal_failed)
goto out;
- priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false);
+ priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false,
+ _destroy_shd_event_data);
if (!priv->shd.split_brain)
goto out;
@@ -448,7 +460,9 @@ init (xlator_t *this)
priv->root_inode = inode_ref (this->itable->root);
GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out);
GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out);
-
+ ret = afr_initialise_statistics (this);
+ if (ret)
+ goto out;
ret = 0;
out:
return ret;
@@ -483,6 +497,9 @@ struct xlator_fops fops = {
.finodelk = afr_finodelk,
.entrylk = afr_entrylk,
.fentrylk = afr_fentrylk,
+ .fallocate = afr_fallocate,
+ .discard = afr_discard,
+ .zerofill = afr_zerofill,
/* inode read */
.access = afr_access,
@@ -765,5 +782,12 @@ struct volume_options options[] = {
.description = "readdir(p) will not failover if this option is off",
.default_value = "on",
},
+ { .key = {"ensure-durability"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .description = "Afr performs fsyncs for transactions if this "
+ "option is on to make sure the changelogs/data is "
+ "written to the disk",
+ .default_value = "on",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 387ed12ec..21064db58 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -28,8 +28,10 @@
#define AFR_XATTR_PREFIX "trusted.afr"
#define AFR_PATHINFO_HEADER "REPLICATE:"
#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
+#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
#define AFR_LOCKEE_COUNT_MAX 3
+#define AFR_DOM_COUNT_MAX 3
struct _pump_private;
@@ -86,26 +88,30 @@ typedef struct afr_inode_ctx_ {
int32_t *fresh_children;//increasing order of latency
afr_spb_state_t mdata_spb;
afr_spb_state_t data_spb;
+ uint32_t open_fd_count;
} afr_inode_ctx_t;
typedef enum {
NONE,
INDEX,
+ INDEX_TO_BE_HEALED,
FULL,
} afr_crawl_type_t;
typedef struct afr_self_heald_ {
- gf_boolean_t enabled;
- gf_boolean_t iamshd;
- afr_crawl_type_t *pending;
- gf_boolean_t *inprogress;
- afr_child_pos_t *pos;
- gf_timer_t **timer;
- eh_t *healed;
- eh_t *heal_failed;
- eh_t *split_brain;
- char *node_uuid;
- int timeout;
+ gf_boolean_t enabled;
+ gf_boolean_t iamshd;
+ afr_crawl_type_t *pending;
+ gf_boolean_t *inprogress;
+ afr_child_pos_t *pos;
+ gf_timer_t **timer;
+ eh_t *healed;
+ eh_t *heal_failed;
+ eh_t *split_brain;
+ eh_t **statistics;
+ void **crawl_events;
+ char *node_uuid;
+ int timeout;
} afr_self_heald_t;
typedef struct _afr_private {
@@ -170,9 +176,38 @@ typedef struct _afr_private {
gf_boolean_t did_discovery;
gf_boolean_t readdir_failover;
uint64_t sh_readdir_size;
+ gf_boolean_t ensure_durability;
+ char *sh_domain;
} afr_private_t;
+typedef enum {
+ AFR_SELF_HEAL_NOT_ATTEMPTED,
+ AFR_SELF_HEAL_STARTED,
+ AFR_SELF_HEAL_FAILED,
+ AFR_SELF_HEAL_SYNC_BEGIN,
+} afr_self_heal_status;
+
typedef struct {
+ afr_self_heal_status gfid_or_missing_entry_self_heal;
+ afr_self_heal_status metadata_self_heal;
+ afr_self_heal_status data_self_heal;
+ afr_self_heal_status entry_self_heal;
+} afr_sh_status_for_all_type;
+
+typedef enum {
+ AFR_SELF_HEAL_ENTRY,
+ AFR_SELF_HEAL_METADATA,
+ AFR_SELF_HEAL_DATA,
+ AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY,
+ AFR_SELF_HEAL_INVALID = -1,
+} afr_self_heal_type;
+
+typedef enum {
+ AFR_CHECK_ALL,
+ AFR_CHECK_SPECIFIC,
+} afr_sh_fail_check_type;
+
+struct afr_self_heal_ {
/* External interface: These are variables (some optional) that
are set by whoever has triggered self-heal */
@@ -249,10 +284,10 @@ typedef struct {
const char *linkname;
gf_boolean_t entries_skipped;
- int op_failed;
gf_boolean_t actual_sh_started;
gf_boolean_t sync_done;
gf_boolean_t data_lock_held;
+ gf_boolean_t sh_dom_lock_held;
gf_boolean_t eof_reached;
fd_t *healing_fd;
int file_has_holes;
@@ -263,13 +298,17 @@ typedef struct {
uint8_t *checksum;
afr_post_remove_call_t post_remove_call;
- loc_t parent_loc;
+ char *data_sh_info;
+ char *metadata_sh_info;
+ loc_t parent_loc;
call_frame_t *orig_frame;
call_frame_t *old_loop_frame;
gf_boolean_t unwound;
afr_sh_algo_private_t *private;
+ afr_sh_status_for_all_type afr_all_sh_status;
+ afr_self_heal_type sh_type_in_action;
struct afr_sh_algorithm *algo;
afr_lock_cbk_t data_lock_success_handler;
@@ -282,7 +321,9 @@ typedef struct {
void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this);
call_frame_t *sh_frame;
-} afr_self_heal_t;
+};
+
+typedef struct afr_self_heal_ afr_self_heal_t;
typedef enum {
AFR_DATA_TRANSACTION, /* truncate, write, ... */
@@ -356,12 +397,19 @@ int
afr_entry_lockee_cmp (const void *l1, const void *l2);
typedef struct {
+ char *domain; /* Domain on which inodelk is taken */
+ struct gf_flock flock;
+ unsigned char *locked_nodes;
+ int32_t lock_count;
+} afr_inodelk_t;
+
+typedef struct {
loc_t *lk_loc;
- struct gf_flock lk_flock;
int lockee_count;
afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
+ afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX];
const char *lk_basename;
const char *lower_basename;
const char *higher_basename;
@@ -370,13 +418,11 @@ typedef struct {
unsigned char *locked_nodes;
unsigned char *lower_locked_nodes;
- unsigned char *inode_locked_nodes;
selfheal_lk_type_t selfheal_lk_type;
transaction_lk_type_t transaction_lk_type;
int32_t lock_count;
- int32_t inodelk_lock_count;
int32_t entrylk_lock_count;
uint64_t lock_number;
@@ -387,6 +433,7 @@ typedef struct {
int32_t lock_op_ret;
int32_t lock_op_errno;
afr_lock_cbk_t lock_cbk;
+ char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
} afr_internal_lock_t;
typedef struct _afr_locked_fd {
@@ -406,9 +453,11 @@ typedef struct _afr_local {
unsigned int call_count;
unsigned int success_count;
unsigned int enoent_count;
+ uint32_t open_fd_count;
+ gf_boolean_t update_open_fd_count;
- unsigned int govinda_gOvinda;
+ unsigned int unhealable;
unsigned int read_child_index;
unsigned char read_child_returned;
@@ -448,15 +497,23 @@ typedef struct _afr_local {
int optimistic_change_log;
gf_boolean_t delayed_post_op;
+
/* Is the current writev() going to perform a stable write?
i.e, is fd->flags or @flags writev param have O_SYNC or
O_DSYNC?
*/
- gf_boolean_t stable_write;
+ gf_boolean_t stable_write;
- /*
- This struct contains the arguments for the "continuation"
- (scheme-like) of fops
+ /* This write appended to the file. Nnot necessarily O_APPEND,
+ just means the offset of write was at the end of file.
+ */
+ gf_boolean_t append_write;
+
+ int allow_sh_for_running_transaction;
+
+
+ /* This struct contains the arguments for the "continuation"
+ (scheme-like) of fops
*/
int op;
@@ -552,7 +609,9 @@ typedef struct _afr_local {
struct {
struct iatt prebuf;
struct iatt postbuf;
+ } inode_wfop; //common structure for all inode-write-fops
+ struct {
int32_t op_ret;
struct iovec *vector;
@@ -563,34 +622,21 @@ typedef struct _afr_local {
} writev;
struct {
- struct iatt prebuf;
- struct iatt postbuf;
- } fsync;
-
- struct {
off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
} truncate;
struct {
off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
} ftruncate;
struct {
struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
} setattr;
struct {
struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
} fsetattr;
struct {
@@ -652,6 +698,26 @@ typedef struct _afr_local {
dict_t *params;
char *linkpath;
} symlink;
+
+ struct {
+ int32_t mode;
+ off_t offset;
+ size_t len;
+ } fallocate;
+
+ struct {
+ off_t offset;
+ size_t len;
+ } discard;
+
+ struct {
+ off_t offset;
+ size_t len;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } zerofill;
+
+
} cont;
struct {
@@ -824,8 +890,8 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this);
int
afr_internal_lock_finish (call_frame_t *frame, xlator_t *this);
-void
-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src,
+int
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
unsigned int child_count);
int pump_start (call_frame_t *frame, xlator_t *this);
@@ -927,22 +993,6 @@ afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd);
} \
} while (0);
-#define AFR_CALL_RESUME(stub) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- \
- __local = stub->frame->local; \
- __this = stub->frame->this; \
- stub->frame->local = NULL; \
- \
- call_resume (stub); \
- if (__local) { \
- afr_local_cleanup (__local, __this); \
- mem_put (__local); \
- } \
- } while (0)
-
#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
/* allocate and return a string that is the basename of argument */
static inline char *
@@ -1120,6 +1170,27 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count);
} while (0);
+#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO."
+
+#define AFR_SBRAIN_CHECK_FD(fd, label) do { \
+ if (fd->inode && afr_is_split_brain (this, fd->inode)) { \
+ op_errno = EIO; \
+ gf_log (this->name, GF_LOG_WARNING, \
+ AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid)); \
+ goto label; \
+ } \
+} while (0)
+
+#define AFR_SBRAIN_CHECK_LOC(loc, label) do { \
+ if (loc->inode && afr_is_split_brain (this, loc->inode)) { \
+ op_errno = EIO; \
+ loc_path (loc, NULL); \
+ gf_log (this->name, GF_LOG_WARNING, \
+ AFR_SBRAIN_MSG , loc->path); \
+ goto label; \
+ } \
+} while (0)
+
int
afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
@@ -1129,4 +1200,13 @@ afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
void
afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
+int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
+
+void
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
+
+afr_inode_ctx_t*
+afr_inode_ctx_get (inode_t *inode, xlator_t *this);
+
#endif /* __AFR_H__ */
diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c
index db83410c7..a7f72fb30 100644
--- a/xlators/cluster/afr/src/pump.c
+++ b/xlators/cluster/afr/src/pump.c
@@ -2501,6 +2501,12 @@ init (xlator_t *this)
i++;
}
+ ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name);
+ if (-1 == ret) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
priv->first_lookup = 1;
priv->root_inode = NULL;
@@ -2532,7 +2538,7 @@ init (xlator_t *this)
goto out;
}
- pump_priv->env = syncenv_new (0);
+ pump_priv->env = this->ctx->env;
if (!pump_priv->env) {
gf_log (this->name, GF_LOG_ERROR,
"Could not create new sync-environment");
@@ -2573,9 +2579,6 @@ fini (xlator_t *this)
if (!pump_priv)
goto afr_priv;
- if (pump_priv->env)
- syncenv_destroy (pump_priv->env);
-
GF_FREE (pump_priv->resume_path);
LOCK_DESTROY (&pump_priv->resume_path_lock);
LOCK_DESTROY (&pump_priv->pump_state_lock);
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 8cbae676e..8f61339e6 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -22,6 +22,7 @@
#include "dht-common.h"
#include "defaults.h"
#include "byte-order.h"
+#include "glusterfs-acl.h"
#include <sys/time.h>
#include <libgen.h>
@@ -62,6 +63,11 @@ dht_aggregate (dict_t *this, char *key, data_t *value, void *data)
}
*size = hton64 (ntoh64 (*size) + ntoh64 (*ptr));
+
+ } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) {
+ ret = gf_get_min_stime (THIS, dst, key, value);
+ if (ret < 0)
+ return ret;
} else {
/* compare user xattrs only */
if (!strncmp (key, "user.", strlen ("user."))) {
@@ -148,9 +154,11 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
int op_errno = 0;
int ret = -1;
dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
local = discover_frame->local;
layout = local->layout;
+ conf = this->private;
LOCK(&discover_frame->lock);
{
@@ -193,11 +201,14 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
"(overlaps/holes present: %s, "
"ENOENT errors: %d)", local->loc.path,
(ret < 0) ? "yes" : "no", (ret > 0) ? ret : 0);
- op_errno = EINVAL;
- goto out;
+ if ((ret > 0) && (ret == conf->subvolume_cnt)) {
+ op_errno = ESTALE;
+ goto out;
+ }
}
- dht_layout_set (this, local->inode, layout);
+ if (local->inode)
+ dht_layout_set (this, local->inode, layout);
}
DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno,
@@ -430,7 +441,7 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
op_ret, op_errno, xattr);
if (op_ret == -1) {
- local->op_errno = ENOENT;
+ local->op_errno = op_errno;
gf_log (this->name, GF_LOG_DEBUG,
"lookup of %s on %s returned error (%s)",
local->loc.path, prev->this->name,
@@ -1404,7 +1415,6 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
conf = this->private;
if (!conf)
@@ -1625,7 +1635,8 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
LOCK (&frame->lock);
{
- if (op_ret == -1) {
+ if ((op_ret == -1) && !((op_errno == ENOENT) ||
+ (op_errno == ENOTCONN))) {
local->op_errno = op_errno;
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
@@ -1638,7 +1649,7 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
unlock:
UNLOCK (&frame->lock);
- if (op_ret == -1)
+ if (local->op_ret == -1)
goto err;
cached_subvol = dht_subvol_get_cached (this, local->loc.inode);
@@ -1777,6 +1788,7 @@ dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this,
}
(void) strcat (local->xattr_val, value);
+ (void) strcat (local->xattr_val, " ");
local->op_ret = 0;
}
@@ -1801,6 +1813,8 @@ dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this,
if (!*dict)
goto out;
+ local->xattr_val[strlen (local->xattr_val) - 1] = '\0';
+
/* we would need max this many bytes to create xattr string
* extra 40 bytes is just an estimated amount of additional
* space required as we include translator name and some
@@ -2033,10 +2047,8 @@ dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie,
{
int this_call_cnt = 0;
dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- conf = this->private;
local = frame->local;
if (op_ret != -1) {
@@ -2063,7 +2075,6 @@ int
dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *key, dict_t *xdata)
{
- dht_conf_t *conf = NULL;
dht_local_t *local = NULL;
int i = 0;
dht_layout_t *layout = NULL;
@@ -2071,7 +2082,6 @@ dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this,
xlator_t *subvol = NULL;
- conf = this->private;
local = frame->local;
layout = local->layout;
@@ -2112,7 +2122,6 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -2157,8 +2166,9 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
* NOTE: Don't trust inode here, as that may not be valid
* (until inode_link() happens)
*/
- if (key && (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)
- && DHT_IS_DIR(layout)) {
+ if (key && DHT_IS_DIR(layout) &&
+ ((strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)
+ || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) {
(void) strncpy (local->xsel, key, 256);
cnt = local->call_cnt = layout->cnt;
for (i = 0; i < cnt; i++) {
@@ -2229,7 +2239,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
if (cluster_getmarkerattr (frame, this, loc, key,
local, dht_getxattr_unwind,
sub_volumes, cnt,
- MARKER_UUID_TYPE, conf->vol_uuid)) {
+ MARKER_UUID_TYPE, marker_uuid_default_gauge,
+ conf->vol_uuid)) {
op_errno = EINVAL;
goto err;
}
@@ -2253,6 +2264,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
local, dht_getxattr_unwind,
sub_volumes, cnt,
MARKER_XTIME_TYPE,
+ marker_xtime_default_gauge,
conf->vol_uuid)) {
op_errno = EINVAL;
goto err;
@@ -2468,7 +2480,6 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
conf = this->private;
@@ -2682,7 +2693,6 @@ dht_removexattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_REMOVEXATTR);
if (!local) {
@@ -2914,7 +2924,6 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -3035,7 +3044,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
list_for_each_entry (orig_entry, (&orig_entries->list), list) {
next_offset = orig_entry->d_off;
if (check_is_dir (NULL, (&orig_entry->d_stat), NULL) &&
- (prev->this != dht_first_up_subvol (this))) {
+ (prev->this != local->first_up_subvol)) {
continue;
}
if (check_is_linkfile (NULL, (&orig_entry->d_stat),
@@ -3117,13 +3126,16 @@ done:
}
if (conf->readdir_optimize == _gf_true) {
- if (next_subvol != dht_first_up_subvol (this)) {
+ if (next_subvol != local->first_up_subvol) {
ret = dict_set_int32 (local->xattr,
GF_READDIR_SKIP_DIRS, 1);
if (ret)
gf_log (this->name, GF_LOG_ERROR,
"dict set failed");
- }
+ } else {
+ dict_del (local->xattr,
+ GF_READDIR_SKIP_DIRS);
+ }
}
STACK_WIND (frame, dht_readdirp_cbk,
@@ -3269,6 +3281,7 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
local->fd = fd_ref (fd);
local->size = size;
local->xattr_req = (dict)? dict_ref (dict) : NULL;
+ local->first_up_subvol = dht_first_up_subvol (this);
dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff);
@@ -3287,13 +3300,16 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
"failed to set '%s' key",
conf->link_xattr_name);
if (conf->readdir_optimize == _gf_true) {
- if (xvol != dht_first_up_subvol (this)) {
+ if (xvol != local->first_up_subvol) {
ret = dict_set_int32 (local->xattr,
GF_READDIR_SKIP_DIRS, 1);
if (ret)
gf_log (this->name,
GF_LOG_ERROR,
"Dict set failed");
+ } else {
+ dict_del (local->xattr,
+ GF_READDIR_SKIP_DIRS);
}
}
}
@@ -3551,7 +3567,9 @@ dht_mknod (call_frame_t *frame, xlator_t *this,
subvol, subvol->fops->mknod, loc, mode,
rdev, umask, params);
} else {
- avail_subvol = dht_free_disk_available_subvol (this, subvol);
+
+ avail_subvol = dht_free_disk_available_subvol (this, subvol,
+ local);
if (avail_subvol != subvol) {
/* Choose the minimum filled volume, and create the
files there */
@@ -3972,7 +3990,7 @@ dht_create (call_frame_t *frame, xlator_t *this,
}
/* Choose the minimum filled volume, and create the
files there */
- avail_subvol = dht_free_disk_available_subvol (this, subvol);
+ avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
if (avail_subvol != subvol) {
local->params = dict_ref (params);
local->flags = flags;
@@ -4848,7 +4866,6 @@ dht_entrylk (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK);
if (!local) {
@@ -5156,8 +5173,8 @@ unlock:
* not need to handle CHILD_DOWN event here.
*/
if (conf->defrag) {
- ret = pthread_create (&conf->defrag->th, NULL,
- gf_defrag_start, this);
+ ret = gf_thread_create (&conf->defrag->th, NULL,
+ gf_defrag_start, this);
if (ret) {
conf->defrag = NULL;
GF_FREE (conf->defrag);
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index f079e688b..5ccd66799 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -183,6 +183,7 @@ struct dht_local {
xlator_t *link_subvol;
struct dht_rebalance_ rebalance;
+ xlator_t *first_up_subvol;
};
typedef struct dht_local dht_local_t;
@@ -211,6 +212,10 @@ enum gf_defrag_status_t {
GF_DEFRAG_STATUS_STOPPED,
GF_DEFRAG_STATUS_COMPLETE,
GF_DEFRAG_STATUS_FAILED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED,
};
typedef enum gf_defrag_status_t gf_defrag_status_t;
@@ -227,6 +232,7 @@ struct gf_defrag_info_ {
uint64_t total_data;
uint64_t num_files_lookedup;
uint64_t total_failures;
+ uint64_t skipped;
gf_lock_t lock;
int cmd;
pthread_t th;
@@ -437,6 +443,7 @@ int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt
xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc);
xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode);
xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev);
+xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev);
int dht_subvol_cnt (xlator_t *this, xlator_t *subvol);
int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p);
@@ -461,7 +468,8 @@ dht_layout_sort_volname (dht_layout_t *layout);
int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc);
gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol);
-xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol);
+xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
+ dht_local_t *layout);
int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx);
int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode);
@@ -683,6 +691,12 @@ int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt *stbuf, int32_t valid, dict_t *xdata);
int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt *stbuf, int32_t valid, dict_t *xdata);
+int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t mode, off_t offset, size_t len, dict_t *xdata);
+int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, size_t len, dict_t *xdata);
+int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, size_t len, dict_t *xdata);
int32_t dht_init (xlator_t *this);
void dht_fini (xlator_t *this);
@@ -752,9 +766,11 @@ dht_dir_has_layout (dict_t *xattr, char *name);
gf_boolean_t
dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator);
xlator_t *
-dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol);
+dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout);
xlator_t *
-dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol);
+dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout);
int
dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this);
@@ -765,4 +781,7 @@ dht_priv_dump (xlator_t *this);
int32_t
dht_inodectx_dump (xlator_t *this, inode_t *inode);
+int
+dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol);
+
#endif/* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 0c87f4a64..fe3955ecb 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -251,25 +251,45 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)
/*Get the best subvolume to create the file in*/
xlator_t *
-dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol)
+dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
+ dht_local_t *local)
{
xlator_t *avail_subvol = NULL;
dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ loc_t *loc = NULL;
conf = this->private;
+ if (!local)
+ goto out;
+ loc = &local->loc;
+ if (!local->layout) {
+ layout = dht_layout_get (this, loc->parent);
+
+ if (!layout) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "layout missing path=%s parent=%s",
+ loc->path, uuid_utoa (loc->parent->gfid));
+ goto out;
+ }
+ } else {
+ layout = dht_layout_ref (this, local->layout);
+ }
- LOCK (&conf->subvolume_lock);
+ LOCK (&conf->subvolume_lock);
{
- avail_subvol = dht_subvol_with_free_space_inodes(this, subvol);
+ avail_subvol = dht_subvol_with_free_space_inodes(this, subvol,
+ layout);
if(!avail_subvol)
{
avail_subvol = dht_subvol_maxspace_nonzeroinode(this,
- subvol);
+ subvol,
+ layout);
}
}
UNLOCK (&conf->subvolume_lock);
-
+out:
if (!avail_subvol) {
gf_log (this->name,
GF_LOG_DEBUG,
@@ -278,17 +298,42 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol)
avail_subvol = subvol;
}
-
+ if (layout)
+ dht_layout_unref (this, layout);
return avail_subvol;
}
+static inline
+int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout)
+{
+ int ret = -1;
+ int i = 0;
+
+ if (!this || !layout)
+ goto out;
+
+ /* check if subvol has layout errors, before selecting it */
+ for (i = 0; i < layout->cnt; i++) {
+ if (!strcmp (layout->list[i].xlator->name, this->name) &&
+ (layout->list[i].err != 0)) {
+ ret = -1;
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
/*Get subvolume which has both space and inodes more than the min criteria*/
xlator_t *
-dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol)
+dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout)
{
int i = 0;
double max = 0;
double max_inodes = 0;
+ int ignore_subvol = 0;
xlator_t *avail_subvol = NULL;
dht_conf_t *conf = NULL;
@@ -296,6 +341,12 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol)
conf = this->private;
for(i=0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors, before selecting it */
+ ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
+ layout);
+ if (ignore_subvol)
+ continue;
+
if ((conf->disk_unit == 'p') &&
(conf->du_stats[i].avail_percent > conf->min_free_disk) &&
(conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
@@ -325,10 +376,12 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol)
/* Get subvol which has atleast one inode and maximum space */
xlator_t *
-dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol)
+dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout)
{
int i = 0;
double max = 0;
+ int ignore_subvol = 0;
xlator_t *avail_subvol = NULL;
dht_conf_t *conf = NULL;
@@ -336,6 +389,12 @@ dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol)
conf = this->private;
for (i = 0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors, before selecting it */
+ ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
+ layout);
+ if (ignore_subvol)
+ continue;
+
if (conf->disk_unit == 'p') {
if ((conf->du_stats[i].avail_percent > max)
&& (conf->du_stats[i].avail_inodes > 0 )) {
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index 56c139aa9..311a48112 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -18,6 +18,28 @@
#include "xlator.h"
#include "dht-common.h"
+static inline int
+dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol)
+{
+ uint64_t tmp_subvol = 0;
+
+ tmp_subvol = (long)subvol;
+ return inode_ctx_set1 (inode, this, &tmp_subvol);
+}
+
+int
+dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol)
+{
+ int ret = -1;
+ uint64_t tmp_subvol = 0;
+
+ ret = inode_ctx_get1 (inode, this, &tmp_subvol);
+ if (tmp_subvol && subvol)
+ *subvol = (xlator_t *)tmp_subvol;
+
+ return ret;
+}
+
int
dht_frame_return (call_frame_t *frame)
@@ -340,20 +362,6 @@ out:
return local;
}
-
-char *
-basestr (const char *str)
-{
- char *basestr = NULL;
-
- basestr = strrchr (str, '/');
- if (basestr)
- basestr ++;
-
- return basestr;
-}
-
-
xlator_t *
dht_first_up_subvol (xlator_t *this)
{
@@ -505,7 +513,36 @@ out:
return next;
}
+/* This func wraps around, if prev is actually the last subvol.
+ */
+xlator_t *
+dht_subvol_next_available (xlator_t *this, xlator_t *prev)
+{
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *next = NULL;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == prev) {
+ /* if prev is last in conf->subvolumes, then wrap
+ * around.
+ */
+ if ((i + 1) < conf->subvolume_cnt) {
+ next = conf->subvolumes[i + 1];
+ } else {
+ next = conf->subvolumes[0];
+ }
+ break;
+ }
+ }
+out:
+ return next;
+}
int
dht_subvol_cnt (xlator_t *this, xlator_t *subvol)
{
@@ -698,6 +735,10 @@ dht_migration_complete_check_task (void *data)
loc_t tmp_loc = {0,};
char *path = NULL;
dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ uint64_t tmp_subvol = 0;
+ int open_failed = 0;
this = THIS;
frame = data;
@@ -709,6 +750,8 @@ dht_migration_complete_check_task (void *data)
if (!local->loc.inode && !local->fd)
goto out;
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
/* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
* as root:root. If a fd is already open, access check wont be done*/
@@ -771,10 +814,7 @@ dht_migration_complete_check_task (void *data)
/* update inode ctx (the layout) */
dht_layout_unref (this, local->layout);
- if (!local->loc.inode)
- ret = dht_layout_preset (this, dst_node, local->fd->inode);
- else
- ret = dht_layout_preset (this, dst_node, local->loc.inode);
+ ret = dht_layout_preset (this, dst_node, inode);
if (ret != 0) {
gf_log (this->name, GF_LOG_DEBUG,
"%s: could not set preset layout for subvol %s",
@@ -792,10 +832,7 @@ dht_migration_complete_check_task (void *data)
goto out;
}
- if (!local->loc.inode)
- ret = dht_layout_set (this, local->fd->inode, layout);
- else
- ret = dht_layout_set (this, local->loc.inode, layout);
+ ret = dht_layout_set (this, inode, layout);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set the new layout",
@@ -806,39 +843,46 @@ dht_migration_complete_check_task (void *data)
local->cached_subvol = dst_node;
ret = 0;
- /* once we detect the migration complete, the fd-ctx is no more
- required.. delete the ctx */
- ret = fd_ctx_del (local->fd, this, NULL);
- if (!ret)
+ /* once we detect the migration complete, the inode-ctx2 is no more
+ required.. delete the ctx and also, it means, open() already
+ done on all the fd of inode */
+ ret = inode_ctx_reset1 (inode, this, &tmp_subvol);
+ if (tmp_subvol)
+ goto out;
+
+ if (list_empty (&inode->fd_list))
goto out;
/* perform open as root:root. There is window between linkfile
* creation(root:root) and setattr with the correct uid/gid
*/
SYNCTASK_SETID(0, 0);
- /* if 'local->fd' (ie, fd based operation), send a 'open()' on
- destination if not already done */
- if (local->loc.inode) {
- ret = syncop_open (dst_node, &local->loc,
- local->fd->flags, local->fd);
- } else {
- tmp_loc.inode = local->fd->inode;
- inode_path (local->fd->inode, NULL, &path);
- if (path)
- tmp_loc.path = path;
- ret = syncop_open (dst_node, &tmp_loc,
- local->fd->flags, local->fd);
- GF_FREE (path);
+ /* perform 'open()' on all the fd's present on the inode */
+ tmp_loc.inode = inode;
+ inode_path (inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ if (fd_is_anonymous (iter_fd))
+ continue;
+
+ ret = syncop_open (dst_node, &tmp_loc,
+ iter_fd->flags, iter_fd);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to open "
+ "the fd (%p, flags=0%o) on file %s @ %s",
+ iter_fd, iter_fd->flags, path, dst_node->name);
+ open_failed = 1;
+ }
}
+ GF_FREE (path);
+
SYNCTASK_SETID (frame->root->uid, frame->root->gid);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to send open() on target file at %s",
- local->loc.path, dst_node->name);
+ if (open_failed) {
+ ret = -1;
goto out;
}
-
ret = 0;
out:
@@ -883,6 +927,9 @@ dht_rebalance_inprogress_task (void *data)
struct iatt stbuf = {0,};
loc_t tmp_loc = {0,};
dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ int open_failed = 0;
this = THIS;
frame = data;
@@ -894,6 +941,8 @@ dht_rebalance_inprogress_task (void *data)
if (!local->loc.inode && !local->fd)
goto out;
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
/* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
* as root:root. If a fd is already open, access check wont be done*/
if (local->loc.inode) {
@@ -945,35 +994,47 @@ dht_rebalance_inprogress_task (void *data)
}
ret = 0;
+
+ if (list_empty (&inode->fd_list))
+ goto done;
+
/* perform open as root:root. There is window between linkfile
* creation(root:root) and setattr with the correct uid/gid
*/
SYNCTASK_SETID (0, 0);
- if (local->loc.inode) {
- ret = syncop_open (dst_node, &local->loc,
- local->fd->flags, local->fd);
- } else {
- tmp_loc.inode = local->fd->inode;
- inode_path (local->fd->inode, NULL, &path);
- if (path)
- tmp_loc.path = path;
+
+ tmp_loc.inode = inode;
+ inode_path (inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ if (fd_is_anonymous (iter_fd))
+ continue;
+
ret = syncop_open (dst_node, &tmp_loc,
- local->fd->flags, local->fd);
- GF_FREE (path);
+ iter_fd->flags, iter_fd);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to send open "
+ "the fd (%p, flags=0%o) on file %s @ %s",
+ iter_fd, iter_fd->flags, path, dst_node->name);
+ open_failed = 1;
+ }
}
+ GF_FREE (path);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to send open() on target file at %s",
- local->loc.path, dst_node->name);
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+
+ if (open_failed) {
+ ret = -1;
goto out;
}
- SYNCTASK_SETID (frame->root->uid, frame->root->gid);
- ret = fd_ctx_set (local->fd, this, (uint64_t)(long)dst_node);
+done:
+ ret = dht_inode_ctx_set1 (this, inode, dst_node);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to set fd-ctx target file at %s",
+ "%s: failed to set inode-ctx target file at %s",
local->loc.path, dst_node->name);
goto out;
}
diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c
index d9b311fe2..ece84151a 100644
--- a/xlators/cluster/dht/src/dht-inode-read.c
+++ b/xlators/cluster/dht/src/dht-inode-read.c
@@ -130,10 +130,11 @@ int
dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata)
{
- uint64_t tmp_subvol = 0;
+ xlator_t *subvol = 0;
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ inode_t *inode = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -157,19 +158,20 @@ dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->op_errno = op_errno;
/* Check if the rebalance phase2 is true */
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (ret) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+ ret = dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
/* Phase 2 of migration */
local->rebalance.target_op_fn = dht_attr2;
ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
} else {
/* value is already set in fd_ctx, that means no need
to check for whether its complete or not. */
dht_attr2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
}
out:
@@ -382,6 +384,8 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
dht_local_t *local = NULL;
int ret = 0;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
if (!local) {
@@ -400,17 +404,18 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->op_errno = op_errno;
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
/* File would be migrated to other node */
- ret = fd_ctx_get (local->fd, this, NULL);
- if (ret) {
+ ret = dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
local->rebalance.target_op_fn = dht_readv2;
ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
} else {
/* value is already set in fd_ctx, that means no need
to check for whether its complete or not. */
dht_readv2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
}
out:
@@ -501,18 +506,27 @@ dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int ret = -1;
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
+ call_frame_t *prev = NULL;
local = frame->local;
+ prev = cookie;
+ if (!prev || !prev->this)
+ goto out;
if (local->call_cnt != 1)
goto out;
if ((op_ret == -1) && (op_errno == ENOTCONN) &&
IA_ISDIR(local->loc.inode->ia_type)) {
- subvol = dht_first_up_subvol (this);
+ subvol = dht_subvol_next_available (this, prev->this);
if (!subvol)
goto out;
+ /* check if we are done with visiting every node */
+ if (subvol == local->cached_subvol) {
+ goto out;
+ }
+
STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access,
&local->loc, local->rebalance.flags, NULL);
return 0;
@@ -607,8 +621,9 @@ int
dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int ret = -1;
+ dht_local_t *local = NULL;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
@@ -618,8 +633,8 @@ dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
/* If context is set, then send flush() it to the destination */
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (subvol) {
dht_flush2 (this, frame, 0);
return 0;
}
@@ -635,14 +650,10 @@ dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -704,6 +715,8 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
prev = cookie;
@@ -725,8 +738,8 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
}
local->op_errno = op_errno;
- ret = fd_ctx_get (local->fd, this, NULL);
- if (ret) {
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
local->rebalance.target_op_fn = dht_fsync2;
/* Check if the rebalance phase1 is true */
@@ -741,11 +754,12 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
if (IS_DHT_MIGRATION_PHASE2 (postbuf)) {
ret = dht_rebalance_complete_check (this, frame);
}
+ if (!ret)
+ return 0;
} else {
dht_fsync2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
out:
DHT_STRIP_PHASE1_FLAGS (postbuf);
@@ -761,15 +775,10 @@ dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
-
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c
index b23434cdf..4b3f3a049 100644
--- a/xlators/cluster/dht/src/dht-inode-write.c
+++ b/xlators/cluster/dht/src/dht-inode-write.c
@@ -19,6 +19,9 @@
int dht_writev2 (xlator_t *this, call_frame_t *frame, int ret);
int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret);
int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret);
+int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret);
+int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret);
+int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret);
int
dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -27,6 +30,7 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
dht_local_t *local = NULL;
int ret = -1;
+ xlator_t *subvol = NULL;
if (op_ret == -1 && (op_errno != ENOENT)) {
goto out;
@@ -63,8 +67,8 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+ if (subvol) {
dht_writev2 (this, frame, 0);
return 0;
}
@@ -88,14 +92,10 @@ dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -170,6 +170,8 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -211,8 +213,9 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (subvol) {
dht_truncate2 (this, frame, 0);
return 0;
}
@@ -236,16 +239,13 @@ dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
+ inode_t *inode = NULL;
local = frame->local;
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ inode = local->fd ? local->fd->inode : local->loc.inode;
+ dht_inode_ctx_get1 (this, inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -348,6 +348,407 @@ err:
return 0;
}
+
+int
+dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int ret = -1;
+ xlator_t *subvol = NULL;
+
+ GF_VALIDATE_OR_GOTO ("dht", frame, err);
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((op_ret == -1) && (op_errno != ENOENT)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+ dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+ }
+ goto out;
+ }
+ local->rebalance.target_op_fn = dht_fallocate2;
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+ ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+ dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+ dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+ if (subvol) {
+ dht_fallocate2 (this, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS (postbuf);
+ DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STACK_UNWIND (fallocate, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+int
+dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+
+ if (!subvol)
+ subvol = local->cached_subvol;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_fallocate_cbk, subvol, subvol->fops->fallocate,
+ local->fd, local->rebalance.flags, local->rebalance.offset,
+ local->rebalance.size, NULL);
+
+ return 0;
+}
+
+int
+dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ local = dht_local_init (frame, NULL, fd, GF_FOP_FALLOCATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.flags = mode;
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_fallocate_cbk,
+ subvol, subvol->fops->fallocate,
+ fd, mode, offset, len, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+int
+dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int ret = -1;
+ xlator_t *subvol = NULL;
+
+ GF_VALIDATE_OR_GOTO ("dht", frame, err);
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((op_ret == -1) && (op_errno != ENOENT)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+ dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+ }
+ goto out;
+ }
+ local->rebalance.target_op_fn = dht_discard2;
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+ ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+ dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+ dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+ if (subvol) {
+ dht_discard2 (this, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS (postbuf);
+ DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STACK_UNWIND (discard, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+int
+dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+
+ if (!subvol)
+ subvol = local->cached_subvol;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_discard_cbk, subvol, subvol->fops->discard,
+ local->fd, local->rebalance.offset, local->rebalance.size,
+ NULL);
+
+ return 0;
+}
+
+int
+dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ local = dht_local_init (frame, NULL, fd, GF_FOP_DISCARD);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_discard_cbk, subvol, subvol->fops->discard,
+ fd, offset, len, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("dht", frame, err);
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((op_ret == -1) && (op_errno != ENOENT)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+ dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+ }
+ goto out;
+ }
+ local->rebalance.target_op_fn = dht_zerofill2;
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+ ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+ dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+ dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+ ret = fd_ctx_get (local->fd, this, NULL);
+ if (!ret) {
+ dht_zerofill2 (this, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS (postbuf);
+ DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+int
+dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ uint64_t tmp_subvol = 0;
+ int ret = -1;
+
+ local = frame->local;
+
+ if (local->fd)
+ ret = fd_ctx_get (local->fd, this, &tmp_subvol);
+ if (!ret)
+ subvol = (xlator_t *)(long)tmp_subvol;
+
+ if (!subvol)
+ subvol = local->cached_subvol;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill,
+ local->fd, local->rebalance.offset, local->rebalance.size,
+ NULL);
+
+ return 0;
+}
+
+int
+dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill,
+ fd, offset, len, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+
/* handle cases of migration here for 'setattr()' calls */
int
dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -399,15 +800,13 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
+ inode_t *inode = NULL;
local = frame->local;
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+ dht_inode_ctx_get1 (this, inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
index 057229869..38e9970a7 100644
--- a/xlators/cluster/dht/src/dht-layout.c
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -454,12 +454,19 @@ dht_layout_entry_cmp (dht_layout_t *layout, int i, int j)
{
int64_t diff = 0;
+ /* swap zero'ed out layouts to front, if needed */
+ if (!layout->list[j].start && !layout->list[j].stop) {
+ diff = (int64_t) layout->list[i].stop
+ - (int64_t) layout->list[j].stop;
+ goto out;
+ }
if (layout->list[i].err || layout->list[j].err)
diff = layout->list[i].err - layout->list[j].err;
else
diff = (int64_t) layout->list[i].start
- (int64_t) layout->list[j].start;
+out:
return diff;
}
@@ -534,13 +541,13 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
case -1:
case ENOENT:
missing++;
- break;
+ continue;
case ENOTCONN:
down++;
- break;
+ continue;
case ENOSPC:
no_space++;
- break;
+ continue;
case 0:
/* if err == 0 and start == stop, then it is a non misc++;
* participating subvolume(spread-cnt). Then, do not
@@ -552,6 +559,7 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
break;
default:
misc++;
+ continue;
}
is_virgin = 0;
diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c
index b4a074b65..dbc9d0b3c 100644
--- a/xlators/cluster/dht/src/dht-linkfile.c
+++ b/xlators/cluster/dht/src/dht-linkfile.c
@@ -19,6 +19,35 @@
#include "compat.h"
#include "dht-common.h"
+int
+dht_linkfile_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ char is_linkfile = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ if (op_ret)
+ goto out;
+
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
+ if (!is_linkfile)
+ gf_log (this->name, GF_LOG_WARNING, "got non-linkfile %s:%s",
+ prev->this->name, local->loc.path);
+out:
+ local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
+ inode, stbuf, postparent, postparent,
+ xattr);
+ return 0;
+}
#define is_equal(a, b) (a == b)
int
@@ -28,15 +57,47 @@ dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postparent, dict_t *xdata)
{
dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ call_frame_t *prev = NULL;
+ dict_t *xattrs = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
local = frame->local;
if (!op_ret)
local->linked = _gf_true;
+ FRAME_SU_UNDO (frame, dht_local_t);
+
+ if (op_ret && (op_errno == EEXIST)) {
+ conf = this->private;
+ prev = cookie;
+ subvol = prev->this;
+ if (!subvol)
+ goto out;
+ xattrs = dict_new ();
+ if (!xattrs)
+ goto out;
+ ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set linkto key");
+ goto out;
+ }
+
+ STACK_WIND (frame, dht_linkfile_lookup_cbk, subvol,
+ subvol->fops->lookup, &local->loc, xattrs);
+ if (xattrs)
+ dict_unref (xattrs);
+ return 0;
+ }
+out:
local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
inode, stbuf, preparent, postparent,
xdata);
+ if (xattrs)
+ dict_unref (xattrs);
return 0;
}
@@ -88,6 +149,9 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
}
local->link_subvol = fromvol;
+ /* Always create as root:root. dht_linkfile_attr_heal fixes the
+ * ownsership */
+ FRAME_SU_DO (frame, dht_local_t);
STACK_WIND (frame, dht_linkfile_create_cbk,
fromvol, fromvol->fops->mknod, loc,
S_IFREG | DHT_LINKFILE_MODE, 0, 0, dict);
@@ -233,11 +297,11 @@ dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this)
GF_VALIDATE_OR_GOTO ("dht", local, out);
GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out);
- if ((local->stbuf.ia_type == IA_INVAL) ||
- (is_equal (frame->root->uid, local->stbuf.ia_uid) &&
- is_equal (frame->root->gid, local->stbuf.ia_gid)))
+ if (local->stbuf.ia_type == IA_INVAL)
return 0;
+ uuid_copy (local->loc.gfid, local->stbuf.ia_gfid);
+
copy = copy_frame (frame);
if (!copy)
@@ -253,6 +317,8 @@ dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this)
copy->local = copy_local;
+ FRAME_SU_DO (copy, dht_local_t);
+
STACK_WIND (copy, dht_linkfile_setattr_cbk, subvol,
subvol->fops->setattr, &copy_local->loc,
&stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL);
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index a9f913c2d..bcb19f23e 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -243,7 +243,7 @@ out:
static inline int
__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf,
- dict_t *dict, fd_t **dst_fd)
+ dict_t *dict, fd_t **dst_fd, dict_t *xattr)
{
xlator_t *this = NULL;
int ret = -1;
@@ -307,6 +307,12 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
goto out;
}
+ ret = syncop_fsetxattr (to, fd, xattr, 0);
+ if (ret == -1)
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to set xattr on %s (%s)",
+ loc->path, to->name, strerror (errno));
+
ret = syncop_ftruncate (to, fd, stbuf->ia_size);
if (ret < 0)
gf_log (this->name, GF_LOG_ERROR,
@@ -340,6 +346,9 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
int ret = -1;
xlator_t *this = NULL;
+ uint64_t src_statfs_blocks = 1;
+ uint64_t dst_statfs_blocks = 1;
+
this = THIS;
ret = syncop_statfs (from, loc, &src_statfs);
@@ -363,22 +372,34 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
if (flag != GF_DHT_MIGRATE_DATA)
goto check_avail_space;
- if (((dst_statfs.f_bavail *
- dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) <
- (((src_statfs.f_bavail * src_statfs.f_bsize) /
- GF_DISK_SECTOR_SIZE) - stbuf->ia_blocks)) {
- gf_log (this->name, GF_LOG_WARNING,
- "data movement attempted from node (%s) with"
- " higher disk space to a node (%s) with "
- "lesser disk space (%s)", from->name,
- to->name, loc->path);
-
- /* this is not a 'failure', but we don't want to
- consider this as 'success' too :-/ */
- ret = 1;
- goto out;
+ /* Check:
+ During rebalance `migrate-data` - Destination subvol experiences
+ a `reduction` in 'blocks' of free space, at the same time source
+ subvol gains certain 'blocks' of free space. A valid check is
+ necessary here to avoid errorneous move to destination where
+ the space could be scantily available.
+ */
+ if (stbuf) {
+ dst_statfs_blocks = ((dst_statfs.f_bavail *
+ dst_statfs.f_bsize) /
+ GF_DISK_SECTOR_SIZE);
+ src_statfs_blocks = ((src_statfs.f_bavail *
+ src_statfs.f_bsize) /
+ GF_DISK_SECTOR_SIZE);
+ if ((dst_statfs_blocks - stbuf->ia_blocks) <
+ (src_statfs_blocks + stbuf->ia_blocks)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "data movement attempted from node (%s) with"
+ " higher disk space to a node (%s) with "
+ "lesser disk space (%s)", from->name,
+ to->name, loc->path);
+
+ /* this is not a 'failure', but we don't want to
+ consider this as 'success' too :-/ */
+ ret = 1;
+ goto out;
+ }
}
-
check_avail_space:
if (((dst_statfs.f_bavail * dst_statfs.f_bsize) /
GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) {
@@ -620,6 +641,15 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
}
done:
+ ret = syncop_setattr (to, loc, buf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID |
+ GF_SET_ATTR_MODE), NULL, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform setattr on %s (%s)",
+ loc->path, to->name, strerror (errno));
+ }
+
ret = syncop_unlink (from, loc);
if (ret)
gf_log (this->name, GF_LOG_WARNING, "%s: unlink failed (%s)",
@@ -699,9 +729,16 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+ /* TODO: move all xattr related operations to fd based operations */
+ ret = syncop_listxattr (from, loc, &xattr);
+ if (ret == -1)
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to get xattr from %s (%s)",
+ loc->path, from->name, strerror (errno));
+
/* create the destination, with required modes/xattr */
ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf,
- dict, &dst_fd);
+ dict, &dst_fd, xattr);
if (ret)
goto out;
@@ -718,6 +755,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+
ret = syncop_fstat (from, src_fd, &stbuf);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s (%s)",
@@ -747,19 +785,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
- /* TODO: move all xattr related operations to fd based operations */
- ret = syncop_listxattr (from, loc, &xattr);
- if (ret == -1)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to get xattr from %s (%s)",
- loc->path, from->name, strerror (errno));
-
- ret = syncop_setxattr (to, loc, xattr, 0);
- if (ret == -1)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set xattr on %s (%s)",
- loc->path, to->name, strerror (errno));
-
/* TODO: Sync the locks */
ret = syncop_fsync (to, dst_fd, 0);
@@ -823,6 +848,23 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+ /* Free up the data blocks on the source node, as the whole
+ file is migrated */
+ ret = syncop_ftruncate (from, src_fd, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform truncate on %s (%s)",
+ loc->path, from->name, strerror (errno));
+ }
+
+ /* remove the 'linkto' xattr from the destination */
+ ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform removexattr on %s (%s)",
+ loc->path, to->name, strerror (errno));
+ }
+
/* Do a stat and check the gfid before unlink */
ret = syncop_stat (from, loc, &empty_iatt);
if (ret) {
@@ -843,23 +885,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
}
- /* Free up the data blocks on the source node, as the whole
- file is migrated */
- ret = syncop_ftruncate (from, src_fd, 0);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to perform truncate on %s (%s)",
- loc->path, from->name, strerror (errno));
- }
-
- /* remove the 'linkto' xattr from the destination */
- ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to perform removexattr on %s (%s)",
- loc->path, to->name, strerror (errno));
- }
-
ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
@@ -1101,6 +1126,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
struct timeval end = {0,};
double elapsed = {0,};
struct timeval start = {0,};
+ int32_t err = 0;
gf_log (this->name, GF_LOG_INFO, "migrate data called on %s",
loc->path);
@@ -1258,9 +1284,21 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
ret = syncop_setxattr (this, &entry_loc, migrate_data,
0);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "migrate-data"
- " failed for %s", entry_loc.path);
- defrag->total_failures +=1;
+ err = op_errno;
+ /* errno is overloaded. See
+ * rebalance_task_completion () */
+ if (err != ENOSPC) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "migrate-data skipped for %s"
+ " due to space constraints",
+ entry_loc.path);
+ defrag->skipped +=1;
+ } else{
+ gf_log (this->name, GF_LOG_ERROR,
+ "migrate-data failed for %s",
+ entry_loc.path);
+ defrag->total_failures +=1;
+ }
}
if (ret == -1) {
@@ -1659,6 +1697,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
uint64_t size = 0;
uint64_t lookup = 0;
uint64_t failures = 0;
+ uint64_t skipped = 0;
char *status = "";
double elapsed = 0;
struct timeval end = {0,};
@@ -1675,6 +1714,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
size = defrag->total_data;
lookup = defrag->num_files_lookedup;
failures = defrag->total_failures;
+ skipped = defrag->skipped;
gettimeofday (&end, NULL);
@@ -1698,6 +1738,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
gf_log (THIS->name, GF_LOG_WARNING,
"failed to set lookedup file count");
+
ret = dict_set_int32 (dict, "status", defrag->defrag_status);
if (ret)
gf_log (THIS->name, GF_LOG_WARNING,
@@ -1710,6 +1751,14 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
}
ret = dict_set_uint64 (dict, "failures", failures);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to set failure count");
+
+ ret = dict_set_uint64 (dict, "skipped", skipped);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to set skipped file count");
log:
switch (defrag->defrag_status) {
case GF_DEFRAG_STATUS_NOT_STARTED:
@@ -1727,13 +1776,15 @@ log:
case GF_DEFRAG_STATUS_FAILED:
status = "failed";
break;
+ default:
+ break;
}
gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f "
"secs", status, elapsed);
gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %"
- PRIu64", lookups: %"PRIu64", failures: %"PRIu64, files, size,
- lookup, failures);
+ PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: "
+ "%"PRIu64, files, size, lookup, failures, skipped);
out:
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
index fd0208f71..5d6f4f232 100644
--- a/xlators/cluster/dht/src/dht-rename.c
+++ b/xlators/cluster/dht/src/dht-rename.c
@@ -306,7 +306,38 @@ err:
NULL, NULL);
return 0;
}
+#define DHT_MARK_FOP_INTERNAL(xattr) do { \
+ int tmp = -1; \
+ if (!xattr) { \
+ xattr = dict_new (); \
+ if (!xattr) \
+ break; \
+ } \
+ tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \
+ if (tmp) { \
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set" \
+ " internal dict key for %s", local->loc.path); \
+ } \
+ }while (0)
+int
+dht_rename_done (call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (local->linked == _gf_true) {
+ local->linked = _gf_false;
+ dht_linkfile_attr_heal (frame, this);
+ }
+ DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
+ DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
+ &local->stbuf, &local->preoldparent,
+ &local->postoldparent, &local->preparent,
+ &local->postparent, NULL);
+ return 0;
+}
int
dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -340,11 +371,7 @@ dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
WIPE (&local->postparent);
if (is_last_call (this_call_cnt)) {
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent, NULL);
+ dht_rename_done (frame, this);
}
out:
@@ -362,7 +389,7 @@ dht_rename_cleanup (call_frame_t *frame)
xlator_t *dst_hashed = NULL;
xlator_t *dst_cached = NULL;
int call_cnt = 0;
-
+ dict_t *xattr = NULL;
local = frame->local;
this = frame->this;
@@ -386,13 +413,15 @@ dht_rename_cleanup (call_frame_t *frame)
if (!call_cnt)
goto nolinks;
+ DHT_MARK_FOP_INTERNAL (xattr);
+
if (dst_hashed != src_hashed && dst_hashed != src_cached) {
gf_log (this->name, GF_LOG_TRACE,
"unlinking linkfile %s @ %s => %s",
local->loc.path, dst_hashed->name, src_cached->name);
STACK_WIND (frame, dht_rename_unlink_cbk,
dst_hashed, dst_hashed->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr);
}
if (src_cached != dst_hashed) {
@@ -401,9 +430,12 @@ dht_rename_cleanup (call_frame_t *frame)
local->loc2.path, src_cached->name);
STACK_WIND (frame, dht_rename_unlink_cbk,
src_cached, src_cached->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr);
}
+ if (xattr)
+ dict_unref (xattr);
+
return 0;
nolinks:
@@ -467,6 +499,7 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
xlator_t *rename_subvol = NULL;
call_frame_t *link_frame = NULL;
dht_local_t *link_local = NULL;
+ dict_t *xattr = NULL;
local = frame->local;
prev = cookie;
@@ -476,6 +509,8 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dst_hashed = local->dst_hashed;
dst_cached = local->dst_cached;
+ if (local->linked == _gf_true)
+ FRAME_SU_UNDO (frame, dht_local_t);
if (op_ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
"%s: rename on %s failed (%s)", local->loc.path,
@@ -510,17 +545,21 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
err:
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->preoldparent, preoldparent, prev->this);
- dht_iatt_merge (this, &local->postoldparent, postoldparent, prev->this);
- dht_iatt_merge (this, &local->preparent, prenewparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postnewparent, prev->this);
-
- if (local->linked == _gf_true) {
- local->linked = _gf_false;
- dht_linkfile_attr_heal (frame, this);
+ /* Merge attrs only from src_cached. In case there of src_cached !=
+ * dst_hashed, this ignores linkfile attrs. */
+ if (prev->this == src_cached) {
+ dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
+ dht_iatt_merge (this, &local->preoldparent, preoldparent,
+ prev->this);
+ dht_iatt_merge (this, &local->postoldparent, postoldparent,
+ prev->this);
+ dht_iatt_merge (this, &local->preparent, prenewparent,
+ prev->this);
+ dht_iatt_merge (this, &local->postparent, postnewparent,
+ prev->this);
}
+
/* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk
* is called. since rename has already happened on rename_subvol,
* unlink should not be sent for oldpath (either linkfile or cached-file)
@@ -544,6 +583,8 @@ err:
if (local->call_cnt == 0)
goto unwind;
+ DHT_MARK_FOP_INTERNAL (xattr);
+
if (src_cached != dst_hashed && src_cached != dst_cached) {
gf_log (this->name, GF_LOG_TRACE,
"deleting old src datafile %s @ %s",
@@ -551,7 +592,7 @@ err:
STACK_WIND (frame, dht_rename_unlink_cbk,
src_cached, src_cached->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr);
}
if (src_hashed != rename_subvol && src_hashed != src_cached) {
@@ -561,7 +602,7 @@ err:
STACK_WIND (frame, dht_rename_unlink_cbk,
src_hashed, src_hashed->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr);
}
if (dst_cached
@@ -573,8 +614,10 @@ err:
STACK_WIND (frame, dht_rename_unlink_cbk,
dst_cached, dst_cached->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr);
}
+ if (xattr)
+ dict_unref (xattr);
return 0;
unwind:
@@ -582,16 +625,16 @@ unwind:
WIPE (&local->postoldparent);
WIPE (&local->preparent);
WIPE (&local->postparent);
+ if (xattr)
+ dict_unref (xattr);
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent, NULL);
+ dht_rename_done (frame, this);
return 0;
cleanup:
+ if (xattr)
+ dict_unref (xattr);
dht_rename_cleanup (frame);
return 0;
@@ -625,6 +668,8 @@ dht_do_rename (call_frame_t *frame)
"renaming %s => %s (%s)",
local->loc.path, local->loc2.path, rename_subvol->name);
+ if (local->linked == _gf_true)
+ FRAME_SU_DO (frame, dht_local_t);
STACK_WIND (frame, dht_rename_cbk,
rename_subvol, rename_subvol->fops->rename,
&local->loc, &local->loc2, NULL);
@@ -655,7 +700,8 @@ dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->op_ret = -1;
if (op_errno != ENOENT)
local->op_errno = op_errno;
- } else {
+ } else if (local->src_cached == prev->this) {
+ /* merge of attr returned only from linkfile creation */
dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
}
@@ -722,6 +768,7 @@ dht_rename_create_links (call_frame_t *frame)
xlator_t *dst_hashed = NULL;
xlator_t *dst_cached = NULL;
int call_cnt = 0;
+ dict_t *xattr = NULL;
local = frame->local;
@@ -732,6 +779,7 @@ dht_rename_create_links (call_frame_t *frame)
dst_hashed = local->dst_hashed;
dst_cached = local->dst_cached;
+ DHT_MARK_FOP_INTERNAL (xattr);
if (src_cached == dst_cached) {
if (dst_hashed == dst_cached)
@@ -743,7 +791,7 @@ dht_rename_create_links (call_frame_t *frame)
STACK_WIND (frame, dht_rename_unlink_links_cbk,
dst_hashed, dst_hashed->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr);
return 0;
}
@@ -770,7 +818,7 @@ dht_rename_create_links (call_frame_t *frame)
local->loc2.path, src_cached->name);
STACK_WIND (frame, dht_rename_links_cbk,
src_cached, src_cached->fops->link,
- &local->loc, &local->loc2, NULL);
+ &local->loc, &local->loc2, xattr);
}
nolinks:
@@ -778,6 +826,8 @@ nolinks:
/* skip to next step */
dht_do_rename (frame);
}
+ if (xattr)
+ dict_unref (xattr);
return 0;
}
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index b220a0e25..3fe96b1c7 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -17,6 +17,7 @@
#include "glusterfs.h"
#include "xlator.h"
#include "dht-common.h"
+#include "glusterfs-acl.h"
#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \
layout->list[i].start = srt; \
@@ -564,9 +565,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
for (i = 0; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1 || err == 0) {
- layout->list[i].err = -1;
+ if (err == -1 || err == 0 || err == ENOENT) {
+ /* Setting list[i].err = -1 is an indication for
+ dht_selfheal_layout_new_directory() to assign
+ a range. We set it to -1 based on any one of
+ the three criteria:
+
+ - err == -1 already, which means directory
+ existed but layout was not set on it.
+
+ - err == 0, which means directory exists and
+ has an old layout piece which will be
+ overwritten now.
+
+ - err == ENOENT, which means directory does
+ not exist (possibly racing with mkdir or
+ finishing half done mkdir). The missing
+ directory will be attempted to be recreated.
+
+ It is important to note that it is safe
+ to race with mkdir() as self-heal and
+ mkdir are idempotent operations. Both will
+ strive to set the directory and layouts to
+ the same final state.
+ */
count++;
+ if (!err)
+ layout->list[i].err = -1;
}
}
@@ -776,7 +801,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
DHT_RESET_LAYOUT_RANGE (layout);
for (i = start_subvol; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {
@@ -789,7 +814,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
for (i = 0; i < start_subvol; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index 6b91c56a1..70aac7710 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -262,6 +262,28 @@ out:
return ret;
}
+
+int
+dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf)
+{
+ int i = 0;
+ int ret = -1;
+
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->decommissioned_bricks[i]) {
+ conf->decommissioned_bricks[i] = NULL;
+ conf->decommission_subvols_cnt--;
+ }
+ }
+
+ ret = 0;
+out:
+
+ return ret;
+}
void
dht_init_regex (xlator_t *this, dict_t *odict, char *name,
regex_t *re, gf_boolean_t *re_valid)
@@ -358,6 +380,10 @@ dht_reconfigure (xlator_t *this, dict_t *options)
ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
if (ret == -1)
goto out;
+ } else {
+ ret = dht_decommissioned_remove (this, conf);
+ if (ret == -1)
+ goto out;
}
dht_init_regex (this, options, "rsync-hash-regex",
diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c
index 814f0a8eb..fc0ca2f77 100644
--- a/xlators/cluster/dht/src/dht.c
+++ b/xlators/cluster/dht/src/dht.c
@@ -70,6 +70,9 @@ struct xlator_fops fops = {
.fxattrop = dht_fxattrop,
.setattr = dht_setattr,
.fsetattr = dht_fsetattr,
+ .fallocate = dht_fallocate,
+ .discard = dht_discard,
+ .zerofill = dht_zerofill,
};
struct xlator_dumpops dumpops = {
diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c
index 5e5c68058..e934acdf0 100644
--- a/xlators/cluster/dht/src/nufa.c
+++ b/xlators/cluster/dht/src/nufa.c
@@ -323,7 +323,8 @@ nufa_create (call_frame_t *frame, xlator_t *this,
if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) {
avail_subvol =
dht_free_disk_available_subvol (this,
- (xlator_t *)conf->private);
+ (xlator_t *)conf->private,
+ local);
}
if (subvol != avail_subvol) {
@@ -427,7 +428,8 @@ nufa_mknod (call_frame_t *frame, xlator_t *this,
if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) {
avail_subvol =
dht_free_disk_available_subvol (this,
- (xlator_t *)conf->private);
+ (xlator_t *)conf->private,
+ local);
}
if (avail_subvol != subvol) {
@@ -482,97 +484,141 @@ same_first_part (char *str1, char term1, char *str2, char term2)
}
}
+typedef struct nufa_args {
+ xlator_t *this;
+ char *volname;
+ gf_boolean_t addr_match;
+} nufa_args_t;
+
+static void
+nufa_find_local_brick (xlator_t *xl, void *data)
+{
+ nufa_args_t *args = data;
+ xlator_t *this = args->this;
+ char *local_volname = args->volname;
+ gf_boolean_t addr_match = args->addr_match;
+ char *brick_host = NULL;
+ dht_conf_t *conf = this->private;
+ int ret = -1;
+
+ /*This means a local subvol was already found. We pick the first brick
+ * that is local*/
+ if (conf->private)
+ return;
+
+ if (strcmp (xl->name, local_volname) == 0) {
+ conf->private = xl;
+ gf_log (this->name, GF_LOG_INFO, "Using specified subvol %s",
+ local_volname);
+ return;
+ }
+
+ if (!addr_match)
+ return;
+
+ ret = dict_get_str (xl->options, "remote-host", &brick_host);
+ if ((ret == 0) &&
+ (gf_is_same_address (local_volname, brick_host) ||
+ gf_is_local_addr (brick_host))) {
+ conf->private = xl;
+ gf_log (this->name, GF_LOG_INFO, "Using the first local "
+ "subvol %s", xl->name);
+ return;
+ }
+
+}
+
+static void
+nufa_to_dht (xlator_t *this)
+{
+ GF_ASSERT (this);
+ GF_ASSERT (this->fops);
+
+ this->fops->lookup = dht_lookup;
+ this->fops->create = dht_create;
+ this->fops->mknod = dht_mknod;
+}
+
+int
+nufa_find_local_subvol (xlator_t *this,
+ void (*fn) (xlator_t *each, void* data), void *data)
+{
+ int ret = -1;
+ dht_conf_t *conf = this->private;
+ xlator_list_t *trav = NULL;
+ xlator_t *parent = NULL;
+ xlator_t *candidate = NULL;
+
+ xlator_foreach_depth_first (this, fn, data);
+ if (!conf->private) {
+ gf_log (this->name, GF_LOG_ERROR, "Couldn't find a local "
+ "brick");
+ return -1;
+ }
+
+ candidate = conf->private;
+ trav = candidate->parents;
+ while (trav) {
+
+ parent = trav->xlator;
+ if (strcmp (parent->type, "cluster/nufa") == 0) {
+ gf_log (this->name, GF_LOG_INFO, "Found local subvol, "
+ "%s", candidate->name);
+ ret = 0;
+ conf->private = candidate;
+ break;
+ }
+
+ candidate = parent;
+ trav = parent->parents;
+ }
+
+ return ret;
+}
+
int
nufa_init (xlator_t *this)
{
- dht_conf_t *conf = NULL;
- xlator_list_t *trav = NULL;
data_t *data = NULL;
char *local_volname = NULL;
int ret = -1;
char my_hostname[256];
- xlator_t *local_subvol = NULL;
- char *brick_host = NULL;
- xlator_t *kid = NULL;
+ gf_boolean_t addr_match = _gf_false;
+ nufa_args_t args = {0, };
ret = dht_init(this);
if (ret) {
return ret;
}
- conf = this->private;
- local_volname = "localhost";
- ret = gethostname (my_hostname, 256);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "could not find hostname (%s)",
- strerror (errno));
- }
+ if ((data = dict_get (this->options, "local-volume-name"))) {
+ local_volname = data->data;
- if (ret == 0)
- local_volname = my_hostname;
+ } else {
+ addr_match = _gf_true;
+ local_volname = "localhost";
+ ret = gethostname (my_hostname, 256);
+ if (ret == 0)
+ local_volname = my_hostname;
- data = dict_get (this->options, "local-volume-name");
- if (data) {
- local_volname = data->data;
- }
+ else
+ gf_log (this->name, GF_LOG_WARNING,
+ "could not find hostname (%s)",
+ strerror (errno));
- for (trav = this->children; trav; trav = trav->next) {
- if (strcmp (trav->xlator->name, local_volname) == 0)
- break;
- if (local_subvol) {
- continue;
- }
- kid = trav->xlator;
- for (;;) {
- if (dict_get_str(trav->xlator->options,"remote-host",
- &brick_host) == 0) {
- /* Found it. */
- break;
- }
- if (!kid->children) {
- /* Nowhere further to look. */
- gf_log (this->name, GF_LOG_ERROR,
- "could not get remote-host");
- goto err;
- }
- if (kid->children->next) {
- /* Multiple choices, can't/shouldn't decide. */
- gf_log (this->name, GF_LOG_ERROR,
- "NUFA found fan-out (type %s) volume",
- kid->type);
- goto err;
- }
- /* One-to-one xlators are OK, try the next one. */
- kid = kid->children->xlator;
- }
- if (same_first_part(my_hostname,'.',brick_host,'.')) {
- local_subvol = trav->xlator;
- }
}
- if (trav) {
- gf_log (this->name, GF_LOG_INFO,
- "Using specified subvol %s", local_volname);
- conf->private = trav->xlator;
- }
- else if (local_subvol) {
+ args.this = this;
+ args.volname = local_volname;
+ args.addr_match = addr_match;
+ ret = nufa_find_local_subvol (this, nufa_find_local_brick, &args);
+ if (ret) {
gf_log (this->name, GF_LOG_INFO,
- "Using first local subvol %s", local_subvol->name);
- conf->private = local_subvol;
+ "Unable to find local subvolume, switching "
+ "to dht mode");
+ nufa_to_dht (this);
}
- else {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not find specified or local subvol");
- goto err;
-
- }
-
return 0;
-
-err:
- dht_fini(this);
- return -1;
}
diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c
index 861012247..d3ea90ba8 100644
--- a/xlators/cluster/dht/src/switch.c
+++ b/xlators/cluster/dht/src/switch.c
@@ -437,7 +437,8 @@ switch_create (call_frame_t *frame, xlator_t *this,
avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol);
if (dht_is_subvol_filled (this, avail_subvol)) {
avail_subvol =
- dht_free_disk_available_subvol (this, avail_subvol);
+ dht_free_disk_available_subvol (this, avail_subvol,
+ local);
}
if (subvol != avail_subvol) {
@@ -536,7 +537,8 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol);
if (dht_is_subvol_filled (this, avail_subvol)) {
avail_subvol =
- dht_free_disk_available_subvol (this, avail_subvol);
+ dht_free_disk_available_subvol (this, avail_subvol,
+ local);
}
if (avail_subvol != subvol) {
diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c
index dadd3fec5..69b510e23 100644
--- a/xlators/cluster/stripe/src/stripe.c
+++ b/xlators/cluster/stripe/src/stripe.c
@@ -3774,6 +3774,524 @@ err:
int32_t
+stripe_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t callcnt = 0;
+ stripe_local_t *local = NULL;
+ stripe_local_t *mlocal = NULL;
+ call_frame_t *prev = NULL;
+ call_frame_t *mframe = NULL;
+
+ if (!this || !frame || !frame->local || !cookie) {
+ gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
+ goto out;
+ }
+
+ prev = cookie;
+ local = frame->local;
+ mframe = local->orig_frame;
+ mlocal = mframe->local;
+
+ LOCK(&frame->lock);
+ {
+ callcnt = ++mlocal->call_count;
+
+ if (op_ret == 0) {
+ mlocal->post_buf = *postbuf;
+ mlocal->pre_buf = *prebuf;
+
+ mlocal->prebuf_blocks += prebuf->ia_blocks;
+ mlocal->postbuf_blocks += postbuf->ia_blocks;
+
+ correct_file_size(prebuf, mlocal->fctx, prev);
+ correct_file_size(postbuf, mlocal->fctx, prev);
+
+ if (mlocal->prebuf_size < prebuf->ia_size)
+ mlocal->prebuf_size = prebuf->ia_size;
+ if (mlocal->postbuf_size < postbuf->ia_size)
+ mlocal->postbuf_size = postbuf->ia_size;
+ }
+
+ /* return the first failure */
+ if (mlocal->op_ret == 0) {
+ mlocal->op_ret = op_ret;
+ mlocal->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
+ mlocal->pre_buf.ia_size = mlocal->prebuf_size;
+ mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
+ mlocal->post_buf.ia_size = mlocal->postbuf_size;
+ mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
+
+ STRIPE_STACK_UNWIND (fallocate, mframe, mlocal->op_ret,
+ mlocal->op_errno, &mlocal->pre_buf,
+ &mlocal->post_buf, NULL);
+ }
+out:
+ STRIPE_STACK_DESTROY(frame);
+ return 0;
+}
+
+int32_t
+stripe_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ stripe_local_t *local = NULL;
+ stripe_fd_ctx_t *fctx = NULL;
+ int32_t op_errno = 1;
+ int32_t idx = 0;
+ int32_t offset_offset = 0;
+ int32_t remaining_size = 0;
+ off_t fill_size = 0;
+ uint64_t stripe_size = 0;
+ uint64_t tmp_fctx = 0;
+ off_t dest_offset = 0;
+ call_frame_t *fframe = NULL;
+ stripe_local_t *flocal = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO (fd->inode, err);
+
+ inode_ctx_get (fd->inode, this, &tmp_fctx);
+ if (!tmp_fctx) {
+ op_errno = EINVAL;
+ goto err;
+ }
+ fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+ stripe_size = fctx->stripe_size;
+
+ STRIPE_VALIDATE_FCTX (fctx, err);
+
+ remaining_size = len;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ frame->local = local;
+ local->stripe_size = stripe_size;
+ local->fctx = fctx;
+
+ if (!stripe_size) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Wrong stripe size for the file");
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ while (1) {
+ fframe = copy_frame(frame);
+ flocal = mem_get0(this->local_pool);
+ if (!flocal) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ flocal->orig_frame = frame;
+ fframe->local = flocal;
+
+ /* send fallocate request to the associated child node */
+ idx = (((offset + offset_offset) /
+ local->stripe_size) % fctx->stripe_count);
+
+ fill_size = (local->stripe_size -
+ ((offset + offset_offset) % local->stripe_size));
+ if (fill_size > remaining_size)
+ fill_size = remaining_size;
+
+ remaining_size -= fill_size;
+
+ local->wind_count++;
+ if (remaining_size == 0)
+ local->unwind = 1;
+
+ dest_offset = offset + offset_offset;
+ if (fctx->stripe_coalesce)
+ dest_offset = coalesced_offset(dest_offset,
+ local->stripe_size, fctx->stripe_count);
+
+ /*
+ * TODO: Create a separate handler for coalesce mode that sends a
+ * single fallocate per-child (since the ranges are linear).
+ */
+ STACK_WIND(fframe, stripe_fallocate_cbk, fctx->xl_array[idx],
+ fctx->xl_array[idx]->fops->fallocate, fd, mode,
+ dest_offset, fill_size, xdata);
+
+ offset_offset += fill_size;
+ if (remaining_size == 0)
+ break;
+ }
+
+ return 0;
+err:
+ if (fframe)
+ STRIPE_STACK_DESTROY(fframe);
+
+ STRIPE_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+
+int32_t
+stripe_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t callcnt = 0;
+ stripe_local_t *local = NULL;
+ stripe_local_t *mlocal = NULL;
+ call_frame_t *prev = NULL;
+ call_frame_t *mframe = NULL;
+
+ if (!this || !frame || !frame->local || !cookie) {
+ gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
+ goto out;
+ }
+
+ prev = cookie;
+ local = frame->local;
+ mframe = local->orig_frame;
+ mlocal = mframe->local;
+
+ LOCK(&frame->lock);
+ {
+ callcnt = ++mlocal->call_count;
+
+ if (op_ret == 0) {
+ mlocal->post_buf = *postbuf;
+ mlocal->pre_buf = *prebuf;
+
+ mlocal->prebuf_blocks += prebuf->ia_blocks;
+ mlocal->postbuf_blocks += postbuf->ia_blocks;
+
+ correct_file_size(prebuf, mlocal->fctx, prev);
+ correct_file_size(postbuf, mlocal->fctx, prev);
+
+ if (mlocal->prebuf_size < prebuf->ia_size)
+ mlocal->prebuf_size = prebuf->ia_size;
+ if (mlocal->postbuf_size < postbuf->ia_size)
+ mlocal->postbuf_size = postbuf->ia_size;
+ }
+
+ /* return the first failure */
+ if (mlocal->op_ret == 0) {
+ mlocal->op_ret = op_ret;
+ mlocal->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
+ mlocal->pre_buf.ia_size = mlocal->prebuf_size;
+ mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
+ mlocal->post_buf.ia_size = mlocal->postbuf_size;
+ mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
+
+ STRIPE_STACK_UNWIND (discard, mframe, mlocal->op_ret,
+ mlocal->op_errno, &mlocal->pre_buf,
+ &mlocal->post_buf, NULL);
+ }
+out:
+ STRIPE_STACK_DESTROY(frame);
+ return 0;
+}
+
+int32_t
+stripe_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ stripe_local_t *local = NULL;
+ stripe_fd_ctx_t *fctx = NULL;
+ int32_t op_errno = 1;
+ int32_t idx = 0;
+ int32_t offset_offset = 0;
+ int32_t remaining_size = 0;
+ off_t fill_size = 0;
+ uint64_t stripe_size = 0;
+ uint64_t tmp_fctx = 0;
+ off_t dest_offset = 0;
+ call_frame_t *fframe = NULL;
+ stripe_local_t *flocal = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO (fd->inode, err);
+
+ inode_ctx_get (fd->inode, this, &tmp_fctx);
+ if (!tmp_fctx) {
+ op_errno = EINVAL;
+ goto err;
+ }
+ fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+ stripe_size = fctx->stripe_size;
+
+ STRIPE_VALIDATE_FCTX (fctx, err);
+
+ remaining_size = len;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ frame->local = local;
+ local->stripe_size = stripe_size;
+ local->fctx = fctx;
+
+ if (!stripe_size) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Wrong stripe size for the file");
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ while (1) {
+ fframe = copy_frame(frame);
+ flocal = mem_get0(this->local_pool);
+ if (!flocal) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ flocal->orig_frame = frame;
+ fframe->local = flocal;
+
+ /* send discard request to the associated child node */
+ idx = (((offset + offset_offset) /
+ local->stripe_size) % fctx->stripe_count);
+
+ fill_size = (local->stripe_size -
+ ((offset + offset_offset) % local->stripe_size));
+ if (fill_size > remaining_size)
+ fill_size = remaining_size;
+
+ remaining_size -= fill_size;
+
+ local->wind_count++;
+ if (remaining_size == 0)
+ local->unwind = 1;
+
+ dest_offset = offset + offset_offset;
+ if (fctx->stripe_coalesce)
+ dest_offset = coalesced_offset(dest_offset,
+ local->stripe_size, fctx->stripe_count);
+
+ /*
+ * TODO: Create a separate handler for coalesce mode that sends a
+ * single discard per-child (since the ranges are linear).
+ */
+ STACK_WIND(fframe, stripe_discard_cbk, fctx->xl_array[idx],
+ fctx->xl_array[idx]->fops->discard, fd, dest_offset,
+ fill_size, xdata);
+
+ offset_offset += fill_size;
+ if (remaining_size == 0)
+ break;
+ }
+
+ return 0;
+err:
+ if (fframe)
+ STRIPE_STACK_DESTROY(fframe);
+
+ STRIPE_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int32_t
+stripe_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t callcnt = 0;
+ stripe_local_t *local = NULL;
+ stripe_local_t *mlocal = NULL;
+ call_frame_t *prev = NULL;
+ call_frame_t *mframe = NULL;
+
+ if (!this || !frame || !frame->local || !cookie) {
+ gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
+ goto out;
+ }
+
+ prev = cookie;
+ local = frame->local;
+ mframe = local->orig_frame;
+ mlocal = mframe->local;
+
+ LOCK(&frame->lock);
+ {
+ callcnt = ++mlocal->call_count;
+
+ if (op_ret == 0) {
+ mlocal->post_buf = *postbuf;
+ mlocal->pre_buf = *prebuf;
+
+ mlocal->prebuf_blocks += prebuf->ia_blocks;
+ mlocal->postbuf_blocks += postbuf->ia_blocks;
+
+ correct_file_size(prebuf, mlocal->fctx, prev);
+ correct_file_size(postbuf, mlocal->fctx, prev);
+
+ if (mlocal->prebuf_size < prebuf->ia_size)
+ mlocal->prebuf_size = prebuf->ia_size;
+ if (mlocal->postbuf_size < postbuf->ia_size)
+ mlocal->postbuf_size = postbuf->ia_size;
+ }
+
+ /* return the first failure */
+ if (mlocal->op_ret == 0) {
+ mlocal->op_ret = op_ret;
+ mlocal->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
+ mlocal->pre_buf.ia_size = mlocal->prebuf_size;
+ mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
+ mlocal->post_buf.ia_size = mlocal->postbuf_size;
+ mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
+
+ STRIPE_STACK_UNWIND (zerofill, mframe, mlocal->op_ret,
+ mlocal->op_errno, &mlocal->pre_buf,
+ &mlocal->post_buf, NULL);
+ }
+out:
+ STRIPE_STACK_DESTROY(frame);
+ return 0;
+}
+
+int32_t
+stripe_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ stripe_local_t *local = NULL;
+ stripe_fd_ctx_t *fctx = NULL;
+ int32_t op_errno = 1;
+ int32_t idx = 0;
+ int32_t offset_offset = 0;
+ int32_t remaining_size = 0;
+ off_t fill_size = 0;
+ uint64_t stripe_size = 0;
+ uint64_t tmp_fctx = 0;
+ off_t dest_offset = 0;
+ call_frame_t *fframe = NULL;
+ stripe_local_t *flocal = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO (fd->inode, err);
+
+ inode_ctx_get (fd->inode, this, &tmp_fctx);
+ if (!tmp_fctx) {
+ op_errno = EINVAL;
+ goto err;
+ }
+ fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+ stripe_size = fctx->stripe_size;
+
+ STRIPE_VALIDATE_FCTX (fctx, err);
+
+ remaining_size = len;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+ stripe_size = fctx->stripe_size;
+
+ STRIPE_VALIDATE_FCTX (fctx, err);
+
+ remaining_size = len;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+ stripe_size = fctx->stripe_size;
+
+ STRIPE_VALIDATE_FCTX (fctx, err);
+
+ remaining_size = len;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ frame->local = local;
+ local->stripe_size = stripe_size;
+ local->fctx = fctx;
+
+ if (!stripe_size) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Wrong stripe size for the file");
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ while (1) {
+ fframe = copy_frame(frame);
+ flocal = mem_get0(this->local_pool);
+ if (!flocal) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ flocal->orig_frame = frame;
+ fframe->local = flocal;
+
+ idx = (((offset + offset_offset) /
+ local->stripe_size) % fctx->stripe_count);
+
+ fill_size = (local->stripe_size -
+ ((offset + offset_offset) % local->stripe_size));
+ if (fill_size > remaining_size)
+ fill_size = remaining_size;
+
+ remaining_size -= fill_size;
+
+ local->wind_count++;
+ if (remaining_size == 0)
+ local->unwind = 1;
+
+ dest_offset = offset + offset_offset;
+ if (fctx->stripe_coalesce)
+ dest_offset = coalesced_offset(dest_offset,
+ local->stripe_size,
+ fctx->stripe_count);
+
+ STACK_WIND(fframe, stripe_zerofill_cbk, fctx->xl_array[idx],
+ fctx->xl_array[idx]->fops->zerofill, fd,
+ dest_offset, fill_size, xdata);
+ offset_offset += fill_size;
+ if (remaining_size == 0)
+ break;
+ }
+
+ return 0;
+err:
+ if (fframe)
+ STRIPE_STACK_DESTROY(fframe);
+
+ STRIPE_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int32_t
stripe_release (xlator_t *this, fd_t *fd)
{
return 0;
@@ -3947,6 +4465,37 @@ stripe_setxattr_cbk (call_frame_t *frame, void *cookie,
return 0;
}
+#ifdef HAVE_BD_XLATOR
+int
+stripe_is_bd (dict_t *this, char *key, data_t *value, void *data)
+{
+ gf_boolean_t *is_bd = data;
+
+ if (data == NULL)
+ return 0;
+
+ if (XATTR_IS_BD (key))
+ *is_bd = _gf_true;
+
+ return 0;
+}
+
+inline gf_boolean_t
+stripe_setxattr_is_bd (dict_t *dict)
+{
+ gf_boolean_t is_bd = _gf_false;
+
+ if (dict == NULL)
+ goto out;
+
+ dict_foreach (dict, stripe_is_bd, &is_bd);
+out:
+ return is_bd;
+}
+#else
+#define stripe_setxattr_is_bd(dict) _gf_false
+#endif
+
int
stripe_setxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *dict, int flags, dict_t *xdata)
@@ -3956,6 +4505,7 @@ stripe_setxattr (call_frame_t *frame, xlator_t *this,
stripe_private_t *priv = NULL;
stripe_local_t *local = NULL;
int i = 0;
+ gf_boolean_t is_bd = _gf_false;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
@@ -3978,11 +4528,15 @@ stripe_setxattr (call_frame_t *frame, xlator_t *this,
local->wind_count = priv->child_count;
local->op_ret = local->op_errno = 0;
+ is_bd = stripe_setxattr_is_bd (dict);
+
/**
* Set xattrs for directories on all subvolumes. Additionally
- * this power is only given to a special client.
+ * this power is only given to a special client. Bd xlator
+ * also needs xattrs for regular files (ie LVs)
*/
- if ((frame->root->pid == GF_CLIENT_PID_GSYNCD) && IA_ISDIR (loc->inode->ia_type)) {
+ if (((frame->root->pid == GF_CLIENT_PID_GSYNCD) &&
+ IA_ISDIR (loc->inode->ia_type)) || is_bd) {
for (i = 0; i < priv->child_count; i++, trav = trav->next) {
STACK_WIND (frame, stripe_setxattr_cbk,
trav->xlator, trav->xlator->fops->setxattr,
@@ -4013,21 +4567,21 @@ stripe_fsetxattr_cbk (call_frame_t *frame, void *cookie,
int
-stripe_is_lockinfo (dict_t *this,
- char *key,
- data_t *value,
- void *data)
+stripe_is_special_key (dict_t *this,
+ char *key,
+ data_t *value,
+ void *data)
{
- gf_boolean_t *is_lockinfo = NULL;
+ gf_boolean_t *is_special = NULL;
if (data == NULL) {
goto out;
}
- is_lockinfo = data;
+ is_special = data;
- if (XATTR_IS_LOCKINFO (key))
- *is_lockinfo = _gf_true;
+ if (XATTR_IS_LOCKINFO (key) || XATTR_IS_BD (key))
+ *is_special = _gf_true;
out:
return 0;
@@ -4104,7 +4658,7 @@ stripe_fsetxattr_is_special (dict_t *dict)
goto out;
}
- dict_foreach (dict, stripe_is_lockinfo, &is_spl);
+ dict_foreach (dict, stripe_is_special_key, &is_spl);
out:
return is_spl;
@@ -4866,10 +5420,10 @@ stripe_vgetxattr_cbk (call_frame_t *frame, void *cookie,
xattr->pos = cky;
xattr->xattr_value = gf_memdup (xattr_val,
- xattr->xattr_value,
xattr->xattr_len);
- local->xattr_total_len += xattr->xattr_len + 1;
+ if (xattr->xattr_value != NULL)
+ local->xattr_total_len += xattr->xattr_len + 1;
}
}
out:
@@ -4969,7 +5523,8 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
if (cluster_getmarkerattr (frame, this, loc, name,
local, stripe_getxattr_unwind,
sub_volumes, priv->child_count,
- MARKER_UUID_TYPE, priv->vol_uuid)) {
+ MARKER_UUID_TYPE, marker_uuid_default_gauge,
+ priv->vol_uuid)) {
op_errno = EINVAL;
goto err;
}
@@ -5048,6 +5603,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
sub_volumes,
local->marker.call_count,
MARKER_XTIME_TYPE,
+ marker_xtime_default_gauge,
priv->vol_uuid)) {
op_errno = EINVAL;
goto err;
@@ -5221,6 +5777,9 @@ struct xlator_fops fops = {
.removexattr = stripe_removexattr,
.fremovexattr = stripe_fremovexattr,
.readdirp = stripe_readdirp,
+ .fallocate = stripe_fallocate,
+ .discard = stripe_discard,
+ .zerofill = stripe_zerofill,
};
struct xlator_cbks cbks = {
@@ -5246,10 +5805,10 @@ struct volume_options options[] = {
},
{ .key = {"coalesce"},
.type = GF_OPTION_TYPE_BOOL,
- .default_value = "false",
- .description = "Enable coalesce mode to flatten striped files as "
- "stored on the server (i.e., eliminate holes caused "
- "by the traditional format)."
+ .default_value = "true",
+ .description = "Enable/Disable coalesce mode to flatten striped "
+ "files as stored on the server (i.e., eliminate holes "
+ "caused by the traditional format)."
},
{ .key = {NULL} },
};