summaryrefslogtreecommitdiffstats
path: root/xlators/cluster
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster')
-rw-r--r--xlators/cluster/afr/src/Makefile.am4
-rw-r--r--xlators/cluster/afr/src/afr-common.c667
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.c227
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.c220
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.c103
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c1462
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.h11
-rw-r--r--xlators/cluster/afr/src/afr-lk-common.c1009
-rw-r--r--xlators/cluster/afr/src/afr-mem-types.h5
-rw-r--r--xlators/cluster/afr/src/afr-open.c196
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-algorithm.c62
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c647
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.h32
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c463
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c82
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c289
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.c682
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.h13
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c746
-rw-r--r--xlators/cluster/afr/src/afr-transaction.h11
-rw-r--r--xlators/cluster/afr/src/afr.c36
-rw-r--r--xlators/cluster/afr/src/afr.h269
-rw-r--r--xlators/cluster/afr/src/pump.c12
-rw-r--r--xlators/cluster/dht/src/Makefile.am8
-rw-r--r--xlators/cluster/dht/src/dht-common.c728
-rw-r--r--xlators/cluster/dht/src/dht-common.h82
-rw-r--r--xlators/cluster/dht/src/dht-diskusage.c179
-rw-r--r--xlators/cluster/dht/src/dht-hashfn.c76
-rw-r--r--xlators/cluster/dht/src/dht-helper.c326
-rw-r--r--xlators/cluster/dht/src/dht-inode-read.c75
-rw-r--r--xlators/cluster/dht/src/dht-inode-write.c445
-rw-r--r--xlators/cluster/dht/src/dht-layout.c106
-rw-r--r--xlators/cluster/dht/src/dht-linkfile.c144
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c274
-rw-r--r--xlators/cluster/dht/src/dht-rename.c112
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c163
-rw-r--r--xlators/cluster/dht/src/dht-shared.c758
-rw-r--r--xlators/cluster/dht/src/dht.c559
-rw-r--r--xlators/cluster/dht/src/nufa.c375
-rw-r--r--xlators/cluster/dht/src/switch.c214
-rw-r--r--xlators/cluster/ha/src/Makefile.am2
-rw-r--r--xlators/cluster/map/src/Makefile.am2
-rw-r--r--xlators/cluster/stripe/src/Makefile.am2
-rw-r--r--xlators/cluster/stripe/src/stripe-mem-types.h1
-rw-r--r--xlators/cluster/stripe/src/stripe.c662
-rw-r--r--xlators/cluster/stripe/src/stripe.h2
46 files changed, 8838 insertions, 3705 deletions
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am
index ef6bf9786..35d18a6c0 100644
--- a/xlators/cluster/afr/src/Makefile.am
+++ b/xlators/cluster/afr/src/Makefile.am
@@ -7,11 +7,11 @@ afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \
afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c \
$(top_builddir)/xlators/lib/src/libxlator.c
-afr_la_LDFLAGS = -module -avoidversion
+afr_la_LDFLAGS = -module -avoid-version
afr_la_SOURCES = $(afr_common_source) afr.c
afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-pump_la_LDFLAGS = -module -avoidversion
+pump_la_LDFLAGS = -module -avoid-version
pump_la_SOURCES = $(afr_common_source) pump.c
pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index c38e52725..af01f2ef2 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -49,10 +49,9 @@
#include "afr-self-heald.h"
#include "pump.h"
-#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL
-#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL
+#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL
#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL
-
+#define AFR_STATISTICS_HISTORY_SIZE 50
int
afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this,
gf_boolean_t fail_conflict);
@@ -203,59 +202,86 @@ out:
return ret;
}
-afr_inode_ctx_t*
-afr_inode_ctx_get_from_addr (uint64_t addr, int32_t child_count)
+void
+afr_inode_ctx_destroy (afr_inode_ctx_t *ctx)
{
- int ret = -1;
- afr_inode_ctx_t *ctx = NULL;
- size_t size = 0;
+ if (!ctx)
+ return;
+ GF_FREE (ctx->fresh_children);
+ GF_FREE (ctx);
+}
- GF_ASSERT (child_count > 0);
+afr_inode_ctx_t*
+__afr_inode_ctx_get (inode_t *inode, xlator_t *this)
+{
+ int ret = 0;
+ uint64_t ctx_addr = 0;
+ afr_inode_ctx_t *ctx = NULL;
+ afr_private_t *priv = NULL;
- if (!addr) {
- ctx = GF_CALLOC (1, sizeof (*ctx),
- gf_afr_mt_inode_ctx_t);
- if (!ctx)
- goto out;
- size = sizeof (*ctx->fresh_children);
- ctx->fresh_children = GF_CALLOC (child_count, size,
- gf_afr_mt_int32_t);
- if (!ctx->fresh_children)
- goto out;
- } else {
- ctx = (afr_inode_ctx_t*) (long) addr;
+ priv = this->private;
+ ret = __inode_ctx_get (inode, this, &ctx_addr);
+ if (ret < 0)
+ ctx_addr = 0;
+ if (ctx_addr != 0) {
+ ctx = (afr_inode_ctx_t*) (long) ctx_addr;
+ goto out;
}
- ret = 0;
+ ctx = GF_CALLOC (1, sizeof (*ctx),
+ gf_afr_mt_inode_ctx_t);
+ if (!ctx)
+ goto fail;
+ ctx->fresh_children = GF_CALLOC (priv->child_count,
+ sizeof (*ctx->fresh_children),
+ gf_afr_mt_int32_t);
+ if (!ctx->fresh_children)
+ goto fail;
+ ret = __inode_ctx_put (inode, this, (uint64_t)ctx);
+ if (ret) {
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to "
+ "set the inode ctx (%s)",
+ uuid_utoa (inode->gfid));
+ goto fail;
+ }
+
out:
- if (ret && ctx) {
- GF_FREE (ctx->fresh_children);
- GF_FREE (ctx);
- ctx = NULL;
+ return ctx;
+
+fail:
+ afr_inode_ctx_destroy (ctx);
+ return NULL;
+}
+
+afr_inode_ctx_t*
+afr_inode_ctx_get (inode_t *inode, xlator_t *this)
+{
+ afr_inode_ctx_t *ctx = NULL;
+
+ LOCK (&inode->lock);
+ {
+ ctx = __afr_inode_ctx_get (inode, this);
}
+ UNLOCK (&inode->lock);
return ctx;
}
void
-afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params)
+afr_inode_get_ctx_params (xlator_t *this, inode_t *inode,
+ afr_inode_params_t *params)
{
GF_ASSERT (inode);
GF_ASSERT (params);
- int ret = 0;
afr_inode_ctx_t *ctx = NULL;
afr_private_t *priv = NULL;
int i = 0;
- uint64_t ctx_addr = 0;
int32_t read_child = -1;
int32_t *fresh_children = NULL;
priv = this->private;
LOCK (&inode->lock);
{
- ret = __inode_ctx_get (inode, this, &ctx_addr);
- if (ret < 0)
- goto unlock;
- ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count);
+ ctx = __afr_inode_ctx_get (inode, this);
if (!ctx)
goto unlock;
switch (params->op) {
@@ -274,12 +300,6 @@ afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params)
if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK)
params->u.value = _gf_true;
break;
- case AFR_INODE_GET_SPLIT_BRAIN:
- params->u.value = _gf_false;
- if (ctx->masks & AFR_ICTX_SPLIT_BRAIN_MASK)
- params->u.value = _gf_true;
- ;
- break;
default:
GF_ASSERT (0);
break;
@@ -292,11 +312,16 @@ unlock:
gf_boolean_t
afr_is_split_brain (xlator_t *this, inode_t *inode)
{
- afr_inode_params_t params = {0};
+ afr_inode_ctx_t *ctx = NULL;
+ gf_boolean_t spb = _gf_false;
- params.op = AFR_INODE_GET_SPLIT_BRAIN;
- afr_inode_get_ctx (this, inode, &params);
- return params.u.value;
+ ctx = afr_inode_ctx_get (inode, this);
+ if (!ctx)
+ goto out;
+ if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB))
+ spb = _gf_true;
+out:
+ return spb;
}
gf_boolean_t
@@ -305,11 +330,10 @@ afr_is_opendir_done (xlator_t *this, inode_t *inode)
afr_inode_params_t params = {0};
params.op = AFR_INODE_GET_OPENDIR_DONE;
- afr_inode_get_ctx (this, inode, &params);
+ afr_inode_get_ctx_params (this, inode, &params);
return params.u.value;
}
-
int32_t
afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children)
{
@@ -317,7 +341,7 @@ afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children)
params.op = AFR_INODE_GET_READ_CTX;
params.u.read_ctx.children = fresh_children;
- afr_inode_get_ctx (this, inode, &params);
+ afr_inode_get_ctx_params (this, inode, &params);
return params.u.read_ctx.read_child;
}
@@ -379,31 +403,14 @@ afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx)
}
void
-afr_inode_ctx_set_splitbrain (afr_inode_ctx_t *ctx, gf_boolean_t set)
-{
- uint64_t remaining_mask = 0;
- uint64_t mask = 0;
-
- if (set) {
- remaining_mask = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks);
- mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK);
- ctx->masks = remaining_mask | mask;
- } else {
- ctx->masks = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks);
- }
-}
-
-void
-afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params)
+afr_inode_set_ctx_params (xlator_t *this, inode_t *inode,
+ afr_inode_params_t *params)
{
GF_ASSERT (inode);
GF_ASSERT (params);
- int ret = 0;
afr_inode_ctx_t *ctx = NULL;
afr_private_t *priv = NULL;
- uint64_t ctx_addr = 0;
- gf_boolean_t set = _gf_false;
int32_t read_child = -1;
int32_t *fresh_children = NULL;
int32_t *stale_children = NULL;
@@ -411,10 +418,7 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params)
priv = this->private;
LOCK (&inode->lock);
{
- ret = __inode_ctx_get (inode, this, &ctx_addr);
- if (ret < 0)
- ctx_addr = 0;
- ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count);
+ ctx = __afr_inode_ctx_get (inode, this);
if (!ctx)
goto unlock;
switch (params->op) {
@@ -434,33 +438,26 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params)
case AFR_INODE_SET_OPENDIR_DONE:
afr_inode_ctx_set_opendir_done (ctx);
break;
- case AFR_INODE_SET_SPLIT_BRAIN:
- set = params->u.value;
- afr_inode_ctx_set_splitbrain (ctx, set);
- break;
default:
GF_ASSERT (0);
break;
}
- ret = __inode_ctx_put (inode, this, (uint64_t)ctx);
- if (ret) {
- gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to "
- "set the inode ctx (%s)",
- uuid_utoa (inode->gfid));
- }
}
unlock:
UNLOCK (&inode->lock);
}
void
-afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set)
+afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb,
+ afr_spb_state_t data_spb)
{
- afr_inode_params_t params = {0};
+ afr_inode_ctx_t *ctx = NULL;
- params.op = AFR_INODE_SET_SPLIT_BRAIN;
- params.u.value = set;
- afr_inode_set_ctx (this, inode, &params);
+ ctx = afr_inode_ctx_get (inode, this);
+ if (mdata_spb != DONT_KNOW)
+ ctx->mdata_spb = mdata_spb;
+ if (data_spb != DONT_KNOW)
+ ctx->data_spb = data_spb;
}
void
@@ -469,7 +466,7 @@ afr_set_opendir_done (xlator_t *this, inode_t *inode)
afr_inode_params_t params = {0};
params.op = AFR_INODE_SET_OPENDIR_DONE;
- afr_inode_set_ctx (this, inode, &params);
+ afr_inode_set_ctx_params (this, inode, &params);
}
void
@@ -488,7 +485,7 @@ afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child,
params.op = AFR_INODE_SET_READ_CTX;
params.u.read_ctx.read_child = read_child;
params.u.read_ctx.children = fresh_children;
- afr_inode_set_ctx (this, inode, &params);
+ afr_inode_set_ctx_params (this, inode, &params);
}
void
@@ -501,7 +498,7 @@ afr_inode_rm_stale_children (xlator_t *this, inode_t *inode,
params.op = AFR_INODE_RM_STALE_CHILDREN;
params.u.read_ctx.children = stale_children;
- afr_inode_set_ctx (this, inode, &params);
+ afr_inode_set_ctx_params (this, inode, &params);
}
gf_boolean_t
@@ -782,6 +779,12 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
sh = &local->self_heal;
priv = this->private;
+ if (sh->data_sh_info && strcmp (sh->data_sh_info, ""))
+ GF_FREE (sh->data_sh_info);
+
+ if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, ""))
+ GF_FREE (sh->metadata_sh_info);
+
GF_FREE (sh->buf);
GF_FREE (sh->parentbufs);
@@ -829,7 +832,8 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
void
afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
{
- afr_private_t * priv = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
priv = this->private;
@@ -839,12 +843,13 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
GF_FREE (local->internal_lock.locked_nodes);
- GF_FREE (local->internal_lock.inode_locked_nodes);
-
- GF_FREE (local->internal_lock.entry_locked_nodes);
+ for (i = 0; local->internal_lock.inodelk[i].domain; i++) {
+ GF_FREE (local->internal_lock.inodelk[i].locked_nodes);
+ }
GF_FREE (local->internal_lock.lower_locked_nodes);
+ afr_entry_lockee_cleanup (&local->internal_lock);
GF_FREE (local->transaction.pre_op);
GF_FREE (local->transaction.eager_lock);
@@ -854,6 +859,8 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
loc_wipe (&local->transaction.parent_loc);
loc_wipe (&local->transaction.new_parent_loc);
+
+ GF_FREE (local->transaction.postop_piggybacked);
}
@@ -891,8 +898,6 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
GF_FREE (local->fresh_children);
- GF_FREE (local->fd_open_on);
-
{ /* lookup */
if (local->cont.lookup.xattrs) {
afr_reset_xattr (local->cont.lookup.xattrs,
@@ -1075,6 +1080,88 @@ afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent)
uuid_copy (loc->pargfid, postparent->ia_gfid);
}
+/*
+ * Quota size xattrs are not maintained by afr. There is a
+ * possibility that they differ even when both the directory changelog xattrs
+ * suggest everything is fine. So if there is at least one 'source' check among
+ * the sources which has the maximum quota size. Otherwise check among all the
+ * available ones for maximum quota size. This way if there is a source and
+ * stale copies it always votes for the 'source'.
+ * */
+
+static void
+afr_handle_quota_size (afr_local_t *local, xlator_t *this,
+ dict_t *rsp_dict)
+{
+ int32_t *sources = NULL;
+ dict_t *xattr = NULL;
+ data_t *max_data = NULL;
+ int64_t max_quota_size = -1;
+ data_t *data = NULL;
+ int64_t *size = NULL;
+ int64_t quota_size = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int ret = -1;
+ gf_boolean_t source_present = _gf_false;
+
+ priv = this->private;
+ sources = local->cont.lookup.sources;
+
+ if (rsp_dict == NULL) {
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid "
+ "response dictionary", local->loc.path);
+ return;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source_present = _gf_true;
+ break;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ /*
+ * If there is at least one source lets check
+ * for maximum quota sizes among sources, otherwise take the
+ * maximum of the ones present to be on the safer side.
+ */
+ if (source_present && !sources[i])
+ continue;
+
+ xattr = local->cont.lookup.xattrs[i];
+ if (!xattr)
+ continue;
+
+ data = dict_get (xattr, QUOTA_SIZE_KEY);
+ if (!data)
+ continue;
+
+ size = (int64_t*)data->data;
+ quota_size = ntoh64(*size);
+ gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64,
+ local->loc.path, i, quota_size);
+ if (quota_size > max_quota_size) {
+ if (max_data)
+ data_unref (max_data);
+
+ max_quota_size = quota_size;
+ max_data = data_ref (data);
+ }
+ }
+
+ if (max_data) {
+ ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set "
+ "quota size", local->loc.path);
+ }
+
+ data_unref (max_data);
+ }
+}
+
int
afr_lookup_build_response_params (afr_local_t *local, xlator_t *this)
{
@@ -1119,13 +1206,18 @@ afr_lookup_build_response_params (afr_local_t *local, xlator_t *this)
ret = -1;
goto out;
}
+
gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d",
read_child);
if (!*xattr)
*xattr = dict_ref (local->cont.lookup.xattrs[read_child]);
+
*buf = local->cont.lookup.bufs[read_child];
*postparent = local->cont.lookup.postparents[read_child];
+ if (dict_get (local->xattr_req, QUOTA_SIZE_KEY))
+ afr_handle_quota_size (local, this, *xattr);
+
if (IA_INVAL == local->cont.lookup.inode->ia_type) {
/* fix for RT #602 */
local->cont.lookup.inode->ia_type = buf->ia_type;
@@ -1274,7 +1366,7 @@ afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this,
if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) {
/* mismatching gfid */
- gf_log (this->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_DEBUG,
"%s: gfid different on subvolume", local->loc.path);
}
}
@@ -1498,7 +1590,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children,
child2 = &bufs[success_children[i-1]];
if (FILETYPE_DIFFERS (child1, child2)) {
- gf_log (xlator_name, GF_LOG_WARNING, "%s: filetype "
+ gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype "
"differs on subvolumes (%d, %d)", path,
success_children[i-1], success_children[i]);
conflicting = _gf_true;
@@ -1507,7 +1599,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children,
if (!gfid || uuid_is_null (child1->ia_gfid))
continue;
if (uuid_compare (*gfid, child1->ia_gfid)) {
- gf_log (xlator_name, GF_LOG_WARNING, "%s: gfid differs"
+ gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs"
" on subvolume %d", path, success_children[i]);
conflicting = _gf_true;
goto out;
@@ -1641,8 +1733,8 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this,
if (op_ret == -1) {
local->op_ret = -1;
- if (afr_error_more_important (local->op_errno, op_errno))
- local->op_errno = op_errno;
+ local->op_errno = afr_most_important_error(local->op_errno,
+ op_errno, _gf_true);
goto out;
} else {
@@ -1666,7 +1758,6 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this,
local->loc.path,
local->self_heal.actual_sh_started);
}
-
}
out:
AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
@@ -1741,7 +1832,8 @@ afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this,
afr_lookup_set_self_heal_params (local, this);
if (afr_can_self_heal_proceed (&local->self_heal, priv)) {
- if (afr_is_transaction_running (local))
+ if (afr_is_transaction_running (local) &&
+ (!local->allow_sh_for_running_transaction))
goto out;
reason = "lookup detected pending operations";
@@ -1920,6 +2012,57 @@ out:
return;
}
+gf_boolean_t
+afr_is_entry_possibly_under_creation (afr_local_t *local, xlator_t *this)
+{
+ /*
+ * We need to perform this test in lookup done and treat on going
+ * create/DELETE as ENOENT.
+ * Reason:
+ Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c'
+
+ 1 Client A is in the middle of mkdir(/a). It has acquired lock.
+ It has performed mkdir(/a) on one subvol, and second one is still
+ in progress
+ 2 Client B performs a lookup, sees directory /a on one,
+ ENOENT on the other, succeeds lookup.
+ 3 Client B performs lookup on /a/b on both subvols, both return ENOENT
+ (one subvol because /a/b does not exist, another because /a
+ itself does not exist)
+ 4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with
+ basename=b on one subvol, but fails on other subvol as /a is yet to
+ be created by Client A.
+ 5 Client A finishes mkdir of /a on other subvol
+ 6 Client C also attempts to create /a/b, lookup returns ENOENT on
+ both subvols.
+ 7 Client C tries to obtain entrylk on on inode=/a with basename=b,
+ obtains on one subvol (where B had failed), and waits for B to unlock
+ on other subvol.
+ 8 Client B finishes mkdir() on one subvol with GFID-1 and completes
+ transaction and unlocks
+ 9 Client C gets the lock on the second subvol, At this stage second
+ subvol already has /a/b created from Client B, but Client C does not
+ check that in the middle of mkdir transaction
+ 10 Client C attempts mkdir /a/b on both subvols. It succeeds on
+ ONLY ONE (where Client B could not get lock because of
+ missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol.
+ This way we have /a/b in GFID mismatch. One subvol got GFID-1 because
+ Client B performed transaction on only one subvol (because entrylk()
+ could not be obtained on second subvol because of missing parent dir --
+ caused by premature/speculative succeeding of lookup() on /a when locks
+ are detected). Other subvol gets GFID-2 from Client C because while
+ it was waiting for entrylk() on both subvols, Client B was in the
+ middle of creating mkdir() on only one subvol, and Client C does not
+ "expect" this when it is between lock() and pre-op()/op() phase of the
+ transaction.
+ */
+ if (local->cont.lookup.parent_entrylk && local->enoent_count)
+ return _gf_true;
+
+ return _gf_false;
+}
+
+
static void
afr_lookup_done (call_frame_t *frame, xlator_t *this)
{
@@ -1936,6 +2079,12 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
priv = this->private;
local = frame->local;
+ if (afr_is_entry_possibly_under_creation (local, this)) {
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
+ goto unwind;
+ }
+
if (local->op_ret < 0)
goto unwind;
@@ -1993,25 +2142,20 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
* others in that they must be given higher priority while
* returning to the user.
*
- * The hierarchy is ESTALE > ENOENT > others
- *
+ * The hierarchy is ESTALE > EIO > ENOENT > others
*/
-
-gf_boolean_t
-afr_error_more_important (int32_t old_errno, int32_t new_errno)
+int32_t
+afr_most_important_error(int32_t old_errno, int32_t new_errno,
+ gf_boolean_t eio)
{
- gf_boolean_t ret = _gf_true;
-
- /* Nothing should ever overwrite ESTALE */
- if (old_errno == ESTALE)
- ret = _gf_false;
-
- /* Nothing should overwrite ENOENT, except ESTALE/EIO*/
- else if ((old_errno == ENOENT) && (new_errno != ESTALE)
- && (new_errno != EIO))
- ret = _gf_false;
+ if (old_errno == ESTALE || new_errno == ESTALE)
+ return ESTALE;
+ if (eio && (old_errno == EIO || new_errno == EIO))
+ return EIO;
+ if (old_errno == ENOENT || new_errno == ENOENT)
+ return ENOENT;
- return ret;
+ return new_errno;
}
int32_t
@@ -2030,8 +2174,9 @@ afr_resultant_errno_get (int32_t *children,
} else {
child = i;
}
- if (afr_error_more_important (op_errno, child_errno[child]))
- op_errno = child_errno[child];
+ op_errno = afr_most_important_error(op_errno,
+ child_errno[child],
+ _gf_false);
}
return op_errno;
}
@@ -2043,8 +2188,8 @@ afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno)
if (op_errno == ENOENT)
local->enoent_count++;
- if (afr_error_more_important (local->op_errno, op_errno))
- local->op_errno = op_errno;
+ local->op_errno = afr_most_important_error(local->op_errno, op_errno,
+ _gf_false);
if (local->op_errno == ESTALE) {
local->op_ret = -1;
@@ -2283,7 +2428,7 @@ afr_lookup (call_frame_t *frame, xlator_t *this,
int call_count = 0;
uint64_t ctx = 0;
int32_t op_errno = 0;
-
+ int allow_sh = 0;
priv = this->private;
AFR_LOCAL_ALLOC_OR_GOTO (local, out);
@@ -2300,6 +2445,13 @@ afr_lookup (call_frame_t *frame, xlator_t *this,
goto out;
}
+ if (local->loc.path &&
+ (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) {
+ op_errno = EPERM;
+ ret = -1;
+ goto out;
+ }
+
ret = inode_ctx_get (local->loc.inode, this, &ctx);
if (ret == 0) {
/* lookup is a revalidate */
@@ -2348,6 +2500,11 @@ afr_lookup (call_frame_t *frame, xlator_t *this,
/* By default assume ENOTCONN. On success it will be set to 0. */
local->op_errno = ENOTCONN;
+ ret = dict_get_int32 (xattr_req, "allow-sh-for-running-transaction",
+ &allow_sh);
+ dict_del (xattr_req, "allow-sh-for-running-transaction");
+ local->allow_sh_for_running_transaction = allow_sh;
+
ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc,
&gfid_req);
if (ret) {
@@ -2464,10 +2621,11 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
}
pthread_mutex_init (&fd_ctx->delay_lock, NULL);
- INIT_LIST_HEAD (&fd_ctx->paused_calls);
INIT_LIST_HEAD (&fd_ctx->entries);
fd_ctx->call_child = -1;
+ INIT_LIST_HEAD (&fd_ctx->eager_locked);
+
ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);
if (ret)
gf_log (this->name, GF_LOG_DEBUG,
@@ -2524,15 +2682,42 @@ afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
+static int
+afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+
+ priv = this->private;
+ local = frame->local;
+ call_count = local->call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_flush_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->flush,
+ local->fd, NULL);
+ if (!--call_count)
+ break;
+
+ }
+ }
+
+ return 0;
+}
+
int
afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
+ call_stub_t *stub = NULL;
int ret = -1;
int op_errno = 0;
- int call_count = -1;
- int i = 0;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -2548,27 +2733,14 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
goto out;
local->fd = fd_ref(fd);
- call_count = local->call_count;
-
- /*
- * Ideally we should synchronize flush against completion of writing
- * the delayed changelog, but for now we just push it out first...
- */
- afr_delayed_changelog_wake_up(this, fd);
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_flush_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- local->fd, NULL);
-
- if (!--call_count)
- break;
- }
+ stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata);
+ if (!stub) {
+ ret = -1;
+ op_errno = ENOMEM;
+ goto out;
}
+ afr_delayed_changelog_wake_resume (this, fd, stub);
ret = 0;
out:
@@ -2587,8 +2759,6 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
uint64_t ctx = 0;
afr_fd_ctx_t *fd_ctx = NULL;
int ret = 0;
- afr_fd_paused_call_t *paused_call = NULL;
- afr_fd_paused_call_t *tmp = NULL;
ret = fd_ctx_get (fd, this, &ctx);
if (ret < 0)
@@ -2604,12 +2774,6 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
GF_FREE (fd_ctx->locked_on);
GF_FREE (fd_ctx->pre_op_piggyback);
- list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls,
- call_list) {
- list_del_init (&paused_call->call_list);
- GF_FREE (paused_call);
- }
-
GF_FREE (fd_ctx->lock_piggyback);
GF_FREE (fd_ctx->lock_acquired);
@@ -2652,6 +2816,16 @@ afr_release (xlator_t *this, fd_t *fd)
/* {{{ fsync */
int
+afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+int
afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
@@ -2660,6 +2834,7 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int call_count = -1;
int child_index = (long) cookie;
int read_child = 0;
+ call_stub_t *stub = NULL;
local = frame->local;
@@ -2675,13 +2850,13 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->op_ret = 0;
if (local->success_count == 0) {
- local->cont.fsync.prebuf = *prebuf;
- local->cont.fsync.postbuf = *postbuf;
+ local->cont.inode_wfop.prebuf = *prebuf;
+ local->cont.inode_wfop.postbuf = *postbuf;
}
if (child_index == read_child) {
- local->cont.fsync.prebuf = *prebuf;
- local->cont.fsync.postbuf = *postbuf;
+ local->cont.inode_wfop.prebuf = *prebuf;
+ local->cont.inode_wfop.postbuf = *postbuf;
}
local->success_count++;
@@ -2694,10 +2869,32 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_count = afr_frame_return (frame);
if (call_count == 0) {
- AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno,
- &local->cont.fsync.prebuf,
- &local->cont.fsync.postbuf,
- NULL);
+ /* Make a stub out of the frame, and register it
+ with the waking up post-op. When the call-stub resumes,
+ we are guaranteed that there was no post-op pending
+ (i.e changelogs were unset in the server). This is an
+ essential "guarantee", that fsync() returns only after
+ completely finishing EVERYTHING, including the delayed
+ post-op. This guarantee is expected by FUSE graph switching
+ for example.
+ */
+ stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk,
+ local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ xdata);
+ if (!stub) {
+ AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0);
+ return 0;
+ }
+
+ /* If no new unstable writes happened between the
+ time we cleared the unstable write witness flag in afr_fsync
+ and now, calling afr_delayed_changelog_wake_up() should
+ wake up and skip over the fsync phase and go straight to
+ afr_changelog_post_op_now()
+ */
+ afr_delayed_changelog_wake_resume (this, local->fd, stub);
}
return 0;
@@ -2732,6 +2929,10 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
local->fd = fd_ref (fd);
+ if (afr_fd_has_witnessed_unstable_write (this, fd)) {
+ /* don't care. we only wanted to CLEAR the bit */
+ }
+
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND_COOKIE (frame, afr_fsync_cbk,
@@ -3889,6 +4090,17 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
goto out;
}
+ local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count,
+ sizeof (int),
+ gf_afr_mt_int32_t);
+ if (!local->transaction.postop_piggybacked) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->append_write = _gf_false;
+
ret = 0;
out:
return ret;
@@ -3900,16 +4112,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,
{
int ret = -ENOMEM;
- lk->inode_locked_nodes = GF_CALLOC (sizeof (*lk->inode_locked_nodes),
- child_count, gf_afr_mt_char);
- if (NULL == lk->inode_locked_nodes)
- goto out;
-
- lk->entry_locked_nodes = GF_CALLOC (sizeof (*lk->entry_locked_nodes),
- child_count, gf_afr_mt_char);
- if (NULL == lk->entry_locked_nodes)
- goto out;
-
lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
child_count, gf_afr_mt_char);
if (NULL == lk->locked_nodes)
@@ -3968,6 +4170,21 @@ out:
}
int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count)
+{
+ int ret = -ENOMEM;
+
+ lk->domain = dom;
+ lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
+ child_count, gf_afr_mt_char);
+ if (NULL == lk->locked_nodes)
+ goto out;
+ ret = 0;
+out:
+ return ret;
+}
+
+int
afr_transaction_local_init (afr_local_t *local, xlator_t *this)
{
int child_up_count = 0;
@@ -3980,6 +4197,14 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
if (ret < 0)
goto out;
+ if ((local->transaction.type == AFR_DATA_TRANSACTION) ||
+ (local->transaction.type == AFR_METADATA_TRANSACTION)) {
+ ret = afr_inodelk_init (&local->internal_lock.inodelk[0],
+ this->name, priv->child_count);
+ if (ret < 0)
+ goto out;
+ }
+
ret = -ENOMEM;
child_up_count = afr_up_children_count (local->child_up,
priv->child_count);
@@ -4001,14 +4226,6 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
if (!local->fresh_children)
goto out;
- if (local->fd) {
- local->fd_open_on = GF_CALLOC (sizeof (*local->fd_open_on),
- priv->child_count,
- gf_afr_mt_char);
- if (!local->fd_open_on)
- goto out;
- }
-
local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op),
priv->child_count,
gf_afr_mt_char);
@@ -4024,6 +4241,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
AFR_NUM_CHANGE_LOGS);
if (!local->transaction.txn_changelog)
goto out;
+
+ INIT_LIST_HEAD (&local->transaction.eager_locked);
+
ret = 0;
out:
return ret;
@@ -4211,6 +4431,16 @@ afr_priv_destroy (afr_private_t *priv)
if (priv->shd.split_brain)
eh_destroy (priv->shd.split_brain);
+ for (i = 0; i < priv->child_count; i++)
+ {
+ if (priv->shd.statistics[i])
+ eh_destroy (priv->shd.statistics[i]);
+ }
+
+ GF_FREE (priv->shd.statistics);
+
+ GF_FREE (priv->shd.crawl_events);
+
GF_FREE (priv->last_event);
if (priv->pending_key) {
for (i = 0; i < priv->child_count; i++)
@@ -4276,3 +4506,86 @@ afr_prepare_new_entry_pending_matrix (int32_t **pending,
}
}
}
+
+gf_boolean_t
+afr_is_fd_fixable (fd_t *fd)
+{
+ if (!fd || !fd->inode)
+ return _gf_false;
+ else if (fd_is_anonymous (fd))
+ return _gf_false;
+ else if (uuid_is_null (fd->inode->gfid))
+ return _gf_false;
+
+ return _gf_true;
+}
+
+void
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ inode_t *inode = NULL;
+ afr_inode_ctx_t *ctx = NULL;
+
+ local = frame->local;
+
+ if (local->fd)
+ inode = local->fd->inode;
+ else
+ inode = local->loc.inode;
+
+ if (!inode)
+ return;
+
+ LOCK (&inode->lock);
+ {
+ ctx = __afr_inode_ctx_get (inode, this);
+ ctx->open_fd_count = local->open_fd_count;
+ }
+ UNLOCK (&inode->lock);
+}
+
+int
+afr_initialise_statistics (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ int i = 0;
+ int child_count = 0;
+ eh_t *stats_per_brick = NULL;
+ shd_crawl_event_t ***shd_crawl_events = NULL;
+ priv = this->private;
+
+ priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count,
+ gf_common_mt_eh_t);
+ if (!priv->shd.statistics) {
+ ret = -1;
+ goto out;
+ }
+ child_count = priv->child_count;
+ for (i=0; i < child_count ; i++) {
+ stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE,
+ _gf_false,
+ _destroy_crawl_event_data);
+ if (!stats_per_brick) {
+ ret = -1;
+ goto out;
+ }
+ priv->shd.statistics[i] = stats_per_brick;
+
+ }
+
+ shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events);
+ *shd_crawl_events = GF_CALLOC (sizeof(shd_crawl_event_t*),
+ priv->child_count,
+ gf_afr_mt_shd_crawl_event_t);
+
+ if (!priv->shd.crawl_events) {
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+
+}
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index c201d45fd..689dd84e6 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -253,7 +253,7 @@ unlock:
goto out;
if (!afr_is_opendir_done (this, local->fd->inode) &&
- up_children_count > 1) {
+ up_children_count > 1 && priv->entry_self_heal) {
/*
* This is the first opendir on this inode. We need
@@ -358,85 +358,6 @@ struct entry_name {
struct list_head list;
};
-
-static gf_boolean_t
-remembered_name (const char *name, struct list_head *entries)
-{
- struct entry_name *e = NULL;
- gf_boolean_t ret = _gf_false;
-
- list_for_each_entry (e, entries, list) {
- if (!strcmp (name, e->name)) {
- ret = _gf_true;
- goto out;
- }
- }
-
-out:
- return ret;
-}
-
-
-static void
-afr_remember_entries (gf_dirent_t *entries, fd_t *fd)
-{
- struct entry_name *n = NULL;
- gf_dirent_t *entry = NULL;
- int ret = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- ret = fd_ctx_get (fd, THIS, &ctx);
- if (ret < 0) {
- gf_log (THIS->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", fd);
- return;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- list_for_each_entry (entry, &entries->list, list) {
- n = GF_CALLOC (1, sizeof (*n), gf_afr_mt_entry_name);
- n->name = gf_strdup (entry->d_name);
- INIT_LIST_HEAD (&n->list);
-
- list_add (&n->list, &fd_ctx->entries);
- }
-}
-
-
-static off_t
-afr_filter_entries (gf_dirent_t *entries, fd_t *fd)
-{
- gf_dirent_t *entry = NULL;
- gf_dirent_t *tmp = NULL;
- int ret = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- off_t offset = 0;
-
- ret = fd_ctx_get (fd, THIS, &ctx);
- if (ret < 0) {
- gf_log (THIS->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", fd);
- return -1;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- offset = entry->d_off;
-
- if (remembered_name (entry->d_name, &fd_ctx->entries)) {
- list_del (&entry->list);
- GF_FREE (entry);
- }
- }
-
- return offset;
-}
-
-
static void
afr_forget_entries (fd_t *fd)
{
@@ -462,14 +383,36 @@ afr_forget_entries (fd_t *fd)
}
}
+static void
+afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd)
+{
+ gf_dirent_t * entry = NULL;
+ gf_dirent_t * tmp = NULL;
+
+ list_for_each_entry_safe (entry, tmp, &entries->list, list) {
+ if (__is_root_gfid (fd->inode->gfid) &&
+ !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) {
+ list_del_init (&entry->list);
+ GF_FREE (entry);
+ }
+ }
+}
int32_t
afr_readdir_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
gf_dirent_t *entries, dict_t *xdata)
{
- AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL);
+ afr_local_t *local = NULL;
+
+ if (op_ret == -1)
+ goto out;
+ local = frame->local;
+ afr_readdir_filter_trash_dir (entries, local->fd);
+
+out:
+ AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL);
return 0;
}
@@ -479,116 +422,16 @@ afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int32_t next_call_child = -1;
- int ret = 0;
- int32_t *last_index = NULL;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- off_t offset = 0;
- int32_t call_child = -1;
-
- priv = this->private;
- children = priv->children;
-
- local = frame->local;
+ afr_local_t *local = NULL;
- if ((priv->readdir_failover == _gf_false) && (op_ret < 0))
+ if (op_ret == -1)
goto out;
- read_child = (long) cookie;
- last_index = &local->cont.readdir.last_index;
- fresh_children = local->fresh_children;
-
- /* the value of the last_index changes if afr_next_call_child is
- * called. So to find the call_child of this callback use last_index
- * before the next_call_child call.
- */
- if (*last_index == -1)
- call_child = read_child;
- else
- call_child = fresh_children[*last_index];
-
- if (priv->strict_readdir) {
- ret = fd_ctx_get (local->fd, this, &ctx);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", local->fd);
- op_ret = -1;
- op_errno = -ret;
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- if (op_ret == -1) {
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index,
- read_child);
- if (next_call_child < 0)
- goto out;
- gf_log (this->name, GF_LOG_TRACE,
- "starting readdir afresh on child %d, offset %"PRId64,
- next_call_child, (uint64_t) 0);
-
- fd_ctx->failed_over = _gf_true;
-
- STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->readdirp,
- local->fd,
- local->cont.readdir.size, 0,
- local->cont.readdir.dict);
- return 0;
- }
- }
-
- if (priv->strict_readdir) {
- if (fd_ctx->failed_over) {
- if (list_empty (&entries->list)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no entries found");
- goto out;
- }
-
- offset = afr_filter_entries (entries, local->fd);
-
- afr_remember_entries (entries, local->fd);
-
- if (list_empty (&entries->list)) {
- /* All the entries we got were duplicate. We
- shouldn't send an empty list now, because
- that will make the application stop reading. So
- try to get more entries */
-
- gf_log (this->name, GF_LOG_TRACE,
- "trying to fetch non-duplicate entries "
- "from offset %"PRId64", child %s",
- offset, children[call_child]->name);
-
- STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
- (void *) (long) read_child,
- children[call_child],
- children[call_child]->fops->readdirp,
- local->fd, local->cont.readdir.size, offset,
- local->cont.readdir.dict);
- return 0;
- }
- } else {
- afr_remember_entries (entries, local->fd);
- }
- }
+ local = frame->local;
+ afr_readdir_filter_trash_dir (entries, local->fd);
out:
AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL);
-
return 0;
}
@@ -654,20 +497,6 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this,
local->cont.readdir.size = size;
local->cont.readdir.dict = (dict)? dict_ref (dict) : NULL;
- if (priv->strict_readdir) {
- if (fd_ctx->last_tried != call_child) {
- gf_log (this->name, GF_LOG_TRACE,
- "first up child has changed from %d to %d, "
- "restarting readdir from offset 0",
- fd_ctx->last_tried, call_child);
-
- fd_ctx->failed_over = _gf_true;
- offset = 0;
- }
-
- fd_ctx->last_tried = call_child;
- }
-
if (whichop == GF_FOP_READDIR)
STACK_WIND_COOKIE (frame, afr_readdir_cbk,
(void *) (long) call_child,
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index d86b08b87..1943b719b 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -231,7 +231,7 @@ afr_dir_fop_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this)
afr_mark_new_entry_changelog (frame, this);
out:
- local->transaction.resume (frame, this);
+ return;
}
void
@@ -253,6 +253,7 @@ afr_dir_fop_done (call_frame_t *frame, xlator_t *this)
done:
local->transaction.unwind (frame, this);
afr_dir_fop_mark_entry_pending_changelog (frame, this);
+ local->transaction.resume (frame, this);
}
/* {{{ create */
@@ -418,11 +419,12 @@ afr_create (call_frame_t *frame, xlator_t *this,
loc_t *loc, int32_t flags, mode_t mode,
mode_t umask, fd_t *fd, dict_t *params)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -473,7 +475,17 @@ afr_create (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+ int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
@@ -612,11 +624,12 @@ int
afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
dev_t dev, mode_t umask, dict_t *params)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -666,7 +679,17 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
@@ -802,16 +825,16 @@ afr_mkdir_done (call_frame_t *frame, xlator_t *this)
return 0;
}
-
int
afr_mkdir (call_frame_t *frame, xlator_t *this,
loc_t *loc, mode_t mode, mode_t umask, dict_t *params)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -848,6 +871,7 @@ afr_mkdir (call_frame_t *frame, xlator_t *this,
if (params)
local->xdata_req = dict_ref (params);
+ local->op = GF_FOP_MKDIR;
local->transaction.fop = afr_mkdir_wind;
local->transaction.done = afr_mkdir_done;
local->transaction.unwind = afr_mkdir_unwind;
@@ -859,7 +883,17 @@ afr_mkdir (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+ int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
@@ -966,7 +1000,8 @@ afr_link_wind (call_frame_t *frame, xlator_t *this)
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i,
+ STACK_WIND_COOKIE (frame, afr_link_wind_cbk,
+ (void *) (long) i,
priv->children[i],
priv->children[i]->fops->link,
&local->loc,
@@ -998,11 +1033,12 @@ int
afr_link (call_frame_t *frame, xlator_t *this,
loc_t *oldloc, loc_t *newloc, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -1037,6 +1073,7 @@ afr_link (call_frame_t *frame, xlator_t *this,
}
UNLOCK (&priv->read_child_lock);
+ local->op = GF_FOP_LINK;
local->transaction.fop = afr_link_wind;
local->transaction.done = afr_link_done;
local->transaction.unwind = afr_link_unwind;
@@ -1048,13 +1085,22 @@ afr_link (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (newloc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+ int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
goto out;
}
-
ret = 0;
out:
if (ret < 0) {
@@ -1190,11 +1236,12 @@ int
afr_symlink (call_frame_t *frame, xlator_t *this,
const char *linkpath, loc_t *loc, mode_t umask, dict_t *params)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -1231,6 +1278,7 @@ afr_symlink (call_frame_t *frame, xlator_t *this,
if (params)
local->xdata_req = dict_ref (params);
+ local->op = GF_FOP_SYMLINK;
local->transaction.fop = afr_symlink_wind;
local->transaction.done = afr_symlink_done;
local->transaction.unwind = afr_symlink_unwind;
@@ -1242,7 +1290,17 @@ afr_symlink (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
@@ -1317,6 +1375,7 @@ afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY)
afr_transaction_fop_failed (frame, this, child_index);
local->op_errno = op_errno;
+ local->child_errno[child_index] = op_errno;
if (op_ret > -1)
__dir_entry_fop_common_cbk (frame, child_index, this,
@@ -1391,11 +1450,13 @@ int
afr_rename (call_frame_t *frame, xlator_t *this,
loc_t *oldloc, loc_t *newloc, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
+ int nlockee = 0;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -1423,6 +1484,7 @@ afr_rename (call_frame_t *frame, xlator_t *this,
local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL);
+ local->op = GF_FOP_RENAME;
local->transaction.fop = afr_rename_wind;
local->transaction.done = afr_rename_done;
local->transaction.unwind = afr_rename_unwind;
@@ -1439,6 +1501,38 @@ afr_rename (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (oldloc->path);
local->transaction.new_basename = AFR_BASENAME (newloc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = nlockee = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->transaction.new_parent_loc,
+ local->transaction.new_basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ if (local->newloc.inode && IA_ISDIR (local->newloc.inode->ia_type)) {
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->newloc,
+ NULL,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ }
+ qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee),
+ afr_entry_lockee_cmp);
+ int_lock->lockee_count = nlockee;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION);
if (ret < 0) {
@@ -1578,11 +1672,12 @@ int32_t
afr_unlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, int xflag, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -1610,6 +1705,7 @@ afr_unlink (call_frame_t *frame, xlator_t *this,
if (xdata)
local->xdata_req = dict_ref (xdata);
+ local->op = GF_FOP_UNLINK;
local->transaction.fop = afr_unlink_wind;
local->transaction.done = afr_unlink_done;
local->transaction.unwind = afr_unlink_unwind;
@@ -1621,7 +1717,17 @@ afr_unlink (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
@@ -1695,6 +1801,7 @@ afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY))
afr_transaction_fop_failed (frame, this, child_index);
local->op_errno = op_errno;
+ local->child_errno[child_index] = op_errno;
if (op_ret > -1)
__dir_entry_fop_common_cbk (frame, child_index, this,
op_ret, op_errno, NULL, NULL,
@@ -1768,11 +1875,13 @@ int
afr_rmdir (call_frame_t *frame, xlator_t *this,
loc_t *loc, int flags, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
+ int nlockee = 0;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -1798,6 +1907,7 @@ afr_rmdir (call_frame_t *frame, xlator_t *this,
local->cont.rmdir.flags = flags;
loc_copy (&local->loc, loc);
+ local->op = GF_FOP_RMDIR;
local->transaction.fop = afr_rmdir_wind;
local->transaction.done = afr_rmdir_done;
local->transaction.unwind = afr_rmdir_unwind;
@@ -1809,6 +1919,28 @@ afr_rmdir (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = nlockee = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->loc,
+ NULL,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee),
+ afr_entry_lockee_cmp);
+ int_lock->lockee_count = nlockee;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index beff715b5..e06e3b2f2 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -117,6 +117,8 @@ afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
children = priv->children;
+ AFR_SBRAIN_CHECK_LOC (loc, out);
+
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -232,6 +234,8 @@ afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
children = priv->children;
+ AFR_SBRAIN_CHECK_LOC (loc, out);
+
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -348,6 +352,8 @@ afr_fstat (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (fd->inode, out);
+ AFR_SBRAIN_CHECK_FD (fd, out);
+
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -377,11 +383,8 @@ afr_fstat (call_frame_t *frame, xlator_t *this,
local->fd = fd_ref (fd);
- ret = afr_open_fd_fix (frame, this, _gf_false);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
+ afr_open_fd_fix (fd, this);
+
STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child,
children[call_child],
children[call_child]->fops->fstat,
@@ -470,6 +473,8 @@ afr_readlink (call_frame_t *frame, xlator_t *this,
children = priv->children;
+ AFR_SBRAIN_CHECK_LOC (loc, out);
+
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -1058,7 +1063,7 @@ unlock:
}
len = dict_serialized_length (local->dict);
- if (len == 0) {
+ if (len <= 0) {
goto unwind;
}
@@ -1321,6 +1326,62 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
return ret;
}
+static int
+afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data)
+{
+ int ret = 0;
+
+ if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0)
+ ret = gf_get_min_stime (THIS, data, key, value);
+
+ return ret;
+}
+
+int32_t
+afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_log ("", GF_LOG_ERROR, "possible NULL deref");
+ goto out;
+ }
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (!dict || (op_ret < 0)) {
+ local->op_errno = op_errno;
+ goto cleanup;
+ }
+
+ if (!local->dict)
+ local->dict = dict_copy_with_ref (dict, NULL);
+ else
+ dict_foreach (dict, afr_aggregate_stime_xattr,
+ local->dict);
+ local->op_ret = 0;
+ }
+
+cleanup:
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
+ local->op_errno, local->dict, xdata);
+ }
+
+out:
+ return 0;
+}
+
+
static gf_boolean_t
afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk,
gf_boolean_t is_fgetxattr)
@@ -1353,6 +1414,8 @@ afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk,
} else {
*cbk = afr_getxattr_lockinfo_cbk;
}
+ } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) {
+ *cbk = afr_common_getxattr_stime_cbk;
} else {
is_spl = _gf_false;
}
@@ -1401,7 +1464,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
int32_t read_child = -1;
int ret = -1;
fop_getxattr_cbk_t cbk = NULL;
-
+ int afr_xtime_gauge[MCNT_MAX] = {0,};
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -1412,6 +1475,8 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
children = priv->children;
+ AFR_SBRAIN_CHECK_LOC (loc, out);
+
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -1434,7 +1499,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
goto out;
}
if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0)
- && (-1 == frame->root->pid)) {
+ && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
local->marker.call_count = priv->child_count;
@@ -1450,6 +1515,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
sub_volumes,
priv->child_count,
MARKER_UUID_TYPE,
+ marker_uuid_default_gauge,
priv->vol_uuid)) {
gf_log (this->name, GF_LOG_INFO,
@@ -1484,7 +1550,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
if (*priv->vol_uuid) {
if ((match_uuid_local (name, priv->vol_uuid) == 0)
- && (-1 == frame->root->pid)) {
+ && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
local->marker.call_count = priv->child_count;
sub_volumes = alloca ( priv->child_count
@@ -1496,12 +1562,20 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
}
+ /* don't err out on getting ENOTCONN (brick down)
+ * from a subset of the bricks
+ */
+ memcpy (afr_xtime_gauge, marker_xtime_default_gauge,
+ sizeof (afr_xtime_gauge));
+ afr_xtime_gauge[MCNT_NOTFOUND] = 0;
+ afr_xtime_gauge[MCNT_ENOTCONN] = 0;
if (cluster_getmarkerattr (frame, this, loc,
name, local,
afr_getxattr_unwind,
sub_volumes,
priv->child_count,
MARKER_XTIME_TYPE,
+ afr_xtime_gauge,
priv->vol_uuid)) {
gf_log (this->name, GF_LOG_INFO,
"%s: failed to get marker attr (%s)",
@@ -1658,6 +1732,8 @@ afr_fgetxattr (call_frame_t *frame, xlator_t *this,
children = priv->children;
+ AFR_SBRAIN_CHECK_FD (fd, out);
+
AFR_LOCAL_ALLOC_OR_GOTO (local, out);
frame->local = local;
@@ -1813,6 +1889,8 @@ afr_readv (call_frame_t *frame, xlator_t *this,
priv = this->private;
children = priv->children;
+ AFR_SBRAIN_CHECK_FD (fd, out);
+
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
@@ -1842,11 +1920,8 @@ afr_readv (call_frame_t *frame, xlator_t *this,
local->cont.readv.offset = offset;
local->cont.readv.flags = flags;
- ret = afr_open_fd_fix (frame, this, _gf_false);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
+ afr_open_fd_fix (fd, this);
+
STACK_WIND_COOKIE (frame, afr_readv_cbk,
(void *) (long) call_child,
children[call_child],
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index d9d3800fc..c1ec69a55 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -39,34 +39,133 @@
#include "afr-transaction.h"
#include "afr-self-heal-common.h"
+void
+__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child,
+ xlator_t *this, int32_t *op_ret, int32_t *op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (afr_fop_failed (*op_ret, *op_errno)) {
+ local->child_errno[child_index] = *op_errno;
+
+ switch (local->op) {
+ case GF_FOP_TRUNCATE:
+ case GF_FOP_FTRUNCATE:
+ if (*op_errno != EFBIG)
+ afr_transaction_fop_failed (frame, this,
+ child_index);
+ break;
+ default:
+ afr_transaction_fop_failed (frame, this, child_index);
+ break;
+ }
+ local->op_errno = *op_errno;
+ goto out;
+ }
+
+ if ((local->success_count == 0) || (read_child == child_index)) {
+ local->op_ret = *op_ret;
+ if (prebuf)
+ local->cont.inode_wfop.prebuf = *prebuf;
+ if (postbuf)
+ local->cont.inode_wfop.postbuf = *postbuf;
+ }
+
+ local->success_count++;
+out:
+ return;
+}
+
/* {{{ writev */
-int
+void
+afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame)
+{
+ afr_local_t *src_local = NULL;
+ afr_local_t *dst_local = NULL;
+
+ src_local = src_frame->local;
+ dst_local = dst_frame->local;
+
+ dst_local->op_ret = src_local->op_ret;
+ dst_local->op_errno = src_local->op_errno;
+ dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf;
+ dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf;
+}
+
+void
afr_writev_unwind (call_frame_t *frame, xlator_t *this)
{
afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ local = frame->local;
+
+ AFR_STACK_UNWIND (writev, frame,
+ local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ NULL);
+}
+
+call_frame_t*
+afr_transaction_detach_fop_frame (call_frame_t *frame)
+{
+ afr_local_t * local = NULL;
+ call_frame_t *fop_frame = NULL;
local = frame->local;
LOCK (&frame->lock);
{
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
+ fop_frame = local->transaction.main_frame;
local->transaction.main_frame = NULL;
}
UNLOCK (&frame->lock);
- if (main_frame) {
- AFR_STACK_UNWIND (writev, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.writev.prebuf,
- &local->cont.writev.postbuf,
- NULL);
+ return fop_frame;
+}
+
+int
+afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *fop_frame = NULL;
+
+ fop_frame = afr_transaction_detach_fop_frame (frame);
+
+ if (fop_frame) {
+ afr_writev_copy_outvars (frame, fop_frame);
+ afr_writev_unwind (fop_frame, this);
}
return 0;
}
+static void
+afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+ /*
+ * We already have the best case result of the writev calls staged
+ * as the return value. Any writev that returns some value less
+ * than the best case is now out of sync, so mark the fop as
+ * failed. Note that fops that have returned with errors have
+ * already been marked as failed.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if ((!local->replies[i].valid) ||
+ (local->replies[i].op_ret == -1))
+ continue;
+
+ if (local->replies[i].op_ret < local->op_ret)
+ afr_transaction_fop_failed(frame, this, i);
+ }
+}
int
afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -74,14 +173,17 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
afr_local_t * local = NULL;
+ afr_private_t *priv = NULL;
+ call_frame_t *fop_frame = NULL;
int child_index = (long) cookie;
int call_count = -1;
int read_child = 0;
- afr_private_t *priv = NULL;
- int i = 0;
+ int ret = 0;
+ uint32_t open_fd_count = 0;
+ uint32_t write_is_append = 0;
local = frame->local;
- priv = this->private;
+ priv = this->private;
read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
@@ -91,12 +193,14 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->read_child_returned = _gf_true;
}
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, prebuf, postbuf,
+ xdata);
+
local->replies[child_index].valid = 1;
local->replies[child_index].op_ret = op_ret;
local->replies[child_index].op_errno = op_errno;
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
/* stage the best case return value for unwind */
if ((local->success_count == 0) || (op_ret > local->op_ret)) {
@@ -105,12 +209,24 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
if (op_ret != -1) {
- if ((local->success_count == 0) ||
- (child_index == read_child)) {
- local->cont.writev.prebuf = *prebuf;
- local->cont.writev.postbuf = *postbuf;
- }
- local->success_count++;
+ if (xdata) {
+ ret = dict_get_uint32 (xdata,
+ GLUSTERFS_OPEN_FD_COUNT,
+ &open_fd_count);
+ if ((ret == 0) &&
+ (open_fd_count > local->open_fd_count)) {
+ local->open_fd_count = open_fd_count;
+ local->update_open_fd_count = _gf_true;
+ }
+
+ write_is_append = 0;
+ ret = dict_get_uint32 (xdata,
+ GLUSTERFS_WRITE_IS_APPEND,
+ &write_is_append);
+ if (ret || !write_is_append)
+ local->append_write = _gf_false;
+ }
+
}
}
UNLOCK (&frame->lock);
@@ -118,24 +234,40 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_count = afr_frame_return (frame);
if (call_count == 0) {
- /*
- * We already have the best case result of the writev calls staged
- * as the return value. Any writev that returns some value less
- * than the best case is now out of sync, so mark the fop as
- * failed. Note that fops that have returned with errors have
- * already been marked as failed.
- */
- for (i = 0; i < priv->child_count; i++) {
- if ((!local->replies[i].valid) ||
- (local->replies[i].op_ret == -1))
- continue;
-
- if (local->replies[i].op_ret < local->op_ret)
- afr_transaction_fop_failed(frame, this, i);
- }
- local->transaction.unwind (frame, this);
- local->transaction.resume (frame, this);
+ if (local->update_open_fd_count)
+ afr_handle_open_fd_count (frame, this);
+
+ if (!local->stable_write && !local->append_write)
+ /* An appended write removes the necessity to
+ fsync() the file. This is because self-heal
+ has the logic to check for larger file when
+ the xattrs are not reliably pointing at
+ a stale file.
+ */
+ afr_fd_report_unstable_write (this, local->fd);
+
+ afr_writev_handle_short_writes (frame, this);
+ if (afr_any_fops_failed (local, priv)) {
+ //Don't unwind until post-op is complete
+ local->transaction.resume (frame, this);
+ } else {
+ /*
+ * Generally inode-write fops do transaction.unwind then
+ * transaction.resume, but writev needs to make sure that
+ * delayed post-op frame is placed in fdctx before unwind
+ * happens. This prevents the race of flush doing the
+ * changelog wakeup first in fuse thread and then this
+ * writev placing its delayed post-op frame in fdctx.
+ * This helps flush make sure all the delayed post-ops are
+ * completed.
+ */
+
+ fop_frame = afr_transaction_detach_fop_frame (frame);
+ afr_writev_copy_outvars (frame, fop_frame);
+ local->transaction.resume (frame, this);
+ afr_writev_unwind (fop_frame, this);
+ }
}
return 0;
}
@@ -147,6 +279,8 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this)
afr_private_t *priv = NULL;
int i = 0;
int call_count = -1;
+ dict_t *xdata = NULL;
+ GF_UNUSED int ret = 0;
local = frame->local;
priv = this->private;
@@ -170,6 +304,19 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this)
return 0;
}
+ xdata = dict_new ();
+ if (xdata) {
+ ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT,
+ sizeof (uint32_t));
+ ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND,
+ 0);
+ /* Set append_write to be true speculatively. If on any
+ server it turns not be true, we unset it in the
+ callback.
+ */
+ local->append_write = _gf_true;
+ }
+
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.pre_op[i]) {
STACK_WIND_COOKIE (frame, afr_writev_wind_cbk,
@@ -182,13 +329,16 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this)
local->cont.writev.offset,
local->cont.writev.flags,
local->cont.writev.iobref,
- NULL);
+ xdata);
if (!--call_count)
break;
}
}
+ if (xdata)
+ dict_unref (xdata);
+
return 0;
}
@@ -228,7 +378,7 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)
}
transaction_frame->local = local;
- frame->local = NULL;
+ AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local->op = GF_FOP_WRITE;
@@ -236,10 +386,17 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)
local->transaction.fop = afr_writev_wind;
local->transaction.done = afr_writev_done;
- local->transaction.unwind = afr_writev_unwind;
+ local->transaction.unwind = afr_transaction_writev_unwind;
local->transaction.main_frame = frame;
if (local->fd->flags & O_APPEND) {
+ /*
+ * Backend vfs ignores the 'offset' for append mode fd so
+ * locking just the region provided for the writev does not
+ * give consistency gurantee. The actual write may happen at a
+ * completely different range than the one provided by the
+ * offset, len in the fop. So lock the entire file.
+ */
local->transaction.start = 0;
local->transaction.len = 0;
} else {
@@ -265,153 +422,74 @@ out:
return 0;
}
-static int
-afr_prepare_loc (call_frame_t *frame, fd_t *fd)
+static void
+afr_trigger_open_fd_self_heal (fd_t *fd, xlator_t *this)
{
- afr_local_t *local = NULL;
- char *name = NULL;
- char *path = NULL;
- int ret = 0;
-
- if ((!fd) || (!fd->inode))
- return -1;
-
- local = frame->local;
- ret = inode_path (fd->inode, NULL, (char **)&path);
- if (ret <= 0) {
- gf_log (frame->this->name, GF_LOG_DEBUG,
- "Unable to get path for gfid: %s",
- uuid_utoa (fd->inode->gfid));
- return -1;
- }
-
- if (local->loc.path) {
- if (strcmp (path, local->loc.path))
- gf_log (frame->this->name, GF_LOG_DEBUG,
- "overwriting old loc->path %s with %s",
- local->loc.path, path);
- GF_FREE ((char *)local->loc.path);
- }
- local->loc.path = path;
-
- name = strrchr (local->loc.path, '/');
- if (name)
- name++;
- local->loc.name = name;
-
- if (local->loc.inode) {
- inode_unref (local->loc.inode);
- }
- local->loc.inode = inode_ref (fd->inode);
-
- if (local->loc.parent) {
- inode_unref (local->loc.parent);
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ char *reason = NULL;
+ int32_t op_errno = 0;
+ int ret = 0;
+
+ if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) {
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid args: "
+ "fd: %p, inode: %p", fd,
+ fd ? fd->inode : NULL);
+ goto out;
}
- local->loc.parent = inode_parent (local->loc.inode, 0, NULL);
-
- return 0;
-}
-
-afr_fd_paused_call_t*
-afr_paused_call_create (call_frame_t *frame)
-{
- afr_local_t *local = NULL;
- afr_fd_paused_call_t *paused_call = NULL;
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame)
+ goto out;
+ AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
- GF_ASSERT (local->fop_call_continue);
-
- paused_call = GF_CALLOC (1, sizeof (*paused_call),
- gf_afr_fd_paused_call_t);
- if (paused_call) {
- INIT_LIST_HEAD (&paused_call->call_list);
- paused_call->frame = frame;
- }
-
- return paused_call;
-}
-
-static int
-afr_pause_fd_fop (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx)
-{
- afr_fd_paused_call_t *paused_call = NULL;
- int ret = 0;
-
- paused_call = afr_paused_call_create (frame);
- if (paused_call)
- list_add (&paused_call->call_list, &fd_ctx->paused_calls);
- else
- ret = -ENOMEM;
-
- return ret;
-}
+ ret = afr_local_init (local, this->private, &op_errno);
+ if (ret < 0)
+ goto out;
-static void
-afr_trigger_open_fd_self_heal (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- inode_t *inode = NULL;
- char *reason = NULL;
+ local->loc.inode = inode_ref (fd->inode);
+ ret = loc_path (&local->loc, NULL);
+ if (ret < 0)
+ goto out;
- local = frame->local;
sh = &local->self_heal;
- inode = local->fd->inode;
-
- sh->do_missing_entry_self_heal = _gf_true;
- sh->do_gfid_self_heal = _gf_true;
- sh->do_data_self_heal = _gf_true;
+ sh->do_metadata_self_heal = _gf_true;
+ if (fd->inode->ia_type == IA_IFREG)
+ sh->do_data_self_heal = _gf_true;
+ else if (fd->inode->ia_type == IA_IFDIR)
+ sh->do_entry_self_heal = _gf_true;
reason = "subvolume came online";
- afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type,
- reason, NULL, NULL);
+ afr_launch_self_heal (frame, this, fd->inode, _gf_true,
+ fd->inode->ia_type, reason, NULL, NULL);
+ return;
+out:
+ AFR_STACK_DESTROY (frame);
}
-int
-afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop)
-{
- int ret = 0;
- int i = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- inode_t *inode = NULL;
- gf_boolean_t need_self_heal = _gf_false;
- int *need_open = NULL;
- int need_open_count = 0;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- gf_boolean_t fop_continue = _gf_true;
+void
+afr_open_fd_fix (fd_t *fd, xlator_t *this)
+{
+ int ret = 0;
+ int i = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ gf_boolean_t need_self_heal = _gf_false;
+ int *need_open = NULL;
+ size_t need_open_count = 0;
+ afr_private_t *priv = NULL;
- local = frame->local;
priv = this->private;
- GF_ASSERT (local->fd);
-
- inode = local->fd->inode;
- //gfid is not set in rebalance, that case needs to be handled.
- if (fd_is_anonymous (local->fd) ||
- !inode || uuid_is_null (inode->gfid)) {
- fop_continue = _gf_true;
- goto out;
- }
-
- if (pause_fop)
- GF_ASSERT (local->fop_call_continue);
-
- ret = afr_prepare_loc (frame, local->fd);
- if (ret < 0) {
- //File does not exist we cant open it.
- ret = 0;
+ if (!afr_is_fd_fixable (fd))
goto out;
- }
- fd_ctx = afr_fd_ctx_get (local->fd, this);
- if (!fd_ctx) {
- ret = -EINVAL;
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
goto out;
- }
- LOCK (&local->fd->lock);
+ LOCK (&fd->lock);
{
if (fd_ctx->up_count < priv->up_count) {
need_self_heal = _gf_true;
@@ -419,55 +497,34 @@ afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop)
fd_ctx->down_count = priv->down_count;
}
+ need_open = alloca (priv->child_count * sizeof (*need_open));
for (i = 0; i < priv->child_count; i++) {
- if ((fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED) &&
- local->child_up[i]) {
- fd_ctx->opened_on[i] = AFR_FD_OPENING;
- if (!need_open)
- need_open = GF_CALLOC (priv->child_count,
- sizeof (*need_open),
- gf_afr_mt_int32_t);
- need_open[i] = 1;
- need_open_count++;
- } else if (pause_fop && local->child_up[i] &&
- (fd_ctx->opened_on[i] == AFR_FD_OPENING)) {
- local->fop_paused = _gf_true;
- }
- }
+ need_open[i] = 0;
+ if (fd_ctx->opened_on[i] != AFR_FD_NOT_OPENED)
+ continue;
+
+ if (!priv->child_up[i])
+ continue;
+
+ fd_ctx->opened_on[i] = AFR_FD_OPENING;
- if (local->fop_paused) {
- GF_ASSERT (pause_fop);
- gf_log (this->name, GF_LOG_INFO, "Pause fd %p",
- local->fd);
- ret = afr_pause_fd_fop (frame, this, fd_ctx);
- if (ret)
- goto unlock;
- fop_continue = _gf_false;
+ need_open[i] = 1;
+ need_open_count++;
}
}
-unlock:
- UNLOCK (&local->fd->lock);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to fix fd for %s",
- local->loc.path);
- fop_continue = _gf_false;
+ UNLOCK (&fd->lock);
+ if (ret)
goto out;
- }
if (need_self_heal)
- afr_trigger_open_fd_self_heal (frame, this);
+ afr_trigger_open_fd_self_heal (fd, this);
if (!need_open_count)
goto out;
- gf_log (this->name, GF_LOG_INFO, "Opening fd %p", local->fd);
- afr_fix_open (frame, this, fd_ctx, need_open_count, need_open);
- fop_continue = _gf_false;
+ afr_fix_open (this, fd, need_open_count, need_open);
out:
- GF_FREE (need_open);
- if (fop_continue && local->fop_call_continue)
- local->fop_call_continue (frame, this);
- return ret;
+ return;
}
int
@@ -486,6 +543,11 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
priv = this->private;
+ if (afr_is_split_brain (this, fd->inode)) {
+ op_errno = EIO;
+ goto out;
+ }
+
QUORUM_CHECK(writev,out);
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
@@ -502,13 +564,15 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
local->cont.writev.iobref = iobref_ref (iobref);
local->fd = fd_ref (fd);
- local->fop_call_continue = afr_do_writev;
- ret = afr_open_fd_fix (frame, this, _gf_true);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
+ /* detect here, but set it in writev_wind_cbk *after* the unstable
+ write is performed
+ */
+ local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC));
+
+ afr_open_fd_fix (fd, this);
+
+ afr_do_writev (frame, this);
ret = 0;
out:
@@ -542,8 +606,8 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this)
if (main_frame) {
AFR_STACK_UNWIND (truncate, main_frame, local->op_ret,
local->op_errno,
- &local->cont.truncate.prebuf,
- &local->cont.truncate.postbuf,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
NULL);
}
@@ -557,14 +621,11 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
int child_index = (long) cookie;
int read_child = 0;
int call_count = -1;
- int need_unwind = 0;
local = frame->local;
- priv = this->private;
read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL);
@@ -574,38 +635,22 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->read_child_returned = _gf_true;
}
- if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG)
- afr_transaction_fop_failed (frame, this, child_index);
-
if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.truncate.prebuf = *prebuf;
- local->cont.truncate.postbuf = *postbuf;
- }
-
- if (child_index == read_child) {
- local->cont.truncate.prebuf = *prebuf;
- local->cont.truncate.postbuf = *postbuf;
- }
-
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
+ if (prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
}
- local->op_errno = op_errno;
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, prebuf, postbuf,
+ xdata);
}
UNLOCK (&frame->lock);
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
call_count = afr_frame_return (frame);
if (call_count == 0) {
+ if (local->stable_write && afr_txn_nothing_failed (frame, this))
+ local->transaction.unwind (frame, this);
+
local->transaction.resume (frame, this);
}
@@ -633,6 +678,7 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this)
}
local->call_count = call_count;
+ local->stable_write = _gf_true;
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.pre_op[i]) {
@@ -753,8 +799,8 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)
if (main_frame) {
AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret,
local->op_errno,
- &local->cont.ftruncate.prebuf,
- &local->cont.ftruncate.postbuf,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
NULL);
}
return 0;
@@ -767,14 +813,11 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
int child_index = (long) cookie;
int call_count = -1;
- int need_unwind = 0;
int read_child = 0;
local = frame->local;
- priv = this->private;
read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
@@ -784,38 +827,22 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->read_child_returned = _gf_true;
}
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.ftruncate.prebuf = *prebuf;
- local->cont.ftruncate.postbuf = *postbuf;
- }
-
- if (child_index == read_child) {
- local->cont.ftruncate.prebuf = *prebuf;
- local->cont.ftruncate.postbuf = *postbuf;
- }
-
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
+ if (prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
}
- local->op_errno = op_errno;
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, prebuf, postbuf,
+ xdata);
}
UNLOCK (&frame->lock);
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
call_count = afr_frame_return (frame);
if (call_count == 0) {
+ if (local->stable_write && afr_txn_nothing_failed (frame, this))
+ local->transaction.unwind (frame, this);
+
local->transaction.resume (frame, this);
}
@@ -843,6 +870,7 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this)
}
local->call_count = call_count;
+ local->stable_write = _gf_true;
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.pre_op[i]) {
@@ -942,6 +970,10 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this,
priv = this->private;
+ if (afr_is_split_brain (this, fd->inode)) {
+ op_errno = EIO;
+ goto out;
+ }
QUORUM_CHECK(ftruncate,out);
AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
@@ -954,13 +986,10 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this,
local->cont.ftruncate.offset = offset;
local->fd = fd_ref (fd);
- local->fop_call_continue = afr_do_ftruncate;
- ret = afr_open_fd_fix (frame, this, _gf_true);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
+ afr_open_fd_fix (fd, this);
+
+ afr_do_ftruncate (frame, this);
ret = 0;
out:
@@ -996,8 +1025,8 @@ afr_setattr_unwind (call_frame_t *frame, xlator_t *this)
if (main_frame) {
AFR_STACK_UNWIND (setattr, main_frame, local->op_ret,
local->op_errno,
- &local->cont.setattr.preop_buf,
- &local->cont.setattr.postop_buf,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
NULL);
}
@@ -1028,29 +1057,14 @@ afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->read_child_returned = _gf_true;
}
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.setattr.preop_buf = *preop;
- local->cont.setattr.postop_buf = *postop;
- }
-
- if (child_index == read_child) {
- local->cont.setattr.preop_buf = *preop;
- local->cont.setattr.postop_buf = *postop;
- }
-
- local->success_count++;
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, preop, postop,
+ xdata);
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
+ if ((local->success_count >= priv->wait_count)
+ && local->read_child_returned) {
+ need_unwind = 1;
}
- local->op_errno = op_errno;
}
UNLOCK (&frame->lock);
@@ -1205,8 +1219,8 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this)
if (main_frame) {
AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret,
local->op_errno,
- &local->cont.fsetattr.preop_buf,
- &local->cont.fsetattr.postop_buf,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
NULL);
}
@@ -1237,29 +1251,14 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->read_child_returned = _gf_true;
}
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.fsetattr.preop_buf = *preop;
- local->cont.fsetattr.postop_buf = *postop;
- }
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, preop, postop,
+ xdata);
- if (child_index == read_child) {
- local->cont.fsetattr.preop_buf = *preop;
- local->cont.fsetattr.postop_buf = *postop;
- }
-
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
+ if ((local->success_count >= priv->wait_count)
+ && local->read_child_returned) {
+ need_unwind = 1;
}
- local->op_errno = op_errno;
}
UNLOCK (&frame->lock);
@@ -1347,6 +1346,11 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,
priv = this->private;
+ if (afr_is_split_brain (this, fd->inode)) {
+ op_errno = EIO;
+ goto out;
+ }
+
QUORUM_CHECK(fsetattr,out);
transaction_frame = copy_frame (frame);
@@ -1371,11 +1375,7 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,
local->fd = fd_ref (fd);
- ret = afr_open_fd_fix (transaction_frame, this, _gf_false);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
+ afr_open_fd_fix (fd, this);
local->transaction.main_frame = frame;
local->transaction.start = LLONG_MAX - 1;
@@ -1431,28 +1431,23 @@ int
afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int need_unwind = 0;
+ int child_index = (long) cookie;
local = frame->local;
priv = this->private;
LOCK (&frame->lock);
{
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
+ __inode_write_fop_cbk (frame, child_index, -1, this,
+ &op_ret, &op_errno, NULL, NULL,
+ xdata);
+ if (local->success_count == priv->child_count) {
+ need_unwind = 1;
}
-
- local->op_errno = op_errno;
}
UNLOCK (&frame->lock);
@@ -1621,28 +1616,24 @@ int
afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int need_unwind = 0;
+ int child_index = (long) cookie;
local = frame->local;
priv = this->private;
LOCK (&frame->lock);
{
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
+ __inode_write_fop_cbk (frame, child_index, -1, this,
+ &op_ret, &op_errno, NULL, NULL,
+ xdata);
+ if (local->success_count == priv->child_count) {
+ need_unwind = 1;
}
-
- local->op_errno = op_errno;
}
UNLOCK (&frame->lock);
@@ -1734,6 +1725,11 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,
priv = this->private;
+ if (afr_is_split_brain (this, fd->inode)) {
+ op_errno = EIO;
+ goto out;
+ }
+
QUORUM_CHECK(fsetxattr,out);
AFR_LOCAL_ALLOC_OR_GOTO (local, out);
@@ -1816,28 +1812,23 @@ int
afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int need_unwind = 0;
+ int child_index = (long) cookie;
local = frame->local;
priv = this->private;
LOCK (&frame->lock);
{
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
+ __inode_write_fop_cbk (frame, child_index, -1, this,
+ &op_ret, &op_errno, NULL, NULL,
+ xdata);
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
}
-
- local->op_errno = op_errno;
}
UNLOCK (&frame->lock);
@@ -2005,28 +1996,24 @@ int
afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ int call_count = -1;
+ int need_unwind = 0;
+ int child_index = (long) cookie;
local = frame->local;
priv = this->private;
LOCK (&frame->lock);
{
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
+ __inode_write_fop_cbk (frame, child_index, -1, this,
+ &op_ret, &op_errno, NULL, NULL,
+ xdata);
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
}
-
- local->op_errno = op_errno;
}
UNLOCK (&frame->lock);
@@ -2119,6 +2106,10 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this->private, out);
priv = this->private;
+ if (afr_is_split_brain (this, fd->inode)) {
+ op_errno = EIO;
+ goto out;
+ }
QUORUM_CHECK(fremovexattr, out);
@@ -2167,3 +2158,704 @@ out:
return 0;
}
+
+static int
+afr_fallocate_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret,
+ local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ NULL);
+ }
+ return 0;
+}
+
+static int
+afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int need_unwind = 0;
+ int read_child = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
+
+ LOCK (&frame->lock);
+ {
+ if (child_index == read_child) {
+ local->read_child_returned = _gf_true;
+ }
+
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, prebuf, postbuf,
+ xdata);
+
+ if ((local->success_count >= priv->wait_count)
+ && local->read_child_returned) {
+ need_unwind = 1;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ local->transaction.unwind (frame, this);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+static int
+afr_fallocate_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
+ priv->child_count);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i]) {
+ STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fallocate,
+ local->fd,
+ local->cont.fallocate.mode,
+ local->cont.fallocate.offset,
+ local->cont.fallocate.len,
+ NULL);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int
+afr_fallocate_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+static int
+afr_do_fallocate (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t * transaction_frame = NULL;
+ afr_local_t * local = NULL;
+ int op_ret = -1;
+ int op_errno = 0;
+
+ local = frame->local;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ goto out;
+ }
+
+ transaction_frame->local = local;
+ frame->local = NULL;
+
+ local->op = GF_FOP_FALLOCATE;
+
+ local->transaction.fop = afr_fallocate_wind;
+ local->transaction.done = afr_fallocate_done;
+ local->transaction.unwind = afr_fallocate_unwind;
+
+ local->transaction.main_frame = frame;
+
+ local->transaction.start = local->cont.fallocate.offset;
+ local->transaction.len = 0;
+
+ /* fallocate can modify the file size */
+ op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (op_ret < 0) {
+ op_errno = -op_ret;
+ goto out;
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret < 0) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (fallocate, frame, op_ret, op_errno, NULL,
+ NULL, NULL);
+ }
+
+ return 0;
+}
+
+int
+afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ if (afr_is_split_brain (this, fd->inode)) {
+ op_errno = EIO;
+ goto out;
+ }
+ QUORUM_CHECK(fallocate,out);
+
+ AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
+ local = frame->local;
+
+ ret = afr_local_init (local, priv, &op_errno);
+ if (ret < 0)
+ goto out;
+
+ local->cont.fallocate.mode = mode;
+ local->cont.fallocate.offset = offset;
+ local->cont.fallocate.len = len;
+
+ local->fd = fd_ref (fd);
+
+ afr_open_fd_fix (fd, this);
+
+ afr_do_fallocate (frame, this);
+
+ ret = 0;
+out:
+ if (ret < 0) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ discard */
+
+static int
+afr_discard_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame)
+ main_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ AFR_STACK_UNWIND (discard, main_frame, local->op_ret,
+ local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ NULL);
+ }
+ return 0;
+}
+
+static int
+afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int need_unwind = 0;
+ int read_child = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
+
+ LOCK (&frame->lock);
+ {
+ if (child_index == read_child) {
+ local->read_child_returned = _gf_true;
+ }
+
+ __inode_write_fop_cbk (frame, child_index, read_child, this,
+ &op_ret, &op_errno, prebuf, postbuf,
+ xdata);
+
+ if ((local->success_count >= priv->wait_count)
+ && local->read_child_returned) {
+ need_unwind = 1;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ local->transaction.unwind (frame, this);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+static int
+afr_discard_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
+ priv->child_count);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i]) {
+ STACK_WIND_COOKIE (frame, afr_discard_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->discard,
+ local->fd,
+ local->cont.discard.offset,
+ local->cont.discard.len,
+ NULL);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int
+afr_discard_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+static int
+afr_do_discard (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t * transaction_frame = NULL;
+ afr_local_t * local = NULL;
+ int op_ret = -1;
+ int op_errno = 0;
+
+ local = frame->local;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ goto out;
+ }
+
+ transaction_frame->local = local;
+ frame->local = NULL;
+
+ local->op = GF_FOP_DISCARD;
+
+ local->transaction.fop = afr_discard_wind;
+ local->transaction.done = afr_discard_done;
+ local->transaction.unwind = afr_discard_unwind;
+
+ local->transaction.main_frame = frame;
+
+ local->transaction.start = local->cont.discard.offset;
+ local->transaction.len = 0;
+
+ op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (op_ret < 0) {
+ op_errno = -op_ret;
+ goto out;
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret < 0) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL,
+ NULL, NULL);
+ }
+
+ return 0;
+}
+
+int
+afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ if (afr_is_split_brain (this, fd->inode)) {
+ op_errno = EIO;
+ goto out;
+ }
+ QUORUM_CHECK(discard, out);
+
+ AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
+ local = frame->local;
+
+ ret = afr_local_init (local, priv, &op_errno);
+ if (ret < 0)
+ goto out;
+
+ local->cont.discard.offset = offset;
+ local->cont.discard.len = len;
+
+ local->fd = fd_ref (fd);
+
+ afr_open_fd_fix (fd, this);
+
+ afr_do_discard(frame, this);
+
+ ret = 0;
+out:
+ if (ret < 0) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+ }
+
+ return 0;
+}
+
+
+/* {{{ zerofill */
+
+static int
+afr_zerofill_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.main_frame) {
+ main_frame = local->transaction.main_frame;
+ }
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ if (main_frame) {
+ AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret,
+ local->op_errno,
+ &local->cont.zerofill.prebuf,
+ &local->cont.zerofill.postbuf,
+ NULL);
+ }
+ return 0;
+}
+
+static int
+afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int need_unwind = 0;
+ int read_child = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
+
+ LOCK (&frame->lock);
+ {
+ if (child_index == read_child) {
+ local->read_child_returned = _gf_true;
+ }
+
+ if (afr_fop_failed (op_ret, op_errno)) {
+ afr_transaction_fop_failed (frame, this, child_index);
+ }
+
+ if (op_ret != -1) {
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.zerofill.prebuf = *prebuf;
+ local->cont.zerofill.postbuf = *postbuf;
+ }
+
+ if (child_index == read_child) {
+ local->cont.zerofill.prebuf = *prebuf;
+ local->cont.zerofill.postbuf = *postbuf;
+ }
+
+ local->success_count++;
+
+ if ((local->success_count >= priv->wait_count)
+ && local->read_child_returned) {
+ need_unwind = 1;
+ }
+ }
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind) {
+ local->transaction.unwind (frame, this);
+ }
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+static int
+afr_zerofill_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
+ priv->child_count);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i]) {
+ STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->zerofill,
+ local->fd,
+ local->cont.zerofill.offset,
+ local->cont.zerofill.len,
+ NULL);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int
+afr_zerofill_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->transaction.unwind (frame, this);
+
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+static int
+afr_do_zerofill(call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ int op_ret = -1;
+ int op_errno = 0;
+
+ local = frame->local;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ goto out;
+ }
+
+ transaction_frame->local = local;
+ frame->local = NULL;
+
+ local->op = GF_FOP_ZEROFILL;
+
+ local->transaction.fop = afr_zerofill_wind;
+ local->transaction.done = afr_zerofill_done;
+ local->transaction.unwind = afr_zerofill_unwind;
+
+ local->transaction.main_frame = frame;
+
+ local->transaction.start = local->cont.zerofill.offset;
+ local->transaction.len = 0;
+
+ op_ret = afr_transaction (transaction_frame, this,
+ AFR_DATA_TRANSACTION);
+ if (op_ret < 0) {
+ op_errno = -op_ret;
+ goto out;
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret < 0) {
+ if (transaction_frame) {
+ AFR_STACK_DESTROY (transaction_frame);
+ }
+ AFR_STACK_UNWIND (zerofill, frame, op_ret, op_errno, NULL,
+ NULL, NULL);
+ }
+
+ return 0;
+}
+
+int
+afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ if (afr_is_split_brain (this, fd->inode)) {
+ op_errno = EIO;
+ goto out;
+ }
+ QUORUM_CHECK(zerofill, out);
+
+ AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
+ local = frame->local;
+
+ ret = afr_local_init (local, priv, &op_errno);
+ if (ret < 0) {
+ goto out;
+ }
+ local->cont.zerofill.offset = offset;
+ local->cont.zerofill.len = len;
+
+ local->fd = fd_ref (fd);
+
+ afr_open_fd_fix (fd, this);
+
+ afr_do_zerofill(frame, this);
+
+ ret = 0;
+out:
+ if (ret < 0) {
+ if (transaction_frame) {
+ AFR_STACK_DESTROY (transaction_frame);
+ }
+ AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL,
+ NULL, NULL);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+
diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h
index ed11079fd..8e93ca44a 100644
--- a/xlators/cluster/afr/src/afr-inode-write.h
+++ b/xlators/cluster/afr/src/afr-inode-write.h
@@ -68,4 +68,15 @@ int32_t
afr_fremovexattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, const char *name, dict_t *xdata);
+int
+afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata);
+
+int
+afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata);
+
+int
+afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata);
#endif /* __INODE_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
index 5e61be4d4..060d78f35 100644
--- a/xlators/cluster/afr/src/afr-lk-common.c
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -55,7 +55,33 @@
} while (0);
int
-afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
+afr_entry_lockee_cmp (const void *l1, const void *l2)
+{
+ const afr_entry_lockee_t *r1 = l1;
+ const afr_entry_lockee_t *r2 = l2;
+ int ret = 0;
+ uuid_t gfid1 = {0};
+ uuid_t gfid2 = {0};
+
+ loc_gfid ((loc_t*)&r1->loc, gfid1);
+ loc_gfid ((loc_t*)&r2->loc, gfid2);
+ ret = uuid_compare (gfid1, gfid2);
+ /*Entrylks with NULL basename are the 'smallest'*/
+ if (ret == 0) {
+ if (!r1->basename)
+ return -1;
+ if (!r2->basename)
+ return 1;
+ ret = strcmp (r1->basename, r2->basename);
+ }
+
+ if (ret <= 0)
+ return -1;
+ else
+ return 1;
+}
+
+int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
static int
afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this);
@@ -125,16 +151,9 @@ internal_lock_count (call_frame_t *frame, xlator_t *this)
local = frame->local;
priv = this->private;
- if (local->fd) {
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] && local->fd_open_on[i])
- ++call_count;
- }
- } else {
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i])
- ++call_count;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i])
+ ++call_count;
}
return call_count;
@@ -332,10 +351,13 @@ static void
afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this,
afr_lock_call_type_t lock_call_type,
afr_lock_op_type_t lk_op_type, const char *basename,
- int32_t child_index)
+ int32_t cookie)
{
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = 0;
+ int lockee_no = 0;
char lock[256];
char lockee[256];
@@ -343,28 +365,40 @@ afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this,
local = frame->local;
int_lock = &local->internal_lock;
+ priv = this->private;
+
+ if (!priv->entrylk_trace) {
+ return;
+ }
+ lockee_no = cookie / priv->child_count;
+ child_index = cookie % priv->child_count;
afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner);
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
+ afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd,
+ child_index);
afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
gf_log (this->name, GF_LOG_INFO,
- "[%s %s] Lock={%s} Lockee={%s} Number={%llu}",
+ "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}",
lock_call_type_str,
lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST",
lock, lockee,
- (unsigned long long) int_lock->lock_number);
+ (unsigned long long) int_lock->lock_number,
+ cookie);
}
static void
afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this,
afr_lock_call_type_t lock_call_type,
afr_lock_op_type_t lk_op_type, const char *basename,
- int op_ret, int op_errno, int32_t child_index)
+ int op_ret, int op_errno, int32_t cookie)
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int lockee_no = 0;
+ int child_index = 0;
char lock[256];
char lockee[256];
@@ -373,20 +407,30 @@ afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this,
local = frame->local;
int_lock = &local->internal_lock;
+ priv = this->private;
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
+ if (!priv->entrylk_trace) {
+ return;
+ }
+ lockee_no = cookie / priv->child_count;
+ child_index = cookie % priv->child_count;
+
+ afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner);
+ afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd,
+ child_index);
afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
afr_print_verdict (op_ret, op_errno, verdict);
gf_log (this->name, GF_LOG_INFO,
- "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu}",
+ "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}",
lock_call_type_str,
lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY",
verdict,
lock, lockee,
- (unsigned long long) int_lock->lock_number);
+ (unsigned long long) int_lock->lock_number,
+ cookie);
}
@@ -439,6 +483,47 @@ is_afr_lock_transaction (afr_local_t *local)
return ret;
}
+int
+afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local,
+ loc_t *loc, char *basename, int child_count)
+{
+ int ret = -1;
+
+ loc_copy (&lockee->loc, loc);
+ lockee->basename = (basename)? gf_strdup (basename): NULL;
+ if (basename && !lockee->basename)
+ goto out;
+
+ lockee->locked_count = 0;
+ lockee->locked_nodes = GF_CALLOC (child_count,
+ sizeof (*lockee->locked_nodes),
+ gf_afr_mt_afr_node_character);
+
+ if (!lockee->locked_nodes)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+
+}
+
+void
+afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock)
+{
+ int i = 0;
+
+ for (i = 0; i < int_lock->lockee_count; i++) {
+ loc_wipe (&int_lock->lockee[i].loc);
+ if (int_lock->lockee[i].basename)
+ GF_FREE (int_lock->lockee[i].basename);
+ if (int_lock->lockee[i].locked_nodes)
+ GF_FREE (int_lock->lockee[i].locked_nodes);
+ }
+
+ return;
+}
+
static int
initialize_entrylk_variables (call_frame_t *frame, xlator_t *this)
{
@@ -456,8 +541,13 @@ initialize_entrylk_variables (call_frame_t *frame, xlator_t *this)
int_lock->lock_op_ret = -1;
int_lock->lock_op_errno = 0;
- for (i = 0; i < priv->child_count; i++) {
- int_lock->entry_locked_nodes[i] = 0;
+ for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) {
+ if (!int_lock->lockee[i].locked_nodes)
+ break;
+ int_lock->lockee[i].locked_count = 0;
+ memset (int_lock->lockee[i].locked_nodes, 0,
+ sizeof (*int_lock->lockee[i].locked_nodes) *
+ priv->child_count);
}
return 0;
@@ -469,19 +559,23 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
afr_private_t *priv = NULL;
- int i = 0;
+ afr_inodelk_t *inodelk = NULL;
priv = this->private;
local = frame->local;
int_lock = &local->internal_lock;
- int_lock->inodelk_lock_count = 0;
- int_lock->lock_op_ret = -1;
- int_lock->lock_op_errno = 0;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- for (i = 0; i < priv->child_count; i++) {
- int_lock->inode_locked_nodes[i] = 0;
- }
+ inodelk->lock_count = 0;
+ int_lock->lk_attempted_count = 0;
+ int_lock->lock_op_ret = -1;
+ int_lock->lock_op_errno = 0;
+
+ memset (inodelk->locked_nodes, 0,
+ sizeof (*inodelk->locked_nodes) * priv->child_count);
+ memset (int_lock->locked_nodes, 0,
+ sizeof (*int_lock->locked_nodes) * priv->child_count);
return 0;
}
@@ -503,6 +597,18 @@ lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2)
}
int
+afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock)
+{
+ int call_count = 0;
+ int i = 0;
+
+ for (i = 0; i < int_lock->lockee_count; i++)
+ call_count += int_lock->lockee[i].locked_count;
+
+ return call_count;
+}
+
+int
afr_locked_nodes_count (unsigned char *locked_nodes, int child_count)
{
@@ -550,7 +656,9 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
int32_t child_index = (long)cookie;
+ afr_private_t *priv = NULL;
local = frame->local;
int_lock = &local->internal_lock;
@@ -559,14 +667,18 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
AFR_UNLOCK_OP, NULL, op_ret,
op_errno, child_index);
+ priv = this->private;
+
if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
- gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on %d "
- "unlock by %s", local->loc.path, child_index,
+ gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s "
+ "with lock owner %s", local->loc.path,
+ priv->children[child_index]->name,
lkowner_utoa (&frame->root->lk_owner));
}
- int_lock->inode_locked_nodes[child_index] &= LOCKED_NO;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ inodelk->locked_nodes[child_index] &= LOCKED_NO;
if (local->transaction.eager_lock)
local->transaction.eager_lock[child_index] = 0;
@@ -580,6 +692,7 @@ static int
afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
struct gf_flock flock = {0,};
@@ -595,12 +708,14 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
priv = this->private;
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
flock.l_type = F_UNLCK;
full_flock.l_type = F_UNLCK;
- call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes,
+ call_count = afr_locked_nodes_count (inodelk->locked_nodes,
priv->child_count);
int_lock->lk_call_count = call_count;
@@ -616,8 +731,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
fd_ctx = afr_fd_ctx_get (local->fd, this);
for (i = 0; i < priv->child_count; i++) {
- if ((int_lock->inode_locked_nodes[i] & LOCKED_YES)
- != LOCKED_YES)
+ if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
continue;
if (local->fd) {
@@ -658,7 +772,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long)i,
priv->children[i],
priv->children[i]->fops->finodelk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
F_SETLK, flock_use, NULL);
if (!--call_count)
@@ -673,7 +787,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long)i,
priv->children[i],
priv->children[i]->fops->inodelk,
- this->name, &local->loc,
+ int_lock->domain, &local->loc,
F_SETLK, &flock, NULL);
if (!--call_count)
@@ -689,13 +803,22 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_local_t *local = NULL;
- int32_t child_index = (long)cookie;
+ afr_private_t *priv = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ int32_t child_index = 0;
+ int lockee_no = 0;
+
+ priv = this->private;
+ lockee_no = (int)((long) cookie) / priv->child_count;
+ child_index = (int) ((long) cookie) % priv->child_count;
local = frame->local;
+ int_lock = &local->internal_lock;
AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION,
- AFR_UNLOCK_OP, NULL, op_ret,
- op_errno, child_index);
+ AFR_UNLOCK_OP,
+ int_lock->lockee[lockee_no].basename, op_ret,
+ op_errno, (int) ((long)cookie));
if (op_ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
@@ -703,6 +826,7 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path, child_index, strerror (op_errno));
}
+ int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO;
afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, NULL);
return 0;
@@ -711,24 +835,22 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
static int
afr_unlock_entrylk (call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- const char *basename = NULL;
- loc_t *loc = NULL;
- int call_count = 0;
- int i = -1;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int index = 0;
+ int lockee_no = 0;
+ int copies = 0;
+ int i = -1;
local = frame->local;
int_lock = &local->internal_lock;
priv = this->private;
+ copies = priv->child_count;
- basename = int_lock->lk_basename;
- if (int_lock->lk_loc)
- loc = int_lock->lk_loc;
+ call_count = afr_lockee_locked_nodes_count (int_lock);
- call_count = afr_locked_nodes_count (int_lock->entry_locked_nodes,
- priv->child_count);
int_lock->lk_call_count = call_count;
if (!call_count){
@@ -738,18 +860,22 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this)
goto out;
}
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->entry_locked_nodes[i] & LOCKED_YES) {
- AFR_TRACE_ENTRYLK_IN (frame, this,
- AFR_ENTRYLK_NB_TRANSACTION,
- AFR_UNLOCK_OP, basename, i);
+ for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
+ lockee_no = i / copies;
+ index = i % copies;
+ if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) {
+ AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION,
+ AFR_UNLOCK_OP,
+ int_lock->lockee[lockee_no].basename,
+ i);
STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk,
(void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- loc, basename,
+ priv->children[index],
+ priv->children[index]->fops->entrylk,
+ int_lock->domain,
+ &int_lock->lockee[lockee_no].loc,
+ int_lock->lockee[lockee_no].basename,
ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
if (!--call_count)
@@ -766,13 +892,20 @@ static int32_t
afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- int child_index = (long) cookie;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int cky = (long) cookie;
+ int child_index = 0;
+ int lockee_no = 0;
+ priv = this->private;
local = frame->local;
int_lock = &local->internal_lock;
+ child_index = ((int)cky) % priv->child_count;
+ lockee_no = ((int)cky) / priv->child_count;
+
LOCK (&frame->lock);
{
if (op_ret == -1) {
@@ -788,6 +921,8 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->op_errno = op_errno;
int_lock->lock_op_errno = op_errno;
}
+
+ int_lock->lk_attempted_count++;
}
UNLOCK (&frame->lock);
@@ -796,10 +931,17 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_unlock (frame, this);
} else {
if (op_ret == 0) {
- int_lock->locked_nodes[child_index] |= LOCKED_YES;
- int_lock->lock_count++;
+ if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ int_lock->lockee[lockee_no].locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lockee[lockee_no].locked_count++;
+ int_lock->entrylk_lock_count++;
+ } else {
+ int_lock->locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lock_count++;
+ }
}
- afr_lock_blocking (frame, this, child_index + 1);
+ afr_lock_blocking (frame, this, cky + 1);
}
return 0;
@@ -819,79 +961,6 @@ afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
static int32_t
-afr_lock_lower_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- loc_t *lower = NULL;
- loc_t *higher = NULL;
- const char *higher_name = NULL;
- int child_index = (long) cookie;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
-
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/locks xlator on server");
-
- local->op_ret = op_ret;
- }
-
- local->op_errno = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- if (op_ret != 0) {
- afr_copy_locked_nodes (frame, this);
- afr_unlock (frame, this);
- goto out;
- } else {
- int_lock->lower_locked_nodes[child_index] |= LOCKED_LOWER;
- int_lock->lock_count++;
- }
-
- /* The lower path has been locked. Now lock the higher path */
-
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- higher = (lower == &local->transaction.parent_loc ?
- &local->transaction.new_parent_loc :
- &local->transaction.parent_loc);
-
- higher_name = (higher == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, higher_name, child_index);
-
-
- STACK_WIND_COOKIE (frame, afr_lock_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- this->name, higher, higher_name,
- ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
-
-out:
- return 0;
-}
-
-static int32_t
afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
@@ -907,6 +976,7 @@ static int
afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
@@ -917,18 +987,16 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- memcpy (int_lock->inode_locked_nodes,
- int_lock->locked_nodes,
- priv->child_count);
- int_lock->inodelk_lock_count = int_lock->lock_count;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ memcpy (inodelk->locked_nodes, int_lock->locked_nodes,
+ sizeof (*inodelk->locked_nodes) * priv->child_count);
+ inodelk->lock_count = int_lock->lock_count;
break;
case AFR_ENTRY_RENAME_TRANSACTION:
case AFR_ENTRY_TRANSACTION:
- memcpy (int_lock->entry_locked_nodes,
- int_lock->locked_nodes,
- priv->child_count);
- int_lock->entrylk_lock_count = int_lock->lock_count;
+ /*entrylk_count is being used in both non-blocking and blocking
+ * modes */
break;
}
@@ -936,25 +1004,67 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
}
+static inline gf_boolean_t
+afr_is_entrylk (afr_internal_lock_t *int_lock,
+ afr_transaction_type trans_type)
+{
+ gf_boolean_t is_entrylk = _gf_false;
+
+ if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) &&
+ int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) {
+
+ is_entrylk = _gf_true;
+
+ } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) &&
+ (trans_type == AFR_ENTRY_TRANSACTION ||
+ trans_type == AFR_ENTRY_RENAME_TRANSACTION)) {
+
+ is_entrylk = _gf_true;
+
+ } else {
+ is_entrylk = _gf_false;
+ }
+
+ return is_entrylk;
+}
+
+static gf_boolean_t
+_is_lock_wind_needed (afr_local_t *local, int child_index)
+{
+ if (!local->child_up[child_index])
+ return _gf_false;
+
+ return _gf_true;
+}
+
int
-afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
+afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- loc_t *lower = NULL;
- const char *lower_name = NULL;
struct gf_flock flock = {0,};
uint64_t ctx = 0;
int ret = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = int_lock->lk_flock.l_type;
+ int child_index = 0;
+ int lockee_no = 0;
+ gf_boolean_t is_entrylk = _gf_false;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+ child_index = cookie % priv->child_count;
+ lockee_no = cookie / priv->child_count;
+ is_entrylk = afr_is_entrylk (int_lock, local->transaction.type);
+
+
+ if (!is_entrylk) {
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
+ flock.l_type = inodelk->flock.l_type;
+ }
if (local->fd) {
ret = fd_ctx_get (local->fd, this, &ctx);
@@ -973,42 +1083,26 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
return 0;
}
-
- /* skip over children that or down
- or don't have the fd open */
-
- while ((child_index < priv->child_count)
- && (!local->child_up[child_index] ||
- !local->fd_open_on[child_index]))
-
- child_index++;
- } else {
- /* skip over children that are down */
- while ((child_index < priv->child_count)
- && !local->child_up[child_index])
- child_index++;
}
- if ((child_index == priv->child_count) &&
- int_lock->lock_count == 0) {
-
- gf_log (this->name, GF_LOG_INFO,
- "unable to lock on even one child");
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
+ if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
+ if ((is_entrylk && int_lock->entrylk_lock_count == 0) ||
+ (!is_entrylk && int_lock->lock_count == 0)) {
+ gf_log (this->name, GF_LOG_INFO,
+ "unable to lock on even one child");
- afr_copy_locked_nodes (frame, this);
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
- afr_unlock(frame, this);
+ afr_copy_locked_nodes (frame, this);
- return 0;
+ afr_unlock(frame, this);
+ return 0;
+ }
}
- if ((child_index == priv->child_count)
- || (int_lock->lock_count == int_lock->lk_expected_count)) {
-
+ if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
/* we're done locking */
gf_log (this->name, GF_LOG_DEBUG,
@@ -1021,6 +1115,11 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
return 0;
}
+ if (!_is_lock_wind_needed (local, child_index)) {
+ afr_lock_blocking (frame, this, cookie + 1);
+ return 0;
+ }
+
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
@@ -1035,7 +1134,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
(void *) (long) child_index,
priv->children[child_index],
priv->children[child_index]->fops->finodelk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
F_SETLKW, &flock, NULL);
} else {
@@ -1048,50 +1147,29 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
(void *) (long) child_index,
priv->children[child_index],
priv->children[child_index]->fops->inodelk,
- this->name, &local->loc,
+ int_lock->domain, &local->loc,
F_SETLKW, &flock, NULL);
}
break;
case AFR_ENTRY_RENAME_TRANSACTION:
- {
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- lower_name = (lower == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, lower_name, child_index);
-
-
- STACK_WIND_COOKIE (frame, afr_lock_lower_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- this->name, lower, lower_name,
- ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
-
- break;
- }
-
case AFR_ENTRY_TRANSACTION:
+ /*Accounting for child_index increments on 'down'
+ *and 'fd-less' children */
+
if (local->fd) {
- AFR_TRACE_ENTRYLK_IN (frame, this,
- AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, local->transaction.basename,
- child_index);
+ AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION,
+ AFR_LOCK_OP,
+ int_lock->lockee[lockee_no].basename,
+ cookie);
STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
- (void *) (long) child_index,
+ (void *) (long) cookie,
priv->children[child_index],
priv->children[child_index]->fops->fentrylk,
- this->name, local->fd,
- local->transaction.basename,
+ int_lock->domain, local->fd,
+ int_lock->lockee[lockee_no].basename,
ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
} else {
AFR_TRACE_ENTRYLK_IN (frame, this,
@@ -1100,12 +1178,12 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
child_index);
STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
- (void *) (long) child_index,
+ (void *) (long) cookie,
priv->children[child_index],
priv->children[child_index]->fops->entrylk,
- this->name,
- &local->transaction.parent_loc,
- local->transaction.basename,
+ int_lock->domain,
+ &int_lock->lockee[lockee_no].loc,
+ int_lock->lockee[lockee_no].basename,
ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
}
@@ -1134,11 +1212,12 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this)
break;
case AFR_ENTRY_RENAME_TRANSACTION:
+ case AFR_ENTRY_TRANSACTION:
up_count = afr_up_children_count (local->child_up,
priv->child_count);
- int_lock->lk_expected_count = 2 * up_count;
- //fallthrough
- case AFR_ENTRY_TRANSACTION:
+ int_lock->lk_call_count = int_lock->lk_expected_count
+ = (int_lock->lockee_count *
+ up_count);
initialize_entrylk_variables (frame, this);
break;
}
@@ -1154,35 +1233,48 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
+ int call_count = 0;
+ int child_index = (long) cookie;
+ int copies = 0;
+ int index = 0;
+ int lockee_no = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ copies = priv->child_count;
+ index = child_index % copies;
+ lockee_no = child_index / copies;
local = frame->local;
int_lock = &local->internal_lock;
AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, NULL, op_ret,
+ AFR_LOCK_OP,
+ int_lock->lockee[lockee_no].basename, op_ret,
op_errno, (long) cookie);
- if (op_ret < 0 ) {
- if (op_errno == ENOSYS) {
+ LOCK (&frame->lock);
+ {
+ if (op_ret < 0 ) {
+ if (op_errno == ENOSYS) {
/* return ENOTSUP */
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/locks xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
-
- int_lock->lock_op_errno = op_errno;
- local->op_errno = op_errno;
- }
- } else if (op_ret == 0) {
- int_lock->entry_locked_nodes[child_index] |= LOCKED_YES;
- int_lock->entrylk_lock_count++;
- }
+ gf_log (this->name, GF_LOG_ERROR,
+ "subvolume does not support locking. "
+ "please load features/locks xlator on server");
+ local->op_ret = op_ret;
+ int_lock->lock_op_ret = op_ret;
+
+ int_lock->lock_op_errno = op_errno;
+ local->op_errno = op_errno;
+ }
+ } else if (op_ret == 0) {
+ int_lock->lockee[lockee_no].locked_nodes[index] |= \
+ LOCKED_YES;
+ int_lock->lockee[lockee_no].locked_count++;
+ int_lock->entrylk_lock_count++;
+ }
- LOCK (&frame->lock);
- {
call_count = --int_lock->lk_call_count;
}
UNLOCK (&frame->lock);
@@ -1212,42 +1304,26 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-void
-afr_mark_fd_open_on (afr_local_t *local, afr_fd_ctx_t *fd_ctx,
- size_t child_count)
-{
- int i = 0;
-
- GF_ASSERT (local->fd_open_on);
-
- memset (local->fd_open_on, 0, sizeof (*local->fd_open_on)*child_count);
- for (i = 0; i < child_count; i++)
- if (fd_ctx->opened_on[i] == AFR_FD_OPENED)
- local->fd_open_on[i] = 1;
-}
-
int
afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- const char *basename = NULL;
- loc_t *loc = NULL;
- int32_t call_count = 0;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int copies = 0;
+ int index = 0;
+ int lockee_no = 0;
+ int32_t call_count = 0;
int i = 0;
local = frame->local;
int_lock = &local->internal_lock;
priv = this->private;
+ copies = priv->child_count;
initialize_entrylk_variables (frame, this);
- basename = int_lock->lk_basename;
- if (int_lock->lk_loc)
- loc = int_lock->lk_loc;
-
if (local->fd) {
fd_ctx = afr_fd_ctx_get (local->fd, this);
if (!fd_ctx) {
@@ -1260,11 +1336,11 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
local->op_errno = EINVAL;
int_lock->lock_op_errno = EINVAL;
+ afr_unlock (frame, this);
return -1;
}
- afr_mark_fd_open_on (local, fd_ctx, priv->child_count);
- call_count = internal_lock_count (frame, this);
+ call_count = int_lock->lockee_count * internal_lock_count (frame, this);
int_lock->lk_call_count = call_count;
int_lock->lk_expected_count = call_count;
@@ -1277,46 +1353,52 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
/* Send non-blocking entrylk calls only on up children
and where the fd has been opened */
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] && local->fd_open_on[i]) {
- AFR_TRACE_ENTRYLK_IN (frame, this,
- AFR_ENTRYLK_NB_TRANSACTION,
- AFR_LOCK_OP, basename, i);
+ for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) {
+ index = i%copies;
+ lockee_no = i/copies;
+ if (local->child_up[index]) {
+ AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION,
+ AFR_LOCK_OP,
+ int_lock->lockee[lockee_no].basename,
+ i);
STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
(void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fentrylk,
+ priv->children[index],
+ priv->children[index]->fops->fentrylk,
this->name, local->fd,
- basename,
+ int_lock->lockee[lockee_no].basename,
ENTRYLK_LOCK_NB, ENTRYLK_WRLCK,
NULL);
+ if (!--call_count)
+ break;
}
}
} else {
- GF_ASSERT (loc);
-
- call_count = internal_lock_count (frame, this);
+ call_count = int_lock->lockee_count * internal_lock_count (frame, this);
int_lock->lk_call_count = call_count;
int_lock->lk_expected_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- AFR_TRACE_ENTRYLK_IN (frame, this,
- AFR_ENTRYLK_NB_TRANSACTION,
- AFR_LOCK_OP, basename, i);
+ for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) {
+ index = i%copies;
+ lockee_no = i/copies;
+ if (local->child_up[index]) {
+ AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION,
+ AFR_LOCK_OP,
+ int_lock->lockee[lockee_no].basename,
+ i);
STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
(void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name, loc, basename,
+ priv->children[index],
+ priv->children[index]->fops->entrylk,
+ this->name, &int_lock->lockee[lockee_no].loc,
+ int_lock->lockee[lockee_no].basename,
ENTRYLK_LOCK_NB, ENTRYLK_WRLCK,
NULL);
if (!--call_count)
break;
-
}
}
}
@@ -1329,6 +1411,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
int call_count = 0;
int child_index = (long) cookie;
@@ -1337,58 +1420,57 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
int_lock = &local->internal_lock;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION,
AFR_LOCK_OP, NULL, op_ret,
op_errno, (long) cookie);
- if (op_ret < 0) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/locks xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
- int_lock->lock_op_errno = op_errno;
- local->op_errno = op_errno;
- }
- if (local->transaction.eager_lock)
- local->transaction.eager_lock[child_index] = 0;
- } else {
- int_lock->inode_locked_nodes[child_index]
- |= LOCKED_YES;
- int_lock->inodelk_lock_count++;
-
- if (local->transaction.eager_lock &&
- local->transaction.eager_lock[child_index] && local->fd) {
- fd_ctx = afr_fd_ctx_get (local->fd, this);
- /* piggybacked */
-
- if (op_ret == 1) {
- /* piggybacked */
- } else if (op_ret == 0) {
- /* lock acquired from server */
- LOCK (&local->fd->lock);
- {
- fd_ctx->lock_acquired[child_index]++;
- }
- UNLOCK (&local->fd->lock);
- }
- }
- }
+ if (local->fd)
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
LOCK (&frame->lock);
{
+ if (op_ret < 0) {
+ if (op_errno == ENOSYS) {
+ /* return ENOTSUP */
+ gf_log (this->name, GF_LOG_ERROR,
+ "subvolume does not support locking. "
+ "please load features/locks xlator on "
+ "server");
+ local->op_ret = op_ret;
+ int_lock->lock_op_ret = op_ret;
+ int_lock->lock_op_errno = op_errno;
+ local->op_errno = op_errno;
+ }
+ if (local->transaction.eager_lock)
+ local->transaction.eager_lock[child_index] = 0;
+ } else {
+ inodelk->locked_nodes[child_index] |= LOCKED_YES;
+ inodelk->lock_count++;
+
+ if (local->transaction.eager_lock &&
+ local->transaction.eager_lock[child_index] &&
+ local->fd) {
+ /* piggybacked */
+ if (op_ret == 1) {
+ /* piggybacked */
+ } else if (op_ret == 0) {
+ /* lock acquired from server */
+ fd_ctx->lock_acquired[child_index]++;
+ }
+ }
+ }
+
call_count = --int_lock->lk_call_count;
}
UNLOCK (&frame->lock);
+
if (call_count == 0) {
gf_log (this->name, GF_LOG_TRACE,
"Last inode locking reply received");
/* all locks successful. Proceed to call FOP */
- if (int_lock->inodelk_lock_count ==
- int_lock->lk_expected_count) {
+ if (inodelk->lock_count == int_lock->lk_expected_count) {
gf_log (this->name, GF_LOG_TRACE,
"All servers locked. Calling the cbk");
int_lock->lock_op_ret = 0;
@@ -1412,6 +1494,7 @@ int
afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
afr_fd_ctx_t *fd_ctx = NULL;
@@ -1427,11 +1510,13 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
priv = this->private;
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = int_lock->lk_flock.l_type;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
+ flock.l_type = inodelk->flock.l_type;
- full_flock.l_type = int_lock->lk_flock.l_type;
+ full_flock.l_type = inodelk->flock.l_type;
initialize_inodelk_variables (frame, this);
@@ -1447,11 +1532,11 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
local->op_errno = EINVAL;
int_lock->lock_op_errno = EINVAL;
+ afr_unlock (frame, this);
ret = -1;
goto out;
}
- afr_mark_fd_open_on (local, fd_ctx, priv->child_count);
call_count = internal_lock_count (frame, this);
int_lock->lk_call_count = call_count;
int_lock->lk_expected_count = call_count;
@@ -1466,11 +1551,11 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
/* Send non-blocking inodelk calls only on up children
and where the fd has been opened */
for (i = 0; i < priv->child_count; i++) {
- if (!local->child_up[i] || !local->fd_open_on[i])
+ if (!local->child_up[i])
continue;
flock_use = &flock;
- if (!priv->eager_lock) {
+ if (!local->transaction.eager_lock_on) {
goto wind;
}
@@ -1506,7 +1591,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long) i,
priv->children[i],
priv->children[i]->fops->finodelk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
F_SETLK, flock_use, NULL);
if (!--call_count)
@@ -1528,7 +1613,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long) i,
priv->children[i],
priv->children[i]->fops->inodelk,
- this->name, &local->loc,
+ int_lock->domain, &local->loc,
F_SETLK, &flock, NULL);
if (!--call_count)
@@ -1539,201 +1624,6 @@ out:
return ret;
}
-static int
-__is_lower_locked (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int count = 0;
- int i = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER)
- count++;
- }
-
- return count;
-
-}
-
-static int
-__is_higher_locked (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int count = 0;
- int i = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->locked_nodes[i] & LOCKED_YES)
- count++;
- }
-
- return count;
-
-}
-
-static int
-afr_unlock_lower_entrylk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- const char *basename = NULL;
- loc_t *loc = NULL;
- int call_count = 0;
- int i = -1;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- basename = int_lock->lk_basename;
- if (int_lock->lk_loc)
- loc = int_lock->lk_loc;
-
- call_count = __is_lower_locked (frame, this);
- int_lock->lk_call_count = call_count;
-
- if (!call_count){
- gf_log (this->name, GF_LOG_TRACE,
- "No internal locks unlocked");
- int_lock->lock_cbk (frame, this);
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) {
- AFR_TRACE_ENTRYLK_IN (frame, this,
- AFR_ENTRYLK_NB_TRANSACTION,
- AFR_UNLOCK_OP, basename, i);
-
- STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- loc, basename,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
-
- if (!--call_count)
- break;
-
- }
- }
-
-out:
- return 0;
-
-}
-
-
-static int
-afr_post_unlock_higher_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.done (frame, this);
- return 0;
-}
-
-static int
-afr_post_unlock_lower_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- loc_t *lower = NULL;
- loc_t *higher = NULL;
- const char *higher_name = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- higher = (lower == &local->transaction.parent_loc ?
- &local->transaction.new_parent_loc :
- &local->transaction.parent_loc);
-
- higher_name = (higher == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- if (__is_higher_locked (frame, this)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlocking higher");
- int_lock->lk_basename = higher_name;
- int_lock->lk_loc = higher;
- int_lock->lock_cbk = afr_post_unlock_higher_cbk;
-
- afr_unlock_entrylk (frame, this);
- } else
- local->transaction.done (frame, this);
-
- return 0;
-}
-
-static int
-afr_rename_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- loc_t *lower = NULL;
- const char *lower_name = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- lower_name = (lower == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- if (__is_lower_locked (frame, this)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlocking lower");
- int_lock->lk_basename = lower_name;
- int_lock->lk_loc = lower;
- int_lock->lock_cbk = afr_post_unlock_lower_cbk;
-
- afr_unlock_lower_entrylk (frame, this);
- } else
- afr_post_unlock_lower_cbk (frame, this);
-
- return 0;
-}
-
-static int
-afr_rename_transaction (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- return (local->transaction.type ==
- AFR_ENTRY_RENAME_TRANSACTION);
-
-}
-
int32_t
afr_unlock (call_frame_t *frame, xlator_t *this)
{
@@ -1745,10 +1635,8 @@ afr_unlock (call_frame_t *frame, xlator_t *this)
if (is_afr_lock_transaction (local))
afr_unlock_inodelk (frame, this);
else
- if (!afr_rename_transaction (frame, this))
- afr_unlock_entrylk (frame, this);
- else
- afr_rename_unlock (frame, this);
+ afr_unlock_entrylk (frame, this);
+
} else {
if (is_afr_lock_selfheal (local))
afr_unlock_inodelk (frame, this);
@@ -2249,29 +2137,38 @@ out:
return ret;
}
-void
-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src,
+int
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
unsigned int child_count)
{
- afr_local_t *dst_local = NULL;
- afr_local_t *src_local = NULL;
- afr_internal_lock_t *dst_lock = NULL;
- afr_internal_lock_t *src_lock = NULL;
+ afr_local_t *dst_local = NULL;
+ afr_local_t *src_local = NULL;
+ afr_internal_lock_t *dst_lock = NULL;
+ afr_internal_lock_t *src_lock = NULL;
+ afr_inodelk_t *dst_inodelk = NULL;
+ afr_inodelk_t *src_inodelk = NULL;
+ int ret = -1;
- dst_local = dst->local;
- dst_lock = &dst_local->internal_lock;
src_local = src->local;
src_lock = &src_local->internal_lock;
- if (src_lock->inode_locked_nodes) {
- memcpy (dst_lock->inode_locked_nodes,
- src_lock->inode_locked_nodes,
- sizeof (*dst_lock->inode_locked_nodes) * child_count);
- memset (src_lock->inode_locked_nodes, 0,
- sizeof (*src_lock->inode_locked_nodes) * child_count);
+ src_inodelk = afr_get_inodelk (src_lock, dom);
+ dst_local = dst->local;
+ dst_lock = &dst_local->internal_lock;
+ dst_inodelk = afr_get_inodelk (dst_lock, dom);
+ if (!dst_inodelk || !src_inodelk)
+ goto out;
+ if (src_inodelk->locked_nodes) {
+ memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes,
+ sizeof (*dst_inodelk->locked_nodes) * child_count);
+ memset (src_inodelk->locked_nodes, 0,
+ sizeof (*src_inodelk->locked_nodes) * child_count);
}
dst_lock->transaction_lk_type = src_lock->transaction_lk_type;
dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type;
- dst_lock->inodelk_lock_count = src_lock->inodelk_lock_count;
- src_lock->inodelk_lock_count = 0;
+ dst_inodelk->lock_count = src_inodelk->lock_count;
+ src_inodelk->lock_count = 0;
+ ret = 0;
+out:
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index e01ab366f..73594f265 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -41,7 +41,10 @@ enum gf_afr_mem_types_ {
gf_afr_mt_shd_event_t,
gf_afr_mt_time_t,
gf_afr_mt_pos_data_t,
- gf_afr_mt_reply_t,
+ gf_afr_mt_reply_t,
+ gf_afr_mt_stats_t,
+ gf_afr_mt_shd_crawl_event_t,
+ gf_afr_mt_uint64_t,
gf_afr_mt_end
};
#endif
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
index c0be197f2..643a5d692 100644
--- a/xlators/cluster/afr/src/afr-open.c
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -249,189 +249,126 @@ out:
return 0;
}
-//NOTE: this function should be called with holding the lock on
-//fd to which fd_ctx belongs
-void
-afr_get_resumable_calls (xlator_t *this, afr_fd_ctx_t *fd_ctx,
- struct list_head *list)
-{
- afr_fd_paused_call_t *paused_call = NULL;
- afr_fd_paused_call_t *tmp = NULL;
- afr_local_t *call_local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- gf_boolean_t call = _gf_false;
-
- priv = this->private;
- list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls,
- call_list) {
- call = _gf_true;
- call_local = paused_call->frame->local;
- for (i = 0; i < priv->child_count; i++) {
- if (call_local->child_up[i] &&
- (fd_ctx->opened_on[i] == AFR_FD_OPENING))
- call = _gf_false;
- }
-
- if (call) {
- list_del_init (&paused_call->call_list);
- list_add (&paused_call->call_list, list);
- }
- }
-}
-
-void
-afr_resume_calls (xlator_t *this, struct list_head *list)
-{
- afr_fd_paused_call_t *paused_call = NULL;
- afr_fd_paused_call_t *tmp = NULL;
- afr_local_t *call_local = NULL;
-
- list_for_each_entry_safe (paused_call, tmp, list, call_list) {
- list_del_init (&paused_call->call_list);
- call_local = paused_call->frame->local;
- call_local->fop_call_continue (paused_call->frame, this);
- GF_FREE (paused_call);
- }
-}
-
int
afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
- struct list_head paused_calls = {0};
- gf_boolean_t fop_paused = _gf_false;
- fd_t *local_fd = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int call_count = 0;
+ int child_index = (long) cookie;
priv = this->private;
local = frame->local;
- fop_paused = local->fop_paused;
if (op_ret >= 0) {
- gf_log (this->name, GF_LOG_INFO, "fd for %s opened "
+ gf_log (this->name, GF_LOG_DEBUG, "fd for %s opened "
"successfully on subvolume %s", local->loc.path,
priv->children[child_index]->name);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to open %s "
+ "on subvolume %s", local->loc.path,
+ priv->children[child_index]->name);
}
- local_fd = fd_ref (local->fd);
- call_count = afr_frame_return (frame);
- //Note: Do not access any thing using the frame outside call_count 0
-
- //Note: No frame locking needed for this block of code
- fd_ctx = afr_fd_ctx_get (local_fd, this);
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
if (!fd_ctx) {
gf_log (this->name, GF_LOG_WARNING,
- "failed to get fd context, %p", local_fd);
+ "failed to get fd context, %p", local->fd);
goto out;
}
- LOCK (&local_fd->lock);
+ LOCK (&local->fd->lock);
{
if (op_ret >= 0) {
fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
} else {
fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
}
- if (call_count == 0) {
- INIT_LIST_HEAD (&paused_calls);
- afr_get_resumable_calls (this, fd_ctx, &paused_calls);
- }
}
- UNLOCK (&local_fd->lock);
+ UNLOCK (&local->fd->lock);
out:
- if (call_count == 0) {
- afr_resume_calls (this, &paused_calls);
- //If the fop is paused then resume_calls will continue the fop
- if (fop_paused)
- goto done;
-
- if (local->fop_call_continue)
- local->fop_call_continue (frame, this);
- else
- AFR_STACK_DESTROY (frame);
- }
+ call_count = afr_frame_return (frame);
+ if (call_count == 0)
+ AFR_STACK_DESTROY (frame);
-done:
- fd_unref (local_fd);
return 0;
}
-int
-afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx,
- int need_open_count, int *need_open)
+void
+afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- call_frame_t *open_frame = NULL;
- afr_local_t *open_local = NULL;
- int ret = -1;
- ia_type_t ia_type = IA_INVAL;
- int32_t op_errno = 0;
-
- GF_ASSERT (fd_ctx);
- GF_ASSERT (need_open_count > 0);
- GF_ASSERT (need_open);
+ afr_private_t *priv = NULL;
+ int i = 0;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int32_t op_errno = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
- local = frame->local;
priv = this->private;
- if (!local->fop_call_continue) {
- open_frame = copy_frame (frame);
- if (!open_frame) {
- ret = -ENOMEM;
- goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (open_frame->local, out);
- open_local = open_frame->local;
- ret = afr_local_init (open_local, priv, &op_errno);
- if (ret < 0)
- goto out;
- loc_copy (&open_local->loc, &local->loc);
- open_local->fd = fd_ref (local->fd);
- } else {
- ret = 0;
- open_frame = frame;
- open_local = local;
+
+ if (!afr_is_fd_fixable (fd) || !need_open || !need_open_count)
+ goto out;
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx) {
+ ret = -1;
+ goto out;
}
- open_local->call_count = need_open_count;
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame) {
+ ret = -1;
+ goto out;
+ }
- gf_log (this->name, GF_LOG_DEBUG, "need open count: %d",
+ AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
+ local = frame->local;
+ ret = afr_local_init (local, priv, &op_errno);
+ if (ret < 0)
+ goto out;
+
+ local->loc.inode = inode_ref (fd->inode);
+ ret = loc_path (&local->loc, NULL);
+ if (ret < 0)
+ goto out;
+
+ local->fd = fd_ref (fd);
+ local->call_count = need_open_count;
+
+ gf_log (this->name, GF_LOG_DEBUG, "need open count: %zd",
need_open_count);
- ia_type = open_local->fd->inode->ia_type;
- GF_ASSERT (ia_type != IA_INVAL);
for (i = 0; i < priv->child_count; i++) {
if (!need_open[i])
continue;
- if (IA_IFDIR == ia_type) {
+
+ if (IA_IFDIR == fd->inode->ia_type) {
gf_log (this->name, GF_LOG_DEBUG,
"opening fd for dir %s on subvolume %s",
local->loc.path, priv->children[i]->name);
- STACK_WIND_COOKIE (open_frame, afr_openfd_fix_open_cbk,
+ STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk,
(void*) (long) i,
priv->children[i],
priv->children[i]->fops->opendir,
- &open_local->loc, open_local->fd,
+ &local->loc, local->fd,
NULL);
} else {
gf_log (this->name, GF_LOG_DEBUG,
"opening fd for file %s on subvolume %s",
local->loc.path, priv->children[i]->name);
- STACK_WIND_COOKIE (open_frame, afr_openfd_fix_open_cbk,
+ STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk,
(void *)(long) i,
priv->children[i],
priv->children[i]->fops->open,
- &open_local->loc,
+ &local->loc,
fd_ctx->flags & (~O_TRUNC),
- open_local->fd, NULL);
+ local->fd, NULL);
}
}
@@ -439,8 +376,7 @@ afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx,
ret = 0;
out:
if (op_errno)
- ret = -op_errno;
- if (ret && open_frame)
- AFR_STACK_DESTROY (open_frame);
- return ret;
+ ret = -1; //For handling ALLOC_OR_GOTO
+ if (ret && frame)
+ AFR_STACK_DESTROY (frame);
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
index 1721fd270..83846f152 100644
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c
+++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
@@ -100,7 +100,7 @@ sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this,
}
sh_private_cleanup (sh_frame, this);
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
GF_ASSERT (!last_loop_frame);
//loop_finish should have happened and the old_loop should be NULL
gf_log (this->name, GF_LOG_DEBUG,
@@ -143,7 +143,7 @@ sh_loop_finish (call_frame_t *loop_frame, xlator_t *this)
}
if (loop_sh && loop_sh->data_lock_held) {
- afr_sh_data_unlock (loop_frame, this,
+ afr_sh_data_unlock (loop_frame, this, this->name,
sh_destroy_frame);
} else {
sh_destroy_frame (loop_frame, this);
@@ -214,7 +214,7 @@ sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this,
goto out;
//We want the frame to have same lk_owner as sh_frame
//so that locks translator allows conflicting locks
- new_loop_local = afr_local_copy (local, this);
+ new_loop_local = afr_self_heal_local_init (local, this);
if (!new_loop_local)
goto out;
new_loop_frame->local = new_loop_local;
@@ -273,10 +273,10 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset,
new_loop_sh->offset = offset;
new_loop_sh->block_size = sh->block_size;
afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size,
- _gf_true, sh_loop_lock_success, sh_loop_lock_failure);
+ _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure);
return 0;
out:
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
if (old_loop_frame)
sh_loop_finish (old_loop_frame, this);
sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM);
@@ -307,8 +307,9 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this,
sh_priv->loops_running--;
offset = sh_priv->offset;
block_size = sh->block_size;
- while ((!sh->eof_reached) && (0 == sh->op_failed) &&
- (sh_priv->loops_running < priv->data_self_heal_window_size)
+ while ((!sh->eof_reached) &&
+ (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) &&
+ (sh_priv->loops_running < priv->data_self_heal_window_size)
&& (sh_priv->offset < sh->file_size)) {
loop++;
@@ -327,7 +328,8 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this,
if (0 == loop) {
//loop finish does unlock, but the erasing of the pending
//xattrs needs to happen before that so do not finish the loop
- if (is_driver_done && !sh->op_failed)
+ if (is_driver_done &&
+ !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC))
goto driver_done;
if (old_loop_frame) {
sh_loop_finish (old_loop_frame, this);
@@ -338,7 +340,7 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this,
//If we have more loops to form we should finish previous loop after
//the next loop lock
while (loop--) {
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
// op failed in other loop, stop spawning more loops
if (old_loop_frame) {
sh_loop_finish (old_loop_frame, this);
@@ -384,7 +386,7 @@ sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame
}
if (op_ret == -1) {
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_set_error (sh, op_errno);
if (loop_frame) {
sh_loop_finish (loop_frame, this);
@@ -432,16 +434,16 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
priv->children[child_index]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_set_error (loop_sh, op_errno);
} else if (op_ret < loop_local->cont.writev.vector->iov_len) {
- gf_log(this->name, GF_LOG_ERROR,
- "incomplete write to %s on subvolume %s "
- "(expected %lu, returned %d)", sh_local->loc.path,
- priv->children[child_index]->name,
- loop_local->cont.writev.vector->iov_len, op_ret);
- sh->op_failed = 1;
- }
+ gf_log (this->name, GF_LOG_ERROR,
+ "incomplete write to %s on subvolume %s "
+ "(expected %lu, returned %d)", sh_local->loc.path,
+ priv->children[child_index]->name,
+ loop_local->cont.writev.vector->iov_len, op_ret);
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
+ }
call_count = afr_frame_return (loop_frame);
@@ -514,7 +516,7 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie,
if (op_ret <= 0) {
if (op_ret < 0) {
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
gf_log (this->name, GF_LOG_ERROR, "read failed on %d "
"for %s reason :%s", sh->source,
sh_local->loc.path, strerror (errno));
@@ -624,7 +626,7 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
"checksum on %s failed on subvolume %s (%s)",
sh_local->loc.path, priv->children[child_index]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
} else {
memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH,
strong_checksum, MD5_DIGEST_LENGTH);
@@ -662,7 +664,8 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
}
UNLOCK (&sh_priv->lock);
- if (write_needed && !sh->op_failed) {
+ if (write_needed &&
+ !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
sh_loop_read (loop_frame, this);
} else {
sh_loop_return (sh_frame, this, loop_frame,
@@ -751,14 +754,15 @@ out:
return sh_priv;
}
-void
-afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src,
+int
+afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom,
unsigned int child_count)
{
afr_local_t *dst_local = NULL;
afr_self_heal_t *dst_sh = NULL;
afr_local_t *src_local = NULL;
afr_self_heal_t *src_sh = NULL;
+ int ret = -1;
dst_local = dst->local;
dst_sh = &dst_local->self_heal;
@@ -766,9 +770,12 @@ afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src,
src_sh = &src_local->self_heal;
GF_ASSERT (src_sh->data_lock_held);
GF_ASSERT (!dst_sh->data_lock_held);
- afr_lk_transfer_datalock (dst, src, child_count);
+ ret = afr_lk_transfer_datalock (dst, src, dom, child_count);
+ if (ret)
+ return ret;
src_sh->data_lock_held = _gf_false;
dst_sh->data_lock_held = _gf_true;
+ return 0;
}
int
@@ -790,7 +797,10 @@ afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this,
ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame);
if (ret)
goto out;
- afr_sh_transfer_lock (first_loop_frame, sh_frame, priv->child_count);
+ ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name,
+ priv->child_count);
+ if (ret)
+ goto out;
sh->private = afr_sh_priv_init ();
if (!sh->private) {
ret = -1;
@@ -800,7 +810,7 @@ afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this,
ret = 0;
out:
if (ret) {
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
sh_loop_driver_done (sh_frame, this, NULL);
}
return 0;
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 07603c5b2..ef92b4205 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -18,6 +18,28 @@
#include "afr-self-heal.h"
#include "pump.h"
+#define ADD_FMT_STRING(msg, off, sh_str, status, print_log) \
+ do { \
+ if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) { \
+ off += snprintf (msg + off, sizeof (msg) - off, \
+ " "sh_str" self heal %s,", \
+ get_sh_completion_status (status));\
+ print_log = 1; \
+ } \
+ } while (0)
+
+#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log) \
+ do { \
+ if (AFR_SELF_HEAL_SYNC_BEGIN == status || \
+ AFR_SELF_HEAL_FAILED == status) { \
+ off += snprintf (msg + off, sizeof (msg) - off, \
+ " "sh_str" self heal %s,", \
+ get_sh_completion_status (status));\
+ print_log = 1; \
+ } \
+ } while (0)
+
+
void
afr_sh_reset (call_frame_t *frame, xlator_t *this)
{
@@ -112,8 +134,8 @@ void
afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno)
{
sh->op_ret = -1;
- if (afr_error_more_important (sh->op_errno, op_errno))
- sh->op_errno = op_errno;
+ sh->op_errno = afr_most_important_error(sh->op_errno, op_errno,
+ _gf_false);
}
void
@@ -141,6 +163,79 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
GF_FREE (buf);
}
+char*
+afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this)
+{
+ afr_private_t * priv = this->private;
+ char *buf = NULL;
+ char *ptr = NULL;
+ int i = 0;
+ int j = 0;
+ int child_count = priv->child_count;
+ char *matrix_begin = "[ [ ";
+ char *matrix_end = "] ]";
+ char *seperator = "] [ ";
+ int pending_entry_strlen = 12; //Including space after entry
+ int matrix_begin_strlen = 0;
+ int matrix_end_strlen = 0;
+ int seperator_strlen = 0;
+ int string_length = 0;
+ char *msg = "- Pending matrix: ";
+
+ /*
+ * for a list of lists of [ [ a b ] [ c d ] ]
+ * */
+
+ matrix_begin_strlen = strlen (matrix_begin);
+ matrix_end_strlen = strlen (matrix_end);
+ seperator_strlen = strlen (seperator);
+ string_length = matrix_begin_strlen + matrix_end_strlen
+ + (child_count -1) * seperator_strlen
+ + (child_count * child_count * pending_entry_strlen);
+
+ buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char);
+ if (!buf)
+ goto out;
+
+ ptr = buf;
+ ptr += sprintf (ptr, "%s", msg);
+ ptr += sprintf (ptr, "%s", matrix_begin);
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ ptr += sprintf (ptr, "%d ", pending_matrix[i][j]);
+ }
+ if (i < priv->child_count -1)
+ ptr += sprintf (ptr, "%s", seperator);
+ }
+
+ ptr += sprintf (ptr, "%s", matrix_end);
+
+out:
+ return buf;
+}
+
+void
+afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this,
+ const char *loc)
+{
+ char *buf = NULL;
+ char *free_ptr = NULL;
+
+ buf = afr_get_pending_matrix_str (pending_matrix, this);
+ if (buf)
+ free_ptr = buf;
+ else
+ buf = "";
+
+
+ gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'"
+ " (possible split-brain). Please delete the file from all but "
+ "the preferred subvolume.%s", loc, buf);
+ GF_FREE (free_ptr);
+ return;
+}
+
+
void
afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count)
{
@@ -406,6 +501,8 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses,
{
int i = 0;
int biggest_witness = -1;
+ int biggest_witness_idx = -1;
+ int biggest_witness_cnt = -1;
GF_ASSERT (witnesses);
GF_ASSERT (characters);
@@ -415,10 +512,21 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses,
if (characters[i].type != AFR_NODE_FOOL)
continue;
- if (biggest_witness < witnesses[i])
+ if (biggest_witness < witnesses[i]) {
biggest_witness = witnesses[i];
+ biggest_witness_idx = i;
+ biggest_witness_cnt = 1;
+ continue;
+ }
+
+ if (biggest_witness == witnesses[i])
+ biggest_witness_cnt++;
}
- return biggest_witness;
+
+ if (biggest_witness_cnt != 1)
+ return -1;
+
+ return biggest_witness_idx;
}
int
@@ -446,10 +554,84 @@ afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses,
return nsources;
}
+
+int
+afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx)
+{
+ if (idx >= 0 && idx < child_count) {
+ sources[idx] = 1;
+ return 1;
+ }
+ return 0;
+}
+
+
+static int
+afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children,
+ int child_count)
+{
+ int idx = -1;
+ int i = -1;
+ int child = -1;
+ uint64_t max_size = 0;
+ uint64_t min_size = 0;
+ int num_children = 0;
+
+ for (i = 0; i < child_count; i++) {
+ if (success_children[i] == -1)
+ break;
+
+ child = success_children[i];
+ if (bufs[child].ia_size > max_size) {
+ max_size = bufs[child].ia_size;
+ idx = child;
+ }
+
+ if ((num_children == 0) || (bufs[child].ia_size < min_size)) {
+ min_size = bufs[child].ia_size;
+ }
+
+ num_children++;
+ }
+
+ /* If sizes are same for all of them, finding sources will have to
+ * happen with pending changelog. So return -1
+ */
+ if ((num_children > 1) && (min_size == max_size))
+ return -1;
+ return idx;
+}
+
+
+static int
+afr_find_newest_file (struct iatt *bufs, int32_t *success_children,
+ int child_count)
+{
+ int idx = -1;
+ int i = -1;
+ int child = -1;
+ uint64_t max_ctime = 0;
+
+ for (i = 0; i < child_count; i++) {
+ if (success_children[i] == -1)
+ break;
+
+ child = success_children[i];
+ if (bufs[child].ia_ctime > max_ctime) {
+ max_ctime = bufs[child].ia_ctime;
+ idx = child;
+ }
+ }
+
+ return idx;
+}
+
+
static int
afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix,
afr_node_character *characters,
- int child_count)
+ int32_t *success_children,
+ int child_count, struct iatt *bufs)
{
int32_t biggest_witness = 0;
int nsources = 0;
@@ -457,6 +639,11 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix,
GF_ASSERT (child_count > 0);
+ biggest_witness = afr_find_largest_file_size (bufs, success_children,
+ child_count);
+ if (biggest_witness != -1)
+ goto found;
+
witnesses = GF_CALLOC (child_count, sizeof (*witnesses),
gf_afr_mt_int32_t);
if (NULL == witnesses) {
@@ -469,9 +656,15 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix,
biggest_witness = afr_find_biggest_witness_among_fools (witnesses,
characters,
child_count);
- nsources = afr_mark_fool_as_source_by_witness (sources, witnesses,
- characters, child_count,
- biggest_witness);
+ if (biggest_witness != -1)
+ goto found;
+
+ biggest_witness = afr_find_newest_file (bufs, success_children,
+ child_count);
+
+found:
+ nsources = afr_mark_fool_as_source_by_idx (sources, child_count,
+ biggest_witness);
out:
GF_FREE (witnesses);
return nsources;
@@ -815,7 +1008,8 @@ afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix,
nsources = afr_mark_biggest_of_fools_as_source (sources,
pending_matrix,
characters,
- child_count);
+ success_children,
+ child_count, bufs);
}
out:
@@ -833,14 +1027,46 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
int32_t *delta_matrix[], unsigned char success[],
int child_count, afr_transaction_type type)
{
- int i = 0;
- int j = 0;
+ int tgt = 0;
+ int src = 0;
+ int value = 0;
afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL,
xattr, type, priv->child_count);
- for (i = 0; i < priv->child_count; i++)
- for (j = 0; j < priv->child_count; j++)
- delta_matrix[i][j] = -delta_matrix[i][j];
+
+ /*
+ * The algorithm here has two parts. First, for each subvol indexed
+ * as tgt, we try to figure out what count everyone should have for it.
+ * If the self-heal succeeded, that's easy; the value is zero.
+ * Otherwise, the value is the maximum of the succeeding nodes' counts.
+ * Once we know the value, we loop through (possibly for a second time)
+ * setting each count to the difference so that when we're done all
+ * succeeding nodes will have the same count for tgt.
+ */
+ for (tgt = 0; tgt < priv->child_count; ++tgt) {
+ value = 0;
+ if (!success[tgt]) {
+ /* Find the maximum. */
+ for (src = 0; src < priv->child_count; ++src) {
+ if (!success[src]) {
+ continue;
+ }
+ if (delta_matrix[src][tgt] > value) {
+ value = delta_matrix[src][tgt];
+ }
+ }
+ }
+ /* Force everyone who succeeded to the chosen value. */
+ for (src = 0; src < priv->child_count; ++src) {
+ if (success[src]) {
+ delta_matrix[src][tgt] = value
+ - delta_matrix[src][tgt];
+ }
+ else {
+ delta_matrix[src][tgt] = 0;
+ }
+ }
+ }
}
@@ -867,8 +1093,14 @@ afr_sh_delta_to_xattr (xlator_t *this,
pending = GF_CALLOC (sizeof (int32_t), 3,
gf_afr_mt_int32_t);
- if (!pending)
+ if (!pending) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to allocate pending entry "
+ "for %s[%d] on %s",
+ priv->pending_key[j], type,
+ priv->children[i]->name);
continue;
+ }
/* 3 = data+metadata+entry */
k = afr_index_for_transaction_type (type);
@@ -914,14 +1146,13 @@ afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this)
afr_sh_reset (frame, this);
- if (local->govinda_gOvinda) {
+ if (local->unhealable) {
gf_log (this->name, GF_LOG_DEBUG,
"split brain found, aborting selfheal of %s",
local->loc.path);
- sh->op_failed = 1;
}
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
sh->completion_cbk (frame, this);
} else {
gf_log (this->name, GF_LOG_TRACE,
@@ -1153,7 +1384,7 @@ out:
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, "
"reason: %s", local->loc.path, strerror (-ret));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
}
afr_sh_missing_entries_finish (frame, this);
}
@@ -1168,7 +1399,7 @@ afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this,
local = frame->local;
sh = &local->self_heal;
if (op_ret < 0)
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_missing_entries_finish (frame, this);
return 0;
}
@@ -1192,7 +1423,7 @@ sh_missing_entries_create (call_frame_t *frame, xlator_t *this)
if (!afr_valid_ia_type (type)) {
gf_log (this->name, GF_LOG_ERROR,
"%s: unknown file type: 0%o", local->loc.path, type);
- local->govinda_gOvinda = 1;
+ afr_set_local_for_unhealable (local);
afr_sh_missing_entries_finish (frame, this);
goto out;
}
@@ -1225,8 +1456,9 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this,
loc = &local->loc;
if (op_ret < 0) {
- if (op_errno == EIO)
- local->govinda_gOvinda = 1;
+ if (op_errno == EIO) {
+ afr_set_local_for_unhealable (local);
+ }
// EIO can happen if finding the fresh parent dir failed
goto out;
}
@@ -1288,7 +1520,7 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this,
}
return;
out:
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_set_error (sh, op_errno);
afr_sh_missing_entries_finish (frame, this);
return;
@@ -1372,7 +1604,7 @@ afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child,
LOCK (&frame->lock);
{
afr_sh_set_error (sh, EIO);
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
}
UNLOCK (&frame->lock);
}
@@ -1454,7 +1686,7 @@ afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this)
sh = &local->self_heal;
priv = this->private;
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
afr_sh_missing_entries_finish (frame, this);
} else {
if (afr_gfid_missing_count (this->name, sh->fresh_children,
@@ -1547,7 +1779,7 @@ afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this,
if (!purge_condition (local, priv, i))
continue;
gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s "
- "on %d", local->loc.path, i);
+ "on %s", local->loc.path, priv->children[i]->name);
afr_sh_call_entry_expunge_remove (frame, this,
(long) i, &sh->buf[i],
&sh->parentbufs[i],
@@ -1667,10 +1899,8 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this,
sh->child_errno,
priv->child_count, ENOENT);
if (fresh_child_enoents == fresh_parent_count) {
- gf_log (this->name, GF_LOG_INFO, "Deleting stale file %s",
- local->loc.path);
afr_sh_set_error (sh, ENOENT);
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_purge_entry (frame, this);
} else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children,
priv->child_count, local->loc.path,
@@ -1684,14 +1914,14 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this,
afr_sh_purge_stale_entry (frame, this);
} else {
op_errno = EIO;
- local->govinda_gOvinda = 1;
+ afr_set_local_for_unhealable (local);
goto fail;
}
return;
fail:
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_set_error (sh, op_errno);
afr_sh_missing_entries_finish (frame, this);
return;
@@ -1762,8 +1992,8 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this,
out:
afr_sh_set_error (sh, op_errno);
- sh->op_failed = 1;
- afr_sh_missing_entries_finish (frame, this);
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
+ afr_sh_missing_entries_finish (frame, this);
return;
}
@@ -1852,7 +2082,8 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
int
-afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this)
+afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame,
+ xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
@@ -1865,7 +2096,7 @@ afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this)
if (int_lock->lock_op_ret < 0) {
gf_log (this->name, GF_LOG_INFO,
"Non blocking entrylks failed.");
- sh->op_failed = -1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_missing_entries_done (frame, this);
} else {
@@ -1881,40 +2112,14 @@ afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this)
}
int
-afr_sh_post_nb_entrylk_gfid_sh_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- int_lock = &local->internal_lock;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "Non blocking entrylks failed.");
- afr_sh_missing_entries_done (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks done. Proceeding to FOP");
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_missing_entries_lookup_done,
- sh->sh_gfid_req, AFR_LOOKUP_FAIL_CONFLICTS|
- AFR_LOOKUP_FAIL_MISSING_GFIDS,
- NULL);
- }
-
- return 0;
-}
-
-int
afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
char *base_name, afr_lock_cbk_t lock_cbk)
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ priv = this->private;
local = frame->local;
int_lock = &local->internal_lock;
@@ -1926,7 +2131,12 @@ afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
int_lock->lk_basename = base_name;
int_lock->lk_loc = loc;
int_lock->lock_cbk = lock_cbk;
+ int_lock->domain = this->name;
+ int_lock->lockee_count = 0;
+ afr_init_entry_lockee (&int_lock->lockee[0], local, loc,
+ base_name, priv->child_count);
+ int_lock->lockee_count++;
afr_nonblocking_entrylk (frame, this);
return 0;
@@ -1964,27 +2174,31 @@ out:
}
static int
-afr_self_heal_conflicting_entries (call_frame_t *frame, xlator_t *this)
+afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this)
{
- afr_self_heal_parent_entrylk (frame, this,
- afr_sh_post_nb_entrylk_conflicting_sh_cbk);
- return 0;
-}
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+
+ sh->sh_type_in_action = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY;
+
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
-static int
-afr_self_heal_gfids (call_frame_t *frame, xlator_t *this)
-{
afr_self_heal_parent_entrylk (frame, this,
- afr_sh_post_nb_entrylk_gfid_sh_cbk);
+ afr_sh_post_nb_entrylk_missing_entry_sh_cbk);
return 0;
}
-afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this)
+afr_local_t*
+afr_self_heal_local_init (afr_local_t *l, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *lc = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *shc = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *lc = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_self_heal_t *shc = NULL;
+ int ret = 0;
priv = this->private;
@@ -2007,13 +2221,23 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this)
shc->forced_merge = sh->forced_merge;
shc->background = sh->background;
shc->type = sh->type;
+ shc->data_sh_info = "";
+ shc->metadata_sh_info = "";
uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req);
- if (l->loc.path)
- loc_copy (&lc->loc, &l->loc);
+ if (l->loc.path) {
+ ret = loc_copy (&lc->loc, &l->loc);
+ if (ret < 0)
+ goto out;
+ }
lc->child_up = memdup (l->child_up,
sizeof (*lc->child_up) * priv->child_count);
+ if (!lc->child_up) {
+ ret = -1;
+ goto out;
+ }
+
if (l->xattr_req)
lc->xattr_req = dict_ref (l->xattr_req);
@@ -2021,40 +2245,25 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this)
lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode);
if (l->cont.lookup.xattr)
lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr);
- if (l->internal_lock.inode_locked_nodes)
- lc->internal_lock.inode_locked_nodes =
- memdup (l->internal_lock.inode_locked_nodes,
- sizeof (*lc->internal_lock.inode_locked_nodes) * priv->child_count);
- else
- lc->internal_lock.inode_locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
- if (l->internal_lock.entry_locked_nodes)
- lc->internal_lock.entry_locked_nodes =
- memdup (l->internal_lock.entry_locked_nodes,
- sizeof (*lc->internal_lock.entry_locked_nodes) * priv->child_count);
- else
- lc->internal_lock.entry_locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
- if (l->internal_lock.locked_nodes)
- lc->internal_lock.locked_nodes =
- memdup (l->internal_lock.locked_nodes,
- sizeof (*lc->internal_lock.locked_nodes) * priv->child_count);
- else
- lc->internal_lock.locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
- lc->internal_lock.inodelk_lock_count =
- l->internal_lock.inodelk_lock_count;
- lc->internal_lock.entrylk_lock_count =
- l->internal_lock.entrylk_lock_count;
+ lc->internal_lock.locked_nodes =
+ GF_CALLOC (sizeof (*l->internal_lock.locked_nodes),
+ priv->child_count, gf_afr_mt_char);
+ if (!lc->internal_lock.locked_nodes) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = afr_inodelk_init (&lc->internal_lock.inodelk[0],
+ this->name, priv->child_count);
+ if (ret)
+ goto out;
out:
+ if (ret) {
+ afr_local_cleanup (lc, this);
+ lc = NULL;
+ }
return lc;
}
@@ -2067,35 +2276,28 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
afr_local_t * orig_frame_local = NULL;
afr_self_heal_t * orig_frame_sh = NULL;
char sh_type_str[256] = {0,};
- gf_boolean_t split_brain = _gf_false;
+ gf_loglevel_t loglevel = 0;
priv = this->private;
local = bgsh_frame->local;
sh = &local->self_heal;
- if (local->govinda_gOvinda || sh->mdata_spb || sh->data_spb) {
- split_brain = _gf_true;
- sh->op_failed = 1;
+ if (local->unhealable) {
+ afr_set_split_brain (this, sh->inode, SPB, SPB);
}
- afr_set_split_brain (this, sh->inode, split_brain);
-
afr_self_heal_type_str_get (sh, sh_type_str,
sizeof(sh_type_str));
- if (sh->op_failed) {
- gf_loglevel_t loglevel = GF_LOG_ERROR;
- if (priv->shd.iamshd)
- loglevel = GF_LOG_DEBUG;
-
- gf_log (this->name, loglevel, "background %s self-heal "
- "failed on %s", sh_type_str, local->loc.path);
-
+ if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) {
+ loglevel = GF_LOG_ERROR;
+ } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) {
+ loglevel = GF_LOG_INFO;
} else {
- gf_log (this->name, GF_LOG_DEBUG, "background %s self-heal "
- "completed on %s", sh_type_str, local->loc.path);
-
+ loglevel = GF_LOG_DEBUG;
}
+ afr_log_self_heal_completion_status (local, loglevel);
+
FRAME_SU_UNDO (bgsh_frame, afr_local_t);
if (!sh->unwound && sh->unwind) {
@@ -2103,7 +2305,7 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
orig_frame_sh = &orig_frame_local->self_heal;
orig_frame_sh->actual_sh_started = _gf_true;
sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno,
- sh->op_failed);
+ is_self_heal_failed (sh, AFR_CHECK_ALL));
}
if (sh->background) {
@@ -2152,7 +2354,7 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode)
afr_set_lk_owner (sh_frame, this, sh_frame->root);
afr_set_low_priority (sh_frame);
- sh_local = afr_local_copy (local, this);
+ sh_local = afr_self_heal_local_init (local, this);
if (!sh_local)
goto out;
sh_frame->local = sh_local;
@@ -2217,12 +2419,11 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode)
sh->do_gfid_self_heal = _gf_false;
}
+ sh->sh_type_in_action = AFR_SELF_HEAL_INVALID;
+
FRAME_SU_DO (sh_frame, afr_local_t);
- if (sh->do_missing_entry_self_heal) {
- afr_self_heal_conflicting_entries (sh_frame, this);
- } else if (sh->do_gfid_self_heal) {
- GF_ASSERT (!uuid_is_null (sh->sh_gfid_req));
- afr_self_heal_gfids (sh_frame, this);
+ if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) {
+ afr_self_heal_missing_entries (sh_frame, this);
} else {
loc = &sh_local->loc;
if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) {
@@ -2429,9 +2630,183 @@ out:
GF_FREE (erase_xattr);
if (ret < 0) {
- sh->op_failed = _gf_true;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
finish (frame, this);
}
return 0;
}
+
+void
+afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status)
+{
+ xlator_t *this = NULL;
+ afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status);
+ afr_self_heal_type sh_type_in_action = sh->sh_type_in_action;
+ this = THIS;
+
+ if (!sh) {
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal"
+ "Structure");
+ goto out;
+ }
+
+ switch (sh_type_in_action) {
+ case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY:
+ sh_status->gfid_or_missing_entry_self_heal = status;
+ break;
+ case AFR_SELF_HEAL_METADATA:
+ sh_status->metadata_self_heal = status;
+ break;
+ case AFR_SELF_HEAL_DATA:
+ sh_status->data_self_heal = status;
+ break;
+ case AFR_SELF_HEAL_ENTRY:
+ sh_status->entry_self_heal = status;
+ break;
+ case AFR_SELF_HEAL_INVALID:
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid"
+ "self heal type in action");
+ break;
+ }
+out:
+ return;
+}
+
+void
+afr_set_local_for_unhealable (afr_local_t *local)
+{
+ afr_self_heal_t *sh = NULL;
+
+ sh = &local->self_heal;
+
+ local->unhealable = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
+}
+
+int
+is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type)
+{
+ afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status;
+ afr_self_heal_type sh_type_in_action = AFR_SELF_HEAL_INVALID;
+ afr_self_heal_status status = AFR_SELF_HEAL_FAILED;
+ xlator_t *this = NULL;
+ int sh_failed = 0;
+
+ this = THIS;
+
+ if (!sh) {
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal "
+ "structure");
+ sh_failed = 1;
+ goto out;
+ }
+
+ if (type == AFR_CHECK_ALL) {
+ if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED)
+ || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED)
+ || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED)
+ || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED))
+ sh_failed = 1;
+ } else if (type == AFR_CHECK_SPECIFIC) {
+ sh_type_in_action = sh->sh_type_in_action;
+ switch (sh_type_in_action) {
+ case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY:
+ status = sh_status.gfid_or_missing_entry_self_heal;
+ break;
+ case AFR_SELF_HEAL_METADATA:
+ status = sh_status.metadata_self_heal;
+ break;
+ case AFR_SELF_HEAL_ENTRY:
+ status = sh_status.entry_self_heal;
+ break;
+ case AFR_SELF_HEAL_DATA:
+ status = sh_status.data_self_heal;
+ break;
+ case AFR_SELF_HEAL_INVALID:
+ status = AFR_SELF_HEAL_NOT_ATTEMPTED;
+ break;
+ }
+ if (status == AFR_SELF_HEAL_FAILED)
+ sh_failed = 1;
+
+ }
+
+out:
+ return sh_failed;
+}
+
+char *
+get_sh_completion_status (afr_self_heal_status status)
+{
+
+ char *not_attempted = " is not attempted";
+ char *failed = " failed";
+ char *started = " is started";
+ char *sync_begin = " is successfully completed";
+ char *result = " has unknown status";
+
+ switch (status)
+ {
+ case AFR_SELF_HEAL_NOT_ATTEMPTED:
+ result = not_attempted;
+ break;
+ case AFR_SELF_HEAL_FAILED:
+ result = failed;
+ break;
+ case AFR_SELF_HEAL_STARTED:
+ result = started;
+ break;
+ case AFR_SELF_HEAL_SYNC_BEGIN:
+ result = sync_begin;
+ break;
+ }
+
+ return result;
+
+}
+
+void
+afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl)
+{
+
+ char sh_log[4096] = {0};
+ afr_self_heal_t *sh = &local->self_heal;
+ afr_sh_status_for_all_type all_status = sh->afr_all_sh_status;
+ xlator_t *this = NULL;
+ size_t off = 0;
+ int data_sh = 0;
+ int metadata_sh = 0;
+ int print_log = 0;
+
+ this = THIS;
+
+ ADD_FMT_STRING (sh_log, off, "gfid or missing entry",
+ all_status.gfid_or_missing_entry_self_heal, print_log);
+ ADD_FMT_STRING_SYNC (sh_log, off, "metadata",
+ all_status.metadata_self_heal, print_log);
+ if (sh->background) {
+ ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data",
+ all_status.data_self_heal, print_log);
+ } else {
+ ADD_FMT_STRING_SYNC (sh_log, off, "foreground data",
+ all_status.data_self_heal, print_log);
+ }
+ ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal,
+ print_log);
+
+ if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal &&
+ strcmp (sh->data_sh_info, "") && sh->data_sh_info )
+ data_sh = 1;
+ if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.metadata_self_heal &&
+ strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info)
+ metadata_sh = 1;
+
+ if (!print_log)
+ return;
+
+ gf_log (this->name, loglvl, "%s %s %s on %s", sh_log,
+ ((data_sh == 1) ? sh->data_sh_info : ""),
+ ((metadata_sh == 1) ? sh->metadata_sh_info : ""),
+ local->loc.path);
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h
index 1c83289a9..473264776 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.h
+++ b/xlators/cluster/afr/src/afr-self-heal-common.h
@@ -15,13 +15,6 @@
#define AFR_SH_MIN_PARTICIPANTS 2
typedef enum {
- AFR_SELF_HEAL_ENTRY,
- AFR_SELF_HEAL_METADATA,
- AFR_SELF_HEAL_DATA,
- AFR_SELF_HEAL_INVALID = -1,
-} afr_self_heal_type;
-
-typedef enum {
AFR_LOOKUP_FAIL_CONFLICTS = 1,
AFR_LOOKUP_FAIL_MISSING_GFIDS = 2,
} afr_lookup_flags_t;
@@ -35,6 +28,10 @@ afr_sh_source_count (int sources[], int child_count);
void
afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this);
+void
+afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this,
+ const char *loc);
+
int
afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix,
unsigned char *ignorant_subvols,
@@ -94,13 +91,13 @@ int
afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this,
int child_index);
int
-afr_sh_data_unlock (call_frame_t *frame, xlator_t *this,
+afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom,
afr_lock_cbk_t lock_cbk);
afr_local_t *
-afr_local_copy (afr_local_t *l, xlator_t *this);
+afr_self_heal_local_init (afr_local_t *l, xlator_t *this);
int
afr_sh_data_lock (call_frame_t *frame, xlator_t *this,
- off_t start, off_t len, gf_boolean_t block,
+ off_t start, off_t len, gf_boolean_t block, char *dom,
afr_lock_cbk_t success_handler,
afr_lock_cbk_t failure_handler);
void
@@ -129,4 +126,19 @@ int
afr_sh_erase_pending (call_frame_t *frame, xlator_t *this,
afr_transaction_type type, afr_fxattrop_cbk_t cbk,
int (*finish)(call_frame_t *frame, xlator_t *this));
+
+void
+afr_set_local_for_unhealable (afr_local_t *local);
+
+int
+is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type);
+
+void
+afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status);
+
+void
+afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t logl);
+
+char*
+afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this);
#endif /* __AFR_SELF_HEAL_COMMON_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index c90ed7c1a..9de26ee56 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -156,6 +156,25 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this)
}
int
+afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ if (sh->sh_dom_lock_held)
+ afr_sh_data_unlock (frame, this, priv->sh_domain,
+ afr_sh_data_close);
+ else
+ afr_sh_data_close (frame, this);
+ return 0;
+}
+
+int
afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *statpre,
struct iatt *statpost, dict_t *xdata)
@@ -190,29 +209,20 @@ afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
int
-afr_sh_data_setattr (call_frame_t *frame, xlator_t *this)
+afr_sh_data_setattr (call_frame_t *frame, xlator_t *this, struct iatt* stbuf)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
afr_self_heal_t *sh = NULL;
int i = 0;
int call_count = 0;
- int source = 0;
int32_t valid = 0;
- struct iatt stbuf = {0,};
local = frame->local;
sh = &local->self_heal;
priv = this->private;
- source = sh->source;
-
- valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME);
-
- stbuf.ia_atime = sh->buf[source].ia_atime;
- stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec;
- stbuf.ia_mtime = sh->buf[source].ia_mtime;
- stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec;
+ valid = (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME);
call_count = afr_set_elem_count_get (sh->success,
priv->child_count);
@@ -232,7 +242,7 @@ afr_sh_data_setattr (call_frame_t *frame, xlator_t *this)
(void *) (long) i,
priv->children[i],
priv->children[i]->fops->setattr,
- &local->loc, &stbuf, valid, NULL);
+ &local->loc, stbuf, valid, NULL);
if (!--call_count)
break;
@@ -256,7 +266,7 @@ afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie,
GF_ASSERT (sh->source == child_index);
if (op_ret != -1) {
sh->buf[child_index] = *buf;
- afr_sh_data_setattr (frame, this);
+ afr_sh_data_setattr (frame, this, buf);
} else {
gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set "
"time-stamps after self-heal", local->loc.path);
@@ -292,31 +302,45 @@ afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this)
//Fun fact, lock_cbk is being used for both lock & unlock
int
-afr_sh_data_unlock (call_frame_t *frame, xlator_t *this,
+afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom,
afr_lock_cbk_t lock_cbk)
{
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int ret = 0;
local = frame->local;
int_lock = &local->internal_lock;
sh = &local->self_heal;
+ priv = this->private;
- GF_ASSERT (sh->data_lock_held);
-
- sh->data_lock_held = _gf_false;
+ if (strcmp (dom, this->name) == 0) {
+ sh->data_lock_held = _gf_false;
+ } else if (strcmp (dom, priv->sh_domain) == 0) {
+ sh->sh_dom_lock_held = _gf_false;
+ } else {
+ ret = -1;
+ goto out;
+ }
int_lock->lock_cbk = lock_cbk;
+ int_lock->domain = dom;
afr_unlock (frame, this);
+out:
+ if (ret) {
+ int_lock->lock_op_ret = -1;
+ int_lock->lock_cbk (frame, this);
+ }
return 0;
}
int
afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
local = frame->local;
sh = &local->self_heal;
@@ -325,9 +349,9 @@ afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
"finishing data selfheal of %s", local->loc.path);
if (sh->data_lock_held)
- afr_sh_data_unlock (frame, this, afr_sh_data_close);
+ afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock);
else
- afr_sh_data_close (frame, this);
+ afr_sh_dom_unlock (frame, this);
return 0;
}
@@ -344,11 +368,8 @@ afr_sh_data_fail (call_frame_t *frame, xlator_t *this)
gf_log (this->name, GF_LOG_DEBUG,
"finishing failed data selfheal of %s", local->loc.path);
- sh->op_failed = 1;
- if (sh->data_lock_held)
- afr_sh_data_unlock (frame, this, afr_sh_data_close);
- else
- afr_sh_data_close (frame, this);
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
+ afr_sh_data_finish (frame, this);
return 0;
}
@@ -371,13 +392,13 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
"log failed on %s for subvol %s, reason: %s",
local->loc.path, priv->children[child_index]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
}
call_count = afr_frame_return (frame);
if (call_count == 0) {
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
if (sh->old_loop_frame)
sh_loop_finish (sh->old_loop_frame, this);
sh->old_loop_frame = NULL;
@@ -389,7 +410,7 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
goto out;
}
GF_ASSERT (sh->old_loop_frame);
- afr_sh_data_lock (frame, this, 0, 0, _gf_true,
+ afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name,
afr_post_sh_big_lock_success,
afr_post_sh_big_lock_failure);
}
@@ -406,6 +427,80 @@ afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)
return 0;
}
+int
+afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_self_heal_t *sh = NULL;
+ int call_count = 0;
+ int child_index = (long) cookie;
+
+ local = frame->local;
+ priv = this->private;
+ sh = &local->self_heal;
+
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "%s: Failed to fsync on "
+ "%s - %s", local->loc.path,
+ priv->children[child_index]->name, strerror (op_errno));
+ LOCK (&frame->lock);
+ {
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
+ }
+ UNLOCK (&frame->lock);
+ if (sh->old_loop_frame)
+ sh_loop_finish (sh->old_loop_frame, this);
+ sh->old_loop_frame = NULL;
+ }
+
+ call_count = afr_frame_return (frame);
+ if (call_count == 0) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC))
+ afr_sh_data_fail (frame, this);
+ else
+ afr_sh_data_erase_pending (frame, this);
+ }
+ return 0;
+}
+
+/*
+ * Before erasing xattrs, make sure the data is written to disk
+ */
+int
+afr_sh_data_fsync (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_self_heal_t *sh = NULL;
+ int i = 0;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+ sh = &local->self_heal;
+
+ call_count = sh->active_sinks;
+ if (call_count == 0) {
+ afr_sh_data_erase_pending (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sh->success[i] || sh->sources[i])
+ continue;
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_fsync_cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->fsync,
+ sh->healing_fd, 1, NULL);
+ }
+
+ return 0;
+}
static struct afr_sh_algorithm *
sh_algo_from_name (xlator_t *this, char *name)
@@ -503,7 +598,7 @@ afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this)
local = frame->local;
sh = &local->self_heal;
- sh->algo_completion_cbk = afr_sh_data_erase_pending;
+ sh->algo_completion_cbk = afr_sh_data_fsync;
sh->algo_abort_cbk = afr_sh_data_fail;
sh_algo = afr_sh_data_pick_algo (frame, this);
@@ -539,7 +634,7 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path,
priv->children[child_index]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
} else {
gf_log (this->name, GF_LOG_DEBUG,
"ftruncate of %s on subvolume %s completed",
@@ -552,7 +647,7 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_count = afr_frame_return (frame);
if (call_count == 0) {
- if (sh->op_failed)
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC))
afr_sh_data_fail (frame, this);
else
afr_sh_data_sync_prepare (frame, this);
@@ -632,6 +727,199 @@ out:
return ret;
}
+char*
+afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ char num[1024] = {0};
+ size_t len = 0;
+ char *sizes_str = NULL;
+ size_t off = 0;
+ char *fmt_str = "%llu bytes on %s, ";
+ char *child_down = " %s,";
+ char *child_unknown = " %s,";
+ int down_child_present = 0;
+ int down_count = 0;
+ int unknown_count = 0;
+ int unknown_child_present = 0;
+ char *down_subvol_1 = " down subvolume is ";
+ char *unknown_subvol_1 = " unknown subvolume is ";
+ char *down_subvol_2 = " down subvolumes are ";
+ char *unknown_subvol_2 = " unknown subvolumes are ";
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] == 1) {
+ len += snprintf (num, sizeof (num), fmt_str,
+ (unsigned long long) bufs[i].ia_size,
+ priv->children[i]->name);
+ } else if (local->child_up[i] == 0) {
+ len += snprintf (num, sizeof (num), child_down,
+ priv->children[i]->name);
+ if (!down_child_present)
+ down_child_present = 1;
+ down_count ++;
+ } else if (local->child_up[i] == -1) {
+ len += snprintf (num, sizeof (num), child_unknown,
+ priv->children[i]->name);
+ if (!unknown_child_present)
+ unknown_child_present = 1;
+ unknown_count++;
+ }
+
+ }
+
+ if (down_child_present) {
+ if (down_count > 1)
+ len += snprintf (num, sizeof (num), "%s",
+ down_subvol_2);
+ else
+ len += snprintf (num, sizeof (num), "%s",
+ down_subvol_1);
+ }
+ if (unknown_child_present) {
+ if (unknown_count > 1)
+ len += snprintf (num, sizeof (num), "%s",
+ unknown_subvol_2);
+ else
+ len += snprintf (num, sizeof (num), "%s",
+ unknown_subvol_1);
+ }
+
+ len++;//for '\0'
+
+ sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char);
+
+ if (!sizes_str)
+ return NULL;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] == 1) {
+ off += snprintf (sizes_str + off, len - off, fmt_str,
+ (unsigned long long) bufs[i].ia_size,
+ priv->children[i]->name);
+ }
+ }
+
+ if (down_child_present) {
+ if (down_count > 1) {
+ off += snprintf (sizes_str + off, len - off, "%s",
+ down_subvol_2);
+ } else {
+ off += snprintf (sizes_str + off, len - off, "%s",
+ down_subvol_1);
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] == 0) {
+ off += snprintf (sizes_str + off, len - off, child_down,
+ priv->children[i]->name);
+ }
+ }
+
+ if (unknown_child_present) {
+ if (unknown_count > 1) {
+ off += snprintf (sizes_str + off, len - off, "%s",
+ unknown_subvol_2);
+ } else {
+ off += snprintf (sizes_str + off, len - off, "%s",
+ unknown_subvol_1);
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] == -1) {
+ off += snprintf (sizes_str + off, len - off,
+ child_unknown,
+ priv->children[i]->name);
+
+ }
+ }
+
+ return sizes_str;
+}
+
+char*
+afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ char num[1024] = {0};
+ size_t len = 0;
+ char *sinks_str = NULL;
+ char *temp_str = " to sinks ";
+ char *str_format = " %s,";
+ char off = 0;
+
+ priv = this->private;
+
+ len += snprintf (num, sizeof (num), "%s", temp_str);
+ for (i = 0; i < priv->child_count; i++) {
+ if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) {
+ len += snprintf (num, sizeof (num), str_format,
+ priv->children[i]->name);
+ }
+ }
+
+ len ++;
+
+ sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char);
+
+ if (!sinks_str)
+ return NULL;
+
+ off += snprintf (sinks_str + off, len - off, "%s", temp_str);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) {
+ off += snprintf (sinks_str + off, len - off,
+ str_format,
+ priv->children[i]->name);
+ }
+ }
+
+ return sinks_str;
+
+}
+
+
+void
+afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this)
+{
+ char *pending_matrix_str = NULL;
+ char *sizes_str = NULL;
+ char *sinks_str = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix,
+ this);
+ if (!pending_matrix_str)
+ pending_matrix_str = "";
+
+ sizes_str = afr_get_sizes_str (local, sh->buf, this);
+ if (!sizes_str)
+ sizes_str = "";
+
+ sinks_str = afr_get_sinks_str (this, local, sh);
+ if (!sinks_str)
+ sinks_str = "";
+
+ gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with "
+ "%s data %s", priv->children[sh->source]->name, sinks_str,
+ sizes_str, pending_matrix_str);
+
+ if (pending_matrix_str && strcmp (pending_matrix_str, ""))
+ GF_FREE (pending_matrix_str);
+
+ if (sizes_str && strcmp (sizes_str, ""))
+ GF_FREE (sizes_str);
+}
+
void
afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
{
@@ -653,7 +941,7 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
if (sh->background && sh->unwind && !sh->unwound) {
sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno,
- sh->op_failed);
+ is_self_heal_failed (sh, AFR_CHECK_SPECIFIC));
sh->unwound = _gf_true;
}
@@ -672,6 +960,7 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
sh->active_sinks);
sh->actual_sh_started = _gf_true;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN);
afr_sh_data_trim_sinks (frame, this);
}
@@ -683,6 +972,9 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this)
afr_private_t *priv = NULL;
int nsources = 0;
int ret = 0;
+ int *old_sources = NULL;
+ int tstamp_source = 0;
+ int i = 0;
local = frame->local;
sh = &local->self_heal;
@@ -690,6 +982,13 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this)
gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %s",
lkowner_utoa (&frame->root->lk_owner));
+ if (sh->sync_done) {
+ //store sources before sync so that mtime can be set using the
+ //iatt buf from one of them.
+ old_sources = alloca (priv->child_count*sizeof (*old_sources));
+ memcpy (old_sources, sh->sources,
+ priv->child_count * sizeof (*old_sources));
+ }
nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix,
sh->sources, sh->success_children,
@@ -711,17 +1010,16 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this)
}
if (nsources == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to self-heal contents of '%s' (possible "
- "split-brain). Please delete the file from all but "
- "the preferred subvolume.", local->loc.path);
+ afr_sh_print_split_brain_log (sh->pending_matrix, this,
+ local->loc.path);
+ afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB);
- sh->data_spb = _gf_true;
afr_sh_data_fail (frame, this);
return 0;
}
- sh->data_spb = _gf_false;
+ afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB);
+
ret = afr_sh_inode_set_read_ctx (sh, this);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
@@ -732,8 +1030,18 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this)
}
if (sh->sync_done) {
- afr_sh_data_setattr (frame, this);
+ /* Perform setattr from one of the old_sources if possible
+ * Because only they have the correct mtime, the new sources
+ * (i.e. old sinks) have mtime from last writev in sync.
+ */
+ tstamp_source = sh->source;
+ for (i = 0; i < priv->child_count; i++) {
+ if (old_sources[i] && sh->sources[i])
+ tstamp_source = i;
+ }
+ afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]);
} else {
+ afr_set_data_sh_info_str (local, sh, this);
if (nsources == 0) {
gf_log (this->name, GF_LOG_DEBUG,
"No self-heal needed for %s",
@@ -1091,6 +1399,22 @@ afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this)
}
int
+afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+
+ sh->sh_dom_lock_held = _gf_true;
+ afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name,
+ afr_sh_data_big_lock_success,
+ afr_sh_data_fail);
+ return 0;
+}
+
+int
afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
@@ -1154,9 +1478,11 @@ afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
}
int
-afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t len)
+afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom,
+ off_t start, off_t len)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
local = frame->local;
@@ -1167,11 +1493,14 @@ afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t le
afr_set_lock_number (frame, this);
- int_lock->lk_flock.l_start = start;
- int_lock->lk_flock.l_len = len;
- int_lock->lk_flock.l_type = F_WRLCK;
int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk;
+ int_lock->domain = dom;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ inodelk->flock.l_start = start;
+ inodelk->flock.l_len = len;
+ inodelk->flock.l_type = F_WRLCK;
+
afr_nonblocking_inodelk (frame, this);
return 0;
@@ -1215,7 +1544,7 @@ afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this)
int
afr_sh_data_lock (call_frame_t *frame, xlator_t *this,
off_t start, off_t len, gf_boolean_t block,
- afr_lock_cbk_t success_handler,
+ char *dom, afr_lock_cbk_t success_handler,
afr_lock_cbk_t failure_handler)
{
afr_local_t * local = NULL;
@@ -1227,7 +1556,7 @@ afr_sh_data_lock (call_frame_t *frame, xlator_t *this,
sh->data_lock_success_handler = success_handler;
sh->data_lock_failure_handler = failure_handler;
sh->data_lock_block = block;
- return afr_sh_data_lock_rec (frame, this, start, len);
+ return afr_sh_data_lock_rec (frame, this, dom, start, len);
}
int
@@ -1239,7 +1568,6 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_private_t *priv = NULL;
int call_count = 0;
int child_index = 0;
- gf_boolean_t block = _gf_true;
local = frame->local;
sh = &local->self_heal;
@@ -1259,7 +1587,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path,
priv->children[child_index]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
} else {
gf_log (this->name, GF_LOG_TRACE,
"open of %s succeeded on child %s",
@@ -1272,7 +1600,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_count = afr_frame_return (frame);
if (call_count == 0) {
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
afr_sh_data_fail (frame, this);
return 0;
}
@@ -1281,15 +1609,8 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"fd for %s opened, commencing sync",
local->loc.path);
- /*
- * The read and write self-heal trigger codepaths do not provide
- * an unwind callback. We run a trylock in these codepaths
- * because we are sensitive to locking latency.
- */
- block = sh->unwind ? _gf_true : _gf_false;
- afr_sh_data_lock (frame, this, 0, 0, block,
- afr_sh_data_big_lock_success,
- afr_sh_data_fail);
+ afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain,
+ afr_sh_dom_lock_success, afr_sh_data_fail);
}
return 0;
@@ -1394,25 +1715,31 @@ afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv)
int
afr_self_heal_data (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = this->private;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = this->private;
+ int ret = -1;
local = frame->local;
sh = &local->self_heal;
- /* Self-heal completion cbk changes inode split-brain status based on
- * govinda_gOvinda, mdata_spb, data_spb value.
- * Initialize data_spb with current split-brain status.
- * If for some reason self-heal fails(locking phase etc), it makes sure
- * we retain the split-brain status before this self-heal started.
- */
- sh->data_spb = afr_is_split_brain (this, sh->inode);
+ sh->sh_type_in_action = AFR_SELF_HEAL_DATA;
+
if (afr_can_start_data_self_heal (sh, priv)) {
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
+ ret = afr_inodelk_init (&local->internal_lock.inodelk[1],
+ priv->sh_domain, priv->child_count);
+ if (ret < 0) {
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
+ afr_sh_data_done (frame, this);
+ return 0;
+ }
+
if (IA_ISREG (sh->type)) {
afr_sh_data_open (frame, this);
} else {
afr_sh_data_lock (frame, this, 0, 0, _gf_true,
+ this->name,
afr_sh_non_reg_lock_success,
afr_sh_data_fail);
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index c3c9f9fca..53491a1d7 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -162,7 +162,7 @@ afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)
sh = &local->self_heal;
if (sh->entries_skipped) {
- sh->op_failed = _gf_true;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
goto out;
}
afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION,
@@ -613,6 +613,19 @@ out:
return 0;
}
+static gf_boolean_t
+can_skip_entry_self_heal (char *name, loc_t *parent_loc)
+{
+ if (strcmp (name, ".") == 0) {
+ return _gf_true;
+ } else if (strcmp (name, "..") == 0) {
+ return _gf_true;
+ } else if (loc_is_root (parent_loc) &&
+ (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0)) {
+ return _gf_true;
+ }
+ return _gf_false;
+}
int
afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this,
@@ -640,13 +653,7 @@ afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this,
sh->expunge_done = afr_sh_entry_expunge_entry_done;
name = entry->d_name;
-
- if ((strcmp (name, ".") == 0)
- || (strcmp (name, "..") == 0)) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "skipping inspection of %s under %s",
- name, local->loc.path);
+ if (can_skip_entry_self_heal (name, &local->loc)) {
op_ret = 0;
goto out;
}
@@ -799,7 +806,7 @@ afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this)
active_src = next_active_sink (frame, this, sh->active_source);
sh->active_source = active_src;
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
goto out;
}
@@ -1256,6 +1263,35 @@ afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this,
gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed",
impunge_local->loc.path);
+ /*
+ * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY :
+ *
+ * Problem:
+ * While a brick is down in a replica pair, lets say the user creates
+ * one file(file-A) and a hard link to that file(h-file-A). After the
+ * brick comes back up, entry self-heal is attempted on parent dir of
+ * these two files. As part of readdir in self-heal it reads both the
+ * entries file-A and h-file-A for both of them it does name less lookup
+ * to check if there are any hardlinks already present in the
+ * destination brick. It finds that there are no hard links already
+ * present for files file-A, h-file-A. Self-heal does mknods for both
+ * file-A and h-file-A. This leads to file-A and h-file-A not being
+ * hardlinks anymore.
+ *
+ * Fix: (More like shrinking of race-window, the race itself is still
+ * present in posix-mknod).
+ * If mknod comes with the presence of GLUSTERFS_INTERNAL_FOP_KEY then
+ * posix_mknod checks if there are already any gfid-links and does
+ * link() instead of mknod. There still can be a race where two
+ * posix_mknods same gfid see that
+ * gfid-link file is not present and proceeds with mknods and result in
+ * two different files with same gfid.
+ */
+ ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+ if (ret)
+ gf_log (this->name, GF_LOG_INFO, "%s: %s set failed",
+ impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY);
+
STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
(void *) (long) child_index,
priv->children[child_index],
@@ -1872,12 +1908,7 @@ afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this,
active_src = sh->active_source;
sh->impunge_done = afr_sh_entry_impunge_entry_done;
- if ((strcmp (entry->d_name, ".") == 0)
- || (strcmp (entry->d_name, "..") == 0)) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "skipping inspection of %s under %s",
- entry->d_name, local->loc.path);
+ if (can_skip_entry_self_heal (entry->d_name, &local->loc)) {
op_ret = 0;
goto out;
}
@@ -1946,7 +1977,7 @@ afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie,
local->loc.path,
priv->children[active_src]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
} else {
gf_log (this->name, GF_LOG_TRACE,
"readdir of %s on subvolume %s complete",
@@ -2019,7 +2050,7 @@ afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this)
active_src = next_active_source (frame, this, sh->active_source);
sh->active_source = active_src;
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
afr_sh_entry_finish (frame, this);
return 0;
}
@@ -2068,7 +2099,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path,
priv->children[child_index]->name,
strerror (op_errno));
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
}
}
UNLOCK (&frame->lock);
@@ -2076,7 +2107,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_count = afr_frame_return (frame);
if (call_count == 0) {
- if (sh->op_failed) {
+ if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
afr_sh_entry_finish (frame, this);
return 0;
}
@@ -2209,6 +2240,7 @@ afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this)
local->loc.path);
sh->actual_sh_started = _gf_true;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN);
afr_sh_entry_open (frame, this);
return 0;
@@ -2231,7 +2263,7 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this,
priv = this->private;
if (op_ret < 0) {
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_set_error (sh, op_errno);
afr_sh_entry_finish (frame, this);
goto out;
@@ -2294,7 +2326,7 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this)
if (int_lock->lock_op_ret < 0) {
gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks "
"failed for %s.", local->loc.path);
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_entry_done (frame, this);
} else {
@@ -2313,14 +2345,18 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this)
int
afr_self_heal_entry (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
afr_private_t *priv = NULL;
-
+ afr_self_heal_t *sh = NULL;
priv = this->private;
local = frame->local;
+ sh = &local->self_heal;
+
+ sh->sh_type_in_action = AFR_SELF_HEAL_ENTRY;
if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) {
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
afr_sh_entrylk (frame, this, &local->loc, NULL,
afr_sh_post_nonblocking_entry_cbk);
} else {
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 647ee47a1..fd5da6cfd 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -88,6 +88,19 @@ afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this)
return 0;
}
+int
+afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
+ afr_sh_metadata_finish (frame, this);
+ return 0;
+}
int
afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie,
@@ -167,8 +180,13 @@ afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_count = afr_frame_return (frame);
- if (call_count == 0)
+ if (call_count == 0) {
+ if (local->xattr_req) {
+ dict_unref (local->xattr_req);
+ local->xattr_req = NULL;
+ }
afr_sh_metadata_erase_pending (frame, this);
+ }
return 0;
}
@@ -194,6 +212,86 @@ afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
+int
+afr_sh_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ dict_t *xdata)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (op_ret < 0) {
+ afr_sh_metadata_sync_cbk (frame, cookie,
+ this, -1, op_errno, xdata);
+ goto out;
+ }
+
+ i = (long) cookie;
+
+ STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->setxattr,
+ &local->loc, local->xattr_req, 0, NULL);
+
+ out:
+ return 0;
+}
+
+inline void
+afr_prune_special_keys (dict_t *xattr_dict)
+{
+ dict_del (xattr_dict, GF_SELINUX_XATTR_KEY);
+}
+
+inline void
+afr_prune_pending_keys (dict_t *xattr_dict, afr_private_t *priv)
+{
+ int i = 0;
+
+ for (; i < priv->child_count; i++) {
+ dict_del (xattr_dict, priv->pending_key[i]);
+ }
+}
+
+int
+afr_sh_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr,
+ dict_t *xdata)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (op_ret < 0) {
+ afr_sh_metadata_sync_cbk (frame, cookie,
+ this, -1, op_errno, xdata);
+ goto out;
+ }
+
+ afr_prune_pending_keys (xattr, priv);
+
+ afr_prune_special_keys (xattr);
+
+ i = (long) cookie;
+
+ /* send removexattr in bulk via xdata */
+ STACK_WIND_COOKIE (frame, afr_sh_removexattr_cbk,
+ cookie,
+ priv->children[i],
+ priv->children[i]->fops->removexattr,
+ &local->loc, "", xattr);
+
+ out:
+ return 0;
+}
int
afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr)
@@ -219,9 +317,10 @@ afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr)
/*
* 2 calls per sink - setattr, setxattr
*/
- if (xattr)
+ if (xattr) {
call_count = active_sinks * 2;
- else
+ local->xattr_req = dict_ref (xattr);
+ } else
call_count = active_sinks;
local->call_count = call_count;
@@ -264,11 +363,11 @@ afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr)
if (!xattr)
continue;
- STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk,
+ STACK_WIND_COOKIE (frame, afr_sh_getxattr_cbk,
(void *) (long) i,
priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc, xattr, 0, NULL);
+ priv->children[i]->fops->getxattr,
+ &local->loc, NULL, NULL);
call_count--;
}
@@ -286,8 +385,6 @@ afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_private_t *priv = NULL;
int source = 0;
- int i;
-
local = frame->local;
sh = &local->self_heal;
priv = this->private;
@@ -302,16 +399,147 @@ afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_sh_metadata_sync (frame, this, NULL);
} else {
- for (i = 0; i < priv->child_count; i++) {
- dict_del (xattr, priv->pending_key[i]);
- }
-
+ afr_prune_pending_keys (xattr, priv);
afr_sh_metadata_sync (frame, this, xattr);
}
return 0;
}
+static void
+afr_set_metadata_sh_info_str (afr_local_t *local, afr_self_heal_t *sh,
+ xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ char num[1024] = {0};
+ size_t len = 0;
+ char *string = NULL;
+ size_t off = 0;
+ char *source_child = " from source %s to";
+ char *format = " %s, ";
+ char *string_msg = " metadata self heal";
+ char *pending_matrix_str = NULL;
+ int down_child_present = 0;
+ int unknown_child_present = 0;
+ char *down_subvol_1 = " down subvolume is ";
+ char *unknown_subvol_1 = " unknown subvolume is";
+ char *down_subvol_2 = " down subvolumes are ";
+ char *unknown_subvol_2 = " unknown subvolumes are ";
+ int down_count = 0;
+ int unknown_count = 0;
+
+ priv = this->private;
+
+ pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix,
+ this);
+
+ if (!pending_matrix_str)
+ pending_matrix_str = "";
+
+ len += snprintf (num, sizeof (num), "%s", string_msg);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((sh->source == i) && (local->child_up[i] == 1)) {
+ len += snprintf (num, sizeof (num), source_child,
+ priv->children[i]->name);
+ } else if ((local->child_up[i] == 1) && (sh->sources[i] == 0)) {
+ len += snprintf (num, sizeof (num), format,
+ priv->children[i]->name);
+ } else if (local->child_up[i] == 0) {
+ len += snprintf (num, sizeof (num), format,
+ priv->children[i]->name);
+ if (!down_child_present)
+ down_child_present = 1;
+ down_count++;
+ } else if (local->child_up[i] == -1) {
+ len += snprintf (num, sizeof (num), format,
+ priv->children[i]->name);
+ if (!unknown_child_present)
+ unknown_child_present = 1;
+ unknown_count++;
+ }
+ }
+
+ if (down_child_present) {
+ if (down_count > 1) {
+ len += snprintf (num, sizeof (num), "%s",
+ down_subvol_2);
+ } else {
+ len += snprintf (num, sizeof (num), "%s",
+ down_subvol_1);
+ }
+ }
+ if (unknown_child_present) {
+ if (unknown_count > 1) {
+ len += snprintf (num, sizeof (num), "%s",
+ unknown_subvol_2);
+ } else {
+ len += snprintf (num, sizeof (num), "%s",
+ unknown_subvol_1);
+ }
+ }
+
+ len ++;
+
+ string = GF_CALLOC (len, sizeof (char), gf_common_mt_char);
+ if (!string)
+ return;
+
+ off += snprintf (string + off, len - off, "%s", string_msg);
+ for (i=0; i < priv->child_count; i++) {
+ if ((sh->source == i) && (local->child_up[i] == 1))
+ off += snprintf (string + off, len - off, source_child,
+ priv->children[i]->name);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((local->child_up[i] == 1)&& (sh->sources[i] == 0))
+ off += snprintf (string + off, len - off, format,
+ priv->children[i]->name);
+ }
+
+ if (down_child_present) {
+ if (down_count > 1) {
+ off += snprintf (string + off, len - off, "%s",
+ down_subvol_2);
+ } else {
+ off += snprintf (string + off, len - off, "%s",
+ down_subvol_1);
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] == 0)
+ off += snprintf (string + off, len - off, format,
+ priv->children[i]->name);
+ }
+
+ if (unknown_child_present) {
+ if (unknown_count > 1) {
+ off += snprintf (string + off, len - off, "%s",
+ unknown_subvol_2);
+ } else {
+ off += snprintf (string + off, len - off, "%s",
+ unknown_subvol_1);
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] == -1)
+ off += snprintf (string + off, len - off, format,
+ priv->children[i]->name);
+ }
+
+ gf_asprintf (&sh->metadata_sh_info, "%s metadata %s,", string,
+ pending_matrix_str);
+
+ if (pending_matrix_str && strcmp (pending_matrix_str, ""))
+ GF_FREE (pending_matrix_str);
+
+ if (string && strcmp (string, ""))
+ GF_FREE (string);
+}
int
afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)
@@ -342,6 +570,8 @@ afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)
sh->active_sinks);
sh->actual_sh_started = _gf_true;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN);
+ afr_set_metadata_sh_info_str (local, sh, this);
STACK_WIND (frame, afr_sh_metadata_getxattr_cbk,
priv->children[source],
priv->children[source]->fops->getxattr,
@@ -367,7 +597,7 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this,
priv = this->private;
if (op_ret < 0) {
- sh->op_failed = 1;
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
afr_sh_set_error (sh, op_errno);
afr_sh_metadata_finish (frame, this);
goto out;
@@ -392,18 +622,14 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this,
}
if (nsources == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to self-heal permissions/ownership of '%s' "
- "(possible split-brain). Please fix the file on "
- "all backend volumes", local->loc.path);
-
- sh->mdata_spb = _gf_true;
-
- afr_sh_metadata_finish (frame, this);
+ afr_sh_print_split_brain_log (sh->pending_matrix, this,
+ local->loc.path);
+ afr_set_split_brain (this, sh->inode, SPB, DONT_KNOW);
+ afr_sh_metadata_fail (frame, this);
goto out;
}
- sh->mdata_spb = _gf_false;
+ afr_set_split_brain (this, sh->inode, NO_SPB, DONT_KNOW);
if (nsources == 0) {
gf_log (this->name, GF_LOG_TRACE,
"No self-heal needed for %s",
@@ -489,19 +715,22 @@ int
afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
local = frame->local;
int_lock = &local->internal_lock;
+ int_lock->domain = this->name;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK;
afr_set_lock_number (frame, this);
- int_lock->lk_flock.l_start = LLONG_MAX - 1;
- int_lock->lk_flock.l_len = 0;
- int_lock->lk_flock.l_type = F_WRLCK;
+ inodelk->flock.l_start = LLONG_MAX - 1;
+ inodelk->flock.l_len = 0;
+ inodelk->flock.l_type = F_WRLCK;
int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk;
afr_nonblocking_inodelk (frame, this);
@@ -528,16 +757,10 @@ afr_self_heal_metadata (call_frame_t *frame, xlator_t *this)
local = frame->local;
sh = &local->self_heal;
-
- /* Self-heal completion cbk changes inode split-brain status based on
- * govinda_gOvinda, mdata_spb, data_spb value.
- * Initialize mdata_spb with current split-brain status.
- * If for some reason self-heal fails(locking phase etc), it makes sure
- * we retain the split-brain status before this self-heal started.
- */
- sh->mdata_spb = afr_is_split_brain (this, sh->inode);
+ sh->sh_type_in_action = AFR_SELF_HEAL_METADATA;
if (afr_can_start_metadata_self_heal (sh, priv)) {
+ afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
afr_sh_metadata_lock (frame, this);
} else {
afr_sh_metadata_done (frame, this);
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 214c0fff4..1b48a1bca 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -25,7 +25,8 @@ typedef enum {
typedef enum {
HEAL = 1,
- INFO
+ INFO,
+ STATISTICS_TO_BE_HEALED,
} shd_crawl_op;
typedef struct shd_dump {
@@ -84,6 +85,33 @@ _loc_assign_gfid_path (loc_t *loc)
}
void
+_destroy_crawl_event_data (void *data)
+{
+ shd_crawl_event_t *crawl_event = NULL;
+
+ if (!data)
+ goto out;
+
+ crawl_event = (shd_crawl_event_t *)data;
+ GF_FREE (crawl_event->start_time_str);
+ GF_FREE (crawl_event->end_time_str);
+
+out:
+ return;
+}
+
+void
+_destroy_shd_event_data (void *data)
+{
+ shd_event_t *event = NULL;
+ if (!data)
+ goto out;
+ event = (shd_event_t*)data;
+ GF_FREE (event->path);
+out:
+ return;
+}
+void
shd_cleanup_event (void *event)
{
shd_event_t *shd_event = event;
@@ -128,6 +156,123 @@ _build_index_loc (xlator_t *this, loc_t *loc, char *name, loc_t *parent)
}
int
+_add_crawl_stats_to_dict (xlator_t *this, dict_t *output, int child,
+ shd_crawl_event_t *shd_event, struct timeval *tv)
+{
+ int ret = 0;
+ uint64_t count = 0;
+ char key[256] = {0};
+ int xl_id = 0;
+ uint64_t healed_count = 0;
+ uint64_t split_brain_count = 0;
+ uint64_t heal_failed_count = 0;
+ char *start_time_str = NULL;
+ char *end_time_str = NULL;
+ char *crawl_type = NULL;
+ int progress = -1;
+
+ healed_count = shd_event->healed_count;
+ split_brain_count = shd_event->split_brain_count;
+ heal_failed_count = shd_event->heal_failed_count;
+ start_time_str = shd_event->start_time_str;
+ end_time_str = shd_event->end_time_str;
+ crawl_type = shd_event->crawl_type;
+
+ if (!start_time_str) {
+ ret = -1;
+ goto out;
+ }
+
+
+ ret = dict_get_int32 (output, this->name, &xl_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "xl does not have id");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child);
+ ret = dict_get_uint64 (output, key, &count);
+
+ snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_uint64(output, key, healed_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "healed_count to outout");
+ goto out;
+ }
+ snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_uint64 (output, key, split_brain_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "split_brain_count to outout");
+ goto out;
+ }
+ snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_dynstr (output, key, gf_strdup (crawl_type));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "crawl_type to output");
+ goto out;
+ }
+ snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_uint64 (output, key, heal_failed_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "healed_failed_count to outout");
+ goto out;
+ }
+ snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_dynstr (output, key, gf_strdup(start_time_str));
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "crawl_start_time to outout");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64,
+ xl_id, child, count);
+
+ if (!end_time_str)
+ end_time_str = "Could not determine the end time";
+ ret = dict_set_dynstr (output, key, gf_strdup(end_time_str));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "crawl_end_time to outout");
+ goto out;
+ }
+ snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64,
+ xl_id, child, count);
+
+ if (shd_event->crawl_inprogress == _gf_true)
+ progress = 1;
+ else
+ progress = 0;
+
+ ret = dict_set_int32 (output, key, progress);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_"
+ "inprogress to outout");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "statistics-%d-%d-count",xl_id, child);
+ ret = dict_set_uint64 (output, key, count + 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not increment the "
+ "counter.");
+ goto out;
+ }
+out:
+ return ret;
+}
+
+int
_add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path,
struct timeval *tv, gf_boolean_t dyn)
{
@@ -182,16 +327,18 @@ out:
int
_get_path_from_gfid_loc (xlator_t *this, xlator_t *readdir_xl, loc_t *child,
- char **fpath)
+ char **fpath, gf_boolean_t *missing)
{
dict_t *xattr = NULL;
char *path = NULL;
int ret = -1;
- ret = syncop_getxattr (readdir_xl, child, &xattr,
- GFID_TO_PATH_KEY);
- if (ret)
+ ret = syncop_getxattr (readdir_xl, child, &xattr, GFID_TO_PATH_KEY);
+ if (ret < 0) {
+ if ((errno == ENOENT) && missing)
+ *missing = _gf_true;
goto out;
+ }
ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed to get path for "
@@ -231,6 +378,20 @@ out:
}
int
+_add_crawl_event_statistics_to_dict (circular_buffer_t *cb, void *data)
+{
+ int ret = 0;
+ shd_dump_t *dump_data = NULL;
+ shd_crawl_event_t *shd_event = NULL;
+
+ dump_data = data;
+ shd_event = cb->data;
+ ret = _add_crawl_stats_to_dict (dump_data->this, dump_data->dict,
+ dump_data->child, shd_event, &cb->tv);
+ return ret;
+}
+
+int
_add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child)
{
shd_dump_t dump_data = {0};
@@ -242,32 +403,24 @@ _add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child)
return 0;
}
+
int
-_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data,
- gf_dirent_t *entry,
- loc_t *childloc, loc_t *parentloc, struct iatt *iattr)
+_add_statistics_to_dict (xlator_t *this, dict_t *dict, int child)
{
- dict_t *output = NULL;
- xlator_t *readdir_xl = NULL;
- int ret = -1;
- char *path = NULL;
-
- if (uuid_is_null (childloc->gfid))
- goto out;
+ shd_dump_t dump_data = {0};
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
- output = crawl_data->op_data;
- readdir_xl = crawl_data->readdir_xl;
+ priv = this->private;
+ shd = &priv->shd;
- ret = _get_path_from_gfid_loc (this, readdir_xl, childloc, &path);
- if (ret)
- goto out;
+ dump_data.this = this;
+ dump_data.dict = dict;
+ dump_data.child = child;
+ eh_dump (shd->statistics[child], &dump_data,
+ _add_crawl_event_statistics_to_dict);
+ return 0;
- ret = _add_path_to_dict (this, output, crawl_data->child, path, NULL,
- _gf_true);
-out:
- if (ret && path)
- GF_FREE (path);
- return ret;
}
void
@@ -294,30 +447,108 @@ out:
return;
}
+int
+_count_hard_links_under_base_indices_dir (xlator_t *this,
+ afr_crawl_data_t *crawl_data,
+ gf_dirent_t *entry, loc_t *childloc,
+ loc_t *parentloc, struct iatt *iattr)
+{
+ xlator_t *readdir_xl = crawl_data->readdir_xl;
+ struct iatt parent = {0};
+ int ret = 0;
+ dict_t *output = NULL;
+ int xl_id = 0;
+ char key[256] = {0};
+ int child = -1;
+ uint64_t hardlinks = 0;
+
+ output = crawl_data->op_data;
+ child = crawl_data->child;
+
+ ret = syncop_lookup (readdir_xl, childloc, NULL, iattr, NULL, &parent);
+ if (ret)
+ goto out;
+
+ ret = dict_get_int32 (output, this->name, &xl_id);
+ if (ret)
+ goto out;
+
+ snprintf (key, sizeof (key), "%d-%d-hardlinks", xl_id, child);
+ ret = dict_get_uint64 (output, key, &hardlinks);
+
+ /*Removing the count of base_entry under indices/base_indicies and
+ * entry under indices/xattrop */
+ hardlinks = hardlinks + iattr->ia_nlink - 2;
+ ret = dict_set_uint64 (output, key, hardlinks);
+ if (ret)
+ goto out;
+
+out:
+ return ret;
+}
+
+int
+_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data,
+ gf_dirent_t *entry,
+ loc_t *childloc, loc_t *parentloc, struct iatt *iattr)
+{
+ dict_t *output = NULL;
+ xlator_t *readdir_xl = NULL;
+ int ret = -1;
+ char *path = NULL;
+ gf_boolean_t missing = _gf_false;
+ char gfid_str[64] = {0};
+
+ if (uuid_is_null (childloc->gfid))
+ goto out;
+
+ output = crawl_data->op_data;
+ readdir_xl = crawl_data->readdir_xl;
+
+ ret = _get_path_from_gfid_loc (this, readdir_xl, childloc, &path,
+ &missing);
+ if (ret == 0) {
+ ret = _add_path_to_dict (this, output, crawl_data->child, path,
+ NULL, _gf_true);
+ } else if (missing) {
+ _remove_stale_index (this, readdir_xl, parentloc,
+ uuid_utoa_r (childloc->gfid, gfid_str));
+ }
+
+out:
+ if (ret && path)
+ GF_FREE (path);
+ return ret;
+}
+
void
_crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child,
int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp,
afr_crawl_data_t *crawl_data)
{
- int ret = 0;
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- eh_t *eh = NULL;
- char *path = NULL;
- shd_event_t *event = NULL;
- int32_t sh_failed = 0;
- gf_boolean_t split_brain = 0;
- int32_t actual_sh_done = 0;
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ eh_t *eh = NULL;
+ char *path = NULL;
+ char gfid_str[64] = {0};
+ shd_event_t *event = NULL;
+ int32_t sh_failed = 0;
+ gf_boolean_t split_brain = 0;
+ int32_t actual_sh_done = 0;
+ shd_crawl_event_t **shd_crawl_event = NULL;
+
priv = this->private;
shd = &priv->shd;
if (crawl_data->crawl == INDEX) {
if ((op_ret < 0) && (op_errno == ENOENT)) {
_remove_stale_index (this, crawl_data->readdir_xl,
- parent, uuid_utoa (child->gfid));
+ parent, uuid_utoa_r (child->gfid,
+ gfid_str));
goto out;
}
ret = _get_path_from_gfid_loc (this, crawl_data->readdir_xl,
- child, &path);
+ child, &path, NULL);
if (ret)
goto out;
} else {
@@ -333,16 +564,19 @@ _crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child,
ret = dict_get_int32 (xattr_rsp, "actual-sh-done", &actual_sh_done);
}
- split_brain = afr_is_split_brain (this, child->inode);
+ shd_crawl_event = (shd_crawl_event_t**)(shd->crawl_events);
+ split_brain = afr_is_split_brain (this, child->inode);
if ((op_ret < 0 && op_errno == EIO) || split_brain) {
eh = shd->split_brain;
+ shd_crawl_event[crawl_data->child]->split_brain_count += 1;
} else if ((op_ret < 0) || sh_failed) {
eh = shd->heal_failed;
+ shd_crawl_event[crawl_data->child]->heal_failed_count += 1;
} else if (actual_sh_done == 1) {
- eh = shd->healed;
+ eh = shd->healed;
+ shd_crawl_event[crawl_data->child]->healed_count += 1;
}
-
ret = -1;
if (eh != NULL) {
@@ -373,21 +607,56 @@ out:
}
int
+_link_inode_update_loc (xlator_t *this, loc_t *loc, struct iatt *iattr)
+{
+ inode_t *link_inode = NULL;
+ int ret = -1;
+
+ link_inode = inode_link (loc->inode, NULL, NULL, iattr);
+ if (link_inode == NULL) {
+ gf_log (this->name, GF_LOG_ERROR, "inode link failed "
+ "on the inode (%s)", uuid_utoa (iattr->ia_gfid));
+ goto out;
+ }
+ inode_unref (loc->inode);
+ loc->inode = link_inode;
+ ret = 0;
+out:
+ return ret;
+}
+
+int
_self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry,
loc_t *child, loc_t *parent, struct iatt *iattr)
{
struct iatt parentbuf = {0};
int ret = 0;
dict_t *xattr_rsp = NULL;
+ dict_t *xattr_req = NULL;
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_int32 (xattr_req, "allow-sh-for-running-transaction", 1);
gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path);
- ret = syncop_lookup (this, child, NULL,
+ ret = syncop_lookup (this, child, xattr_req,
iattr, &xattr_rsp, &parentbuf);
_crawl_post_sh_action (this, parent, child, ret, errno, xattr_rsp,
crawl_data);
if (xattr_rsp)
dict_unref (xattr_rsp);
+ if (ret == 0)
+ ret = _link_inode_update_loc (this, child, iattr);
+
+out:
+ if (xattr_req)
+ dict_unref(xattr_req);
return ret;
}
@@ -496,12 +765,20 @@ _do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl,
status = "Started self-heal";
_do_self_heal_on_subvol (this, i,
crawl);
- } else if (output) {
+ } else if (output && (op == INFO)) {
status = "";
afr_start_crawl (this, i, INDEX,
_add_summary_to_dict,
output, _gf_false, 0,
NULL);
+ } else if (output &&
+ (op == STATISTICS_TO_BE_HEALED)) {
+ status = "";
+ afr_start_crawl (this, i,
+ INDEX_TO_BE_HEALED,
+ _count_hard_links_under_base_indices_dir,
+ output, _gf_false,
+ 0, NULL);
}
}
if (output) {
@@ -535,8 +812,103 @@ _get_index_summary_on_local_subvols (xlator_t *this, dict_t *output)
return _do_crawl_op_on_local_subvols (this, INDEX, INFO, output);
}
+void
+afr_fill_completed_crawl_statistics_to_dict (xlator_t *this, dict_t *dict)
+{
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int i = 0;
+ priv = this->private;
+ shd= &priv->shd;
+ for (i = 0; i < priv->child_count; i++) {
+ if (shd->pos[i] != AFR_POS_LOCAL)
+ continue;
+ _add_statistics_to_dict (this, dict, i);
+ }
+
+ return ;
+}
+
+static void
+reset_crawl_event (shd_crawl_event_t *crawl_event)
+{
+ crawl_event->healed_count = 0;
+ crawl_event->split_brain_count = 0;
+ crawl_event->heal_failed_count = 0;
+ GF_FREE (crawl_event->start_time_str);
+ crawl_event->start_time_str = NULL;
+ crawl_event->end_time_str = NULL;
+ crawl_event->crawl_type = NULL;
+ crawl_event->crawl_inprogress = _gf_false;
+ return;
+}
+
+static void
+afr_copy_crawl_event_struct (shd_crawl_event_t *src, shd_crawl_event_t *dst)
+{
+ dst->healed_count = src->healed_count;
+ dst->split_brain_count = src->split_brain_count;
+ dst->heal_failed_count = src->heal_failed_count;
+ dst->start_time_str = gf_strdup (src->start_time_str);
+ dst->end_time_str = "Crawl is already in progress";
+ dst->crawl_type = src->crawl_type;
+ dst->crawl_inprogress = _gf_true;
+ return;
+}
+
+static int
+afr_fill_crawl_statistics_of_running_crawl(xlator_t *this, dict_t *dict)
+{
+ shd_crawl_event_t *evnt = NULL;
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int i = 0;
+ priv = this->private;
+ shd = &priv->shd;
+
+ evnt = GF_CALLOC (1, sizeof (shd_crawl_event_t),
+ gf_afr_mt_shd_crawl_event_t);
+ if (!evnt) {
+ ret = -1;
+ goto out;
+ }
+ LOCK (&priv->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (shd->pos[i] != AFR_POS_LOCAL)
+ continue;
+
+ reset_crawl_event (evnt);
+
+ if (!shd->crawl_events[i]) {
+ continue;
+ }
+
+ afr_copy_crawl_event_struct (shd->crawl_events[i],
+ evnt);
+ _add_crawl_stats_to_dict (this, dict, i, evnt, NULL);
+
+ }
+ }
+ UNLOCK (&priv->lock);
+ reset_crawl_event (evnt);
+ GF_FREE (evnt);
+
+out:
+ return ret;
+}
+
+static int
+_add_local_subvols_crawl_statistics_to_dict (xlator_t *this, dict_t *dict)
+{
+ int ret = 0;
+ afr_fill_completed_crawl_statistics_to_dict (this, dict);
+ ret = afr_fill_crawl_statistics_of_running_crawl (this, dict);
+ return ret;
+}
int
-_add_all_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict)
+_add_local_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict)
{
afr_private_t *priv = NULL;
afr_self_heald_t *shd = NULL;
@@ -586,16 +958,25 @@ afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
ret = 0;
break;
case GF_AFR_OP_HEALED_FILES:
- ret = _add_all_subvols_eh_to_dict (this, shd->healed, output);
+ ret = _add_local_subvols_eh_to_dict (this, shd->healed, output);
break;
case GF_AFR_OP_HEAL_FAILED_FILES:
- ret = _add_all_subvols_eh_to_dict (this, shd->heal_failed,
+ ret = _add_local_subvols_eh_to_dict (this, shd->heal_failed,
output);
break;
case GF_AFR_OP_SPLIT_BRAIN_FILES:
- ret = _add_all_subvols_eh_to_dict (this, shd->split_brain,
+ ret = _add_local_subvols_eh_to_dict (this, shd->split_brain,
output);
break;
+ case GF_AFR_OP_STATISTICS:
+ ret = _add_local_subvols_crawl_statistics_to_dict (this, output);
+ break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT:
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED,
+ STATISTICS_TO_BE_HEALED,
+ output);
+ break;
default:
gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op);
break;
@@ -610,7 +991,7 @@ afr_poll_self_heal (void *data)
{
afr_private_t *priv = NULL;
afr_self_heald_t *shd = NULL;
- struct timeval timeout = {0};
+ struct timespec timeout = {0};
xlator_t *this = NULL;
long child = (long)data;
gf_timer_t *old_timer = NULL;
@@ -634,7 +1015,7 @@ afr_poll_self_heal (void *data)
if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL))
_do_self_heal_on_subvol (this, child, INDEX);
timeout.tv_sec = shd->timeout;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
//notify and previous timer should be synchronized.
LOCK (&priv->lock);
{
@@ -733,25 +1114,6 @@ out:
}
int
-_link_inode_update_loc (xlator_t *this, loc_t *loc, struct iatt *iattr)
-{
- inode_t *link_inode = NULL;
- int ret = -1;
-
- link_inode = inode_link (loc->inode, NULL, NULL, iattr);
- if (link_inode == NULL) {
- gf_log (this->name, GF_LOG_ERROR, "inode link failed "
- "on the inode (%s)", uuid_utoa (iattr->ia_gfid));
- goto out;
- }
- inode_unref (loc->inode);
- loc->inode = link_inode;
- ret = 0;
-out:
- return ret;
-}
-
-int
afr_local_pathinfo (char *pathinfo, gf_boolean_t *local)
{
int ret = 0;
@@ -787,6 +1149,7 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data,
afr_private_t *priv = NULL;
dict_t *xattr = NULL;
void *index_gfid = NULL;
+ void *base_indices_holder_vgfid = NULL;
loc_t rootloc = {0};
struct iatt iattr = {0};
struct iatt parent = {0};
@@ -796,7 +1159,7 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data,
priv = this->private;
if (crawl_data->crawl == FULL) {
afr_build_root_loc (this, dirloc);
- } else {
+ } else if (crawl_data->crawl == INDEX) {
afr_build_root_loc (this, &rootloc);
ret = syncop_getxattr (readdir_xl, &rootloc, &xattr,
GF_XATTROP_INDEX_GFID);
@@ -830,6 +1193,47 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data,
ret = _link_inode_update_loc (this, dirloc, &iattr);
if (ret)
goto out;
+ } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) {
+ afr_build_root_loc (this, &rootloc);
+ ret = syncop_getxattr (readdir_xl, &rootloc, &xattr,
+ GF_BASE_INDICES_HOLDER_GFID);
+ if (ret < 0)
+ goto out;
+ ret = dict_get_ptr (xattr, GF_BASE_INDICES_HOLDER_GFID,
+ &base_indices_holder_vgfid);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "index gfid empty "
+ "on %s", readdir_xl->name);
+ ret = -1;
+ goto out;
+ }
+ if (!base_indices_holder_vgfid) {
+ gf_log (this->name, GF_LOG_ERROR, "Base indices holder"
+ "virtual gfid is null on %s", readdir_xl->name);
+ ret = -1;
+ goto out;
+ }
+ uuid_copy (dirloc->gfid, base_indices_holder_vgfid);
+ dirloc->path = "";
+ dirloc->inode = inode_new (priv->root_inode->table);
+ ret = syncop_lookup (readdir_xl, dirloc, NULL, &iattr, NULL,
+ &parent);
+ if (ret < 0) {
+ if (errno != ENOENT) {
+ gf_log (this->name, GF_LOG_ERROR, "lookup "
+ "failed for base_indices_holder dir"
+ " on %s - (%s)", readdir_xl->name,
+ strerror (errno));
+
+ } else {
+ gf_log (this->name, GF_LOG_ERROR, "base_indices"
+ "_holder is not yet created.");
+ }
+ goto out;
+ }
+ ret = _link_inode_update_loc (this, dirloc, &iattr);
+ if (ret)
+ goto out;
}
ret = 0;
out:
@@ -894,6 +1298,16 @@ afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent,
priv = this->private;
if (crawl_data->crawl == FULL) {
ret = afr_build_child_loc (this, child, parent, entry->d_name);
+ } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) {
+ ret = _build_index_loc (this, child, entry->d_name, parent);
+ if (ret)
+ goto out;
+ child->inode = inode_new (priv->root_inode->table);
+ if (!child->inode) {
+ ret = -1;
+ goto out;
+ }
+ child->path = NULL;
} else {
child->inode = inode_new (priv->root_inode->table);
if (!child->inode)
@@ -943,13 +1357,14 @@ _process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries,
ret = crawl_data->process_entry (this, crawl_data, entry,
&entry_loc, parentloc, &iattr);
- if (ret)
+ if (crawl_data->crawl == INDEX_TO_BE_HEALED && ret) {
+ goto out;
+ } else if (ret) {
continue;
+ }
- ret = _link_inode_update_loc (this, &entry_loc, &iattr);
- if (ret)
- goto out;
- if (crawl_data->crawl == INDEX)
+ if ((crawl_data->crawl == INDEX) ||
+ (crawl_data->crawl == INDEX_TO_BE_HEALED))
continue;
if (!IA_ISDIR (iattr.ia_type))
@@ -964,6 +1379,10 @@ _process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries,
}
ret = 0;
out:
+ if ((crawl_data->crawl == INDEX_TO_BE_HEALED) && ret) {
+ gf_log (this->name, GF_LOG_ERROR,"Failed to get the hardlink "
+ "count");
+ }
loc_wipe (&entry_loc);
return ret;
}
@@ -1011,6 +1430,9 @@ _crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data)
ret = _process_entries (this, loc, &entries, &offset,
crawl_data);
+ if ((ret < 0) && (crawl_data->crawl == INDEX_TO_BE_HEALED)) {
+ goto out;
+ }
gf_dirent_free (&entries);
free_entries = _gf_false;
}
@@ -1052,7 +1474,7 @@ afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos)
ret = syncop_getxattr (priv->children[child], &loc, &xattr_rsp,
GF_XATTR_NODE_UUID_KEY);
- if (ret) {
+ if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s - "
"(%s)", priv->children[child]->name, strerror (errno));
goto out;
@@ -1116,8 +1538,13 @@ afr_dir_crawl (void *data)
goto out;
ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc);
- if (ret)
+ if (ret) {
+ if (crawl_data->crawl == INDEX_TO_BE_HEALED) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to open base_"
+ "indices_holder");
+ }
goto out;
+ }
ret = _crawl_directory (fd, &dirloc, crawl_data);
if (ret)
@@ -1131,12 +1558,102 @@ afr_dir_crawl (void *data)
out:
if (fd)
fd_unref (fd);
- if (crawl_data->crawl == INDEX)
+ if ((crawl_data->crawl == INDEX) ||
+ (crawl_data->crawl == INDEX_TO_BE_HEALED ))
dirloc.path = NULL;
loc_wipe (&dirloc);
return ret;
}
+char *
+get_crawl_type_in_string (afr_crawl_type_t crawl)
+{
+ char *index = "INDEX";
+ char *full = "FULL";
+ char *crawl_type = NULL;
+
+ if (crawl == INDEX){
+ crawl_type = index;
+ } else if (crawl == FULL) {
+ crawl_type = full;
+ }
+
+ return crawl_type;
+}
+
+static int
+afr_allocate_crawl_event (xlator_t *this, int child, afr_crawl_type_t crawl)
+{
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int ret = 0;
+ shd_crawl_event_t *crawl_event = NULL;
+ time_t get_time = 0;
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ crawl_event = GF_CALLOC (sizeof (shd_crawl_event_t), 1,
+ gf_afr_mt_shd_crawl_event_t);
+ if (!crawl_event) {
+ ret = -1;
+ goto out;
+ }
+
+ get_time = time(NULL);
+ if (get_time == ((time_t)-1)) {
+ ret = -1;
+ goto out;
+ }
+
+ crawl_event->start_time_str = gf_strdup (ctime(&get_time));
+
+ crawl_event->crawl_type = get_crawl_type_in_string (crawl);
+ if (!crawl_event->crawl_type) {
+ ret = -1;
+ goto out;
+ }
+ LOCK (&priv->lock);
+ {
+ shd->crawl_events[child] = crawl_event;
+ }
+ UNLOCK (&priv->lock);
+ ret = 0;
+out:
+ return ret;
+
+}
+
+static int
+afr_put_crawl_event_in_eh (xlator_t *this, int child)
+{
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int ret = 0;
+ time_t get_time = 0;
+ shd_crawl_event_t **crawl_event = NULL;
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ get_time = time(NULL);
+ if (get_time == ((time_t)-1)) {
+ ret = -1;
+ goto out;
+ }
+ crawl_event = (shd_crawl_event_t**)shd->crawl_events;
+ LOCK (&priv->lock);
+ {
+ crawl_event[child]->end_time_str = gf_strdup (ctime(&get_time));
+ ret = eh_save_history (shd->statistics[child],
+ crawl_event[child]);
+ crawl_event[child] = NULL;
+ }
+ UNLOCK (&priv->lock);
+out:
+ return ret;
+}
+
static int
afr_dir_exclusive_crawl (void *data)
{
@@ -1172,7 +1689,15 @@ afr_dir_exclusive_crawl (void *data)
}
do {
+ ret = afr_allocate_crawl_event (this, child, crawl_data->crawl);
+ if (ret)
+ goto out;
afr_dir_crawl (data);
+
+ ret = afr_put_crawl_event_in_eh (this, child);
+ if (ret < 0)
+ goto out;
+
LOCK (&priv->lock);
{
if (shd->pending[child] != NONE) {
@@ -1229,8 +1754,8 @@ afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl,
ret = synctask_new (this->ctx->env, crawler,
crawl_done, frame, crawl_data);
if (ret)
- gf_log (this->name, GF_LOG_ERROR, "Could not create the "
- "task for %d ret %d", idx, ret);
+ gf_log (this->name, GF_LOG_ERROR, "afr crawl failed for child"
+ " %d with ret %d", idx, ret);
out:
return;
}
@@ -1260,4 +1785,3 @@ afr_set_root_gfid (dict_t *dict)
return ret;
}
-
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
index 32a8aaca5..e0c083754 100644
--- a/xlators/cluster/afr/src/afr-self-heald.h
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -29,6 +29,19 @@ typedef struct afr_crawl_data_ {
struct iatt *iattr);
} afr_crawl_data_t;
+typedef struct crawl_event_stats_ {
+ uint64_t healed_count;
+ uint64_t split_brain_count;
+ uint64_t heal_failed_count;
+ char *start_time_str;
+ char *end_time_str;
+ char *crawl_type;
+ gf_boolean_t crawl_inprogress;
+} shd_crawl_event_t;
+
+void _destroy_crawl_event_data (void *data);
+void _destroy_shd_event_data (void *data);
+
typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data,
gf_dirent_t *entry, loc_t *child, loc_t *parent,
struct iatt *iattr);
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 5e636d8dc..20306e469 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -79,7 +79,7 @@ afr_save_lk_owner (call_frame_t *frame)
local = frame->local;
- local->saved_lk_owner = frame->root->lk_owner;
+ local->saved_lk_owner = frame->root->lk_owner;
}
@@ -90,10 +90,9 @@ afr_restore_lk_owner (call_frame_t *frame)
local = frame->local;
- frame->root->lk_owner = local->saved_lk_owner;
+ frame->root->lk_owner = local->saved_lk_owner;
}
-
static void
__mark_all_pending (int32_t *pending[], int child_count,
afr_transaction_type type)
@@ -146,34 +145,6 @@ out:
return;
}
-
-static void
-__mark_pre_op_undone_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
-{
- afr_local_t *local = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- local = frame->local;
-
- if (!local->fd)
- return;
-
- fd_ctx = afr_fd_ctx_get (local->fd, this);
-
- if (!fd_ctx)
- goto out;
-
- LOCK (&local->fd->lock);
- {
- if (local->transaction.type == AFR_DATA_TRANSACTION)
- fd_ctx->pre_op_done[child_index]--;
- }
- UNLOCK (&local->fd->lock);
-out:
- return;
-}
-
-
static void
__mark_non_participant_children (int32_t *pending[], int child_count,
unsigned char *participants,
@@ -190,7 +161,7 @@ __mark_non_participant_children (int32_t *pending[], int child_count,
}
-static void
+void
__mark_all_success (int32_t *pending[], int child_count,
afr_transaction_type type)
{
@@ -218,9 +189,12 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
+ fd_t *fd = NULL;
local = frame->local;
priv = this->private;
+ fd = local->fd;
+
__mark_all_success (local->pending, priv->child_count,
local->transaction.type);
@@ -234,6 +208,17 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
frame->root->lk_owner =
local->transaction.main_frame->root->lk_owner;
+
+ /* The wake up needs to happen independent of
+ what type of fop arrives here. If it was
+ a write, then it has already inherited the
+ lock and changelog. If it was not a write,
+ then the presumption of the optimization (of
+ optimizing for successive write operations)
+ fails.
+ */
+ if (fd)
+ afr_delayed_changelog_wake_up (this, fd);
local->transaction.fop (frame, this);
}
@@ -382,6 +367,11 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
UNLOCK (&frame->lock);
if (call_count == 0) {
+ if (local->transaction.resume_stub) {
+ call_resume (local->transaction.resume_stub);
+ local->transaction.resume_stub = NULL;
+ }
+
if (afr_lock_server_count (priv, local->transaction.type) == 0) {
local->transaction.done (frame, this);
} else {
@@ -452,19 +442,37 @@ out:
return;
}
+afr_inodelk_t*
+afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom)
+{
+ afr_inodelk_t *inodelk = NULL;
+ int i = 0;
+
+ for (i = 0; int_lock->inodelk[i].domain; i++) {
+ inodelk = &int_lock->inodelk[i];
+ if (strcmp (dom, inodelk->domain) == 0)
+ return inodelk;
+ }
+ return NULL;
+}
+
unsigned char*
afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
{
unsigned char *locked_nodes = NULL;
+ afr_inodelk_t *inodelk = NULL;
switch (type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- locked_nodes = int_lock->inode_locked_nodes;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ locked_nodes = inodelk->locked_nodes;
break;
case AFR_ENTRY_TRANSACTION:
case AFR_ENTRY_RENAME_TRANSACTION:
- locked_nodes = int_lock->entry_locked_nodes;
+ /*Because same set of subvols participate in all lockee
+ * entities*/
+ locked_nodes = int_lock->lockee[0].locked_nodes;
break;
}
return locked_nodes;
@@ -556,6 +564,82 @@ afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr,
"failed to set pending entry");
}
+
+gf_boolean_t
+afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int index = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ index = afr_index_for_transaction_type (local->transaction.type);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->pending[i][index] == 0)
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
+static void
+afr_dir_fop_handle_all_fop_failures (call_frame_t *frame)
+{
+ xlator_t *this = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ this = frame->this;
+ local = frame->local;
+ priv = this->private;
+
+ if ((local->transaction.type != AFR_ENTRY_TRANSACTION) &&
+ (local->transaction.type != AFR_ENTRY_RENAME_TRANSACTION))
+ return;
+
+ if (local->op_ret >= 0)
+ goto out;
+
+ __mark_all_success (local->pending, priv->child_count,
+ local->transaction.type);
+out:
+ return;
+}
+
+static void
+afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ gf_boolean_t all_quota_failures = _gf_false;
+
+ local = frame->local;
+ priv = this->private;
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return;
+ /*
+ * Idea is to not leave the file in FOOL-FOOL scenario in case on
+ * all the bricks data transaction failed with EDQUOT to avoid
+ * increasing un-necessary load of self-heals in the system.
+ */
+ all_quota_failures = _gf_true;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] &&
+ (local->child_errno[i] != EDQUOT)) {
+ all_quota_failures = _gf_false;
+ break;
+ }
+ }
+ if (all_quota_failures)
+ __mark_all_success (local->pending, priv->child_count,
+ local->transaction.type);
+}
+
int
afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
{
@@ -568,7 +652,6 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
afr_fd_ctx_t *fdctx = NULL;
dict_t **xattr = NULL;
int piggyback = 0;
- int index = 0;
int nothing_failed = 1;
local = frame->local;
@@ -578,6 +661,9 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
local->transaction.pre_op,
local->transaction.type);
+ afr_data_handle_quota_errors (frame, this);
+ afr_dir_fop_handle_all_fop_failures (frame);
+
if (local->fd)
afr_transaction_rm_stale_children (frame, this,
local->fd->inode,
@@ -604,15 +690,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
goto out;
}
- /* check if something has failed, to handle piggybacking */
- nothing_failed = 1;
- index = afr_index_for_transaction_type (local->transaction.type);
- for (i = 0; i < priv->child_count; i++) {
- if (local->pending[i][index] == 0) {
- nothing_failed = 0;
- break;
- }
- }
+ nothing_failed = afr_txn_nothing_failed (frame, this);
afr_compute_txn_changelog (local , priv);
@@ -638,15 +716,14 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
break;
}
- LOCK (&local->fd->lock);
- {
- piggyback = 0;
- if (fdctx->pre_op_piggyback[i]) {
- fdctx->pre_op_piggyback[i]--;
- piggyback = 1;
- }
- }
- UNLOCK (&local->fd->lock);
+ /* local->transaction.postop_piggybacked[] was
+ precomputed in is_piggyback_postop() when called from
+ afr_changelog_post_op_safe()
+ */
+
+ piggyback = 0;
+ if (local->transaction.postop_piggybacked[i])
+ piggyback = 1;
afr_set_postop_dict (local, this, xattr[i],
piggyback, i);
@@ -655,7 +732,6 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
afr_changelog_post_op_cbk (frame, (void *)(long)i,
this, 1, 0, xattr[i], NULL);
} else {
- __mark_pre_op_undone_on_fd (frame, this, i);
STACK_WIND_COOKIE (frame,
afr_changelog_post_op_cbk,
(void *) (long) i,
@@ -901,7 +977,7 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
}
UNLOCK (&local->fd->lock);
- afr_set_delayed_post_op (frame, this);
+ afr_set_delayed_post_op (frame, this);
if (piggyback)
afr_changelog_pre_op_cbk (frame, (void *)(long)i,
@@ -1174,12 +1250,14 @@ int
afr_set_transaction_flock (afr_local_t *local)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
int_lock = &local->internal_lock;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- int_lock->lk_flock.l_len = local->transaction.len;
- int_lock->lk_flock.l_start = local->transaction.start;
- int_lock->lk_flock.l_type = F_WRLCK;
+ inodelk->flock.l_len = local->transaction.len;
+ inodelk->flock.l_start = local->transaction.start;
+ inodelk->flock.l_type = F_WRLCK;
return 0;
}
@@ -1194,6 +1272,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
int_lock->transaction_lk_type = AFR_TRANSACTION_LK;
+ int_lock->domain = this->name;
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
@@ -1207,8 +1286,8 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
case AFR_ENTRY_RENAME_TRANSACTION:
- int_lock->lock_cbk = afr_post_blocking_rename_cbk;
- afr_blocking_lock (frame, this);
+ int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
+ afr_nonblocking_entrylk (frame, this);
break;
case AFR_ENTRY_TRANSACTION:
@@ -1254,84 +1333,384 @@ afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
void
afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- /* call this function from any of the related optimizations
- which benefit from delaying post op are enabled, namely:
+ /* call this function from any of the related optimizations
+ which benefit from delaying post op are enabled, namely:
- - changelog piggybacking
- - eager locking
- */
+ - changelog piggybacking
+ - eager locking
+ */
- priv = this->private;
- if (!priv)
- return;
+ priv = this->private;
+ if (!priv)
+ return;
- if (!priv->post_op_delay_secs)
- return;
+ if (!priv->post_op_delay_secs)
+ return;
- local = frame->local;
- if (!local)
- return;
+ local = frame->local;
+ if (!local->transaction.eager_lock_on)
+ return;
- if (!local->fd)
- return;
+ if (!local)
+ return;
- if (local->op == GF_FOP_WRITE)
- local->delayed_post_op = _gf_true;
+ if (!local->fd)
+ return;
+
+ if (local->op == GF_FOP_WRITE)
+ local->delayed_post_op = _gf_true;
}
+gf_boolean_t
+afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this)
+{
+ afr_inode_ctx_t *ictx = NULL;
+
+ if (!inode) {
+ /* If false is returned, it may keep on taking eager-lock
+ * which may lead to starvation, so return true to avoid that.
+ */
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid inode");
+ return _gf_true;
+ }
+ /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
+ * is taken mount2 opened the same file, it won't be able to
+ * perform any data operations until mount1 releases eager-lock.
+ * To avoid such scenario do not enable eager-lock for this transaction
+ * if open-fd-count is > 1
+ */
+
+ ictx = afr_inode_ctx_get (inode, this);
+ if (!ictx)
+ return _gf_true;
+
+ if (ictx->open_fd_count > 1)
+ return _gf_true;
+
+ return _gf_false;
+}
+
+gf_boolean_t
+afr_any_fops_failed (afr_local_t *local, afr_private_t *priv)
+{
+ if (local->success_count != priv->child_count)
+ return _gf_true;
+ return _gf_false;
+}
gf_boolean_t
is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- gf_boolean_t res = _gf_false;
+ afr_local_t *local = NULL;
+ gf_boolean_t res = _gf_false;
+ afr_private_t *priv = NULL;
- local = frame->local;
- if (!local)
- goto out;
+ priv = this->private;
+
+ local = frame->local;
+ if (!local)
+ goto out;
- if (!local->delayed_post_op)
- goto out;
+ if (!local->delayed_post_op)
+ goto out;
- res = _gf_true;
+ //Mark pending changelog ASAP
+ if (afr_any_fops_failed (local, priv))
+ goto out;
+
+ if (local->fd && afr_are_multiple_fds_opened (local->fd->inode, this))
+ goto out;
+
+ res = _gf_true;
out:
- return res;
+ return res;
}
void
-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd);
+afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
+ call_stub_t *stub);
void
afr_delayed_changelog_wake_up_cbk (void *data)
{
- fd_t *fd = NULL;
+ fd_t *fd = NULL;
- fd = data;
+ fd = data;
- afr_delayed_changelog_wake_up (THIS, fd);
+ afr_delayed_changelog_wake_up (THIS, fd);
+}
+
+
+/*
+ Check if the frame is destined to get optimized away
+ with changelog piggybacking
+*/
+static gf_boolean_t
+is_piggyback_post_op (call_frame_t *frame, fd_t *fd)
+{
+ afr_fd_ctx_t *fdctx = NULL;
+ afr_local_t *local = NULL;
+ gf_boolean_t piggyback = _gf_true;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = frame->this->private;
+ local = frame->local;
+ fdctx = afr_fd_ctx_get (fd, frame->this);
+
+ LOCK(&fd->lock);
+ {
+ piggyback = _gf_true;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
+ if (fdctx->pre_op_piggyback[i]) {
+ fdctx->pre_op_piggyback[i]--;
+ local->transaction.postop_piggybacked[i] = 1;
+ } else {
+ /* For at least _one_ subvolume we cannot
+ piggyback on the changelog, and have to
+ perform a hard POST-OP and therefore fsync
+ if necesssary
+ */
+ piggyback = _gf_false;
+ GF_ASSERT (fdctx->pre_op_done[i]);
+ fdctx->pre_op_done[i]--;
+ }
+ }
+ }
+ UNLOCK(&fd->lock);
+
+ if (!afr_txn_nothing_failed (frame, frame->this)) {
+ /* something failed in this transaction,
+ we will be performing a hard post-op
+ */
+ return _gf_false;
+ }
+
+ return piggyback;
+}
+
+
+/* SET operation */
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
+{
+ afr_fd_ctx_t *fdctx = NULL;
+
+ fdctx = afr_fd_ctx_get (fd, this);
+
+ LOCK(&fd->lock);
+ {
+ fdctx->witnessed_unstable_write = _gf_true;
+ }
+ UNLOCK(&fd->lock);
+
+ return 0;
+}
+
+/* TEST and CLEAR operation */
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
+{
+ afr_fd_ctx_t *fdctx = NULL;
+ gf_boolean_t witness = _gf_false;
+
+ fdctx = afr_fd_ctx_get (fd, this);
+ if (!fdctx)
+ return _gf_true;
+
+ LOCK(&fd->lock);
+ {
+ if (fdctx->witnessed_unstable_write) {
+ witness = _gf_true;
+ fdctx->witnessed_unstable_write = _gf_false;
+ }
+ }
+ UNLOCK (&fd->lock);
+
+ return witness;
+}
+
+
+int
+afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (afr_fop_failed (op_ret, op_errno)) {
+ /* Failure of fsync() is as good as failure of previous
+ write(). So treat it like one.
+ */
+ gf_log (this->name, GF_LOG_WARNING,
+ "fsync(%s) failed on subvolume %s. Transaction was %s",
+ uuid_utoa (local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ gf_fop_list[local->op]);
+
+ afr_transaction_fop_failed (frame, this, child_index);
+ }
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_changelog_post_op_now (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_changelog_fsync (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ GF_UNUSED int ret = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
+ priv->child_count);
+
+ if (!call_count) {
+ /* will go straight to unlock */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ xdata = dict_new();
+ if (xdata)
+ ret = dict_set_int32 (xdata, "batch-fsync", 1);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
+
+ STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->fsync, local->fd,
+ 1, xdata);
+ if (!--call_count)
+ break;
+ }
+
+ if (xdata)
+ dict_unref (xdata);
+
+ return 0;
+}
+
+
+ int
+afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ if (is_piggyback_post_op (frame, local->fd)) {
+ /* just detected that this post-op is about to
+ be optimized away as a new write() has
+ already piggybacked on this frame's changelog.
+ */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ /* Calling afr_changelog_post_op_now() now will result in
+ issuing ->[f]xattrop().
+
+ Performing a hard POST-OP (->[f]xattrop() FOP) is a more
+ responsible operation that what it might appear on the surface.
+
+ The changelog of a file (in the xattr of the file on the server)
+ stores information (pending count) about the state of the file
+ on the OTHER server. This changelog is blindly trusted, and must
+ therefore be updated in such a way it remains trustworthy. This
+ implies that decrementing the pending count (essentially "clearing
+ the dirty flag") must be done STRICTLY after we are sure that the
+ operation on the other server has reached stable storage.
+
+ While the backend filesystem on that server will eventually flush
+ it to stable storage, we (being in userspace) have no mechanism
+ to get notified when the write became "stable".
+
+ This means we need take matter into our own hands and issue an
+ fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
+ and get an acknowledgement for it. And we need to wait for the
+ fsync() acknowledgement before initiating the hard POST-OP.
+
+ However if the FD itself was opened in O_SYNC or O_DSYNC then
+ we are already guaranteed that the writes were made stable as
+ part of the FOP itself. The same holds true for NFS stable
+ writes which happen on an anonymous FD with O_DSYNC or O_SYNC
+ flag set in the writev() @flags param. For all other write types,
+ mark a flag in the fdctx whenever an unstable write is witnessed.
+ */
+
+ if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ /* Check whether users want durability and perform fsync/post-op
+ * accordingly.
+ */
+ if (priv->ensure_durability) {
+ /* Time to fsync() */
+ afr_changelog_fsync (frame, this);
+ } else {
+ afr_changelog_post_op_now (frame, this);
+ }
+
+ return 0;
}
void
-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd)
+afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
+ call_stub_t *stub)
{
afr_fd_ctx_t *fd_ctx = NULL;
call_frame_t *prev_frame = NULL;
- struct timeval delta = {0, };
+ struct timespec delta = {0, };
afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
priv = this->private;
fd_ctx = afr_fd_ctx_get (fd, this);
if (!fd_ctx)
- return;
+ goto out;
delta.tv_sec = priv->post_op_delay_secs;
- delta.tv_usec = 0;
+ delta.tv_nsec = 0;
pthread_mutex_lock (&fd_ctx->delay_lock);
{
@@ -1350,8 +1729,13 @@ afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd)
unlock:
pthread_mutex_unlock (&fd_ctx->delay_lock);
+out:
if (prev_frame) {
- afr_changelog_post_op_now (prev_frame, this);
+ local = prev_frame->local;
+ local->transaction.resume_stub = stub;
+ afr_changelog_post_op_safe (prev_frame, this);
+ } else if (stub) {
+ call_resume (stub);
}
}
@@ -1359,49 +1743,62 @@ unlock:
void
afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (is_afr_delayed_changelog_post_op_needed (frame, this))
- afr_delayed_changelog_post_op (this, frame, local->fd);
- else
- afr_changelog_post_op_now (frame, this);
+ if (is_afr_delayed_changelog_post_op_needed (frame, this))
+ afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
+ else
+ afr_changelog_post_op_safe (frame, this);
+}
+
+
+
+/* Wake up the sleeping/delayed post-op, and also register
+ a stub to have it resumed after this transaction
+ completely finishes.
+
+ The @stub gets saved in @local and gets resumed in
+ afr_local_cleanup()
+ */
+void
+afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
+{
+ afr_delayed_changelog_post_op (this, NULL, fd, stub);
}
void
afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd)
{
- afr_delayed_changelog_post_op (this, NULL, fd);
+ afr_delayed_changelog_post_op (this, NULL, fd, NULL);
}
-int
+ int
afr_transaction_resume (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- fd_t *fd = NULL;
local = frame->local;
int_lock = &local->internal_lock;
priv = this->private;
- fd = local->fd;
- if (fd)
- /* The wake up needs to happen independent of
- what type of fop arrives here. If it was
- a write, then it has already inherited the
- lock and changelog. If it was not a write,
- then the presumption of the optimization (of
- optimizing for successive write operations)
- fails.
- */
- afr_delayed_changelog_wake_up (this, fd);
+ if (local->transaction.eager_lock_on) {
+ /* We don't need to retain "local" in the
+ fd list anymore, writes to all subvols
+ are finished by now */
+ LOCK (&local->fd->lock);
+ {
+ list_del_init (&local->transaction.eager_locked);
+ }
+ UNLOCK (&local->fd->lock);
+ }
- afr_restore_lk_owner (frame);
+ afr_restore_lk_owner (frame);
if (__fop_changelog_needed (frame, this)) {
afr_changelog_post_op (frame, this);
@@ -1423,7 +1820,8 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)
*/
void
-afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index)
+afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
+ int child_index)
{
afr_local_t * local = NULL;
afr_private_t * priv = NULL;
@@ -1432,22 +1830,92 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index
priv = this->private;
__mark_child_dead (local->pending, priv->child_count,
- child_index, local->transaction.type);
+ child_index, local->transaction.type);
}
-gf_boolean_t
-_does_transaction_conflict_with_delayed_post_op (call_frame_t *frame)
+
+
+ static gf_boolean_t
+afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
{
- afr_local_t *local = frame->local;
- //check if it is going to compete with inode lock from the fd
- if (local->fd)
- return _gf_false;
- if ((local->transaction.type == AFR_DATA_TRANSACTION) ||
- (local->transaction.type == AFR_METADATA_TRANSACTION))
- return _gf_true;
- return _gf_false;
+ uint64_t start1 = local1->transaction.start;
+ uint64_t start2 = local2->transaction.start;
+ uint64_t end1 = 0;
+ uint64_t end2 = 0;
+
+ if (local1->transaction.len)
+ end1 = start1 + local1->transaction.len - 1;
+ else
+ end1 = ULLONG_MAX;
+
+ if (local2->transaction.len)
+ end2 = start2 + local2->transaction.len - 1;
+ else
+ end2 = ULLONG_MAX;
+
+ return ((end1 >= start2) && (end2 >= start1));
}
+void
+afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fdctx = NULL;
+ afr_local_t *each = NULL;
+
+ priv = this->private;
+
+ if (!local->fd)
+ return;
+
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return;
+
+ if (!priv->eager_lock)
+ return;
+
+ fdctx = afr_fd_ctx_get (local->fd, this);
+ if (!fdctx)
+ return;
+
+ if (afr_are_multiple_fds_opened (local->fd->inode, this))
+ return;
+ /*
+ * Once full file lock is acquired in eager-lock phase, overlapping
+ * writes do not compete for inode-locks, instead are transferred to the
+ * next writes. Because of this overlapping writes are not ordered.
+ * This can cause inconsistencies in replication.
+ * Example:
+ * Two overlapping writes w1, w2 are sent in parallel on same fd
+ * in two threads t1, t2.
+ * Both threads can execute afr_writev_wind in the following manner.
+ * t1 winds w1 on brick-0
+ * t2 winds w2 on brick-0
+ * t2 winds w2 on brick-1
+ * t1 winds w1 on brick-1
+ *
+ * This check makes sure the locks are not transferred for
+ * overlapping writes.
+ */
+ LOCK (&local->fd->lock);
+ {
+ list_for_each_entry (each, &fdctx->eager_locked,
+ transaction.eager_locked) {
+ if (afr_locals_overlap (each, local)) {
+ local->transaction.eager_lock_on = _gf_false;
+ goto unlock;
+ }
+ }
+
+ local->transaction.eager_lock_on = _gf_true;
+ list_add_tail (&local->transaction.eager_locked,
+ &fdctx->eager_locked);
+ }
+unlock:
+ UNLOCK (&local->fd->lock);
+}
+
+
int
afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
{
@@ -1459,23 +1927,25 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
local = frame->local;
priv = this->private;
+ local->transaction.resume = afr_transaction_resume;
+ local->transaction.type = type;
+
ret = afr_transaction_local_init (local, this);
- if (ret < 0) {
+ if (ret < 0)
goto out;
- }
- local->transaction.resume = afr_transaction_resume;
- local->transaction.type = type;
+ afr_transaction_eager_lock_init (local, this);
- if (local->fd && priv->eager_lock &&
- local->transaction.type == AFR_DATA_TRANSACTION)
+ if (local->fd && local->transaction.eager_lock_on)
afr_set_lk_owner (frame, this, local->fd);
else
afr_set_lk_owner (frame, this, frame->root);
- if (_does_transaction_conflict_with_delayed_post_op (frame) &&
- local->loc.inode) {
+ if (!local->transaction.eager_lock_on && local->loc.inode) {
fd = fd_lookup (local->loc.inode, frame->root->pid);
+ if (fd == NULL)
+ fd = fd_lookup_anonymous (local->loc.inode);
+
if (fd) {
afr_delayed_changelog_wake_up (this, fd);
fd_unref (fd);
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index e95bc5b14..fa626fd0d 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -23,6 +23,9 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
int
afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);
+afr_inodelk_t*
+afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom);
+
int32_t
afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
@@ -37,4 +40,12 @@ afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this);
void
afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd);
+void
+__mark_all_success (int32_t *pending[], int child_count,
+ afr_transaction_type type);
+gf_boolean_t
+afr_any_fops_failed (afr_local_t *local, afr_private_t *priv);
+
+gf_boolean_t
+afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this);
#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index bee10fd01..c724eb2ae 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -191,6 +191,8 @@ reconfigure (xlator_t *this, dict_t *options)
/* Reset this so we re-discover in case the topology changed. */
GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options,
bool, out);
+ GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options,
+ bool, out);
priv->did_discovery = _gf_false;
ret = 0;
@@ -335,6 +337,8 @@ init (xlator_t *this)
GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out);
GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out);
+ GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool,
+ out);
priv->wait_count = 1;
@@ -376,8 +380,6 @@ init (xlator_t *this)
AFR_XATTR_PREFIX,
trav->xlator->name);
if (-1 == ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "asprintf failed to set pending key");
ret = -ENOMEM;
goto out;
}
@@ -386,6 +388,13 @@ init (xlator_t *this)
i++;
}
+ ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT,
+ this->name);
+ if (-1 == ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event),
gf_afr_mt_int32_t);
if (!priv->last_event) {
@@ -430,15 +439,18 @@ init (xlator_t *this)
if (!priv->shd.timer)
goto out;
- priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false);
+ priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false,
+ _destroy_shd_event_data);
if (!priv->shd.healed)
goto out;
- priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false);
+ priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false,
+ _destroy_shd_event_data);
if (!priv->shd.heal_failed)
goto out;
- priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false);
+ priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false,
+ _destroy_shd_event_data);
if (!priv->shd.split_brain)
goto out;
@@ -448,7 +460,9 @@ init (xlator_t *this)
priv->root_inode = inode_ref (this->itable->root);
GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out);
GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out);
-
+ ret = afr_initialise_statistics (this);
+ if (ret)
+ goto out;
ret = 0;
out:
return ret;
@@ -483,6 +497,9 @@ struct xlator_fops fops = {
.finodelk = afr_finodelk,
.entrylk = afr_entrylk,
.fentrylk = afr_fentrylk,
+ .fallocate = afr_fallocate,
+ .discard = afr_discard,
+ .zerofill = afr_zerofill,
/* inode read */
.access = afr_access,
@@ -765,5 +782,12 @@ struct volume_options options[] = {
.description = "readdir(p) will not failover if this option is off",
.default_value = "on",
},
+ { .key = {"ensure-durability"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .description = "Afr performs fsyncs for transactions if this "
+ "option is on to make sure the changelogs/data is "
+ "written to the disk",
+ .default_value = "on",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index da9cc67c6..21064db58 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -28,6 +28,10 @@
#define AFR_XATTR_PREFIX "trusted.afr"
#define AFR_PATHINFO_HEADER "REPLICATE:"
#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
+#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
+
+#define AFR_LOCKEE_COUNT_MAX 3
+#define AFR_DOM_COUNT_MAX 3
struct _pump_private;
@@ -58,10 +62,8 @@ typedef enum {
AFR_INODE_SET_READ_CTX = 1,
AFR_INODE_RM_STALE_CHILDREN,
AFR_INODE_SET_OPENDIR_DONE,
- AFR_INODE_SET_SPLIT_BRAIN,
AFR_INODE_GET_READ_CTX,
AFR_INODE_GET_OPENDIR_DONE,
- AFR_INODE_GET_SPLIT_BRAIN,
} afr_inode_op_t;
typedef struct afr_inode_params_ {
@@ -75,29 +77,41 @@ typedef struct afr_inode_params_ {
} u;
} afr_inode_params_t;
+typedef enum afr_spb_state {
+ DONT_KNOW,
+ SPB,
+ NO_SPB
+} afr_spb_state_t;
+
typedef struct afr_inode_ctx_ {
uint64_t masks;
int32_t *fresh_children;//increasing order of latency
+ afr_spb_state_t mdata_spb;
+ afr_spb_state_t data_spb;
+ uint32_t open_fd_count;
} afr_inode_ctx_t;
typedef enum {
NONE,
INDEX,
+ INDEX_TO_BE_HEALED,
FULL,
} afr_crawl_type_t;
typedef struct afr_self_heald_ {
- gf_boolean_t enabled;
- gf_boolean_t iamshd;
- afr_crawl_type_t *pending;
- gf_boolean_t *inprogress;
- afr_child_pos_t *pos;
- gf_timer_t **timer;
- eh_t *healed;
- eh_t *heal_failed;
- eh_t *split_brain;
- char *node_uuid;
- int timeout;
+ gf_boolean_t enabled;
+ gf_boolean_t iamshd;
+ afr_crawl_type_t *pending;
+ gf_boolean_t *inprogress;
+ afr_child_pos_t *pos;
+ gf_timer_t **timer;
+ eh_t *healed;
+ eh_t *heal_failed;
+ eh_t *split_brain;
+ eh_t **statistics;
+ void **crawl_events;
+ char *node_uuid;
+ int timeout;
} afr_self_heald_t;
typedef struct _afr_private {
@@ -162,9 +176,38 @@ typedef struct _afr_private {
gf_boolean_t did_discovery;
gf_boolean_t readdir_failover;
uint64_t sh_readdir_size;
+ gf_boolean_t ensure_durability;
+ char *sh_domain;
} afr_private_t;
+typedef enum {
+ AFR_SELF_HEAL_NOT_ATTEMPTED,
+ AFR_SELF_HEAL_STARTED,
+ AFR_SELF_HEAL_FAILED,
+ AFR_SELF_HEAL_SYNC_BEGIN,
+} afr_self_heal_status;
+
typedef struct {
+ afr_self_heal_status gfid_or_missing_entry_self_heal;
+ afr_self_heal_status metadata_self_heal;
+ afr_self_heal_status data_self_heal;
+ afr_self_heal_status entry_self_heal;
+} afr_sh_status_for_all_type;
+
+typedef enum {
+ AFR_SELF_HEAL_ENTRY,
+ AFR_SELF_HEAL_METADATA,
+ AFR_SELF_HEAL_DATA,
+ AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY,
+ AFR_SELF_HEAL_INVALID = -1,
+} afr_self_heal_type;
+
+typedef enum {
+ AFR_CHECK_ALL,
+ AFR_CHECK_SPECIFIC,
+} afr_sh_fail_check_type;
+
+struct afr_self_heal_ {
/* External interface: These are variables (some optional) that
are set by whoever has triggered self-heal */
@@ -241,10 +284,10 @@ typedef struct {
const char *linkname;
gf_boolean_t entries_skipped;
- int op_failed;
gf_boolean_t actual_sh_started;
gf_boolean_t sync_done;
gf_boolean_t data_lock_held;
+ gf_boolean_t sh_dom_lock_held;
gf_boolean_t eof_reached;
fd_t *healing_fd;
int file_has_holes;
@@ -255,13 +298,17 @@ typedef struct {
uint8_t *checksum;
afr_post_remove_call_t post_remove_call;
- loc_t parent_loc;
+ char *data_sh_info;
+ char *metadata_sh_info;
+ loc_t parent_loc;
call_frame_t *orig_frame;
call_frame_t *old_loop_frame;
gf_boolean_t unwound;
afr_sh_algo_private_t *private;
+ afr_sh_status_for_all_type afr_all_sh_status;
+ afr_self_heal_type sh_type_in_action;
struct afr_sh_algorithm *algo;
afr_lock_cbk_t data_lock_success_handler;
@@ -272,11 +319,11 @@ typedef struct {
int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this);
int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this);
void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this);
- gf_boolean_t mdata_spb;
- gf_boolean_t data_spb;
call_frame_t *sh_frame;
-} afr_self_heal_t;
+};
+
+typedef struct afr_self_heal_ afr_self_heal_t;
typedef enum {
AFR_DATA_TRANSACTION, /* truncate, write, ... */
@@ -338,11 +385,31 @@ afr_index_for_transaction_type (afr_transaction_type type)
return -1; /* make gcc happy */
}
+typedef struct {
+ loc_t loc;
+ char *basename;
+ unsigned char *locked_nodes;
+ int locked_count;
+
+} afr_entry_lockee_t;
+
+int
+afr_entry_lockee_cmp (const void *l1, const void *l2);
+
+typedef struct {
+ char *domain; /* Domain on which inodelk is taken */
+ struct gf_flock flock;
+ unsigned char *locked_nodes;
+ int32_t lock_count;
+} afr_inodelk_t;
typedef struct {
loc_t *lk_loc;
- struct gf_flock lk_flock;
+ int lockee_count;
+ afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
+
+ afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX];
const char *lk_basename;
const char *lower_basename;
const char *higher_basename;
@@ -351,23 +418,22 @@ typedef struct {
unsigned char *locked_nodes;
unsigned char *lower_locked_nodes;
- unsigned char *inode_locked_nodes;
- unsigned char *entry_locked_nodes;
selfheal_lk_type_t selfheal_lk_type;
transaction_lk_type_t transaction_lk_type;
int32_t lock_count;
- int32_t inodelk_lock_count;
int32_t entrylk_lock_count;
uint64_t lock_number;
int32_t lk_call_count;
int32_t lk_expected_count;
+ int32_t lk_attempted_count;
int32_t lock_op_ret;
int32_t lock_op_errno;
afr_lock_cbk_t lock_cbk;
+ char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
} afr_internal_lock_t;
typedef struct _afr_locked_fd {
@@ -387,9 +453,11 @@ typedef struct _afr_local {
unsigned int call_count;
unsigned int success_count;
unsigned int enoent_count;
+ uint32_t open_fd_count;
+ gf_boolean_t update_open_fd_count;
- unsigned int govinda_gOvinda;
+ unsigned int unhealable;
unsigned int read_child_index;
unsigned char read_child_returned;
@@ -406,7 +474,6 @@ typedef struct _afr_local {
loc_t newloc;
fd_t *fd;
- unsigned char *fd_open_on;
glusterfs_fop_t fop;
@@ -430,12 +497,23 @@ typedef struct _afr_local {
int optimistic_change_log;
gf_boolean_t delayed_post_op;
- gf_boolean_t fop_paused;
- int (*fop_call_continue) (call_frame_t *frame, xlator_t *this);
- /*
- This struct contains the arguments for the "continuation"
- (scheme-like) of fops
+ /* Is the current writev() going to perform a stable write?
+ i.e, is fd->flags or @flags writev param have O_SYNC or
+ O_DSYNC?
+ */
+ gf_boolean_t stable_write;
+
+ /* This write appended to the file. Nnot necessarily O_APPEND,
+ just means the offset of write was at the end of file.
+ */
+ gf_boolean_t append_write;
+
+ int allow_sh_for_running_transaction;
+
+
+ /* This struct contains the arguments for the "continuation"
+ (scheme-like) of fops
*/
int op;
@@ -531,7 +609,9 @@ typedef struct _afr_local {
struct {
struct iatt prebuf;
struct iatt postbuf;
+ } inode_wfop; //common structure for all inode-write-fops
+ struct {
int32_t op_ret;
struct iovec *vector;
@@ -542,34 +622,21 @@ typedef struct _afr_local {
} writev;
struct {
- struct iatt prebuf;
- struct iatt postbuf;
- } fsync;
-
- struct {
off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
} truncate;
struct {
off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
} ftruncate;
struct {
struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
} setattr;
struct {
struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
} fsetattr;
struct {
@@ -631,11 +698,32 @@ typedef struct _afr_local {
dict_t *params;
char *linkpath;
} symlink;
+
+ struct {
+ int32_t mode;
+ off_t offset;
+ size_t len;
+ } fallocate;
+
+ struct {
+ off_t offset;
+ size_t len;
+ } discard;
+
+ struct {
+ off_t offset;
+ size_t len;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } zerofill;
+
+
} cont;
struct {
off_t start, len;
+ gf_boolean_t eager_lock_on;
int *eager_lock;
char *basename;
@@ -646,6 +734,17 @@ typedef struct _afr_local {
afr_transaction_type type;
+ /* pre-compute the post piggyback status before
+ entering POST-OP phase
+ */
+ int *postop_piggybacked;
+
+ /* stub to resume on destruction
+ of the transaction frame */
+ call_stub_t *resume_stub;
+
+ struct list_head eager_locked;
+
int32_t **txn_changelog;//changelog after pre+post ops
unsigned char *pre_op;
@@ -683,11 +782,6 @@ typedef enum {
} afr_fd_open_status_t;
typedef struct {
- struct list_head call_list;
- call_frame_t *frame;
-} afr_fd_paused_call_t;
-
-typedef struct {
unsigned int *pre_op_done;
afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
unsigned int *pre_op_piggyback;
@@ -706,13 +800,20 @@ typedef struct {
struct list_head entries; /* needed for readdir failover */
unsigned char *locked_on; /* which subvolumes locks have been successful */
- struct list_head paused_calls; /* queued calls while fix_open happens */
/* used for delayed-post-op optimization */
pthread_mutex_t delay_lock;
gf_timer_t *delay_timer;
call_frame_t *delay_frame;
int call_child;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
+
+ /* list of frames currently in progress */
+ struct list_head eager_locked;
} afr_fd_ctx_t;
@@ -748,6 +849,13 @@ int32_t
afr_notify (xlator_t *this, int32_t event, void *data, void *data2);
int
+afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local,
+ loc_t *loc, char *basename, int child_count);
+
+void
+afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock);
+
+int
afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);
int
@@ -782,8 +890,8 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this);
int
afr_internal_lock_finish (call_frame_t *frame, xlator_t *this);
-void
-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src,
+int
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
unsigned int child_count);
int pump_start (call_frame_t *frame, xlator_t *this);
@@ -833,7 +941,8 @@ gf_boolean_t
afr_is_split_brain (xlator_t *this, inode_t *inode);
void
-afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set);
+afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb,
+ afr_spb_state_t data_spb);
int
afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
@@ -948,8 +1057,9 @@ afr_children_rm_child (int32_t *children, int32_t child,
int32_t child_count);
void
afr_reset_children (int32_t *children, int32_t child_count);
-gf_boolean_t
-afr_error_more_important (int32_t old_errno, int32_t new_errno);
+int32_t
+afr_most_important_error(int32_t old_errno, int32_t new_errno,
+ gf_boolean_t eio);
int
afr_errno_count (int32_t *children, int *child_errno,
unsigned int child_count, int32_t op_errno);
@@ -992,11 +1102,11 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode,
int (*unwind) (call_frame_t *frame, xlator_t *this,
int32_t op_ret, int32_t op_errno,
int32_t sh_failed));
-int
-afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx,
- int need_open_count, int *need_open);
-int
-afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop);
+void
+afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open);
+
+void
+afr_open_fd_fix (fd_t *fd, xlator_t *this);
int
afr_set_elem_count_get (unsigned char *elems, int child_count);
@@ -1030,6 +1140,9 @@ afr_is_errno_set (int *child_errno, int child);
gf_boolean_t
afr_is_errno_unset (int *child_errno, int child);
+gf_boolean_t
+afr_is_fd_fixable (fd_t *fd);
+
void
afr_prepare_new_entry_pending_matrix (int32_t **pending,
gf_boolean_t (*is_pending) (int *, int),
@@ -1056,4 +1169,44 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count);
} \
} while (0);
+
+#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO."
+
+#define AFR_SBRAIN_CHECK_FD(fd, label) do { \
+ if (fd->inode && afr_is_split_brain (this, fd->inode)) { \
+ op_errno = EIO; \
+ gf_log (this->name, GF_LOG_WARNING, \
+ AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid)); \
+ goto label; \
+ } \
+} while (0)
+
+#define AFR_SBRAIN_CHECK_LOC(loc, label) do { \
+ if (loc->inode && afr_is_split_brain (this, loc->inode)) { \
+ op_errno = EIO; \
+ loc_path (loc, NULL); \
+ gf_log (this->name, GF_LOG_WARNING, \
+ AFR_SBRAIN_MSG , loc->path); \
+ goto label; \
+ } \
+} while (0)
+
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
+
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
+
+void
+afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
+
+int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
+
+void
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
+
+afr_inode_ctx_t*
+afr_inode_ctx_get (inode_t *inode, xlator_t *this);
+
#endif /* __AFR_H__ */
diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c
index d3823383c..a7f72fb30 100644
--- a/xlators/cluster/afr/src/pump.c
+++ b/xlators/cluster/afr/src/pump.c
@@ -2443,6 +2443,7 @@ init (xlator_t *this)
priv->metadata_change_log = 1;
priv->entry_change_log = 1;
priv->use_afr_in_pump = 1;
+ priv->sh_readdir_size = 65536;
/* Locking options */
@@ -2500,6 +2501,12 @@ init (xlator_t *this)
i++;
}
+ ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name);
+ if (-1 == ret) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
priv->first_lookup = 1;
priv->root_inode = NULL;
@@ -2531,7 +2538,7 @@ init (xlator_t *this)
goto out;
}
- pump_priv->env = syncenv_new (0);
+ pump_priv->env = this->ctx->env;
if (!pump_priv->env) {
gf_log (this->name, GF_LOG_ERROR,
"Could not create new sync-environment");
@@ -2572,9 +2579,6 @@ fini (xlator_t *this)
if (!pump_priv)
goto afr_priv;
- if (pump_priv->env)
- syncenv_destroy (pump_priv->env);
-
GF_FREE (pump_priv->resume_path);
LOCK_DESTROY (&pump_priv->resume_path_lock);
LOCK_DESTROY (&pump_priv->pump_state_lock);
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
index b78af1ff3..174bea841 100644
--- a/xlators/cluster/dht/src/Makefile.am
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -4,7 +4,7 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \
dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \
- dht-common.c dht-inode-write.c dht-inode-read.c \
+ dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \
$(top_builddir)/xlators/lib/src/libxlator.c
dht_la_SOURCES = $(dht_common_source) dht.c
@@ -12,13 +12,13 @@ dht_la_SOURCES = $(dht_common_source) dht.c
nufa_la_SOURCES = $(dht_common_source) nufa.c
switch_la_SOURCES = $(dht_common_source) switch.c
-dht_la_LDFLAGS = -module -avoidversion
+dht_la_LDFLAGS = -module -avoid-version
dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-nufa_la_LDFLAGS = -module -avoidversion
+nufa_la_LDFLAGS = -module -avoid-version
nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-switch_la_LDFLAGS = -module -avoidversion
+switch_la_LDFLAGS = -module -avoid-version
switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
noinst_HEADERS = dht-common.h dht-mem-types.h \
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 70e737c89..8f61339e6 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -22,6 +22,7 @@
#include "dht-common.h"
#include "defaults.h"
#include "byte-order.h"
+#include "glusterfs-acl.h"
#include <sys/time.h>
#include <libgen.h>
@@ -62,6 +63,11 @@ dht_aggregate (dict_t *this, char *key, data_t *value, void *data)
}
*size = hton64 (ntoh64 (*size) + ntoh64 (*ptr));
+
+ } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) {
+ ret = gf_get_min_stime (THIS, dst, key, value);
+ if (ret < 0)
+ return ret;
} else {
/* compare user xattrs only */
if (!strncmp (key, "user.", strlen ("user."))) {
@@ -148,9 +154,11 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
int op_errno = 0;
int ret = -1;
dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
local = discover_frame->local;
layout = local->layout;
+ conf = this->private;
LOCK(&discover_frame->lock);
{
@@ -193,11 +201,14 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
"(overlaps/holes present: %s, "
"ENOENT errors: %d)", local->loc.path,
(ret < 0) ? "yes" : "no", (ret > 0) ? ret : 0);
- op_errno = EINVAL;
- goto out;
+ if ((ret > 0) && (ret == conf->subvolume_cnt)) {
+ op_errno = ESTALE;
+ goto out;
+ }
}
- dht_layout_set (this, local->inode, layout);
+ if (local->inode)
+ dht_layout_set (this, local->inode, layout);
}
DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno,
@@ -226,6 +237,7 @@ dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int is_dir = 0;
int is_linkfile = 0;
int attempt_unwind = 0;
+ dht_conf_t *conf = 0;
GF_VALIDATE_OR_GOTO ("dht", frame, out);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -235,6 +247,7 @@ dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
prev = cookie;
+ conf = this->private;
layout = local->layout;
@@ -269,7 +282,8 @@ dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto unlock;
}
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
is_dir = check_is_dir (inode, stbuf, xattr);
if (is_dir) {
@@ -328,23 +342,20 @@ dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc)
int i = 0;
call_frame_t *discover_frame = NULL;
-
conf = this->private;
local = frame->local;
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ ret = dict_set_uint32 (local->xattr_req, conf->xattr_name, 4 * 4);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set 'trusted.glusterfs.dht' key",
- loc->path);
+ "%s: failed to set '%s' key",
+ loc->path, conf->xattr_name);
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht.linkto", 256);
+ ret = dict_set_uint32 (local->xattr_req, conf->link_xattr_name, 256);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set 'trusted.glusterfs.dht.linkto' key",
- loc->path);
+ "%s: failed to set '%s' key",
+ loc->path, conf->link_xattr_name);
call_cnt = conf->subvolume_cnt;
local->call_cnt = call_cnt;
@@ -430,7 +441,7 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
op_ret, op_errno, xattr);
if (op_ret == -1) {
- local->op_errno = ENOENT;
+ local->op_errno = op_errno;
gf_log (this->name, GF_LOG_DEBUG,
"lookup of %s on %s returned error (%s)",
local->loc.path, prev->this->name,
@@ -585,7 +596,8 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
layout = local->layout;
is_dir = check_is_dir (inode, stbuf, xattr);
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
if (is_linkfile) {
gf_log (this->name, GF_LOG_INFO,
@@ -597,7 +609,7 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
if (is_dir) {
- ret = dht_dir_has_layout (xattr);
+ ret = dht_dir_has_layout (xattr, conf->xattr_name);
if (ret >= 0) {
if (is_greater_time(local->stbuf.ia_ctime,
local->stbuf.ia_ctime_nsec,
@@ -701,7 +713,7 @@ cont:
if (local->loc.parent) {
dht_inode_ctx_time_update (local->loc.parent, this,
- postparent, 1);
+ &local->postparent, 1);
}
DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
@@ -754,12 +766,15 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
local->stbuf.ia_prot.sticky = 1;
}
-unwind:
if (local->loc.parent) {
dht_inode_ctx_time_update (local->loc.parent, this,
postparent, 1);
}
+unwind:
+ if (local->linked == _gf_true)
+ dht_linkfile_attr_heal (frame, this);
+
DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
local->inode, &local->stbuf, local->xattr,
@@ -883,7 +898,7 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
hashed_subvol->name);
ret = dht_linkfile_create (frame,
- dht_lookup_linkfile_create_cbk,
+ dht_lookup_linkfile_create_cbk, this,
cached_subvol, hashed_subvol, &local->loc);
return ret;
@@ -921,8 +936,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
xlator_t *subvol = NULL;
loc_t *loc = NULL;
xlator_t *link_subvol = NULL;
- int ret = -1;
- int32_t fd_count = 0;
+ int ret = -1;
+ int32_t fd_count = 0;
+ dht_conf_t *conf = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, out);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -932,6 +948,7 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
loc = &local->loc;
+ conf = this->private;
prev = cookie;
subvol = prev->this;
@@ -953,7 +970,8 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
loc->path, prev->this->name);
}
- is_linkfile = check_is_linkfile (inode, buf, xattr);
+ is_linkfile = check_is_linkfile (inode, buf, xattr,
+ conf->link_xattr_name);
is_dir = check_is_dir (inode, buf, xattr);
if (is_linkfile) {
@@ -1095,7 +1113,16 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
gf_log (this->name, GF_LOG_INFO,
"lookup of %s on %s (following linkfile) failed (%s)",
local->loc.path, subvol->name, strerror (op_errno));
- goto err;
+
+ /* If cached subvol returned ENOTCONN, do not do
+ lookup_everywhere. We need to make sure linkfile does not get
+ removed, which can take away the namespace, and subvol is
+ anyways down. */
+
+ if (op_errno != ENOTCONN)
+ goto err;
+ else
+ goto unwind;
}
if (check_is_dir (inode, stbuf, xattr)) {
@@ -1105,7 +1132,7 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
goto err;
}
- if (check_is_linkfile (inode, stbuf, xattr)) {
+ if (check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) {
gf_log (this->name, GF_LOG_INFO,
"lookup of %s on %s (following linkfile) reached link",
local->loc.path, subvol->name);
@@ -1133,12 +1160,12 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
op_errno = EINVAL;
}
-unwind:
if (local->loc.parent) {
dht_inode_ctx_time_update (local->loc.parent, this,
postparent, 1);
}
+unwind:
DHT_STRIP_PHASE1_FLAGS (stbuf);
DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
postparent);
@@ -1282,7 +1309,8 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
if (!is_linkfile) {
/* non-directory and not a linkfile */
@@ -1322,7 +1350,7 @@ out:
* from each of the subvolume. See dht_iatt_merge for reference.
*/
- if (local->loc.parent) {
+ if (!op_ret && local->loc.parent) {
dht_inode_ctx_time_update (local->loc.parent, this,
postparent, 1);
}
@@ -1387,7 +1415,6 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
conf = this->private;
if (!conf)
@@ -1462,7 +1489,7 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
* revalidates directly go to the cached-subvolume.
*/
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (IA_ISDIR (local->inode->ia_type)) {
local->call_cnt = call_cnt = conf->subvolume_cnt;
@@ -1497,10 +1524,10 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
do_fresh_lookup:
/* TODO: remove the hard-coding */
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
ret = dict_set_uint32 (local->xattr_req,
- DHT_LINKFILE_KEY, 256);
+ conf->link_xattr_name, 256);
/* need it for self-healing linkfiles which is
'in-migration' state */
@@ -1608,7 +1635,8 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
LOCK (&frame->lock);
{
- if (op_ret == -1) {
+ if ((op_ret == -1) && !((op_errno == ENOENT) ||
+ (op_errno == ENOTCONN))) {
local->op_errno = op_errno;
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
@@ -1621,7 +1649,7 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
unlock:
UNLOCK (&frame->lock);
- if (op_ret == -1)
+ if (local->op_ret == -1)
goto err;
cached_subvol = dht_subvol_get_cached (this, local->loc.inode);
@@ -1645,41 +1673,6 @@ err:
return 0;
}
-static int
-dht_ufo_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
-
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_ret = -1;
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
- }
-unlock:
- UNLOCK (&frame->lock);
-
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- DHT_STACK_UNWIND (setxattr, frame, local->op_ret,
- local->op_errno, NULL);
- }
-
- return 0;
-}
-
-
int
dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
@@ -1752,115 +1745,222 @@ dht_fill_pathinfo_xattr (xlator_t *this, dht_local_t *local,
}
int
-dht_vgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this,
+ int op_errno)
{
- dht_local_t *local = NULL;
- int ret = 0;
- int flag = 0;
- int this_call_cnt = 0;
- char *value_got = NULL;
- char layout_buf[8192] = {0,};
- char *xattr_buf = NULL;
- dict_t *dict = NULL;
- int32_t alloc_len = 0;
- int32_t plen = 0;
- call_frame_t *prev = NULL;
+ int ret = -1;
+ char *value = NULL;
+ int32_t plen = 0;
- local = frame->local;
- prev = cookie;
+ ret = dict_get_str (xattr, local->xsel, &value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Subvolume %s returned -1 (%s)", this->name,
+ strerror (op_errno));
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto out;
+ }
- if (op_ret >= 0) {
- ret = dict_get_str (xattr, local->xsel, &value_got);
- if (!ret) {
- alloc_len = strlen (value_got);
+ local->alloc_len += strlen(value);
- /**
- * allocate the buffer:- we allocate 10 bytes extra in
- * case we need to append ' Link: ' in the buffer for
- * another STACK_WIND
- */
+ if (!local->xattr_val) {
+ local->alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10);
+ local->xattr_val = GF_CALLOC (local->alloc_len, sizeof (char),
+ gf_common_mt_char);
+ if (!local->xattr_val) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ if (local->xattr_val) {
+ plen = strlen (local->xattr_val);
+ if (plen) {
+ /* extra byte(s) for \0 to be safe */
+ local->alloc_len += (plen + 2);
+ local->xattr_val = GF_REALLOC (local->xattr_val,
+ local->alloc_len);
if (!local->xattr_val) {
- alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10);
- local->xattr_val =
- GF_CALLOC (alloc_len,
- sizeof (char),
- gf_common_mt_char);
+ ret = -1;
+ goto out;
}
+ }
- if (local->xattr_val) {
- plen = strlen (local->xattr_val);
- if (plen) {
- void *p;
- /* extra byte(s) for \0 to be safe */
- alloc_len += (plen + 2);
- p = GF_REALLOC (local->xattr_val,
- alloc_len);
- if (!p)
- goto out;
- local->xattr_val = p;
- }
+ (void) strcat (local->xattr_val, value);
+ (void) strcat (local->xattr_val, " ");
+ local->op_ret = 0;
+ }
- strcat (local->xattr_val, value_got);
- }
- local->op_ret = 0;
- }
+ ret = 0;
+
+ out:
+ return ret;
+}
+
+int
+dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this,
+ gf_boolean_t flag)
+{
+ int ret = -1;
+ char *xattr_buf = NULL;
+ char layout_buf[8192] = {0,};
+
+ if (flag)
+ fill_layout_info (local->layout, layout_buf);
+
+ *dict = dict_new ();
+ if (!*dict)
+ goto out;
+
+ local->xattr_val[strlen (local->xattr_val) - 1] = '\0';
+
+ /* we would need max this many bytes to create xattr string
+ * extra 40 bytes is just an estimated amount of additional
+ * space required as we include translator name and some
+ * spaces, brackets etc. when forming the pathinfo string.
+ *
+ * For node-uuid we just don't have all the pretty formatting,
+ * but since this is a generic routine for pathinfo & node-uuid
+ * we dont have conditional space allocation and try to be
+ * generic
+ */
+ local->alloc_len += (2 * strlen (this->name))
+ + strlen (layout_buf)
+ + 40;
+ xattr_buf = GF_CALLOC (local->alloc_len, sizeof (char),
+ gf_common_mt_char);
+ if (!xattr_buf)
+ goto out;
+
+ if (XATTR_IS_PATHINFO (local->xsel)) {
+ (void) dht_fill_pathinfo_xattr (this, local, xattr_buf,
+ local->alloc_len, flag,
+ layout_buf);
+ } else if (XATTR_IS_NODE_UUID (local->xsel)) {
+ (void) snprintf (xattr_buf, local->alloc_len, "%s",
+ local->xattr_val);
} else {
- local->op_ret = -1;
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_ERROR, "Subvolume %s returned -1 "
- "(%s)", prev->this->name, strerror (op_errno));
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unknown local->xsel (%s)", local->xsel);
+ goto out;
}
+ ret = dict_set_dynstr (*dict, local->xsel, xattr_buf);
+ GF_FREE (local->xattr_val);
+
out:
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
+ return ret;
+}
- if (local->op_ret == -1) {
- goto unwind;
- }
+int
+dht_vgetxattr_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ int ret = 0;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ dict_t *dict = NULL;
- if (local->layout->cnt > 1) {
- /* Set it for directory */
- fill_layout_info (local->layout, layout_buf);
- flag = 1;
- }
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (frame->local, out);
- dict = dict_new ();
-
- /* we would need max this many bytes to create xattr string */
- alloc_len += (2 * strlen (this->name))
- + strlen (layout_buf)
- + 40;
- xattr_buf = GF_CALLOC (alloc_len,
- sizeof (char), gf_common_mt_char);
-
- if (XATTR_IS_PATHINFO (local->xsel)) {
- (void) dht_fill_pathinfo_xattr (this, local, xattr_buf,
- alloc_len, flag,
- layout_buf);
- } else if (XATTR_IS_NODE_UUID (local->xsel)) {
- (void) snprintf (xattr_buf, alloc_len, "%s",
- local->xattr_val);
- } else
- gf_log (this->name, GF_LOG_WARNING,
- "Unknown local->xsel (%s)", local->xsel);
+ local = frame->local;
- ret = dict_set_dynstr (dict, local->xsel, xattr_buf);
+ LOCK (&frame->lock);
+ {
+ this_call_cnt = --local->call_cnt;
+ if (op_ret < 0) {
+ if (op_errno != ENOTCONN) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "getxattr err (%s) for dir",
+ strerror (op_errno));
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ }
- GF_FREE (local->xattr_val);
+ goto unlock;
+ }
- DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict,
- xdata);
+ ret = dht_vgetxattr_alloc_and_fill (local, xattr, this,
+ op_errno);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "alloc or fill failure");
+ }
+ unlock:
+ UNLOCK (&frame->lock);
- if (dict)
- dict_unref (dict);
+ if (!is_last_call (this_call_cnt))
+ goto out;
- return 0;
+ /* -- last call: do patch ups -- */
+
+ if (local->op_ret == -1) {
+ goto unwind;
}
-unwind:
+ ret = dht_vgetxattr_fill_and_set (local, &dict, this, _gf_true);
+ if (ret)
+ goto unwind;
+
+ DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata);
+ goto cleanup;
+
+ unwind:
DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, NULL);
+ cleanup:
+ if (dict)
+ dict_unref (dict);
+ out:
+ return 0;
+}
+
+int
+dht_vgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int ret = 0;
+ dict_t *dict = NULL;
+ call_frame_t *prev = NULL;
+ gf_boolean_t flag = _gf_true;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ gf_log (this->name, GF_LOG_ERROR, "Subvolume %s returned -1 "
+ "(%s)", prev->this->name, strerror (op_errno));
+ goto unwind;
+ }
+
+ ret = dht_vgetxattr_alloc_and_fill (local, xattr, this,
+ op_errno);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "alloc or fill failure");
+ goto unwind;
+ }
+
+ flag = (local->layout->cnt > 1) ? _gf_true : _gf_false;
+
+ ret = dht_vgetxattr_fill_and_set (local, &dict, this, flag);
+ if (ret)
+ goto unwind;
+
+ DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata);
+ goto cleanup;
+
+ unwind:
+ DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno,
+ NULL, NULL);
+ cleanup:
+ if (dict)
+ dict_unref (dict);
+
return 0;
}
@@ -1893,10 +1993,13 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
int this_call_cnt = 0;
dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (frame->local, out);
+ VALIDATE_OR_GOTO (this->private, out);
+ conf = this->private;
local = frame->local;
this_call_cnt = dht_frame_return (frame);
@@ -1904,8 +2007,8 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (!xattr || (op_ret == -1))
goto out;
- if (dict_get (xattr, "trusted.glusterfs.dht")) {
- dict_del (xattr, "trusted.glusterfs.dht");
+ if (dict_get (xattr, conf->xattr_name)) {
+ dict_del (xattr, conf->xattr_name);
}
local->op_ret = 0;
@@ -1938,9 +2041,72 @@ dht_getxattr_unwind (call_frame_t *frame,
int
+dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
+{
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+
+
+ local = frame->local;
+
+ if (op_ret != -1) {
+ if (local->xattr)
+ dict_unref (local->xattr);
+ local->xattr = dict_ref (xattr);
+
+ if (local->xattr_req)
+ dict_unref (local->xattr_req);
+ local->xattr_req = dict_ref (xdata);
+ }
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+ DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno,
+ local->xattr, local->xattr_req);
+ }
+
+ return 0;
+}
+
+
+int
+dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *key, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int i = 0;
+ dht_layout_t *layout = NULL;
+ int cnt = 0;
+ xlator_t *subvol = NULL;
+
+
+ local = frame->local;
+ layout = local->layout;
+
+ cnt = local->call_cnt = layout->cnt;
+
+ local->op_ret = -1;
+ local->op_errno = ENODATA;
+
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND (frame, dht_getxattr_get_real_filename_cbk,
+ subvol, subvol->fops->getxattr,
+ loc, key, xdata);
+ }
+
+ return 0;
+}
+
+
+int
dht_getxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *key, dict_t *xdata)
+#define DHT_IS_DIR(layout) (layout->cnt > 1)
{
+
xlator_t *subvol = NULL;
xlator_t *hashed_subvol = NULL;
xlator_t *cached_subvol = NULL;
@@ -1956,7 +2122,6 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -1984,8 +2149,40 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
}
}
- if (key && ((strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)
- || strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0)) {
+ if (key &&
+ (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY,
+ strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)
+ && DHT_IS_DIR(layout)) {
+ dht_getxattr_get_real_filename (frame, this, loc, key, xdata);
+ return 0;
+ }
+
+ /* for file use cached subvolume (obviously!): see if {}
+ * below
+ * for directory:
+ * wind to all subvolumes and exclude subvolumes which
+ * return ENOTCONN (in callback)
+ *
+ * NOTE: Don't trust inode here, as that may not be valid
+ * (until inode_link() happens)
+ */
+ if (key && DHT_IS_DIR(layout) &&
+ ((strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)
+ || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) {
+ (void) strncpy (local->xsel, key, 256);
+ cnt = local->call_cnt = layout->cnt;
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND (frame, dht_vgetxattr_dir_cbk,
+ subvol, subvol->fops->getxattr,
+ loc, key, NULL);
+ }
+ return 0;
+ }
+
+ /* node-uuid or pathinfo for files */
+ if (key && ((strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0)
+ || (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0))) {
cached_subvol = local->cached_subvol;
(void) strncpy (local->xsel, key, 256);
@@ -1995,6 +2192,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
+
if (key && (strcmp (key, GF_XATTR_LINKINFO_KEY) == 0)) {
hashed_subvol = dht_subvol_get_hashed (this, loc);
if (!hashed_subvol) {
@@ -2027,13 +2225,13 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
}
if (key && (!strcmp (GF_XATTR_MARKER_KEY, key))
- && (-1 == frame->root->pid)) {
-
- if (loc->inode-> ia_type == IA_IFDIR) {
+ && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
+ if (DHT_IS_DIR(layout)) {
cnt = layout->cnt;
} else {
cnt = 1;
}
+
sub_volumes = alloca ( cnt * sizeof (xlator_t *));
for (i = 0; i < cnt; i++)
*(sub_volumes + i) = layout->list[i].xlator;
@@ -2041,7 +2239,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
if (cluster_getmarkerattr (frame, this, loc, key,
local, dht_getxattr_unwind,
sub_volumes, cnt,
- MARKER_UUID_TYPE, conf->vol_uuid)) {
+ MARKER_UUID_TYPE, marker_uuid_default_gauge,
+ conf->vol_uuid)) {
op_errno = EINVAL;
goto err;
}
@@ -2051,8 +2250,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
if (key && *conf->vol_uuid) {
if ((match_uuid_local (key, conf->vol_uuid) == 0) &&
- (-1 == frame->root->pid)) {
- if (loc->inode-> ia_type == IA_IFDIR) {
+ (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
+ if (DHT_IS_DIR(layout)) {
cnt = layout->cnt;
} else {
cnt = 1;
@@ -2065,6 +2264,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
local, dht_getxattr_unwind,
sub_volumes, cnt,
MARKER_XTIME_TYPE,
+ marker_xtime_default_gauge,
conf->vol_uuid)) {
op_errno = EINVAL;
goto err;
@@ -2074,7 +2274,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
}
}
- if (loc->inode-> ia_type == IA_IFDIR) {
+ if (DHT_IS_DIR(layout)) {
cnt = local->call_cnt = layout->cnt;
} else {
cnt = local->call_cnt = 1;
@@ -2094,6 +2294,7 @@ err:
return 0;
}
+#undef DHT_IS_DIR
int
dht_fgetxattr (call_frame_t *frame, xlator_t *this,
@@ -2165,13 +2366,17 @@ dht_fsetxattr (call_frame_t *frame, xlator_t *this,
xlator_t *subvol = NULL;
dht_local_t *local = NULL;
int op_errno = EINVAL;
+ dht_conf_t *conf = NULL;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
VALIDATE_OR_GOTO (fd->inode, err);
+ VALIDATE_OR_GOTO (this->private, err);
+
+ conf = this->private;
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.dht*", xattr,
+ GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
op_errno, err);
local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR);
@@ -2275,12 +2480,12 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.dht*", xattr,
+ conf = this->private;
+
+ GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
op_errno, err);
- conf = this->private;
local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR);
if (!local) {
op_errno = ENOMEM;
@@ -2305,25 +2510,6 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
local->call_cnt = call_cnt = layout->cnt;
- /* This key is sent by Unified File and Object storage
- * to test xattr support in backend.
- */
- tmp = dict_get (xattr, "user.ufo-test");
- if (tmp) {
- if (IA_ISREG (loc->inode->ia_type)) {
- op_errno = ENOTSUP;
- goto err;
- }
- local->op_ret = 0;
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_ufo_xattr_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->setxattr,
- loc, xattr, flags, NULL);
- }
- return 0;
- }
-
tmp = dict_get (xattr, "distribute.migrate-data");
if (tmp) {
if (IA_ISDIR (loc->inode->ia_type)) {
@@ -2493,18 +2679,20 @@ dht_removexattr (call_frame_t *frame, xlator_t *this,
dht_local_t *local = NULL;
dht_layout_t *layout = NULL;
int call_cnt = 0;
+ dht_conf_t *conf = NULL;
int i;
VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (this->private, err);
+
+ conf = this->private;
- GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.dht*",
- key, op_errno, err);
+ GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err);
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_REMOVEXATTR);
if (!local) {
@@ -2556,13 +2744,16 @@ dht_fremovexattr (call_frame_t *frame, xlator_t *this,
dht_local_t *local = NULL;
dht_layout_t *layout = NULL;
int call_cnt = 0;
+ dht_conf_t *conf = 0;
int i;
VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (this->private, err);
+
+ conf = this->private;
- GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.dht*",
- key, op_errno, err);
+ GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err);
VALIDATE_OR_GOTO (frame, err);
@@ -2733,7 +2924,6 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -2853,10 +3043,13 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
list_for_each_entry (orig_entry, (&orig_entries->list), list) {
next_offset = orig_entry->d_off;
- if ((check_is_dir (NULL, (&orig_entry->d_stat), NULL) &&
- (prev->this != dht_first_up_subvol (this))) ||
- check_is_linkfile (NULL, (&orig_entry->d_stat),
- orig_entry->dict)) {
+ if (check_is_dir (NULL, (&orig_entry->d_stat), NULL) &&
+ (prev->this != local->first_up_subvol)) {
+ continue;
+ }
+ if (check_is_linkfile (NULL, (&orig_entry->d_stat),
+ orig_entry->dict,
+ conf->link_xattr_name)) {
continue;
}
@@ -2933,13 +3126,16 @@ done:
}
if (conf->readdir_optimize == _gf_true) {
- if (next_subvol != dht_first_up_subvol (this)) {
+ if (next_subvol != local->first_up_subvol) {
ret = dict_set_int32 (local->xattr,
GF_READDIR_SKIP_DIRS, 1);
if (ret)
gf_log (this->name, GF_LOG_ERROR,
"dict set failed");
- }
+ } else {
+ dict_del (local->xattr,
+ GF_READDIR_SKIP_DIRS);
+ }
}
STACK_WIND (frame, dht_readdirp_cbk,
@@ -3072,6 +3268,7 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -3084,6 +3281,7 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
local->fd = fd_ref (fd);
local->size = size;
local->xattr_req = (dict)? dict_ref (dict) : NULL;
+ local->first_up_subvol = dht_first_up_subvol (this);
dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff);
@@ -3096,20 +3294,22 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
if (local->xattr) {
ret = dict_set_uint32 (local->xattr,
- "trusted.glusterfs.dht.linkto",
- 256);
+ conf->link_xattr_name, 256);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
- "failed to set 'glusterfs.dht.linkto'"
- " key");
+ "failed to set '%s' key",
+ conf->link_xattr_name);
if (conf->readdir_optimize == _gf_true) {
- if (xvol != dht_first_up_subvol (this)) {
+ if (xvol != local->first_up_subvol) {
ret = dict_set_int32 (local->xattr,
GF_READDIR_SKIP_DIRS, 1);
if (ret)
gf_log (this->name,
GF_LOG_ERROR,
"Dict set failed");
+ } else {
+ dict_del (local->xattr,
+ GF_READDIR_SKIP_DIRS);
}
}
}
@@ -3245,7 +3445,7 @@ dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- call_frame_t *prev = NULL;
+ xlator_t *prev = NULL;
int ret = -1;
dht_local_t *local = NULL;
@@ -3270,15 +3470,17 @@ dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
postparent, 1);
}
- ret = dht_layout_preset (this, prev->this, inode);
+ ret = dht_layout_preset (this, prev, inode);
if (ret < 0) {
gf_log (this->name, GF_LOG_DEBUG,
"could not set pre-set layout for subvolume %s",
- prev->this->name);
+ prev? prev->name: NULL);
op_ret = -1;
op_errno = EINVAL;
goto out;
}
+ if (local->linked == _gf_true)
+ dht_linkfile_attr_heal (frame, this);
out:
/*
* FIXME: ia_size and st_blocks of preparent and postparent do not have
@@ -3287,7 +3489,6 @@ out:
* corresponding values from each of the subvolume.
* See dht_iatt_merge for reference.
*/
-
DHT_STRIP_PHASE1_FLAGS (stbuf);
DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf,
preparent, postparent, xdata);
@@ -3309,12 +3510,17 @@ dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie,
goto err;
local = frame->local;
+ if (!local || !local->cached_subvol) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
cached_subvol = local->cached_subvol;
- STACK_WIND (frame, dht_newfile_cbk,
- cached_subvol, cached_subvol->fops->mknod,
- &local->loc, local->mode, local->rdev, local->umask,
- local->params);
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)cached_subvol,
+ cached_subvol, cached_subvol->fops->mknod,
+ &local->loc, local->mode, local->rdev, local->umask,
+ local->params);
return 0;
err:
@@ -3357,11 +3563,13 @@ dht_mknod (call_frame_t *frame, xlator_t *this,
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->mknod,
- loc, mode, rdev, umask, params);
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol,
+ subvol, subvol->fops->mknod, loc, mode,
+ rdev, umask, params);
} else {
- avail_subvol = dht_free_disk_available_subvol (this, subvol);
+
+ avail_subvol = dht_free_disk_available_subvol (this, subvol,
+ local);
if (avail_subvol != subvol) {
/* Choose the minimum filled volume, and create the
files there */
@@ -3373,14 +3581,15 @@ dht_mknod (call_frame_t *frame, xlator_t *this,
local->umask = umask;
dht_linkfile_create (frame,
dht_mknod_linkfile_create_cbk,
- avail_subvol, subvol, loc);
+ this, avail_subvol, subvol, loc);
} else {
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->mknod,
- loc, mode, rdev, umask, params);
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk,
+ (void *)subvol, subvol,
+ subvol->fops->mknod, loc, mode,
+ rdev, umask, params);
}
}
@@ -3425,9 +3634,9 @@ dht_symlink (call_frame_t *frame, xlator_t *this,
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->symlink,
- linkname, loc, umask, params);
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->symlink, linkname, loc, umask,
+ params);
return 0;
@@ -3541,7 +3750,10 @@ dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_inode_ctx_time_update (local->loc.parent, this,
postparent, 1);
}
-
+ if (local->linked == _gf_true) {
+ local->stbuf = *stbuf;
+ dht_linkfile_attr_heal (frame, this);
+ }
out:
DHT_STRIP_PHASE1_FLAGS (stbuf);
DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent,
@@ -3628,7 +3840,7 @@ dht_link (call_frame_t *frame, xlator_t *this,
if (hashed_subvol != cached_subvol) {
uuid_copy (local->gfid, oldloc->inode->gfid);
- dht_linkfile_create (frame, dht_link_linkfile_cbk,
+ dht_linkfile_create (frame, dht_link_linkfile_cbk, this,
cached_subvol, hashed_subvol, newloc);
} else {
STACK_WIND (frame, dht_link_cbk,
@@ -3685,7 +3897,10 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
op_errno = EINVAL;
goto out;
}
-
+ if (local->linked == _gf_true) {
+ local->stbuf = *stbuf;
+ dht_linkfile_attr_heal (frame, this);
+ }
out:
DHT_STRIP_PHASE1_FLAGS (stbuf);
DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent,
@@ -3775,7 +3990,7 @@ dht_create (call_frame_t *frame, xlator_t *this,
}
/* Choose the minimum filled volume, and create the
files there */
- avail_subvol = dht_free_disk_available_subvol (this, subvol);
+ avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
if (avail_subvol != subvol) {
local->params = dict_ref (params);
local->flags = flags;
@@ -3786,9 +4001,8 @@ dht_create (call_frame_t *frame, xlator_t *this,
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s (link at %s)", loc->path,
avail_subvol->name, subvol->name);
- dht_linkfile_create (frame,
- dht_create_linkfile_create_cbk,
- avail_subvol, subvol, loc);
+ dht_linkfile_create (frame, dht_create_linkfile_create_cbk,
+ this, avail_subvol, subvol, loc);
goto done;
}
gf_log (this->name, GF_LOG_TRACE,
@@ -3861,6 +4075,15 @@ dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
ret = dht_layout_merge (this, layout, prev->this,
-1, ENOSPC, NULL);
} else {
+ if (op_ret == -1 && op_errno == EEXIST)
+ /* Very likely just a race between mkdir and
+ self-heal (from lookup of a concurrent mkdir
+ attempt).
+ Ignore error for now. layout setting will
+ anyways fail if this was a different (old)
+ pre-existing different directory.
+ */
+ op_ret = 0;
ret = dht_layout_merge (this, layout, prev->this,
op_ret, op_errno, NULL);
}
@@ -4318,6 +4541,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_frame_t *main_frame = NULL;
dht_local_t *main_local = NULL;
int this_call_cnt = 0;
+ dht_conf_t *conf = this->private;
local = frame->local;
prev = cookie;
@@ -4329,7 +4553,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret != 0)
goto err;
- if (check_is_linkfile (inode, stbuf, xattr) == 0) {
+ if (!check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) {
main_local->op_ret = -1;
main_local->op_errno = ENOTEMPTY;
@@ -4364,6 +4588,7 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
dht_local_t *lookup_local = NULL;
dht_local_t *local = NULL;
dict_t *xattrs = NULL;
+ dht_conf_t *conf = this->private;
local = frame->local;
@@ -4372,7 +4597,8 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
continue;
if (strcmp (trav->d_name, "..") == 0)
continue;
- if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict)) {
+ if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict,
+ conf->link_xattr_name)) {
ret++;
continue;
}
@@ -4390,7 +4616,7 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
return -1;
}
- ret = dict_set_uint32 (xattrs, DHT_LINKFILE_KEY, 256);
+ ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "failed to set linkto key"
" in dict");
@@ -4513,6 +4739,7 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_frame_t *prev = NULL;
dict_t *dict = NULL;
int ret = 0;
+ dht_conf_t *conf = this->private;
local = frame->local;
prev = cookie;
@@ -4536,12 +4763,11 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto err;
}
- ret = dict_set_uint32 (dict,
- "trusted.glusterfs.dht.linkto", 256);
+ ret = dict_set_uint32 (dict, conf->link_xattr_name, 256);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set 'trusted.glusterfs.dht.linkto' key",
- local->loc.path);
+ "%s: failed to set '%s' key",
+ local->loc.path, conf->link_xattr_name);
STACK_WIND (frame, dht_rmdir_readdirp_cbk,
prev->this, prev->this->fops->readdirp,
@@ -4640,7 +4866,6 @@ dht_entrylk (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK);
if (!local) {
@@ -4921,15 +5146,7 @@ unlock:
or wait for anything else. Just propagate blindly */
if (have_heard_from_all) {
propagate = 1;
- if (conf->defrag) {
- ret = pthread_create (&conf->defrag->th, NULL,
- gf_defrag_start, this);
- if (ret) {
- conf->defrag = NULL;
- GF_FREE (conf->defrag);
- kill (getpid(), SIGTERM);
- }
- }
+
}
@@ -4951,6 +5168,19 @@ unlock:
/* continue to check other events for CHILD_UP */
}
}
+
+ /* rebalance is started with assert_no_child_down. So we do
+ * not need to handle CHILD_DOWN event here.
+ */
+ if (conf->defrag) {
+ ret = gf_thread_create (&conf->defrag->th, NULL,
+ gf_defrag_start, this);
+ if (ret) {
+ conf->defrag = NULL;
+ GF_FREE (conf->defrag);
+ kill (getpid(), SIGTERM);
+ }
+ }
}
ret = 0;
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 83ec345d6..5ccd66799 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -13,6 +13,8 @@
#include "config.h"
#endif
+#include <regex.h>
+
#include "dht-mem-types.h"
#include "libxlator.h"
#include "syncop.h"
@@ -53,7 +55,7 @@ struct dht_layout {
uint32_t start;
uint32_t stop;
xlator_t *xlator;
- } list[0];
+ } list[];
};
typedef struct dht_layout dht_layout_t;
@@ -160,6 +162,7 @@ struct dht_local {
/* which xattr request? */
char xsel[256];
+ int32_t alloc_len;
char *newpath;
@@ -176,7 +179,11 @@ struct dht_local {
glusterfs_fop_t fop;
+ gf_boolean_t linked;
+ xlator_t *link_subvol;
+
struct dht_rebalance_ rebalance;
+ xlator_t *first_up_subvol;
};
typedef struct dht_local dht_local_t;
@@ -205,15 +212,27 @@ enum gf_defrag_status_t {
GF_DEFRAG_STATUS_STOPPED,
GF_DEFRAG_STATUS_COMPLETE,
GF_DEFRAG_STATUS_FAILED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED,
};
typedef enum gf_defrag_status_t gf_defrag_status_t;
+typedef struct gf_defrag_pattern_list gf_defrag_pattern_list_t;
+
+struct gf_defrag_pattern_list {
+ char path_pattern[256];
+ uint64_t size;
+ gf_defrag_pattern_list_t *next;
+};
struct gf_defrag_info_ {
uint64_t total_files;
uint64_t total_data;
uint64_t num_files_lookedup;
uint64_t total_failures;
+ uint64_t skipped;
gf_lock_t lock;
int cmd;
pthread_t th;
@@ -226,7 +245,7 @@ struct gf_defrag_info_ {
uuid_t node_uuid;
struct timeval start_time;
gf_boolean_t stats;
-
+ gf_defrag_pattern_list_t *defrag_pattern;
};
typedef struct gf_defrag_info_ gf_defrag_info_t;
@@ -271,6 +290,17 @@ struct dht_conf {
/* Request to filter directory entries in readdir request */
gf_boolean_t readdir_optimize;
+
+ /* Support regex-based name reinterpretation. */
+ regex_t rsync_regex;
+ gf_boolean_t rsync_regex_valid;
+ regex_t extra_regex;
+ gf_boolean_t extra_regex_valid;
+
+ /* Support variable xattr names. */
+ char *xattr_name;
+ char *link_xattr_name;
+ char *wild_xattr_name;
};
typedef struct dht_conf dht_conf_t;
@@ -301,13 +331,12 @@ typedef enum {
#define DHT_MIGRATION_IN_PROGRESS 1
#define DHT_MIGRATION_COMPLETED 2
-#define DHT_LINKFILE_KEY "trusted.glusterfs.dht.linkto"
#define DHT_LINKFILE_MODE (S_ISVTX)
-#define check_is_linkfile(i,s,x) ( \
+#define check_is_linkfile(i,s,x,n) ( \
((st_mode_from_ia ((s)->ia_prot, (s)->ia_type) & ~S_IFMT) \
- == DHT_LINKFILE_MODE) && \
- dict_get (x, DHT_LINKFILE_KEY))
+ == DHT_LINKFILE_MODE) && \
+ dict_get (x, n))
#define IS_DHT_MIGRATION_PHASE2(buf) ( \
IA_ISREG ((buf)->ia_type) && \
@@ -414,12 +443,14 @@ int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt
xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc);
xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode);
xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev);
+xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev);
int dht_subvol_cnt (xlator_t *this, xlator_t *subvol);
-int dht_hash_compute (int type, const char *name, uint32_t *hash_p);
+int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p);
int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
- xlator_t *tovol, xlator_t *fromvol, loc_t *loc);
+ xlator_t *this, xlator_t *tovol,
+ xlator_t *fromvol, loc_t *loc);
int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc);
int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc);
int
@@ -437,7 +468,8 @@ dht_layout_sort_volname (dht_layout_t *layout);
int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc);
gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol);
-xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol);
+xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
+ dht_local_t *layout);
int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx);
int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode);
@@ -659,7 +691,16 @@ int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt *stbuf, int32_t valid, dict_t *xdata);
int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt *stbuf, int32_t valid, dict_t *xdata);
-
+int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t mode, off_t offset, size_t len, dict_t *xdata);
+int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, size_t len, dict_t *xdata);
+int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, size_t len, dict_t *xdata);
+
+int32_t dht_init (xlator_t *this);
+void dht_fini (xlator_t *this);
+int dht_reconfigure (xlator_t *this, dict_t *options);
int32_t dht_notify (xlator_t *this, int32_t event, void *data, ...);
/* definitions for nufa/switch */
@@ -721,7 +762,26 @@ dht_dir_attr_heal (void *data);
int
dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data);
int
-dht_dir_has_layout (dict_t *xattr);
+dht_dir_has_layout (dict_t *xattr, char *name);
gf_boolean_t
dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator);
+xlator_t *
+dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout);
+xlator_t *
+dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout);
+int
+dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this);
+
+void
+dht_layout_dump (dht_layout_t *layout, const char *prefix);
+int32_t
+dht_priv_dump (xlator_t *this);
+int32_t
+dht_inodectx_dump (xlator_t *this, inode_t *inode);
+
+int
+dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol);
+
#endif/* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 52ea3a32a..fe3955ecb 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -248,50 +248,167 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)
return is_subvol_filled;
}
+
+/*Get the best subvolume to create the file in*/
xlator_t *
-dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol)
+dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
+ dht_local_t *local)
{
- int i = 0;
- double max = 0;
- double max_inodes = 0;
xlator_t *avail_subvol = NULL;
dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ loc_t *loc = NULL;
conf = this->private;
-
- LOCK (&conf->subvolume_lock);
+ if (!local)
+ goto out;
+ loc = &local->loc;
+ if (!local->layout) {
+ layout = dht_layout_get (this, loc->parent);
+
+ if (!layout) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "layout missing path=%s parent=%s",
+ loc->path, uuid_utoa (loc->parent->gfid));
+ goto out;
+ }
+ } else {
+ layout = dht_layout_ref (this, local->layout);
+ }
+
+ LOCK (&conf->subvolume_lock);
{
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->disk_unit == 'p') {
- if ((conf->du_stats[i].avail_percent > max)
- && (conf->du_stats[i].avail_inodes > max_inodes)) {
- max = conf->du_stats[i].avail_percent;
- max_inodes = conf->du_stats[i].avail_inodes;
- avail_subvol = conf->subvolumes[i];
- }
- } else {
- if ((conf->du_stats[i].avail_space > max)
- && (conf->du_stats[i].avail_inodes > max_inodes)) {
- max = conf->du_stats[i].avail_space;
- max_inodes = conf->du_stats[i].avail_inodes;
- avail_subvol = conf->subvolumes[i];
- }
+ avail_subvol = dht_subvol_with_free_space_inodes(this, subvol,
+ layout);
+ if(!avail_subvol)
+ {
+ avail_subvol = dht_subvol_maxspace_nonzeroinode(this,
+ subvol,
+ layout);
+ }
- }
- }
}
UNLOCK (&conf->subvolume_lock);
-
+out:
if (!avail_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume has enough free space and inodes to create");
+ gf_log (this->name,
+ GF_LOG_DEBUG,
+ "no subvolume has enough free space and/or inodes\
+ to create");
+ avail_subvol = subvol;
}
- if ((max < conf->min_free_disk) && (max_inodes < conf->min_free_inodes))
- avail_subvol = subvol;
+ if (layout)
+ dht_layout_unref (this, layout);
+ return avail_subvol;
+}
+
+static inline
+int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout)
+{
+ int ret = -1;
+ int i = 0;
+
+ if (!this || !layout)
+ goto out;
+
+ /* check if subvol has layout errors, before selecting it */
+ for (i = 0; i < layout->cnt; i++) {
+ if (!strcmp (layout->list[i].xlator->name, this->name) &&
+ (layout->list[i].err != 0)) {
+ ret = -1;
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+/*Get subvolume which has both space and inodes more than the min criteria*/
+xlator_t *
+dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout)
+{
+ int i = 0;
+ double max = 0;
+ double max_inodes = 0;
+ int ignore_subvol = 0;
+
+ xlator_t *avail_subvol = NULL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ for(i=0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors, before selecting it */
+ ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
+ layout);
+ if (ignore_subvol)
+ continue;
+
+ if ((conf->disk_unit == 'p') &&
+ (conf->du_stats[i].avail_percent > conf->min_free_disk) &&
+ (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
+ if ((conf->du_stats[i].avail_inodes > max_inodes) ||
+ (conf->du_stats[i].avail_percent > max)) {
+ max = conf->du_stats[i].avail_percent;
+ max_inodes = conf->du_stats[i].avail_inodes;
+ avail_subvol = conf->subvolumes[i];
+ }
+ }
+
+ if ((conf->disk_unit != 'p') &&
+ (conf->du_stats[i].avail_space > conf->min_free_disk) &&
+ (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
+ if ((conf->du_stats[i].avail_inodes > max_inodes) ||
+ (conf->du_stats[i].avail_space > max)) {
+ max = conf->du_stats[i].avail_space;
+ max_inodes = conf->du_stats[i].avail_inodes;
+ avail_subvol = conf->subvolumes[i];
+ }
+ }
+ }
+
+ return avail_subvol;
+}
- if (!avail_subvol)
- avail_subvol = subvol;
- return avail_subvol;
+/* Get subvol which has atleast one inode and maximum space */
+xlator_t *
+dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout)
+{
+ int i = 0;
+ double max = 0;
+ int ignore_subvol = 0;
+
+ xlator_t *avail_subvol = NULL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors, before selecting it */
+ ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
+ layout);
+ if (ignore_subvol)
+ continue;
+
+ if (conf->disk_unit == 'p') {
+ if ((conf->du_stats[i].avail_percent > max)
+ && (conf->du_stats[i].avail_inodes > 0 )) {
+ max = conf->du_stats[i].avail_percent;
+ avail_subvol = conf->subvolumes[i];
+ }
+ } else {
+ if ((conf->du_stats[i].avail_space > max)
+ && (conf->du_stats[i].avail_inodes > 0)) {
+ max = conf->du_stats[i].avail_space;
+ avail_subvol = conf->subvolumes[i];
+ }
+ }
+ }
+
+ return avail_subvol;
}
diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c
index 97eb1f05f..656cf23a0 100644
--- a/xlators/cluster/dht/src/dht-hashfn.c
+++ b/xlators/cluster/dht/src/dht-hashfn.c
@@ -44,30 +44,68 @@ dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p)
}
-#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do { \
- rsync_frndly_name = (char *) name; \
- if (name[0] == '.') { \
- char *dot = 0; \
- int namelen = 0; \
- \
- dot = strrchr (name, '.'); \
- if (dot && dot > (name + 1) && *(dot + 1)) { \
- namelen = (dot - name); \
- rsync_frndly_name = alloca (namelen); \
- strncpy (rsync_frndly_name, name + 1, \
- namelen); \
- rsync_frndly_name[namelen - 1] = 0; \
- } \
- } \
- } while (0);
+static inline
+gf_boolean_t
+dht_munge_name (const char *original, char *modified, size_t len, regex_t *re)
+{
+ regmatch_t matches[2];
+ size_t new_len;
+ if (regexec(re,original,2,matches,0) != REG_NOMATCH) {
+ if (matches[1].rm_so != -1) {
+ new_len = matches[1].rm_eo - matches[1].rm_so;
+ /* Equal would fail due to the NUL at the end. */
+ if (new_len < len) {
+ memcpy (modified,original+matches[1].rm_so,
+ new_len);
+ modified[new_len] = '\0';
+ return _gf_true;
+ }
+ }
+ }
+
+ /* This is guaranteed safe because of how the dest was allocated. */
+ strcpy(modified,original);
+ return _gf_false;
+}
int
-dht_hash_compute (int type, const char *name, uint32_t *hash_p)
+dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p)
{
- char *rsync_friendly_name = NULL;
+ char *rsync_friendly_name = NULL;
+ dht_conf_t *priv = this->private;
+ size_t len = 0;
+ gf_boolean_t munged = _gf_false;
+
+ /*
+ * It wouldn't be safe to use alloca in an inline function that doesn't
+ * actually get inlined, and it wouldn't be efficient to do a real
+ * allocation, so we use alloca here (if needed) and pass that to the
+ * inline.
+ */
- MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name);
+ if (priv->extra_regex_valid) {
+ len = strlen(name) + 1;
+ rsync_friendly_name = alloca(len);
+ munged = dht_munge_name (name, rsync_friendly_name, len,
+ &priv->extra_regex);
+ }
+
+ if (!munged && priv->rsync_regex_valid) {
+ len = strlen(name) + 1;
+ rsync_friendly_name = alloca(len);
+ gf_log (this->name, GF_LOG_TRACE, "trying regex for %s", name);
+ munged = dht_munge_name (name, rsync_friendly_name, len,
+ &priv->rsync_regex);
+ if (munged) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "munged down to %s", rsync_friendly_name);
+ }
+ }
+
+ if (!munged) {
+ rsync_friendly_name = (char *)name;
+ }
return dht_hash_compute_internal (type, rsync_friendly_name, hash_p);
}
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index 264172a90..311a48112 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -18,6 +18,28 @@
#include "xlator.h"
#include "dht-common.h"
+static inline int
+dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol)
+{
+ uint64_t tmp_subvol = 0;
+
+ tmp_subvol = (long)subvol;
+ return inode_ctx_set1 (inode, this, &tmp_subvol);
+}
+
+int
+dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol)
+{
+ int ret = -1;
+ uint64_t tmp_subvol = 0;
+
+ ret = inode_ctx_get1 (inode, this, &tmp_subvol);
+ if (tmp_subvol && subvol)
+ *subvol = (xlator_t *)tmp_subvol;
+
+ return ret;
+}
+
int
dht_frame_return (call_frame_t *frame)
@@ -40,6 +62,43 @@ dht_frame_return (call_frame_t *frame)
}
+static uint64_t
+dht_bits_for (uint64_t num)
+{
+ uint64_t bits = 0, ctrl = 1;
+
+ while (ctrl < num) {
+ ctrl *= 2;
+ bits ++;
+ }
+
+ return bits;
+}
+
+/*
+ * A slightly "updated" version of the algorithm described in the commit log
+ * is used here.
+ *
+ * The only enhancement is that:
+ *
+ * - The number of bits used by the backend filesystem for HUGE d_off which
+ * is described as 63, and
+ * - The number of bits used by the d_off presented by the transformation
+ * upwards which is described as 64, are both made "configurable."
+ */
+
+
+#define BACKEND_D_OFF_BITS 63
+#define PRESENT_D_OFF_BITS 63
+
+#define ONE 1ULL
+#define MASK (~0ULL)
+#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS))
+#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS))
+
+#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1))
+#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1)))
+
int
dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
{
@@ -47,6 +106,9 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
int cnt = 0;
int max = 0;
uint64_t y = 0;
+ uint64_t hi_mask = 0;
+ uint64_t off_mask = 0;
+ int max_bits = 0;
if (x == ((uint64_t) -1)) {
y = (uint64_t) -1;
@@ -60,7 +122,23 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
max = conf->subvolume_cnt;
cnt = dht_subvol_cnt (this, subvol);
- y = ((x * max) + cnt);
+ if (max == 1) {
+ y = x;
+ goto out;
+ }
+
+ max_bits = dht_bits_for (max);
+
+ hi_mask = ~(PRESENT_MASK >> (max_bits + 1));
+
+ if (x & hi_mask) {
+ /* HUGE d_off */
+ off_mask = MASK << max_bits;
+ y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt;
+ } else {
+ /* small d_off */
+ y = ((x * max) + cnt);
+ }
out:
if (y_p)
@@ -135,16 +213,38 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
int max = 0;
uint64_t x = 0;
xlator_t *subvol = 0;
+ int max_bits = 0;
+ uint64_t off_mask = 0;
+ uint64_t host_mask = 0;
if (!this->private)
- goto out;
+ return -1;
conf = this->private;
max = conf->subvolume_cnt;
- cnt = y % max;
- x = y / max;
+ if (max == 1) {
+ x = y;
+ cnt = 0;
+ goto out;
+ }
+
+ if (y & TOP_BIT) {
+ /* HUGE d_off */
+ max_bits = dht_bits_for (max);
+ off_mask = (MASK << max_bits);
+ host_mask = ~(off_mask);
+
+ x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS;
+
+ cnt = y & host_mask;
+ } else {
+ /* small d_off */
+ cnt = y % max;
+ x = y / max;
+ }
+out:
subvol = conf->subvolumes[cnt];
if (subvol_p)
@@ -153,7 +253,6 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
if (x_p)
*x_p = x;
-out:
return 0;
}
@@ -263,20 +362,6 @@ out:
return local;
}
-
-char *
-basestr (const char *str)
-{
- char *basestr = NULL;
-
- basestr = strrchr (str, '/');
- if (basestr)
- basestr ++;
-
- return basestr;
-}
-
-
xlator_t *
dht_first_up_subvol (xlator_t *this)
{
@@ -428,7 +513,36 @@ out:
return next;
}
+/* This func wraps around, if prev is actually the last subvol.
+ */
+xlator_t *
+dht_subvol_next_available (xlator_t *this, xlator_t *prev)
+{
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *next = NULL;
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == prev) {
+ /* if prev is last in conf->subvolumes, then wrap
+ * around.
+ */
+ if ((i + 1) < conf->subvolume_cnt) {
+ next = conf->subvolumes[i + 1];
+ } else {
+ next = conf->subvolumes[0];
+ }
+ break;
+ }
+ }
+
+out:
+ return next;
+}
int
dht_subvol_cnt (xlator_t *this, xlator_t *subvol)
{
@@ -620,20 +734,36 @@ dht_migration_complete_check_task (void *data)
call_frame_t *frame = NULL;
loc_t tmp_loc = {0,};
char *path = NULL;
+ dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ uint64_t tmp_subvol = 0;
+ int open_failed = 0;
this = THIS;
frame = data;
local = frame->local;
+ conf = this->private;
src_node = local->cached_subvol;
- /* getxattr on cached_subvol for 'linkto' value */
- if (!local->loc.inode)
+ if (!local->loc.inode && !local->fd)
+ goto out;
+
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+ /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
+ * as root:root. If a fd is already open, access check wont be done*/
+
+ if (!local->loc.inode) {
ret = syncop_fgetxattr (src_node, local->fd, &dict,
- DHT_LINKFILE_KEY);
- else
+ conf->link_xattr_name);
+ } else {
+ SYNCTASK_SETID (0, 0);
ret = syncop_getxattr (src_node, &local->loc, &dict,
- DHT_LINKFILE_KEY);
+ conf->link_xattr_name);
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+ }
if (!ret)
dst_node = dht_linkfile_subvol (this, NULL, NULL, dict);
@@ -684,10 +814,7 @@ dht_migration_complete_check_task (void *data)
/* update inode ctx (the layout) */
dht_layout_unref (this, local->layout);
- if (!local->loc.inode)
- ret = dht_layout_preset (this, dst_node, local->fd->inode);
- else
- ret = dht_layout_preset (this, dst_node, local->loc.inode);
+ ret = dht_layout_preset (this, dst_node, inode);
if (ret != 0) {
gf_log (this->name, GF_LOG_DEBUG,
"%s: could not set preset layout for subvol %s",
@@ -705,10 +832,7 @@ dht_migration_complete_check_task (void *data)
goto out;
}
- if (!local->loc.inode)
- ret = dht_layout_set (this, local->fd->inode, layout);
- else
- ret = dht_layout_set (this, local->loc.inode, layout);
+ ret = dht_layout_set (this, inode, layout);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set the new layout",
@@ -719,42 +843,46 @@ dht_migration_complete_check_task (void *data)
local->cached_subvol = dst_node;
ret = 0;
- if (!local->fd)
+ /* once we detect the migration complete, the inode-ctx2 is no more
+ required.. delete the ctx and also, it means, open() already
+ done on all the fd of inode */
+ ret = inode_ctx_reset1 (inode, this, &tmp_subvol);
+ if (tmp_subvol)
goto out;
- /* once we detect the migration complete, the fd-ctx is no more
- required.. delete the ctx, and do one extra 'fd_unref' for open fd */
- ret = fd_ctx_del (local->fd, this, NULL);
- if (!ret) {
- fd_unref (local->fd);
- ret = 0;
+ if (list_empty (&inode->fd_list))
goto out;
- }
- /* if 'local->fd' (ie, fd based operation), send a 'open()' on
- destination if not already done */
- if (local->loc.inode) {
- ret = syncop_open (dst_node, &local->loc,
- local->fd->flags, local->fd);
- } else {
- tmp_loc.inode = local->fd->inode;
- inode_path (local->fd->inode, NULL, &path);
- if (path)
- tmp_loc.path = path;
- ret = syncop_open (dst_node, &tmp_loc,
- local->fd->flags, local->fd);
- GF_FREE (path);
+ /* perform open as root:root. There is window between linkfile
+ * creation(root:root) and setattr with the correct uid/gid
+ */
+ SYNCTASK_SETID(0, 0);
+
+ /* perform 'open()' on all the fd's present on the inode */
+ tmp_loc.inode = inode;
+ inode_path (inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ if (fd_is_anonymous (iter_fd))
+ continue;
+ ret = syncop_open (dst_node, &tmp_loc,
+ iter_fd->flags, iter_fd);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to open "
+ "the fd (%p, flags=0%o) on file %s @ %s",
+ iter_fd, iter_fd->flags, path, dst_node->name);
+ open_failed = 1;
+ }
}
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to send open() on target file at %s",
- local->loc.path, dst_node->name);
+ GF_FREE (path);
+
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+ if (open_failed) {
+ ret = -1;
goto out;
}
-
- /* need this unref for the fd on src_node */
- fd_unref (local->fd);
ret = 0;
out:
@@ -798,20 +926,34 @@ dht_rebalance_inprogress_task (void *data)
char *path = NULL;
struct iatt stbuf = {0,};
loc_t tmp_loc = {0,};
+ dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ int open_failed = 0;
this = THIS;
frame = data;
local = frame->local;
+ conf = this->private;
src_node = local->cached_subvol;
- /* getxattr on cached_subvol for 'linkto' value */
- if (local->loc.inode)
+ if (!local->loc.inode && !local->fd)
+ goto out;
+
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+ /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
+ * as root:root. If a fd is already open, access check wont be done*/
+ if (local->loc.inode) {
+ SYNCTASK_SETID (0, 0);
ret = syncop_getxattr (src_node, &local->loc, &dict,
- DHT_LINKFILE_KEY);
- else
+ conf->link_xattr_name);
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+ } else {
ret = syncop_fgetxattr (src_node, local->fd, &dict,
- DHT_LINKFILE_KEY);
+ conf->link_xattr_name);
+ }
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -853,33 +995,46 @@ dht_rebalance_inprogress_task (void *data)
ret = 0;
- if (!local->fd)
- goto out;
+ if (list_empty (&inode->fd_list))
+ goto done;
+
+ /* perform open as root:root. There is window between linkfile
+ * creation(root:root) and setattr with the correct uid/gid
+ */
+ SYNCTASK_SETID (0, 0);
+
+ tmp_loc.inode = inode;
+ inode_path (inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ if (fd_is_anonymous (iter_fd))
+ continue;
- if (local->loc.inode) {
- ret = syncop_open (dst_node, &local->loc,
- local->fd->flags, local->fd);
- } else {
- tmp_loc.inode = local->fd->inode;
- inode_path (local->fd->inode, NULL, &path);
- if (path)
- tmp_loc.path = path;
ret = syncop_open (dst_node, &tmp_loc,
- local->fd->flags, local->fd);
- GF_FREE (path);
+ iter_fd->flags, iter_fd);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to send open "
+ "the fd (%p, flags=0%o) on file %s @ %s",
+ iter_fd, iter_fd->flags, path, dst_node->name);
+ open_failed = 1;
+ }
}
+ GF_FREE (path);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to send open() on target file at %s",
- local->loc.path, dst_node->name);
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+
+ if (open_failed) {
+ ret = -1;
goto out;
}
- ret = fd_ctx_set (local->fd, this, (uint64_t)(long)dst_node);
+done:
+ ret = dht_inode_ctx_set1 (this, inode, dst_node);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to set fd-ctx target file at %s",
+ "%s: failed to set inode-ctx target file at %s",
local->loc.path, dst_node->name);
goto out;
}
@@ -931,6 +1086,9 @@ dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat,
dht_stat_time_t *time = 0;
int ret = -1;
+ GF_VALIDATE_OR_GOTO (this->name, stat, out);
+ GF_VALIDATE_OR_GOTO (this->name, inode, out);
+
ret = dht_inode_ctx_get (inode, this, &ctx);
if (ret) {
@@ -949,7 +1107,7 @@ dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat,
stat->ia_atime, stat->ia_atime_nsec, inode, post);
ret = dht_inode_ctx_set (inode, this, ctx);
-
+out:
return 0;
}
diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c
index f17cb73b9..ece84151a 100644
--- a/xlators/cluster/dht/src/dht-inode-read.c
+++ b/xlators/cluster/dht/src/dht-inode-read.c
@@ -130,10 +130,11 @@ int
dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata)
{
- uint64_t tmp_subvol = 0;
+ xlator_t *subvol = 0;
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ inode_t *inode = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -154,21 +155,23 @@ dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (local->call_cnt != 1)
goto out;
+ local->op_errno = op_errno;
/* Check if the rebalance phase2 is true */
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (ret) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+ ret = dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
/* Phase 2 of migration */
local->rebalance.target_op_fn = dht_attr2;
ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
} else {
/* value is already set in fd_ctx, that means no need
to check for whether its complete or not. */
dht_attr2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
}
out:
@@ -381,6 +384,8 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
dht_local_t *local = NULL;
int ret = 0;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
if (!local) {
@@ -396,19 +401,21 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if ((op_ret == -1) && (op_errno != ENOENT))
goto out;
+ local->op_errno = op_errno;
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
/* File would be migrated to other node */
- ret = fd_ctx_get (local->fd, this, NULL);
- if (ret) {
+ ret = dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
local->rebalance.target_op_fn = dht_readv2;
ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
} else {
/* value is already set in fd_ctx, that means no need
to check for whether its complete or not. */
dht_readv2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
}
out:
@@ -499,24 +506,34 @@ dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int ret = -1;
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
+ call_frame_t *prev = NULL;
local = frame->local;
+ prev = cookie;
+ if (!prev || !prev->this)
+ goto out;
if (local->call_cnt != 1)
goto out;
if ((op_ret == -1) && (op_errno == ENOTCONN) &&
IA_ISDIR(local->loc.inode->ia_type)) {
- subvol = dht_first_up_subvol (this);
+ subvol = dht_subvol_next_available (this, prev->this);
if (!subvol)
goto out;
+ /* check if we are done with visiting every node */
+ if (subvol == local->cached_subvol) {
+ goto out;
+ }
+
STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access,
&local->loc, local->rebalance.flags, NULL);
return 0;
}
if ((op_ret == -1) && (op_errno == ENOENT)) {
/* File would be migrated to other node */
+ local->op_errno = op_errno;
local->rebalance.target_op_fn = dht_access2;
ret = dht_rebalance_complete_check (frame->this, frame);
if (!ret)
@@ -604,8 +621,9 @@ int
dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int ret = -1;
+ dht_local_t *local = NULL;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
@@ -615,8 +633,8 @@ dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
/* If context is set, then send flush() it to the destination */
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (subvol) {
dht_flush2 (this, frame, 0);
return 0;
}
@@ -632,14 +650,10 @@ dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -701,12 +715,14 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
prev = cookie;
local->op_errno = op_errno;
- if (op_ret == -1) {
+ if (op_ret == -1 && (op_errno != ENOENT)) {
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
prev->this->name, strerror (op_errno));
@@ -721,8 +737,9 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
goto out;
}
- ret = fd_ctx_get (local->fd, this, NULL);
- if (ret) {
+ local->op_errno = op_errno;
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
local->rebalance.target_op_fn = dht_fsync2;
/* Check if the rebalance phase1 is true */
@@ -737,11 +754,12 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
if (IS_DHT_MIGRATION_PHASE2 (postbuf)) {
ret = dht_rebalance_complete_check (this, frame);
}
+ if (!ret)
+ return 0;
} else {
dht_fsync2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
out:
DHT_STRIP_PHASE1_FLAGS (postbuf);
@@ -757,15 +775,10 @@ dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
-
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c
index d4a3ecc39..4b3f3a049 100644
--- a/xlators/cluster/dht/src/dht-inode-write.c
+++ b/xlators/cluster/dht/src/dht-inode-write.c
@@ -19,6 +19,9 @@
int dht_writev2 (xlator_t *this, call_frame_t *frame, int ret);
int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret);
int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret);
+int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret);
+int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret);
+int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret);
int
dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -27,8 +30,9 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
dht_local_t *local = NULL;
int ret = -1;
+ xlator_t *subvol = NULL;
- if (op_ret == -1) {
+ if (op_ret == -1 && (op_errno != ENOENT)) {
goto out;
}
@@ -50,6 +54,7 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->rebalance.target_op_fn = dht_writev2;
+ local->op_errno = op_errno;
/* Phase 2 of migration */
if (IS_DHT_MIGRATION_PHASE2 (postbuf)) {
ret = dht_rebalance_complete_check (this, frame);
@@ -62,8 +67,8 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+ if (subvol) {
dht_writev2 (this, frame, 0);
return 0;
}
@@ -87,14 +92,10 @@ dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -169,6 +170,8 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -198,6 +201,7 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->rebalance.target_op_fn = dht_truncate2;
+ local->op_errno = op_errno;
/* Phase 2 of migration */
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
ret = dht_rebalance_complete_check (this, frame);
@@ -209,8 +213,9 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (subvol) {
dht_truncate2 (this, frame, 0);
return 0;
}
@@ -234,16 +239,13 @@ dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
+ inode_t *inode = NULL;
local = frame->local;
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ inode = local->fd ? local->fd->inode : local->loc.inode;
+ dht_inode_ctx_get1 (this, inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -346,6 +348,407 @@ err:
return 0;
}
+
+int
+dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int ret = -1;
+ xlator_t *subvol = NULL;
+
+ GF_VALIDATE_OR_GOTO ("dht", frame, err);
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((op_ret == -1) && (op_errno != ENOENT)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+ dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+ }
+ goto out;
+ }
+ local->rebalance.target_op_fn = dht_fallocate2;
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+ ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+ dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+ dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+ if (subvol) {
+ dht_fallocate2 (this, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS (postbuf);
+ DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STACK_UNWIND (fallocate, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+int
+dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+
+ if (!subvol)
+ subvol = local->cached_subvol;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_fallocate_cbk, subvol, subvol->fops->fallocate,
+ local->fd, local->rebalance.flags, local->rebalance.offset,
+ local->rebalance.size, NULL);
+
+ return 0;
+}
+
+int
+dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ local = dht_local_init (frame, NULL, fd, GF_FOP_FALLOCATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.flags = mode;
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_fallocate_cbk,
+ subvol, subvol->fops->fallocate,
+ fd, mode, offset, len, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+int
+dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int ret = -1;
+ xlator_t *subvol = NULL;
+
+ GF_VALIDATE_OR_GOTO ("dht", frame, err);
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((op_ret == -1) && (op_errno != ENOENT)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+ dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+ }
+ goto out;
+ }
+ local->rebalance.target_op_fn = dht_discard2;
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+ ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+ dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+ dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+ if (subvol) {
+ dht_discard2 (this, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS (postbuf);
+ DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STACK_UNWIND (discard, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+int
+dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+
+ if (!subvol)
+ subvol = local->cached_subvol;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_discard_cbk, subvol, subvol->fops->discard,
+ local->fd, local->rebalance.offset, local->rebalance.size,
+ NULL);
+
+ return 0;
+}
+
+int
+dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ local = dht_local_init (frame, NULL, fd, GF_FOP_DISCARD);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_discard_cbk, subvol, subvol->fops->discard,
+ fd, offset, len, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("dht", frame, err);
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((op_ret == -1) && (op_errno != ENOENT)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+ dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+ }
+ goto out;
+ }
+ local->rebalance.target_op_fn = dht_zerofill2;
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+ ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+ dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+ dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+ ret = fd_ctx_get (local->fd, this, NULL);
+ if (!ret) {
+ dht_zerofill2 (this, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS (postbuf);
+ DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+int
+dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ uint64_t tmp_subvol = 0;
+ int ret = -1;
+
+ local = frame->local;
+
+ if (local->fd)
+ ret = fd_ctx_get (local->fd, this, &tmp_subvol);
+ if (!ret)
+ subvol = (xlator_t *)(long)tmp_subvol;
+
+ if (!subvol)
+ subvol = local->cached_subvol;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill,
+ local->fd, local->rebalance.offset, local->rebalance.size,
+ NULL);
+
+ return 0;
+}
+
+int
+dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill,
+ fd, offset, len, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+
/* handle cases of migration here for 'setattr()' calls */
int
dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -397,15 +800,13 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
+ inode_t *inode = NULL;
local = frame->local;
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+ dht_inode_ctx_get1 (this, inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
index 8cae52653..38e9970a7 100644
--- a/xlators/cluster/dht/src/dht-layout.c
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -158,7 +158,7 @@ dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name)
int ret = 0;
- ret = dht_hash_compute (layout->type, name, &hash);
+ ret = dht_hash_compute (this, layout->type, name, &hash);
if (ret != 0) {
gf_log (this->name, GF_LOG_WARNING,
"hash computation failed for type=%d name=%s",
@@ -335,11 +335,12 @@ int
dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
int op_ret, int op_errno, dict_t *xattr)
{
- int i = 0;
- int ret = -1;
- int err = -1;
- void *disk_layout_raw = NULL;
- int disk_layout_len = 0;
+ int i = 0;
+ int ret = -1;
+ int err = -1;
+ void *disk_layout_raw = NULL;
+ int disk_layout_len = 0;
+ dht_conf_t *conf = this->private;
if (op_ret != 0) {
err = op_errno;
@@ -360,12 +361,12 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
if (xattr) {
/* during lookup and not mkdir */
- ret = dict_get_ptr_and_len (xattr, "trusted.glusterfs.dht",
+ ret = dict_get_ptr_and_len (xattr, conf->xattr_name,
&disk_layout_raw, &disk_layout_len);
}
if (ret != 0) {
- layout->list[i].err = -1;
+ layout->list[i].err = 0;
gf_log (this->name, GF_LOG_TRACE,
"missing disk layout on %s. err = %d",
subvol->name, err);
@@ -412,6 +413,22 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
layout->list[j].err = err_swap;
}
+void
+dht_layout_range_swap (dht_layout_t *layout, int i, int j)
+{
+ uint32_t start_swap = 0;
+ uint32_t stop_swap = 0;
+
+ start_swap = layout->list[i].start;
+ stop_swap = layout->list[i].stop;
+
+ layout->list[i].start = layout->list[j].start;
+ layout->list[i].stop = layout->list[j].stop;
+
+ layout->list[j].start = start_swap;
+ layout->list[j].stop = stop_swap;
+}
+
int64_t
dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j)
{
@@ -437,12 +454,19 @@ dht_layout_entry_cmp (dht_layout_t *layout, int i, int j)
{
int64_t diff = 0;
+ /* swap zero'ed out layouts to front, if needed */
+ if (!layout->list[j].start && !layout->list[j].stop) {
+ diff = (int64_t) layout->list[i].stop
+ - (int64_t) layout->list[j].stop;
+ goto out;
+ }
if (layout->list[i].err || layout->list[j].err)
diff = layout->list[i].err - layout->list[j].err;
else
diff = (int64_t) layout->list[i].start
- (int64_t) layout->list[j].start;
+out:
return diff;
}
@@ -513,23 +537,30 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
prev_stop = last_stop;
for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].err) {
- switch (layout->list[i].err) {
- case -1:
- case ENOENT:
- missing++;
- break;
- case ENOTCONN:
- down++;
- break;
- case ENOSPC:
- no_space++;
- break;
- default:
- misc++;
+ switch (layout->list[i].err) {
+ case -1:
+ case ENOENT:
+ missing++;
+ continue;
+ case ENOTCONN:
+ down++;
+ continue;
+ case ENOSPC:
+ no_space++;
+ continue;
+ case 0:
+ /* if err == 0 and start == stop, then it is a non misc++;
+ * participating subvolume(spread-cnt). Then, do not
+ * check for anomalies. If start != stop, then treat it
+ * as misc err */
+ if (layout->list[i].start == layout->list[i].stop) {
+ continue;
}
+ break;
+ default:
+ misc++;
continue;
- }
+ }
is_virgin = 0;
@@ -634,30 +665,29 @@ out:
}
int
-dht_dir_has_layout (dict_t *xattr)
+dht_dir_has_layout (dict_t *xattr, char *name)
{
void *disk_layout_raw = NULL;
- return dict_get_ptr (xattr, "trusted.glusterfs.dht",
- &disk_layout_raw);
-
+ return dict_get_ptr (xattr, name, &disk_layout_raw);
}
int
dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
loc_t *loc, dict_t *xattr)
{
- int idx = 0;
- int pos = -1;
- int ret = 0;
- int err = 0;
- int dict_ret = 0;
- int32_t disk_layout[4];
- void *disk_layout_raw = NULL;
- int32_t count = -1;
- uint32_t start_off = -1;
- uint32_t stop_off = -1;
+ int idx = 0;
+ int pos = -1;
+ int ret = 0;
+ int err = 0;
+ int dict_ret = 0;
+ int32_t disk_layout[4];
+ void *disk_layout_raw = NULL;
+ int32_t count = -1;
+ uint32_t start_off = -1;
+ uint32_t stop_off = -1;
+ dht_conf_t *conf = this->private;
for (idx = 0; idx < layout->cnt; idx++) {
@@ -687,7 +717,7 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
goto out;
}
- dict_ret = dict_get_ptr (xattr, "trusted.glusterfs.dht",
+ dict_ret = dict_get_ptr (xattr, conf->xattr_name,
&disk_layout_raw);
if (dict_ret < 0) {
diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c
index 803f344af..dbc9d0b3c 100644
--- a/xlators/cluster/dht/src/dht-linkfile.c
+++ b/xlators/cluster/dht/src/dht-linkfile.c
@@ -19,8 +19,37 @@
#include "compat.h"
#include "dht-common.h"
+int
+dht_linkfile_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ char is_linkfile = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ if (op_ret)
+ goto out;
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
+ if (!is_linkfile)
+ gf_log (this->name, GF_LOG_WARNING, "got non-linkfile %s:%s",
+ prev->this->name, local->loc.path);
+out:
+ local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
+ inode, stbuf, postparent, postparent,
+ xattr);
+ return 0;
+}
+#define is_equal(a, b) (a == b)
int
dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, inode_t *inode,
@@ -28,29 +57,68 @@ dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postparent, dict_t *xdata)
{
dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ call_frame_t *prev = NULL;
+ dict_t *xattrs = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
local = frame->local;
+ if (!op_ret)
+ local->linked = _gf_true;
+
+ FRAME_SU_UNDO (frame, dht_local_t);
+
+ if (op_ret && (op_errno == EEXIST)) {
+ conf = this->private;
+ prev = cookie;
+ subvol = prev->this;
+ if (!subvol)
+ goto out;
+ xattrs = dict_new ();
+ if (!xattrs)
+ goto out;
+ ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set linkto key");
+ goto out;
+ }
+
+ STACK_WIND (frame, dht_linkfile_lookup_cbk, subvol,
+ subvol->fops->lookup, &local->loc, xattrs);
+ if (xattrs)
+ dict_unref (xattrs);
+ return 0;
+ }
+out:
local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
inode, stbuf, preparent, postparent,
xdata);
+ if (xattrs)
+ dict_unref (xattrs);
return 0;
}
int
dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+ xlator_t *this,
xlator_t *tovol, xlator_t *fromvol, loc_t *loc)
{
dht_local_t *local = NULL;
dict_t *dict = NULL;
int need_unref = 0;
int ret = 0;
+ dht_conf_t *conf = this->private;
local = frame->local;
local->linkfile.linkfile_cbk = linkfile_cbk;
local->linkfile.srcvol = tovol;
+ local->linked = _gf_false;
+
dict = local->params;
if (!dict) {
dict = dict_new ();
@@ -71,8 +139,7 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
gf_log ("dht-linkfile", GF_LOG_INFO,
"%s: internal-fop set failed", loc->path);
- ret = dict_set_str (dict, "trusted.glusterfs.dht.linkto",
- tovol->name);
+ ret = dict_set_str (dict, conf->link_xattr_name, tovol->name);
if (ret < 0) {
gf_log (frame->this->name, GF_LOG_INFO,
@@ -81,6 +148,10 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
goto out;
}
+ local->link_subvol = fromvol;
+ /* Always create as root:root. dht_linkfile_attr_heal fixes the
+ * ownsership */
+ FRAME_SU_DO (frame, dht_local_t);
STACK_WIND (frame, dht_linkfile_create_cbk,
fromvol, fromvol->fops->mknod, loc,
S_IFREG | DHT_LINKFILE_MODE, 0, 0, dict);
@@ -173,7 +244,7 @@ dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf,
if (!xattr)
goto out;
- ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname);
+ ret = dict_get_ptr (xattr, conf->link_xattr_name, &volname);
if ((-1 == ret) || !volname)
goto out;
@@ -188,3 +259,70 @@ dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf,
out:
return subvol;
}
+
+int
+dht_linkfile_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *statpre,
+ struct iatt *statpost, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ loc_t *loc = NULL;
+
+ local = frame->local;
+ loc = &local->loc;
+
+ if (op_ret)
+ gf_log (this->name, GF_LOG_ERROR, "setattr of uid/gid on %s"
+ " :<gfid:%s> failed (%s)",
+ (loc->path? loc->path: "NULL"),
+ uuid_utoa(local->gfid), strerror(op_errno));
+
+ DHT_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+int
+dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this)
+{
+ int ret = -1;
+ call_frame_t *copy = NULL;
+ dht_local_t *local = NULL;
+ dht_local_t *copy_local = NULL;
+ xlator_t *subvol = NULL;
+ struct iatt stbuf = {0,};
+
+ local = frame->local;
+
+ GF_VALIDATE_OR_GOTO ("dht", local, out);
+ GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out);
+
+ if (local->stbuf.ia_type == IA_INVAL)
+ return 0;
+
+ uuid_copy (local->loc.gfid, local->stbuf.ia_gfid);
+
+ copy = copy_frame (frame);
+
+ if (!copy)
+ goto out;
+
+ copy_local = dht_local_init (copy, &local->loc, NULL, 0);
+
+ if (!copy_local)
+ goto out;
+
+ stbuf = local->stbuf;
+ subvol = local->link_subvol;
+
+ copy->local = copy_local;
+
+ FRAME_SU_DO (copy, dht_local_t);
+
+ STACK_WIND (copy, dht_linkfile_setattr_cbk, subvol,
+ subvol->fops->setattr, &copy_local->loc,
+ &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL);
+ ret = 0;
+out:
+ return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index a2e96a1b2..bcb19f23e 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -16,6 +16,7 @@
#include "dht-common.h"
#include "xlator.h"
+#include <fnmatch.h>
#define GF_DISK_SECTOR_SIZE 512
#define DHT_REBALANCE_PID 4242 /* Change it if required */
@@ -101,12 +102,16 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
data_t *data = NULL;
struct iatt iatt = {0,};
int32_t op_errno = 0;
+ dht_conf_t *conf = NULL;
GF_VALIDATE_OR_GOTO ("defrag", loc, out);
GF_VALIDATE_OR_GOTO ("defrag", loc->name, out);
GF_VALIDATE_OR_GOTO ("defrag", stbuf, out);
GF_VALIDATE_OR_GOTO ("defrag", this, out);
GF_VALIDATE_OR_GOTO ("defrag", xattrs, out);
+ GF_VALIDATE_OR_GOTO ("defrag", this->private, out);
+
+ conf = this->private;
if (uuid_is_null (loc->pargfid)) {
gf_log ("", GF_LOG_ERROR, "loc->pargfid is NULL for "
@@ -137,10 +142,10 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
gf_log (this->name, GF_LOG_INFO, "Attempting to migrate hardlink %s "
"with gfid %s from %s -> %s", loc->name, uuid_utoa (loc->gfid),
cached_subvol->name, hashed_subvol->name);
- data = dict_get (xattrs, DHT_LINKFILE_KEY);
+ data = dict_get (xattrs, conf->link_xattr_name);
/* set linkto on cached -> hashed if not present, else link it */
if (!data) {
- ret = dict_set_str (xattrs, DHT_LINKFILE_KEY,
+ ret = dict_set_str (xattrs, conf->link_xattr_name,
hashed_subvol->name);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed to set "
@@ -238,14 +243,16 @@ out:
static inline int
__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf,
- dict_t *dict, fd_t **dst_fd)
+ dict_t *dict, fd_t **dst_fd, dict_t *xattr)
{
- xlator_t *this = NULL;
- int ret = -1;
- fd_t *fd = NULL;
- struct iatt new_stbuf = {0,};
+ xlator_t *this = NULL;
+ int ret = -1;
+ fd_t *fd = NULL;
+ struct iatt new_stbuf = {0,};
+ dht_conf_t *conf = NULL;
this = THIS;
+ conf = this->private;
ret = dict_set_static_bin (dict, "gfid-req", stbuf->ia_gfid, 16);
if (ret) {
@@ -254,7 +261,7 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
goto out;
}
- ret = dict_set_str (dict, DHT_LINKFILE_KEY, from->name);
+ ret = dict_set_str (dict, conf->link_xattr_name, from->name);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set gfid in dict for create", loc->path);
@@ -292,7 +299,7 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
/* Create the destination with LINKFILE mode, and linkto xattr,
if the linkfile already exists, it will just open the file */
ret = syncop_create (to, loc, O_RDWR, DHT_LINKFILE_MODE, fd,
- dict);
+ dict, &new_stbuf);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"failed to create %s on %s (%s)",
@@ -300,6 +307,18 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
goto out;
}
+ ret = syncop_fsetxattr (to, fd, xattr, 0);
+ if (ret == -1)
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to set xattr on %s (%s)",
+ loc->path, to->name, strerror (errno));
+
+ ret = syncop_ftruncate (to, fd, stbuf->ia_size);
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "ftruncate failed for %s on %s (%s)",
+ loc->path, to->name, strerror (errno));
+
ret = syncop_fsetattr (to, fd, stbuf,
(GF_SET_ATTR_UID | GF_SET_ATTR_GID),
NULL, NULL);
@@ -327,6 +346,9 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
int ret = -1;
xlator_t *this = NULL;
+ uint64_t src_statfs_blocks = 1;
+ uint64_t dst_statfs_blocks = 1;
+
this = THIS;
ret = syncop_statfs (from, loc, &src_statfs);
@@ -350,22 +372,34 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
if (flag != GF_DHT_MIGRATE_DATA)
goto check_avail_space;
- if (((dst_statfs.f_bavail *
- dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) <
- (((src_statfs.f_bavail * src_statfs.f_bsize) /
- GF_DISK_SECTOR_SIZE) - stbuf->ia_blocks)) {
- gf_log (this->name, GF_LOG_WARNING,
- "data movement attempted from node (%s) with"
- " higher disk space to a node (%s) with "
- "lesser disk space (%s)", from->name,
- to->name, loc->path);
-
- /* this is not a 'failure', but we don't want to
- consider this as 'success' too :-/ */
- ret = 1;
- goto out;
+ /* Check:
+ During rebalance `migrate-data` - Destination subvol experiences
+ a `reduction` in 'blocks' of free space, at the same time source
+ subvol gains certain 'blocks' of free space. A valid check is
+ necessary here to avoid errorneous move to destination where
+ the space could be scantily available.
+ */
+ if (stbuf) {
+ dst_statfs_blocks = ((dst_statfs.f_bavail *
+ dst_statfs.f_bsize) /
+ GF_DISK_SECTOR_SIZE);
+ src_statfs_blocks = ((src_statfs.f_bavail *
+ src_statfs.f_bsize) /
+ GF_DISK_SECTOR_SIZE);
+ if ((dst_statfs_blocks - stbuf->ia_blocks) <
+ (src_statfs_blocks + stbuf->ia_blocks)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "data movement attempted from node (%s) with"
+ " higher disk space to a node (%s) with "
+ "lesser disk space (%s)", from->name,
+ to->name, loc->path);
+
+ /* this is not a 'failure', but we don't want to
+ consider this as 'success' too :-/ */
+ ret = 1;
+ goto out;
+ }
}
-
check_avail_space:
if (((dst_statfs.f_bavail * dst_statfs.f_bsize) /
GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) {
@@ -442,8 +476,10 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc,
dict_t *dict = NULL;
xlator_t *this = NULL;
struct iatt iatt = {0,};
+ dht_conf_t *conf = NULL;
this = THIS;
+ conf = this->private;
fd = fd_create (loc->inode, DHT_REBALANCE_PID);
if (!fd) {
@@ -466,7 +502,7 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc,
if (!dict)
goto out;
- ret = dict_set_str (dict, DHT_LINKFILE_KEY, to->name);
+ ret = dict_set_str (dict, conf->link_xattr_name, to->name);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"failed to set xattr in dict for %s (linkto:%s)",
@@ -519,12 +555,13 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
dict_t *dict = NULL;
char *link = NULL;
struct iatt stbuf = {0,};
+ dht_conf_t *conf = this->private;
dict = dict_new ();
if (!dict)
goto out;
- ret = dict_set_int32 (dict, DHT_LINKFILE_KEY, 256);
+ ret = dict_set_int32 (dict, conf->link_xattr_name, 256);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set 'linkto' key in dict", loc->path);
@@ -540,12 +577,13 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
}
/* we no more require this key */
- dict_del (dict, DHT_LINKFILE_KEY);
+ dict_del (dict, conf->link_xattr_name);
/* file exists in target node, only if it is 'linkfile' its valid,
otherwise, error out */
if (!ret) {
- if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict)) {
+ if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict,
+ conf->link_xattr_name)) {
gf_log (this->name, GF_LOG_WARNING,
"%s: file exists in destination", loc->path);
ret = -1;
@@ -581,7 +619,7 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
goto out;
}
- ret = syncop_symlink (to, loc, link, dict);
+ ret = syncop_symlink (to, loc, link, dict, 0);
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: creating symlink failed (%s)",
@@ -595,7 +633,7 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
ret = syncop_mknod (to, loc, st_mode_from_ia (buf->ia_prot,
buf->ia_type),
makedev (ia_major (buf->ia_rdev),
- ia_minor (buf->ia_rdev)), dict);
+ ia_minor (buf->ia_rdev)), dict, 0);
if (ret) {
gf_log (this->name, GF_LOG_WARNING, "%s: mknod failed (%s)",
loc->path, strerror (errno));
@@ -603,6 +641,15 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
}
done:
+ ret = syncop_setattr (to, loc, buf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID |
+ GF_SET_ATTR_MODE), NULL, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform setattr on %s (%s)",
+ loc->path, to->name, strerror (errno));
+ }
+
ret = syncop_unlink (from, loc);
if (ret)
gf_log (this->name, GF_LOG_WARNING, "%s: unlink failed (%s)",
@@ -640,6 +687,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
dict_t *xattr = NULL;
dict_t *xattr_rsp = NULL;
int file_has_holes = 0;
+ dht_conf_t *conf = this->private;
gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to %s",
loc->path, from->name, to->name);
@@ -648,7 +696,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (!dict)
goto out;
- ret = dict_set_int32 (dict, DHT_LINKFILE_KEY, 256);
+ ret = dict_set_int32 (dict, conf->link_xattr_name, 256);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set 'linkto' key in dict", loc->path);
@@ -664,7 +712,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
/* we no more require this key */
- dict_del (dict, DHT_LINKFILE_KEY);
+ dict_del (dict, conf->link_xattr_name);
/* preserve source mode, so set the same to the destination */
src_ia_prot = stbuf.ia_prot;
@@ -681,9 +729,16 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+ /* TODO: move all xattr related operations to fd based operations */
+ ret = syncop_listxattr (from, loc, &xattr);
+ if (ret == -1)
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to get xattr from %s (%s)",
+ loc->path, from->name, strerror (errno));
+
/* create the destination, with required modes/xattr */
ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf,
- dict, &dst_fd);
+ dict, &dst_fd, xattr);
if (ret)
goto out;
@@ -700,6 +755,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+
ret = syncop_fstat (from, src_fd, &stbuf);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s (%s)",
@@ -729,19 +785,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
- /* TODO: move all xattr related operations to fd based operations */
- ret = syncop_listxattr (from, loc, &xattr);
- if (ret == -1)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to get xattr from %s (%s)",
- loc->path, from->name, strerror (errno));
-
- ret = syncop_setxattr (to, loc, xattr, 0);
- if (ret == -1)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set xattr on %s (%s)",
- loc->path, to->name, strerror (errno));
-
/* TODO: Sync the locks */
ret = syncop_fsync (to, dst_fd, 0);
@@ -805,6 +848,23 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+ /* Free up the data blocks on the source node, as the whole
+ file is migrated */
+ ret = syncop_ftruncate (from, src_fd, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform truncate on %s (%s)",
+ loc->path, from->name, strerror (errno));
+ }
+
+ /* remove the 'linkto' xattr from the destination */
+ ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform removexattr on %s (%s)",
+ loc->path, to->name, strerror (errno));
+ }
+
/* Do a stat and check the gfid before unlink */
ret = syncop_stat (from, loc, &empty_iatt);
if (ret) {
@@ -825,23 +885,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
}
- /* Free up the data blocks on the source node, as the whole
- file is migrated */
- ret = syncop_ftruncate (from, src_fd, 0);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to perform truncate on %s (%s)",
- loc->path, from->name, strerror (errno));
- }
-
- /* remove the 'linkto' xattr from the destination */
- ret = syncop_fremovexattr (to, dst_fd, DHT_LINKFILE_KEY);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to perform removexattr on %s (%s)",
- loc->path, to->name, strerror (errno));
- }
-
ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
@@ -1031,6 +1074,31 @@ gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag)
return 0;
}
+static gf_boolean_t
+gf_defrag_pattern_match (gf_defrag_info_t *defrag, char *name, uint64_t size)
+{
+ gf_defrag_pattern_list_t *trav = NULL;
+ gf_boolean_t match = _gf_false;
+ gf_boolean_t ret = _gf_false;
+
+ GF_VALIDATE_OR_GOTO ("dht", defrag, out);
+
+ trav = defrag->defrag_pattern;
+ while (trav) {
+ if (!fnmatch (trav->path_pattern, name, FNM_NOESCAPE)) {
+ match = _gf_true;
+ break;
+ }
+ trav = trav->next;
+ }
+
+ if ((match == _gf_true) && (size >= trav->size))
+ ret = _gf_true;
+
+ out:
+ return ret;
+}
+
/* We do a depth first traversal of directories. But before we move into
* subdirs, we complete the data migration of those directories whose layouts
* have been fixed
@@ -1058,6 +1126,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
struct timeval end = {0,};
double elapsed = {0,};
struct timeval start = {0,};
+ int32_t err = 0;
gf_log (this->name, GF_LOG_INFO, "migrate data called on %s",
loc->path);
@@ -1080,17 +1149,24 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL,
&entries)) != 0) {
- if (ret < 0)
- break;
+
+ if (ret < 0) {
+
+ gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s."
+ " Aborting migrate-data",
+ strerror(readdir_operrno));
+ goto out;
+ }
/* Need to keep track of ENOENT errno, that means, there is no
need to send more readdirp() */
readdir_operrno = errno;
- free_entries = _gf_true;
-
if (list_empty (&entries.list))
break;
+
+ free_entries = _gf_true;
+
list_for_each_entry_safe (entry, tmp, &entries.list, list) {
if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
ret = 1;
@@ -1110,6 +1186,12 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (defrag->stats == _gf_true) {
gettimeofday (&start, NULL);
}
+ if (defrag->defrag_pattern &&
+ (gf_defrag_pattern_match (defrag, entry->d_name,
+ entry->d_stat.ia_size)
+ == _gf_false)) {
+ continue;
+ }
loc_wipe (&entry_loc);
ret =dht_build_child_loc (this, &entry_loc, loc,
entry->d_name);
@@ -1202,9 +1284,21 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
ret = syncop_setxattr (this, &entry_loc, migrate_data,
0);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "migrate-data"
- " failed for %s", entry_loc.path);
- defrag->total_failures +=1;
+ err = op_errno;
+ /* errno is overloaded. See
+ * rebalance_task_completion () */
+ if (err != ENOSPC) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "migrate-data skipped for %s"
+ " due to space constraints",
+ entry_loc.path);
+ defrag->skipped +=1;
+ } else{
+ gf_log (this->name, GF_LOG_ERROR,
+ "migrate-data failed for %s",
+ entry_loc.path);
+ defrag->total_failures +=1;
+ }
}
if (ret == -1) {
@@ -1284,6 +1378,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
dict_t *dict = NULL;
off_t offset = 0;
struct iatt iatt = {0,};
+ int readdirp_errno = 0;
ret = syncop_lookup (this, loc, NULL, &iatt, NULL, NULL);
if (ret) {
@@ -1319,12 +1414,22 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL,
&entries)) != 0)
{
- if ((ret < 0) || (ret && (errno == ENOENT)))
- break;
- free_entries = _gf_true;
+
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s"
+ ". Aborting fix-layout",strerror(errno));
+ goto out;
+ }
+
+ /* Need to keep track of ENOENT errno, that means, there is no
+ need to send more readdirp() */
+ readdirp_errno = errno;
if (list_empty (&entries.list))
break;
+
+ free_entries = _gf_true;
+
list_for_each_entry_safe (entry, tmp, &entries.list, list) {
if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
ret = 1;
@@ -1351,7 +1456,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (uuid_is_null (entry->d_stat.ia_gfid)) {
gf_log (this->name, GF_LOG_ERROR, "%s/%s"
- "gfid not present", loc->path,
+ " gfid not present", loc->path,
entry->d_name);
continue;
}
@@ -1361,7 +1466,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid);
if (uuid_is_null (loc->gfid)) {
gf_log (this->name, GF_LOG_ERROR, "%s/%s"
- "gfid not present", loc->path,
+ " gfid not present", loc->path,
entry->d_name);
continue;
}
@@ -1400,6 +1505,8 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
gf_dirent_free (&entries);
free_entries = _gf_false;
INIT_LIST_HEAD (&entries.list);
+ if (readdirp_errno == ENOENT)
+ break;
}
ret = 0;
@@ -1590,6 +1697,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
uint64_t size = 0;
uint64_t lookup = 0;
uint64_t failures = 0;
+ uint64_t skipped = 0;
char *status = "";
double elapsed = 0;
struct timeval end = {0,};
@@ -1606,6 +1714,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
size = defrag->total_data;
lookup = defrag->num_files_lookedup;
failures = defrag->total_failures;
+ skipped = defrag->skipped;
gettimeofday (&end, NULL);
@@ -1629,6 +1738,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
gf_log (THIS->name, GF_LOG_WARNING,
"failed to set lookedup file count");
+
ret = dict_set_int32 (dict, "status", defrag->defrag_status);
if (ret)
gf_log (THIS->name, GF_LOG_WARNING,
@@ -1641,6 +1751,14 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
}
ret = dict_set_uint64 (dict, "failures", failures);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to set failure count");
+
+ ret = dict_set_uint64 (dict, "skipped", skipped);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to set skipped file count");
log:
switch (defrag->defrag_status) {
case GF_DEFRAG_STATUS_NOT_STARTED:
@@ -1658,13 +1776,15 @@ log:
case GF_DEFRAG_STATUS_FAILED:
status = "failed";
break;
+ default:
+ break;
}
gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f "
"secs", status, elapsed);
gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %"
- PRIu64", lookups: %"PRIu64", failures: %"PRIu64, files, size,
- lookup, failures);
+ PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: "
+ "%"PRIu64, files, size, lookup, failures, skipped);
out:
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
index f1a4364df..5d6f4f232 100644
--- a/xlators/cluster/dht/src/dht-rename.c
+++ b/xlators/cluster/dht/src/dht-rename.c
@@ -306,7 +306,38 @@ err:
NULL, NULL);
return 0;
}
+#define DHT_MARK_FOP_INTERNAL(xattr) do { \
+ int tmp = -1; \
+ if (!xattr) { \
+ xattr = dict_new (); \
+ if (!xattr) \
+ break; \
+ } \
+ tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \
+ if (tmp) { \
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set" \
+ " internal dict key for %s", local->loc.path); \
+ } \
+ }while (0)
+int
+dht_rename_done (call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ local = frame->local;
+
+ if (local->linked == _gf_true) {
+ local->linked = _gf_false;
+ dht_linkfile_attr_heal (frame, this);
+ }
+ DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
+ DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
+ &local->stbuf, &local->preoldparent,
+ &local->postoldparent, &local->preparent,
+ &local->postparent, NULL);
+
+ return 0;
+}
int
dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -340,11 +371,7 @@ dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
WIPE (&local->postparent);
if (is_last_call (this_call_cnt)) {
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent, NULL);
+ dht_rename_done (frame, this);
}
out:
@@ -362,7 +389,7 @@ dht_rename_cleanup (call_frame_t *frame)
xlator_t *dst_hashed = NULL;
xlator_t *dst_cached = NULL;
int call_cnt = 0;
-
+ dict_t *xattr = NULL;
local = frame->local;
this = frame->this;
@@ -386,13 +413,15 @@ dht_rename_cleanup (call_frame_t *frame)
if (!call_cnt)
goto nolinks;
+ DHT_MARK_FOP_INTERNAL (xattr);
+
if (dst_hashed != src_hashed && dst_hashed != src_cached) {
gf_log (this->name, GF_LOG_TRACE,
"unlinking linkfile %s @ %s => %s",
local->loc.path, dst_hashed->name, src_cached->name);
STACK_WIND (frame, dht_rename_unlink_cbk,
dst_hashed, dst_hashed->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr);
}
if (src_cached != dst_hashed) {
@@ -401,9 +430,12 @@ dht_rename_cleanup (call_frame_t *frame)
local->loc2.path, src_cached->name);
STACK_WIND (frame, dht_rename_unlink_cbk,
src_cached, src_cached->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr);
}
+ if (xattr)
+ dict_unref (xattr);
+
return 0;
nolinks:
@@ -441,6 +473,10 @@ dht_rename_links_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path, prev->this->name, strerror (op_errno));
}
+ if (local->linked == _gf_true) {
+ local->linked = _gf_false;
+ dht_linkfile_attr_heal (frame, this);
+ }
DHT_STACK_DESTROY (frame);
return 0;
@@ -463,6 +499,7 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
xlator_t *rename_subvol = NULL;
call_frame_t *link_frame = NULL;
dht_local_t *link_local = NULL;
+ dict_t *xattr = NULL;
local = frame->local;
prev = cookie;
@@ -472,6 +509,8 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dst_hashed = local->dst_hashed;
dst_cached = local->dst_cached;
+ if (local->linked == _gf_true)
+ FRAME_SU_UNDO (frame, dht_local_t);
if (op_ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
"%s: rename on %s failed (%s)", local->loc.path,
@@ -501,15 +540,25 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
uuid_copy (link_local->gfid, local->loc.inode->gfid);
dht_linkfile_create (link_frame, dht_rename_links_create_cbk,
- src_cached, dst_hashed, &link_local->loc);
+ this, src_cached, dst_hashed,
+ &link_local->loc);
}
err:
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->preoldparent, preoldparent, prev->this);
- dht_iatt_merge (this, &local->postoldparent, postoldparent, prev->this);
- dht_iatt_merge (this, &local->preparent, prenewparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postnewparent, prev->this);
+ /* Merge attrs only from src_cached. In case there of src_cached !=
+ * dst_hashed, this ignores linkfile attrs. */
+ if (prev->this == src_cached) {
+ dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
+ dht_iatt_merge (this, &local->preoldparent, preoldparent,
+ prev->this);
+ dht_iatt_merge (this, &local->postoldparent, postoldparent,
+ prev->this);
+ dht_iatt_merge (this, &local->preparent, prenewparent,
+ prev->this);
+ dht_iatt_merge (this, &local->postparent, postnewparent,
+ prev->this);
+ }
+
/* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk
* is called. since rename has already happened on rename_subvol,
@@ -534,6 +583,8 @@ err:
if (local->call_cnt == 0)
goto unwind;
+ DHT_MARK_FOP_INTERNAL (xattr);
+
if (src_cached != dst_hashed && src_cached != dst_cached) {
gf_log (this->name, GF_LOG_TRACE,
"deleting old src datafile %s @ %s",
@@ -541,7 +592,7 @@ err:
STACK_WIND (frame, dht_rename_unlink_cbk,
src_cached, src_cached->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr);
}
if (src_hashed != rename_subvol && src_hashed != src_cached) {
@@ -551,7 +602,7 @@ err:
STACK_WIND (frame, dht_rename_unlink_cbk,
src_hashed, src_hashed->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr);
}
if (dst_cached
@@ -563,8 +614,10 @@ err:
STACK_WIND (frame, dht_rename_unlink_cbk,
dst_cached, dst_cached->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr);
}
+ if (xattr)
+ dict_unref (xattr);
return 0;
unwind:
@@ -572,16 +625,16 @@ unwind:
WIPE (&local->postoldparent);
WIPE (&local->preparent);
WIPE (&local->postparent);
+ if (xattr)
+ dict_unref (xattr);
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent, NULL);
+ dht_rename_done (frame, this);
return 0;
cleanup:
+ if (xattr)
+ dict_unref (xattr);
dht_rename_cleanup (frame);
return 0;
@@ -615,6 +668,8 @@ dht_do_rename (call_frame_t *frame)
"renaming %s => %s (%s)",
local->loc.path, local->loc2.path, rename_subvol->name);
+ if (local->linked == _gf_true)
+ FRAME_SU_DO (frame, dht_local_t);
STACK_WIND (frame, dht_rename_cbk,
rename_subvol, rename_subvol->fops->rename,
&local->loc, &local->loc2, NULL);
@@ -645,6 +700,9 @@ dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->op_ret = -1;
if (op_errno != ENOENT)
local->op_errno = op_errno;
+ } else if (local->src_cached == prev->this) {
+ /* merge of attr returned only from linkfile creation */
+ dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
}
this_call_cnt = dht_frame_return (frame);
@@ -710,6 +768,7 @@ dht_rename_create_links (call_frame_t *frame)
xlator_t *dst_hashed = NULL;
xlator_t *dst_cached = NULL;
int call_cnt = 0;
+ dict_t *xattr = NULL;
local = frame->local;
@@ -720,6 +779,7 @@ dht_rename_create_links (call_frame_t *frame)
dst_hashed = local->dst_hashed;
dst_cached = local->dst_cached;
+ DHT_MARK_FOP_INTERNAL (xattr);
if (src_cached == dst_cached) {
if (dst_hashed == dst_cached)
@@ -731,7 +791,7 @@ dht_rename_create_links (call_frame_t *frame)
STACK_WIND (frame, dht_rename_unlink_links_cbk,
dst_hashed, dst_hashed->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr);
return 0;
}
@@ -748,7 +808,7 @@ dht_rename_create_links (call_frame_t *frame)
"linkfile %s @ %s => %s",
local->loc.path, dst_hashed->name, src_cached->name);
memcpy (local->gfid, local->loc.inode->gfid, 16);
- dht_linkfile_create (frame, dht_rename_links_cbk,
+ dht_linkfile_create (frame, dht_rename_links_cbk, this,
src_cached, dst_hashed, &local->loc);
}
@@ -758,7 +818,7 @@ dht_rename_create_links (call_frame_t *frame)
local->loc2.path, src_cached->name);
STACK_WIND (frame, dht_rename_links_cbk,
src_cached, src_cached->fops->link,
- &local->loc, &local->loc2, NULL);
+ &local->loc, &local->loc2, xattr);
}
nolinks:
@@ -766,6 +826,8 @@ nolinks:
/* skip to next step */
dht_do_rename (frame);
}
+ if (xattr)
+ dict_unref (xattr);
return 0;
}
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index fbe4cab3e..3fe96b1c7 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -17,6 +17,7 @@
#include "glusterfs.h"
#include "xlator.h"
#include "dht-common.h"
+#include "glusterfs-acl.h"
#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \
layout->list[i].start = srt; \
@@ -28,6 +29,13 @@
layout->list[i].xlator->name, path); \
} while (0)
+#define DHT_RESET_LAYOUT_RANGE(layout) do { \
+ int cnt = 0; \
+ for (cnt = 0; cnt < layout->cnt; cnt++ ) { \
+ layout->list[cnt].start = 0; \
+ layout->list[cnt].stop = 0; \
+ } \
+ } while (0)
static uint32_t
dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n)
@@ -38,12 +46,20 @@ dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n)
if (old->list[o].err > 0 || new->list[n].err > 0)
return 0;
+ if (old->list[o].start == old->list[o].stop) {
+ return 0;
+ }
+
+ if (new->list[n].start == new->list[n].stop) {
+ return 0;
+ }
+
if ((old->list[o].start > new->list[n].stop) ||
(old->list[o].stop < new->list[n].start))
return 0;
- return max (old->list[o].start, new->list[n].start) -
- min (old->list[o].stop, new->list[n].stop);
+ return min (old->list[o].stop, new->list[n].stop) -
+ max (old->list[o].start, new->list[n].start) + 1;
}
@@ -110,7 +126,7 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
xlator_t *this = NULL;
int32_t *disk_layout = NULL;
dht_local_t *local = NULL;
-
+ dht_conf_t *conf = NULL;
local = frame->local;
if (req_subvol)
@@ -123,6 +139,9 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
GF_VALIDATE_OR_GOTO (this->name, layout, err);
GF_VALIDATE_OR_GOTO (this->name, local, err);
GF_VALIDATE_OR_GOTO (this->name, subvol, err);
+ VALIDATE_OR_GOTO (this->private, err);
+
+ conf = this->private;
xattr = get_new_dict ();
if (!xattr) {
@@ -137,8 +156,7 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
goto err;
}
- ret = dict_set_bin (xattr, "trusted.glusterfs.dht",
- disk_layout, 4 * 4);
+ ret = dict_set_bin (xattr, conf->xattr_name, disk_layout, 4 * 4);
if (ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
"%s: (subvol %s) failed to set xattr dictionary",
@@ -229,9 +247,12 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
int missing_xattr = 0;
int i = 0;
xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *dummy = NULL;
local = frame->local;
this = frame->this;
+ conf = this->private;
for (i = 0; i < layout->cnt; i++) {
if (layout->list[i].err != -1 || !layout->list[i].stop) {
@@ -265,6 +286,18 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
if (--missing_xattr == 0)
break;
}
+ dummy = dht_layout_new (this, 1);
+ if (!dummy)
+ goto out;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (_gf_false ==
+ dht_is_subvol_in_layout (layout, conf->subvolumes[i])) {
+ dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0,
+ conf->subvolumes[i]);
+ }
+ }
+ dht_layout_unref (this, dummy);
+out:
return 0;
}
@@ -501,7 +534,7 @@ dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc,
uint32_t hashval = 0;
int ret = 0;
- ret = dht_hash_compute (layout->type, loc->path, &hashval);
+ ret = dht_hash_compute (this, layout->type, loc->path, &hashval);
if (ret == 0) {
start = (hashval % layout->cnt);
}
@@ -532,9 +565,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
for (i = 0; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1 || err == 0) {
- layout->list[i].err = -1;
+ if (err == -1 || err == 0 || err == ENOENT) {
+ /* Setting list[i].err = -1 is an indication for
+ dht_selfheal_layout_new_directory() to assign
+ a range. We set it to -1 based on any one of
+ the three criteria:
+
+ - err == -1 already, which means directory
+ existed but layout was not set on it.
+
+ - err == 0, which means directory exists and
+ has an old layout piece which will be
+ overwritten now.
+
+ - err == ENOENT, which means directory does
+ not exist (possibly racing with mkdir or
+ finishing half done mkdir). The missing
+ directory will be attempted to be recreated.
+
+ It is important to note that it is safe
+ to race with mkdir() as self-heal and
+ mkdir are idempotent operations. Both will
+ strive to set the directory and layouts to
+ the same final state.
+ */
count++;
+ if (!err)
+ layout->list[i].err = -1;
}
}
@@ -550,11 +607,10 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
}
/* if layout->spread_cnt is set, check if it is <= available
- * subvolumes (excluding bricks that are being decommissioned). Else
- * return count */
+ * subvolumes (down brick and decommissioned bricks are considered
+ * un-availbale). Else return count (available up bricks) */
count = ((layout->spread_cnt &&
- (layout->spread_cnt <=
- (conf->subvolume_cnt - conf->decommission_subvols_cnt))) ?
+ (layout->spread_cnt <= count)) ?
layout->spread_cnt : ((count) ? count : 1));
return count;
@@ -565,18 +621,25 @@ void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
dht_layout_t *new_layout);
void dht_layout_entry_swap (dht_layout_t *layout, int i, int j);
+void dht_layout_range_swap (dht_layout_t *layout, int i, int j);
+
+/*
+ * It's a bit icky using local variables in a macro, but it makes the rest
+ * of the code a lot clearer.
+ */
+#define OV_ENTRY(x,y) table[x*new->cnt+y]
void
dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc,
dht_layout_t *new, dht_layout_t *old)
{
int i = 0;
- int k = 0;
+ int j = 0;
uint32_t curr_overlap = 0;
uint32_t max_overlap = 0;
int max_overlap_idx = -1;
uint32_t overlap = 0;
-
+ uint32_t *table = NULL;
dht_layout_sort_volname (old);
/* Now both old_layout->list[] and new_layout->list[]
@@ -585,31 +648,61 @@ dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc,
to the same subvolumes
*/
+ /* Build a table of overlaps between new[i] and old[j]. */
+ table = alloca(sizeof(overlap)*old->cnt*new->cnt);
+ if (!table) {
+ return;
+ }
+ memset(table,0,sizeof(overlap)*old->cnt*new->cnt);
+ for (i = 0; i < new->cnt; ++i) {
+ for (j = 0; j < old->cnt; ++j) {
+ OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i);
+ }
+ }
+
for (i = 0; i < new->cnt; i++) {
- if (new->list[i].err > 0)
+ if (new->list[i].err > 0) {
/* Subvol might be marked for decommission
with EINVAL, or some other serious error
marked with positive errno.
*/
continue;
+ }
- curr_overlap = dht_overlap_calc (old, i, new, i);
- max_overlap = curr_overlap;
- max_overlap_idx = i;
-
- /* Subvols up to `i' have already found their best
- matches. Search only from the rest.
- */
- for (k = (i + 1); k < new->cnt; k++) {
- overlap = dht_overlap_calc (old, i, new, k);
- if (overlap > max_overlap) {
- max_overlap_idx = k;
- max_overlap = overlap;
- }
- }
-
- if (max_overlap > curr_overlap)
- dht_layout_entry_swap (new, i, max_overlap_idx);
+ max_overlap = 0;
+ max_overlap_idx = i;
+ for (j = (i + 1); j < new->cnt; ++j) {
+ if (new->list[j].err > 0) {
+ /* Subvol might be marked for decommission
+ with EINVAL, or some other serious error
+ marked with positive errno.
+ */
+ continue;
+ }
+ /* Calculate the overlap now. */
+ curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j);
+ /* Calculate the overlap after the proposed swap. */
+ overlap = OV_ENTRY(i,j) + OV_ENTRY(j,i);
+ /* Are we better than status quo? */
+ if (overlap > curr_overlap) {
+ overlap -= curr_overlap;
+ /* Are we better than the previous choice? */
+ if (overlap > max_overlap) {
+ max_overlap = overlap;
+ max_overlap_idx = j;
+ }
+ }
+ }
+
+ if (max_overlap_idx != i) {
+ dht_layout_range_swap (new, i, max_overlap_idx);
+ /* Need to swap the table values too. */
+ for (j = 0; j < old->cnt; ++j) {
+ overlap = OV_ENTRY(i,j);
+ OV_ENTRY(i,j) = OV_ENTRY(max_overlap_idx,j);
+ OV_ENTRY(max_overlap_idx,j) = overlap;
+ }
+ }
}
}
@@ -704,9 +797,11 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout);
+ /* clear out the range, as we are re-computing here */
+ DHT_RESET_LAYOUT_RANGE (layout);
for (i = start_subvol; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {
@@ -719,7 +814,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
for (i = 0; i < start_subvol; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
new file mode 100644
index 000000000..70aac7710
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -0,0 +1,758 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+/* TODO: add NS locking */
+
+#include "statedump.h"
+#include "dht-common.h"
+
+/* TODO:
+ - use volumename in xattr instead of "dht"
+ - use NS locks
+ - handle all cases in self heal layout reconstruction
+ - complete linkfile selfheal
+*/
+struct volume_options options[];
+
+void
+dht_layout_dump (dht_layout_t *layout, const char *prefix)
+{
+
+ char key[GF_DUMP_MAX_BUF_LEN];
+ int i = 0;
+
+ if (!layout)
+ goto out;
+ if (!prefix)
+ goto out;
+
+ gf_proc_dump_build_key(key, prefix, "cnt");
+ gf_proc_dump_write(key, "%d", layout->cnt);
+ gf_proc_dump_build_key(key, prefix, "preset");
+ gf_proc_dump_write(key, "%d", layout->preset);
+ gf_proc_dump_build_key(key, prefix, "gen");
+ gf_proc_dump_write(key, "%d", layout->gen);
+ if (layout->type != IA_INVAL) {
+ gf_proc_dump_build_key(key, prefix, "inode type");
+ gf_proc_dump_write(key, "%d", layout->type);
+ }
+
+ if (!IA_ISDIR (layout->type))
+ goto out;
+
+ for (i = 0; i < layout->cnt; i++) {
+ gf_proc_dump_build_key(key, prefix,"list[%d].err", i);
+ gf_proc_dump_write(key, "%d", layout->list[i].err);
+ gf_proc_dump_build_key(key, prefix,"list[%d].start", i);
+ gf_proc_dump_write(key, "%u", layout->list[i].start);
+ gf_proc_dump_build_key(key, prefix,"list[%d].stop", i);
+ gf_proc_dump_write(key, "%u", layout->list[i].stop);
+ if (layout->list[i].xlator) {
+ gf_proc_dump_build_key(key, prefix,
+ "list[%d].xlator.type", i);
+ gf_proc_dump_write(key, "%s",
+ layout->list[i].xlator->type);
+ gf_proc_dump_build_key(key, prefix,
+ "list[%d].xlator.name", i);
+ gf_proc_dump_write(key, "%s",
+ layout->list[i].xlator->name);
+ }
+ }
+
+out:
+ return;
+}
+
+
+int32_t
+dht_priv_dump (xlator_t *this)
+{
+ char key_prefix[GF_DUMP_MAX_BUF_LEN];
+ char key[GF_DUMP_MAX_BUF_LEN];
+ int i = 0;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+
+ if (!this)
+ goto out;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ ret = TRY_LOCK(&conf->subvolume_lock);
+ if (ret != 0) {
+ return ret;
+ }
+
+ gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name);
+ gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv",
+ this->name);
+ gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt);
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ sprintf (key, "subvolumes[%d]", i);
+ gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type,
+ conf->subvolumes[i]->name);
+ if (conf->file_layouts && conf->file_layouts[i]){
+ sprintf (key, "file_layouts[%d]", i);
+ dht_layout_dump(conf->file_layouts[i], key);
+ }
+ if (conf->dir_layouts && conf->dir_layouts[i]) {
+ sprintf (key, "dir_layouts[%d]", i);
+ dht_layout_dump(conf->dir_layouts[i], key);
+ }
+ if (conf->subvolume_status) {
+
+ sprintf (key, "subvolume_status[%d]", i);
+ gf_proc_dump_write(key, "%d",
+ (int)conf->subvolume_status[i]);
+ }
+
+ }
+
+ gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed);
+ gf_proc_dump_write("gen", "%d", conf->gen);
+ gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk);
+ gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes);
+ gf_proc_dump_write("disk_unit", "%c", conf->disk_unit);
+ gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval);
+ gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit);
+ if (conf ->du_stats) {
+ gf_proc_dump_write("du_stats.avail_percent", "%lf",
+ conf->du_stats->avail_percent);
+ gf_proc_dump_write("du_stats.avail_space", "%lu",
+ conf->du_stats->avail_space);
+ gf_proc_dump_write("du_stats.avail_inodes", "%lf",
+ conf->du_stats->avail_inodes);
+ gf_proc_dump_write("du_stats.log", "%lu", conf->du_stats->log);
+ }
+
+ if (conf->last_stat_fetch.tv_sec)
+ gf_proc_dump_write("last_stat_fetch", "%s",
+ ctime(&conf->last_stat_fetch.tv_sec));
+
+ UNLOCK(&conf->subvolume_lock);
+
+out:
+ return ret;
+}
+
+int32_t
+dht_inodectx_dump (xlator_t *this, inode_t *inode)
+{
+ int ret = -1;
+ dht_layout_t *layout = NULL;
+
+ if (!this)
+ goto out;
+ if (!inode)
+ goto out;
+
+ ret = dht_inode_ctx_layout_get (inode, this, &layout);
+
+ if ((ret != 0) || !layout)
+ return ret;
+
+ gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name);
+ dht_layout_dump(layout, "layout");
+
+out:
+ return ret;
+}
+
+void
+dht_fini (xlator_t *this)
+{
+ int i = 0;
+ dht_conf_t *conf = NULL;
+
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+
+ conf = this->private;
+ this->private = NULL;
+ if (conf) {
+ if (conf->file_layouts) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ GF_FREE (conf->file_layouts[i]);
+ }
+ GF_FREE (conf->file_layouts);
+ }
+
+ GF_FREE (conf->subvolumes);
+
+ GF_FREE (conf->subvolume_status);
+
+ GF_FREE (conf);
+ }
+out:
+ return;
+}
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+
+ ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1);
+
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
+ "failed");
+ return ret;
+ }
+out:
+ return ret;
+}
+
+
+int
+dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf,
+ const char *bricks)
+{
+ int i = 0;
+ int ret = -1;
+ char *tmpstr = NULL;
+ char *dup_brick = NULL;
+ char *node = NULL;
+
+ if (!conf || !bricks)
+ goto out;
+
+ dup_brick = gf_strdup (bricks);
+ node = strtok_r (dup_brick, ",", &tmpstr);
+ while (node) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!strcmp (conf->subvolumes[i]->name, node)) {
+ conf->decommissioned_bricks[i] =
+ conf->subvolumes[i];
+ conf->decommission_subvols_cnt++;
+ gf_log (this->name, GF_LOG_INFO,
+ "decommissioning subvolume %s",
+ conf->subvolumes[i]->name);
+ break;
+ }
+ }
+ if (i == conf->subvolume_cnt) {
+ /* Wrong node given. */
+ goto out;
+ }
+ node = strtok_r (NULL, ",", &tmpstr);
+ }
+
+ ret = 0;
+ conf->decommission_in_progress = 1;
+out:
+ GF_FREE (dup_brick);
+
+ return ret;
+}
+
+
+int
+dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf)
+{
+ int i = 0;
+ int ret = -1;
+
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->decommissioned_bricks[i]) {
+ conf->decommissioned_bricks[i] = NULL;
+ conf->decommission_subvols_cnt--;
+ }
+ }
+
+ ret = 0;
+out:
+
+ return ret;
+}
+void
+dht_init_regex (xlator_t *this, dict_t *odict, char *name,
+ regex_t *re, gf_boolean_t *re_valid)
+{
+ char *temp_str;
+
+ if (dict_get_str (odict, name, &temp_str) != 0) {
+ if (strcmp(name,"rsync-hash-regex")) {
+ return;
+ }
+ temp_str = "^\\.(.+)\\.[^.]+$";
+ }
+
+ if (*re_valid) {
+ regfree(re);
+ *re_valid = _gf_false;
+ }
+
+ if (!strcmp(temp_str,"none")) {
+ return;
+ }
+
+ if (regcomp(re,temp_str,REG_EXTENDED) == 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "using regex %s = %s", name, temp_str);
+ *re_valid = _gf_true;
+ }
+ else {
+ gf_log (this->name, GF_LOG_WARNING,
+ "compiling regex %s failed", temp_str);
+ }
+}
+
+int
+dht_reconfigure (xlator_t *this, dict_t *options)
+{
+ dht_conf_t *conf = NULL;
+ char *temp_str = NULL;
+ gf_boolean_t search_unhashed;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", options, out);
+
+ conf = this->private;
+ if (!conf)
+ return 0;
+
+ if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) {
+ /* If option is not "auto", other options _should_ be boolean*/
+ if (strcasecmp (temp_str, "auto")) {
+ if (!gf_string2boolean (temp_str, &search_unhashed)) {
+ gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
+ " lookup-unhashed reconfigured (%s)",
+ temp_str);
+ conf->search_unhashed = search_unhashed;
+ } else {
+ gf_log(this->name, GF_LOG_ERROR, "Reconfigure:"
+ " lookup-unhashed should be boolean,"
+ " not (%s), defaulting to (%d)",
+ temp_str, conf->search_unhashed);
+ //return -1;
+ ret = -1;
+ goto out;
+ }
+ } else {
+ gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
+ " lookup-unhashed reconfigured auto ");
+ conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
+ }
+ }
+
+ GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options,
+ percent_or_size, out);
+ /* option can be any one of percent or bytes */
+ conf->disk_unit = 0;
+ if (conf->min_free_disk < 100.0)
+ conf->disk_unit = 'p';
+
+ GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options,
+ percent, out);
+
+ GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt,
+ options, uint32, out);
+
+ GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options,
+ bool, out);
+ if (conf->defrag) {
+ GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats,
+ options, bool, out);
+ }
+
+ if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) {
+ ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
+ if (ret == -1)
+ goto out;
+ } else {
+ ret = dht_decommissioned_remove (this, conf);
+ if (ret == -1)
+ goto out;
+ }
+
+ dht_init_regex (this, options, "rsync-hash-regex",
+ &conf->rsync_regex, &conf->rsync_regex_valid);
+ dht_init_regex (this, options, "extra-hash-regex",
+ &conf->extra_regex, &conf->extra_regex_valid);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static int
+gf_defrag_pattern_list_fill (xlator_t *this, gf_defrag_info_t *defrag, char *data)
+{
+ int ret = -1;
+ char *tmp_str = NULL;
+ char *tmp_str1 = NULL;
+ char *dup_str = NULL;
+ char *num = NULL;
+ char *pattern_str = NULL;
+ char *pattern = NULL;
+ gf_defrag_pattern_list_t *temp_list = NULL;
+ gf_defrag_pattern_list_t *pattern_list = NULL;
+
+ if (!this || !defrag || !data)
+ goto out;
+
+ /* Get the pattern for pattern list. "pattern:<optional-size>"
+ * eg: *avi, *pdf:10MB, *:1TB
+ */
+ pattern_str = strtok_r (data, ",", &tmp_str);
+ while (pattern_str) {
+ dup_str = gf_strdup (pattern_str);
+ pattern_list = GF_CALLOC (1, sizeof (gf_defrag_pattern_list_t),
+ 1);
+ if (!pattern_list) {
+ goto out;
+ }
+ pattern = strtok_r (dup_str, ":", &tmp_str1);
+ num = strtok_r (NULL, ":", &tmp_str1);
+ if (!pattern)
+ goto out;
+ if (!num) {
+ if (gf_string2bytesize(pattern, &pattern_list->size)
+ == 0) {
+ pattern = "*";
+ }
+ } else if (gf_string2bytesize (num, &pattern_list->size) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "invalid number format \"%s\"", num);
+ goto out;
+ }
+ memcpy (pattern_list->path_pattern, pattern, strlen (dup_str));
+
+ if (!defrag->defrag_pattern)
+ temp_list = NULL;
+ else
+ temp_list = defrag->defrag_pattern;
+
+ pattern_list->next = temp_list;
+
+ defrag->defrag_pattern = pattern_list;
+ pattern_list = NULL;
+
+ GF_FREE (dup_str);
+ dup_str = NULL;
+
+ pattern_str = strtok_r (NULL, ",", &tmp_str);
+ }
+
+ ret = 0;
+out:
+ if (ret)
+ GF_FREE (pattern_list);
+ GF_FREE (dup_str);
+
+ return ret;
+}
+
+int
+dht_init (xlator_t *this)
+{
+ dht_conf_t *conf = NULL;
+ char *temp_str = NULL;
+ int ret = -1;
+ int i = 0;
+ gf_defrag_info_t *defrag = NULL;
+ int cmd = 0;
+ char *node_uuid = NULL;
+
+
+ GF_VALIDATE_OR_GOTO ("dht", this, err);
+
+ if (!this->children) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Distribute needs more than one subvolume");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile");
+ }
+
+ conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t);
+ if (!conf) {
+ goto err;
+ }
+
+ ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd);
+
+ if (cmd) {
+ defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t),
+ gf_defrag_info_mt);
+
+ GF_VALIDATE_OR_GOTO (this->name, defrag, err);
+
+ LOCK_INIT (&defrag->lock);
+
+ defrag->is_exiting = 0;
+
+ conf->defrag = defrag;
+
+ ret = dict_get_str (this->options, "node-uuid", &node_uuid);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "node-uuid not "
+ "specified");
+ goto err;
+ }
+
+ if (uuid_parse (node_uuid, defrag->node_uuid)) {
+ gf_log (this->name, GF_LOG_ERROR, "Cannot parse "
+ "glusterd node uuid");
+ goto err;
+ }
+
+ defrag->cmd = cmd;
+
+ defrag->stats = _gf_false;
+ }
+
+ conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
+ if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
+ /* If option is not "auto", other options _should_ be boolean */
+ if (strcasecmp (temp_str, "auto"))
+ gf_string2boolean (temp_str, &conf->search_unhashed);
+ else
+ conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
+ }
+
+ GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool,
+ err);
+
+ GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err);
+
+ GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size,
+ err);
+
+ GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent,
+ err);
+
+ conf->dir_spread_cnt = conf->subvolume_cnt;
+ GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt,
+ uint32, err);
+
+ GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down,
+ bool, err);
+
+ GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err);
+
+ if (defrag) {
+ GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err);
+ if (dict_get_str (this->options, "rebalance-filter", &temp_str)
+ == 0) {
+ if (gf_defrag_pattern_list_fill (this, defrag, temp_str)
+ == -1) {
+ gf_log (this->name, GF_LOG_ERROR, "Cannot parse"
+ " rebalance-filter (%s)", temp_str);
+ goto err;
+ }
+ }
+ }
+
+ /* option can be any one of percent or bytes */
+ conf->disk_unit = 0;
+ if (conf->min_free_disk < 100)
+ conf->disk_unit = 'p';
+
+ ret = dht_init_subvolumes (this, conf);
+ if (ret == -1) {
+ goto err;
+ }
+
+ if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) {
+ ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
+ if (ret == -1)
+ goto err;
+ }
+
+ dht_init_regex (this, this->options, "rsync-hash-regex",
+ &conf->rsync_regex, &conf->rsync_regex_valid);
+ dht_init_regex (this, this->options, "extra-hash-regex",
+ &conf->extra_regex, &conf->extra_regex_valid);
+
+ ret = dht_layouts_init (this, conf);
+ if (ret == -1) {
+ goto err;
+ }
+
+ LOCK_INIT (&conf->subvolume_lock);
+ LOCK_INIT (&conf->layout_lock);
+
+ conf->gen = 1;
+
+ this->local_pool = mem_pool_new (dht_local_t, 512);
+ if (!this->local_pool) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to create local_t's memory pool");
+ goto err;
+ }
+
+ GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err);
+ gf_asprintf (&conf->link_xattr_name, "%s.linkto", conf->xattr_name);
+ gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name);
+ if (!conf->link_xattr_name || !conf->wild_xattr_name) {
+ goto err;
+ }
+
+ this->private = conf;
+
+ return 0;
+
+err:
+ if (conf) {
+ if (conf->file_layouts) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ GF_FREE (conf->file_layouts[i]);
+ }
+ GF_FREE (conf->file_layouts);
+ }
+
+ GF_FREE (conf->subvolumes);
+
+ GF_FREE (conf->subvolume_status);
+
+ GF_FREE (conf->du_stats);
+
+ GF_FREE (conf->defrag);
+
+ GF_FREE (conf->xattr_name);
+ GF_FREE (conf->link_xattr_name);
+ GF_FREE (conf->wild_xattr_name);
+
+ GF_FREE (conf);
+ }
+
+ return -1;
+}
+
+
+struct volume_options options[] = {
+ { .key = {"lookup-unhashed"},
+ .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
+ "on", "off"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "on",
+ .description = "This option if set to ON, does a lookup through "
+ "all the sub-volumes, in case a lookup didn't return any result "
+ "from the hash subvolume. If set to OFF, it does not do a lookup "
+ "on the remaining subvolumes."
+ },
+ { .key = {"min-free-disk"},
+ .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
+ .default_value = "10%",
+ .description = "Percentage/Size of disk space, after which the "
+ "process starts balancing out the cluster, and logs will appear "
+ "in log files",
+ },
+ { .key = {"min-free-inodes"},
+ .type = GF_OPTION_TYPE_PERCENT,
+ .default_value = "5%",
+ .description = "after system has only N% of inodes, warnings "
+ "starts to appear in log files",
+ },
+ { .key = {"unhashed-sticky-bit"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
+ { .key = {"use-readdirp"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "This option if set to ON, forces the use of "
+ "readdirp, and hence also displays the stats of the files."
+ },
+ { .key = {"assert-no-child-down"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option if set to ON, in the event of "
+ "CHILD_DOWN, will call exit."
+ },
+ { .key = {"directory-layout-spread"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Specifies the directory layout spread."
+ },
+ { .key = {"decommissioned-bricks"},
+ .type = GF_OPTION_TYPE_ANY,
+ .description = "This option if set to ON, decommissions "
+ "the brick, so that no new data is allowed to be created "
+ "on that brick."
+ },
+ { .key = {"rebalance-cmd"},
+ .type = GF_OPTION_TYPE_INT,
+ },
+ { .key = {"node-uuid"},
+ .type = GF_OPTION_TYPE_STR,
+ },
+ { .key = {"rebalance-stats"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option if set to ON displays and logs the "
+ " time taken for migration of each file, during the rebalance "
+ "process. If set to OFF, the rebalance logs will only display the "
+ "time spent in each directory."
+ },
+ { .key = {"readdir-optimize"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option if set to ON enables the optimization "
+ "that allows DHT to requests non-first subvolumes to filter out "
+ "directory entries."
+ },
+ { .key = {"rsync-hash-regex"},
+ .type = GF_OPTION_TYPE_STR,
+ /* Setting a default here doesn't work. See dht_init_regex. */
+ .description = "Regular expression for stripping temporary-file "
+ "suffix and prefix used by rsync, to prevent relocation when the "
+ "file is renamed."
+ },
+ { .key = {"extra-hash-regex"},
+ .type = GF_OPTION_TYPE_STR,
+ /* Setting a default here doesn't work. See dht_init_regex. */
+ .description = "Regular expression for stripping temporary-file "
+ "suffix and prefix used by an application, to prevent relocation when "
+ "the file is renamed."
+ },
+ { .key = {"rebalance-filter"},
+ .type = GF_OPTION_TYPE_STR,
+ },
+
+ { .key = {"xattr-name"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "trusted.glusterfs.dht",
+ .description = "Base for extended attributes used by this "
+ "translator instance, to avoid conflicts with others above or "
+ "below it."
+ },
+
+ /* NUFA option */
+ { .key = {"local-volume-name"},
+ .type = GF_OPTION_TYPE_XLATOR
+ },
+
+ /* switch option */
+ { .key = {"pattern.switch.case"},
+ .type = GF_OPTION_TYPE_ANY
+ },
+
+ { .key = {NULL} },
+};
diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c
index d70d713ac..fc0ca2f77 100644
--- a/xlators/cluster/dht/src/dht.c
+++ b/xlators/cluster/dht/src/dht.c
@@ -14,502 +14,15 @@
#include "config.h"
#endif
-/* TODO: add NS locking */
-
#include "statedump.h"
#include "dht-common.h"
-/* TODO:
- - use volumename in xattr instead of "dht"
- - use NS locks
- - handle all cases in self heal layout reconstruction
- - complete linkfile selfheal
-*/
-struct volume_options options[];
-
-void
-dht_layout_dump (dht_layout_t *layout, const char *prefix)
-{
-
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 0;
-
- if (!layout)
- goto out;
- if (!prefix)
- goto out;
-
- gf_proc_dump_build_key(key, prefix, "cnt");
- gf_proc_dump_write(key, "%d", layout->cnt);
- gf_proc_dump_build_key(key, prefix, "preset");
- gf_proc_dump_write(key, "%d", layout->preset);
- gf_proc_dump_build_key(key, prefix, "gen");
- gf_proc_dump_write(key, "%d", layout->gen);
- if (layout->type != IA_INVAL) {
- gf_proc_dump_build_key(key, prefix, "inode type");
- gf_proc_dump_write(key, "%d", layout->type);
- }
-
- if (!IA_ISDIR (layout->type))
- goto out;
-
- for (i = 0; i < layout->cnt; i++) {
- gf_proc_dump_build_key(key, prefix,"list[%d].err", i);
- gf_proc_dump_write(key, "%d", layout->list[i].err);
- gf_proc_dump_build_key(key, prefix,"list[%d].start", i);
- gf_proc_dump_write(key, "%u", layout->list[i].start);
- gf_proc_dump_build_key(key, prefix,"list[%d].stop", i);
- gf_proc_dump_write(key, "%u", layout->list[i].stop);
- if (layout->list[i].xlator) {
- gf_proc_dump_build_key(key, prefix,
- "list[%d].xlator.type", i);
- gf_proc_dump_write(key, "%s",
- layout->list[i].xlator->type);
- gf_proc_dump_build_key(key, prefix,
- "list[%d].xlator.name", i);
- gf_proc_dump_write(key, "%s",
- layout->list[i].xlator->name);
- }
- }
-
-out:
- return;
-}
-
-
-int32_t
-dht_priv_dump (xlator_t *this)
-{
- char key_prefix[GF_DUMP_MAX_BUF_LEN];
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 0;
- dht_conf_t *conf = NULL;
- int ret = -1;
-
- if (!this)
- goto out;
-
- conf = this->private;
- if (!conf)
- goto out;
-
- ret = TRY_LOCK(&conf->subvolume_lock);
- if (ret != 0) {
- return ret;
- }
-
- gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name);
- gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv",
- this->name);
- gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt);
- for (i = 0; i < conf->subvolume_cnt; i++) {
- sprintf (key, "subvolumes[%d]", i);
- gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type,
- conf->subvolumes[i]->name);
- if (conf->file_layouts && conf->file_layouts[i]){
- sprintf (key, "file_layouts[%d]", i);
- dht_layout_dump(conf->file_layouts[i], key);
- }
- if (conf->dir_layouts && conf->dir_layouts[i]) {
- sprintf (key, "dir_layouts[%d]", i);
- dht_layout_dump(conf->dir_layouts[i], key);
- }
- if (conf->subvolume_status) {
-
- sprintf (key, "subvolume_status[%d]", i);
- gf_proc_dump_write(key, "%d",
- (int)conf->subvolume_status[i]);
- }
-
- }
-
- gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed);
- gf_proc_dump_write("gen", "%d", conf->gen);
- gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk);
- gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes);
- gf_proc_dump_write("disk_unit", "%c", conf->disk_unit);
- gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval);
- gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit);
- if (conf ->du_stats) {
- gf_proc_dump_write("du_stats.avail_percent", "%lf",
- conf->du_stats->avail_percent);
- gf_proc_dump_write("du_stats.avail_space", "%lu",
- conf->du_stats->avail_space);
- gf_proc_dump_write("du_stats.avail_inodes", "%lf",
- conf->du_stats->avail_inodes);
- gf_proc_dump_write("du_stats.log", "%lu", conf->du_stats->log);
- }
-
- if (conf->last_stat_fetch.tv_sec)
- gf_proc_dump_write("last_stat_fetch", "%s",
- ctime(&conf->last_stat_fetch.tv_sec));
-
- UNLOCK(&conf->subvolume_lock);
-
-out:
- return ret;
-}
-
-int32_t
-dht_inodectx_dump (xlator_t *this, inode_t *inode)
-{
- int ret = -1;
- dht_layout_t *layout = NULL;
-
- if (!this)
- goto out;
- if (!inode)
- goto out;
-
- ret = dht_inode_ctx_layout_get (inode, this, &layout);
-
- if ((ret != 0) || !layout)
- return ret;
-
- gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name);
- dht_layout_dump(layout, "layout");
-
-out:
- return ret;
-}
-
-int
-notify (xlator_t *this, int event, void *data, ...)
-{
- int ret = -1;
- va_list ap;
- dict_t *output = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
-
-
- if (!data)
- goto out;
-
- va_start (ap, data);
- output = va_arg (ap, dict_t*);
-
- ret = dht_notify (this, event, data, output);
-
-out:
- return ret;
-}
-
-void
-fini (xlator_t *this)
-{
- int i = 0;
- dht_conf_t *conf = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
-
- conf = this->private;
- this->private = NULL;
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf);
- }
-out:
- return;
-}
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
-
- ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
- }
-out:
- return ret;
-}
-
-
-int
-dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf,
- const char *bricks)
-{
- int i = 0;
- int ret = -1;
- char *tmpstr = NULL;
- char *dup_brick = NULL;
- char *node = NULL;
-
- if (!conf || !bricks)
- goto out;
-
- dup_brick = gf_strdup (bricks);
- node = strtok_r (dup_brick, ",", &tmpstr);
- while (node) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (!strcmp (conf->subvolumes[i]->name, node)) {
- conf->decommissioned_bricks[i] =
- conf->subvolumes[i];
- conf->decommission_subvols_cnt++;
- gf_log (this->name, GF_LOG_INFO,
- "decommissioning subvolume %s",
- conf->subvolumes[i]->name);
- break;
- }
- }
- if (i == conf->subvolume_cnt) {
- /* Wrong node given. */
- goto out;
- }
- node = strtok_r (NULL, ",", &tmpstr);
- }
-
- ret = 0;
- conf->decommission_in_progress = 1;
-out:
- GF_FREE (dup_brick);
-
- return ret;
-}
-
-int
-reconfigure (xlator_t *this, dict_t *options)
-{
- dht_conf_t *conf = NULL;
- char *temp_str = NULL;
- gf_boolean_t search_unhashed;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", options, out);
-
- conf = this->private;
- if (!conf)
- return 0;
-
- if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean*/
- if (strcasecmp (temp_str, "auto")) {
- if (!gf_string2boolean (temp_str, &search_unhashed)) {
- gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
- " lookup-unhashed reconfigured (%s)",
- temp_str);
- conf->search_unhashed = search_unhashed;
- } else {
- gf_log(this->name, GF_LOG_ERROR, "Reconfigure:"
- " lookup-unhashed should be boolean,"
- " not (%s), defaulting to (%d)",
- temp_str, conf->search_unhashed);
- //return -1;
- ret = -1;
- goto out;
- }
- } else {
- gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
- " lookup-unhashed reconfigured auto ");
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
- }
-
- GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options,
- percent_or_size, out);
- /* option can be any one of percent or bytes */
- conf->disk_unit = 0;
- if (conf->min_free_disk < 100.0)
- conf->disk_unit = 'p';
-
- GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options,
- percent, out);
-
- GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt,
- options, uint32, out);
-
- GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options,
- bool, out);
- if (conf->defrag) {
- GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats,
- options, bool, out);
- }
-
- if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) {
- ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
- if (ret == -1)
- goto out;
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-
-int
-init (xlator_t *this)
-{
- dht_conf_t *conf = NULL;
- char *temp_str = NULL;
- int ret = -1;
- int i = 0;
- gf_defrag_info_t *defrag = NULL;
- int cmd = 0;
- char *node_uuid = NULL;
-
-
- GF_VALIDATE_OR_GOTO ("dht", this, err);
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "Distribute needs more than one subvolume");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile");
- }
-
- conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t);
- if (!conf) {
- goto err;
- }
-
- ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd);
-
- if (cmd) {
- defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t),
- gf_defrag_info_mt);
-
- GF_VALIDATE_OR_GOTO (this->name, defrag, err);
-
- LOCK_INIT (&defrag->lock);
-
- defrag->is_exiting = 0;
-
- conf->defrag = defrag;
-
- ret = dict_get_str (this->options, "node-uuid", &node_uuid);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "node-uuid not "
- "specified");
- goto err;
- }
-
- if (uuid_parse (node_uuid, defrag->node_uuid)) {
- gf_log (this->name, GF_LOG_ERROR, "Cannot parse "
- "glusterd node uuid");
- goto err;
- }
-
- defrag->cmd = cmd;
-
- defrag->stats = _gf_false;
- }
-
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
- if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean */
- if (strcasecmp (temp_str, "auto"))
- gf_string2boolean (temp_str, &conf->search_unhashed);
- else
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
-
- GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool,
- err);
-
- GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err);
-
- GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size,
- err);
-
- GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent,
- err);
-
- conf->dir_spread_cnt = conf->subvolume_cnt;
- GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt,
- uint32, err);
-
- GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down,
- bool, err);
-
- GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err);
-
- if (defrag) {
- GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err);
- }
-
- /* option can be any one of percent or bytes */
- conf->disk_unit = 0;
- if (conf->min_free_disk < 100)
- conf->disk_unit = 'p';
-
- ret = dht_init_subvolumes (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) {
- ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
- if (ret == -1)
- goto err;
- }
-
- ret = dht_layouts_init (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- LOCK_INIT (&conf->subvolume_lock);
- LOCK_INIT (&conf->layout_lock);
-
- conf->gen = 1;
-
- this->local_pool = mem_pool_new (dht_local_t, 512);
- if (!this->local_pool) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to create local_t's memory pool");
- goto err;
- }
-
- this->private = conf;
-
- return 0;
-
-err:
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf->du_stats);
-
- GF_FREE (conf->defrag);
-
- GF_FREE (conf);
- }
-
- return -1;
-}
-
+class_methods_t class_methods = {
+ .init = dht_init,
+ .fini = dht_fini,
+ .reconfigure = dht_reconfigure,
+ .notify = dht_notify
+};
struct xlator_fops fops = {
.lookup = dht_lookup,
@@ -557,6 +70,9 @@ struct xlator_fops fops = {
.fxattrop = dht_fxattrop,
.setattr = dht_setattr,
.fsetattr = dht_fsetattr,
+ .fallocate = dht_fallocate,
+ .discard = dht_discard,
+ .zerofill = dht_zerofill,
};
struct xlator_dumpops dumpops = {
@@ -570,59 +86,4 @@ struct xlator_cbks cbks = {
// .releasedir = dht_releasedir,
.forget = dht_forget
};
-
-
-struct volume_options options[] = {
- { .key = {"lookup-unhashed"},
- .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
- "on", "off"},
- .type = GF_OPTION_TYPE_STR,
- .default_value = "on",
- },
- { .key = {"min-free-disk"},
- .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
- .default_value = "10%",
- .description = "Percentage/Size of disk space that must be "
- "kept free."
- },
- { .key = {"min-free-inodes"},
- .type = GF_OPTION_TYPE_PERCENT,
- .default_value = "5%",
- .description = "Percentage inodes that must be "
- "kept free."
- },
- { .key = {"unhashed-sticky-bit"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- },
- { .key = {"use-readdirp"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
- },
- { .key = {"assert-no-child-down"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- },
- { .key = {"directory-layout-spread"},
- .type = GF_OPTION_TYPE_INT,
- },
- { .key = {"decommissioned-bricks"},
- .type = GF_OPTION_TYPE_ANY,
- },
- { .key = {"rebalance-cmd"},
- .type = GF_OPTION_TYPE_INT,
- },
- { .key = {"node-uuid"},
- .type = GF_OPTION_TYPE_STR,
- },
- { .key = {"rebalance-stats"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- },
- { .key = {"readdir-optimize"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- },
-
- { .key = {NULL} },
-};
+;
diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c
index 701d7ae8d..e934acdf0 100644
--- a/xlators/cluster/dht/src/nufa.c
+++ b/xlators/cluster/dht/src/nufa.c
@@ -18,6 +18,8 @@
/* TODO: all 'TODO's in dht.c holds good */
+extern struct volume_options options[];
+
int
nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno,
@@ -52,7 +54,8 @@ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == -1)
goto out;
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
is_dir = check_is_dir (inode, stbuf, xattr);
if (!is_dir && !is_linkfile) {
@@ -201,7 +204,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this,
* revalidates directly go to the cached-subvolume.
*/
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"Failed to set dict value.");
@@ -222,7 +225,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this,
} else {
do_fresh_lookup:
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"Failed to set dict value.");
@@ -231,7 +234,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this,
}
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht.linkto", 256);
+ conf->link_xattr_name, 256);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"Failed to set dict value.");
@@ -320,7 +323,8 @@ nufa_create (call_frame_t *frame, xlator_t *this,
if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) {
avail_subvol =
dht_free_disk_available_subvol (this,
- (xlator_t *)conf->private);
+ (xlator_t *)conf->private,
+ local);
}
if (subvol != avail_subvol) {
@@ -330,9 +334,8 @@ nufa_create (call_frame_t *frame, xlator_t *this,
local->flags = flags;
local->umask = umask;
local->cached_subvol = avail_subvol;
- dht_linkfile_create (frame,
- nufa_create_linkfile_create_cbk,
- avail_subvol, subvol, loc);
+ dht_linkfile_create (frame, nufa_create_linkfile_create_cbk,
+ this, avail_subvol, subvol, loc);
return 0;
}
@@ -362,17 +365,22 @@ nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_local_t *local = NULL;
local = frame->local;
+ if (!local || !local->cached_subvol) {
+ op_errno = EINVAL;
+ op_ret = -1;
+ goto err;
+ }
if (op_ret >= 0) {
- STACK_WIND (frame, dht_newfile_cbk,
- local->cached_subvol,
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk,
+ (void *)local->cached_subvol, local->cached_subvol,
local->cached_subvol->fops->mknod,
&local->loc, local->mode, local->rdev,
local->umask, local->params);
return 0;
}
-
+err:
WIPE (postparent);
WIPE (preparent);
@@ -420,7 +428,8 @@ nufa_mknod (call_frame_t *frame, xlator_t *this,
if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) {
avail_subvol =
dht_free_disk_available_subvol (this,
- (xlator_t *)conf->private);
+ (xlator_t *)conf->private,
+ local);
}
if (avail_subvol != subvol) {
@@ -432,7 +441,7 @@ nufa_mknod (call_frame_t *frame, xlator_t *this,
local->rdev = rdev;
local->cached_subvol = avail_subvol;
- dht_linkfile_create (frame, nufa_mknod_linkfile_cbk,
+ dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, this,
avail_subvol, subvol, loc);
return 0;
}
@@ -440,9 +449,9 @@ nufa_mknod (call_frame_t *frame, xlator_t *this,
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->mknod,
- loc, mode, rdev, umask, params);
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->mknod, loc, mode, rdev, umask,
+ params);
return 0;
@@ -455,42 +464,6 @@ err:
}
-int
-notify (xlator_t *this, int event, void *data, ...)
-{
- int ret = -1;
-
- ret = dht_notify (this, event, data);
-
- return ret;
-}
-
-void
-fini (xlator_t *this)
-{
- int i = 0;
- dht_conf_t *conf = NULL;
-
- conf = this->private;
-
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf);
- }
-
- return;
-}
-
gf_boolean_t
same_first_part (char *str1, char term1, char *str2, char term2)
{
@@ -511,194 +484,152 @@ same_first_part (char *str1, char term1, char *str2, char term2)
}
}
-int
-init (xlator_t *this)
-{
- dht_conf_t *conf = NULL;
- xlator_list_t *trav = NULL;
- data_t *data = NULL;
- char *local_volname = NULL;
- char *temp_str = NULL;
- int ret = -1;
- int i = 0;
- char my_hostname[256];
- double temp_free_disk = 0;
- uint64_t size = 0;
- xlator_t *local_subvol = NULL;
- char *brick_host = NULL;
- xlator_t *kid = NULL;
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "NUFA needs more than one subvolume");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile");
- }
+typedef struct nufa_args {
+ xlator_t *this;
+ char *volname;
+ gf_boolean_t addr_match;
+} nufa_args_t;
- conf = GF_CALLOC (1, sizeof (*conf),
- gf_dht_mt_dht_conf_t);
- if (!conf) {
- goto err;
- }
-
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
- if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean */
- if (strcasecmp (temp_str, "auto"))
- gf_string2boolean (temp_str, &conf->search_unhashed);
- else
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
-
- ret = dht_init_subvolumes (this, conf);
- if (ret == -1) {
- goto err;
+static void
+nufa_find_local_brick (xlator_t *xl, void *data)
+{
+ nufa_args_t *args = data;
+ xlator_t *this = args->this;
+ char *local_volname = args->volname;
+ gf_boolean_t addr_match = args->addr_match;
+ char *brick_host = NULL;
+ dht_conf_t *conf = this->private;
+ int ret = -1;
+
+ /*This means a local subvol was already found. We pick the first brick
+ * that is local*/
+ if (conf->private)
+ return;
+
+ if (strcmp (xl->name, local_volname) == 0) {
+ conf->private = xl;
+ gf_log (this->name, GF_LOG_INFO, "Using specified subvol %s",
+ local_volname);
+ return;
+ }
+
+ if (!addr_match)
+ return;
+
+ ret = dict_get_str (xl->options, "remote-host", &brick_host);
+ if ((ret == 0) &&
+ (gf_is_same_address (local_volname, brick_host) ||
+ gf_is_local_addr (brick_host))) {
+ conf->private = xl;
+ gf_log (this->name, GF_LOG_INFO, "Using the first local "
+ "subvol %s", xl->name);
+ return;
}
- ret = dht_layouts_init (this, conf);
- if (ret == -1) {
- goto err;
- }
+}
- LOCK_INIT (&conf->subvolume_lock);
- LOCK_INIT (&conf->layout_lock);
+static void
+nufa_to_dht (xlator_t *this)
+{
+ GF_ASSERT (this);
+ GF_ASSERT (this->fops);
- conf->gen = 1;
+ this->fops->lookup = dht_lookup;
+ this->fops->create = dht_create;
+ this->fops->mknod = dht_mknod;
+}
- local_volname = "localhost";
- ret = gethostname (my_hostname, 256);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "could not find hostname (%s)",
- strerror (errno));
+int
+nufa_find_local_subvol (xlator_t *this,
+ void (*fn) (xlator_t *each, void* data), void *data)
+{
+ int ret = -1;
+ dht_conf_t *conf = this->private;
+ xlator_list_t *trav = NULL;
+ xlator_t *parent = NULL;
+ xlator_t *candidate = NULL;
+
+ xlator_foreach_depth_first (this, fn, data);
+ if (!conf->private) {
+ gf_log (this->name, GF_LOG_ERROR, "Couldn't find a local "
+ "brick");
+ return -1;
}
- if (ret == 0)
- local_volname = my_hostname;
-
- data = dict_get (this->options, "local-volume-name");
- if (data) {
- local_volname = data->data;
- }
+ candidate = conf->private;
+ trav = candidate->parents;
+ while (trav) {
- for (trav = this->children; trav; trav = trav->next) {
- if (strcmp (trav->xlator->name, local_volname) == 0)
+ parent = trav->xlator;
+ if (strcmp (parent->type, "cluster/nufa") == 0) {
+ gf_log (this->name, GF_LOG_INFO, "Found local subvol, "
+ "%s", candidate->name);
+ ret = 0;
+ conf->private = candidate;
break;
- if (local_subvol) {
- continue;
- }
- kid = trav->xlator;
- for (;;) {
- if (dict_get_str(trav->xlator->options,"remote-host",
- &brick_host) == 0) {
- /* Found it. */
- break;
- }
- if (!kid->children) {
- /* Nowhere further to look. */
- gf_log (this->name, GF_LOG_ERROR,
- "could not get remote-host");
- goto err;
- }
- if (kid->children->next) {
- /* Multiple choices, can't/shouldn't decide. */
- gf_log (this->name, GF_LOG_ERROR,
- "NUFA found fan-out (type %s) volume",
- kid->type);
- goto err;
- }
- /* One-to-one xlators are OK, try the next one. */
- kid = kid->children->xlator;
}
- if (same_first_part(my_hostname,'.',brick_host,'.')) {
- local_subvol = trav->xlator;
- }
- }
- if (trav) {
- gf_log (this->name, GF_LOG_INFO,
- "Using specified subvol %s", local_volname);
- conf->private = trav->xlator;
- }
- else if (local_subvol) {
- gf_log (this->name, GF_LOG_INFO,
- "Using first local subvol %s", local_subvol->name);
- conf->private = local_subvol;
- }
- else {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not find specified or local subvol");
- goto err;
-
+ candidate = parent;
+ trav = parent->parents;
}
- /* The volume specified exists */
-
- conf->min_free_disk = 10;
- conf->disk_unit = 'p';
-
- if (dict_get_str (this->options, "min-free-disk",
- &temp_str) == 0) {
- if (gf_string2percent (temp_str,
- &temp_free_disk) == 0) {
- if (temp_free_disk > 100) {
- gf_string2bytesize (temp_str, &size);
- conf->min_free_disk = size;
- conf->disk_unit = 'b';
- } else {
- conf->min_free_disk = temp_free_disk;
- conf->disk_unit = 'p';
- }
- } else {
- gf_string2bytesize (temp_str, &size);
- conf->min_free_disk = size;
- conf->disk_unit = 'b';
- }
- }
+ return ret;
+}
- conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t),
- gf_dht_mt_dht_du_t);
- if (!conf->du_stats) {
- goto err;
- }
+int
+nufa_init (xlator_t *this)
+{
+ data_t *data = NULL;
+ char *local_volname = NULL;
+ int ret = -1;
+ char my_hostname[256];
+ gf_boolean_t addr_match = _gf_false;
+ nufa_args_t args = {0, };
- this->local_pool = mem_pool_new (dht_local_t, 128);
- if (!this->local_pool) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to create local_t's memory pool");
- goto err;
+ ret = dht_init(this);
+ if (ret) {
+ return ret;
}
- this->private = conf;
-
- return 0;
-
-err:
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
+ if ((data = dict_get (this->options, "local-volume-name"))) {
+ local_volname = data->data;
- GF_FREE (conf->subvolume_status);
+ } else {
+ addr_match = _gf_true;
+ local_volname = "localhost";
+ ret = gethostname (my_hostname, 256);
+ if (ret == 0)
+ local_volname = my_hostname;
- GF_FREE (conf->du_stats);
+ else
+ gf_log (this->name, GF_LOG_WARNING,
+ "could not find hostname (%s)",
+ strerror (errno));
- GF_FREE (conf);
}
- return -1;
+ args.this = this;
+ args.volname = local_volname;
+ args.addr_match = addr_match;
+ ret = nufa_find_local_subvol (this, nufa_find_local_brick, &args);
+ if (ret) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Unable to find local subvolume, switching "
+ "to dht mode");
+ nufa_to_dht (this);
+ }
+ return 0;
}
+class_methods_t class_methods = {
+ .init = nufa_init,
+ .fini = dht_fini,
+ .reconfigure = dht_reconfigure,
+ .notify = dht_notify
+};
+
+
struct xlator_fops fops = {
.lookup = nufa_lookup,
.create = nufa_create,
@@ -743,19 +674,3 @@ struct xlator_fops fops = {
struct xlator_cbks cbks = {
.forget = dht_forget
};
-
-
-struct volume_options options[] = {
- { .key = {"lookup-unhashed"},
- .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
- "on", "off"},
- .type = GF_OPTION_TYPE_STR
- },
- { .key = {"local-volume-name"},
- .type = GF_OPTION_TYPE_XLATOR
- },
- { .key = {"min-free-disk"},
- .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
- },
- { .key = {NULL} },
-};
diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c
index 2b4f97387..d3ea90ba8 100644
--- a/xlators/cluster/dht/src/switch.c
+++ b/xlators/cluster/dht/src/switch.c
@@ -22,6 +22,8 @@
#include <fnmatch.h>
#include <string.h>
+extern struct volume_options options[];
+
struct switch_sched_array {
xlator_t *xl;
int32_t eligible;
@@ -135,7 +137,8 @@ switch_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == -1)
goto out;
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
is_dir = check_is_dir (inode, stbuf, xattr);
if (!is_dir && !is_linkfile) {
@@ -289,11 +292,11 @@ switch_lookup (call_frame_t *frame, xlator_t *this,
* attribute, revalidates directly go to the cached-subvolume.
*/
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (ret < 0)
gf_log (this->name, GF_LOG_WARNING,
- "failed to set dict value for "
- "trusted.glusterfs.dht");
+ "failed to set dict value for %s",
+ conf->xattr_name);
for (i = 0; i < layout->cnt; i++) {
subvol = layout->list[i].xlator;
@@ -308,18 +311,18 @@ switch_lookup (call_frame_t *frame, xlator_t *this,
} else {
do_fresh_lookup:
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (ret < 0)
gf_log (this->name, GF_LOG_WARNING,
- "failed to set dict value for "
- "trusted.glusterfs.dht");
+ "failed to set dict value for %s",
+ conf->xattr_name);
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht.linkto", 256);
+ conf->link_xattr_name, 256);
if (ret < 0)
gf_log (this->name, GF_LOG_WARNING,
- "failed to set dict value for "
- "trusted.glusterfs.dht.linkto");
+ "failed to set dict value for %s",
+ conf->link_xattr_name);
if (!hashed_subvol) {
gf_log (this->name, GF_LOG_DEBUG,
@@ -434,7 +437,8 @@ switch_create (call_frame_t *frame, xlator_t *this,
avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol);
if (dht_is_subvol_filled (this, avail_subvol)) {
avail_subvol =
- dht_free_disk_available_subvol (this, avail_subvol);
+ dht_free_disk_available_subvol (this, avail_subvol,
+ local);
}
if (subvol != avail_subvol) {
@@ -443,9 +447,8 @@ switch_create (call_frame_t *frame, xlator_t *this,
local->flags = flags;
local->umask = umask;
local->cached_subvol = avail_subvol;
- dht_linkfile_create (frame,
- switch_create_linkfile_create_cbk,
- avail_subvol, subvol, loc);
+ dht_linkfile_create (frame, switch_create_linkfile_create_cbk,
+ this, avail_subvol, subvol, loc);
return 0;
}
@@ -475,17 +478,22 @@ switch_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_local_t *local = NULL;
local = frame->local;
+ if (!local || !local->cached_subvol) {
+ op_errno = EINVAL;
+ op_ret = -1;
+ goto err;
+ }
if (op_ret >= 0) {
- STACK_WIND (frame, dht_newfile_cbk,
- local->cached_subvol,
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk,
+ (void *)local->cached_subvol, local->cached_subvol,
local->cached_subvol->fops->mknod,
&local->loc, local->mode, local->rdev,
local->umask, local->params);
return 0;
}
-
+err:
DHT_STACK_UNWIND (link, frame, op_ret, op_errno,
inode, stbuf, preparent, postparent, xdata);
return 0;
@@ -529,7 +537,8 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol);
if (dht_is_subvol_filled (this, avail_subvol)) {
avail_subvol =
- dht_free_disk_available_subvol (this, avail_subvol);
+ dht_free_disk_available_subvol (this, avail_subvol,
+ local);
}
if (avail_subvol != subvol) {
@@ -542,16 +551,16 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
local->cached_subvol = avail_subvol;
dht_linkfile_create (frame, switch_mknod_linkfile_cbk,
- avail_subvol, subvol, loc);
+ this, avail_subvol, subvol, loc);
return 0;
}
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->mknod,
- loc, mode, rdev, umask, params);
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->mknod, loc, mode, rdev, umask,
+ params);
return 0;
@@ -564,20 +573,9 @@ err:
}
-int
-notify (xlator_t *this, int event, void *data, ...)
-{
- int ret = -1;
-
- ret = dht_notify (this, event, data);
-
- return ret;
-}
-
void
-fini (xlator_t *this)
+switch_fini (xlator_t *this)
{
- int i = 0;
dht_conf_t *conf = NULL;
struct switch_struct *trav = NULL;
struct switch_struct *prev = NULL;
@@ -593,22 +591,9 @@ fini (xlator_t *this)
trav = trav->next;
GF_FREE (prev);
}
-
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf);
}
- return;
+ dht_fini(this);
}
int
@@ -834,68 +819,18 @@ err:
}
-int
-init (xlator_t *this)
+int32_t
+switch_init (xlator_t *this)
{
dht_conf_t *conf = NULL;
data_t *data = NULL;
- char *temp_str = NULL;
int ret = -1;
- int i = 0;
- double temp_free_disk = 0;
- uint64_t size = 0;
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "SWITCH needs more than one subvolume");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile");
- }
-
- conf = GF_CALLOC (1, sizeof (*conf), gf_switch_mt_dht_conf_t);
- if (!conf) {
- goto err;
- }
-
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
- if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean */
- if (strcasecmp (temp_str, "auto"))
- gf_string2boolean (temp_str, &conf->search_unhashed);
- else
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
-
- conf->unhashed_sticky_bit = 0;
- if (dict_get_str (this->options, "unhashed-sticky-bit",
- &temp_str) == 0) {
- gf_string2boolean (temp_str, &conf->unhashed_sticky_bit);
- }
- conf->min_free_disk = 10.0;
- conf->disk_unit = 'p';
-
- if (dict_get_str (this->options, "min-free-disk",
- &temp_str) == 0) {
- if (gf_string2percent (temp_str, &temp_free_disk) == 0) {
- if (temp_free_disk > 100) {
- gf_string2bytesize (temp_str, &size);
- conf->min_free_disk = size;
- conf->disk_unit = 'b';
- } else {
- conf->min_free_disk = temp_free_disk;
- conf->disk_unit = 'p';
- }
- } else {
- gf_string2bytesize (temp_str, &size);
- conf->min_free_disk = size;
- conf->disk_unit = 'b';
- }
+ ret = dht_init(this);
+ if (ret) {
+ return ret;
}
+ conf = this->private;
data = dict_get (this->options, "pattern.switch.case");
if (data) {
@@ -906,60 +841,23 @@ init (xlator_t *this)
}
}
- ret = dht_init_subvolumes (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- ret = dht_layouts_init (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- LOCK_INIT (&conf->subvolume_lock);
- LOCK_INIT (&conf->layout_lock);
-
- conf->gen = 1;
-
- conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t),
- gf_switch_mt_dht_du_t);
- if (!conf->du_stats) {
- goto err;
- }
-
- this->local_pool = mem_pool_new (dht_local_t, 128);
- if (!this->local_pool) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to create local_t's memory pool");
- goto err;
- }
-
this->private = conf;
-
return 0;
err:
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf->du_stats);
-
- GF_FREE (conf);
- }
-
+ dht_fini(this);
return -1;
}
+class_methods_t class_methods = {
+ .init = switch_init,
+ .fini = switch_fini,
+ .reconfigure = dht_reconfigure,
+ .notify = dht_notify
+};
+
+
struct xlator_fops fops = {
.lookup = switch_lookup,
.create = switch_create,
@@ -1004,19 +902,3 @@ struct xlator_fops fops = {
struct xlator_cbks cbks = {
.forget = dht_forget
};
-
-
-struct volume_options options[] = {
- { .key = {"lookup-unhashed"},
- .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
- "on", "off"},
- .type = GF_OPTION_TYPE_STR
- },
- { .key = {"pattern.switch.case"},
- .type = GF_OPTION_TYPE_ANY
- },
- { .key = {"min-free-disk"},
- .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
- },
- { .key = {NULL} },
-};
diff --git a/xlators/cluster/ha/src/Makefile.am b/xlators/cluster/ha/src/Makefile.am
index c5da56ff1..5c1364b7f 100644
--- a/xlators/cluster/ha/src/Makefile.am
+++ b/xlators/cluster/ha/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = ha.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster
-ha_la_LDFLAGS = -module -avoidversion
+ha_la_LDFLAGS = -module -avoid-version
ha_la_SOURCES = ha-helpers.c ha.c
ha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/cluster/map/src/Makefile.am b/xlators/cluster/map/src/Makefile.am
index a51c13573..a278b05e2 100644
--- a/xlators/cluster/map/src/Makefile.am
+++ b/xlators/cluster/map/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = map.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster
-map_la_LDFLAGS = -module -avoidversion
+map_la_LDFLAGS = -module -avoid-version
map_la_SOURCES = map.c map-helper.c
map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am
index 42846df83..2d151422a 100644
--- a/xlators/cluster/stripe/src/Makefile.am
+++ b/xlators/cluster/stripe/src/Makefile.am
@@ -2,7 +2,7 @@
xlator_LTLIBRARIES = stripe.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
-stripe_la_LDFLAGS = -module -avoidversion
+stripe_la_LDFLAGS = -module -avoid-version
stripe_la_SOURCES = stripe.c stripe-helpers.c \
$(top_builddir)/xlators/lib/src/libxlator.c
diff --git a/xlators/cluster/stripe/src/stripe-mem-types.h b/xlators/cluster/stripe/src/stripe-mem-types.h
index e05ba0c29..e9ac9cf46 100644
--- a/xlators/cluster/stripe/src/stripe-mem-types.h
+++ b/xlators/cluster/stripe/src/stripe-mem-types.h
@@ -20,6 +20,7 @@ enum gf_stripe_mem_types_ {
gf_stripe_mt_stripe_fd_ctx_t,
gf_stripe_mt_char,
gf_stripe_mt_int8_t,
+ gf_stripe_mt_int32_t,
gf_stripe_mt_xlator_t,
gf_stripe_mt_stripe_private_t,
gf_stripe_mt_stripe_options,
diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c
index 43e3c3ed2..69b510e23 100644
--- a/xlators/cluster/stripe/src/stripe.c
+++ b/xlators/cluster/stripe/src/stripe.c
@@ -3774,6 +3774,524 @@ err:
int32_t
+stripe_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t callcnt = 0;
+ stripe_local_t *local = NULL;
+ stripe_local_t *mlocal = NULL;
+ call_frame_t *prev = NULL;
+ call_frame_t *mframe = NULL;
+
+ if (!this || !frame || !frame->local || !cookie) {
+ gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
+ goto out;
+ }
+
+ prev = cookie;
+ local = frame->local;
+ mframe = local->orig_frame;
+ mlocal = mframe->local;
+
+ LOCK(&frame->lock);
+ {
+ callcnt = ++mlocal->call_count;
+
+ if (op_ret == 0) {
+ mlocal->post_buf = *postbuf;
+ mlocal->pre_buf = *prebuf;
+
+ mlocal->prebuf_blocks += prebuf->ia_blocks;
+ mlocal->postbuf_blocks += postbuf->ia_blocks;
+
+ correct_file_size(prebuf, mlocal->fctx, prev);
+ correct_file_size(postbuf, mlocal->fctx, prev);
+
+ if (mlocal->prebuf_size < prebuf->ia_size)
+ mlocal->prebuf_size = prebuf->ia_size;
+ if (mlocal->postbuf_size < postbuf->ia_size)
+ mlocal->postbuf_size = postbuf->ia_size;
+ }
+
+ /* return the first failure */
+ if (mlocal->op_ret == 0) {
+ mlocal->op_ret = op_ret;
+ mlocal->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
+ mlocal->pre_buf.ia_size = mlocal->prebuf_size;
+ mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
+ mlocal->post_buf.ia_size = mlocal->postbuf_size;
+ mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
+
+ STRIPE_STACK_UNWIND (fallocate, mframe, mlocal->op_ret,
+ mlocal->op_errno, &mlocal->pre_buf,
+ &mlocal->post_buf, NULL);
+ }
+out:
+ STRIPE_STACK_DESTROY(frame);
+ return 0;
+}
+
+int32_t
+stripe_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ stripe_local_t *local = NULL;
+ stripe_fd_ctx_t *fctx = NULL;
+ int32_t op_errno = 1;
+ int32_t idx = 0;
+ int32_t offset_offset = 0;
+ int32_t remaining_size = 0;
+ off_t fill_size = 0;
+ uint64_t stripe_size = 0;
+ uint64_t tmp_fctx = 0;
+ off_t dest_offset = 0;
+ call_frame_t *fframe = NULL;
+ stripe_local_t *flocal = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO (fd->inode, err);
+
+ inode_ctx_get (fd->inode, this, &tmp_fctx);
+ if (!tmp_fctx) {
+ op_errno = EINVAL;
+ goto err;
+ }
+ fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+ stripe_size = fctx->stripe_size;
+
+ STRIPE_VALIDATE_FCTX (fctx, err);
+
+ remaining_size = len;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ frame->local = local;
+ local->stripe_size = stripe_size;
+ local->fctx = fctx;
+
+ if (!stripe_size) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Wrong stripe size for the file");
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ while (1) {
+ fframe = copy_frame(frame);
+ flocal = mem_get0(this->local_pool);
+ if (!flocal) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ flocal->orig_frame = frame;
+ fframe->local = flocal;
+
+ /* send fallocate request to the associated child node */
+ idx = (((offset + offset_offset) /
+ local->stripe_size) % fctx->stripe_count);
+
+ fill_size = (local->stripe_size -
+ ((offset + offset_offset) % local->stripe_size));
+ if (fill_size > remaining_size)
+ fill_size = remaining_size;
+
+ remaining_size -= fill_size;
+
+ local->wind_count++;
+ if (remaining_size == 0)
+ local->unwind = 1;
+
+ dest_offset = offset + offset_offset;
+ if (fctx->stripe_coalesce)
+ dest_offset = coalesced_offset(dest_offset,
+ local->stripe_size, fctx->stripe_count);
+
+ /*
+ * TODO: Create a separate handler for coalesce mode that sends a
+ * single fallocate per-child (since the ranges are linear).
+ */
+ STACK_WIND(fframe, stripe_fallocate_cbk, fctx->xl_array[idx],
+ fctx->xl_array[idx]->fops->fallocate, fd, mode,
+ dest_offset, fill_size, xdata);
+
+ offset_offset += fill_size;
+ if (remaining_size == 0)
+ break;
+ }
+
+ return 0;
+err:
+ if (fframe)
+ STRIPE_STACK_DESTROY(fframe);
+
+ STRIPE_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+
+int32_t
+stripe_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t callcnt = 0;
+ stripe_local_t *local = NULL;
+ stripe_local_t *mlocal = NULL;
+ call_frame_t *prev = NULL;
+ call_frame_t *mframe = NULL;
+
+ if (!this || !frame || !frame->local || !cookie) {
+ gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
+ goto out;
+ }
+
+ prev = cookie;
+ local = frame->local;
+ mframe = local->orig_frame;
+ mlocal = mframe->local;
+
+ LOCK(&frame->lock);
+ {
+ callcnt = ++mlocal->call_count;
+
+ if (op_ret == 0) {
+ mlocal->post_buf = *postbuf;
+ mlocal->pre_buf = *prebuf;
+
+ mlocal->prebuf_blocks += prebuf->ia_blocks;
+ mlocal->postbuf_blocks += postbuf->ia_blocks;
+
+ correct_file_size(prebuf, mlocal->fctx, prev);
+ correct_file_size(postbuf, mlocal->fctx, prev);
+
+ if (mlocal->prebuf_size < prebuf->ia_size)
+ mlocal->prebuf_size = prebuf->ia_size;
+ if (mlocal->postbuf_size < postbuf->ia_size)
+ mlocal->postbuf_size = postbuf->ia_size;
+ }
+
+ /* return the first failure */
+ if (mlocal->op_ret == 0) {
+ mlocal->op_ret = op_ret;
+ mlocal->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
+ mlocal->pre_buf.ia_size = mlocal->prebuf_size;
+ mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
+ mlocal->post_buf.ia_size = mlocal->postbuf_size;
+ mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
+
+ STRIPE_STACK_UNWIND (discard, mframe, mlocal->op_ret,
+ mlocal->op_errno, &mlocal->pre_buf,
+ &mlocal->post_buf, NULL);
+ }
+out:
+ STRIPE_STACK_DESTROY(frame);
+ return 0;
+}
+
+int32_t
+stripe_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ stripe_local_t *local = NULL;
+ stripe_fd_ctx_t *fctx = NULL;
+ int32_t op_errno = 1;
+ int32_t idx = 0;
+ int32_t offset_offset = 0;
+ int32_t remaining_size = 0;
+ off_t fill_size = 0;
+ uint64_t stripe_size = 0;
+ uint64_t tmp_fctx = 0;
+ off_t dest_offset = 0;
+ call_frame_t *fframe = NULL;
+ stripe_local_t *flocal = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO (fd->inode, err);
+
+ inode_ctx_get (fd->inode, this, &tmp_fctx);
+ if (!tmp_fctx) {
+ op_errno = EINVAL;
+ goto err;
+ }
+ fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+ stripe_size = fctx->stripe_size;
+
+ STRIPE_VALIDATE_FCTX (fctx, err);
+
+ remaining_size = len;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ frame->local = local;
+ local->stripe_size = stripe_size;
+ local->fctx = fctx;
+
+ if (!stripe_size) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Wrong stripe size for the file");
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ while (1) {
+ fframe = copy_frame(frame);
+ flocal = mem_get0(this->local_pool);
+ if (!flocal) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ flocal->orig_frame = frame;
+ fframe->local = flocal;
+
+ /* send discard request to the associated child node */
+ idx = (((offset + offset_offset) /
+ local->stripe_size) % fctx->stripe_count);
+
+ fill_size = (local->stripe_size -
+ ((offset + offset_offset) % local->stripe_size));
+ if (fill_size > remaining_size)
+ fill_size = remaining_size;
+
+ remaining_size -= fill_size;
+
+ local->wind_count++;
+ if (remaining_size == 0)
+ local->unwind = 1;
+
+ dest_offset = offset + offset_offset;
+ if (fctx->stripe_coalesce)
+ dest_offset = coalesced_offset(dest_offset,
+ local->stripe_size, fctx->stripe_count);
+
+ /*
+ * TODO: Create a separate handler for coalesce mode that sends a
+ * single discard per-child (since the ranges are linear).
+ */
+ STACK_WIND(fframe, stripe_discard_cbk, fctx->xl_array[idx],
+ fctx->xl_array[idx]->fops->discard, fd, dest_offset,
+ fill_size, xdata);
+
+ offset_offset += fill_size;
+ if (remaining_size == 0)
+ break;
+ }
+
+ return 0;
+err:
+ if (fframe)
+ STRIPE_STACK_DESTROY(fframe);
+
+ STRIPE_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int32_t
+stripe_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t callcnt = 0;
+ stripe_local_t *local = NULL;
+ stripe_local_t *mlocal = NULL;
+ call_frame_t *prev = NULL;
+ call_frame_t *mframe = NULL;
+
+ if (!this || !frame || !frame->local || !cookie) {
+ gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
+ goto out;
+ }
+
+ prev = cookie;
+ local = frame->local;
+ mframe = local->orig_frame;
+ mlocal = mframe->local;
+
+ LOCK(&frame->lock);
+ {
+ callcnt = ++mlocal->call_count;
+
+ if (op_ret == 0) {
+ mlocal->post_buf = *postbuf;
+ mlocal->pre_buf = *prebuf;
+
+ mlocal->prebuf_blocks += prebuf->ia_blocks;
+ mlocal->postbuf_blocks += postbuf->ia_blocks;
+
+ correct_file_size(prebuf, mlocal->fctx, prev);
+ correct_file_size(postbuf, mlocal->fctx, prev);
+
+ if (mlocal->prebuf_size < prebuf->ia_size)
+ mlocal->prebuf_size = prebuf->ia_size;
+ if (mlocal->postbuf_size < postbuf->ia_size)
+ mlocal->postbuf_size = postbuf->ia_size;
+ }
+
+ /* return the first failure */
+ if (mlocal->op_ret == 0) {
+ mlocal->op_ret = op_ret;
+ mlocal->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
+ mlocal->pre_buf.ia_size = mlocal->prebuf_size;
+ mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
+ mlocal->post_buf.ia_size = mlocal->postbuf_size;
+ mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
+
+ STRIPE_STACK_UNWIND (zerofill, mframe, mlocal->op_ret,
+ mlocal->op_errno, &mlocal->pre_buf,
+ &mlocal->post_buf, NULL);
+ }
+out:
+ STRIPE_STACK_DESTROY(frame);
+ return 0;
+}
+
+int32_t
+stripe_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ stripe_local_t *local = NULL;
+ stripe_fd_ctx_t *fctx = NULL;
+ int32_t op_errno = 1;
+ int32_t idx = 0;
+ int32_t offset_offset = 0;
+ int32_t remaining_size = 0;
+ off_t fill_size = 0;
+ uint64_t stripe_size = 0;
+ uint64_t tmp_fctx = 0;
+ off_t dest_offset = 0;
+ call_frame_t *fframe = NULL;
+ stripe_local_t *flocal = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO (fd->inode, err);
+
+ inode_ctx_get (fd->inode, this, &tmp_fctx);
+ if (!tmp_fctx) {
+ op_errno = EINVAL;
+ goto err;
+ }
+ fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+ stripe_size = fctx->stripe_size;
+
+ STRIPE_VALIDATE_FCTX (fctx, err);
+
+ remaining_size = len;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+ stripe_size = fctx->stripe_size;
+
+ STRIPE_VALIDATE_FCTX (fctx, err);
+
+ remaining_size = len;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+ stripe_size = fctx->stripe_size;
+
+ STRIPE_VALIDATE_FCTX (fctx, err);
+
+ remaining_size = len;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ frame->local = local;
+ local->stripe_size = stripe_size;
+ local->fctx = fctx;
+
+ if (!stripe_size) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Wrong stripe size for the file");
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ while (1) {
+ fframe = copy_frame(frame);
+ flocal = mem_get0(this->local_pool);
+ if (!flocal) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ flocal->orig_frame = frame;
+ fframe->local = flocal;
+
+ idx = (((offset + offset_offset) /
+ local->stripe_size) % fctx->stripe_count);
+
+ fill_size = (local->stripe_size -
+ ((offset + offset_offset) % local->stripe_size));
+ if (fill_size > remaining_size)
+ fill_size = remaining_size;
+
+ remaining_size -= fill_size;
+
+ local->wind_count++;
+ if (remaining_size == 0)
+ local->unwind = 1;
+
+ dest_offset = offset + offset_offset;
+ if (fctx->stripe_coalesce)
+ dest_offset = coalesced_offset(dest_offset,
+ local->stripe_size,
+ fctx->stripe_count);
+
+ STACK_WIND(fframe, stripe_zerofill_cbk, fctx->xl_array[idx],
+ fctx->xl_array[idx]->fops->zerofill, fd,
+ dest_offset, fill_size, xdata);
+ offset_offset += fill_size;
+ if (remaining_size == 0)
+ break;
+ }
+
+ return 0;
+err:
+ if (fframe)
+ STRIPE_STACK_DESTROY(fframe);
+
+ STRIPE_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int32_t
stripe_release (xlator_t *this, fd_t *fd)
{
return 0;
@@ -3809,6 +4327,7 @@ notify (xlator_t *this, int32_t event, void *data, ...)
stripe_private_t *priv = NULL;
int down_client = 0;
int i = 0;
+ gf_boolean_t heard_from_all_children = _gf_false;
if (!this)
return 0;
@@ -3820,30 +4339,34 @@ notify (xlator_t *this, int32_t event, void *data, ...)
switch (event)
{
case GF_EVENT_CHILD_UP:
- case GF_EVENT_CHILD_CONNECTING:
{
/* get an index number to set */
for (i = 0; i < priv->child_count; i++) {
if (data == priv->xl_array[i])
break;
}
- priv->state[i] = 1;
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->state[i])
- down_client++;
+
+ if (priv->child_count == i) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "got GF_EVENT_CHILD_UP bad subvolume %s",
+ data? ((xlator_t *)data)->name: NULL);
+ break;
}
LOCK (&priv->lock);
{
- priv->nodes_down = down_client;
if (data == FIRST_CHILD (this))
priv->first_child_down = 0;
- if (!priv->nodes_down)
- default_notify (this, event, data);
+ priv->last_event[i] = event;
}
UNLOCK (&priv->lock);
}
break;
+ case GF_EVENT_CHILD_CONNECTING:
+ {
+ // 'CONNECTING' doesn't ensure its CHILD_UP, so do nothing
+ goto out;
+ }
case GF_EVENT_CHILD_DOWN:
{
/* get an index number to set */
@@ -3851,20 +4374,19 @@ notify (xlator_t *this, int32_t event, void *data, ...)
if (data == priv->xl_array[i])
break;
}
- priv->state[i] = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->state[i])
- down_client++;
+
+ if (priv->child_count == i) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "got GF_EVENT_CHILD_DOWN bad subvolume %s",
+ data? ((xlator_t *)data)->name: NULL);
+ break;
}
LOCK (&priv->lock);
{
- priv->nodes_down = down_client;
-
if (data == FIRST_CHILD (this))
priv->first_child_down = 1;
- if (priv->nodes_down)
- default_notify (this, event, data);
+ priv->last_event[i] = event;
}
UNLOCK (&priv->lock);
}
@@ -3874,10 +4396,30 @@ notify (xlator_t *this, int32_t event, void *data, ...)
{
/* */
default_notify (this, event, data);
+ goto out;
}
break;
}
+ // Consider child as down if it's last_event is not CHILD_UP
+ for (i = 0, down_client = 0; i < priv->child_count; i++)
+ if (priv->last_event[i] != GF_EVENT_CHILD_UP)
+ down_client++;
+
+ LOCK (&priv->lock);
+ {
+ priv->nodes_down = down_client;
+ }
+ UNLOCK (&priv->lock);
+
+ heard_from_all_children = _gf_true;
+ for (i = 0; i < priv->child_count; i++)
+ if (!priv->last_event[i])
+ heard_from_all_children = _gf_false;
+
+ if (heard_from_all_children)
+ default_notify (this, event, data);
+out:
return 0;
}
@@ -3923,6 +4465,37 @@ stripe_setxattr_cbk (call_frame_t *frame, void *cookie,
return 0;
}
+#ifdef HAVE_BD_XLATOR
+int
+stripe_is_bd (dict_t *this, char *key, data_t *value, void *data)
+{
+ gf_boolean_t *is_bd = data;
+
+ if (data == NULL)
+ return 0;
+
+ if (XATTR_IS_BD (key))
+ *is_bd = _gf_true;
+
+ return 0;
+}
+
+inline gf_boolean_t
+stripe_setxattr_is_bd (dict_t *dict)
+{
+ gf_boolean_t is_bd = _gf_false;
+
+ if (dict == NULL)
+ goto out;
+
+ dict_foreach (dict, stripe_is_bd, &is_bd);
+out:
+ return is_bd;
+}
+#else
+#define stripe_setxattr_is_bd(dict) _gf_false
+#endif
+
int
stripe_setxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *dict, int flags, dict_t *xdata)
@@ -3932,6 +4505,7 @@ stripe_setxattr (call_frame_t *frame, xlator_t *this,
stripe_private_t *priv = NULL;
stripe_local_t *local = NULL;
int i = 0;
+ gf_boolean_t is_bd = _gf_false;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
@@ -3954,11 +4528,15 @@ stripe_setxattr (call_frame_t *frame, xlator_t *this,
local->wind_count = priv->child_count;
local->op_ret = local->op_errno = 0;
+ is_bd = stripe_setxattr_is_bd (dict);
+
/**
* Set xattrs for directories on all subvolumes. Additionally
- * this power is only given to a special client.
+ * this power is only given to a special client. Bd xlator
+ * also needs xattrs for regular files (ie LVs)
*/
- if ((frame->root->pid == -1) && IA_ISDIR (loc->inode->ia_type)) {
+ if (((frame->root->pid == GF_CLIENT_PID_GSYNCD) &&
+ IA_ISDIR (loc->inode->ia_type)) || is_bd) {
for (i = 0; i < priv->child_count; i++, trav = trav->next) {
STACK_WIND (frame, stripe_setxattr_cbk,
trav->xlator, trav->xlator->fops->setxattr,
@@ -3989,21 +4567,21 @@ stripe_fsetxattr_cbk (call_frame_t *frame, void *cookie,
int
-stripe_is_lockinfo (dict_t *this,
- char *key,
- data_t *value,
- void *data)
+stripe_is_special_key (dict_t *this,
+ char *key,
+ data_t *value,
+ void *data)
{
- gf_boolean_t *is_lockinfo = NULL;
+ gf_boolean_t *is_special = NULL;
if (data == NULL) {
goto out;
}
- is_lockinfo = data;
+ is_special = data;
- if (XATTR_IS_LOCKINFO (key))
- *is_lockinfo = _gf_true;
+ if (XATTR_IS_LOCKINFO (key) || XATTR_IS_BD (key))
+ *is_special = _gf_true;
out:
return 0;
@@ -4080,7 +4658,7 @@ stripe_fsetxattr_is_special (dict_t *dict)
goto out;
}
- dict_foreach (dict, stripe_is_lockinfo, &is_spl);
+ dict_foreach (dict, stripe_is_special_key, &is_spl);
out:
return is_spl;
@@ -4598,9 +5176,9 @@ init (xlator_t *this)
if (!priv->xl_array)
goto out;
- priv->state = GF_CALLOC (count, sizeof (int8_t),
- gf_stripe_mt_int8_t);
- if (!priv->state)
+ priv->last_event = GF_CALLOC (count, sizeof (int),
+ gf_stripe_mt_int32_t);
+ if (!priv->last_event)
goto out;
priv->child_count = count;
@@ -4701,6 +5279,7 @@ fini (xlator_t *this)
trav = trav->next;
GF_FREE (prev);
}
+ GF_FREE (priv->last_event);
LOCK_DESTROY (&priv->lock);
GF_FREE (priv);
}
@@ -4841,10 +5420,10 @@ stripe_vgetxattr_cbk (call_frame_t *frame, void *cookie,
xattr->pos = cky;
xattr->xattr_value = gf_memdup (xattr_val,
- xattr->xattr_value,
xattr->xattr_len);
- local->xattr_total_len += xattr->xattr_len + 1;
+ if (xattr->xattr_value != NULL)
+ local->xattr_total_len += xattr->xattr_len + 1;
}
}
out:
@@ -4929,7 +5508,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
if (name && (strcmp (GF_XATTR_MARKER_KEY, name) == 0)
- && (-1 == frame->root->pid)) {
+ && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
local->marker.call_count = priv->child_count;
sub_volumes = alloca ( priv->child_count *
@@ -4944,7 +5523,8 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
if (cluster_getmarkerattr (frame, this, loc, name,
local, stripe_getxattr_unwind,
sub_volumes, priv->child_count,
- MARKER_UUID_TYPE, priv->vol_uuid)) {
+ MARKER_UUID_TYPE, marker_uuid_default_gauge,
+ priv->vol_uuid)) {
op_errno = EINVAL;
goto err;
}
@@ -5000,7 +5580,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
if (name &&(*priv->vol_uuid)) {
if ((match_uuid_local (name, priv->vol_uuid) == 0)
- && (-1 == frame->root->pid)) {
+ && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
if (!IA_FILE_OR_DIR (loc->inode->ia_type))
local->marker.call_count = 1;
@@ -5023,6 +5603,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
sub_volumes,
local->marker.call_count,
MARKER_XTIME_TYPE,
+ marker_xtime_default_gauge,
priv->vol_uuid)) {
op_errno = EINVAL;
goto err;
@@ -5196,6 +5777,9 @@ struct xlator_fops fops = {
.removexattr = stripe_removexattr,
.fremovexattr = stripe_fremovexattr,
.readdirp = stripe_readdirp,
+ .fallocate = stripe_fallocate,
+ .discard = stripe_discard,
+ .zerofill = stripe_zerofill,
};
struct xlator_cbks cbks = {
@@ -5221,10 +5805,10 @@ struct volume_options options[] = {
},
{ .key = {"coalesce"},
.type = GF_OPTION_TYPE_BOOL,
- .default_value = "false",
- .description = "Enable coalesce mode to flatten striped files as "
- "stored on the server (i.e., eliminate holes caused "
- "by the traditional format)."
+ .default_value = "true",
+ .description = "Enable/Disable coalesce mode to flatten striped "
+ "files as stored on the server (i.e., eliminate holes "
+ "caused by the traditional format)."
},
{ .key = {NULL} },
};
diff --git a/xlators/cluster/stripe/src/stripe.h b/xlators/cluster/stripe/src/stripe.h
index 53b683c73..5673d18f3 100644
--- a/xlators/cluster/stripe/src/stripe.h
+++ b/xlators/cluster/stripe/src/stripe.h
@@ -98,8 +98,8 @@ struct stripe_private {
gf_lock_t lock;
uint8_t nodes_down;
int8_t first_child_down;
+ int *last_event;
int8_t child_count;
- int8_t *state; /* Current state of child node */
gf_boolean_t xattr_supported; /* default yes */
gf_boolean_t coalesce;
char vol_uuid[UUID_SIZE + 1];