summary | refs | log | tree | commit | diff | stats
path: root/xlators/cluster/dht/src
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/dht/src')
-rw-r--r-- xlators/cluster/dht/src/Makefile.am 2
-rw-r--r-- xlators/cluster/dht/src/dht-common.c 312
-rw-r--r-- xlators/cluster/dht/src/dht-common.h 55
-rw-r--r-- xlators/cluster/dht/src/dht-diskusage.c 75
-rw-r--r-- xlators/cluster/dht/src/dht-hashfn.c 2
-rw-r--r-- xlators/cluster/dht/src/dht-helper.c 306
-rw-r--r-- xlators/cluster/dht/src/dht-inode-read.c 75
-rw-r--r-- xlators/cluster/dht/src/dht-inode-write.c 445
-rw-r--r-- xlators/cluster/dht/src/dht-layout.c 88
-rw-r--r-- xlators/cluster/dht/src/dht-linkfile.c 79
-rw-r--r-- xlators/cluster/dht/src/dht-rebalance.c 198
-rw-r--r-- xlators/cluster/dht/src/dht-rename.c 111
-rw-r--r-- xlators/cluster/dht/src/dht-selfheal.c 74
-rw-r--r-- xlators/cluster/dht/src/dht-shared.c 758
-rw-r--r-- xlators/cluster/dht/src/dht.c 717
-rw-r--r-- xlators/cluster/dht/src/nufa.c 361
-rw-r--r-- xlators/cluster/dht/src/switch.c 197
17 files changed, 2331 insertions, 1524 deletions
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
index 5324d2115..174bea841 100644
--- a/xlators/cluster/dht/src/Makefile.am
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -4,7 +4,7 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \
dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \
- dht-common.c dht-inode-write.c dht-inode-read.c \
+ dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \
$(top_builddir)/xlators/lib/src/libxlator.c
dht_la_SOURCES = $(dht_common_source) dht.c
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index f95bb84e6..8f61339e6 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -22,6 +22,7 @@
#include "dht-common.h"
#include "defaults.h"
#include "byte-order.h"
+#include "glusterfs-acl.h"
#include <sys/time.h>
#include <libgen.h>
@@ -62,6 +63,11 @@ dht_aggregate (dict_t *this, char *key, data_t *value, void *data)
}
*size = hton64 (ntoh64 (*size) + ntoh64 (*ptr));
+
+ } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) {
+ ret = gf_get_min_stime (THIS, dst, key, value);
+ if (ret < 0)
+ return ret;
} else {
/* compare user xattrs only */
if (!strncmp (key, "user.", strlen ("user."))) {
@@ -148,9 +154,11 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
int op_errno = 0;
int ret = -1;
dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
local = discover_frame->local;
layout = local->layout;
+ conf = this->private;
LOCK(&discover_frame->lock);
{
@@ -193,11 +201,14 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
"(overlaps/holes present: %s, "
"ENOENT errors: %d)", local->loc.path,
(ret < 0) ? "yes" : "no", (ret > 0) ? ret : 0);
- op_errno = EINVAL;
- goto out;
+ if ((ret > 0) && (ret == conf->subvolume_cnt)) {
+ op_errno = ESTALE;
+ goto out;
+ }
}
- dht_layout_set (this, local->inode, layout);
+ if (local->inode)
+ dht_layout_set (this, local->inode, layout);
}
DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno,
@@ -226,6 +237,7 @@ dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int is_dir = 0;
int is_linkfile = 0;
int attempt_unwind = 0;
+ dht_conf_t *conf = 0;
GF_VALIDATE_OR_GOTO ("dht", frame, out);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -235,6 +247,7 @@ dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
prev = cookie;
+ conf = this->private;
layout = local->layout;
@@ -269,7 +282,8 @@ dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto unlock;
}
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
is_dir = check_is_dir (inode, stbuf, xattr);
if (is_dir) {
@@ -328,23 +342,20 @@ dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc)
int i = 0;
call_frame_t *discover_frame = NULL;
-
conf = this->private;
local = frame->local;
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ ret = dict_set_uint32 (local->xattr_req, conf->xattr_name, 4 * 4);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set 'trusted.glusterfs.dht' key",
- loc->path);
+ "%s: failed to set '%s' key",
+ loc->path, conf->xattr_name);
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht.linkto", 256);
+ ret = dict_set_uint32 (local->xattr_req, conf->link_xattr_name, 256);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set 'trusted.glusterfs.dht.linkto' key",
- loc->path);
+ "%s: failed to set '%s' key",
+ loc->path, conf->link_xattr_name);
call_cnt = conf->subvolume_cnt;
local->call_cnt = call_cnt;
@@ -430,7 +441,7 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
op_ret, op_errno, xattr);
if (op_ret == -1) {
- local->op_errno = ENOENT;
+ local->op_errno = op_errno;
gf_log (this->name, GF_LOG_DEBUG,
"lookup of %s on %s returned error (%s)",
local->loc.path, prev->this->name,
@@ -585,7 +596,8 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
layout = local->layout;
is_dir = check_is_dir (inode, stbuf, xattr);
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
if (is_linkfile) {
gf_log (this->name, GF_LOG_INFO,
@@ -597,7 +609,7 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
if (is_dir) {
- ret = dht_dir_has_layout (xattr);
+ ret = dht_dir_has_layout (xattr, conf->xattr_name);
if (ret >= 0) {
if (is_greater_time(local->stbuf.ia_ctime,
local->stbuf.ia_ctime_nsec,
@@ -886,7 +898,7 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
hashed_subvol->name);
ret = dht_linkfile_create (frame,
- dht_lookup_linkfile_create_cbk,
+ dht_lookup_linkfile_create_cbk, this,
cached_subvol, hashed_subvol, &local->loc);
return ret;
@@ -924,8 +936,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
xlator_t *subvol = NULL;
loc_t *loc = NULL;
xlator_t *link_subvol = NULL;
- int ret = -1;
- int32_t fd_count = 0;
+ int ret = -1;
+ int32_t fd_count = 0;
+ dht_conf_t *conf = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, out);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -935,6 +948,7 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
loc = &local->loc;
+ conf = this->private;
prev = cookie;
subvol = prev->this;
@@ -956,7 +970,8 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
loc->path, prev->this->name);
}
- is_linkfile = check_is_linkfile (inode, buf, xattr);
+ is_linkfile = check_is_linkfile (inode, buf, xattr,
+ conf->link_xattr_name);
is_dir = check_is_dir (inode, buf, xattr);
if (is_linkfile) {
@@ -1117,7 +1132,7 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
goto err;
}
- if (check_is_linkfile (inode, stbuf, xattr)) {
+ if (check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) {
gf_log (this->name, GF_LOG_INFO,
"lookup of %s on %s (following linkfile) reached link",
local->loc.path, subvol->name);
@@ -1294,7 +1309,8 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
if (!is_linkfile) {
/* non-directory and not a linkfile */
@@ -1399,7 +1415,6 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
conf = this->private;
if (!conf)
@@ -1474,7 +1489,7 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
* revalidates directly go to the cached-subvolume.
*/
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (IA_ISDIR (local->inode->ia_type)) {
local->call_cnt = call_cnt = conf->subvolume_cnt;
@@ -1509,10 +1524,10 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
do_fresh_lookup:
/* TODO: remove the hard-coding */
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
ret = dict_set_uint32 (local->xattr_req,
- DHT_LINKFILE_KEY, 256);
+ conf->link_xattr_name, 256);
/* need it for self-healing linkfiles which is
'in-migration' state */
@@ -1620,7 +1635,8 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
LOCK (&frame->lock);
{
- if (op_ret == -1) {
+ if ((op_ret == -1) && !((op_errno == ENOENT) ||
+ (op_errno == ENOTCONN))) {
local->op_errno = op_errno;
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
@@ -1633,7 +1649,7 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
unlock:
UNLOCK (&frame->lock);
- if (op_ret == -1)
+ if (local->op_ret == -1)
goto err;
cached_subvol = dht_subvol_get_cached (this, local->loc.inode);
@@ -1657,41 +1673,6 @@ err:
return 0;
}
-static int
-dht_ufo_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
-
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_ret = -1;
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
- }
-unlock:
- UNLOCK (&frame->lock);
-
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- DHT_STACK_UNWIND (setxattr, frame, local->op_ret,
- local->op_errno, NULL);
- }
-
- return 0;
-}
-
-
int
dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
@@ -1807,6 +1788,7 @@ dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this,
}
(void) strcat (local->xattr_val, value);
+ (void) strcat (local->xattr_val, " ");
local->op_ret = 0;
}
@@ -1831,6 +1813,8 @@ dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this,
if (!*dict)
goto out;
+ local->xattr_val[strlen (local->xattr_val) - 1] = '\0';
+
/* we would need max this many bytes to create xattr string
* extra 40 bytes is just an estimated amount of additional
* space required as we include translator name and some
@@ -2009,10 +1993,13 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
int this_call_cnt = 0;
dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (frame->local, out);
+ VALIDATE_OR_GOTO (this->private, out);
+ conf = this->private;
local = frame->local;
this_call_cnt = dht_frame_return (frame);
@@ -2020,8 +2007,8 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (!xattr || (op_ret == -1))
goto out;
- if (dict_get (xattr, "trusted.glusterfs.dht")) {
- dict_del (xattr, "trusted.glusterfs.dht");
+ if (dict_get (xattr, conf->xattr_name)) {
+ dict_del (xattr, conf->xattr_name);
}
local->op_ret = 0;
@@ -2054,6 +2041,67 @@ dht_getxattr_unwind (call_frame_t *frame,
+/*
+ * Per-subvolume reply handler for the getxattr calls wound by
+ * dht_getxattr_get_real_filename().
+ *
+ * A successful reply (op_ret != -1) replaces whatever result is currently
+ * held in local->xattr / local->xattr_req and marks the aggregate result
+ * as successful.  A failed reply only records its errno while no
+ * subvolume has succeeded yet, so one failing subvolume cannot mask a
+ * success reported by another.
+ *
+ * The dispatcher presets local->op_ret to -1; the aggregate status must
+ * therefore be updated here, otherwise the final unwind would always
+ * report failure.
+ *
+ * When the last outstanding reply arrives, the aggregated result is
+ * unwound to the caller.  Always returns 0.
+ */
int
+dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie,
+                                    xlator_t *this, int op_ret, int op_errno,
+                                    dict_t *xattr, dict_t *xdata)
+{
+        int          this_call_cnt = 0;
+        dht_local_t *local         = NULL;
+
+        local = frame->local;
+
+        /* Replies from different subvolumes update the same local;
+         * serialise on the frame lock like the other callbacks in this
+         * file do. */
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        if (local->xattr)
+                                dict_unref (local->xattr);
+                        local->xattr = (xattr) ? dict_ref (xattr) : NULL;
+
+                        if (local->xattr_req)
+                                dict_unref (local->xattr_req);
+                        local->xattr_req = (xdata) ? dict_ref (xdata) : NULL;
+
+                        /* record the success for the final unwind */
+                        local->op_ret   = op_ret;
+                        local->op_errno = 0;
+                } else if (local->op_ret == -1) {
+                        /* nothing has succeeded so far: keep the most
+                         * recent error for the caller */
+                        local->op_errno = op_errno;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        this_call_cnt = dht_frame_return (frame);
+        if (is_last_call (this_call_cnt)) {
+                DHT_STACK_UNWIND (getxattr, frame, local->op_ret,
+                                  local->op_errno, local->xattr,
+                                  local->xattr_req);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Fan a getxattr request for `key` out to every subvolume listed in the
+ * layout stored in frame->local.  Replies are collected by
+ * dht_getxattr_get_real_filename_cbk().
+ *
+ * Always returns 0; the outcome of the request is delivered through the
+ * callback.
+ *
+ * NOTE(review): assumes frame->local and local->layout are non-NULL and
+ * that layout->cnt > 0.  With cnt == 0 nothing is wound here and this
+ * function does not unwind the frame itself -- confirm callers
+ * guarantee a populated layout.
+ */
+int
+dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *key, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int i = 0;
+ dht_layout_t *layout = NULL;
+ int cnt = 0;
+ xlator_t *subvol = NULL;
+
+
+ local = frame->local;
+ layout = local->layout;
+
+ /* one reply is expected per layout entry */
+ cnt = local->call_cnt = layout->cnt;
+
+ /* start from "failed, ENODATA" before any reply has arrived */
+ local->op_ret = -1;
+ local->op_errno = ENODATA;
+
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND (frame, dht_getxattr_get_real_filename_cbk,
+ subvol, subvol->fops->getxattr,
+ loc, key, xdata);
+ }
+
+ return 0;
+}
+
+
+int
dht_getxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *key, dict_t *xdata)
#define DHT_IS_DIR(layout) (layout->cnt > 1)
@@ -2074,7 +2122,6 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -2102,6 +2149,14 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
}
}
+ if (key &&
+ (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY,
+ strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)
+ && DHT_IS_DIR(layout)) {
+ dht_getxattr_get_real_filename (frame, this, loc, key, xdata);
+ return 0;
+ }
+
/* for file use cached subvolume (obviously!): see if {}
* below
* for directory:
@@ -2111,8 +2166,9 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
* NOTE: Don't trust inode here, as that may not be valid
* (until inode_link() happens)
*/
- if (key && (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)
- && DHT_IS_DIR(layout)) {
+ if (key && DHT_IS_DIR(layout) &&
+ ((strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)
+ || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) {
(void) strncpy (local->xsel, key, 256);
cnt = local->call_cnt = layout->cnt;
for (i = 0; i < cnt; i++) {
@@ -2183,7 +2239,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
if (cluster_getmarkerattr (frame, this, loc, key,
local, dht_getxattr_unwind,
sub_volumes, cnt,
- MARKER_UUID_TYPE, conf->vol_uuid)) {
+ MARKER_UUID_TYPE, marker_uuid_default_gauge,
+ conf->vol_uuid)) {
op_errno = EINVAL;
goto err;
}
@@ -2207,6 +2264,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
local, dht_getxattr_unwind,
sub_volumes, cnt,
MARKER_XTIME_TYPE,
+ marker_xtime_default_gauge,
conf->vol_uuid)) {
op_errno = EINVAL;
goto err;
@@ -2308,13 +2366,17 @@ dht_fsetxattr (call_frame_t *frame, xlator_t *this,
xlator_t *subvol = NULL;
dht_local_t *local = NULL;
int op_errno = EINVAL;
+ dht_conf_t *conf = NULL;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
VALIDATE_OR_GOTO (fd->inode, err);
+ VALIDATE_OR_GOTO (this->private, err);
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.dht*", xattr,
+ conf = this->private;
+
+ GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
op_errno, err);
local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR);
@@ -2418,12 +2480,12 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.dht*", xattr,
+ conf = this->private;
+
+ GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
op_errno, err);
- conf = this->private;
local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR);
if (!local) {
op_errno = ENOMEM;
@@ -2448,25 +2510,6 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
local->call_cnt = call_cnt = layout->cnt;
- /* This key is sent by Unified File and Object storage
- * to test xattr support in backend.
- */
- tmp = dict_get (xattr, "user.ufo-test");
- if (tmp) {
- if (IA_ISREG (loc->inode->ia_type)) {
- op_errno = ENOTSUP;
- goto err;
- }
- local->op_ret = 0;
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_ufo_xattr_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->setxattr,
- loc, xattr, flags, NULL);
- }
- return 0;
- }
-
tmp = dict_get (xattr, "distribute.migrate-data");
if (tmp) {
if (IA_ISDIR (loc->inode->ia_type)) {
@@ -2636,18 +2679,20 @@ dht_removexattr (call_frame_t *frame, xlator_t *this,
dht_local_t *local = NULL;
dht_layout_t *layout = NULL;
int call_cnt = 0;
+ dht_conf_t *conf = NULL;
int i;
VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (this->private, err);
- GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.dht*",
- key, op_errno, err);
+ conf = this->private;
+
+ GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err);
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_REMOVEXATTR);
if (!local) {
@@ -2699,13 +2744,16 @@ dht_fremovexattr (call_frame_t *frame, xlator_t *this,
dht_local_t *local = NULL;
dht_layout_t *layout = NULL;
int call_cnt = 0;
+ dht_conf_t *conf = 0;
int i;
VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (this->private, err);
- GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.dht*",
- key, op_errno, err);
+ conf = this->private;
+
+ GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err);
VALIDATE_OR_GOTO (frame, err);
@@ -2876,7 +2924,6 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -2996,10 +3043,13 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
list_for_each_entry (orig_entry, (&orig_entries->list), list) {
next_offset = orig_entry->d_off;
- if ((check_is_dir (NULL, (&orig_entry->d_stat), NULL) &&
- (prev->this != dht_first_up_subvol (this))) ||
- check_is_linkfile (NULL, (&orig_entry->d_stat),
- orig_entry->dict)) {
+ if (check_is_dir (NULL, (&orig_entry->d_stat), NULL) &&
+ (prev->this != local->first_up_subvol)) {
+ continue;
+ }
+ if (check_is_linkfile (NULL, (&orig_entry->d_stat),
+ orig_entry->dict,
+ conf->link_xattr_name)) {
continue;
}
@@ -3076,13 +3126,16 @@ done:
}
if (conf->readdir_optimize == _gf_true) {
- if (next_subvol != dht_first_up_subvol (this)) {
+ if (next_subvol != local->first_up_subvol) {
ret = dict_set_int32 (local->xattr,
GF_READDIR_SKIP_DIRS, 1);
if (ret)
gf_log (this->name, GF_LOG_ERROR,
"dict set failed");
- }
+ } else {
+ dict_del (local->xattr,
+ GF_READDIR_SKIP_DIRS);
+ }
}
STACK_WIND (frame, dht_readdirp_cbk,
@@ -3215,6 +3268,7 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -3227,6 +3281,7 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
local->fd = fd_ref (fd);
local->size = size;
local->xattr_req = (dict)? dict_ref (dict) : NULL;
+ local->first_up_subvol = dht_first_up_subvol (this);
dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff);
@@ -3239,20 +3294,22 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
if (local->xattr) {
ret = dict_set_uint32 (local->xattr,
- "trusted.glusterfs.dht.linkto",
- 256);
+ conf->link_xattr_name, 256);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
- "failed to set 'glusterfs.dht.linkto'"
- " key");
+ "failed to set '%s' key",
+ conf->link_xattr_name);
if (conf->readdir_optimize == _gf_true) {
- if (xvol != dht_first_up_subvol (this)) {
+ if (xvol != local->first_up_subvol) {
ret = dict_set_int32 (local->xattr,
GF_READDIR_SKIP_DIRS, 1);
if (ret)
gf_log (this->name,
GF_LOG_ERROR,
"Dict set failed");
+ } else {
+ dict_del (local->xattr,
+ GF_READDIR_SKIP_DIRS);
}
}
}
@@ -3510,7 +3567,9 @@ dht_mknod (call_frame_t *frame, xlator_t *this,
subvol, subvol->fops->mknod, loc, mode,
rdev, umask, params);
} else {
- avail_subvol = dht_free_disk_available_subvol (this, subvol);
+
+ avail_subvol = dht_free_disk_available_subvol (this, subvol,
+ local);
if (avail_subvol != subvol) {
/* Choose the minimum filled volume, and create the
files there */
@@ -3522,7 +3581,7 @@ dht_mknod (call_frame_t *frame, xlator_t *this,
local->umask = umask;
dht_linkfile_create (frame,
dht_mknod_linkfile_create_cbk,
- avail_subvol, subvol, loc);
+ this, avail_subvol, subvol, loc);
} else {
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s", loc->path, subvol->name);
@@ -3781,7 +3840,7 @@ dht_link (call_frame_t *frame, xlator_t *this,
if (hashed_subvol != cached_subvol) {
uuid_copy (local->gfid, oldloc->inode->gfid);
- dht_linkfile_create (frame, dht_link_linkfile_cbk,
+ dht_linkfile_create (frame, dht_link_linkfile_cbk, this,
cached_subvol, hashed_subvol, newloc);
} else {
STACK_WIND (frame, dht_link_cbk,
@@ -3931,7 +3990,7 @@ dht_create (call_frame_t *frame, xlator_t *this,
}
/* Choose the minimum filled volume, and create the
files there */
- avail_subvol = dht_free_disk_available_subvol (this, subvol);
+ avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
if (avail_subvol != subvol) {
local->params = dict_ref (params);
local->flags = flags;
@@ -3942,9 +4001,8 @@ dht_create (call_frame_t *frame, xlator_t *this,
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s (link at %s)", loc->path,
avail_subvol->name, subvol->name);
- dht_linkfile_create (frame,
- dht_create_linkfile_create_cbk,
- avail_subvol, subvol, loc);
+ dht_linkfile_create (frame, dht_create_linkfile_create_cbk,
+ this, avail_subvol, subvol, loc);
goto done;
}
gf_log (this->name, GF_LOG_TRACE,
@@ -4483,6 +4541,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_frame_t *main_frame = NULL;
dht_local_t *main_local = NULL;
int this_call_cnt = 0;
+ dht_conf_t *conf = this->private;
local = frame->local;
prev = cookie;
@@ -4494,7 +4553,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret != 0)
goto err;
- if (check_is_linkfile (inode, stbuf, xattr) == 0) {
+ if (!check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) {
main_local->op_ret = -1;
main_local->op_errno = ENOTEMPTY;
@@ -4529,6 +4588,7 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
dht_local_t *lookup_local = NULL;
dht_local_t *local = NULL;
dict_t *xattrs = NULL;
+ dht_conf_t *conf = this->private;
local = frame->local;
@@ -4537,7 +4597,8 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
continue;
if (strcmp (trav->d_name, "..") == 0)
continue;
- if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict)) {
+ if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict,
+ conf->link_xattr_name)) {
ret++;
continue;
}
@@ -4555,7 +4616,7 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
return -1;
}
- ret = dict_set_uint32 (xattrs, DHT_LINKFILE_KEY, 256);
+ ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "failed to set linkto key"
" in dict");
@@ -4678,6 +4739,7 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_frame_t *prev = NULL;
dict_t *dict = NULL;
int ret = 0;
+ dht_conf_t *conf = this->private;
local = frame->local;
prev = cookie;
@@ -4701,12 +4763,11 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto err;
}
- ret = dict_set_uint32 (dict,
- "trusted.glusterfs.dht.linkto", 256);
+ ret = dict_set_uint32 (dict, conf->link_xattr_name, 256);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set 'trusted.glusterfs.dht.linkto' key",
- local->loc.path);
+ "%s: failed to set '%s' key",
+ local->loc.path, conf->link_xattr_name);
STACK_WIND (frame, dht_rmdir_readdirp_cbk,
prev->this, prev->this->fops->readdirp,
@@ -4805,7 +4866,6 @@ dht_entrylk (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK);
if (!local) {
@@ -5113,8 +5173,8 @@ unlock:
* not need to handle CHILD_DOWN event here.
*/
if (conf->defrag) {
- ret = pthread_create (&conf->defrag->th, NULL,
- gf_defrag_start, this);
+ ret = gf_thread_create (&conf->defrag->th, NULL,
+ gf_defrag_start, this);
if (ret) {
conf->defrag = NULL;
GF_FREE (conf->defrag);
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 65983c0c4..5ccd66799 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -183,6 +183,7 @@ struct dht_local {
xlator_t *link_subvol;
struct dht_rebalance_ rebalance;
+ xlator_t *first_up_subvol;
};
typedef struct dht_local dht_local_t;
@@ -211,6 +212,10 @@ enum gf_defrag_status_t {
GF_DEFRAG_STATUS_STOPPED,
GF_DEFRAG_STATUS_COMPLETE,
GF_DEFRAG_STATUS_FAILED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED,
};
typedef enum gf_defrag_status_t gf_defrag_status_t;
@@ -227,6 +232,7 @@ struct gf_defrag_info_ {
uint64_t total_data;
uint64_t num_files_lookedup;
uint64_t total_failures;
+ uint64_t skipped;
gf_lock_t lock;
int cmd;
pthread_t th;
@@ -290,6 +296,11 @@ struct dht_conf {
gf_boolean_t rsync_regex_valid;
regex_t extra_regex;
gf_boolean_t extra_regex_valid;
+
+ /* Support variable xattr names. */
+ char *xattr_name;
+ char *link_xattr_name;
+ char *wild_xattr_name;
};
typedef struct dht_conf dht_conf_t;
@@ -320,13 +331,12 @@ typedef enum {
#define DHT_MIGRATION_IN_PROGRESS 1
#define DHT_MIGRATION_COMPLETED 2
-#define DHT_LINKFILE_KEY "trusted.glusterfs.dht.linkto"
#define DHT_LINKFILE_MODE (S_ISVTX)
-#define check_is_linkfile(i,s,x) ( \
+#define check_is_linkfile(i,s,x,n) ( \
((st_mode_from_ia ((s)->ia_prot, (s)->ia_type) & ~S_IFMT) \
- == DHT_LINKFILE_MODE) && \
- dict_get (x, DHT_LINKFILE_KEY))
+ == DHT_LINKFILE_MODE) && \
+ dict_get (x, n))
#define IS_DHT_MIGRATION_PHASE2(buf) ( \
IA_ISREG ((buf)->ia_type) && \
@@ -433,12 +443,14 @@ int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt
xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc);
xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode);
xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev);
+xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev);
int dht_subvol_cnt (xlator_t *this, xlator_t *subvol);
int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p);
int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
- xlator_t *tovol, xlator_t *fromvol, loc_t *loc);
+ xlator_t *this, xlator_t *tovol,
+ xlator_t *fromvol, loc_t *loc);
int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc);
int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc);
int
@@ -456,7 +468,8 @@ dht_layout_sort_volname (dht_layout_t *layout);
int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc);
gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol);
-xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol);
+xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
+ dht_local_t *layout);
int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx);
int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode);
@@ -678,7 +691,16 @@ int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt *stbuf, int32_t valid, dict_t *xdata);
int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt *stbuf, int32_t valid, dict_t *xdata);
-
+int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t mode, off_t offset, size_t len, dict_t *xdata);
+int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, size_t len, dict_t *xdata);
+int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, size_t len, dict_t *xdata);
+
+int32_t dht_init (xlator_t *this);
+void dht_fini (xlator_t *this);
+int dht_reconfigure (xlator_t *this, dict_t *options);
int32_t dht_notify (xlator_t *this, int32_t event, void *data, ...);
/* definitions for nufa/switch */
@@ -740,13 +762,26 @@ dht_dir_attr_heal (void *data);
int
dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data);
int
-dht_dir_has_layout (dict_t *xattr);
+dht_dir_has_layout (dict_t *xattr, char *name);
gf_boolean_t
dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator);
xlator_t *
-dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol);
+dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout);
xlator_t *
-dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol);
+dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout);
int
dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this);
+
+void
+dht_layout_dump (dht_layout_t *layout, const char *prefix);
+int32_t
+dht_priv_dump (xlator_t *this);
+int32_t
+dht_inodectx_dump (xlator_t *this, inode_t *inode);
+
+int
+dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol);
+
#endif/* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 0c87f4a64..fe3955ecb 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -251,25 +251,45 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)
/*Get the best subvolume to create the file in*/
xlator_t *
-dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol)
+dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
+ dht_local_t *local)
{
xlator_t *avail_subvol = NULL;
dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ loc_t *loc = NULL;
conf = this->private;
+ if (!local)
+ goto out;
+ loc = &local->loc;
+ if (!local->layout) {
+ layout = dht_layout_get (this, loc->parent);
+
+ if (!layout) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "layout missing path=%s parent=%s",
+ loc->path, uuid_utoa (loc->parent->gfid));
+ goto out;
+ }
+ } else {
+ layout = dht_layout_ref (this, local->layout);
+ }
- LOCK (&conf->subvolume_lock);
+ LOCK (&conf->subvolume_lock);
{
- avail_subvol = dht_subvol_with_free_space_inodes(this, subvol);
+ avail_subvol = dht_subvol_with_free_space_inodes(this, subvol,
+ layout);
if(!avail_subvol)
{
avail_subvol = dht_subvol_maxspace_nonzeroinode(this,
- subvol);
+ subvol,
+ layout);
}
}
UNLOCK (&conf->subvolume_lock);
-
+out:
if (!avail_subvol) {
gf_log (this->name,
GF_LOG_DEBUG,
@@ -278,17 +298,42 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol)
avail_subvol = subvol;
}
-
+ if (layout)
+ dht_layout_unref (this, layout);
return avail_subvol;
}
+/*
+ * Report whether a subvolume is marked with an error in a layout.
+ *
+ * `this` is the subvolume being examined (the callers in this file pass
+ * conf->subvolumes[i]), not the dht translator itself.  Layout entries
+ * are matched against it by xlator name.
+ *
+ * Returns 0 when no matching layout entry has a non-zero `err`.
+ * Returns -1 when a matching entry has a non-zero `err`, or when either
+ * argument is NULL.  The callers in this file treat any non-zero result
+ * as "skip this subvolume".
+ */
+static inline
+int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout)
+{
+ int ret = -1;
+ int i = 0;
+
+ if (!this || !layout)
+ goto out;
+
+ /* check if subvol has layout errors, before selecting it */
+ for (i = 0; i < layout->cnt; i++) {
+ if (!strcmp (layout->list[i].xlator->name, this->name) &&
+ (layout->list[i].err != 0)) {
+ ret = -1;
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
/*Get subvolume which has both space and inodes more than the min criteria*/
xlator_t *
-dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol)
+dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout)
{
int i = 0;
double max = 0;
double max_inodes = 0;
+ int ignore_subvol = 0;
xlator_t *avail_subvol = NULL;
dht_conf_t *conf = NULL;
@@ -296,6 +341,12 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol)
conf = this->private;
for(i=0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors, before selecting it */
+ ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
+ layout);
+ if (ignore_subvol)
+ continue;
+
if ((conf->disk_unit == 'p') &&
(conf->du_stats[i].avail_percent > conf->min_free_disk) &&
(conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
@@ -325,10 +376,12 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol)
/* Get subvol which has atleast one inode and maximum space */
xlator_t *
-dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol)
+dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout)
{
int i = 0;
double max = 0;
+ int ignore_subvol = 0;
xlator_t *avail_subvol = NULL;
dht_conf_t *conf = NULL;
@@ -336,6 +389,12 @@ dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol)
conf = this->private;
for (i = 0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors, before selecting it */
+ ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
+ layout);
+ if (ignore_subvol)
+ continue;
+
if (conf->disk_unit == 'p') {
if ((conf->du_stats[i].avail_percent > max)
&& (conf->du_stats[i].avail_inodes > 0 )) {
diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c
index 519dbfbb2..656cf23a0 100644
--- a/xlators/cluster/dht/src/dht-hashfn.c
+++ b/xlators/cluster/dht/src/dht-hashfn.c
@@ -94,7 +94,7 @@ dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p)
if (!munged && priv->rsync_regex_valid) {
len = strlen(name) + 1;
rsync_friendly_name = alloca(len);
- gf_log (this->name, GF_LOG_DEBUG, "trying regex for %s", name);
+ gf_log (this->name, GF_LOG_TRACE, "trying regex for %s", name);
munged = dht_munge_name (name, rsync_friendly_name, len,
&priv->rsync_regex);
if (munged) {
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index ecd06e394..311a48112 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -18,6 +18,28 @@
#include "xlator.h"
#include "dht-common.h"
+/* Store @subvol in @inode's context for xlator @this through
+ * inode_ctx_set1(). The pointer is kept as a 64-bit integer; the result of
+ * inode_ctx_set1() is returned unchanged.
+ */
+static inline int
+dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol)
+{
+        uint64_t value = (long) subvol;
+
+        return inode_ctx_set1 (inode, this, &value);
+}
+
+/* Fetch the subvolume stored by dht_inode_ctx_set1() for @inode.
+ *
+ * @subvol may be NULL. *subvol is written only when a non-zero value is
+ * found, so callers are expected to initialise it beforehand.
+ * Returns whatever inode_ctx_get1() returned.
+ */
+int
+dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol)
+{
+        int      ret        = -1;
+        uint64_t tmp_subvol = 0;
+
+        ret = inode_ctx_get1 (inode, this, &tmp_subvol);
+
+        /* convert back through 'long', mirroring the (long) cast used when
+         * the pointer was stored, instead of casting a 64-bit integer
+         * straight to a pointer */
+        if (tmp_subvol && subvol)
+                *subvol = (xlator_t *)(long)tmp_subvol;
+
+        return ret;
+}
+
int
dht_frame_return (call_frame_t *frame)
@@ -40,6 +62,43 @@ dht_frame_return (call_frame_t *frame)
}
+/* Count how many times 1 must be doubled to reach or exceed @num, i.e. the
+ * number of bits needed to tell @num distinct values apart (0 for num <= 1).
+ */
+static uint64_t
+dht_bits_for (uint64_t num)
+{
+        uint64_t width = 0;
+        uint64_t limit = 0;
+
+        for (limit = 1; limit < num; limit <<= 1)
+                width++;
+
+        return width;
+}
+
+/*
+ * A slightly "updated" version of the algorithm described in the commit log
+ * is used here.
+ *
+ * The only enhancement is that:
+ *
+ * - The number of bits used by the backend filesystem for HUGE d_off which
+ * is described as 63, and
+ * - The number of bits used by the d_off presented by the transformation
+ * upwards which is described as 64, are both made "configurable."
+ */
+
+
+/* Number of bits assumed for a d_off coming from the backend filesystem. */
+#define BACKEND_D_OFF_BITS 63
+/* Number of bits of the d_off presented upwards after transformation. */
+#define PRESENT_D_OFF_BITS 63
+
+#define ONE 1ULL
+#define MASK (~0ULL)
+/* Masks with the low PRESENT_D_OFF_BITS / BACKEND_D_OFF_BITS bits set. */
+#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS))
+#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS))
+
+/* Highest presented bit: dht_itransform() sets it on a HUGE d_off and
+ * dht_deitransform() tests it to choose the decoding path. */
+#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1))
+/* Right shift applied to a HUGE d_off by dht_itransform() and undone with a
+ * left shift by dht_deitransform(); evaluates to 1 with the widths above. */
+#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1)))
+
int
dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
{
@@ -47,6 +106,9 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
int cnt = 0;
int max = 0;
uint64_t y = 0;
+ uint64_t hi_mask = 0;
+ uint64_t off_mask = 0;
+ int max_bits = 0;
if (x == ((uint64_t) -1)) {
y = (uint64_t) -1;
@@ -60,7 +122,23 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
max = conf->subvolume_cnt;
cnt = dht_subvol_cnt (this, subvol);
- y = ((x * max) + cnt);
+ if (max == 1) {
+ y = x;
+ goto out;
+ }
+
+ max_bits = dht_bits_for (max);
+
+ hi_mask = ~(PRESENT_MASK >> (max_bits + 1));
+
+ if (x & hi_mask) {
+ /* HUGE d_off */
+ off_mask = MASK << max_bits;
+ y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt;
+ } else {
+ /* small d_off */
+ y = ((x * max) + cnt);
+ }
out:
if (y_p)
@@ -135,16 +213,38 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
int max = 0;
uint64_t x = 0;
xlator_t *subvol = 0;
+ int max_bits = 0;
+ uint64_t off_mask = 0;
+ uint64_t host_mask = 0;
if (!this->private)
- goto out;
+ return -1;
conf = this->private;
max = conf->subvolume_cnt;
- cnt = y % max;
- x = y / max;
+ if (max == 1) {
+ x = y;
+ cnt = 0;
+ goto out;
+ }
+
+ if (y & TOP_BIT) {
+ /* HUGE d_off */
+ max_bits = dht_bits_for (max);
+ off_mask = (MASK << max_bits);
+ host_mask = ~(off_mask);
+
+ x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS;
+
+ cnt = y & host_mask;
+ } else {
+ /* small d_off */
+ cnt = y % max;
+ x = y / max;
+ }
+out:
subvol = conf->subvolumes[cnt];
if (subvol_p)
@@ -153,7 +253,6 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
if (x_p)
*x_p = x;
-out:
return 0;
}
@@ -263,20 +362,6 @@ out:
return local;
}
-
-char *
-basestr (const char *str)
-{
- char *basestr = NULL;
-
- basestr = strrchr (str, '/');
- if (basestr)
- basestr ++;
-
- return basestr;
-}
-
-
xlator_t *
dht_first_up_subvol (xlator_t *this)
{
@@ -428,7 +513,36 @@ out:
return next;
}
+/* Return the subvolume that follows @prev in conf->subvolumes, wrapping
+ * around to the first entry when @prev is the last one. NULL is returned
+ * when this->private is not set or @prev is not found in the list.
+ */
+xlator_t *
+dht_subvol_next_available (xlator_t *this, xlator_t *prev)
+{
+        dht_conf_t *conf = this->private;
+        int         idx  = 0;
+
+        if (!conf)
+                return NULL;
+
+        for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+                if (conf->subvolumes[idx] != prev)
+                        continue;
+
+                /* the successor of the last slot is slot 0 */
+                return conf->subvolumes[(idx + 1) % conf->subvolume_cnt];
+        }
+
+        return NULL;
+}
int
dht_subvol_cnt (xlator_t *this, xlator_t *subvol)
{
@@ -620,23 +734,36 @@ dht_migration_complete_check_task (void *data)
call_frame_t *frame = NULL;
loc_t tmp_loc = {0,};
char *path = NULL;
+ dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ uint64_t tmp_subvol = 0;
+ int open_failed = 0;
this = THIS;
frame = data;
local = frame->local;
+ conf = this->private;
src_node = local->cached_subvol;
if (!local->loc.inode && !local->fd)
goto out;
- /* getxattr on cached_subvol for 'linkto' value */
- if (!local->loc.inode)
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+ /* getxattr on cached_subvol for the 'linkto' value. Do the path-based
+ * getxattr as root:root; if an fd is already open, the access check
+ * won't be done. */
+
+ if (!local->loc.inode) {
ret = syncop_fgetxattr (src_node, local->fd, &dict,
- DHT_LINKFILE_KEY);
- else
+ conf->link_xattr_name);
+ } else {
+ SYNCTASK_SETID (0, 0);
ret = syncop_getxattr (src_node, &local->loc, &dict,
- DHT_LINKFILE_KEY);
+ conf->link_xattr_name);
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+ }
if (!ret)
dst_node = dht_linkfile_subvol (this, NULL, NULL, dict);
@@ -687,10 +814,7 @@ dht_migration_complete_check_task (void *data)
/* update inode ctx (the layout) */
dht_layout_unref (this, local->layout);
- if (!local->loc.inode)
- ret = dht_layout_preset (this, dst_node, local->fd->inode);
- else
- ret = dht_layout_preset (this, dst_node, local->loc.inode);
+ ret = dht_layout_preset (this, dst_node, inode);
if (ret != 0) {
gf_log (this->name, GF_LOG_DEBUG,
"%s: could not set preset layout for subvol %s",
@@ -708,10 +832,7 @@ dht_migration_complete_check_task (void *data)
goto out;
}
- if (!local->loc.inode)
- ret = dht_layout_set (this, local->fd->inode, layout);
- else
- ret = dht_layout_set (this, local->loc.inode, layout);
+ ret = dht_layout_set (this, inode, layout);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set the new layout",
@@ -722,42 +843,46 @@ dht_migration_complete_check_task (void *data)
local->cached_subvol = dst_node;
ret = 0;
- /* once we detect the migration complete, the fd-ctx is no more
- required.. delete the ctx, and do one extra 'fd_unref' for open fd */
- ret = fd_ctx_del (local->fd, this, NULL);
- if (!ret) {
- fd_unref (local->fd);
- ret = 0;
+ /* Once migration is detected as complete, the inode-ctx value is no
+ longer required, so delete it. If a value was present, open() has
+ already been done on all the fds of the inode. */
+ ret = inode_ctx_reset1 (inode, this, &tmp_subvol);
+ if (tmp_subvol)
+ goto out;
+
+ if (list_empty (&inode->fd_list))
goto out;
- }
/* perform open as root:root. There is window between linkfile
* creation(root:root) and setattr with the correct uid/gid
*/
SYNCTASK_SETID(0, 0);
- /* if 'local->fd' (ie, fd based operation), send a 'open()' on
- destination if not already done */
- if (local->loc.inode) {
- ret = syncop_open (dst_node, &local->loc,
- local->fd->flags, local->fd);
- } else {
- tmp_loc.inode = local->fd->inode;
- inode_path (local->fd->inode, NULL, &path);
- if (path)
- tmp_loc.path = path;
- ret = syncop_open (dst_node, &tmp_loc,
- local->fd->flags, local->fd);
- GF_FREE (path);
+ /* perform 'open()' on all the fd's present on the inode */
+ tmp_loc.inode = inode;
+ inode_path (inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ if (fd_is_anonymous (iter_fd))
+ continue;
+
+ ret = syncop_open (dst_node, &tmp_loc,
+ iter_fd->flags, iter_fd);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to open "
+ "the fd (%p, flags=0%o) on file %s @ %s",
+ iter_fd, iter_fd->flags, path, dst_node->name);
+ open_failed = 1;
+ }
}
+ GF_FREE (path);
+
SYNCTASK_SETID (frame->root->uid, frame->root->gid);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to send open() on target file at %s",
- local->loc.path, dst_node->name);
+ if (open_failed) {
+ ret = -1;
goto out;
}
-
ret = 0;
out:
@@ -801,23 +926,34 @@ dht_rebalance_inprogress_task (void *data)
char *path = NULL;
struct iatt stbuf = {0,};
loc_t tmp_loc = {0,};
+ dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ int open_failed = 0;
this = THIS;
frame = data;
local = frame->local;
+ conf = this->private;
src_node = local->cached_subvol;
if (!local->loc.inode && !local->fd)
goto out;
- /* getxattr on cached_subvol for 'linkto' value */
- if (local->loc.inode)
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+ /* getxattr on cached_subvol for the 'linkto' value. Do the path-based
+ * getxattr as root:root; if an fd is already open, the access check
+ * won't be done. */
+ if (local->loc.inode) {
+ SYNCTASK_SETID (0, 0);
ret = syncop_getxattr (src_node, &local->loc, &dict,
- DHT_LINKFILE_KEY);
- else
+ conf->link_xattr_name);
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+ } else {
ret = syncop_fgetxattr (src_node, local->fd, &dict,
- DHT_LINKFILE_KEY);
+ conf->link_xattr_name);
+ }
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -858,35 +994,47 @@ dht_rebalance_inprogress_task (void *data)
}
ret = 0;
+
+ if (list_empty (&inode->fd_list))
+ goto done;
+
/* perform open as root:root. There is window between linkfile
* creation(root:root) and setattr with the correct uid/gid
*/
SYNCTASK_SETID (0, 0);
- if (local->loc.inode) {
- ret = syncop_open (dst_node, &local->loc,
- local->fd->flags, local->fd);
- } else {
- tmp_loc.inode = local->fd->inode;
- inode_path (local->fd->inode, NULL, &path);
- if (path)
- tmp_loc.path = path;
+
+ tmp_loc.inode = inode;
+ inode_path (inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ if (fd_is_anonymous (iter_fd))
+ continue;
+
ret = syncop_open (dst_node, &tmp_loc,
- local->fd->flags, local->fd);
- GF_FREE (path);
+ iter_fd->flags, iter_fd);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to send open "
+ "the fd (%p, flags=0%o) on file %s @ %s",
+ iter_fd, iter_fd->flags, path, dst_node->name);
+ open_failed = 1;
+ }
}
+ GF_FREE (path);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to send open() on target file at %s",
- local->loc.path, dst_node->name);
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+
+ if (open_failed) {
+ ret = -1;
goto out;
}
- SYNCTASK_SETID (frame->root->uid, frame->root->gid);
- ret = fd_ctx_set (local->fd, this, (uint64_t)(long)dst_node);
+done:
+ ret = dht_inode_ctx_set1 (this, inode, dst_node);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to set fd-ctx target file at %s",
+ "%s: failed to set inode-ctx target file at %s",
local->loc.path, dst_node->name);
goto out;
}
diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c
index f17cb73b9..ece84151a 100644
--- a/xlators/cluster/dht/src/dht-inode-read.c
+++ b/xlators/cluster/dht/src/dht-inode-read.c
@@ -130,10 +130,11 @@ int
dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata)
{
- uint64_t tmp_subvol = 0;
+ xlator_t *subvol = 0;
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ inode_t *inode = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -154,21 +155,23 @@ dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (local->call_cnt != 1)
goto out;
+ local->op_errno = op_errno;
/* Check if the rebalance phase2 is true */
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (ret) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+ ret = dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
/* Phase 2 of migration */
local->rebalance.target_op_fn = dht_attr2;
ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
} else {
/* value is already set in fd_ctx, that means no need
to check for whether its complete or not. */
dht_attr2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
}
out:
@@ -381,6 +384,8 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
dht_local_t *local = NULL;
int ret = 0;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
if (!local) {
@@ -396,19 +401,21 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if ((op_ret == -1) && (op_errno != ENOENT))
goto out;
+ local->op_errno = op_errno;
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
/* File would be migrated to other node */
- ret = fd_ctx_get (local->fd, this, NULL);
- if (ret) {
+ ret = dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
local->rebalance.target_op_fn = dht_readv2;
ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
} else {
/* value is already set in fd_ctx, that means no need
to check for whether its complete or not. */
dht_readv2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
}
out:
@@ -499,24 +506,34 @@ dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int ret = -1;
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
+ call_frame_t *prev = NULL;
local = frame->local;
+ prev = cookie;
+ if (!prev || !prev->this)
+ goto out;
if (local->call_cnt != 1)
goto out;
if ((op_ret == -1) && (op_errno == ENOTCONN) &&
IA_ISDIR(local->loc.inode->ia_type)) {
- subvol = dht_first_up_subvol (this);
+ subvol = dht_subvol_next_available (this, prev->this);
if (!subvol)
goto out;
+ /* check if we are done with visiting every node */
+ if (subvol == local->cached_subvol) {
+ goto out;
+ }
+
STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access,
&local->loc, local->rebalance.flags, NULL);
return 0;
}
if ((op_ret == -1) && (op_errno == ENOENT)) {
/* File would be migrated to other node */
+ local->op_errno = op_errno;
local->rebalance.target_op_fn = dht_access2;
ret = dht_rebalance_complete_check (frame->this, frame);
if (!ret)
@@ -604,8 +621,9 @@ int
dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int ret = -1;
+ dht_local_t *local = NULL;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
@@ -615,8 +633,8 @@ dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
/* If context is set, then send flush() it to the destination */
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (subvol) {
dht_flush2 (this, frame, 0);
return 0;
}
@@ -632,14 +650,10 @@ dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -701,12 +715,14 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
prev = cookie;
local->op_errno = op_errno;
- if (op_ret == -1) {
+ if (op_ret == -1 && (op_errno != ENOENT)) {
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
prev->this->name, strerror (op_errno));
@@ -721,8 +737,9 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
goto out;
}
- ret = fd_ctx_get (local->fd, this, NULL);
- if (ret) {
+ local->op_errno = op_errno;
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
local->rebalance.target_op_fn = dht_fsync2;
/* Check if the rebalance phase1 is true */
@@ -737,11 +754,12 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
if (IS_DHT_MIGRATION_PHASE2 (postbuf)) {
ret = dht_rebalance_complete_check (this, frame);
}
+ if (!ret)
+ return 0;
} else {
dht_fsync2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
out:
DHT_STRIP_PHASE1_FLAGS (postbuf);
@@ -757,15 +775,10 @@ dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
-
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c
index d4a3ecc39..4b3f3a049 100644
--- a/xlators/cluster/dht/src/dht-inode-write.c
+++ b/xlators/cluster/dht/src/dht-inode-write.c
@@ -19,6 +19,9 @@
int dht_writev2 (xlator_t *this, call_frame_t *frame, int ret);
int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret);
int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret);
+int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret);
+int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret);
+int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret);
int
dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -27,8 +30,9 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
dht_local_t *local = NULL;
int ret = -1;
+ xlator_t *subvol = NULL;
- if (op_ret == -1) {
+ if (op_ret == -1 && (op_errno != ENOENT)) {
goto out;
}
@@ -50,6 +54,7 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->rebalance.target_op_fn = dht_writev2;
+ local->op_errno = op_errno;
/* Phase 2 of migration */
if (IS_DHT_MIGRATION_PHASE2 (postbuf)) {
ret = dht_rebalance_complete_check (this, frame);
@@ -62,8 +67,8 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+ if (subvol) {
dht_writev2 (this, frame, 0);
return 0;
}
@@ -87,14 +92,10 @@ dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -169,6 +170,8 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -198,6 +201,7 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->rebalance.target_op_fn = dht_truncate2;
+ local->op_errno = op_errno;
/* Phase 2 of migration */
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
ret = dht_rebalance_complete_check (this, frame);
@@ -209,8 +213,9 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (subvol) {
dht_truncate2 (this, frame, 0);
return 0;
}
@@ -234,16 +239,13 @@ dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
+ inode_t *inode = NULL;
local = frame->local;
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ inode = local->fd ? local->fd->inode : local->loc.inode;
+ dht_inode_ctx_get1 (this, inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -346,6 +348,407 @@ err:
return 0;
}
+
+/*
+ * Callback for fallocate(), wound from dht_fallocate() (first attempt) and
+ * dht_fallocate2() (second attempt).
+ *
+ * - A failure other than ENOENT is recorded in local and unwound unchanged.
+ * - On the second attempt (call_cnt != 1) the reply is unwound; when
+ *   local->stbuf.ia_blocks is non-zero the saved iatts are merged into it.
+ * - On the first attempt, an ENOENT failure or a PHASE2 postbuf triggers
+ *   dht_rebalance_complete_check(); a PHASE1 postbuf either re-winds through
+ *   dht_fallocate2() (destination already present in the inode ctx) or
+ *   triggers dht_rebalance_in_progress_check(). When a check returns 0 this
+ *   callback returns without unwinding.
+ */
+int
+dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int op_ret, int op_errno, struct iatt *prebuf,
+                  struct iatt *postbuf, dict_t *xdata)
+{
+        dht_local_t  *local  = NULL;
+        call_frame_t *prev   = NULL;
+        int           ret    = -1;
+        xlator_t     *subvol = NULL;
+
+        GF_VALIDATE_OR_GOTO ("dht", frame, err);
+        GF_VALIDATE_OR_GOTO ("dht", this, out);
+        GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+        GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+        local = frame->local;
+        prev = cookie;
+
+        if ((op_ret == -1) && (op_errno != ENOENT)) {
+                local->op_errno = op_errno;
+                local->op_ret = -1;
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "subvolume %s returned -1 (%s)",
+                        prev->this->name, strerror (op_errno));
+
+                goto out;
+        }
+
+        if (local->call_cnt != 1) {
+                if (local->stbuf.ia_blocks) {
+                        dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+                        dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+                }
+                goto out;
+        }
+        local->rebalance.target_op_fn = dht_fallocate2;
+
+        /* keep local->op_errno current before the migration checks, as the
+         * writev/truncate callbacks in this file do */
+        local->op_errno = op_errno;
+
+        /* Phase 2 of migration */
+        if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+                ret = dht_rebalance_complete_check (this, frame);
+                if (!ret)
+                        return 0;
+        }
+
+        /* Check if the rebalance phase1 is true */
+        if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+                dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+                dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+                dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+                if (subvol) {
+                        /* destination already known: replay there */
+                        dht_fallocate2 (this, frame, 0);
+                        return 0;
+                }
+                ret = dht_rebalance_in_progress_check (this, frame);
+                if (!ret)
+                        return 0;
+        }
+
+out:
+        DHT_STRIP_PHASE1_FLAGS (postbuf);
+        DHT_STRIP_PHASE1_FLAGS (prebuf);
+        DHT_STACK_UNWIND (fallocate, frame, op_ret, op_errno,
+                          prebuf, postbuf, xdata);
+err:
+        return 0;
+}
+
+/* Second attempt of fallocate(): replay the saved arguments on the subvolume
+ * recorded in the inode ctx, or on local->cached_subvol when none is
+ * recorded. @op_ret is not used.
+ */
+int
+dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+        dht_local_t *local  = frame->local;
+        xlator_t    *target = NULL;
+
+        dht_inode_ctx_get1 (this, local->fd->inode, &target);
+        if (target == NULL)
+                target = local->cached_subvol;
+
+        /* This is the second attempt */
+        local->call_cnt = 2;
+
+        STACK_WIND(frame, dht_fallocate_cbk, target, target->fops->fallocate,
+                   local->fd, local->rebalance.flags, local->rebalance.offset,
+                   local->rebalance.size, NULL);
+
+        return 0;
+}
+
+/* fallocate() entry point: saves the arguments in local->rebalance and winds
+ * the call to the cached subvolume of @fd. On any setup failure the call is
+ * unwound with -1 and an errno (ENOMEM, EINVAL, or the current errno).
+ */
+int
+dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+              off_t offset, size_t len, dict_t *xdata)
+{
+        dht_local_t *local    = NULL;
+        xlator_t    *target   = NULL;
+        int          op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        local = dht_local_init (frame, NULL, fd, GF_FOP_FALLOCATE);
+        if (local == NULL) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        /* kept so that dht_fallocate2() can replay the call */
+        local->rebalance.flags  = mode;
+        local->rebalance.offset = offset;
+        local->rebalance.size   = len;
+        local->call_cnt         = 1;
+
+        target = local->cached_subvol;
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no cached subvolume for fd=%p", fd);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        STACK_WIND (frame, dht_fallocate_cbk,
+                    target, target->fops->fallocate,
+                    fd, mode, offset, len, xdata);
+
+        return 0;
+
+err:
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+
+        return 0;
+}
+
+
+/*
+ * Callback for discard(), wound from dht_discard() (first attempt) and
+ * dht_discard2() (second attempt).
+ *
+ * - A failure other than ENOENT is recorded in local and unwound unchanged.
+ * - On the second attempt (call_cnt != 1) the reply is unwound; when
+ *   local->stbuf.ia_blocks is non-zero the saved iatts are merged into it.
+ * - On the first attempt, an ENOENT failure or a PHASE2 postbuf triggers
+ *   dht_rebalance_complete_check(); a PHASE1 postbuf either re-winds through
+ *   dht_discard2() (destination already present in the inode ctx) or
+ *   triggers dht_rebalance_in_progress_check(). When a check returns 0 this
+ *   callback returns without unwinding.
+ */
+int
+dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int op_ret, int op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata)
+{
+        dht_local_t  *local  = NULL;
+        call_frame_t *prev   = NULL;
+        int           ret    = -1;
+        xlator_t     *subvol = NULL;
+
+        GF_VALIDATE_OR_GOTO ("dht", frame, err);
+        GF_VALIDATE_OR_GOTO ("dht", this, out);
+        GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+        GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+        local = frame->local;
+        prev = cookie;
+
+        if ((op_ret == -1) && (op_errno != ENOENT)) {
+                local->op_errno = op_errno;
+                local->op_ret = -1;
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "subvolume %s returned -1 (%s)",
+                        prev->this->name, strerror (op_errno));
+
+                goto out;
+        }
+
+        if (local->call_cnt != 1) {
+                if (local->stbuf.ia_blocks) {
+                        dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+                        dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+                }
+                goto out;
+        }
+        local->rebalance.target_op_fn = dht_discard2;
+
+        /* keep local->op_errno current before the migration checks, as the
+         * writev/truncate callbacks in this file do */
+        local->op_errno = op_errno;
+
+        /* Phase 2 of migration */
+        if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+                ret = dht_rebalance_complete_check (this, frame);
+                if (!ret)
+                        return 0;
+        }
+
+        /* Check if the rebalance phase1 is true */
+        if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+                dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+                dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+                dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+                if (subvol) {
+                        /* destination already known: replay there */
+                        dht_discard2 (this, frame, 0);
+                        return 0;
+                }
+                ret = dht_rebalance_in_progress_check (this, frame);
+                if (!ret)
+                        return 0;
+        }
+
+out:
+        DHT_STRIP_PHASE1_FLAGS (postbuf);
+        DHT_STRIP_PHASE1_FLAGS (prebuf);
+        DHT_STACK_UNWIND (discard, frame, op_ret, op_errno,
+                          prebuf, postbuf, xdata);
+err:
+        return 0;
+}
+
+/* Second attempt of discard(): replay the saved offset/size on the subvolume
+ * recorded in the inode ctx, or on local->cached_subvol when none is
+ * recorded. @op_ret is not used.
+ */
+int
+dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+        dht_local_t *local  = frame->local;
+        xlator_t    *target = NULL;
+
+        dht_inode_ctx_get1 (this, local->fd->inode, &target);
+        if (target == NULL)
+                target = local->cached_subvol;
+
+        /* This is the second attempt */
+        local->call_cnt = 2;
+
+        STACK_WIND(frame, dht_discard_cbk, target, target->fops->discard,
+                   local->fd, local->rebalance.offset, local->rebalance.size,
+                   NULL);
+
+        return 0;
+}
+
+/* discard() entry point: saves offset/len in local->rebalance and winds the
+ * call to the cached subvolume of @fd. On any setup failure the call is
+ * unwound with -1 and an errno (ENOMEM, EINVAL, or the current errno).
+ */
+int
+dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata)
+{
+        dht_local_t *local    = NULL;
+        xlator_t    *target   = NULL;
+        int          op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        local = dht_local_init (frame, NULL, fd, GF_FOP_DISCARD);
+        if (local == NULL) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        /* kept so that dht_discard2() can replay the call */
+        local->rebalance.offset = offset;
+        local->rebalance.size   = len;
+        local->call_cnt         = 1;
+
+        target = local->cached_subvol;
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no cached subvolume for fd=%p", fd);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        STACK_WIND (frame, dht_discard_cbk, target, target->fops->discard,
+                    fd, offset, len, xdata);
+
+        return 0;
+
+err:
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * Callback for zerofill(), wound from dht_zerofill() (first attempt) and
+ * dht_zerofill2() (second attempt).
+ *
+ * - A failure other than ENOENT is recorded in local and unwound unchanged.
+ * - On the second attempt (call_cnt != 1) the reply is unwound; when
+ *   local->stbuf.ia_blocks is non-zero the saved iatts are merged into it.
+ * - On the first attempt, an ENOENT failure or a PHASE2 postbuf triggers
+ *   dht_rebalance_complete_check(); a PHASE1 postbuf either re-winds through
+ *   dht_zerofill2() (destination already present in the inode ctx) or
+ *   triggers dht_rebalance_in_progress_check(). When a check returns 0 this
+ *   callback returns without unwinding.
+ */
+int
+dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int op_ret, int op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
+{
+        dht_local_t  *local  = NULL;
+        call_frame_t *prev   = NULL;
+        int           ret    = -1;
+        xlator_t     *subvol = NULL;
+
+        GF_VALIDATE_OR_GOTO ("dht", frame, err);
+        GF_VALIDATE_OR_GOTO ("dht", this, out);
+        GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+        GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+        local = frame->local;
+        prev = cookie;
+
+        if ((op_ret == -1) && (op_errno != ENOENT)) {
+                local->op_errno = op_errno;
+                local->op_ret = -1;
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "subvolume %s returned -1 (%s)",
+                        prev->this->name, strerror (op_errno));
+                goto out;
+        }
+
+        if (local->call_cnt != 1) {
+                if (local->stbuf.ia_blocks) {
+                        dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+                        dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+                }
+                goto out;
+        }
+        local->rebalance.target_op_fn = dht_zerofill2;
+
+        /* keep local->op_errno current before the migration checks, as the
+         * writev/truncate callbacks in this file do */
+        local->op_errno = op_errno;
+
+        /* Phase 2 of migration */
+        if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+                ret = dht_rebalance_complete_check (this, frame);
+                if (!ret)
+                        return 0;
+        }
+
+        /* Check if the rebalance phase1 is true */
+        if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+                dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+                dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+                /* The migration destination is kept in the inode ctx (set
+                 * by dht_rebalance_inprogress_task), not in the fd ctx, so
+                 * look it up there like fallocate/discard do. */
+                dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+                if (subvol) {
+                        dht_zerofill2 (this, frame, 0);
+                        return 0;
+                }
+                ret = dht_rebalance_in_progress_check (this, frame);
+                if (!ret)
+                        return 0;
+        }
+
+out:
+        DHT_STRIP_PHASE1_FLAGS (postbuf);
+        DHT_STRIP_PHASE1_FLAGS (prebuf);
+        DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno,
+                          prebuf, postbuf, xdata);
+err:
+        return 0;
+}
+
+/* Second attempt of zerofill(): replay the saved offset/size on the
+ * subvolume recorded in the inode ctx, or on local->cached_subvol when none
+ * is recorded. @op_ret is not used.
+ */
+int
+dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+        dht_local_t *local  = NULL;
+        xlator_t    *subvol = NULL;
+
+        local = frame->local;
+
+        /* The migration destination is stored in the inode ctx (see
+         * dht_rebalance_inprogress_task); the fd ctx is no longer written,
+         * so reading it would always fall back to cached_subvol. */
+        dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+
+        if (!subvol)
+                subvol = local->cached_subvol;
+
+        local->call_cnt = 2; /* This is the second attempt */
+
+        STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill,
+                   local->fd, local->rebalance.offset, local->rebalance.size,
+                   NULL);
+
+        return 0;
+}
+
+/* zerofill() entry point: saves offset/len in local->rebalance and winds the
+ * call to the cached subvolume of @fd. On any setup failure the call is
+ * unwound with -1 and an errno (ENOMEM, EINVAL, or the current errno).
+ */
+int
+dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             size_t len, dict_t *xdata)
+{
+        dht_local_t *local    = NULL;
+        xlator_t    *target   = NULL;
+        int          op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL);
+        if (local == NULL) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        /* kept so that dht_zerofill2() can replay the call */
+        local->rebalance.offset = offset;
+        local->rebalance.size   = len;
+        local->call_cnt         = 1;
+
+        target = local->cached_subvol;
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no cached subvolume for fd=%p", fd);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        STACK_WIND (frame, dht_zerofill_cbk, target, target->fops->zerofill,
+                    fd, offset, len, xdata);
+
+        return 0;
+
+err:
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+
+        return 0;
+}
+
+
+
/* handle cases of migration here for 'setattr()' calls */
int
dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -397,15 +800,13 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
+ inode_t *inode = NULL;
local = frame->local;
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+ dht_inode_ctx_get1 (this, inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
index 71aa1b70c..38e9970a7 100644
--- a/xlators/cluster/dht/src/dht-layout.c
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -335,11 +335,12 @@ int
dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
int op_ret, int op_errno, dict_t *xattr)
{
- int i = 0;
- int ret = -1;
- int err = -1;
- void *disk_layout_raw = NULL;
- int disk_layout_len = 0;
+ int i = 0;
+ int ret = -1;
+ int err = -1;
+ void *disk_layout_raw = NULL;
+ int disk_layout_len = 0;
+ dht_conf_t *conf = this->private;
if (op_ret != 0) {
err = op_errno;
@@ -360,12 +361,12 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
if (xattr) {
/* during lookup and not mkdir */
- ret = dict_get_ptr_and_len (xattr, "trusted.glusterfs.dht",
+ ret = dict_get_ptr_and_len (xattr, conf->xattr_name,
&disk_layout_raw, &disk_layout_len);
}
if (ret != 0) {
- layout->list[i].err = -1;
+ layout->list[i].err = 0;
gf_log (this->name, GF_LOG_TRACE,
"missing disk layout on %s. err = %d",
subvol->name, err);
@@ -453,12 +454,19 @@ dht_layout_entry_cmp (dht_layout_t *layout, int i, int j)
{
int64_t diff = 0;
+ /* swap zero'ed out layouts to front, if needed */
+ if (!layout->list[j].start && !layout->list[j].stop) {
+ diff = (int64_t) layout->list[i].stop
+ - (int64_t) layout->list[j].stop;
+ goto out;
+ }
if (layout->list[i].err || layout->list[j].err)
diff = layout->list[i].err - layout->list[j].err;
else
diff = (int64_t) layout->list[i].start
- (int64_t) layout->list[j].start;
+out:
return diff;
}
@@ -529,23 +537,30 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
prev_stop = last_stop;
for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].err) {
- switch (layout->list[i].err) {
- case -1:
- case ENOENT:
- missing++;
- break;
- case ENOTCONN:
- down++;
- break;
- case ENOSPC:
- no_space++;
- break;
- default:
- misc++;
+ switch (layout->list[i].err) {
+ case -1:
+ case ENOENT:
+ missing++;
+ continue;
+ case ENOTCONN:
+ down++;
+ continue;
+ case ENOSPC:
+ no_space++;
+ continue;
+ case 0:
+ /* if err == 0 and start == stop, then it is a non-
+ * participating subvolume(spread-cnt). Then, do not
+ * check for anomalies. If start != stop, then treat it
+ * as misc err */
+ if (layout->list[i].start == layout->list[i].stop) {
+ continue;
}
+ break;
+ default:
+ misc++;
continue;
- }
+ }
is_virgin = 0;
@@ -650,30 +665,29 @@ out:
}
int
-dht_dir_has_layout (dict_t *xattr)
+dht_dir_has_layout (dict_t *xattr, char *name) /* name: xattr key to probe */
{
void *disk_layout_raw = NULL;
- return dict_get_ptr (xattr, "trusted.glusterfs.dht",
- &disk_layout_raw);
-
+ return dict_get_ptr (xattr, name, &disk_layout_raw); /* status of the
+ * lookup is returned unchanged; the fetched pointer itself is
+ * discarded. Other callers in this patch treat a negative
+ * dict_get_ptr result as "key absent" -- TODO confirm. */
}
int
dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
loc_t *loc, dict_t *xattr)
{
- int idx = 0;
- int pos = -1;
- int ret = 0;
- int err = 0;
- int dict_ret = 0;
- int32_t disk_layout[4];
- void *disk_layout_raw = NULL;
- int32_t count = -1;
- uint32_t start_off = -1;
- uint32_t stop_off = -1;
+ int idx = 0;
+ int pos = -1;
+ int ret = 0;
+ int err = 0;
+ int dict_ret = 0;
+ int32_t disk_layout[4];
+ void *disk_layout_raw = NULL;
+ int32_t count = -1;
+ uint32_t start_off = -1;
+ uint32_t stop_off = -1;
+ dht_conf_t *conf = this->private;
for (idx = 0; idx < layout->cnt; idx++) {
@@ -703,7 +717,7 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
goto out;
}
- dict_ret = dict_get_ptr (xattr, "trusted.glusterfs.dht",
+ dict_ret = dict_get_ptr (xattr, conf->xattr_name,
&disk_layout_raw);
if (dict_ret < 0) {
diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c
index 67d6ce583..dbc9d0b3c 100644
--- a/xlators/cluster/dht/src/dht-linkfile.c
+++ b/xlators/cluster/dht/src/dht-linkfile.c
@@ -19,6 +19,35 @@
#include "compat.h"
#include "dht-common.h"
+int
+dht_linkfile_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ char is_linkfile = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ /* Lookup callback wound by dht_linkfile_create_cbk when the linkfile
+ * mknod failed with EEXIST. Logs a warning if the existing entry is
+ * not a linkfile, then always forwards the lookup result to the
+ * saved local->linkfile.linkfile_cbk. Always returns 0. */
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ if (op_ret) /* failed lookup: nothing to inspect, just forward it */
+ goto out;
+
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
+ if (!is_linkfile) /* only logged; op_ret/op_errno are not altered */
+ gf_log (this->name, GF_LOG_WARNING, "got non-linkfile %s:%s",
+ prev->this->name, local->loc.path);
+out: /* This callback receives only postparent, so it is passed in both
+ * the preparent and postparent slots of the mknod-style callback. */
+ local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
+ inode, stbuf, postparent, postparent,
+ xattr);
+ return 0;
+}
#define is_equal(a, b) (a == b)
int
@@ -28,27 +57,61 @@ dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postparent, dict_t *xdata)
{
dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ call_frame_t *prev = NULL;
+ dict_t *xattrs = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
local = frame->local;
if (!op_ret)
local->linked = _gf_true;
+ FRAME_SU_UNDO (frame, dht_local_t);
+
+ if (op_ret && (op_errno == EEXIST)) {
+ conf = this->private;
+ prev = cookie;
+ subvol = prev->this;
+ if (!subvol)
+ goto out;
+ xattrs = dict_new ();
+ if (!xattrs)
+ goto out;
+ ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set linkto key");
+ goto out;
+ }
+
+ STACK_WIND (frame, dht_linkfile_lookup_cbk, subvol,
+ subvol->fops->lookup, &local->loc, xattrs);
+ if (xattrs)
+ dict_unref (xattrs);
+ return 0;
+ }
+out:
local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
inode, stbuf, preparent, postparent,
xdata);
+ if (xattrs)
+ dict_unref (xattrs);
return 0;
}
int
dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+ xlator_t *this,
xlator_t *tovol, xlator_t *fromvol, loc_t *loc)
{
dht_local_t *local = NULL;
dict_t *dict = NULL;
int need_unref = 0;
int ret = 0;
+ dht_conf_t *conf = this->private;
local = frame->local;
local->linkfile.linkfile_cbk = linkfile_cbk;
@@ -76,8 +139,7 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
gf_log ("dht-linkfile", GF_LOG_INFO,
"%s: internal-fop set failed", loc->path);
- ret = dict_set_str (dict, "trusted.glusterfs.dht.linkto",
- tovol->name);
+ ret = dict_set_str (dict, conf->link_xattr_name, tovol->name);
if (ret < 0) {
gf_log (frame->this->name, GF_LOG_INFO,
@@ -87,6 +149,9 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
}
local->link_subvol = fromvol;
+ /* Always create as root:root. dht_linkfile_attr_heal fixes the
+ * ownership */
+ FRAME_SU_DO (frame, dht_local_t);
STACK_WIND (frame, dht_linkfile_create_cbk,
fromvol, fromvol->fops->mknod, loc,
S_IFREG | DHT_LINKFILE_MODE, 0, 0, dict);
@@ -179,7 +244,7 @@ dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf,
if (!xattr)
goto out;
- ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname);
+ ret = dict_get_ptr (xattr, conf->link_xattr_name, &volname);
if ((-1 == ret) || !volname)
goto out;
@@ -232,11 +297,11 @@ dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this)
GF_VALIDATE_OR_GOTO ("dht", local, out);
GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out);
- if ((local->stbuf.ia_type == IA_INVAL) ||
- (is_equal (frame->root->uid, local->stbuf.ia_uid) &&
- is_equal (frame->root->gid, local->stbuf.ia_gid)))
+ if (local->stbuf.ia_type == IA_INVAL)
return 0;
+ uuid_copy (local->loc.gfid, local->stbuf.ia_gfid);
+
copy = copy_frame (frame);
if (!copy)
@@ -252,6 +317,8 @@ dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this)
copy->local = copy_local;
+ FRAME_SU_DO (copy, dht_local_t);
+
STACK_WIND (copy, dht_linkfile_setattr_cbk, subvol,
subvol->fops->setattr, &copy_local->loc,
&stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL);
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index af31072aa..bcb19f23e 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -102,12 +102,16 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
data_t *data = NULL;
struct iatt iatt = {0,};
int32_t op_errno = 0;
+ dht_conf_t *conf = NULL;
GF_VALIDATE_OR_GOTO ("defrag", loc, out);
GF_VALIDATE_OR_GOTO ("defrag", loc->name, out);
GF_VALIDATE_OR_GOTO ("defrag", stbuf, out);
GF_VALIDATE_OR_GOTO ("defrag", this, out);
GF_VALIDATE_OR_GOTO ("defrag", xattrs, out);
+ GF_VALIDATE_OR_GOTO ("defrag", this->private, out);
+
+ conf = this->private;
if (uuid_is_null (loc->pargfid)) {
gf_log ("", GF_LOG_ERROR, "loc->pargfid is NULL for "
@@ -138,10 +142,10 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
gf_log (this->name, GF_LOG_INFO, "Attempting to migrate hardlink %s "
"with gfid %s from %s -> %s", loc->name, uuid_utoa (loc->gfid),
cached_subvol->name, hashed_subvol->name);
- data = dict_get (xattrs, DHT_LINKFILE_KEY);
+ data = dict_get (xattrs, conf->link_xattr_name);
/* set linkto on cached -> hashed if not present, else link it */
if (!data) {
- ret = dict_set_str (xattrs, DHT_LINKFILE_KEY,
+ ret = dict_set_str (xattrs, conf->link_xattr_name,
hashed_subvol->name);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed to set "
@@ -239,14 +243,16 @@ out:
static inline int
__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf,
- dict_t *dict, fd_t **dst_fd)
+ dict_t *dict, fd_t **dst_fd, dict_t *xattr)
{
- xlator_t *this = NULL;
- int ret = -1;
- fd_t *fd = NULL;
- struct iatt new_stbuf = {0,};
+ xlator_t *this = NULL;
+ int ret = -1;
+ fd_t *fd = NULL;
+ struct iatt new_stbuf = {0,};
+ dht_conf_t *conf = NULL;
this = THIS;
+ conf = this->private;
ret = dict_set_static_bin (dict, "gfid-req", stbuf->ia_gfid, 16);
if (ret) {
@@ -255,7 +261,7 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
goto out;
}
- ret = dict_set_str (dict, DHT_LINKFILE_KEY, from->name);
+ ret = dict_set_str (dict, conf->link_xattr_name, from->name);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set gfid in dict for create", loc->path);
@@ -293,7 +299,7 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
/* Create the destination with LINKFILE mode, and linkto xattr,
if the linkfile already exists, it will just open the file */
ret = syncop_create (to, loc, O_RDWR, DHT_LINKFILE_MODE, fd,
- dict);
+ dict, &new_stbuf);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"failed to create %s on %s (%s)",
@@ -301,6 +307,12 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
goto out;
}
+ ret = syncop_fsetxattr (to, fd, xattr, 0);
+ if (ret == -1)
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to set xattr on %s (%s)",
+ loc->path, to->name, strerror (errno));
+
ret = syncop_ftruncate (to, fd, stbuf->ia_size);
if (ret < 0)
gf_log (this->name, GF_LOG_ERROR,
@@ -334,6 +346,9 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
int ret = -1;
xlator_t *this = NULL;
+ uint64_t src_statfs_blocks = 1;
+ uint64_t dst_statfs_blocks = 1;
+
this = THIS;
ret = syncop_statfs (from, loc, &src_statfs);
@@ -357,22 +372,34 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
if (flag != GF_DHT_MIGRATE_DATA)
goto check_avail_space;
- if (((dst_statfs.f_bavail *
- dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) <
- (((src_statfs.f_bavail * src_statfs.f_bsize) /
- GF_DISK_SECTOR_SIZE) - stbuf->ia_blocks)) {
- gf_log (this->name, GF_LOG_WARNING,
- "data movement attempted from node (%s) with"
- " higher disk space to a node (%s) with "
- "lesser disk space (%s)", from->name,
- to->name, loc->path);
-
- /* this is not a 'failure', but we don't want to
- consider this as 'success' too :-/ */
- ret = 1;
- goto out;
+ /* Check:
+ During rebalance `migrate-data` - Destination subvol experiences
+ a `reduction` in 'blocks' of free space, at the same time source
+ subvol gains certain 'blocks' of free space. A valid check is
+ necessary here to avoid erroneous move to destination where
+ the space could be scantily available.
+ */
+ if (stbuf) {
+ dst_statfs_blocks = ((dst_statfs.f_bavail *
+ dst_statfs.f_bsize) /
+ GF_DISK_SECTOR_SIZE);
+ src_statfs_blocks = ((src_statfs.f_bavail *
+ src_statfs.f_bsize) /
+ GF_DISK_SECTOR_SIZE);
+ if ((dst_statfs_blocks - stbuf->ia_blocks) <
+ (src_statfs_blocks + stbuf->ia_blocks)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "data movement attempted from node (%s) with"
+ " higher disk space to a node (%s) with "
+ "lesser disk space (%s)", from->name,
+ to->name, loc->path);
+
+ /* this is not a 'failure', but we don't want to
+ consider this as 'success' too :-/ */
+ ret = 1;
+ goto out;
+ }
}
-
check_avail_space:
if (((dst_statfs.f_bavail * dst_statfs.f_bsize) /
GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) {
@@ -449,8 +476,10 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc,
dict_t *dict = NULL;
xlator_t *this = NULL;
struct iatt iatt = {0,};
+ dht_conf_t *conf = NULL;
this = THIS;
+ conf = this->private;
fd = fd_create (loc->inode, DHT_REBALANCE_PID);
if (!fd) {
@@ -473,7 +502,7 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc,
if (!dict)
goto out;
- ret = dict_set_str (dict, DHT_LINKFILE_KEY, to->name);
+ ret = dict_set_str (dict, conf->link_xattr_name, to->name);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"failed to set xattr in dict for %s (linkto:%s)",
@@ -526,12 +555,13 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
dict_t *dict = NULL;
char *link = NULL;
struct iatt stbuf = {0,};
+ dht_conf_t *conf = this->private;
dict = dict_new ();
if (!dict)
goto out;
- ret = dict_set_int32 (dict, DHT_LINKFILE_KEY, 256);
+ ret = dict_set_int32 (dict, conf->link_xattr_name, 256);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set 'linkto' key in dict", loc->path);
@@ -547,12 +577,13 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
}
/* we no more require this key */
- dict_del (dict, DHT_LINKFILE_KEY);
+ dict_del (dict, conf->link_xattr_name);
/* file exists in target node, only if it is 'linkfile' its valid,
otherwise, error out */
if (!ret) {
- if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict)) {
+ if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict,
+ conf->link_xattr_name)) {
gf_log (this->name, GF_LOG_WARNING,
"%s: file exists in destination", loc->path);
ret = -1;
@@ -588,7 +619,7 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
goto out;
}
- ret = syncop_symlink (to, loc, link, dict);
+ ret = syncop_symlink (to, loc, link, dict, 0);
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: creating symlink failed (%s)",
@@ -602,7 +633,7 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
ret = syncop_mknod (to, loc, st_mode_from_ia (buf->ia_prot,
buf->ia_type),
makedev (ia_major (buf->ia_rdev),
- ia_minor (buf->ia_rdev)), dict);
+ ia_minor (buf->ia_rdev)), dict, 0);
if (ret) {
gf_log (this->name, GF_LOG_WARNING, "%s: mknod failed (%s)",
loc->path, strerror (errno));
@@ -610,6 +641,15 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
}
done:
+ ret = syncop_setattr (to, loc, buf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID |
+ GF_SET_ATTR_MODE), NULL, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform setattr on %s (%s)",
+ loc->path, to->name, strerror (errno));
+ }
+
ret = syncop_unlink (from, loc);
if (ret)
gf_log (this->name, GF_LOG_WARNING, "%s: unlink failed (%s)",
@@ -647,6 +687,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
dict_t *xattr = NULL;
dict_t *xattr_rsp = NULL;
int file_has_holes = 0;
+ dht_conf_t *conf = this->private;
gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to %s",
loc->path, from->name, to->name);
@@ -655,7 +696,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (!dict)
goto out;
- ret = dict_set_int32 (dict, DHT_LINKFILE_KEY, 256);
+ ret = dict_set_int32 (dict, conf->link_xattr_name, 256);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set 'linkto' key in dict", loc->path);
@@ -671,7 +712,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
/* we no more require this key */
- dict_del (dict, DHT_LINKFILE_KEY);
+ dict_del (dict, conf->link_xattr_name);
/* preserve source mode, so set the same to the destination */
src_ia_prot = stbuf.ia_prot;
@@ -688,9 +729,16 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+ /* TODO: move all xattr related operations to fd based operations */
+ ret = syncop_listxattr (from, loc, &xattr);
+ if (ret == -1)
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to get xattr from %s (%s)",
+ loc->path, from->name, strerror (errno));
+
/* create the destination, with required modes/xattr */
ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf,
- dict, &dst_fd);
+ dict, &dst_fd, xattr);
if (ret)
goto out;
@@ -707,6 +755,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+
ret = syncop_fstat (from, src_fd, &stbuf);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s (%s)",
@@ -736,19 +785,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
- /* TODO: move all xattr related operations to fd based operations */
- ret = syncop_listxattr (from, loc, &xattr);
- if (ret == -1)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to get xattr from %s (%s)",
- loc->path, from->name, strerror (errno));
-
- ret = syncop_setxattr (to, loc, xattr, 0);
- if (ret == -1)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set xattr on %s (%s)",
- loc->path, to->name, strerror (errno));
-
/* TODO: Sync the locks */
ret = syncop_fsync (to, dst_fd, 0);
@@ -812,6 +848,23 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+ /* Free up the data blocks on the source node, as the whole
+ file is migrated */
+ ret = syncop_ftruncate (from, src_fd, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform truncate on %s (%s)",
+ loc->path, from->name, strerror (errno));
+ }
+
+ /* remove the 'linkto' xattr from the destination */
+ ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform removexattr on %s (%s)",
+ loc->path, to->name, strerror (errno));
+ }
+
/* Do a stat and check the gfid before unlink */
ret = syncop_stat (from, loc, &empty_iatt);
if (ret) {
@@ -832,23 +885,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
}
- /* Free up the data blocks on the source node, as the whole
- file is migrated */
- ret = syncop_ftruncate (from, src_fd, 0);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to perform truncate on %s (%s)",
- loc->path, from->name, strerror (errno));
- }
-
- /* remove the 'linkto' xattr from the destination */
- ret = syncop_fremovexattr (to, dst_fd, DHT_LINKFILE_KEY);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to perform removexattr on %s (%s)",
- loc->path, to->name, strerror (errno));
- }
-
ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
@@ -1090,6 +1126,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
struct timeval end = {0,};
double elapsed = {0,};
struct timeval start = {0,};
+ int32_t err = 0;
gf_log (this->name, GF_LOG_INFO, "migrate data called on %s",
loc->path);
@@ -1247,9 +1284,21 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
ret = syncop_setxattr (this, &entry_loc, migrate_data,
0);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "migrate-data"
- " failed for %s", entry_loc.path);
- defrag->total_failures +=1;
+ err = op_errno;
+ /* errno is overloaded. See
+ * rebalance_task_completion () */
+ if (err != ENOSPC) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "migrate-data skipped for %s"
+ " due to space constraints",
+ entry_loc.path);
+ defrag->skipped +=1;
+ } else{
+ gf_log (this->name, GF_LOG_ERROR,
+ "migrate-data failed for %s",
+ entry_loc.path);
+ defrag->total_failures +=1;
+ }
}
if (ret == -1) {
@@ -1648,6 +1697,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
uint64_t size = 0;
uint64_t lookup = 0;
uint64_t failures = 0;
+ uint64_t skipped = 0;
char *status = "";
double elapsed = 0;
struct timeval end = {0,};
@@ -1664,6 +1714,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
size = defrag->total_data;
lookup = defrag->num_files_lookedup;
failures = defrag->total_failures;
+ skipped = defrag->skipped;
gettimeofday (&end, NULL);
@@ -1687,6 +1738,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
gf_log (THIS->name, GF_LOG_WARNING,
"failed to set lookedup file count");
+
ret = dict_set_int32 (dict, "status", defrag->defrag_status);
if (ret)
gf_log (THIS->name, GF_LOG_WARNING,
@@ -1699,6 +1751,14 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
}
ret = dict_set_uint64 (dict, "failures", failures);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to set failure count");
+
+ ret = dict_set_uint64 (dict, "skipped", skipped);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to set skipped file count");
log:
switch (defrag->defrag_status) {
case GF_DEFRAG_STATUS_NOT_STARTED:
@@ -1716,13 +1776,15 @@ log:
case GF_DEFRAG_STATUS_FAILED:
status = "failed";
break;
+ default:
+ break;
}
gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f "
"secs", status, elapsed);
gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %"
- PRIu64", lookups: %"PRIu64", failures: %"PRIu64, files, size,
- lookup, failures);
+ PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: "
+ "%"PRIu64, files, size, lookup, failures, skipped);
out:
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
index 35fedeaa7..5d6f4f232 100644
--- a/xlators/cluster/dht/src/dht-rename.c
+++ b/xlators/cluster/dht/src/dht-rename.c
@@ -306,7 +306,38 @@ err:
NULL, NULL);
return 0;
}
+#define DHT_MARK_FOP_INTERNAL(xattr) do { /* set GLUSTERFS_INTERNAL_FOP_KEY="yes" in xattr */ \
+ int tmp = -1; \
+ if (!xattr) { /* xattr is assigned below, so it must be an lvalue */ \
+ xattr = dict_new (); /* allocate the dict on demand */ \
+ if (!xattr) \
+ break; /* allocation failed: xattr stays NULL, key is not set */ \
+ } \
+ tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \
+ if (tmp) { /* failure is only logged; expansion continues normally */ \
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set" /* uses 'this' and 'local' from the expanding scope */ \
+ " internal dict key for %s", local->loc.path); \
+ } \
+ }while (0)
+int
+dht_rename_done (call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ /* Final step of a rename: if local->linked is set, heal the linkfile
+ * attributes, then unwind the rename with the op_ret/op_errno and
+ * iatt buffers accumulated in frame->local. Always returns 0. */
+ local = frame->local;
+
+ if (local->linked == _gf_true) {
+ local->linked = _gf_false; /* reset before the heal is issued */
+ dht_linkfile_attr_heal (frame, this);
+ }
+ DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
+ DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
+ &local->stbuf, &local->preoldparent,
+ &local->postoldparent, &local->preparent,
+ &local->postparent, NULL);
+ return 0;
+}
int
dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -340,11 +371,7 @@ dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
WIPE (&local->postparent);
if (is_last_call (this_call_cnt)) {
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent, NULL);
+ dht_rename_done (frame, this);
}
out:
@@ -362,7 +389,7 @@ dht_rename_cleanup (call_frame_t *frame)
xlator_t *dst_hashed = NULL;
xlator_t *dst_cached = NULL;
int call_cnt = 0;
-
+ dict_t *xattr = NULL;
local = frame->local;
this = frame->this;
@@ -386,13 +413,15 @@ dht_rename_cleanup (call_frame_t *frame)
if (!call_cnt)
goto nolinks;
+ DHT_MARK_FOP_INTERNAL (xattr);
+
if (dst_hashed != src_hashed && dst_hashed != src_cached) {
gf_log (this->name, GF_LOG_TRACE,
"unlinking linkfile %s @ %s => %s",
local->loc.path, dst_hashed->name, src_cached->name);
STACK_WIND (frame, dht_rename_unlink_cbk,
dst_hashed, dst_hashed->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr);
}
if (src_cached != dst_hashed) {
@@ -401,9 +430,12 @@ dht_rename_cleanup (call_frame_t *frame)
local->loc2.path, src_cached->name);
STACK_WIND (frame, dht_rename_unlink_cbk,
src_cached, src_cached->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr);
}
+ if (xattr)
+ dict_unref (xattr);
+
return 0;
nolinks:
@@ -467,6 +499,7 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
xlator_t *rename_subvol = NULL;
call_frame_t *link_frame = NULL;
dht_local_t *link_local = NULL;
+ dict_t *xattr = NULL;
local = frame->local;
prev = cookie;
@@ -476,6 +509,8 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dst_hashed = local->dst_hashed;
dst_cached = local->dst_cached;
+ if (local->linked == _gf_true)
+ FRAME_SU_UNDO (frame, dht_local_t);
if (op_ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
"%s: rename on %s failed (%s)", local->loc.path,
@@ -505,21 +540,26 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
uuid_copy (link_local->gfid, local->loc.inode->gfid);
dht_linkfile_create (link_frame, dht_rename_links_create_cbk,
- src_cached, dst_hashed, &link_local->loc);
+ this, src_cached, dst_hashed,
+ &link_local->loc);
}
err:
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->preoldparent, preoldparent, prev->this);
- dht_iatt_merge (this, &local->postoldparent, postoldparent, prev->this);
- dht_iatt_merge (this, &local->preparent, prenewparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postnewparent, prev->this);
-
- if (local->linked == _gf_true) {
- local->linked = _gf_false;
- dht_linkfile_attr_heal (frame, this);
+ /* Merge attrs only from src_cached. In case of src_cached !=
+ * dst_hashed, this ignores linkfile attrs. */
+ if (prev->this == src_cached) {
+ dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
+ dht_iatt_merge (this, &local->preoldparent, preoldparent,
+ prev->this);
+ dht_iatt_merge (this, &local->postoldparent, postoldparent,
+ prev->this);
+ dht_iatt_merge (this, &local->preparent, prenewparent,
+ prev->this);
+ dht_iatt_merge (this, &local->postparent, postnewparent,
+ prev->this);
}
+
/* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk
* is called. since rename has already happened on rename_subvol,
* unlink should not be sent for oldpath (either linkfile or cached-file)
@@ -543,6 +583,8 @@ err:
if (local->call_cnt == 0)
goto unwind;
+ DHT_MARK_FOP_INTERNAL (xattr);
+
if (src_cached != dst_hashed && src_cached != dst_cached) {
gf_log (this->name, GF_LOG_TRACE,
"deleting old src datafile %s @ %s",
@@ -550,7 +592,7 @@ err:
STACK_WIND (frame, dht_rename_unlink_cbk,
src_cached, src_cached->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr);
}
if (src_hashed != rename_subvol && src_hashed != src_cached) {
@@ -560,7 +602,7 @@ err:
STACK_WIND (frame, dht_rename_unlink_cbk,
src_hashed, src_hashed->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr);
}
if (dst_cached
@@ -572,8 +614,10 @@ err:
STACK_WIND (frame, dht_rename_unlink_cbk,
dst_cached, dst_cached->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr);
}
+ if (xattr)
+ dict_unref (xattr);
return 0;
unwind:
@@ -581,16 +625,16 @@ unwind:
WIPE (&local->postoldparent);
WIPE (&local->preparent);
WIPE (&local->postparent);
+ if (xattr)
+ dict_unref (xattr);
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent, NULL);
+ dht_rename_done (frame, this);
return 0;
cleanup:
+ if (xattr)
+ dict_unref (xattr);
dht_rename_cleanup (frame);
return 0;
@@ -624,6 +668,8 @@ dht_do_rename (call_frame_t *frame)
"renaming %s => %s (%s)",
local->loc.path, local->loc2.path, rename_subvol->name);
+ if (local->linked == _gf_true)
+ FRAME_SU_DO (frame, dht_local_t);
STACK_WIND (frame, dht_rename_cbk,
rename_subvol, rename_subvol->fops->rename,
&local->loc, &local->loc2, NULL);
@@ -654,7 +700,8 @@ dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->op_ret = -1;
if (op_errno != ENOENT)
local->op_errno = op_errno;
- } else {
+ } else if (local->src_cached == prev->this) {
+ /* merge of attr returned only from linkfile creation */
dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
}
@@ -721,6 +768,7 @@ dht_rename_create_links (call_frame_t *frame)
xlator_t *dst_hashed = NULL;
xlator_t *dst_cached = NULL;
int call_cnt = 0;
+ dict_t *xattr = NULL;
local = frame->local;
@@ -731,6 +779,7 @@ dht_rename_create_links (call_frame_t *frame)
dst_hashed = local->dst_hashed;
dst_cached = local->dst_cached;
+ DHT_MARK_FOP_INTERNAL (xattr);
if (src_cached == dst_cached) {
if (dst_hashed == dst_cached)
@@ -742,7 +791,7 @@ dht_rename_create_links (call_frame_t *frame)
STACK_WIND (frame, dht_rename_unlink_links_cbk,
dst_hashed, dst_hashed->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr);
return 0;
}
@@ -759,7 +808,7 @@ dht_rename_create_links (call_frame_t *frame)
"linkfile %s @ %s => %s",
local->loc.path, dst_hashed->name, src_cached->name);
memcpy (local->gfid, local->loc.inode->gfid, 16);
- dht_linkfile_create (frame, dht_rename_links_cbk,
+ dht_linkfile_create (frame, dht_rename_links_cbk, this,
src_cached, dst_hashed, &local->loc);
}
@@ -769,7 +818,7 @@ dht_rename_create_links (call_frame_t *frame)
local->loc2.path, src_cached->name);
STACK_WIND (frame, dht_rename_links_cbk,
src_cached, src_cached->fops->link,
- &local->loc, &local->loc2, NULL);
+ &local->loc, &local->loc2, xattr);
}
nolinks:
@@ -777,6 +826,8 @@ nolinks:
/* skip to next step */
dht_do_rename (frame);
}
+ if (xattr)
+ dict_unref (xattr);
return 0;
}
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index 22c61130f..3fe96b1c7 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -17,6 +17,7 @@
#include "glusterfs.h"
#include "xlator.h"
#include "dht-common.h"
+#include "glusterfs-acl.h"
#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \
layout->list[i].start = srt; \
@@ -28,6 +29,13 @@
layout->list[i].xlator->name, path); \
} while (0)
+#define DHT_RESET_LAYOUT_RANGE(layout) do { /* zero start and stop of every entry in layout->list */ \
+ int cnt = 0; /* block-local index; layout->cnt below is the struct member */ \
+ for (cnt = 0; cnt < layout->cnt; cnt++ ) { \
+ layout->list[cnt].start = 0; \
+ layout->list[cnt].stop = 0; \
+ } \
+ } while (0)
static uint32_t
dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n)
@@ -118,7 +126,7 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
xlator_t *this = NULL;
int32_t *disk_layout = NULL;
dht_local_t *local = NULL;
-
+ dht_conf_t *conf = NULL;
local = frame->local;
if (req_subvol)
@@ -131,6 +139,9 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
GF_VALIDATE_OR_GOTO (this->name, layout, err);
GF_VALIDATE_OR_GOTO (this->name, local, err);
GF_VALIDATE_OR_GOTO (this->name, subvol, err);
+ VALIDATE_OR_GOTO (this->private, err);
+
+ conf = this->private;
xattr = get_new_dict ();
if (!xattr) {
@@ -145,8 +156,7 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
goto err;
}
- ret = dict_set_bin (xattr, "trusted.glusterfs.dht",
- disk_layout, 4 * 4);
+ ret = dict_set_bin (xattr, conf->xattr_name, disk_layout, 4 * 4);
if (ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
"%s: (subvol %s) failed to set xattr dictionary",
@@ -237,9 +247,12 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
int missing_xattr = 0;
int i = 0;
xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *dummy = NULL;
local = frame->local;
this = frame->this;
+ conf = this->private;
for (i = 0; i < layout->cnt; i++) {
if (layout->list[i].err != -1 || !layout->list[i].stop) {
@@ -273,6 +286,18 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
if (--missing_xattr == 0)
break;
}
+ dummy = dht_layout_new (this, 1);
+ if (!dummy)
+ goto out;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (_gf_false ==
+ dht_is_subvol_in_layout (layout, conf->subvolumes[i])) {
+ dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0,
+ conf->subvolumes[i]);
+ }
+ }
+ dht_layout_unref (this, dummy);
+out:
return 0;
}
@@ -540,9 +565,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
for (i = 0; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1 || err == 0) {
- layout->list[i].err = -1;
+ if (err == -1 || err == 0 || err == ENOENT) {
+ /* Setting list[i].err = -1 is an indication for
+ dht_selfheal_layout_new_directory() to assign
+ a range. We set it to -1 based on any one of
+ the three criteria:
+
+ - err == -1 already, which means directory
+ existed but layout was not set on it.
+
+ - err == 0, which means directory exists and
+ has an old layout piece which will be
+ overwritten now.
+
+ - err == ENOENT, which means directory does
+ not exist (possibly racing with mkdir or
+ finishing half done mkdir). The missing
+ directory will be attempted to be recreated.
+
+ It is important to note that it is safe
+ to race with mkdir() as self-heal and
+ mkdir are idempotent operations. Both will
+ strive to set the directory and layouts to
+ the same final state.
+ */
count++;
+ if (!err)
+ layout->list[i].err = -1;
}
}
@@ -604,7 +653,7 @@ dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc,
if (!table) {
return;
}
- memset(table,0,sizeof(overlap)*new->cnt*new->cnt);
+ memset(table,0,sizeof(overlap)*old->cnt*new->cnt);
for (i = 0; i < new->cnt; ++i) {
for (j = 0; j < old->cnt; ++j) {
OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i);
@@ -623,6 +672,13 @@ dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc,
max_overlap = 0;
max_overlap_idx = i;
for (j = (i + 1); j < new->cnt; ++j) {
+ if (new->list[j].err > 0) {
+ /* Subvol might be marked for decommission
+ with EINVAL, or some other serious error
+ marked with positive errno.
+ */
+ continue;
+ }
/* Calculate the overlap now. */
curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j);
/* Calculate the overlap after the proposed swap. */
@@ -741,9 +797,11 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout);
+ /* clear out the range, as we are re-computing here */
+ DHT_RESET_LAYOUT_RANGE (layout);
for (i = start_subvol; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {
@@ -756,7 +814,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
for (i = 0; i < start_subvol; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
new file mode 100644
index 000000000..70aac7710
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -0,0 +1,758 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+/* TODO: add NS locking */
+
+#include "statedump.h"
+#include "dht-common.h"
+
+/* TODO:
+ - use volumename in xattr instead of "dht"
+ - use NS locks
+ - handle all cases in self heal layout reconstruction
+ - complete linkfile selfheal
+*/
+struct volume_options options[];
+
+/*
+ * Write a statedump description of `layout`, building every key from
+ * `prefix`. Nothing is written when either argument is NULL. The
+ * per-entry list (err/start/stop/xlator) is emitted only for directory
+ * layouts.
+ */
+void
+dht_layout_dump (dht_layout_t *layout, const char *prefix)
+{
+        char      key[GF_DUMP_MAX_BUF_LEN];
+        int       idx    = 0;
+        xlator_t *subvol = NULL;
+
+        if (!layout || !prefix)
+                return;
+
+        /* layout-wide fields */
+        gf_proc_dump_build_key(key, prefix, "cnt");
+        gf_proc_dump_write(key, "%d", layout->cnt);
+
+        gf_proc_dump_build_key(key, prefix, "preset");
+        gf_proc_dump_write(key, "%d", layout->preset);
+
+        gf_proc_dump_build_key(key, prefix, "gen");
+        gf_proc_dump_write(key, "%d", layout->gen);
+
+        if (layout->type != IA_INVAL) {
+                gf_proc_dump_build_key(key, prefix, "inode type");
+                gf_proc_dump_write(key, "%d", layout->type);
+        }
+
+        /* only directories carry a per-subvolume range list worth dumping */
+        if (!IA_ISDIR (layout->type))
+                return;
+
+        for (idx = 0; idx < layout->cnt; idx++) {
+                gf_proc_dump_build_key(key, prefix,"list[%d].err", idx);
+                gf_proc_dump_write(key, "%d", layout->list[idx].err);
+
+                gf_proc_dump_build_key(key, prefix,"list[%d].start", idx);
+                gf_proc_dump_write(key, "%u", layout->list[idx].start);
+
+                gf_proc_dump_build_key(key, prefix,"list[%d].stop", idx);
+                gf_proc_dump_write(key, "%u", layout->list[idx].stop);
+
+                subvol = layout->list[idx].xlator;
+                if (!subvol)
+                        continue;
+
+                gf_proc_dump_build_key(key, prefix,
+                                       "list[%d].xlator.type", idx);
+                gf_proc_dump_write(key, "%s", subvol->type);
+
+                gf_proc_dump_build_key(key, prefix,
+                                       "list[%d].xlator.name", idx);
+                gf_proc_dump_write(key, "%s", subvol->name);
+        }
+}
+
+
+/*
+ * Statedump hook for the translator's private configuration.
+ *
+ * Takes conf->subvolume_lock with TRY_LOCK so the dump never blocks; when
+ * the lock is busy the TRY_LOCK result is returned and nothing is written.
+ *
+ * Returns -1 when `this` or its private data is NULL, the TRY_LOCK result
+ * when the lock cannot be taken, and 0 after a successful dump.
+ */
+int32_t
+dht_priv_dump (xlator_t *this)
+{
+        char            key_prefix[GF_DUMP_MAX_BUF_LEN];
+        char            key[GF_DUMP_MAX_BUF_LEN];
+        char            timestr[32] = {0,};
+        int             i = 0;
+        dht_conf_t      *conf = NULL;
+        int             ret = -1;
+
+        if (!this)
+                goto out;
+
+        conf = this->private;
+        if (!conf)
+                goto out;
+
+        ret = TRY_LOCK(&conf->subvolume_lock);
+        if (ret != 0) {
+                return ret;
+        }
+
+        gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name);
+        /* NOTE(review): key_prefix is built here but not read again in this
+           function; kept in case the build macro is relied on elsewhere. */
+        gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv",
+                               this->name);
+        gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt);
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+                /* snprintf keeps every key inside the dump buffer */
+                snprintf (key, sizeof (key), "subvolumes[%d]", i);
+                gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type,
+                                   conf->subvolumes[i]->name);
+                if (conf->file_layouts && conf->file_layouts[i]){
+                        snprintf (key, sizeof (key), "file_layouts[%d]", i);
+                        dht_layout_dump(conf->file_layouts[i], key);
+                }
+                if (conf->dir_layouts && conf->dir_layouts[i]) {
+                        snprintf (key, sizeof (key), "dir_layouts[%d]", i);
+                        dht_layout_dump(conf->dir_layouts[i], key);
+                }
+                if (conf->subvolume_status) {
+
+                        snprintf (key, sizeof (key), "subvolume_status[%d]", i);
+                        gf_proc_dump_write(key, "%d",
+                                           (int)conf->subvolume_status[i]);
+                }
+
+        }
+
+        gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed);
+        gf_proc_dump_write("gen", "%d", conf->gen);
+        gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk);
+        gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes);
+        gf_proc_dump_write("disk_unit", "%c", conf->disk_unit);
+        gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval);
+        gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit);
+        if (conf ->du_stats) {
+                gf_proc_dump_write("du_stats.avail_percent", "%lf",
+                                   conf->du_stats->avail_percent);
+                gf_proc_dump_write("du_stats.avail_space", "%lu",
+                                   conf->du_stats->avail_space);
+                gf_proc_dump_write("du_stats.avail_inodes", "%lf",
+                                   conf->du_stats->avail_inodes);
+                gf_proc_dump_write("du_stats.log", "%lu", conf->du_stats->log);
+        }
+
+        /* ctime_r writes into a caller-owned buffer instead of the shared
+           static one used by ctime(); a NULL result is skipped rather than
+           handed to "%s". */
+        if (conf->last_stat_fetch.tv_sec &&
+            ctime_r (&conf->last_stat_fetch.tv_sec, timestr))
+                gf_proc_dump_write("last_stat_fetch", "%s", timestr);
+
+        UNLOCK(&conf->subvolume_lock);
+
+out:
+        return ret;
+}
+
+/*
+ * Statedump hook for a single inode: looks up the layout stored in the
+ * inode context and, when one is present, dumps it under an ".inode"
+ * section. Returns -1 for NULL arguments, otherwise the result of the
+ * context lookup.
+ */
+int32_t
+dht_inodectx_dump (xlator_t *this, inode_t *inode)
+{
+        dht_layout_t    *layout = NULL;
+        int              ret    = -1;
+
+        if (!this || !inode)
+                return ret;
+
+        ret = dht_inode_ctx_layout_get (inode, this, &layout);
+        if (ret == 0 && layout) {
+                gf_proc_dump_add_section("xlator.cluster.dht.%s.inode",
+                                         this->name);
+                dht_layout_dump(layout, "layout");
+        }
+
+        return ret;
+}
+
+/*
+ * Tear down a DHT translator instance: detach the private configuration
+ * from `this` and release the resources it owns. Safe to call when
+ * this->private is already NULL.
+ */
+void
+dht_fini (xlator_t *this)
+{
+        int         i = 0;
+        dht_conf_t *conf = NULL;
+
+        GF_VALIDATE_OR_GOTO ("dht", this, out);
+
+        conf = this->private;
+        this->private = NULL;
+        if (conf) {
+                if (conf->file_layouts) {
+                        for (i = 0; i < conf->subvolume_cnt; i++) {
+                                GF_FREE (conf->file_layouts[i]);
+                        }
+                        GF_FREE (conf->file_layouts);
+                }
+
+                GF_FREE (conf->subvolumes);
+
+                GF_FREE (conf->subvolume_status);
+
+                /* also released on dht_init()'s error path */
+                GF_FREE (conf->du_stats);
+
+                /* built with gf_asprintf() in dht_init() */
+                GF_FREE (conf->link_xattr_name);
+                GF_FREE (conf->wild_xattr_name);
+
+                /* compiled by dht_init_regex(); only valid ones hold state */
+                if (conf->rsync_regex_valid)
+                        regfree (&conf->rsync_regex);
+                if (conf->extra_regex_valid)
+                        regfree (&conf->extra_regex);
+
+                /* NOTE(review): conf->defrag and conf->xattr_name are left
+                   untouched here - confirm who owns them at teardown before
+                   freeing them in this function. */
+
+                GF_FREE (conf);
+        }
+out:
+        return;
+}
+
+/*
+ * Initialise memory accounting for this translator, sized for all DHT
+ * memory types (gf_dht_mt_end + 1). Returns -1 when `this` is NULL,
+ * otherwise the result of xlator_mem_acct_init().
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int     ret = -1;
+
+        GF_VALIDATE_OR_GOTO ("dht", this, out);
+
+        ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1);
+
+        if (ret != 0) {
+                /* the two literals used to concatenate to "initfailed" */
+                gf_log (this->name, GF_LOG_ERROR, "Memory accounting init "
+                        "failed");
+                return ret;
+        }
+out:
+        return ret;
+}
+
+
+/*
+ * Parse a comma-separated list of subvolume names and mark each matching
+ * subvolume as decommissioned in `conf`.
+ *
+ * Returns 0 when every name matched a configured subvolume (and sets
+ * conf->decommission_in_progress), or -1 on NULL input, allocation
+ * failure, or an unknown name. Subvolumes matched before an unknown name
+ * is hit stay marked.
+ */
+int
+dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf,
+                                 const char *bricks)
+{
+        int         i  = 0;
+        int         ret  = -1;
+        char        *tmpstr = NULL;
+        char        *dup_brick = NULL;
+        char        *node = NULL;
+
+        if (!conf || !bricks)
+                goto out;
+
+        /* strtok_r modifies its input, so work on a private copy */
+        dup_brick = gf_strdup (bricks);
+        if (!dup_brick)
+                goto out;
+
+        node = strtok_r (dup_brick, ",", &tmpstr);
+        while (node) {
+                for (i = 0; i < conf->subvolume_cnt; i++) {
+                        if (!strcmp (conf->subvolumes[i]->name, node)) {
+                                conf->decommissioned_bricks[i] =
+                                        conf->subvolumes[i];
+                                conf->decommission_subvols_cnt++;
+                                gf_log (this->name, GF_LOG_INFO,
+                                        "decommissioning subvolume %s",
+                                        conf->subvolumes[i]->name);
+                                break;
+                        }
+                }
+                if (i == conf->subvolume_cnt) {
+                        /* Wrong node given. */
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "unknown subvolume %s in "
+                                "decommissioned-bricks", node);
+                        goto out;
+                }
+                node = strtok_r (NULL, ",", &tmpstr);
+        }
+
+        ret = 0;
+        conf->decommission_in_progress = 1;
+out:
+        GF_FREE (dup_brick);
+
+        return ret;
+}
+
+
+/*
+ * Clear every decommissioned-brick mark in `conf`, decrementing the
+ * decommissioned subvolume counter once per cleared entry.
+ * Returns 0 on success, -1 when `conf` is NULL.
+ */
+int
+dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf)
+{
+        int     idx = 0;
+
+        if (!conf)
+                return -1;
+
+        for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+                if (!conf->decommissioned_bricks[idx])
+                        continue;
+
+                conf->decommissioned_bricks[idx] = NULL;
+                conf->decommission_subvols_cnt--;
+        }
+
+        return 0;
+}
+/*
+ * (Re)compile the regular expression configured under option `name`.
+ *
+ * When the option is absent, only "rsync-hash-regex" falls back to a
+ * built-in pattern; any other option leaves `re` and `re_valid` untouched.
+ * A previously compiled expression is released first. The value "none"
+ * leaves the regex disabled. `*re_valid` is set only when compilation
+ * succeeds.
+ */
+void
+dht_init_regex (xlator_t *this, dict_t *odict, char *name,
+                regex_t *re, gf_boolean_t *re_valid)
+{
+        char    *pattern = NULL;
+
+        if (dict_get_str (odict, name, &pattern) != 0) {
+                if (strcmp (name, "rsync-hash-regex") != 0)
+                        return;
+                pattern = "^\\.(.+)\\.[^.]+$";
+        }
+
+        /* drop whatever was compiled by an earlier call */
+        if (*re_valid) {
+                regfree (re);
+                *re_valid = _gf_false;
+        }
+
+        if (strcmp (pattern, "none") == 0)
+                return;
+
+        if (regcomp (re, pattern, REG_EXTENDED) != 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "compiling regex %s failed", pattern);
+                return;
+        }
+
+        gf_log (this->name, GF_LOG_INFO,
+                "using regex %s = %s", name, pattern);
+        *re_valid = _gf_true;
+}
+
+/*
+ * Apply a new option dictionary to a running DHT instance.
+ *
+ * Returns 0 immediately when the translator has no private data yet.
+ * Otherwise re-reads lookup-unhashed, min-free-disk, min-free-inodes,
+ * directory-layout-spread, readdir-optimize, rebalance-stats (only when
+ * conf->defrag is set), decommissioned-bricks and the two hash regexes.
+ *
+ * Returns 0 on success and -1 when `this`/`options` is NULL, when
+ * lookup-unhashed is neither "auto" nor a boolean, or when handling of
+ * decommissioned-bricks fails. The GF_OPTION_RECONF macros are given the
+ * `out` label; presumably they jump there on a bad value, leaving ret at
+ * -1 - confirm against the macro definition.
+ */
+int
+dht_reconfigure (xlator_t *this, dict_t *options)
+{
+        dht_conf_t      *conf = NULL;
+        char            *temp_str = NULL;
+        gf_boolean_t     search_unhashed;
+        int              ret = -1;
+
+        GF_VALIDATE_OR_GOTO ("dht", this, out);
+        GF_VALIDATE_OR_GOTO ("dht", options, out);
+
+        conf = this->private;
+        if (!conf)
+                return 0;
+
+        if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) {
+                /* If option is not "auto", other options _should_ be boolean*/
+                if (strcasecmp (temp_str, "auto")) {
+                        if (!gf_string2boolean (temp_str, &search_unhashed)) {
+                                gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
+                                       " lookup-unhashed reconfigured (%s)",
+                                       temp_str);
+                                conf->search_unhashed = search_unhashed;
+                        } else {
+                                gf_log(this->name, GF_LOG_ERROR, "Reconfigure:"
+                                       " lookup-unhashed should be boolean,"
+                                       " not (%s), defaulting to (%d)",
+                                       temp_str, conf->search_unhashed);
+                                //return -1;
+                                ret = -1;
+                                goto out;
+                        }
+                } else {
+                        gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
+                               " lookup-unhashed reconfigured auto ");
+                        conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
+                }
+        }
+
+        GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options,
+                          percent_or_size, out);
+        /* option can be any one of percent or bytes */
+        /* values below 100 are treated as a percentage ('p') */
+        conf->disk_unit = 0;
+        if (conf->min_free_disk < 100.0)
+                conf->disk_unit = 'p';
+
+        GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options,
+                          percent, out);
+
+        GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt,
+                          options, uint32, out);
+
+        GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options,
+                          bool, out);
+        if (conf->defrag) {
+                GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats,
+                                  options, bool, out);
+        }
+
+        /* An absent decommissioned-bricks option clears all existing
+           decommission marks. */
+        if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) {
+                ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
+                if (ret == -1)
+                        goto out;
+        } else {
+                ret = dht_decommissioned_remove (this, conf);
+                if (ret == -1)
+                        goto out;
+        }
+
+        dht_init_regex (this, options, "rsync-hash-regex",
+                        &conf->rsync_regex, &conf->rsync_regex_valid);
+        dht_init_regex (this, options, "extra-hash-regex",
+                        &conf->extra_regex, &conf->extra_regex_valid);
+
+        ret = 0;
+out:
+        return ret;
+}
+
+/*
+ * Parse the rebalance filter string in `data` and prepend one entry per
+ * item to defrag->defrag_pattern.
+ *
+ * `data` is a comma-separated list of "pattern[:size]" items; it is
+ * modified in place by strtok_r. An item that is only a size (e.g.
+ * "10MB") is stored with the pattern "*".
+ *
+ * Returns 0 on success, -1 on NULL arguments, allocation failure, an
+ * unparsable size, or a pattern that does not fit in the entry's
+ * path_pattern buffer. Entries linked before a failure stay on the list.
+ */
+static int
+gf_defrag_pattern_list_fill (xlator_t *this, gf_defrag_info_t *defrag, char *data)
+{
+        int                       ret          = -1;
+        char                     *tmp_str      = NULL;
+        char                     *tmp_str1     = NULL;
+        char                     *dup_str      = NULL;
+        char                     *num          = NULL;
+        char                     *pattern_str  = NULL;
+        char                     *pattern      = NULL;
+        size_t                    pattern_len  = 0;
+        gf_defrag_pattern_list_t *pattern_list = NULL;
+
+        if (!this || !defrag || !data)
+                goto out;
+
+        /* Get the pattern for pattern list. "pattern:<optional-size>"
+         * eg: *avi, *pdf:10MB, *:1TB
+         */
+        pattern_str = strtok_r (data, ",", &tmp_str);
+        while (pattern_str) {
+                dup_str = gf_strdup (pattern_str);
+                if (!dup_str)
+                        goto out;
+
+                pattern_list = GF_CALLOC (1, sizeof (gf_defrag_pattern_list_t),
+                                          1);
+                if (!pattern_list) {
+                        goto out;
+                }
+                pattern = strtok_r (dup_str, ":", &tmp_str1);
+                num = strtok_r (NULL, ":", &tmp_str1);
+                if (!pattern)
+                        goto out;
+                if (!num) {
+                        /* a bare size applies to every file */
+                        if (gf_string2bytesize(pattern, &pattern_list->size)
+                             == 0) {
+                                pattern = "*";
+                        }
+                } else if (gf_string2bytesize (num, &pattern_list->size) != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "invalid number format \"%s\"", num);
+                        goto out;
+                }
+
+                /* Copy exactly the pattern's own length: `pattern` may now
+                 * point at the short "*" literal, so using the length of
+                 * dup_str would read past it. The entry is zero-allocated,
+                 * which supplies the terminating NUL.
+                 */
+                pattern_len = strlen (pattern);
+                if (pattern_len >= sizeof (pattern_list->path_pattern)) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "pattern \"%s\" is too long", pattern);
+                        goto out;
+                }
+                memcpy (pattern_list->path_pattern, pattern, pattern_len);
+
+                /* push onto the head of the list */
+                pattern_list->next = defrag->defrag_pattern;
+                defrag->defrag_pattern = pattern_list;
+                pattern_list = NULL;
+
+                GF_FREE (dup_str);
+                dup_str = NULL;
+
+                pattern_str = strtok_r (NULL, ",", &tmp_str);
+        }
+
+        ret = 0;
+out:
+        if (ret)
+                GF_FREE (pattern_list);
+        GF_FREE (dup_str);
+
+        return ret;
+}
+
+/*
+ * Initialise a DHT translator instance.
+ *
+ * Allocates the private dht_conf_t, sets up the rebalance (defrag) state
+ * when a non-zero "rebalance-cmd" option is present, reads the translator
+ * options, initialises subvolumes, layouts, locks and the local memory
+ * pool, derives the link/wildcard xattr names from "xattr-name", and
+ * finally publishes the configuration in this->private.
+ *
+ * Returns 0 on success. On any failure everything reachable from the
+ * partially built conf is released under `err` and -1 is returned.
+ */
+int
+dht_init (xlator_t *this)
+{
+        dht_conf_t                      *conf           = NULL;
+        char                            *temp_str       = NULL;
+        int                              ret            = -1;
+        int                              i              = 0;
+        gf_defrag_info_t                *defrag         = NULL;
+        int                              cmd            = 0;
+        char                            *node_uuid      = NULL;
+
+
+        GF_VALIDATE_OR_GOTO ("dht", this, err);
+
+        /* Only the complete absence of children is rejected here, even
+           though the message speaks of "more than one". */
+        if (!this->children) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "Distribute needs more than one subvolume");
+                return -1;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "dangling volume. check volfile");
+        }
+
+        conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t);
+        if (!conf) {
+                goto err;
+        }
+
+        /* The lookup result is not checked; cmd stays 0 when the option is
+           missing, which skips the rebalance setup below. */
+        ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd);
+
+        if (cmd) {
+                defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t),
+                                    gf_defrag_info_mt);
+
+                GF_VALIDATE_OR_GOTO (this->name, defrag, err);
+
+                LOCK_INIT (&defrag->lock);
+
+                defrag->is_exiting = 0;
+
+                /* conf owns defrag from here on; the err path frees it */
+                conf->defrag = defrag;
+
+                ret = dict_get_str (this->options, "node-uuid", &node_uuid);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "node-uuid not "
+                                "specified");
+                        goto err;
+                }
+
+                if (uuid_parse (node_uuid, defrag->node_uuid)) {
+                        gf_log (this->name, GF_LOG_ERROR, "Cannot parse "
+                                "glusterd node uuid");
+                        goto err;
+                }
+
+                defrag->cmd = cmd;
+
+                defrag->stats = _gf_false;
+        }
+
+        conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
+        if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
+                /* If option is not "auto", other options _should_ be boolean */
+                if (strcasecmp (temp_str, "auto"))
+                        gf_string2boolean (temp_str, &conf->search_unhashed);
+                else
+                        conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
+        }
+
+        GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool,
+                        err);
+
+        GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err);
+
+        GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size,
+                        err);
+
+        GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent,
+                        err);
+
+        /* NOTE(review): conf is zero-allocated and dht_init_subvolumes() is
+           only called further down, so subvolume_cnt appears to still be 0
+           here - confirm the intended default for dir_spread_cnt. */
+        conf->dir_spread_cnt = conf->subvolume_cnt;
+        GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt,
+                        uint32, err);
+
+        GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down,
+                        bool, err);
+
+        GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err);
+
+        if (defrag) {
+                GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err);
+                if (dict_get_str (this->options, "rebalance-filter", &temp_str)
+                    == 0) {
+                        if (gf_defrag_pattern_list_fill (this, defrag, temp_str)
+                            == -1) {
+                                gf_log (this->name, GF_LOG_ERROR, "Cannot parse"
+                                        " rebalance-filter (%s)", temp_str);
+                                goto err;
+                        }
+                }
+        }
+
+        /* option can be any one of percent or bytes */
+        /* values below 100 are treated as a percentage ('p') */
+        conf->disk_unit = 0;
+        if (conf->min_free_disk < 100)
+                conf->disk_unit = 'p';
+
+        ret = dht_init_subvolumes (this, conf);
+        if (ret == -1) {
+                goto err;
+        }
+
+        if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) {
+                ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
+                if (ret == -1)
+                        goto err;
+        }
+
+        dht_init_regex (this, this->options, "rsync-hash-regex",
+                        &conf->rsync_regex, &conf->rsync_regex_valid);
+        dht_init_regex (this, this->options, "extra-hash-regex",
+                        &conf->extra_regex, &conf->extra_regex_valid);
+
+        ret = dht_layouts_init (this, conf);
+        if (ret == -1) {
+                goto err;
+        }
+
+        LOCK_INIT (&conf->subvolume_lock);
+        LOCK_INIT (&conf->layout_lock);
+
+        conf->gen = 1;
+
+        this->local_pool = mem_pool_new (dht_local_t, 512);
+        if (!this->local_pool) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to create local_t's memory pool");
+                goto err;
+        }
+
+        /* Derive "<base>.linkto" and "<base>*" from the configured xattr
+           base name; the NULL checks below catch a failed gf_asprintf. */
+        GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err);
+        gf_asprintf (&conf->link_xattr_name, "%s.linkto", conf->xattr_name);
+        gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name);
+        if (!conf->link_xattr_name || !conf->wild_xattr_name) {
+                goto err;
+        }
+
+        this->private = conf;
+
+        return 0;
+
+err:
+        /* conf may be NULL (validation or allocation failure) or only
+           partially populated; unset members are NULL from GF_CALLOC. */
+        if (conf) {
+                if (conf->file_layouts) {
+                        for (i = 0; i < conf->subvolume_cnt; i++) {
+                                GF_FREE (conf->file_layouts[i]);
+                        }
+                        GF_FREE (conf->file_layouts);
+                }
+
+                GF_FREE (conf->subvolumes);
+
+                GF_FREE (conf->subvolume_status);
+
+                GF_FREE (conf->du_stats);
+
+                GF_FREE (conf->defrag);
+
+                /* NOTE(review): xattr_name is filled by GF_OPTION_INIT(str);
+                   confirm it is heap memory owned by conf before freeing. */
+                GF_FREE (conf->xattr_name);
+                GF_FREE (conf->link_xattr_name);
+                GF_FREE (conf->wild_xattr_name);
+
+                GF_FREE (conf);
+        }
+
+        return -1;
+}
+
+
+/*
+ * Option table for the translators built from this shared source. Each
+ * entry names an option key and its type, plus an optional default,
+ * permitted values / minimum, and a user-visible description. The table
+ * ends with an entry whose key is NULL. The last two real entries are
+ * specific to the NUFA and switch variants.
+ */
+struct volume_options options[] = {
+        { .key  = {"lookup-unhashed"},
+          .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
+                    "on", "off"},
+          .type = GF_OPTION_TYPE_STR,
+          .default_value = "on",
+          .description = "This option if set to ON, does a lookup through "
+          "all the sub-volumes, in case a lookup didn't return any result "
+          "from the hash subvolume. If set to OFF, it does not do a lookup "
+          "on the remaining subvolumes."
+        },
+        { .key  = {"min-free-disk"},
+          .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
+          .default_value = "10%",
+          .description = "Percentage/Size of disk space, after which the "
+          "process starts balancing out the cluster, and logs will appear "
+          "in log files",
+        },
+        { .key  = {"min-free-inodes"},
+          .type = GF_OPTION_TYPE_PERCENT,
+          .default_value = "5%",
+          .description = "after system has only N% of inodes, warnings "
+          "starts to appear in log files",
+        },
+        { .key = {"unhashed-sticky-bit"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "off",
+        },
+        { .key = {"use-readdirp"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "on",
+          .description = "This option if set to ON, forces the use of "
+          "readdirp, and hence also displays the stats of the files."
+        },
+        { .key = {"assert-no-child-down"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "off",
+          .description = "This option if set to ON, in the event of "
+          "CHILD_DOWN, will call exit."
+        },
+        /* no default here: dht_init() seeds dir_spread_cnt before reading it */
+        { .key  = {"directory-layout-spread"},
+          .type = GF_OPTION_TYPE_INT,
+          .min  = 1,
+          .validate = GF_OPT_VALIDATE_MIN,
+          .description = "Specifies the directory layout spread."
+        },
+        /* parsed as a comma-separated list of subvolume names by
+           dht_parse_decommissioned_bricks() */
+        { .key  = {"decommissioned-bricks"},
+          .type = GF_OPTION_TYPE_ANY,
+          .description = "This option if set to ON, decommissions "
+          "the brick, so that no new data is allowed to be created "
+          "on that brick."
+        },
+        /* a non-zero value makes dht_init() set up rebalance state */
+        { .key  = {"rebalance-cmd"},
+          .type = GF_OPTION_TYPE_INT,
+        },
+        /* required by dht_init() whenever rebalance-cmd is non-zero */
+        { .key = {"node-uuid"},
+          .type = GF_OPTION_TYPE_STR,
+        },
+        { .key = {"rebalance-stats"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "off",
+          .description = "This option if set to ON displays and logs the "
+          " time taken for migration of each file, during the rebalance "
+          "process. If set to OFF, the rebalance logs will only display the "
+          "time spent in each directory."
+        },
+        { .key = {"readdir-optimize"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "off",
+          .description = "This option if set to ON enables the optimization "
+          "that allows DHT to requests non-first subvolumes to filter out "
+          "directory entries."
+        },
+        { .key = {"rsync-hash-regex"},
+          .type = GF_OPTION_TYPE_STR,
+          /* Setting a default here doesn't work.  See dht_init_regex. */
+          .description = "Regular expression for stripping temporary-file "
+          "suffix and prefix used by rsync, to prevent relocation when the "
+          "file is renamed."
+        },
+        { .key = {"extra-hash-regex"},
+          .type = GF_OPTION_TYPE_STR,
+          /* Setting a default here doesn't work.  See dht_init_regex. */
+          .description = "Regular expression for stripping temporary-file "
+          "suffix and prefix used by an application, to prevent relocation when "
+          "the file is renamed."
+        },
+        /* comma-separated "pattern[:size]" items, see
+           gf_defrag_pattern_list_fill() */
+        { .key = {"rebalance-filter"},
+          .type = GF_OPTION_TYPE_STR,
+        },
+
+        { .key = {"xattr-name"},
+          .type = GF_OPTION_TYPE_STR,
+          .default_value = "trusted.glusterfs.dht",
+          .description = "Base for extended attributes used by this "
+          "translator instance, to avoid conflicts with others above or "
+          "below it."
+        },
+
+        /* NUFA option */
+        { .key  = {"local-volume-name"},
+          .type = GF_OPTION_TYPE_XLATOR
+        },
+
+        /* switch option */
+        { .key  = {"pattern.switch.case"},
+          .type = GF_OPTION_TYPE_ANY
+        },
+
+        /* terminator */
+        { .key  = {NULL} },
+};
diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c
index 14f3eb1d1..fc0ca2f77 100644
--- a/xlators/cluster/dht/src/dht.c
+++ b/xlators/cluster/dht/src/dht.c
@@ -14,621 +14,15 @@
#include "config.h"
#endif
-/* TODO: add NS locking */
-
#include "statedump.h"
#include "dht-common.h"
-/* TODO:
- - use volumename in xattr instead of "dht"
- - use NS locks
- - handle all cases in self heal layout reconstruction
- - complete linkfile selfheal
-*/
-struct volume_options options[];
-
-void
-dht_layout_dump (dht_layout_t *layout, const char *prefix)
-{
-
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 0;
-
- if (!layout)
- goto out;
- if (!prefix)
- goto out;
-
- gf_proc_dump_build_key(key, prefix, "cnt");
- gf_proc_dump_write(key, "%d", layout->cnt);
- gf_proc_dump_build_key(key, prefix, "preset");
- gf_proc_dump_write(key, "%d", layout->preset);
- gf_proc_dump_build_key(key, prefix, "gen");
- gf_proc_dump_write(key, "%d", layout->gen);
- if (layout->type != IA_INVAL) {
- gf_proc_dump_build_key(key, prefix, "inode type");
- gf_proc_dump_write(key, "%d", layout->type);
- }
-
- if (!IA_ISDIR (layout->type))
- goto out;
-
- for (i = 0; i < layout->cnt; i++) {
- gf_proc_dump_build_key(key, prefix,"list[%d].err", i);
- gf_proc_dump_write(key, "%d", layout->list[i].err);
- gf_proc_dump_build_key(key, prefix,"list[%d].start", i);
- gf_proc_dump_write(key, "%u", layout->list[i].start);
- gf_proc_dump_build_key(key, prefix,"list[%d].stop", i);
- gf_proc_dump_write(key, "%u", layout->list[i].stop);
- if (layout->list[i].xlator) {
- gf_proc_dump_build_key(key, prefix,
- "list[%d].xlator.type", i);
- gf_proc_dump_write(key, "%s",
- layout->list[i].xlator->type);
- gf_proc_dump_build_key(key, prefix,
- "list[%d].xlator.name", i);
- gf_proc_dump_write(key, "%s",
- layout->list[i].xlator->name);
- }
- }
-
-out:
- return;
-}
-
-
-int32_t
-dht_priv_dump (xlator_t *this)
-{
- char key_prefix[GF_DUMP_MAX_BUF_LEN];
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 0;
- dht_conf_t *conf = NULL;
- int ret = -1;
-
- if (!this)
- goto out;
-
- conf = this->private;
- if (!conf)
- goto out;
-
- ret = TRY_LOCK(&conf->subvolume_lock);
- if (ret != 0) {
- return ret;
- }
-
- gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name);
- gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv",
- this->name);
- gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt);
- for (i = 0; i < conf->subvolume_cnt; i++) {
- sprintf (key, "subvolumes[%d]", i);
- gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type,
- conf->subvolumes[i]->name);
- if (conf->file_layouts && conf->file_layouts[i]){
- sprintf (key, "file_layouts[%d]", i);
- dht_layout_dump(conf->file_layouts[i], key);
- }
- if (conf->dir_layouts && conf->dir_layouts[i]) {
- sprintf (key, "dir_layouts[%d]", i);
- dht_layout_dump(conf->dir_layouts[i], key);
- }
- if (conf->subvolume_status) {
-
- sprintf (key, "subvolume_status[%d]", i);
- gf_proc_dump_write(key, "%d",
- (int)conf->subvolume_status[i]);
- }
-
- }
-
- gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed);
- gf_proc_dump_write("gen", "%d", conf->gen);
- gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk);
- gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes);
- gf_proc_dump_write("disk_unit", "%c", conf->disk_unit);
- gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval);
- gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit);
- if (conf ->du_stats) {
- gf_proc_dump_write("du_stats.avail_percent", "%lf",
- conf->du_stats->avail_percent);
- gf_proc_dump_write("du_stats.avail_space", "%lu",
- conf->du_stats->avail_space);
- gf_proc_dump_write("du_stats.avail_inodes", "%lf",
- conf->du_stats->avail_inodes);
- gf_proc_dump_write("du_stats.log", "%lu", conf->du_stats->log);
- }
-
- if (conf->last_stat_fetch.tv_sec)
- gf_proc_dump_write("last_stat_fetch", "%s",
- ctime(&conf->last_stat_fetch.tv_sec));
-
- UNLOCK(&conf->subvolume_lock);
-
-out:
- return ret;
-}
-
-int32_t
-dht_inodectx_dump (xlator_t *this, inode_t *inode)
-{
- int ret = -1;
- dht_layout_t *layout = NULL;
-
- if (!this)
- goto out;
- if (!inode)
- goto out;
-
- ret = dht_inode_ctx_layout_get (inode, this, &layout);
-
- if ((ret != 0) || !layout)
- return ret;
-
- gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name);
- dht_layout_dump(layout, "layout");
-
-out:
- return ret;
-}
-
-int
-notify (xlator_t *this, int event, void *data, ...)
-{
- int ret = -1;
- va_list ap;
- dict_t *output = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
-
-
- if (!data)
- goto out;
-
- va_start (ap, data);
- output = va_arg (ap, dict_t*);
-
- ret = dht_notify (this, event, data, output);
-
-out:
- return ret;
-}
-
-void
-fini (xlator_t *this)
-{
- int i = 0;
- dht_conf_t *conf = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
-
- conf = this->private;
- this->private = NULL;
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf);
- }
-out:
- return;
-}
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
-
- ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
- }
-out:
- return ret;
-}
-
-
-int
-dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf,
- const char *bricks)
-{
- int i = 0;
- int ret = -1;
- char *tmpstr = NULL;
- char *dup_brick = NULL;
- char *node = NULL;
-
- if (!conf || !bricks)
- goto out;
-
- dup_brick = gf_strdup (bricks);
- node = strtok_r (dup_brick, ",", &tmpstr);
- while (node) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (!strcmp (conf->subvolumes[i]->name, node)) {
- conf->decommissioned_bricks[i] =
- conf->subvolumes[i];
- conf->decommission_subvols_cnt++;
- gf_log (this->name, GF_LOG_INFO,
- "decommissioning subvolume %s",
- conf->subvolumes[i]->name);
- break;
- }
- }
- if (i == conf->subvolume_cnt) {
- /* Wrong node given. */
- goto out;
- }
- node = strtok_r (NULL, ",", &tmpstr);
- }
-
- ret = 0;
- conf->decommission_in_progress = 1;
-out:
- GF_FREE (dup_brick);
-
- return ret;
-}
-
-void
-dht_init_regex (xlator_t *this, dict_t *odict, char *name,
- regex_t *re, gf_boolean_t *re_valid)
-{
- char *temp_str;
-
- if (dict_get_str (odict, name, &temp_str) != 0) {
- if (strcmp(name,"rsync-hash-regex")) {
- return;
- }
- temp_str = "^\\.(.+)\\.[^.]+$";
- }
-
- if (*re_valid) {
- regfree(re);
- *re_valid = _gf_false;
- }
-
- if (!strcmp(temp_str,"none")) {
- return;
- }
-
- if (regcomp(re,temp_str,REG_EXTENDED) == 0) {
- gf_log (this->name, GF_LOG_INFO,
- "using regex %s = %s", name, temp_str);
- *re_valid = _gf_true;
- }
- else {
- gf_log (this->name, GF_LOG_WARNING,
- "compiling regex %s failed", temp_str);
- }
-}
-
-int
-reconfigure (xlator_t *this, dict_t *options)
-{
- dht_conf_t *conf = NULL;
- char *temp_str = NULL;
- gf_boolean_t search_unhashed;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", options, out);
-
- conf = this->private;
- if (!conf)
- return 0;
-
- if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean*/
- if (strcasecmp (temp_str, "auto")) {
- if (!gf_string2boolean (temp_str, &search_unhashed)) {
- gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
- " lookup-unhashed reconfigured (%s)",
- temp_str);
- conf->search_unhashed = search_unhashed;
- } else {
- gf_log(this->name, GF_LOG_ERROR, "Reconfigure:"
- " lookup-unhashed should be boolean,"
- " not (%s), defaulting to (%d)",
- temp_str, conf->search_unhashed);
- //return -1;
- ret = -1;
- goto out;
- }
- } else {
- gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
- " lookup-unhashed reconfigured auto ");
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
- }
-
- GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options,
- percent_or_size, out);
- /* option can be any one of percent or bytes */
- conf->disk_unit = 0;
- if (conf->min_free_disk < 100.0)
- conf->disk_unit = 'p';
-
- GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options,
- percent, out);
-
- GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt,
- options, uint32, out);
-
- GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options,
- bool, out);
- if (conf->defrag) {
- GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats,
- options, bool, out);
- }
-
- if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) {
- ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
- if (ret == -1)
- goto out;
- }
-
- dht_init_regex (this, options, "rsync-hash-regex",
- &conf->rsync_regex, &conf->rsync_regex_valid);
- dht_init_regex (this, options, "extra-hash-regex",
- &conf->extra_regex, &conf->extra_regex_valid);
-
- ret = 0;
-out:
- return ret;
-}
-
-static int
-gf_defrag_pattern_list_fill (xlator_t *this, gf_defrag_info_t *defrag, char *data)
-{
- int ret = -1;
- char *tmp_str = NULL;
- char *tmp_str1 = NULL;
- char *dup_str = NULL;
- char *num = NULL;
- char *pattern_str = NULL;
- char *pattern = NULL;
- gf_defrag_pattern_list_t *temp_list = NULL;
- gf_defrag_pattern_list_t *pattern_list = NULL;
-
- if (!this || !defrag || !data)
- goto out;
-
- /* Get the pattern for pattern list. "pattern:<optional-size>"
- * eg: *avi, *pdf:10MB, *:1TB
- */
- pattern_str = strtok_r (data, ",", &tmp_str);
- while (pattern_str) {
- dup_str = gf_strdup (pattern_str);
- pattern_list = GF_CALLOC (1, sizeof (gf_defrag_pattern_list_t),
- 1);
- if (!pattern_list) {
- goto out;
- }
- pattern = strtok_r (dup_str, ":", &tmp_str1);
- num = strtok_r (NULL, ":", &tmp_str1);
- if (!pattern)
- goto out;
- if (!num) {
- if (gf_string2bytesize(pattern, &pattern_list->size)
- == 0) {
- pattern = "*";
- }
- } else if (gf_string2bytesize (num, &pattern_list->size) != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "invalid number format \"%s\"", num);
- goto out;
- }
- memcpy (pattern_list->path_pattern, pattern, strlen (dup_str));
-
- if (!defrag->defrag_pattern)
- temp_list = NULL;
- else
- temp_list = defrag->defrag_pattern;
-
- pattern_list->next = temp_list;
-
- defrag->defrag_pattern = pattern_list;
- pattern_list = NULL;
-
- GF_FREE (dup_str);
- dup_str = NULL;
-
- pattern_str = strtok_r (NULL, ",", &tmp_str);
- }
-
- ret = 0;
-out:
- if (ret)
- GF_FREE (pattern_list);
- GF_FREE (dup_str);
-
- return ret;
-}
-
-int
-init (xlator_t *this)
-{
- dht_conf_t *conf = NULL;
- char *temp_str = NULL;
- int ret = -1;
- int i = 0;
- gf_defrag_info_t *defrag = NULL;
- int cmd = 0;
- char *node_uuid = NULL;
-
-
- GF_VALIDATE_OR_GOTO ("dht", this, err);
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "Distribute needs more than one subvolume");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile");
- }
-
- conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t);
- if (!conf) {
- goto err;
- }
-
- ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd);
-
- if (cmd) {
- defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t),
- gf_defrag_info_mt);
-
- GF_VALIDATE_OR_GOTO (this->name, defrag, err);
-
- LOCK_INIT (&defrag->lock);
-
- defrag->is_exiting = 0;
-
- conf->defrag = defrag;
-
- ret = dict_get_str (this->options, "node-uuid", &node_uuid);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "node-uuid not "
- "specified");
- goto err;
- }
-
- if (uuid_parse (node_uuid, defrag->node_uuid)) {
- gf_log (this->name, GF_LOG_ERROR, "Cannot parse "
- "glusterd node uuid");
- goto err;
- }
-
- defrag->cmd = cmd;
-
- defrag->stats = _gf_false;
- }
-
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
- if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean */
- if (strcasecmp (temp_str, "auto"))
- gf_string2boolean (temp_str, &conf->search_unhashed);
- else
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
-
- GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool,
- err);
-
- GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err);
-
- GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size,
- err);
-
- GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent,
- err);
-
- conf->dir_spread_cnt = conf->subvolume_cnt;
- GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt,
- uint32, err);
-
- GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down,
- bool, err);
-
- GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err);
-
- if (defrag) {
- GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err);
- if (dict_get_str (this->options, "rebalance-filter", &temp_str)
- == 0) {
- if (gf_defrag_pattern_list_fill (this, defrag, temp_str)
- == -1) {
- gf_log (this->name, GF_LOG_ERROR, "Cannot parse"
- " rebalance-filter (%s)", temp_str);
- goto err;
- }
- }
- }
-
- /* option can be any one of percent or bytes */
- conf->disk_unit = 0;
- if (conf->min_free_disk < 100)
- conf->disk_unit = 'p';
-
- ret = dht_init_subvolumes (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) {
- ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
- if (ret == -1)
- goto err;
- }
-
- dht_init_regex (this, this->options, "rsync-hash-regex",
- &conf->rsync_regex, &conf->rsync_regex_valid);
- dht_init_regex (this, this->options, "extra-hash-regex",
- &conf->extra_regex, &conf->extra_regex_valid);
-
- ret = dht_layouts_init (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- LOCK_INIT (&conf->subvolume_lock);
- LOCK_INIT (&conf->layout_lock);
-
- conf->gen = 1;
-
- this->local_pool = mem_pool_new (dht_local_t, 512);
- if (!this->local_pool) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to create local_t's memory pool");
- goto err;
- }
-
- this->private = conf;
-
- return 0;
-
-err:
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf->du_stats);
-
- GF_FREE (conf->defrag);
-
- GF_FREE (conf);
- }
-
- return -1;
-}
-
+class_methods_t class_methods = {
+ .init = dht_init,
+ .fini = dht_fini,
+ .reconfigure = dht_reconfigure,
+ .notify = dht_notify
+};
struct xlator_fops fops = {
.lookup = dht_lookup,
@@ -676,6 +70,9 @@ struct xlator_fops fops = {
.fxattrop = dht_fxattrop,
.setattr = dht_setattr,
.fsetattr = dht_fsetattr,
+ .fallocate = dht_fallocate,
+ .discard = dht_discard,
+ .zerofill = dht_zerofill,
};
struct xlator_dumpops dumpops = {
@@ -689,98 +86,4 @@ struct xlator_cbks cbks = {
// .releasedir = dht_releasedir,
.forget = dht_forget
};
-
-
-struct volume_options options[] = {
- { .key = {"lookup-unhashed"},
- .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
- "on", "off"},
- .type = GF_OPTION_TYPE_STR,
- .default_value = "on",
- .description = "This option if set to ON, does a lookup through "
- "all the sub-volumes, in case a lookup didn't return any result "
- "from the hash subvolume. If set to OFF, it does not do a lookup "
- "on the remaining subvolumes."
- },
- { .key = {"min-free-disk"},
- .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
- .default_value = "10%",
- .description = "Percentage/Size of disk space, after which the "
- "process starts balancing out the cluster, and logs will appear "
- "in log files",
- },
- { .key = {"min-free-inodes"},
- .type = GF_OPTION_TYPE_PERCENT,
- .default_value = "5%",
- .description = "after system has only N% of inodes, warnings "
- "starts to appear in log files",
- },
- { .key = {"unhashed-sticky-bit"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- },
- { .key = {"use-readdirp"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
- .description = "This option if set to ON, forces the use of "
- "readdirp, and hence also displays the stats of the files."
- },
- { .key = {"assert-no-child-down"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- .description = "This option if set to ON, in the event of "
- "CHILD_DOWN, will call exit."
- },
- { .key = {"directory-layout-spread"},
- .type = GF_OPTION_TYPE_INT,
- .min = 1,
- .validate = GF_OPT_VALIDATE_MIN,
- .description = "Specifies the directory layout spread."
- },
- { .key = {"decommissioned-bricks"},
- .type = GF_OPTION_TYPE_ANY,
- .description = "This option if set to ON, decommissions "
- "the brick, so that no new data is allowed to be created "
- "on that brick."
- },
- { .key = {"rebalance-cmd"},
- .type = GF_OPTION_TYPE_INT,
- },
- { .key = {"node-uuid"},
- .type = GF_OPTION_TYPE_STR,
- },
- { .key = {"rebalance-stats"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- .description = "This option if set to ON displays and logs the "
- " time taken for migration of each file, during the rebalance "
- "process. If set to OFF, the rebalance logs will only display the "
- "time spent in each directory."
- },
- { .key = {"readdir-optimize"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- .description = "This option if set to ON enables the optimization "
- "that allows DHT to requests non-first subvolumes to filter out "
- "directory entries."
- },
- { .key = {"rsync-hash-regex"},
- .type = GF_OPTION_TYPE_STR,
- /* Setting a default here doesn't work. See dht_init_regex. */
- .description = "Regular expression for stripping temporary-file "
- "suffix and prefix used by rsync, to prevent relocation when the "
- "file is renamed."
- },
- { .key = {"extra-hash-regex"},
- .type = GF_OPTION_TYPE_STR,
- /* Setting a default here doesn't work. See dht_init_regex. */
- .description = "Regular expression for stripping temporary-file "
- "suffix and prefix used by an application, to prevent relocation when "
- "the file is renamed."
- },
- { .key = {"rebalance-filter"},
- .type = GF_OPTION_TYPE_STR,
- },
-
- { .key = {NULL} },
-};
+;
diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c
index 52086e268..e934acdf0 100644
--- a/xlators/cluster/dht/src/nufa.c
+++ b/xlators/cluster/dht/src/nufa.c
@@ -18,6 +18,8 @@
/* TODO: all 'TODO's in dht.c holds good */
+extern struct volume_options options[];
+
int
nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno,
@@ -52,7 +54,8 @@ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == -1)
goto out;
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
is_dir = check_is_dir (inode, stbuf, xattr);
if (!is_dir && !is_linkfile) {
@@ -201,7 +204,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this,
* revalidates directly go to the cached-subvolume.
*/
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"Failed to set dict value.");
@@ -222,7 +225,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this,
} else {
do_fresh_lookup:
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"Failed to set dict value.");
@@ -231,7 +234,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this,
}
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht.linkto", 256);
+ conf->link_xattr_name, 256);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"Failed to set dict value.");
@@ -320,7 +323,8 @@ nufa_create (call_frame_t *frame, xlator_t *this,
if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) {
avail_subvol =
dht_free_disk_available_subvol (this,
- (xlator_t *)conf->private);
+ (xlator_t *)conf->private,
+ local);
}
if (subvol != avail_subvol) {
@@ -330,9 +334,8 @@ nufa_create (call_frame_t *frame, xlator_t *this,
local->flags = flags;
local->umask = umask;
local->cached_subvol = avail_subvol;
- dht_linkfile_create (frame,
- nufa_create_linkfile_create_cbk,
- avail_subvol, subvol, loc);
+ dht_linkfile_create (frame, nufa_create_linkfile_create_cbk,
+ this, avail_subvol, subvol, loc);
return 0;
}
@@ -425,7 +428,8 @@ nufa_mknod (call_frame_t *frame, xlator_t *this,
if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) {
avail_subvol =
dht_free_disk_available_subvol (this,
- (xlator_t *)conf->private);
+ (xlator_t *)conf->private,
+ local);
}
if (avail_subvol != subvol) {
@@ -437,7 +441,7 @@ nufa_mknod (call_frame_t *frame, xlator_t *this,
local->rdev = rdev;
local->cached_subvol = avail_subvol;
- dht_linkfile_create (frame, nufa_mknod_linkfile_cbk,
+ dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, this,
avail_subvol, subvol, loc);
return 0;
}
@@ -460,42 +464,6 @@ err:
}
-int
-notify (xlator_t *this, int event, void *data, ...)
-{
- int ret = -1;
-
- ret = dht_notify (this, event, data);
-
- return ret;
-}
-
-void
-fini (xlator_t *this)
-{
- int i = 0;
- dht_conf_t *conf = NULL;
-
- conf = this->private;
-
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf);
- }
-
- return;
-}
-
gf_boolean_t
same_first_part (char *str1, char term1, char *str2, char term2)
{
@@ -516,201 +484,150 @@ same_first_part (char *str1, char term1, char *str2, char term2)
}
}
-int32_t
-mem_acct_init (xlator_t *this)
+typedef struct nufa_args {
+ xlator_t *this;
+ char *volname;
+ gf_boolean_t addr_match;
+} nufa_args_t;
+
+static void
+nufa_find_local_brick (xlator_t *xl, void *data)
{
- int ret = -1;
+ nufa_args_t *args = data;
+ xlator_t *this = args->this;
+ char *local_volname = args->volname;
+ gf_boolean_t addr_match = args->addr_match;
+ char *brick_host = NULL;
+ dht_conf_t *conf = this->private;
+ int ret = -1;
+
+ /*This means a local subvol was already found. We pick the first brick
+ * that is local*/
+ if (conf->private)
+ return;
+
+ if (strcmp (xl->name, local_volname) == 0) {
+ conf->private = xl;
+ gf_log (this->name, GF_LOG_INFO, "Using specified subvol %s",
+ local_volname);
+ return;
+ }
+
+ if (!addr_match)
+ return;
+
+ ret = dict_get_str (xl->options, "remote-host", &brick_host);
+ if ((ret == 0) &&
+ (gf_is_same_address (local_volname, brick_host) ||
+ gf_is_local_addr (brick_host))) {
+ conf->private = xl;
+ gf_log (this->name, GF_LOG_INFO, "Using the first local "
+ "subvol %s", xl->name);
+ return;
+ }
- ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1);
- return ret;
}
-int
-init (xlator_t *this)
+static void
+nufa_to_dht (xlator_t *this)
{
- dht_conf_t *conf = NULL;
- xlator_list_t *trav = NULL;
- data_t *data = NULL;
- char *local_volname = NULL;
- char *temp_str = NULL;
- int ret = -1;
- int i = 0;
- char my_hostname[256];
- double temp_free_disk = 0;
- uint64_t size = 0;
- xlator_t *local_subvol = NULL;
- char *brick_host = NULL;
- xlator_t *kid = NULL;
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "NUFA needs more than one subvolume");
- return -1;
- }
+ GF_ASSERT (this);
+ GF_ASSERT (this->fops);
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile");
- }
+ this->fops->lookup = dht_lookup;
+ this->fops->create = dht_create;
+ this->fops->mknod = dht_mknod;
+}
- conf = GF_CALLOC (1, sizeof (*conf),
- gf_dht_mt_dht_conf_t);
- if (!conf) {
- goto err;
+int
+nufa_find_local_subvol (xlator_t *this,
+ void (*fn) (xlator_t *each, void* data), void *data)
+{
+ int ret = -1;
+ dht_conf_t *conf = this->private;
+ xlator_list_t *trav = NULL;
+ xlator_t *parent = NULL;
+ xlator_t *candidate = NULL;
+
+ xlator_foreach_depth_first (this, fn, data);
+ if (!conf->private) {
+ gf_log (this->name, GF_LOG_ERROR, "Couldn't find a local "
+ "brick");
+ return -1;
}
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
- if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean */
- if (strcasecmp (temp_str, "auto"))
- gf_string2boolean (temp_str, &conf->search_unhashed);
- else
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
+ candidate = conf->private;
+ trav = candidate->parents;
+ while (trav) {
- ret = dht_init_subvolumes (this, conf);
- if (ret == -1) {
- goto err;
- }
+ parent = trav->xlator;
+ if (strcmp (parent->type, "cluster/nufa") == 0) {
+ gf_log (this->name, GF_LOG_INFO, "Found local subvol, "
+ "%s", candidate->name);
+ ret = 0;
+ conf->private = candidate;
+ break;
+ }
- ret = dht_layouts_init (this, conf);
- if (ret == -1) {
- goto err;
+ candidate = parent;
+ trav = parent->parents;
}
- LOCK_INIT (&conf->subvolume_lock);
- LOCK_INIT (&conf->layout_lock);
+ return ret;
+}
- conf->gen = 1;
+int
+nufa_init (xlator_t *this)
+{
+ data_t *data = NULL;
+ char *local_volname = NULL;
+ int ret = -1;
+ char my_hostname[256];
+ gf_boolean_t addr_match = _gf_false;
+ nufa_args_t args = {0, };
- local_volname = "localhost";
- ret = gethostname (my_hostname, 256);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "could not find hostname (%s)",
- strerror (errno));
+ ret = dht_init(this);
+ if (ret) {
+ return ret;
}
- if (ret == 0)
- local_volname = my_hostname;
-
- data = dict_get (this->options, "local-volume-name");
- if (data) {
+ if ((data = dict_get (this->options, "local-volume-name"))) {
local_volname = data->data;
- }
- for (trav = this->children; trav; trav = trav->next) {
- if (strcmp (trav->xlator->name, local_volname) == 0)
- break;
- if (local_subvol) {
- continue;
- }
- kid = trav->xlator;
- for (;;) {
- if (dict_get_str(trav->xlator->options,"remote-host",
- &brick_host) == 0) {
- /* Found it. */
- break;
- }
- if (!kid->children) {
- /* Nowhere further to look. */
- gf_log (this->name, GF_LOG_ERROR,
- "could not get remote-host");
- goto err;
- }
- if (kid->children->next) {
- /* Multiple choices, can't/shouldn't decide. */
- gf_log (this->name, GF_LOG_ERROR,
- "NUFA found fan-out (type %s) volume",
- kid->type);
- goto err;
- }
- /* One-to-one xlators are OK, try the next one. */
- kid = kid->children->xlator;
- }
- if (same_first_part(my_hostname,'.',brick_host,'.')) {
- local_subvol = trav->xlator;
- }
- }
-
- if (trav) {
- gf_log (this->name, GF_LOG_INFO,
- "Using specified subvol %s", local_volname);
- conf->private = trav->xlator;
- }
- else if (local_subvol) {
- gf_log (this->name, GF_LOG_INFO,
- "Using first local subvol %s", local_subvol->name);
- conf->private = local_subvol;
- }
- else {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not find specified or local subvol");
- goto err;
-
- }
+ } else {
+ addr_match = _gf_true;
+ local_volname = "localhost";
+ ret = gethostname (my_hostname, 256);
+ if (ret == 0)
+ local_volname = my_hostname;
- /* The volume specified exists */
-
- conf->min_free_disk = 10;
- conf->disk_unit = 'p';
-
- if (dict_get_str (this->options, "min-free-disk",
- &temp_str) == 0) {
- if (gf_string2percent (temp_str,
- &temp_free_disk) == 0) {
- if (temp_free_disk > 100) {
- gf_string2bytesize (temp_str, &size);
- conf->min_free_disk = size;
- conf->disk_unit = 'b';
- } else {
- conf->min_free_disk = temp_free_disk;
- conf->disk_unit = 'p';
- }
- } else {
- gf_string2bytesize (temp_str, &size);
- conf->min_free_disk = size;
- conf->disk_unit = 'b';
- }
- }
+ else
+ gf_log (this->name, GF_LOG_WARNING,
+ "could not find hostname (%s)",
+ strerror (errno));
- conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t),
- gf_dht_mt_dht_du_t);
- if (!conf->du_stats) {
- goto err;
}
- this->local_pool = mem_pool_new (dht_local_t, 128);
- if (!this->local_pool) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to create local_t's memory pool");
- goto err;
+ args.this = this;
+ args.volname = local_volname;
+ args.addr_match = addr_match;
+ ret = nufa_find_local_subvol (this, nufa_find_local_brick, &args);
+ if (ret) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Unable to find local subvolume, switching "
+ "to dht mode");
+ nufa_to_dht (this);
}
-
- this->private = conf;
-
return 0;
+}
-err:
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf->du_stats);
-
- GF_FREE (conf);
- }
- return -1;
-}
+class_methods_t class_methods = {
+ .init = nufa_init,
+ .fini = dht_fini,
+ .reconfigure = dht_reconfigure,
+ .notify = dht_notify
+};
struct xlator_fops fops = {
@@ -757,19 +674,3 @@ struct xlator_fops fops = {
struct xlator_cbks cbks = {
.forget = dht_forget
};
-
-
-struct volume_options options[] = {
- { .key = {"lookup-unhashed"},
- .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
- "on", "off"},
- .type = GF_OPTION_TYPE_STR
- },
- { .key = {"local-volume-name"},
- .type = GF_OPTION_TYPE_XLATOR
- },
- { .key = {"min-free-disk"},
- .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
- },
- { .key = {NULL} },
-};
diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c
index 6ec343102..d3ea90ba8 100644
--- a/xlators/cluster/dht/src/switch.c
+++ b/xlators/cluster/dht/src/switch.c
@@ -22,6 +22,8 @@
#include <fnmatch.h>
#include <string.h>
+extern struct volume_options options[];
+
struct switch_sched_array {
xlator_t *xl;
int32_t eligible;
@@ -135,7 +137,8 @@ switch_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == -1)
goto out;
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
is_dir = check_is_dir (inode, stbuf, xattr);
if (!is_dir && !is_linkfile) {
@@ -289,11 +292,11 @@ switch_lookup (call_frame_t *frame, xlator_t *this,
* attribute, revalidates directly go to the cached-subvolume.
*/
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (ret < 0)
gf_log (this->name, GF_LOG_WARNING,
- "failed to set dict value for "
- "trusted.glusterfs.dht");
+ "failed to set dict value for %s",
+ conf->xattr_name);
for (i = 0; i < layout->cnt; i++) {
subvol = layout->list[i].xlator;
@@ -308,18 +311,18 @@ switch_lookup (call_frame_t *frame, xlator_t *this,
} else {
do_fresh_lookup:
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (ret < 0)
gf_log (this->name, GF_LOG_WARNING,
- "failed to set dict value for "
- "trusted.glusterfs.dht");
+ "failed to set dict value for %s",
+ conf->xattr_name);
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht.linkto", 256);
+ conf->link_xattr_name, 256);
if (ret < 0)
gf_log (this->name, GF_LOG_WARNING,
- "failed to set dict value for "
- "trusted.glusterfs.dht.linkto");
+ "failed to set dict value for %s",
+ conf->link_xattr_name);
if (!hashed_subvol) {
gf_log (this->name, GF_LOG_DEBUG,
@@ -434,7 +437,8 @@ switch_create (call_frame_t *frame, xlator_t *this,
avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol);
if (dht_is_subvol_filled (this, avail_subvol)) {
avail_subvol =
- dht_free_disk_available_subvol (this, avail_subvol);
+ dht_free_disk_available_subvol (this, avail_subvol,
+ local);
}
if (subvol != avail_subvol) {
@@ -443,9 +447,8 @@ switch_create (call_frame_t *frame, xlator_t *this,
local->flags = flags;
local->umask = umask;
local->cached_subvol = avail_subvol;
- dht_linkfile_create (frame,
- switch_create_linkfile_create_cbk,
- avail_subvol, subvol, loc);
+ dht_linkfile_create (frame, switch_create_linkfile_create_cbk,
+ this, avail_subvol, subvol, loc);
return 0;
}
@@ -534,7 +537,8 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol);
if (dht_is_subvol_filled (this, avail_subvol)) {
avail_subvol =
- dht_free_disk_available_subvol (this, avail_subvol);
+ dht_free_disk_available_subvol (this, avail_subvol,
+ local);
}
if (avail_subvol != subvol) {
@@ -547,7 +551,7 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
local->cached_subvol = avail_subvol;
dht_linkfile_create (frame, switch_mknod_linkfile_cbk,
- avail_subvol, subvol, loc);
+ this, avail_subvol, subvol, loc);
return 0;
}
@@ -569,20 +573,9 @@ err:
}
-int
-notify (xlator_t *this, int event, void *data, ...)
-{
- int ret = -1;
-
- ret = dht_notify (this, event, data);
-
- return ret;
-}
-
void
-fini (xlator_t *this)
+switch_fini (xlator_t *this)
{
- int i = 0;
dht_conf_t *conf = NULL;
struct switch_struct *trav = NULL;
struct switch_struct *prev = NULL;
@@ -598,22 +591,9 @@ fini (xlator_t *this)
trav = trav->next;
GF_FREE (prev);
}
-
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf);
}
- return;
+ dht_fini(this);
}
int
@@ -839,68 +819,18 @@ err:
}
-int
-init (xlator_t *this)
+int32_t
+switch_init (xlator_t *this)
{
dht_conf_t *conf = NULL;
data_t *data = NULL;
- char *temp_str = NULL;
int ret = -1;
- int i = 0;
- double temp_free_disk = 0;
- uint64_t size = 0;
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "SWITCH needs more than one subvolume");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile");
- }
-
- conf = GF_CALLOC (1, sizeof (*conf), gf_switch_mt_dht_conf_t);
- if (!conf) {
- goto err;
- }
-
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
- if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean */
- if (strcasecmp (temp_str, "auto"))
- gf_string2boolean (temp_str, &conf->search_unhashed);
- else
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
- conf->unhashed_sticky_bit = 0;
- if (dict_get_str (this->options, "unhashed-sticky-bit",
- &temp_str) == 0) {
- gf_string2boolean (temp_str, &conf->unhashed_sticky_bit);
- }
-
- conf->min_free_disk = 10.0;
- conf->disk_unit = 'p';
-
- if (dict_get_str (this->options, "min-free-disk",
- &temp_str) == 0) {
- if (gf_string2percent (temp_str, &temp_free_disk) == 0) {
- if (temp_free_disk > 100) {
- gf_string2bytesize (temp_str, &size);
- conf->min_free_disk = size;
- conf->disk_unit = 'b';
- } else {
- conf->min_free_disk = temp_free_disk;
- conf->disk_unit = 'p';
- }
- } else {
- gf_string2bytesize (temp_str, &size);
- conf->min_free_disk = size;
- conf->disk_unit = 'b';
- }
+ ret = dht_init(this);
+ if (ret) {
+ return ret;
}
+ conf = this->private;
data = dict_get (this->options, "pattern.switch.case");
if (data) {
@@ -911,60 +841,23 @@ init (xlator_t *this)
}
}
- ret = dht_init_subvolumes (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- ret = dht_layouts_init (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- LOCK_INIT (&conf->subvolume_lock);
- LOCK_INIT (&conf->layout_lock);
-
- conf->gen = 1;
-
- conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t),
- gf_switch_mt_dht_du_t);
- if (!conf->du_stats) {
- goto err;
- }
-
- this->local_pool = mem_pool_new (dht_local_t, 128);
- if (!this->local_pool) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to create local_t's memory pool");
- goto err;
- }
-
this->private = conf;
-
return 0;
err:
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf->du_stats);
-
- GF_FREE (conf);
- }
-
+ dht_fini(this);
return -1;
}
+class_methods_t class_methods = {
+ .init = switch_init,
+ .fini = switch_fini,
+ .reconfigure = dht_reconfigure,
+ .notify = dht_notify
+};
+
+
struct xlator_fops fops = {
.lookup = switch_lookup,
.create = switch_create,
@@ -1009,19 +902,3 @@ struct xlator_fops fops = {
struct xlator_cbks cbks = {
.forget = dht_forget
};
-
-
-struct volume_options options[] = {
- { .key = {"lookup-unhashed"},
- .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
- "on", "off"},
- .type = GF_OPTION_TYPE_STR
- },
- { .key = {"pattern.switch.case"},
- .type = GF_OPTION_TYPE_ANY
- },
- { .key = {"min-free-disk"},
- .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
- },
- { .key = {NULL} },
-};