From 654a720eed0bc5faaeeaa4eb34f1cfc10df76230 Mon Sep 17 00:00:00 2001 From: Amar Tumballi Date: Wed, 14 Jul 2010 13:58:20 +0000 Subject: proper way to do defrag of a mountpoint Usage: "glusterfs-defrag <mountpoint>" there are new features added to distribute with this patch: * bash# getfattr -n trusted.distribute.linkinfo <file> Gives output in the format "<hostname>:<directory>", if there is a linkfile present, where hostname is the server and directory is the backend directory where the actual linkfile is present. * bash# getfattr -n trusted.glusterfs.pathinfo <path> Gives layout information if directory, and info about actual location of file in backend servers, in the form of 'hostname:directory'. (TODO: should extend it to all xlators) * bash# getfattr -n trusted.distribute.fix.layout <directory> scales out the directory layout in distribute, so when new servers are added, layouts are fixed to include them. * removed 'scale-n-defrag.sh' as it is no longer required. * changed 'defrag.sh' to have a feature to specify target bricks, so defrag happens to only those nodes. moved the file to 'glusterfs-defrag', which now gets installed to '${prefix}/bin' * storage/posix: takes new option 'hostname' so it can resolve to same hostname given during 'gluster volume create'. with 'trusted.glusterfs.pathinfo' posix returns the value in 'hostname:real_path' format. Signed-off-by: Amar Tumballi Signed-off-by: Anand V. 
Avati BUG: 1073 ('gluster defrag ' fails in mainline) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=1073 --- configure.ac | 1 + extras/Makefile.am | 2 + extras/defrag.sh | 60 ---------- extras/glusterfs-defrag.in | 109 ++++++++++++++++++ extras/scale-n-defrag.sh | 37 ------ extras/volgen/CreateVolfile.py | 1 + libglusterfs/src/glusterfs.h | 3 + xlators/cluster/dht/src/dht-common.c | 217 ++++++++++++++++++++++++++++++++++- xlators/cluster/dht/src/dht-common.h | 5 + xlators/mount/fuse/src/fuse-bridge.c | 8 +- xlators/storage/posix/src/posix.c | 55 +++++---- xlators/storage/posix/src/posix.h | 2 +- 12 files changed, 373 insertions(+), 127 deletions(-) delete mode 100644 extras/defrag.sh create mode 100644 extras/glusterfs-defrag.in delete mode 100644 extras/scale-n-defrag.sh diff --git a/configure.ac b/configure.ac index 4eaf0522231..16ea31e6055 100644 --- a/configure.ac +++ b/configure.ac @@ -114,6 +114,7 @@ AC_CONFIG_FILES([Makefile doc/examples/Makefile doc/hacker-guide/Makefile extras/Makefile + extras/glusterfs-defrag extras/init.d/Makefile extras/init.d/glusterfs-server.plist extras/init.d/glusterfsd-Debian diff --git a/extras/Makefile.am b/extras/Makefile.am index c11137143b5..6ea4744c912 100644 --- a/extras/Makefile.am +++ b/extras/Makefile.am @@ -3,6 +3,8 @@ docdir = $(datadir)/doc/glusterfs/ EditorModedir = $(docdir)/ EditorMode_DATA = glusterfs-mode.el glusterfs.vim +dist_bin_SCRIPTS = glusterfs-defrag + SUBDIRS = init.d benchmarking volgen EXTRA_DIST = specgen.scm MacOSX/Portfile glusterfs-mode.el glusterfs.vim migrate-unify-to-distribute.sh backend-xattr-sanitize.sh backend-cleanup.sh defrag.sh scale-n-defrag.sh disk_usage_sync.sh diff --git a/extras/defrag.sh b/extras/defrag.sh deleted file mode 100644 index 465b0979488..00000000000 --- a/extras/defrag.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/sh - -# This script gets called from 'scale-n-defrag.sh' script. -# Don't run this stand alone. 
-# -# - -set -e - -CP="cp" -MV="mv" - -scan_dir() -{ - path=$1; - find "$path" -type f -perm +01000 -exec $0 '{}' \; -} - -rsync_filename() -{ - path=$1 - dir=$(dirname "$path"); - file=$(basename "$path"); - - echo "$dir/.$file.zr$$"; -} - -relocate_file() -{ - path=$1; - tmp_path=$(rsync_filename "$path"); - - pre_mtime=$(stat -c '%Y' "$path"); - $CP -a "$path" "$tmp_path"; - post_mtime=$(stat -c '%Y' "$path"); - - if [ $pre_mtime = $post_mtime ]; then - chmod -t "$tmp_path"; - $MV "$tmp_path" "$path"; - echo "file '$path' relocated" - else - echo "file '$path' modified during defrag. skipping" - rm -f "$tmp_path"; - fi -} - -main() -{ - path="$1"; - - if [ -d "$path" ]; then - scan_dir "$path"; - else - relocate_file "$@"; - fi - - usleep 500000 # 500ms -} - -main "$1" diff --git a/extras/glusterfs-defrag.in b/extras/glusterfs-defrag.in new file mode 100644 index 00000000000..982878bdbe9 --- /dev/null +++ b/extras/glusterfs-defrag.in @@ -0,0 +1,109 @@ +#!/bin/bash + +# Please leave 'added_bricks' as empty if you want 100% defrag. +# If you want to move data to newly added bricks, properly give +# brick info as ":" form (which was given +# in 'gluster volume create' command) +# More than one brick can be given with space inbetween. 
+ +# +# (c) 2010 Gluster Inc +# + +set -e; + +added_bricks="node1:/gfs/export1" + +CP="cp" +MV="mv" + +scan_dir() +{ + path=$1; + # run defrag on files first # + find "$path" -maxdepth 1 -type f -perm +01000 -exec $0 '{}' \; + + for subdir in $(find "$path" -maxdepth 1 -type d | sed 1d); do + $0 "$subdir"; + done +} + +fix_xattr() +{ + path=$1; + getfattr -n trusted.distribute.fix.layout "$path" 2>/dev/null; +} + +rsync_filename() +{ + path=$1 + dir=$(dirname "$path"); + file=$(basename "$path"); + + echo "$dir/.$file.zr$$"; +} + +relocate_file() +{ + path=$1; + stat_info=$(stat -c '%a' "$path"); + if [ $stat_info -lt 1000 ] ; then + return; + fi + + flag=0; + linknode=$(getfattr --only-values -n trusted.distribute.linkinfo $path 2>/dev/null); + if [ -z $linknode ] ; then + return; + fi + + for bricks in ${added_bricks}; do + current_brick=${linknode:0:${#bricks}}; + if [ "${bricks}" == "${current_brick}" ]; then + flag=1; + fi + done + + if [ $flag -ne 1 ]; then + return; + fi + + tmp_path=$(rsync_filename "$path"); + + pre_mtime=$(stat -c '%Y' "$path"); + $CP -a "$path" "$tmp_path"; + post_mtime=$(stat -c '%Y' "$path"); + + if [ $pre_mtime = $post_mtime ]; then + chmod -t "$tmp_path"; + $MV "$tmp_path" "$path"; + echo "file '$path' relocated" + else + echo "file '$path' modified during defrag. skipping" + rm -f "$tmp_path"; + fi +} + +defrag_usage() +{ + echo "Usage: $0 " +} + +main() +{ + path="$1"; + + if [ -z "$path" ]; then + defrag_usage; + return; + fi + + if [ -d "$path" ]; then + fix_xattr "$path"; + scan_dir "$path"; + else + relocate_file "$@"; + fi +} + +main "$1" diff --git a/extras/scale-n-defrag.sh b/extras/scale-n-defrag.sh deleted file mode 100644 index 1031b3931a8..00000000000 --- a/extras/scale-n-defrag.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/sh - -# This script runs over the GlusterFS mountpoint (from just one client) -# to handle the distribution of 'data', after the distribute translator's -# subvolumes count changes. 
-# -# (c) 2009 Gluster Inc, -# -# -# Make sure the following variables are properly initialized - -MOUNTPOINT=/tmp/testdir -directory_to_be_scaled="${MOUNTPOINT}/" - -logdir=$(dirname $0) -cd $logdir -LOGDIR=$(pwd) -cd - - -# The below command is enough to make sure the new layout will be scaled across new -# nodes. -find ${directory_to_be_scaled} -type d -exec setfattr -x "trusted.glusterfs.dht" {} \; - -# Now do a lookup on files so the scaling/re-hashing is done -find ${directory_to_be_scaled} > /dev/null - - -# copy the defrag (to copy data across for new nodes (for linkfiles)) -# - - -cd ${directory_to_be_scaled}; -for dir in *; do - echo "Defragmenting directory ${directory_to_be_scaled}/$dir ($LOGDIR/defrag-store-$dir.log)" - $LOGDIR/defrag.sh $dir >> $LOGDIR/defrag-store-$dir.log 2>&1 - echo Completed directory ${directory_to_be_scaled}/$dir -done diff --git a/extras/volgen/CreateVolfile.py b/extras/volgen/CreateVolfile.py index ca5043a8d78..378766cf7ca 100644 --- a/extras/volgen/CreateVolfile.py +++ b/extras/volgen/CreateVolfile.py @@ -270,6 +270,7 @@ class CreateVolfile: exp_fd.write ("# option background-unlink yes # (default: no) boolean type\n") exp_fd.write (" option directory %s\n" % export) + exp_fd.write (" option hostname %s\n" % host) exp_fd.write ("end-volume\n\n") if self.nfs: diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 8ddb4e74c8c..8dc781d81ec 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -64,6 +64,9 @@ #define O_DIRECTORY 0 #endif +#define GF_XATTR_PATHINFO_KEY "trusted.glusterfs.pathinfo" +#define GF_XATTR_LINKINFO_KEY "trusted.distribute.linkinfo" + #define ZR_FILE_CONTENT_STR "glusterfs.file." 
#define ZR_FILE_CONTENT_STRLEN 15 diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 1cfeae690f9..dd2a3f3f1e1 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -1409,6 +1409,127 @@ err: } +int +dht_fix_layout_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + DHT_STACK_UNWIND (getxattr, frame, -1, ENODATA, NULL); + + return 0; +} + +static void +fill_layout_info (dht_layout_t *layout, char *buf) +{ + int i = 0; + char tmp_buf[128] = {0,}; + + for (i = 0; i < layout->cnt; i++) { + snprintf (tmp_buf, 128, "(%s %u %u)", + layout->list[i].xlator->name, + layout->list[i].start, + layout->list[i].stop); + if (i) + strcat (buf, " "); + strcat (buf, tmp_buf); + } +} + +int +dht_pathinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr) +{ + dht_local_t *local = NULL; + int ret = 0; + int flag = 0; + int this_call_cnt = 0; + char *value_got = NULL; + char layout_buf[8192] = {0,}; + char xattr_buf[8192 + 1024] = {0,}; + dict_t *dict = NULL; + + local = frame->local; + + if (op_ret != -1) { + ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value_got); + if (!ret) { + if (!local->pathinfo) + local->pathinfo = GF_CALLOC (8192, sizeof (char), + gf_common_mt_char); + if (local->pathinfo) + strcat (local->pathinfo, value_got); + } + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + if (local->layout->cnt > 1) { + /* Set it for directory */ + fill_layout_info (local->layout, layout_buf); + flag = 1; + } + + dict = dict_new (); + + if (flag && local->pathinfo) + snprintf (xattr_buf, 9216, "((%s %s) (%s-layout %s))", + this->name, local->pathinfo, this->name, + layout_buf); + else if (local->pathinfo) + snprintf (xattr_buf, 9216, "(%s %s)", + this->name, local->pathinfo); + else if (flag) + snprintf (xattr_buf, 9216, "(%s-layout %s)", + this->name, 
layout_buf); + + ret = dict_set_str (dict, GF_XATTR_PATHINFO_KEY, + xattr_buf); + + if (local->pathinfo) + GF_FREE (local->pathinfo); + GF_FREE (local->key); + + DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + + if (dict) + dict_unref (dict); + + return 0; + } + + if (local->pathinfo) + strcat (local->pathinfo, " Link: "); + + /* This will happen if there pending */ + STACK_WIND (frame, dht_pathinfo_getxattr_cbk, local->hashed_subvol, + local->hashed_subvol->fops->getxattr, + &local->loc, local->key); + + return 0; +} + +int +dht_linkinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr) +{ + int ret = 0; + char *value = NULL; + + if (op_ret != -1) { + ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value); + if (!ret) { + ret = dict_set_str (xattr, GF_XATTR_LINKINFO_KEY, value); + if (!ret) + gf_log (this->name, GF_LOG_TRACE, + "failed to set linkinfo"); + } + } + + DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr); + + return 0; +} + int dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xattr) @@ -1429,9 +1550,14 @@ int dht_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key) { - xlator_t *subvol = NULL; - int op_errno = -1; - + xlator_t *subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int op_errno = -1; + int ret = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -1439,6 +1565,91 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (loc->inode, err); VALIDATE_OR_GOTO (loc->path, err); + conf = this->private; + layout = dht_layout_get (this, loc->inode); + if (key && (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)) { + hashed_subvol = dht_subvol_get_hashed (this, loc); + cached_subvol = dht_subvol_get_cached (this, loc->inode); + + local = 
dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Out of memory"); + goto err; + } + + ret = loc_dup (loc, &local->loc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Out of memory"); + goto err; + } + local->key = gf_strdup (key); + if (!local->key) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Out of memory"); + goto err; + } + local->layout = layout; + + local->call_cnt = 1; + if (hashed_subvol != cached_subvol) { + local->call_cnt = 2; + local->hashed_subvol = hashed_subvol; + } + + STACK_WIND (frame, dht_pathinfo_getxattr_cbk, cached_subvol, + cached_subvol->fops->getxattr, loc, key); + + return 0; + } + if (key && (strcmp (key, GF_XATTR_LINKINFO_KEY) == 0)) { + hashed_subvol = dht_subvol_get_hashed (this, loc); + cached_subvol = dht_subvol_get_cached (this, loc->inode); + if (hashed_subvol == cached_subvol) { + op_errno = ENODATA; + goto err; + } + if (hashed_subvol) { + STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol, + hashed_subvol->fops->getxattr, loc, + GF_XATTR_PATHINFO_KEY); + return 0; + } + op_errno = ENODATA; + goto err; + } + if (key && (strcmp (key, GF_XATTR_FIX_LAYOUT_KEY) == 0)) { + if (layout->cnt < conf->subvolume_cnt) { + gf_log (this->name, GF_LOG_INFO, + "expanding layout of %s from %d to %d", + loc->path, layout->cnt, conf->subvolume_cnt); + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Out of memory"); + goto err; + } + + ret = loc_dup (loc, &local->loc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Out of memory"); + goto err; + } + local->layout = layout; + dht_selfheal_new_directory (frame, dht_fix_layout_cbk, + layout); + return 0; + } + op_errno = ENODATA; + goto err; + } subvol = dht_subvol_get_cached (this, loc->inode); if (!subvol) { gf_log (this->name, GF_LOG_DEBUG, diff --git a/xlators/cluster/dht/src/dht-common.h 
b/xlators/cluster/dht/src/dht-common.h index b361f14426e..d5a5c7b2c59 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -27,6 +27,7 @@ #ifndef _DHT_H #define _DHT_H +#define GF_XATTR_FIX_LAYOUT_KEY "trusted.distribute.fix.layout" #define GF_DHT_LOOKUP_UNHASHED_ON 1 #define GF_DHT_LOOKUP_UNHASHED_AUTO 2 @@ -116,6 +117,10 @@ struct dht_local { int32_t flags; mode_t mode; dev_t rdev; + + /* need for file-info */ + char *pathinfo; + char *key; }; typedef struct dht_local dht_local_t; diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c index 142ff064e55..9c9dff0b926 100644 --- a/xlators/mount/fuse/src/fuse-bridge.c +++ b/xlators/mount/fuse/src/fuse-bridge.c @@ -2629,11 +2629,11 @@ fuse_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, state->loc.path, strerror (op_errno)); } } else { - gf_log ("glusterfs-fuse", GF_LOG_WARNING, - "%"PRIu64": %s() %s => -1 (%s)", + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRIu64": %s(%s) %s => -1 (%s)", frame->root->unique, - gf_fop_list[frame->root->op], state->loc.path, - strerror (op_errno)); + gf_fop_list[frame->root->op], state->name, + state->loc.path, strerror (op_errno)); } /* if(op_errno!= ENODATA)...else */ send_fuse_err (this, finh, op_errno); diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 7c0d165ac99..2810bbd6a8c 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -3032,6 +3032,7 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, size_t size = 0; size_t remaining_size = 0; char key[1024] = {0,}; + char host_buf[1024] = {0,}; char gen_key[1024] = {0,}; char * value = NULL; char * list = NULL; @@ -3072,18 +3073,18 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, } if (loc->inode && IA_ISREG (loc->inode->ia_type) && name && - (strcmp (name, "trusted.glusterfs.location") == 0)) { - ret = dict_set_static_ptr (dict, - 
"trusted.glusterfs.location", - priv->hostname); + (strcmp (name, GF_XATTR_PATHINFO_KEY) == 0)) { + snprintf (host_buf, 1024, "%s:%s", priv->hostname, + real_path); + ret = dict_set_str (dict, GF_XATTR_PATHINFO_KEY, + host_buf); if (ret < 0) { gf_log (this->name, GF_LOG_WARNING, - "could not set hostname (%s) in dictionary", - priv->hostname); + "could not set value (%s) in dictionary", + host_buf); } goto done; } - size = sys_llistxattr (real_path, NULL, 0); if (size == -1) { @@ -4260,17 +4261,16 @@ mem_acct_init (xlator_t *this) int init (xlator_t *this) { - int ret = 0; - int op_ret = -1; - gf_boolean_t tmp_bool = 0; - struct stat buf = {0,}; - struct posix_private * _private = NULL; - data_t * dir_data = NULL; - data_t * tmp_data = NULL; - uint64_t time64 = 0; - - int dict_ret = 0; - int32_t janitor_sleep; + struct posix_private *_private = NULL; + data_t *dir_data = NULL; + data_t *tmp_data = NULL; + struct stat buf = {0,}; + gf_boolean_t tmp_bool = 0; + uint64_t time64 = 0; + int dict_ret = 0; + int ret = 0; + int op_ret = -1; + int32_t janitor_sleep = 0; dir_data = dict_get (this->options, "directory"); @@ -4370,10 +4370,19 @@ init (xlator_t *this) LOCK_INIT (&_private->lock); - ret = gethostname (_private->hostname, 256); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "could not find hostname (%s)", strerror (errno)); + ret = dict_get_str (this->options, "hostname", &_private->hostname); + if (ret) { + _private->hostname = GF_CALLOC (256, sizeof (char), + gf_common_mt_char); + if (!_private->hostname) { + gf_log (this->name, GF_LOG_ERROR, "not enough memory"); + goto out; + } + ret = gethostname (_private->hostname, 256); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "could not find hostname (%s)", strerror (errno)); + } } _private->export_statfs = 1; @@ -4574,6 +4583,8 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_BOOL }, { .key = {"directory"}, .type = GF_OPTION_TYPE_PATH }, + { .key = {"hostname"}, + .type = 
GF_OPTION_TYPE_ANY }, { .key = {"export-statfs-size"}, .type = GF_OPTION_TYPE_BOOL }, { .key = {"mandate-attribute"}, diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 0295a1f4804..2aff0a6ca66 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -73,7 +73,7 @@ struct posix_private { gf_lock_t lock; - char hostname[256]; + char *hostname; /* Statistics, provides activity of the server */ struct timeval prev_fetch_time; -- cgit