diff options
| author | Dan Lambright <dlambrig@redhat.com> | 2015-11-04 15:33:22 -0500 | 
|---|---|---|
| committer | Dan Lambright <dlambrig@redhat.com> | 2015-11-23 04:05:55 -0800 | 
| commit | 3b52c71b0ab57a9daaf31bf3dc8563da37927a66 (patch) | |
| tree | 2d47ba199f5cde08b2d9b639670d0ed5c75204b6 | |
| parent | f12efd1827077292eba08a109d212a25c62476fe (diff) | |
cluster/tier: readdirp to cold tier only
It is possible a file would get migrated in the middle
of a readdir operation. If there are four subvolumes A,B,C,D,
and if readdir reads them in order and reaches subvol B,
then, if a file is moved from D to A, it will not be included
in the readdir output (A was already read, and the file is gone
from D by the time D is read).
This phenomenon has pre-existed in DHT migration but is more
apparent in tiering.
When a file is moved off the hashed subvolume a T file is created.
For tiering, we will make the cold subvolume the hashed subvolume.
This will ensure the creation of a T file. Readdir will not skip T
files in the tier translator.
Making the cold subvolume the hashed subvolume ensures the T
files created on promotions or creates will be less likely to
fill the volume.
Creates still put the data on the hot subvolume.
Change-Id: Ifde557d3d0e94a4570ca9f115adee3db2ee75407
BUG:  1281598
Signed-off-by: Dan Lambright <dlambrig@redhat.com>
Reviewed-on: http://review.gluster.org/12530
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Tested-by: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: N Balachandran <nbalacha@redhat.com>
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
| -rwxr-xr-x | run-tests.sh | 1 | ||||
| -rwxr-xr-x | tests/basic/tier/fops-during-migration.t | 13 | ||||
| -rw-r--r-- | tests/basic/tier/readdir-during-migration.t | 64 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/Makefile.am | 4 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 189 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/tier-common.c | 337 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/tier-common.h | 26 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/tier.c | 46 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/tier.h | 3 | 
9 files changed, 564 insertions, 119 deletions
diff --git a/run-tests.sh b/run-tests.sh index 2b934f9134d..a64219c5901 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -198,6 +198,7 @@ function is_bad_test ()                ./tests/basic/quota-nfs.t \                ./tests/basic/tier/tier_lookup_heal.t \                ./tests/basic/tier/bug-1214222-directories_missing_after_attach_tier.t \ +              ./tests/basic/tier/fops-during-migration.t \  	      ./tests/basic/tier/record-metadata-heat.t \                ./tests/bugs/snapshot/bug-1109889.t \                ./tests/bugs/distribute/bug-1066798.t \ diff --git a/tests/basic/tier/fops-during-migration.t b/tests/basic/tier/fops-during-migration.t index b80511a400d..96180d23917 100755 --- a/tests/basic/tier/fops-during-migration.t +++ b/tests/basic/tier/fops-during-migration.t @@ -61,9 +61,11 @@ create_dist_tier_vol $NUM_BRICKS  # Mount FUSE  TEST glusterfs -s $H0 --volfile-id $V0 $M0 +$CLI volume set $V0 diagnostics.client-log-level DEBUG +  TEST mkdir $M0/dir1 -# Create a large file (200MB), so that rebalance takes time +# Create a large file (320MB), so that rebalance takes time  # The file will be created on the hot tier  dd if=/dev/zero of=$M0/dir1/FILE1 bs=64k count=5120 @@ -83,14 +85,9 @@ echo "File path on cold tier: "$CPATH  # Test setxattr  TEST setfattr -n "user.test_xattr" -v "qwerty" $M0/dir1/FILE1 -# Test hard link creation -TEST ln $M0/dir1/FILE1 $M0/dir1/lnk1 -TEST ln $M0/dir1/FILE1 $M0/lnk2 -  # Change the file contents while it is being migrated  echo $TEST_STR > $M0/dir1/FILE1 -  # The file contents should have changed even if the file  # is not done migrating  EXPECT "1" check_file_content $M0/dir1/FILE1 "$TEST_STR" @@ -103,10 +100,6 @@ EXPECT_WITHIN $REBALANCE_TIMEOUT "no" is_sticky_set $CPATH  EXPECT "1" check_file_content $M0/dir1/FILE1 "$TEST_STR" -linkcountsrc=$(stat -c %h $M0/dir1/FILE1) -echo $linkcountsrc -TEST [[ $linkcountsrc == 3 ]] -  TEST getfattr -n "user.test_xattr" $M0/dir1/FILE1  cleanup; diff --git 
a/tests/basic/tier/readdir-during-migration.t b/tests/basic/tier/readdir-during-migration.t new file mode 100644 index 00000000000..42199c57768 --- /dev/null +++ b/tests/basic/tier/readdir-during-migration.t @@ -0,0 +1,64 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../tier.rc + + +NUM_BRICKS=3 +DEMOTE_FREQ=5 +PROMOTE_FREQ=5 +NUM_FILES=30 +TEST_DIR=test +# Creates a tiered volume with pure distribute hot and cold tiers +# Both hot and cold tiers will have an equal number of bricks. + +function create_dist_tier_vol () { +        mkdir $B0/cold +        mkdir $B0/hot +        TEST $CLI volume create $V0 $H0:$B0/cold/${V0}{0..$1} +        TEST $CLI volume set $V0 performance.quick-read off +        TEST $CLI volume set $V0 performance.io-cache off +        TEST $CLI volume start $V0 +        TEST $CLI volume attach-tier $V0 $H0:$B0/hot/${V0}{0..$1} +        TEST $CLI volume set $V0 cluster.tier-mode test +        TEST $CLI volume set $V0 cluster.tier-demote-frequency $DEMOTE_FREQ +        TEST $CLI volume set $V0 cluster.tier-promote-frequency $PROMOTE_FREQ +        TEST $CLI volume set $V0 cluster.read-freq-threshold 0 +        TEST $CLI volume set $V0 cluster.write-freq-threshold 0 +} + +function check_file_count() { +    if [ $(ls -1 | wc -l) == $1 ]; then +        echo "1" +    else +        echo "0" +    fi +} + +cleanup; + + +TEST glusterd + +#Create and start a tiered volume +create_dist_tier_vol $NUM_BRICKS + +# Mount FUSE +TEST glusterfs -s $H0 --volfile-id $V0 $M0 + +# Create a number of "legacy" files before attaching tier +mkdir $M0/${TEST_DIR} +cd $M0/${TEST_DIR} +TEST create_many_files tfile $NUM_FILES + +EXPECT "1" check_file_count $NUM_FILES + +sleep $DEMOTE_FREQ + +EXPECT "1" check_file_count $NUM_FILES + +cd / + +cleanup; + diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am index bb3308f14a3..29be5ce4776 100644 --- a/xlators/cluster/dht/src/Makefile.am +++ 
b/xlators/cluster/dht/src/Makefile.am @@ -16,7 +16,7 @@ dht_la_SOURCES = $(dht_common_source) dht.c  nufa_la_SOURCES = $(dht_common_source) nufa.c  switch_la_SOURCES = $(dht_common_source) switch.c -tier_la_SOURCES = $(dht_common_source) tier.c +tier_la_SOURCES = $(dht_common_source) tier.c tier-common.c  dht_la_LDFLAGS = -module -avoid-version -export-symbols $(top_srcdir)/xlators/cluster/dht/src/dht.sym  dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la @@ -30,7 +30,7 @@ switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la  tier_la_LDFLAGS = -module -avoid-version -export-symbols $(top_srcdir)/xlators/cluster/dht/src/tier.sym  tier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h dht-helper.h tier.h\ +noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h dht-helper.h tier-common.h tier.h\  	$(top_builddir)/xlators/lib/src/libxlator.h  AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 23968518644..26bf7a01106 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -182,7 +182,7 @@ dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie,  {          dht_local_t  *local = NULL;          dht_layout_t *layout = NULL; -	int           ret = -1; +        int           ret = -1;          GF_VALIDATE_OR_GOTO ("dht", frame, out);          GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -783,13 +783,13 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          if ((op_errno != ENOTCONN)                              && (op_errno != ENOENT)                              && (op_errno != ESTALE)) { -				gf_msg (this->name, GF_LOG_INFO, op_errno, +                                gf_msg (this->name, GF_LOG_INFO, op_errno,                                          
DHT_MSG_REVALIDATE_CBK_INFO, -					"Revalidate: subvolume %s for %s " +                                        "Revalidate: subvolume %s for %s "                                          "(gfid = %s) returned -1", -					prev->this->name, local->loc.path, +                                        prev->this->name, local->loc.path,                                          gfid); -			} +                        }                          if (op_errno == ESTALE) {                                  /* propagate the ESTALE to parent.                                   * setting local->return_estale would send @@ -936,7 +936,7 @@ out:                          }                  }  cont: -		if (local->layout_mismatch) { +                if (local->layout_mismatch) {                          /* Found layout mismatch in the directory, need to                             fix this in the inode context */                          dht_layout_unref (this, local->layout); @@ -2332,18 +2332,18 @@ dht_lookup (call_frame_t *frame, xlator_t *this,                  /* need it for dir self-heal */                  dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req); -		for (i = 0; i < call_cnt; i++) { -			subvol = layout->list[i].xlator; +                for (i = 0; i < call_cnt; i++) { +                        subvol = layout->list[i].xlator;                          gf_msg_debug (this->name, 0, "calling "                                        "revalidate lookup for %s at %s",                                        loc->path, subvol->name); -			STACK_WIND (frame, dht_revalidate_cbk, -				    subvol, subvol->fops->lookup, -				    &local->loc, local->xattr_req); +                        STACK_WIND (frame, dht_revalidate_cbk, +                                    subvol, subvol->fops->lookup, +                                    &local->loc, local->xattr_req); -		} +                }          } else {          do_fresh_lookup:                  /* TODO: remove the hard-coding */ @@ 
-2827,8 +2827,8 @@ dht_vgetxattr_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                                  gf_msg (this->name, GF_LOG_ERROR, op_errno,                                          DHT_MSG_GET_XATTR_FAILED,                                          "getxattr err for dir"); -				local->op_ret = -1; -				local->op_errno = op_errno; +                                local->op_ret = -1; +                                local->op_errno = op_errno;                          }                          goto unlock; @@ -3007,8 +3007,8 @@ dht_getxattr_unwind (call_frame_t *frame,  int  dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie, -				    xlator_t *this, int op_ret, int op_errno, -				    dict_t *xattr, dict_t *xdata) +                                    xlator_t *this, int op_ret, int op_errno, +                                    dict_t *xattr, dict_t *xdata)  {          int             this_call_cnt = 0;          dht_local_t     *local = NULL; @@ -3100,44 +3100,44 @@ unlock:          UNLOCK (&frame->lock); -	this_call_cnt = dht_frame_return (frame); -	if (is_last_call (this_call_cnt)) { +        this_call_cnt = dht_frame_return (frame); +        if (is_last_call (this_call_cnt)) {                  DHT_STACK_UNWIND (getxattr, frame, local->op_ret,                                    local->op_errno, local->xattr,                                    local->xattr_req); -	} +        } -	return 0; +        return 0;  }  int  dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this, -				loc_t *loc, const char *key, dict_t *xdata) +                                loc_t *loc, const char *key, dict_t *xdata)  { -	dht_local_t     *local = NULL; -	int              i = 0; -	dht_layout_t    *layout = NULL; -	int              cnt = 0; -	xlator_t        *subvol = NULL; +        dht_local_t     *local = NULL; +        int              i = 0; +        dht_layout_t    *layout = NULL; +        int              cnt = 0; +        
xlator_t        *subvol = NULL; -	local = frame->local; -	layout = local->layout; +        local = frame->local; +        layout = local->layout; -	cnt = local->call_cnt = layout->cnt; +        cnt = local->call_cnt = layout->cnt; -	local->op_ret = -1; -	local->op_errno = ENOENT; +        local->op_ret = -1; +        local->op_errno = ENOENT; -	for (i = 0; i < cnt; i++) { -		subvol = layout->list[i].xlator; -		STACK_WIND (frame, dht_getxattr_get_real_filename_cbk, -			    subvol, subvol->fops->getxattr, -			    loc, key, xdata); -	} +        for (i = 0; i < cnt; i++) { +                subvol = layout->list[i].xlator; +                STACK_WIND (frame, dht_getxattr_get_real_filename_cbk, +                            subvol, subvol->fops->getxattr, +                            loc, key, xdata); +        } -	return 0; +        return 0;  }  int @@ -3206,13 +3206,13 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,                  }          } -	if (key && -	    (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY, -		      strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) -	    && DHT_IS_DIR(layout)) { -		dht_getxattr_get_real_filename (frame, this, loc, key, xdata); -		return 0; -	} +        if (key && +            (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY, +                      strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) +            && DHT_IS_DIR(layout)) { +                dht_getxattr_get_real_filename (frame, this, loc, key, xdata); +                return 0; +        }          if (key && DHT_IS_DIR(layout) &&             (!strcmp (key, GF_REBAL_FIND_LOCAL_SUBVOL))) { @@ -3390,7 +3390,7 @@ dht_fgetxattr (call_frame_t *frame, xlator_t *this,          }          if ((fd->inode->ia_type == IA_IFDIR) -	    && key +            && key              && (strncmp (key, GF_XATTR_LOCKINFO_KEY,                           strlen (GF_XATTR_LOCKINFO_KEY)) != 0)) {                  cnt = local->call_cnt = layout->cnt; @@ -4560,6 +4560,7 @@ err:          return 0;  } +  int  
dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,                    int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) @@ -4750,17 +4751,17 @@ done:                          goto unwind;                  } -		if (conf->readdir_optimize == _gf_true) { +                if (conf->readdir_optimize == _gf_true) {                          if (next_subvol != local->first_up_subvol) {                                  ret = dict_set_int32 (local->xattr,                                                        GF_READDIR_SKIP_DIRS, 1);                                  if (ret)                                          gf_msg (this->name, GF_LOG_ERROR, 0,                                                  DHT_MSG_DICT_SET_FAILED, -					        "Failed to set dictionary value" +                                                "Failed to set dictionary value"                                                  ":key = %s",                                                  GF_READDIR_SKIP_DIRS ); -		        } else { +                        } else {                                   dict_del (local->xattr,                                             GF_READDIR_SKIP_DIRS);                          } @@ -4937,23 +4938,23 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,                                          " : key = %s",                                          conf->link_xattr_name); -			if (conf->readdir_optimize == _gf_true) { +                        if (conf->readdir_optimize == _gf_true) {                                  if (xvol != local->first_up_subvol) { -				        ret = dict_set_int32 (local->xattr, -			                               GF_READDIR_SKIP_DIRS, 1); -				        if (ret) -					        gf_msg (this->name, +                                        ret = dict_set_int32 (local->xattr, +                                                       GF_READDIR_SKIP_DIRS, 1); +                                      
  if (ret) +                                                gf_msg (this->name,                                                          GF_LOG_ERROR, 0,                                                          DHT_MSG_DICT_SET_FAILED,                                                          "Failed to set "                                                          "dictionary value: "                                                          "key = %s", -						        GF_READDIR_SKIP_DIRS); +                                                        GF_READDIR_SKIP_DIRS);                                  } else {                                          dict_del (local->xattr,                                                    GF_READDIR_SKIP_DIRS);                                  } -			} +                        }                  }                  STACK_WIND (frame, dht_readdirp_cbk, xvol, xvol->fops->readdirp, @@ -6041,6 +6042,58 @@ err:  }  int +dht_create_tier_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, +                                      xlator_t *subvol, loc_t *loc, int32_t flags, +                                      mode_t mode, mode_t umask, fd_t *fd, +                                      dict_t *params) +{ +        xlator_t                *hot_subvol  = NULL; +        xlator_t                *cold_subvol = NULL; +        dht_conf_t              *conf        = NULL; +        dht_local_t             *local       = NULL; + +        local = frame->local; + +        conf = this->private; + +        cold_subvol = subvol; +        hot_subvol = conf->subvolumes[1]; +        if (conf->subvolumes[0] != cold_subvol) { +                hot_subvol = conf->subvolumes[0]; +        } + +        /* if hot tier full, write to cold */ +        if (dht_is_subvol_filled (this, hot_subvol)) { +                gf_msg_debug (this->name, 0, +                              "creating %s on %s", loc->path, +                              cold_subvol->name); + +      
          STACK_WIND (frame, dht_create_cbk, +                            cold_subvol, cold_subvol->fops->create, +                            loc, flags, mode, umask, fd, params); +        } else { +                local->params = dict_ref (params); +                local->flags = flags; +                local->mode = mode; +                local->umask = umask; +                local->cached_subvol = hot_subvol; +                local->hashed_subvol = cold_subvol; + +                gf_msg_debug (this->name, 0, +                              "creating %s on %s (link at %s)", loc->path, +                              hot_subvol->name, cold_subvol->name); + +                dht_linkfile_create (frame, dht_create_linkfile_create_cbk, +                                     this, hot_subvol, cold_subvol, loc); + +                goto out; +        } +out: +        return 0; +} + + +int  dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,                                   xlator_t *subvol, loc_t *loc, int32_t flags,                                   mode_t mode, mode_t umask, fd_t *fd, @@ -6051,6 +6104,11 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,          local = frame->local; +        if (strcmp (this->type, "cluster/tier") == 0) +                return dht_create_tier_wind_to_avail_subvol(frame, this, subvol, +                                                            loc, flags, mode, +                                                            umask, fd, params); +          if (!dht_is_subvol_filled (this, subvol)) {                  gf_msg_debug (this->name, 0,                                "creating %s on %s", loc->path, @@ -6523,15 +6581,15 @@ dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          ret = dht_layout_merge (this, layout, prev->this,                                                  -1, ENOSPC, NULL);                  } else { -			if (op_ret == -1 && op_errno == 
EEXIST) { -				/* Very likely just a race between mkdir and -				   self-heal (from lookup of a concurrent mkdir -				   attempt). -				   Ignore error for now. layout setting will -				   anyways fail if this was a different (old) -				   pre-existing different directory. -				*/ -				op_ret = 0; +                        if (op_ret == -1 && op_errno == EEXIST) { +                                /* Very likely just a race between mkdir and +                                   self-heal (from lookup of a concurrent mkdir +                                   attempt). +                                   Ignore error for now. layout setting will +                                   anyways fail if this was a different (old) +                                   pre-existing different directory. +                                */ +                                op_ret = 0;                                  dir_exists = _gf_true;                          }                          ret = dht_layout_merge (this, layout, prev->this, @@ -8046,3 +8104,4 @@ int32_t dht_set_local_rebalance (xlator_t *this, dht_local_t *local,          return 0;  } + diff --git a/xlators/cluster/dht/src/tier-common.c b/xlators/cluster/dht/src/tier-common.c new file mode 100644 index 00000000000..7d05c2973f2 --- /dev/null +++ b/xlators/cluster/dht/src/tier-common.c @@ -0,0 +1,337 @@ +/* +  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#include "glusterfs.h" +#include "xlator.h" +#include "libxlator.h" +#include "dht-common.h" +#include "defaults.h" +#include "tier-common.h" +#include "tier.h" + +int +tier_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                 int op_ret, int op_errno, gf_dirent_t *orig_entries, +                 dict_t *xdata) +{ +        dht_local_t  *local = NULL; +        gf_dirent_t   entries; +        gf_dirent_t  *orig_entry = NULL; +        gf_dirent_t  *entry = NULL; +        call_frame_t *prev = NULL; +        xlator_t     *next_subvol = NULL; +        off_t         next_offset = 0; +        int           count = 0; + +        INIT_LIST_HEAD (&entries.list); +        prev = cookie; +        local = frame->local; + +        if (op_ret < 0) +                goto done; + +        list_for_each_entry (orig_entry, (&orig_entries->list), list) { +                next_offset = orig_entry->d_off; + +                entry = gf_dirent_for_name (orig_entry->d_name); +                if (!entry) { +                        gf_msg (this->name, GF_LOG_ERROR, ENOMEM, +                                DHT_MSG_NO_MEMORY, +                                "Memory allocation failed "); +                        goto unwind; +                } + +                entry->d_off  = orig_entry->d_off; +                entry->d_ino  = orig_entry->d_ino; +                entry->d_type = orig_entry->d_type; +                entry->d_len  = orig_entry->d_len; + +                list_add_tail (&entry->list, &entries.list); +                count++; +        } +        op_ret = count; + +done: +        if (count == 0) { +                /* non-zero next_offset means that +                   EOF is not yet hit on the current subvol +                */ +                if (next_offset != 0) { +                        next_subvol = prev->this; +                } else { +                        goto unwind; +                } + +                STACK_WIND (frame, 
tier_readdir_cbk, +                            next_subvol, next_subvol->fops->readdir, +                            local->fd, local->size, next_offset, NULL); +                return 0; +        } + +unwind: +        if (op_ret < 0) +                op_ret = 0; + +        DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, NULL); + +        gf_dirent_free (&entries); + +        return 0; +} + +int +tier_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, +                  int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) +{ +        dht_local_t  *local = NULL; +        gf_dirent_t   entries; +        gf_dirent_t  *orig_entry = NULL; +        gf_dirent_t  *entry = NULL; +        call_frame_t *prev = NULL; +        xlator_t     *next_subvol = NULL; +        off_t         next_offset = 0; +        int           count = 0; +        dht_conf_t   *conf   = NULL; +        int           ret    = 0; +        inode_table_t           *itable = NULL; +        inode_t                 *inode = NULL; + +        INIT_LIST_HEAD (&entries.list); +        prev = cookie; +        local = frame->local; +        itable = local->fd ? 
local->fd->inode->table : NULL; + +        conf  = this->private; +        GF_VALIDATE_OR_GOTO(this->name, conf, unwind); + +        if (op_ret < 0) +                goto done; + +        list_for_each_entry (orig_entry, (&orig_entries->list), list) { +                next_offset = orig_entry->d_off; + +                if (IA_ISINVAL(orig_entry->d_stat.ia_type)) { +                        /*stat failed somewhere- ignore this entry*/ +                        continue; +                } + +                entry = gf_dirent_for_name (orig_entry->d_name); +                if (!entry) { + +                        goto unwind; +                } + +                entry->d_off  = orig_entry->d_off; +                entry->d_stat = orig_entry->d_stat; +                entry->d_ino  = orig_entry->d_ino; +                entry->d_type = orig_entry->d_type; +                entry->d_len  = orig_entry->d_len; + +                if (orig_entry->dict) +                        entry->dict = dict_ref (orig_entry->dict); + +                if (check_is_linkfile (NULL, (&orig_entry->d_stat), +                                       orig_entry->dict, +                                       conf->link_xattr_name)) { +                        inode = inode_find (itable, +                                            orig_entry->d_stat.ia_gfid); +                        if (inode) { +                                ret = dht_layout_preset +                                        (this, TIER_UNHASHED_SUBVOL, +                                         inode); +                                if (ret) +                                        gf_msg (this->name, +                                                GF_LOG_WARNING, 0, +                                                DHT_MSG_LAYOUT_SET_FAILED, +                                                "failed to link the layout" +                                                " in inode"); +                                inode_unref (inode); 
+                                inode = NULL; +                        } + +                } else if (IA_ISDIR(entry->d_stat.ia_type)) { +                        if (orig_entry->inode) { +                                dht_inode_ctx_time_update (orig_entry->inode, +                                                           this, &entry->d_stat, +                                                           1); +                        } +                } else { +                        if (orig_entry->inode) { +                                ret = dht_layout_preset (this, prev->this, +                                                         orig_entry->inode); +                                if (ret) +                                        gf_msg (this->name, GF_LOG_WARNING, 0, +                                                DHT_MSG_LAYOUT_SET_FAILED, +                                                "failed to link the layout " +                                                "in inode"); + +                                entry->inode = inode_ref (orig_entry->inode); +                        } else if (itable) { +                                /* +                                 * orig_entry->inode might be null if any upper +                                 * layer xlators below client set to null, to +                                 * force a lookup on the inode even if the inode +                                 * is present in the inode table. In that case +                                 * we just update the ctx to make sure we didn't +                                 * missed anything. 
+                                 */ +                                inode = inode_find (itable, +                                                    orig_entry->d_stat.ia_gfid); +                                if (inode) { +                                        ret = dht_layout_preset +                                                (this, TIER_HASHED_SUBVOL, +                                                 inode); +                                        if (ret) +                                                gf_msg (this->name, +                                                     GF_LOG_WARNING, 0, +                                                     DHT_MSG_LAYOUT_SET_FAILED, +                                                     "failed to link the layout" +                                                     " in inode"); +                                        inode_unref (inode); +                                        inode = NULL; +                                } +                        } +                } +                list_add_tail (&entry->list, &entries.list); +                count++; +        } +        op_ret = count; + +done: +        if (count == 0) { +                /* non-zero next_offset means that +                   EOF is not yet hit on the current subvol +                */ +                if (next_offset != 0) { +                        next_subvol = prev->this; +                } else { +                        goto unwind; +                } + +                STACK_WIND (frame, tier_readdirp_cbk, +                            next_subvol, next_subvol->fops->readdirp, +                            local->fd, local->size, next_offset, +                            local->xattr); +                return 0; +        } + +unwind: +        if (op_ret < 0) +                op_ret = 0; + +        DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries, NULL); + +        gf_dirent_free (&entries); + +        return 0; +} + 
+int
+tier_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                off_t yoff, int whichop, dict_t *dict)
+{       /*
+         * Common worker behind tier_readdir() and tier_readdirp(): winds one
+         * readdir or readdirp call to TIER_HASHED_SUBVOL and to no other
+         * subvolume. The tier.h hunk of this patch defines that macro as
+         * conf->subvolumes[0] (the cold tier, per the commit message), so
+         * the local below must keep the name "conf".
+         *
+         * frame, this, fd - must be non-NULL, and this->private must be set.
+         * size, yoff      - forwarded unchanged to the child fop.
+         * whichop         - GF_FOP_READDIRP winds readdirp with
+         *                   tier_readdirp_cbk; any other value winds
+         *                   readdir with tier_readdir_cbk.
+         * dict            - optional request dict, may be NULL.
+         *
+         * Always returns 0; failures are reported by unwinding the frame
+         * with op_ret -1.
+         */
+        dht_local_t  *local         = NULL;
+        int           op_errno      = -1;
+        xlator_t     *hashed_subvol = NULL;
+        int           ret           = 0;
+        dht_conf_t   *conf          = NULL;
+        /* A failed check jumps to err with op_errno still -1, so the err
+         * path reports errno. NOTE(review): assumes VALIDATE_OR_GOTO sets
+         * errno before jumping - confirm against the macro. */
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (this->private, err);
+
+        conf = this->private;
+
+        local = dht_local_init (frame, NULL, NULL, whichop);
+        if (!local) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+        /* Keep references to fd and dict in local.
+         * NOTE(review): presumably dropped when local is wiped - confirm. */
+        local->fd = fd_ref (fd);
+        local->size = size;
+        local->xattr_req = (dict) ? dict_ref (dict) : NULL;
+
+        hashed_subvol = TIER_HASHED_SUBVOL;
+
+        /* Only hashed_subvol is read; no other subvolume is wound to. */
+        /* TODO: do proper readdir */
+        if (whichop == GF_FOP_READDIRP) {
+                if (dict)
+                        local->xattr = dict_ref (dict);
+                else
+                        local->xattr = dict_new ();
+                /* Add the key conf->link_xattr_name to the dict sent with
+                 * readdirp; a failure to set it is only logged and the wind
+                 * still happens. If dict_new () failed, NULL is sent.
+                 * NOTE(review): presumably this asks the child to return
+                 * that xattr for each entry, and 256 is the value expected
+                 * by the child - neither is shown here, confirm.
+                 * NOTE(review): when dict is non-NULL the key seems to be
+                 * added to the caller's own dict (dict_ref presumably
+                 * returns the same object) - confirm. */
+                if (local->xattr) {
+                        ret = dict_set_uint32 (local->xattr,
+                                               conf->link_xattr_name, 256);
+                        if (ret)
+                                gf_msg (this->name, GF_LOG_WARNING, 0,
+                                        DHT_MSG_DICT_SET_FAILED,
+                                        "Failed to set dictionary value"
+                                        " : key = %s",
+                                        conf->link_xattr_name);
+
+                }
+
+                STACK_WIND (frame, tier_readdirp_cbk, hashed_subvol,
+                            hashed_subvol->fops->readdirp,
+                            fd, size, yoff, local->xattr);
+
+        } else { /* readdir path: local->xattr is not assigned above */
+                STACK_WIND (frame, tier_readdir_cbk, hashed_subvol,
+                            hashed_subvol->fops->readdir,
+                            fd, size, yoff, local->xattr);
+        }
+
+        return 0;
+        /* Failure path. op_errno is -1 unless it was set explicitly
+         * (ENOMEM above); in that case errno is reported instead. The
+         * unwind names the readdir fop for both readdir and readdirp
+         * requests. */
+err:
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+/*
+ * readdir fop entry point. Chooses the operation handed to
+ * tier_do_readdir(): GF_FOP_READDIR by default, switched to
+ * GF_FOP_READDIRP when any entry of conf->subvolume_status[] is zero or
+ * when conf->use_readdirp is set. xdata is not forwarded. Always returns 0;
+ * the result of tier_do_readdir() is not inspected.
+ */
+int
+tier_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t yoff, dict_t *xdata)
+{
+        int          op = GF_FOP_READDIR;
+        dht_conf_t  *conf = NULL;
+        int          i = 0;
+
+        conf = this->private;
+        if (!conf)
+                goto out;
+        /* One zero status entry is enough to switch to readdirp.
+         * NOTE(review): presumably a zero status marks a subvolume that is
+         * down - confirm where subvolume_status is maintained. */
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+                if (!conf->subvolume_status[i]) {
+                        op = GF_FOP_READDIRP;
+                        break;
+                }
+        }
+
+        if (conf->use_readdirp)
+                op = GF_FOP_READDIRP;
+        /* Also reached directly when this->private is NULL, with op still
+         * GF_FOP_READDIR; tier_do_readdir() re-checks this->private. The
+         * dict argument is passed as 0 (a null pointer), not xdata. */
+out:
+        tier_do_readdir (frame, this, fd, size, yoff, op, 0);
+        return 0;
+}
+/*
+ * readdirp fop entry point: forwards every argument to tier_do_readdir()
+ * with whichop fixed to GF_FOP_READDIRP. Always returns 0; the result of
+ * tier_do_readdir() is not inspected.
+ */
+int
+tier_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+               off_t yoff, dict_t *dict)
+{
+        tier_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP, dict);
+        return 0;
+}
diff --git a/xlators/cluster/dht/src/tier-common.h b/xlators/cluster/dht/src/tier-common.h
new file mode 100644
index 00000000000..93a71f75252
--- /dev/null
+++ b/xlators/cluster/dht/src/tier-common.h
@@ -0,0 +1,26 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/ + +#ifndef _TIER_COMMON_H_ +#define _TIER_COMMON_H_ + +int32_t +tier_readdirp (call_frame_t *frame, +               xlator_t *this, +               fd_t     *fd, +               size_t    size, off_t off, dict_t *dict); + +int +tier_readdir (call_frame_t *frame, +              xlator_t *this, fd_t *fd, size_t size, +              off_t yoff, dict_t *xdata); + +#endif + diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c index 6a9bddfc179..dbb34c67620 100644 --- a/xlators/cluster/dht/src/tier.c +++ b/xlators/cluster/dht/src/tier.c @@ -12,6 +12,7 @@  #include "dht-common.h"  #include "tier.h" +#include "tier-common.h"  #include "syscall.h"  /*Hard coded DB info*/ @@ -442,7 +443,7 @@ tier_migrate_using_query_file (void *_args)                                  goto abort;                          } -                        gf_msg_trace (this->name, 0, +                        gf_msg_debug (this->name, 0,                                  "Tier %d"                                  " src_subvol %s file %s",                                  query_cbk_args->is_promotion, @@ -1557,53 +1558,16 @@ xlator_t *  tier_search (xlator_t *this, dht_layout_t *layout, const char *name)  {          xlator_t                *subvol = NULL; -        void                    *value; -        int                      search_subvol = 0;          dht_conf_t              *conf   = NULL; -        gf_defrag_info_t        *defrag = NULL; -        int                      layout_cold = 0; -        int                      layout_hot = 1;          GF_VALIDATE_OR_GOTO ("tier", this, out); -        GF_VALIDATE_OR_GOTO (this->name, layout, out); -        GF_VALIDATE_OR_GOTO (this->name, name, out);          GF_VALIDATE_OR_GOTO (this->name, this->private, out);          conf = this->private; -        /* The first subvolume in the graph is always cold. */ -        /* Find the position of the cold subvolume in the layout. 
*/ -        layout_cold = 0; -        layout_hot = 1; -        if (conf->subvolumes[0] != layout->list[0].xlator) { -                layout_cold = 1; -                layout_hot = 0; -        } - -        search_subvol = layout_hot; - -        defrag = conf->defrag; -        if (defrag && defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER) -                search_subvol = layout_cold; - -        /* "decommission_subvols_cnt" can only be non-zero on detach. */ -        /* This will change once brick add/remove is supported for */ -        /* tiered volumes. */ -        else if (conf->decommission_subvols_cnt) { -                search_subvol = layout_cold; -        } -        else if (!dict_get_ptr (this->options, "rule", &value) && -                 !strcmp(layout->list[layout_cold].xlator->name, value)) { -                search_subvol = layout_cold; -        } +        subvol = TIER_HASHED_SUBVOL; -        if ((layout->list[search_subvol].err > 0) && -            (layout->list[search_subvol].err != ENOTCONN)) -                search_subvol = layout_cold; - -        subvol = layout->list[search_subvol].xlator;   out: -          return subvol;  } @@ -1970,8 +1934,8 @@ struct xlator_fops fops = {          .open        = dht_open,          .statfs      = dht_statfs,          .opendir     = dht_opendir, -        .readdir     = dht_readdir, -        .readdirp    = dht_readdirp, +        .readdir     = tier_readdir, +        .readdirp    = tier_readdirp,          .fsyncdir    = dht_fsyncdir,          .symlink     = dht_symlink,          .unlink      = dht_unlink, diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h index 2a72ae2caf6..92e2fda6e5c 100644 --- a/xlators/cluster/dht/src/tier.h +++ b/xlators/cluster/dht/src/tier.h @@ -33,7 +33,8 @@  #define PROMOTION_QFILE "promotequeryfile"  #define DEMOTION_QFILE "demotequeryfile" -#define TIER_HASHED_SUBVOL   conf->subvolumes[1] +#define TIER_HASHED_SUBVOL   conf->subvolumes[0] +#define 
TIER_UNHASHED_SUBVOL   conf->subvolumes[1]  #define GET_QFILE_PATH(is_promotion)\          (is_promotion) ? promotion_qfile : demotion_qfile  | 
