summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDan Lambright <dlambrig@redhat.com>2015-11-04 15:33:22 -0500
committerDan Lambright <dlambrig@redhat.com>2015-11-24 04:13:53 -0800
commitb8c56ebe617327d570c252f8c411c85a84e727e6 (patch)
treeb8a94d37a3768a83ce3fc589d94efe86aea3676e
parent974837c3da9deb45cd2dc31afe56e5960268cf01 (diff)
cluster/tier: readdirp to cold tier only
It is possible a file would get migrated in the middle of a readdir operation. If there are four subvolumes A,B,C,D, and if readdir reads them in order and reaches subvol B, then, if a file is moved from D to A, it will not be included in the readdir output. This phenomenon pre-existed in DHT migration but is more apparent in tiering. When a file is moved off the hashed subvolume a T file is created. For tiering, we will make the cold subvolume the hashed subvolume. This will ensure the creation of a T file. Readdir will not skip T files in the tier translator. Making the cold subvolume the hashed subvolume ensures the T files created on promotions or creates will be less likely to fill the volume. Creates still put the data on the hot subvolume. This is a backport of 12530 > Change-Id: Ifde557d3d0e94a4570ca9f115adee3db2ee75407 > BUG: 1281598 > Signed-off-by: Dan Lambright <dlambrig@redhat.com> > Reviewed-on: http://review.gluster.org/12530 > Tested-by: Gluster Build System <jenkins@build.gluster.com> > Tested-by: NetBSD Build System <jenkins@build.gluster.org> > Reviewed-by: N Balachandran <nbalacha@redhat.com> > Reviewed-by: Raghavendra G <rgowdapp@redhat.com> Signed-off-by: Dan Lambright <dlambrig@redhat.com> Signed-off-by: Dan Lambright <dlambrig@redhat.com> Conflicts: xlators/cluster/dht/src/tier.c Change-Id: I5720a4cd04ae5088e5d7d23439b0f90d6bbc6265 BUG: 1283923 Reviewed-on: http://review.gluster.org/12722 Tested-by: NetBSD Build System <jenkins@build.gluster.org> Reviewed-by: N Balachandran <nbalacha@redhat.com> Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Dan Lambright <dlambrig@redhat.com> Tested-by: Dan Lambright <dlambrig@redhat.com>
-rwxr-xr-xrun-tests.sh1
-rwxr-xr-xtests/basic/tier/fops-during-migration.t13
-rw-r--r--tests/basic/tier/readdir-during-migration.t64
-rw-r--r--xlators/cluster/dht/src/Makefile.am4
-rw-r--r--xlators/cluster/dht/src/dht-common.c189
-rw-r--r--xlators/cluster/dht/src/tier-common.c342
-rw-r--r--xlators/cluster/dht/src/tier-common.h31
-rw-r--r--xlators/cluster/dht/src/tier.c47
-rw-r--r--xlators/cluster/dht/src/tier.h3
9 files changed, 575 insertions, 119 deletions
diff --git a/run-tests.sh b/run-tests.sh
index 18c8cd652b1..fa924dc916a 100755
--- a/run-tests.sh
+++ b/run-tests.sh
@@ -198,6 +198,7 @@ function is_bad_test ()
./tests/basic/quota-nfs.t \
./tests/basic/tier/tier_lookup_heal.t \
./tests/basic/tier/bug-1214222-directories_missing_after_attach_tier.t \
+ ./tests/basic/tier/fops-during-migration.t \
./tests/basic/tier/record-metadata-heat.t \
./tests/bugs/snapshot/bug-1109889.t \
./tests/bugs/distribute/bug-1066798.t \
diff --git a/tests/basic/tier/fops-during-migration.t b/tests/basic/tier/fops-during-migration.t
index b80511a400d..96180d23917 100755
--- a/tests/basic/tier/fops-during-migration.t
+++ b/tests/basic/tier/fops-during-migration.t
@@ -61,9 +61,11 @@ create_dist_tier_vol $NUM_BRICKS
# Mount FUSE
TEST glusterfs -s $H0 --volfile-id $V0 $M0
+$CLI volume set $V0 diagnostics.client-log-level DEBUG
+
TEST mkdir $M0/dir1
-# Create a large file (200MB), so that rebalance takes time
+# Create a large file (320MB), so that rebalance takes time
# The file will be created on the hot tier
dd if=/dev/zero of=$M0/dir1/FILE1 bs=64k count=5120
@@ -83,14 +85,9 @@ echo "File path on cold tier: "$CPATH
# Test setxattr
TEST setfattr -n "user.test_xattr" -v "qwerty" $M0/dir1/FILE1
-# Test hard link creation
-TEST ln $M0/dir1/FILE1 $M0/dir1/lnk1
-TEST ln $M0/dir1/FILE1 $M0/lnk2
-
# Change the file contents while it is being migrated
echo $TEST_STR > $M0/dir1/FILE1
-
# The file contents should have changed even if the file
# is not done migrating
EXPECT "1" check_file_content $M0/dir1/FILE1 "$TEST_STR"
@@ -103,10 +100,6 @@ EXPECT_WITHIN $REBALANCE_TIMEOUT "no" is_sticky_set $CPATH
EXPECT "1" check_file_content $M0/dir1/FILE1 "$TEST_STR"
-linkcountsrc=$(stat -c %h $M0/dir1/FILE1)
-echo $linkcountsrc
-TEST [[ $linkcountsrc == 3 ]]
-
TEST getfattr -n "user.test_xattr" $M0/dir1/FILE1
cleanup;
diff --git a/tests/basic/tier/readdir-during-migration.t b/tests/basic/tier/readdir-during-migration.t
new file mode 100644
index 00000000000..42199c57768
--- /dev/null
+++ b/tests/basic/tier/readdir-during-migration.t
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../tier.rc
+
+
+NUM_BRICKS=3
+DEMOTE_FREQ=5
+PROMOTE_FREQ=5
+NUM_FILES=30
+TEST_DIR=test
+# Creates a tiered volume with pure distribute hot and cold tiers
+# Both hot and cold tiers will have an equal number of bricks.
+
+# Create a distribute volume on cold bricks, start it, attach a hot tier,
+# and configure tiering (test mode, promote/demote frequency, and zero
+# read/write frequency thresholds).
+#   $1 - upper bound used in the {0..$1} brick-name suffix
+# NOTE(review): bash performs brace expansion before parameter expansion,
+# so {0..$1} may reach the CLI as a literal string instead of expanding to
+# a sequence of bricks -- confirm the resulting brick count.
+function create_dist_tier_vol () {
+ mkdir $B0/cold
+ mkdir $B0/hot
+ TEST $CLI volume create $V0 $H0:$B0/cold/${V0}{0..$1}
+ TEST $CLI volume set $V0 performance.quick-read off
+ TEST $CLI volume set $V0 performance.io-cache off
+ TEST $CLI volume start $V0
+ TEST $CLI volume attach-tier $V0 $H0:$B0/hot/${V0}{0..$1}
+ TEST $CLI volume set $V0 cluster.tier-mode test
+ TEST $CLI volume set $V0 cluster.tier-demote-frequency $DEMOTE_FREQ
+ TEST $CLI volume set $V0 cluster.tier-promote-frequency $PROMOTE_FREQ
+ TEST $CLI volume set $V0 cluster.read-freq-threshold 0
+ TEST $CLI volume set $V0 cluster.write-freq-threshold 0
+}
+
+# Echo "1" when the number of entries that ls reports for the current
+# working directory equals $1, otherwise echo "0".
+function check_file_count() {
+ if [ $(ls -1 | wc -l) == $1 ]; then
+ echo "1"
+ else
+ echo "0"
+ fi
+}
+
+cleanup;
+
+
+TEST glusterd
+
+#Create and start a tiered volume
+create_dist_tier_vol $NUM_BRICKS
+
+# Mount FUSE
+TEST glusterfs -s $H0 --volfile-id $V0 $M0
+
+# Create a number of files on the tiered volume (the tier is already attached)
+mkdir $M0/${TEST_DIR}
+cd $M0/${TEST_DIR}
+TEST create_many_files tfile $NUM_FILES
+
+EXPECT "1" check_file_count $NUM_FILES
+
+sleep $DEMOTE_FREQ
+
+EXPECT "1" check_file_count $NUM_FILES
+
+cd /
+
+cleanup;
+
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
index f6c9ef2607a..7f2286210c3 100644
--- a/xlators/cluster/dht/src/Makefile.am
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -16,7 +16,7 @@ dht_la_SOURCES = $(dht_common_source) dht.c
nufa_la_SOURCES = $(dht_common_source) nufa.c
switch_la_SOURCES = $(dht_common_source) switch.c
-tier_la_SOURCES = $(dht_common_source) tier.c
+tier_la_SOURCES = $(dht_common_source) tier.c tier-common.c
dht_la_LDFLAGS = -module -avoid-version
dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
@@ -31,7 +31,7 @@ switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
tier_la_LDFLAGS = -module -avoid-version
tier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h dht-helper.h tier.h\
+noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h dht-helper.h tier-common.h tier.h\
$(top_builddir)/xlators/lib/src/libxlator.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 3228f20e53e..e664b576f92 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -187,7 +187,7 @@ dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie,
{
dht_local_t *local = NULL;
dht_layout_t *layout = NULL;
- int ret = -1;
+ int ret = -1;
GF_VALIDATE_OR_GOTO ("dht", frame, out);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -787,13 +787,13 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if ((op_errno != ENOTCONN)
&& (op_errno != ENOENT)
&& (op_errno != ESTALE)) {
- gf_msg (this->name, GF_LOG_INFO, op_errno,
+ gf_msg (this->name, GF_LOG_INFO, op_errno,
DHT_MSG_REVALIDATE_CBK_INFO,
- "Revalidate: subvolume %s for %s "
+ "Revalidate: subvolume %s for %s "
"(gfid = %s) returned -1",
- prev->this->name, local->loc.path,
+ prev->this->name, local->loc.path,
gfid);
- }
+ }
if (op_errno == ESTALE) {
/* propagate the ESTALE to parent.
* setting local->return_estale would send
@@ -940,7 +940,7 @@ out:
}
}
cont:
- if (local->layout_mismatch) {
+ if (local->layout_mismatch) {
/* Found layout mismatch in the directory, need to
fix this in the inode context */
dht_layout_unref (this, local->layout);
@@ -2336,18 +2336,18 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
/* need it for dir self-heal */
dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req);
- for (i = 0; i < call_cnt; i++) {
- subvol = layout->list[i].xlator;
+ for (i = 0; i < call_cnt; i++) {
+ subvol = layout->list[i].xlator;
gf_msg_debug (this->name, 0, "calling "
"revalidate lookup for %s at %s",
loc->path, subvol->name);
- STACK_WIND (frame, dht_revalidate_cbk,
- subvol, subvol->fops->lookup,
- &local->loc, local->xattr_req);
+ STACK_WIND (frame, dht_revalidate_cbk,
+ subvol, subvol->fops->lookup,
+ &local->loc, local->xattr_req);
- }
+ }
} else {
do_fresh_lookup:
/* TODO: remove the hard-coding */
@@ -2831,8 +2831,8 @@ dht_vgetxattr_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_msg (this->name, GF_LOG_ERROR, op_errno,
DHT_MSG_GET_XATTR_FAILED,
"getxattr err for dir");
- local->op_ret = -1;
- local->op_errno = op_errno;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
}
goto unlock;
@@ -3011,8 +3011,8 @@ dht_getxattr_unwind (call_frame_t *frame,
int
dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- dict_t *xattr, dict_t *xdata)
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
{
int this_call_cnt = 0;
dht_local_t *local = NULL;
@@ -3104,44 +3104,44 @@ unlock:
UNLOCK (&frame->lock);
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
DHT_STACK_UNWIND (getxattr, frame, local->op_ret,
local->op_errno, local->xattr,
local->xattr_req);
- }
+ }
- return 0;
+ return 0;
}
int
dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *key, dict_t *xdata)
+ loc_t *loc, const char *key, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int i = 0;
- dht_layout_t *layout = NULL;
- int cnt = 0;
- xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ int i = 0;
+ dht_layout_t *layout = NULL;
+ int cnt = 0;
+ xlator_t *subvol = NULL;
- local = frame->local;
- layout = local->layout;
+ local = frame->local;
+ layout = local->layout;
- cnt = local->call_cnt = layout->cnt;
+ cnt = local->call_cnt = layout->cnt;
- local->op_ret = -1;
- local->op_errno = ENOENT;
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
- for (i = 0; i < cnt; i++) {
- subvol = layout->list[i].xlator;
- STACK_WIND (frame, dht_getxattr_get_real_filename_cbk,
- subvol, subvol->fops->getxattr,
- loc, key, xdata);
- }
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND (frame, dht_getxattr_get_real_filename_cbk,
+ subvol, subvol->fops->getxattr,
+ loc, key, xdata);
+ }
- return 0;
+ return 0;
}
int
@@ -3210,13 +3210,13 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
}
}
- if (key &&
- (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY,
- strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)
- && DHT_IS_DIR(layout)) {
- dht_getxattr_get_real_filename (frame, this, loc, key, xdata);
- return 0;
- }
+ if (key &&
+ (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY,
+ strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)
+ && DHT_IS_DIR(layout)) {
+ dht_getxattr_get_real_filename (frame, this, loc, key, xdata);
+ return 0;
+ }
if (key && DHT_IS_DIR(layout) &&
(!strcmp (key, GF_REBAL_FIND_LOCAL_SUBVOL))) {
@@ -3394,7 +3394,7 @@ dht_fgetxattr (call_frame_t *frame, xlator_t *this,
}
if ((fd->inode->ia_type == IA_IFDIR)
- && key
+ && key
&& (strncmp (key, GF_XATTR_LOCKINFO_KEY,
strlen (GF_XATTR_LOCKINFO_KEY)) != 0)) {
cnt = local->call_cnt = layout->cnt;
@@ -4563,6 +4563,7 @@ err:
return 0;
}
+
int
dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
@@ -4753,17 +4754,17 @@ done:
goto unwind;
}
- if (conf->readdir_optimize == _gf_true) {
+ if (conf->readdir_optimize == _gf_true) {
if (next_subvol != local->first_up_subvol) {
ret = dict_set_int32 (local->xattr,
GF_READDIR_SKIP_DIRS, 1);
if (ret)
gf_msg (this->name, GF_LOG_ERROR, 0,
DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value"
+ "Failed to set dictionary value"
":key = %s",
GF_READDIR_SKIP_DIRS );
- } else {
+ } else {
dict_del (local->xattr,
GF_READDIR_SKIP_DIRS);
}
@@ -4940,23 +4941,23 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
" : key = %s",
conf->link_xattr_name);
- if (conf->readdir_optimize == _gf_true) {
+ if (conf->readdir_optimize == _gf_true) {
if (xvol != local->first_up_subvol) {
- ret = dict_set_int32 (local->xattr,
- GF_READDIR_SKIP_DIRS, 1);
- if (ret)
- gf_msg (this->name,
+ ret = dict_set_int32 (local->xattr,
+ GF_READDIR_SKIP_DIRS, 1);
+ if (ret)
+ gf_msg (this->name,
GF_LOG_ERROR, 0,
DHT_MSG_DICT_SET_FAILED,
"Failed to set "
"dictionary value: "
"key = %s",
- GF_READDIR_SKIP_DIRS);
+ GF_READDIR_SKIP_DIRS);
} else {
dict_del (local->xattr,
GF_READDIR_SKIP_DIRS);
}
- }
+ }
}
STACK_WIND (frame, dht_readdirp_cbk, xvol, xvol->fops->readdirp,
@@ -6043,6 +6044,58 @@ err:
}
+/*
+ * Tier-specific create: the subvolume passed in is treated as the cold
+ * subvolume, and the other entry of conf->subvolumes[0..1] as the hot one.
+ * If the hot subvolume is reported full, the create is wound directly to
+ * the cold subvolume. Otherwise the create arguments are saved in local
+ * and a link file is created first via dht_linkfile_create(), with
+ * dht_create_linkfile_create_cbk as the continuation (presumably that
+ * callback issues the real create on the hot subvolume -- not visible
+ * here). Always returns 0.
+ */
int
+dht_create_tier_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
+ xlator_t *subvol, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd,
+ dict_t *params)
+{
+ xlator_t *hot_subvol = NULL;
+ xlator_t *cold_subvol = NULL;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ conf = this->private;
+
+ /* hot_subvol is whichever of subvolumes[0]/[1] is not the cold one */
+ cold_subvol = subvol;
+ hot_subvol = conf->subvolumes[1];
+ if (conf->subvolumes[0] != cold_subvol) {
+ hot_subvol = conf->subvolumes[0];
+ }
+
+ /* if hot tier full, write to cold */
+ if (dht_is_subvol_filled (this, hot_subvol)) {
+ gf_msg_debug (this->name, 0,
+ "creating %s on %s", loc->path,
+ cold_subvol->name);
+
+ STACK_WIND (frame, dht_create_cbk,
+ cold_subvol, cold_subvol->fops->create,
+ loc, flags, mode, umask, fd, params);
+ } else {
+ /* stash the create arguments for the link-file callback */
+ local->params = dict_ref (params);
+ local->flags = flags;
+ local->mode = mode;
+ local->umask = umask;
+ local->cached_subvol = hot_subvol;
+ local->hashed_subvol = cold_subvol;
+
+ gf_msg_debug (this->name, 0,
+ "creating %s on %s (link at %s)", loc->path,
+ hot_subvol->name, cold_subvol->name);
+
+ dht_linkfile_create (frame, dht_create_linkfile_create_cbk,
+ this, hot_subvol, cold_subvol, loc);
+
+ /* this jump is redundant: the label follows immediately */
+ goto out;
+ }
+out:
+ return 0;
+}
+
+
+int
dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
xlator_t *subvol, loc_t *loc, int32_t flags,
mode_t mode, mode_t umask, fd_t *fd,
@@ -6053,6 +6106,11 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
local = frame->local;
+ if (strcmp (this->type, "cluster/tier") == 0)
+ return dht_create_tier_wind_to_avail_subvol(frame, this, subvol,
+ loc, flags, mode,
+ umask, fd, params);
+
if (!dht_is_subvol_filled (this, subvol)) {
gf_msg_debug (this->name, 0,
"creating %s on %s", loc->path,
@@ -6525,15 +6583,15 @@ dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
ret = dht_layout_merge (this, layout, prev->this,
-1, ENOSPC, NULL);
} else {
- if (op_ret == -1 && op_errno == EEXIST) {
- /* Very likely just a race between mkdir and
- self-heal (from lookup of a concurrent mkdir
- attempt).
- Ignore error for now. layout setting will
- anyways fail if this was a different (old)
- pre-existing different directory.
- */
- op_ret = 0;
+ if (op_ret == -1 && op_errno == EEXIST) {
+ /* Very likely just a race between mkdir and
+ self-heal (from lookup of a concurrent mkdir
+ attempt).
+ Ignore error for now. layout setting will
+ anyways fail if this was a different (old)
+ pre-existing different directory.
+ */
+ op_ret = 0;
dir_exists = _gf_true;
}
ret = dht_layout_merge (this, layout, prev->this,
@@ -8041,3 +8099,4 @@ int32_t dht_set_local_rebalance (xlator_t *this, dht_local_t *local,
return 0;
}
+
diff --git a/xlators/cluster/dht/src/tier-common.c b/xlators/cluster/dht/src/tier-common.c
new file mode 100644
index 00000000000..129932c9ef9
--- /dev/null
+++ b/xlators/cluster/dht/src/tier-common.c
@@ -0,0 +1,342 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "libxlator.h"
+#include "dht-common.h"
+#include "defaults.h"
+#include "tier-common.h"
+#include "tier.h"
+
+/*
+ * Callback for a readdir wound by the tier translator.
+ *
+ * Copies every entry of orig_entries into a local list and unwinds with
+ * the number of entries copied. If nothing was copied but the last entry
+ * seen carried a non-zero d_off, the readdir is wound again to the same
+ * subvolume (prev->this) at that offset. A negative op_ret from the
+ * subvolume is turned into 0 before unwinding; op_errno is passed through
+ * unchanged. Always returns 0.
+ */
+int
+tier_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *orig_entries,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ gf_dirent_t entries;
+ gf_dirent_t *orig_entry = NULL;
+ gf_dirent_t *entry = NULL;
+ call_frame_t *prev = NULL;
+ xlator_t *next_subvol = NULL;
+ off_t next_offset = 0;
+ int count = 0;
+
+ INIT_LIST_HEAD (&entries.list);
+ prev = cookie;
+ local = frame->local;
+
+ if (op_ret < 0)
+ goto done;
+
+ list_for_each_entry (orig_entry, (&orig_entries->list), list) {
+ next_offset = orig_entry->d_off;
+
+ entry = gf_dirent_for_name (orig_entry->d_name);
+ if (!entry) {
+ gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
+ DHT_MSG_NO_MEMORY,
+ "Memory allocation failed ");
+ /* unwinds with the entries copied so far; op_ret is still
+ the value received from the subvolume at this point */
+ goto unwind;
+ }
+
+ entry->d_off = orig_entry->d_off;
+ entry->d_ino = orig_entry->d_ino;
+ entry->d_type = orig_entry->d_type;
+ entry->d_len = orig_entry->d_len;
+
+ list_add_tail (&entry->list, &entries.list);
+ count++;
+ }
+ op_ret = count;
+
+done:
+ if (count == 0) {
+ /* non-zero next_offset means that
+ EOF is not yet hit on the current subvol
+ */
+ if (next_offset != 0) {
+ next_subvol = prev->this;
+ } else {
+ goto unwind;
+ }
+
+ STACK_WIND (frame, tier_readdir_cbk,
+ next_subvol, next_subvol->fops->readdir,
+ local->fd, local->size, next_offset, NULL);
+ return 0;
+ }
+
+unwind:
+ /* a failure from the subvolume is reported upward as zero entries */
+ if (op_ret < 0)
+ op_ret = 0;
+
+ DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, NULL);
+
+ gf_dirent_free (&entries);
+
+ return 0;
+}
+
+/*
+ * Callback for a readdirp wound by the tier translator.
+ *
+ * For each entry returned by the subvolume (entries whose stat type is
+ * invalid are skipped) a copy is appended to a local list, including the
+ * stat and a reference on the entry's dict. Depending on the entry:
+ *   - link file: the inode is looked up by gfid in the fd's inode table
+ *     and its layout is preset to TIER_UNHASHED_SUBVOL;
+ *   - directory: the inode ctx time is updated from the stat;
+ *   - anything else: the layout is preset to the subvolume that answered
+ *     (prev->this), or to TIER_HASHED_SUBVOL when the inode had to be
+ *     found through the inode table.
+ * If no entry was copied but a non-zero d_off was seen, the readdirp is
+ * wound again to the same subvolume at that offset. A negative op_ret is
+ * turned into 0 before unwinding. Always returns 0.
+ */
+int
+tier_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ gf_dirent_t entries;
+ gf_dirent_t *orig_entry = NULL;
+ gf_dirent_t *entry = NULL;
+ call_frame_t *prev = NULL;
+ xlator_t *next_subvol = NULL;
+ off_t next_offset = 0;
+ int count = 0;
+ dht_conf_t *conf = NULL;
+ int ret = 0;
+ inode_table_t *itable = NULL;
+ inode_t *inode = NULL;
+
+ INIT_LIST_HEAD (&entries.list);
+ prev = cookie;
+ local = frame->local;
+ /* itable stays NULL when local->fd is not set */
+ itable = local->fd ? local->fd->inode->table : NULL;
+
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, unwind);
+
+ if (op_ret < 0)
+ goto done;
+
+ list_for_each_entry (orig_entry, (&orig_entries->list), list) {
+ next_offset = orig_entry->d_off;
+
+ if (IA_ISINVAL(orig_entry->d_stat.ia_type)) {
+ /*stat failed somewhere- ignore this entry*/
+ continue;
+ }
+
+ entry = gf_dirent_for_name (orig_entry->d_name);
+ if (!entry) {
+ /* NOTE(review): unlike tier_readdir_cbk, nothing is logged
+ on this allocation failure */
+
+ goto unwind;
+ }
+
+ entry->d_off = orig_entry->d_off;
+ entry->d_stat = orig_entry->d_stat;
+ entry->d_ino = orig_entry->d_ino;
+ entry->d_type = orig_entry->d_type;
+ entry->d_len = orig_entry->d_len;
+
+ if (orig_entry->dict)
+ entry->dict = dict_ref (orig_entry->dict);
+
+ if (check_is_linkfile (NULL, (&orig_entry->d_stat),
+ orig_entry->dict,
+ conf->link_xattr_name)) {
+ /* NOTE(review): itable is not checked for NULL here, while
+ the non-linkfile branch below does check it -- confirm
+ inode_find tolerates a NULL table */
+ inode = inode_find (itable,
+ orig_entry->d_stat.ia_gfid);
+ if (inode) {
+ ret = dht_layout_preset
+ (this, TIER_UNHASHED_SUBVOL,
+ inode);
+ if (ret)
+ gf_msg (this->name,
+ GF_LOG_WARNING, 0,
+ DHT_MSG_LAYOUT_SET_FAILED,
+ "failed to link the layout"
+ " in inode");
+ inode_unref (inode);
+ inode = NULL;
+ }
+
+ } else if (IA_ISDIR(entry->d_stat.ia_type)) {
+ if (orig_entry->inode) {
+ dht_inode_ctx_time_update (orig_entry->inode,
+ this, &entry->d_stat,
+ 1);
+ }
+ } else {
+ if (orig_entry->inode) {
+ ret = dht_layout_preset (this, prev->this,
+ orig_entry->inode);
+ if (ret)
+ gf_msg (this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LAYOUT_SET_FAILED,
+ "failed to link the layout "
+ "in inode");
+
+ entry->inode = inode_ref (orig_entry->inode);
+ } else if (itable) {
+ /*
+ * orig_entry->inode might be null if any upper
+ * layer xlators below client set to null, to
+ * force a lookup on the inode even if the inode
+ * is present in the inode table. In that case
+ * we just update the ctx to make sure we didn't
+ * missed anything.
+ */
+ inode = inode_find (itable,
+ orig_entry->d_stat.ia_gfid);
+ if (inode) {
+ ret = dht_layout_preset
+ (this, TIER_HASHED_SUBVOL,
+ inode);
+ if (ret)
+ gf_msg (this->name,
+ GF_LOG_WARNING, 0,
+ DHT_MSG_LAYOUT_SET_FAILED,
+ "failed to link the layout"
+ " in inode");
+ inode_unref (inode);
+ inode = NULL;
+ }
+ }
+ }
+ list_add_tail (&entry->list, &entries.list);
+ count++;
+ }
+ op_ret = count;
+
+done:
+ if (count == 0) {
+ /* non-zero next_offset means that
+ EOF is not yet hit on the current subvol
+ */
+ if (next_offset != 0) {
+ next_subvol = prev->this;
+ } else {
+ goto unwind;
+ }
+
+ STACK_WIND (frame, tier_readdirp_cbk,
+ next_subvol, next_subvol->fops->readdirp,
+ local->fd, local->size, next_offset,
+ local->xattr);
+ return 0;
+ }
+
+unwind:
+ /* a failure from the subvolume is reported upward as zero entries */
+ if (op_ret < 0)
+ op_ret = 0;
+
+ DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries, NULL);
+
+ gf_dirent_free (&entries);
+
+ return 0;
+}
+
+/*
+ * Common implementation behind tier_readdir and tier_readdirp.
+ *
+ * Initialises the dht local for the frame, keeps references on fd and on
+ * dict (as local->xattr_req), and winds either readdirp or readdir to
+ * TIER_HASHED_SUBVOL only, according to whichop. For readdirp the request
+ * dict additionally carries conf->link_xattr_name (presumably so the
+ * callback can recognise link files -- confirm against
+ * check_is_linkfile). On a validation or allocation failure the call is
+ * unwound with -1. Always returns 0.
+ */
+int
+tier_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, int whichop, dict_t *dict)
+{
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ xlator_t *hashed_subvol = NULL;
+ int ret = 0;
+ dht_conf_t *conf = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO (this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init (frame, NULL, NULL, whichop);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->fd = fd_ref (fd);
+ local->size = size;
+ local->xattr_req = (dict) ? dict_ref (dict) : NULL;
+
+ /* the macro reads the local variable conf */
+ hashed_subvol = TIER_HASHED_SUBVOL;
+
+
+ /* TODO: do proper readdir */
+ if (whichop == GF_FOP_READDIRP) {
+ if (dict)
+ local->xattr = dict_ref (dict);
+ else
+ local->xattr = dict_new ();
+
+ if (local->xattr) {
+ ret = dict_set_uint32 (local->xattr,
+ conf->link_xattr_name, 256);
+ if (ret)
+ gf_msg (this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value"
+ " : key = %s",
+ conf->link_xattr_name);
+
+ }
+
+ STACK_WIND (frame, tier_readdirp_cbk, hashed_subvol,
+ hashed_subvol->fops->readdirp,
+ fd, size, yoff, local->xattr);
+
+ } else {
+ /* local->xattr is not assigned by this function on this path */
+ STACK_WIND (frame, tier_readdir_cbk, hashed_subvol,
+ hashed_subvol->fops->readdir,
+ fd, size, yoff, local->xattr);
+ }
+
+ return 0;
+
+err:
+ /* NOTE(review): the unwind below is always of type readdir, even when
+ whichop is GF_FOP_READDIRP, and frame may be NULL here if the first
+ validation failed -- confirm both are acceptable */
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+/*
+ * readdir entry point of the tier translator.
+ *
+ * Starts from GF_FOP_READDIR and switches to GF_FOP_READDIRP when any
+ * entry of conf->subvolume_status is zero or when conf->use_readdirp is
+ * set, then hands off to tier_do_readdir. The xdata argument is not
+ * forwarded (0 is passed as the dict). Always returns 0.
+ */
+int
+tier_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, dict_t *xdata)
+{
+ int op = GF_FOP_READDIR;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+
+ conf = this->private;
+ /* without a conf, fall through with the default op */
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i]) {
+ op = GF_FOP_READDIRP;
+ break;
+ }
+ }
+
+ if (conf->use_readdirp)
+ op = GF_FOP_READDIRP;
+
+out:
+ tier_do_readdir (frame, this, fd, size, yoff, op, 0);
+ return 0;
+}
+
+/*
+ * readdirp entry point of the tier translator: forwards all arguments to
+ * tier_do_readdir with GF_FOP_READDIRP. Always returns 0.
+ */
+int
+tier_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, dict_t *dict)
+{
+ tier_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP, dict);
+ return 0;
+}
diff --git a/xlators/cluster/dht/src/tier-common.h b/xlators/cluster/dht/src/tier-common.h
new file mode 100644
index 00000000000..db5bc8c9226
--- /dev/null
+++ b/xlators/cluster/dht/src/tier-common.h
@@ -0,0 +1,31 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _TIER_COMMON_H_
+#define _TIER_COMMON_H_
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+/*
+ * readdirp fop of the tier translator.
+ * NOTE(review): declared here as returning int32_t, while the definition
+ * in tier-common.c returns int -- confirm the two are meant to match.
+ */
+int32_t
+tier_readdirp (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ size_t size, off_t off, dict_t *dict);
+
+/* readdir fop of the tier translator. */
+int
+tier_readdir (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, dict_t *xdata);
+
+#endif
+
diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c
index ce6ba4ce57d..476d3323087 100644
--- a/xlators/cluster/dht/src/tier.c
+++ b/xlators/cluster/dht/src/tier.c
@@ -16,6 +16,8 @@
#include "dht-common.h"
#include "tier.h"
+#include "tier-common.h"
+#include "syscall.h"
/*Hard coded DB info*/
static gfdb_db_type_t dht_tier_db_type = GFDB_SQLITE3;
@@ -445,7 +447,7 @@ tier_migrate_using_query_file (void *_args)
goto abort;
}
- gf_msg_trace (this->name, 0,
+ gf_msg_debug (this->name, 0,
"Tier %d"
" src_subvol %s file %s",
query_cbk_args->is_promotion,
@@ -1557,53 +1559,16 @@ xlator_t *
tier_search (xlator_t *this, dht_layout_t *layout, const char *name)
{
xlator_t *subvol = NULL;
- void *value;
- int search_subvol = 0;
dht_conf_t *conf = NULL;
- gf_defrag_info_t *defrag = NULL;
- int layout_cold = 0;
- int layout_hot = 1;
GF_VALIDATE_OR_GOTO ("tier", this, out);
- GF_VALIDATE_OR_GOTO (this->name, layout, out);
- GF_VALIDATE_OR_GOTO (this->name, name, out);
GF_VALIDATE_OR_GOTO (this->name, this->private, out);
conf = this->private;
- /* The first subvolume in the graph is always cold. */
- /* Find the position of the cold subvolume in the layout. */
- layout_cold = 0;
- layout_hot = 1;
- if (conf->subvolumes[0] != layout->list[0].xlator) {
- layout_cold = 1;
- layout_hot = 0;
- }
-
- search_subvol = layout_hot;
-
- defrag = conf->defrag;
- if (defrag && defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER)
- search_subvol = layout_cold;
-
- /* "decommission_subvols_cnt" can only be non-zero on detach. */
- /* This will change once brick add/remove is supported for */
- /* tiered volumes. */
- else if (conf->decommission_subvols_cnt) {
- search_subvol = layout_cold;
- }
- else if (!dict_get_ptr (this->options, "rule", &value) &&
- !strcmp(layout->list[layout_cold].xlator->name, value)) {
- search_subvol = layout_cold;
- }
+ subvol = TIER_HASHED_SUBVOL;
- if ((layout->list[search_subvol].err > 0) &&
- (layout->list[search_subvol].err != ENOTCONN))
- search_subvol = layout_cold;
-
- subvol = layout->list[search_subvol].xlator;
out:
-
return subvol;
}
@@ -1970,8 +1935,8 @@ struct xlator_fops fops = {
.open = dht_open,
.statfs = dht_statfs,
.opendir = dht_opendir,
- .readdir = dht_readdir,
- .readdirp = dht_readdirp,
+ .readdir = tier_readdir,
+ .readdirp = tier_readdirp,
.fsyncdir = dht_fsyncdir,
.symlink = dht_symlink,
.unlink = dht_unlink,
diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h
index 1f9f2da04e6..fe66a6950db 100644
--- a/xlators/cluster/dht/src/tier.h
+++ b/xlators/cluster/dht/src/tier.h
@@ -38,7 +38,8 @@
#define PROMOTION_QFILE "promotequeryfile"
#define DEMOTION_QFILE "demotequeryfile"
-#define TIER_HASHED_SUBVOL conf->subvolumes[1]
+#define TIER_HASHED_SUBVOL conf->subvolumes[0]
+#define TIER_UNHASHED_SUBVOL conf->subvolumes[1]
#define GET_QFILE_PATH(is_promotion)\
(is_promotion) ? promotion_qfile : demotion_qfile